1/* ANSI C standard library function memset.
2
3   Copyright (c) 2001-2008 Tensilica Inc.
4
5   Permission is hereby granted, free of charge, to any person obtaining
6   a copy of this software and associated documentation files (the
7   "Software"), to deal in the Software without restriction, including
8   without limitation the rights to use, copy, modify, merge, publish,
9   distribute, sublicense, and/or sell copies of the Software, and to
10   permit persons to whom the Software is furnished to do so, subject to
11   the following conditions:
12
13   The above copyright notice and this permission notice shall be included
14   in all copies or substantial portions of the Software.
15
16   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
23
24#include <picolibc.h>
25
26#include "xtensa-asm.h"
27
28/* void *memset (void *dst, int c, size_t length)
29
30   The algorithm is as follows:
31
32   Create a word with c in all byte positions.
33
34   If the destination is aligned, set 16B chunks with a loop, and then
35   finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
36
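   If the destination is unaligned, align it by conditionally
   setting 1B and/or 2B and then go to aligned case.  If fewer than
   8 bytes remain while the destination is still unaligned, set the
   remainder byte-by-byte instead.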
39
40   This code tries to use fall-through branches for the common
41   case of an aligned destination (except for the branches to
42   the alignment labels).  */
43
44
/* Byte-at-a-time fill.  The alignment code branches to .Lbyteset when
   fewer than 8 bytes remain and the store cursor is not word-aligned.

   Register use on arrival at .Lbyteset:
     a3 = fill value (only the low byte is stored)
     a4 = number of bytes to store (may be zero)
     a5 = store cursor
     a6 = end address (written only when XCHAL_HAVE_LOOPS is not set)
   The routine leaves through leaf_return.  */

	.text
	.begin schedule
	.align	XCHAL_INST_FETCH_WIDTH
	.literal_position
__memset_aux:

	/* Pad so that the three-byte loop below gets proper alignment.  */
.skip XCHAL_INST_FETCH_WIDTH - 3

.Lbyteset:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbyteset_done	// repeat the body a4 times; skipped when a4 == 0
#else
	beqz	a4, .Lbyteset_done	// nothing left to store
	add	a6, a5, a4	// a6 = one past the last byte to store
#endif
.Lbyteset_loop:
	s8i	a3, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bltu	a5, a6, .Lbyteset_loop	// keep going until the cursor reaches a6
#endif
.Lbyteset_done:
	leaf_return
72
73
/* Unaligned destination: store leading bytes until the cursor is
   word-aligned, then rejoin the main path at .Ldstaligned.

   Register use (set up by memset before it branches here):
     a3 = fill word, a4 = bytes remaining, a5 = store cursor.
   Whenever fewer than 8 bytes remain, the rest is handed to .Lbyteset.  */

	.align	4

.Ldst1mod2: // cursor address is odd

	/* Short request: let the byte loop handle all of it.  */
	bltui	a4, 8, .Lbyteset

	/* Store one byte so that the cursor becomes 2-byte aligned.  */
	s8i	a3, a5, 0
	addi	a4, a4, -1
	addi	a5, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif

	/* Bit 1 of the cursor clear means it is already word-aligned.  */
	_bbci.l	a5, 1, .Ldstaligned

.Ldst2mod4: // cursor is 2-byte aligned but not 4-byte aligned

	/* Too little left to be worth aligning: finish in the byte loop.  */
	bltui	a4, 8, .Lbyteset

	/* Store one halfword to reach 4-byte alignment.  */
	s16i	a3, a5, 0
	addi	a4, a4, -2
	addi	a5, a5, 2
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif

	/* The cursor is word-aligned now; continue with the main algorithm.  */
	j	.Ldstaligned
109
110
/* void *memset (void *dst, int c, size_t length)

   Register use:
     a2 = dst on entry; never modified, so it is also the return value
     a3 = c on entry, then the low byte of c replicated into all four bytes
     a4 = length; the alignment code lowers it by the bytes it stores
     a5 = store cursor
     a6 = end address of the 16-byte loop (only without XCHAL_HAVE_LOOPS)
     a7 = scratch, then the number of 16-byte iterations  */

	.align	4
	.global	memset
	.type	memset, @function
memset:
	leaf_entry sp, 16

	mov	a5, a2		// work on a copy of dst; a2 stays as the return value

	/* Replicate the low byte of c into every byte of a3.  */
	extui	a3, a3, 0, 8	// a3 = c & 0xff
	slli	a7, a3, 8
	or	a3, a3, a7	// low halfword now holds two copies of the byte
	slli	a7, a3, 16
	or	a3, a3, a7	// upper halfword mirrors the lower one

	/* Leave the fall-through path when dst is not word-aligned.  */
	_bbsi.l	a2, 0, .Ldst1mod2	// odd address
	_bbsi.l	a2, 1, .Ldst2mod4	// even address, but not a multiple of 4
.Ldstaligned:

	/* a7 = number of whole 16-byte chunks.  */
	srli	a7, a4, 4

#if XTENSA_ESP32_PSRAM_CACHE_FIX
	// With no full chunk to write, go straight to the leftover stores.
	beqz	a7, .Lmemset_rem8
	// Early store to the first word; this seems to work to prefetch the cache line.
	s32i	a3, a5, 0
	nop
#endif

	/* The cursor in a5 is word-aligned from here on.  */
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Lmemset_rem8
#else
	beqz	a7, .Lmemset_rem8
	slli	a6, a7, 4
	add	a6, a6, a5	// a6 = address just past the final 16-byte chunk
#endif
	/* Four word stores per pass, 16 bytes in total.  */
.Lmemset_loop16:
	s32i	a3, a5, 0
	s32i	a3, a5, 4
	s32i	a3, a5, 8
	s32i	a3, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bltu	a5, a6, .Lmemset_loop16
#endif

	/* Fewer than 16 bytes remain.  Bits 3..0 of a4 decide, one bit per
	   size, which of the stores below are executed.  */
.Lmemset_rem8:
	bbci.l	a4, 3, .Lmemset_rem4

	/* Bit 3 set: 8 bytes.  */
	s32i	a3, a5, 0
	s32i	a3, a5, 4
	addi	a5, a5, 8

.Lmemset_rem4:
	bbci.l	a4, 2, .Lmemset_rem2

	/* Bit 2 set: 4 bytes.  */
	s32i	a3, a5, 0
	addi	a5, a5, 4

.Lmemset_rem2:
	bbci.l	a4, 1, .Lmemset_rem1

	/* Bit 1 set: 2 bytes.  */
	s16i	a3, a5, 0
	addi	a5, a5, 2
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif

.Lmemset_rem1:
	bbci.l	a4, 0, .Lmemset_ret

	/* Bit 0 set: the final byte.  */
	s8i	a3, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
.Lmemset_ret:
	leaf_return

	.end schedule

	.size	memset, . - memset
196