1/* ANSI C standard library function memset.
2
3   Copyright (c) 2001-2008 Tensilica Inc.
4
5   Permission is hereby granted, free of charge, to any person obtaining
6   a copy of this software and associated documentation files (the
7   "Software"), to deal in the Software without restriction, including
8   without limitation the rights to use, copy, modify, merge, publish,
9   distribute, sublicense, and/or sell copies of the Software, and to
10   permit persons to whom the Software is furnished to do so, subject to
11   the following conditions:
12
13   The above copyright notice and this permission notice shall be included
14   in all copies or substantial portions of the Software.
15
16   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
23
24#include "xtensa-asm.h"
25
26/* void *memset (void *dst, int c, size_t length)
27
28   The algorithm is as follows:
29
30   Create a word with c in all byte positions.
31
32   If the destination is aligned, set 16B chunks with a loop, and then
33   finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
34
35   If the destination is unaligned, align it by conditionally
36   setting 1B and/or 2B and then go to aligned case.
37
38   This code tries to use fall-through branches for the common
39   case of an aligned destination (except for the branches to
40   the alignment labels).  */
41
42
43/* Byte-by-byte set.  */
44
45	.text
46	.begin schedule
47	.align	XCHAL_INST_FETCH_WIDTH
48	.literal_position
49__memset_aux:
50
51	/* Skip bytes to get proper alignment for three-byte loop */
52.skip XCHAL_INST_FETCH_WIDTH - 3
53
54.Lbyteset:
55#if XCHAL_HAVE_LOOPS
56	loopnez	a4, 2f
57#else
58	beqz	a4, 2f
59	add	a6, a5, a4	// a6 = ending address
60#endif
611:	s8i	a3, a5, 0
62#if XTENSA_ESP32_PSRAM_CACHE_FIX
63	memw
64#endif
65	addi	a5, a5, 1
66#if !XCHAL_HAVE_LOOPS
67	bltu	a5, a6, 1b
68#endif
692:	leaf_return
70
71
72/* Destination is unaligned.  */
73
74	.align	4
75
76.Ldst1mod2: // dst is only byte aligned
77
78	/* Do short sizes byte-by-byte.  */
79	bltui	a4, 8, .Lbyteset
80
81	/* Set 1 byte.  */
82	s8i	a3, a5, 0
83	addi	a5, a5, 1
84	addi	a4, a4, -1
85#if XTENSA_ESP32_PSRAM_CACHE_FIX
86	memw
87#endif
88
89	/* Now retest if dst is aligned.  */
90	_bbci.l	a5, 1, .Ldstaligned
91
92.Ldst2mod4: // dst has 16-bit alignment
93
94	/* Do short sizes byte-by-byte.  */
95	bltui	a4, 8, .Lbyteset
96
97	/* Set 2 bytes.  */
98	s16i	a3, a5, 0
99	addi	a5, a5, 2
100	addi	a4, a4, -2
101#if XTENSA_ESP32_PSRAM_CACHE_FIX
102	memw
103#endif
104
105	/* dst is now aligned; return to main algorithm */
106	j	.Ldstaligned
107
108
109	.align	4
110	.global	memset
111	.type	memset, @function
112memset:
113	leaf_entry sp, 16
114	/* a2 = dst, a3 = c, a4 = length */
115
116	/* Duplicate character into all bytes of word.  */
117	extui	a3, a3, 0, 8
118	slli	a7, a3, 8
119	or	a3, a3, a7
120	slli	a7, a3, 16
121	or	a3, a3, a7
122
123	mov	a5, a2		// copy dst so that a2 is return value
124
125	/* Check if dst is unaligned.  */
126	_bbsi.l	a2, 0, .Ldst1mod2
127	_bbsi.l	a2, 1, .Ldst2mod4
128.Ldstaligned:
129
130	/* Get number of loop iterations with 16B per iteration.  */
131	srli	a7, a4, 4
132
133#if XTENSA_ESP32_PSRAM_CACHE_FIX
134	//do not do this if we have less than one iteration to do
135	beqz	a7, 2f
136	//this seems to work to prefetch the cache line
137	s32i	a3, a5, 0
138	nop
139#endif
140
141	/* Destination is word-aligned.  */
142#if XCHAL_HAVE_LOOPS
143	loopnez	a7, 2f
144#else
145	beqz	a7, 2f
146	slli	a6, a7, 4
147	add	a6, a6, a5	// a6 = end of last 16B chunk
148#endif
149	/* Set 16 bytes per iteration.  */
1501:	s32i	a3, a5, 0
151	s32i	a3, a5, 4
152	s32i	a3, a5, 8
153	s32i	a3, a5, 12
154	addi	a5, a5, 16
155#if !XCHAL_HAVE_LOOPS
156	bltu	a5, a6, 1b
157#endif
158
159	/* Set any leftover pieces smaller than 16B.  */
1602:	bbci.l	a4, 3, 3f
161
162	/* Set 8 bytes.  */
163	s32i	a3, a5, 0
164	s32i	a3, a5, 4
165	addi	a5, a5, 8
166
1673:	bbci.l	a4, 2, 4f
168
169	/* Set 4 bytes.  */
170	s32i	a3, a5, 0
171	addi	a5, a5, 4
172
1734:	bbci.l	a4, 1, 5f
174
175	/* Set 2 bytes.  */
176	s16i	a3, a5, 0
177	addi	a5, a5, 2
178#if XTENSA_ESP32_PSRAM_CACHE_FIX
179	memw
180#endif
181
1825:	bbci.l	a4, 0, 6f
183
184	/* Set 1 byte.  */
185	s8i	a3, a5, 0
186#if XTENSA_ESP32_PSRAM_CACHE_FIX
187	memw
188#endif
1896:	leaf_return
190
191	.end schedule
192
193	.size	memset, . - memset
194