1/*
2   Copyright (c) 2024, Synopsys, Inc. All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6
7   1) Redistributions of source code must retain the above copyright notice,
8   this list of conditions and the following disclaimer.
9
10   2) Redistributions in binary form must reproduce the above copyright notice,
11   this list of conditions and the following disclaimer in the documentation
12   and/or other materials provided with the distribution.
13
14   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
15   may be used to endorse or promote products derived from this software
16   without specific prior written permission.
17
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28   POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include <picolibc.h>
32
33#include <sys/asm.h>
34
35
36; r0 char* dest
37; r1 const char* src
38
39; dest and src MUST NOT intercept
40
41; Brief:
42; Perform the same operation as strlen for finding the end of r0 string
43; If r0 and r1 have
44; If 4 byte aligned
45; 	Do 4 byte search until there are no more 4 byte chunks
46;	Then, do 1 byte search
47; Otherwise, 1 byte search until alignment
48;	Then, do 4 byte search as previously specified
49;
50;; More in depth description at the end
51;
52; R0 char* dest (destination string)
53; R1 const char* src (source string)
54; ret (R0):
55;		- char* (destiantion string)
56;
57
58#if defined (__ARC64_ARCH32__)
59
60ENTRY (strcat)
61; Find end of r0 string
62; ========================== STRLEN CODE START ==========================
63
64; Preserve r0 for size calculation when returning
65	mov	r13, r0
66	xor	r6, r6, r6
67
68; Setup byte detector (more information below) [1]
69	mov	r8, NULL_32DT_1
70	asl	r9, r8, 7
71
72.L_4_4B_search:
73
74#if defined (__ARC64_LL64__)
75
76	ldd.ab	r2r3, [r13, +8]
77	ldd.ab	r4r5, [r13, +8]
78
79#else
80
81	ld.ab	r2, [r13, +4]
82	ld.ab	r3, [r13, +4]
83	ld.ab	r4, [r13, +4]
84	ld.ab	r5, [r13, +4]
85
86#endif
87
88; NULL byte position is detected and encoded in r6 [0] [9]
89	sub	r10, r2, r8
90	sub	r11, r3, r8
91	sub	r12, r4, r8
92	sub	r7, r5, r8
93
94	bic	r10, r10, r2
95	bic	r11, r11, r3
96	bic	r12, r12, r4
97	bic	r7, r7, r5
98
99	tst	r10, r9
100	bset.ne	r6, r6, 4
101
102	tst	r11, r9
103	bset.ne	r6, r6, 3
104
105	tst	r12, r9
106	bset.ne	r6, r6, 2
107
108	tst	r7, r9
109	bset.ne	r6, r6, 1
110
111	breq.d	r6, 0, @.L_4_4B_search
112
113	fls	r5, r6 ; [2]
114
115; Point r13 to first NULL byte containing double word [3]
116	sub2	r13, r13, r5
117
118	; Select appropriate register to analyze [4]
119	mov	r2, r7
120
121	asr.f	r6, r6, 3
122	mov.c	r2, r12
123
124	asr.f	r6, r6, 1
125	mov.c	r2, r11
126
127	asr.f	r6, r6, 1
128	mov.c	r2, r10
129
130; Point r13 to first NULL byte in selected double word
131	and	r2, r2, r9 ; [5]
132
133	ffs	r2, r2 ; [6]
134
135	xbfu 	r2, r2, 0b0111000011 ; [7]
136
137	add	r13, r13, r2 ; [8]
138
139
140; ========================== STRLEN CODE END >|< ==========================
141
142	xor	r6, r6, r6
143
144.L_4_4B_search_src:
145
146#if defined (__ARC64_LL64__)
147
148	ldd.ab	r2r3, [r1, +8]
149	ldd.ab	r4r5, [r1, +8]
150
151#else
152
153	ld.ab	r2, [r1, +4]
154	ld.ab	r3, [r1, +4]
155	ld.ab	r4, [r1, +4]
156	ld.ab	r5, [r1, +4]
157
158#endif
159
160; NULL byte position is detected and encoded in r6 [0] [9]
161	sub	r10, r2, r8
162	sub	r11, r3, r8
163	sub	r12, r4, r8
164	sub	r7, r5, r8
165
166	bic	r10, r10, r2
167	bic	r11, r11, r3
168	bic	r12, r12, r4
169	bic	r7, r7, r5
170
171	tst	r10, r9
172	bset.ne	r6, r6, 4
173
174	tst	r11, r9
175	bset.ne	r6, r6, 3
176
177	tst	r12, r9
178	bset.ne	r6, r6, 2
179
180	tst	r7, r9
181	bset.ne	r6, r6, 1
182
183	brne	r6, 0, @.L_found_in_32B
184
185#if defined (__ARC64_LL64__)
186
187	std.ab	r2r3, [r13, +8]
188	std.ab	r4r5, [r13, +8]
189
190#else
191
192	st.ab	r2, [r13, +4]
193	st.ab	r3, [r13, +4]
194	st.ab	r4, [r13, +4]
195	st.ab	r5, [r13, +4]
196
197#endif
198
199	b	@.L_4_4B_search_src
200
201.L_found_in_32B:
202
203	fls	r6, r6 ; [2]
204
205; Point r1 to first NULL byte containing double word [3]
206	sub2	r1, r1, r6
207
208;; Store the already loaded data
209
210	; 4 -> 1 to 3 -> 0
211	;subl	r6, r6, 1
212
213; Invert so the biggest branch is at the end, and we dont need to increase
214; block size
215	; 3 -> 0 to 0 -> 3
216	;subl	r6, 3, r6
217
218	; Condense the two subs here
219	rsub	r6, r6, 4
220
221	asl	r6, r6, 2
222
223; Store double words
224	bi	[r6]
225
226	b.d	@.L_store_lastL32bits
227	mov	r11, r2
228	nop
229	nop
230
231	st.ab	r2, [r13, +4]
232	b.d	@.L_store_lastL32bits
233	mov	r11, r3
234	nop
235
236	st.ab	r2, [r13, +4]
237	st.ab	r3, [r13, +4]
238	b.d	@.L_store_lastL32bits
239	mov	r11, r4
240
241	st.ab	r2, [r13, +4]
242	st.ab	r3, [r13, +4]
243	st.ab	r4, [r13, +4]
244	mov	r11, r5
245
246; r11 now contains the data to write
247.L_store_lastL32bits:
248	sub r10, r11, r8
249	bic	r10, r10, r11
250	and	r10, r10, r9 ; [5]
251
252	ffs	r2, r10 ; [6]
253	add	r2, r2, 1
254
255	xbfu	r2, r2, 0b0111000011 ; [7]
256
257	mov	r3, -1; Bitmask setup
258
259	; If the NULL byte is in byte 3 (starting from the right)
260	; we want to store 8-3 bytes
261	rsub	r2, r2, 8
262	asl	r2, r2, 3
263
264	; According to the target byte, setup masks
265	lsr	r3, r3, r2
266	not	r4, r3
267
268	; Obtain relevant data from destination
269	ld	r10, [r13]
270
271	; Get which data from dest is not to be overwritten and OR it
272	; with the relevant data to write
273	and	r3, r3, r11
274	and	r4, r4, r10
275
276	or	r3, r3, r4
277
278	j_s.d	[blink]
279	st.ab	r3, [r13, +4]
280
281
282
283ENDFUNC (strcat)
284
285#else
286
287ENTRY (strcat)
288; Find end of r0 string
289; ========================== STRLEN CODE START ==========================
290
291; Preserve r0 for size calculation when returning
292	movl	r13, r0
293	xorl	r6, r6, r6
294
295; Setup byte detector (more information below) [1]
296	vpack2wl	r8, NULL_32DT_1, NULL_32DT_1
297	asll	r9, r8, 7
298
299.L_4_8B_search:
300
301; Using 128-bit memory operations
302#if defined (__ARC64_M128__)
303
304	lddl.ab r2r3, [r13, +16]
305	lddl.ab r4r5, [r13, +16]
306
307; The 64-bit crunching implementation.
308#elif defined (__ARC64_ARCH64__)
309
310	ldl.ab	r2, [r13, +8]
311	ldl.ab	r3, [r13, +8]
312	ldl.ab	r4, [r13, +8]
313	ldl.ab	r5, [r13, +8]
314
315#else
316# error Unknown configuration
317#endif
318
319; NULL byte position is detected and encoded in r6 [0] [9]
320	subl	r10, r2, r8
321	subl	r11, r3, r8
322	subl	r12, r4, r8
323	subl	r7, r5, r8
324
325	bicl	r10, r10, r2
326	bicl	r11, r11, r3
327	bicl	r12, r12, r4
328	bicl	r7, r7, r5
329
330	tstl	r10, r9
331	bset.ne	r6, r6, 4
332
333	tstl	r11, r9
334	bset.ne	r6, r6, 3
335
336	tstl	r12, r9
337	bset.ne	r6, r6, 2
338
339	tstl	r7, r9
340	bset.ne	r6, r6, 1
341
342	breq.d	r6, 0, @.L_4_8B_search
343
344	fls	r5, r6 ; [2]
345
346; Point r13 to first NULL byte containing double word [3]
347	sub3l	r13, r13, r5
348
349	; Select appropriate register to analyze [4]
350	MOVP	r2, r7
351
352	asr.f	r6, r6, 3
353	MOVP.c	r2, r12
354
355	asr.f	r6, r6, 1
356	MOVP.c	r2, r11
357
358	asr.f	r6, r6, 1
359	MOVP.c	r2, r10
360
361; Point r13 to first NULL byte in selected double word
362	andl	r2, r2, r9 ; [5]
363
364	ffsl	r2, r2 ; [6]
365
366	xbful 	r2, r2, 0b0111000011 ; [7]
367
368	addl	r13, r13, r2 ; [8]
369
370
371; ========================== STRLEN CODE END >|< ==========================
372
373	xorl	r6, r6, r6
374
375.L_4_8B_search_src:
376#if defined (__ARC64_M128__)
377
378	lddl.ab	r2r3, [r1, +16]
379	lddl.ab	r4r5, [r1, +16]
380
381#elif defined (__ARC64_ARCH64__)
382
383	ldl.ab	r2, [r1, +8]
384	ldl.ab	r3, [r1, +8]
385	ldl.ab	r4, [r1, +8]
386	ldl.ab	r5, [r1, +8]
387
388#else
389	# error Unknown configuration
390#endif
391
392; NULL byte position is detected and encoded in r6 [0] [9]
393	subl	r10, r2, r8
394	subl	r11, r3, r8
395	subl	r12, r4, r8
396	subl	r7, r5, r8
397
398	bicl	r10, r10, r2
399	bicl	r11, r11, r3
400	bicl	r12, r12, r4
401	bicl	r7, r7, r5
402
403	tstl	r10, r9
404	bset.ne	r6, r6, 4
405
406	tstl	r11, r9
407	bset.ne	r6, r6, 3
408
409	tstl	r12, r9
410	bset.ne	r6, r6, 2
411
412	tstl	r7, r9
413	bset.ne	r6, r6, 1
414
415	brne	r6, 0, @.L_found_in_32B
416
417#if defined (__ARC64_M128__)
418
419	stdl.ab	r2r3, [r13, +16]
420	stdl.ab	r4r5, [r13, +16]
421
422#elif defined (__ARC64_ARCH64__)
423
424	stl.ab	r2, [r13, +8]
425	stl.ab	r3, [r13, +8]
426	stl.ab	r4, [r13, +8]
427	stl.ab	r5, [r13, +8]
428
429#else
430# error Unknown configuration
431#endif
432
433	b	@.L_4_8B_search_src
434
435.L_found_in_32B:
436
437	fls	r6, r6 ; [2]
438
439; Point r1 to first NULL byte containing double word [3]
440	sub3l	r1, r1, r6
441
442;; Store the already loaded data
443
444	; 4 -> 1 to 3 -> 0
445	;subl	r6, r6, 1
446
447; Invert so the biggest branch is at the end, and we dont need to increase
448; block size
449	; 3 -> 0 to 0 -> 3
450	;subl	r6, 3, r6
451
452	; Condense the two subs here
453	rsubl	r6, r6, 4
454
455	asll	r6, r6, 2
456
457; Store double words
458	bi	[r6]
459
460	b.d	@.L_store_lastL64bits
461	MOVP	r11, r2
462	nop
463	nop
464
465	stl.ab	r2, [r13, +8]
466	b.d	@.L_store_lastL64bits
467	MOVP	r11, r3
468	nop
469
470	stl.ab	r2, [r13, +8]
471	stl.ab	r3, [r13, +8]
472	b.d	@.L_store_lastL64bits
473	MOVP	r11, r4
474
475	stl.ab	r2, [r13, +8]
476	stl.ab	r3, [r13, +8]
477	stl.ab	r4, [r13, +8]
478	MOVP	r11, r5
479
480; r11 now contains the data to write
481.L_store_lastL64bits:
482	subl	r10, r11, r8
483	bicl	r10, r10, r11
484
485	andl	r10, r10, r9 ; [5]
486
487	ffsl	r2, r10 ; [6]
488	addl	r2, r2, 1
489
490	xbful 	r2, r2, 0b0111000011 ; [7]
491
492	movl	r3, -1; Bitmask setup
493
494	; If the NULL byte is in byte 3 (starting from the right)
495	; we want to store 8-3 bytes
496	rsubl	r2, r2, 8
497	asl	r2, r2, 3
498
499	; According to the target byte, setup masks
500	lsrl	r3, r3, r2
501	notl	r4, r3
502
503	; Obtain relevant data from destination
504	ldl	r10, [r13]
505
506	; Get which data from dest is not to be overwritten and OR it
507	; with the relevant data to write
508	andl	r3, r3, r11
509	andl	r4, r4, r10
510
511	orl	r3, r3, r4
512
513	j_s.d	[blink]
514	stl.ab	r3, [r13, +8]
515
516
517ENDFUNC (strcat)
518
519#endif
520
521;; This code uses a common technique for NULL byte detection inside a word.
522;; Details on this technique can be found in:
523;; (https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord)
524;
525; In sum, this technique allows for detecting a NULL byte inside any given
526; amount of bits by performing the following operation
527; 		DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080) [0]
528;
529; The code above implements this by setting r8 to a 0x01010101... sequence and
530; r9 to a 0x80808080... sequence of appropriate length
531; As LIMM are 32 bit only, we need to perform MOVHL and ORL [1] operations to
532; have the appropriate 64 bit values in place
533;
534;; Search is done 32 bytes at a time, either with 64 bit loads or 128 bit loads
535;; If a NULL byte is detected, the position of the double word is encoded
536;; in r6, which is then used to adjust r13 to the exact byte
537;
538; r6 is set via bset, which means we can simply use a fls to obtain the first
539; match (or ffs depending on the values in bset) [2].
540; The reason for starting at 1 and not 0 is so r6 encodes how many double
541; words to go back, and it wouldnt make sense to go back 0 (the NULL would be
542; in the next loop iteration).
543;
544; The first step to take is point r13 to the appropriate double word.
545; As the chosen encoded information is how many double words to go back,
546; we can simply multiply r6 by 8 and reduce r13 by that amount [3]
547;
548; Then, we need to place the loaded double word containing the first NULL byte
549; into a "common" register we can operate on later [4].
550;
551; To do this without any jumps, we can shift r6 and perform a conditional mov
552; based on the carry flag value.
553; The order is very important because the NULL byte can appear in several
554; double words, so we want to analyze from last to first.
555;
556; We can ignore the first asr (which would be asr.f 2, as we started r6 on 1)
557; because if r7 isnt the NULL byte, r2 will always be overwritten so we can
558; just decide to start at r7, and overwrite it if needed.
559;
560; Now comes the tricky part. In order to obtain the first NULL byte, we need to
561; understand the NULL byte detection operation. It is explained in depth in the
562; link above but in short, it works by first setting the highest bit of each
563; byte to 1, if the corresponding byte is either 0 or less than 0x80
564; Then, separately, it makes the highest bit of each byte 1, if the byte is
565; less than 0x80. The last step is to and these two values (this operation is
566; simplified with the subl, bicl and tst instructions).
567;
568; This means that the evaluated equation result value [5] has zeros for all non
569; zero bytes, except for the NULL bytes. Therefore, we can simply find the
570; first non zero bit (counting from bit 0) which will be inside the position of
571; the first NULL byte.
572;
573; One thing to note, is that ffs oddly returns 31 if no bit is found, setting
574; the zero flag. As r9 is never all 0s at this stage (would mean there is no
575; NULL byte and we wouldnt be here) we dont need to worry about that. [6]
576;
577; We can then convert the bit position into the last byte position by looking
578; into bits 3 to 5, and shifting 3 bits to the right. This can be combined into
579; a single xbful operation. The bottom 000011 represent shift by 3 and the top
580; 0111 represents the mask (3 to 5 shifted by 3 is 0 to 2). We dont need to worry
581; about the case where ffs does not find a bit, because we know for sure there is
582; at least one NULL byte, and therefore one of the highest bits is set to 1 [7]
583;
584; Finally, we can add the NULL byte position inside the loaded double word to
585; r13 and subtract r0 from r13 to obtain the string size [8]
586;
587; Some operations are re-ordered such that register dependency is reduced,
588; allowing the CPU to run more instructions in parallel [9]
589;
590;
591; Some data was already read, and needs to be stored following the same read
592; order. To do this, we need to make the
593;
594;
595