/*
   Copyright (c) 2024, Synopsys, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

   1) Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

   2) Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.
*/

#include <picolibc.h>

#include <sys/asm.h>

#if defined (__ARC64_ARCH64__)

; R0: lhs
; R1: rhs
; R2: count
; ret (R0):
;   - lhs < rhs: <0
;   - lhs = rhs:  0
;   - lhs > rhs: >0
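;
; In other words, this implements the standard C prototype ("lhs", "rhs"
; and "count" are simply the names used throughout this file):
;
;   int memcmp (const void *lhs, const void *rhs, size_t count);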
ENTRY (memcmp)
	cmpl	  r2, 64
	bls.d	  @.L_compare_1_bytes
	movl	  r3, r0	; "r0" will be used as the return value
	; If you are curious why the code below looks the way it does,
	; there is documentation at the end of this file.
	lsrl	  r12, r2, 5	; counter for 32-byte chunks
	xor	  r13, r13, r13	; the mask marking unequal register pairs
	ldl.ab	  r4,  [r3, +8]
	ldl.ab	  r5,  [r1, +8]
.L_compare_32_bytes:
	ldl.ab	  r6,  [r3, +8]
	ldl.ab	  r7,  [r1, +8]
	ldl.ab	  r8,  [r3, +8]
	ldl.ab	  r9,  [r1, +8]
	ldl.ab	  r10, [r3, +8]
	ldl.ab	  r11, [r1, +8]
	xorl.f	  0, r4, r5
	xor.ne	  r13, r13, 0b0001
	xorl.f	  0, r6, r7
	xor.ne	  r13, r13, 0b0010
	xorl.f	  0, r8, r9
	xor.ne	  r13, r13, 0b0100
	xorl.f	  0, r10, r11
	xor.ne	  r13, r13, 0b1000
	brne	  r13, 0, @.L_unequal_find
	ldl.ab	  r4,  [r3, +8]
	dbnz.d	  r12, @.L_compare_32_bytes
	ldl.ab	  r5,  [r1, +8]
	; Adjusting the pointers because of the extra loads at the end of the loop
	subl	  r1, r1, 8
	subl	  r3, r3, 8
	bmsk_s	  r2, r2, 4	; any remaining bytes (count % 32) to compare
.L_compare_1_bytes:
	cmp	  r2, 0
	jeq.d	  [blink]
	xor_s     r0, r0, r0
	ldb.ab	  r4, [r3, +1]
	ldb.ab	  r5, [r1, +1]
2:
	sub.f	  r0, r4, r5
	jne.d	  [blink]
	ldb.ab	  r4, [r3, +1]
	dbnz.d	  r2, @2b
	ldb.ab	  r5, [r1, +1]	; this load may read beyond the "count".
	j_s	  [blink]
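; Ignoring the delay-slot scheduling, the byte-wise tail above behaves
; like this C sketch ("tail_compare" is a name made up for illustration;
; it is not part of the code):
;
;   #include <stddef.h>
;
;   static int tail_compare (const unsigned char *lhs,
;                            const unsigned char *rhs, size_t count)
;   {
;     while (count--)
;       {
;         int diff = *lhs++ - *rhs++;
;         if (diff != 0)
;           return diff;
;       }
;     return 0;
;   }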
; At this point, we want to find the _first_ comparison that marked the
; inequality of "lhs" and "rhs".  The rest acts like a multiplexer:
;
; if r4  was not equal to r5  --> r1=r4,  r2=r5
; if r6  was not equal to r7  --> r1=r6,  r2=r7
; if r8  was not equal to r9  --> r1=r8,  r2=r9
; if r10 was not equal to r11 --> r1=r10, r2=r11
; find_different_byte(r1, r2)
;
; About the "bi [n]" (branch index) instruction:  This instruction alters
; the next PC (program counter):
;
; next_pc = current_pc + n*4                n*4 is the same as n<<2
;
; In other words, it tells the processor to execute the n'th instruction
; from where we are (assuming all the following instructions are 4 bytes
; long).
;
; We used this to our benefit.  We made each "case" (unequal_r4r5,
; unequal_r6r7, ...) 16 bytes long (a power of 2) and fed "bi" an index
; that is already multiplied by 4 (asl r13, r13, 2).  Together with the
; implicit "n*4" of "bi", this makes the dispatch advance in 16-byte
; slots.  We did not pad the last slot to 16 bytes with a "nop", because
; nothing after it needs to be addressed.
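;
; Roughly, the dispatch below behaves like the following C switch (a
; sketch only; "diff_byte" is the C model given further down, right
; before .L_diff_byte_in_regs, __builtin_ctzll is the GCC/Clang
; counterpart of "ffs", and the mask is known to be non-zero here):
;
;   #include <stdint.h>
;
;   static int unequal_find (uint64_t mask,
;                            uint64_t r4, uint64_t r5, uint64_t r6,
;                            uint64_t r7, uint64_t r8, uint64_t r9,
;                            uint64_t r10, uint64_t r11)
;   {
;     switch (__builtin_ctzll (mask))         /* "ffs r13"          */
;       {
;       case 0:  return diff_byte (r4,  r5);  /* .L_unequal_r4r5    */
;       case 1:  return diff_byte (r6,  r7);  /* .L_unequal_r6r7    */
;       case 2:  return diff_byte (r8,  r9);  /* .L_unequal_r8r9    */
;       default: return diff_byte (r10, r11); /* .L_unequal_r10r11  */
;       }
;   }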
.L_unequal_find:
	ffs	  r13, r13
	asl	  r13, r13, 2
	bi	  [r13]
.L_unequal_r4r5:
	movl	  r1, r4
	b.d	  @.L_diff_byte_in_regs
	movl	  r2, r5
	nop
.L_unequal_r6r7:
	movl	  r1, r6
	b.d	  @.L_diff_byte_in_regs
	movl	  r2, r7
	nop
.L_unequal_r8r9:
	movl	  r1, r8
	b.d	  @.L_diff_byte_in_regs
	movl	  r2, r9
	nop
.L_unequal_r10r11:
	movl	  r1, r10
	movl	  r2, r11
	; fall-through
; If we are here, it means the two operands are not equal.
; 1) First, we have to get a mask of their inequality through "xor"
; 2) Then, find the first bit position where they differ: "ffs"
; 3) Depending on the bit position, we want the whole byte containing
;    that bit, in both operands, to become the very first byte (least
;    significant byte), so that we can subtract one from the other.
;    Below is an illustration of bit positions and how much we should
;    shift the numbers right:
;    bit position range : (in binary)       | shift right by : (in binary)
;    -------------------+-------------------+----------------+------------
;    [ 0, 7]            : (000000 - 000111) | lsr  0         : 000000
;    [ 8,15]            : (001000 - 001111) | lsr  8         : 001000
;    [16,23]            : (010000 - 010111) | lsr 16         : 010000
;    [24,31]            : (011000 - 011111) | lsr 24         : 011000
;    ...                : ...               | ...            : ...
;    [56,63]            : (111000 - 111111) | lsr 56         : 111000
;    We need to ignore the lowest 3 bits of "position" to get the
;    "shift right" amount: "and 0x38, ..."
; 4) When the bytes are positioned at byte #0, mask out the rest of the
;    bytes and subtract the two operands: lhs - rhs
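;
; A minimal C sketch of the four steps above (little-endian loads are
; assumed, which is why the lowest differing bit lands in the first
; differing byte; "diff_byte" is a name made up for illustration, and
; __builtin_ctzll is the GCC/Clang counterpart of "ffs"):
;
;   #include <stdint.h>
;
;   static int diff_byte (uint64_t lhs64, uint64_t rhs64)
;   {
;     uint64_t diff  = lhs64 ^ rhs64;            /* (1)                 */
;     unsigned pos   = __builtin_ctzll (diff);   /* (2) first set bit   */
;     unsigned shift = pos & 0x38;               /* (3) round to a byte */
;     return (int) ((lhs64 >> shift) & 0xff)     /* (4) lhs - rhs       */
;          - (int) ((rhs64 >> shift) & 0xff);
;   }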
.L_diff_byte_in_regs:
	xorl	  r0, r1, r2	; (1)
	ffsl	  r0, r0	; (2)
	and	  r0, r0, 0x38	; (3)
	lsrl	  r1, r1, r0	; (3)
	lsrl	  r2, r2, r0	; (3)
	bmsk_s	  r1, r1, 7	; (4)
	bmsk_s	  r2, r2, 7	; (4)
	j_s.d	  [blink]
	subl	  r0, r1, r2	; (4)
ENDFUNC (memcmp)

; __ARC64_ARCH64__
#endif

; The loop at the heart of the "memcmp" function follows some specific
; logic and has gone through a few optimisation passes.  Knowing them
; will help in understanding the code.
;
; The comparison logic
; --------------------
; In each iteration, we compare 32 bytes of data from "lhs" and "rhs".
; The comparisons take place using 4 pairs of registers:
;
; r4  == r5        xorl.f 0, r4,  r5      lhs[i+0]  == rhs[i+0]
; r6  == r7        xorl.f 0, r6,  r7      lhs[i+8]  == rhs[i+8]
; r8  == r9        xorl.f 0, r8,  r9      lhs[i+16] == rhs[i+16]
; r10 == r11       xorl.f 0, r10, r11     lhs[i+24] == rhs[i+24]
;
; The idea is to set a corresponding bit in the r13 register for each
; comparison that fails.  The relation between the bits and the
; comparisons is:
;
; r13[0..63] = 0
; r13[0]     = 1 if r4  != r5
; r13[1]     = 1 if r6  != r7
; r13[2]     = 1 if r8  != r9
; r13[3]     = 1 if r10 != r11
;
; If r13 remains 0, the next iteration of the loop (if any) begins.
; If it is not 0 anymore, the algorithm is interested in the lowest
; bit that is set to 1.  That is achieved by the "ffs"
; (find first set) instruction.
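;
; As a rough C model of one such 32-byte comparison (a sketch only:
; "ld64" and "compare_32_bytes" are names made up for illustration, and
; memcpy stands in for the 64-bit loads):
;
;   #include <stdint.h>
;   #include <string.h>
;
;   static uint64_t ld64 (const unsigned char *p)
;   {
;     uint64_t v;
;     memcpy (&v, p, sizeof (v));
;     return v;
;   }
;
;   static unsigned compare_32_bytes (const unsigned char *lhs,
;                                     const unsigned char *rhs)
;   {
;     unsigned mask = 0;
;     if (ld64 (lhs +  0) != ld64 (rhs +  0)) mask |= 0x1;  /* 0b0001 */
;     if (ld64 (lhs +  8) != ld64 (rhs +  8)) mask |= 0x2;  /* 0b0010 */
;     if (ld64 (lhs + 16) != ld64 (rhs + 16)) mask |= 0x4;  /* 0b0100 */
;     if (ld64 (lhs + 24) != ld64 (rhs + 24)) mask |= 0x8;  /* 0b1000 */
;     return mask;            /* 0 means "all 32 bytes are equal" */
;   }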
;
; The loop transformation
; -----------------------
; 1) At first, the loop looks like this:
;
;    .loop
;      ldl.ab  r4,  [r3, +8]
;      ldl.ab  r5,  [r1, +8]
;      ...
;      ldl.ab  r10, [r3, +8]
;      ldl.ab  r11, [r1, +8]
;      xorl.f  0, r4, r5
;      xor.ne  r13, r13, 0b0001
;      ...
;      xorl.f  0, r10, r11
;      xor.ne  r13, r13, 0b1000
;      brne    r13, 0, @.unequal_find
;      dbnz    r12, @.loop
;
; 2) The "dbnz" instruction has a delay slot.  To make the code more
;    efficient, we can move the first 2 instructions of the loop
;    to the end (they will be executed just before the next iteration
;    begins).  To keep the logic of the program sound, those 2
;    instructions need to be duplicated before the loop starts as well:
;
;      ldl.ab  r4,  [r3, +8]
;      ldl.ab  r5,  [r1, +8]
;    .loop
;      ldl.ab  r6, [r3, +8]
;      ldl.ab  r7, [r1, +8]
;      ...
;      ldl.ab  r10, [r3, +8]
;      ldl.ab  r11, [r1, +8]
;      xorl.f  0, r4, r5
;      xor.ne  r13, r13, 0b0001
;      ...
;      xorl.f  0, r10, r11
;      xor.ne  r13, r13, 0b1000
;      brne    r13, 0, @.unequal_find
;      ldl.ab  r4,  [r3, +8]
;      dbnz.d  r12, @.loop
;      ldl.ab  r5,  [r1, +8]
;
;    There is one more loose end to take care of:  In the last iteration
;    of the loop, there is an extra load into the r4 and r5 registers,
;    which also post-increments the pointers (r3 and r1).  We have to
;    correct for that after the loop:
;
;    .loop:
;      ..
;      brne    r13, 0, @.unequal_find
;      ldl.ab  r4,  [r3, +8]
;      dbnz.d  r12, @.loop
;      ldl.ab  r5,  [r1, +8]
;      subl    r1, r1, 8
;      subl    r3, r3, 8
;
;    One last remark about NOT filling the delay slot of "brne" with
;    "ldl.ab r4, ...":  If the branch is taken, the rest of the code,
;    which is responsible for finding the differentiating bytes, relies
;    on all 8 registers holding the comparison data of the loop.
;    Putting "ldl.ab r4, ..." into the delay slot of "brne ..." would
;    clobber the "r4" register:
;
;    .loop:
;      ..
;      brne.d  r13, 0, @.unequal_find    --> this branch might be taken
;      ldl.ab  r4,  [r3, +8]             --> clobbers r4
;      dbnz.d  r12, @.loop
;      ldl.ab  r5,  [r1, +8]
;
;    Placing "ldl.ab r4, ..." between the two control-flow-altering
;    instructions "brne" and "dbnz" is good enough.
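;
; Putting it all together, and ignoring the delay-slot scheduling, the
; whole function roughly follows the C sketch below.  It reuses the
; made-up helpers from the comments above ("ld64", "compare_32_bytes",
; "diff_byte", "tail_compare") and is only meant as a reading aid:
;
;   static int memcmp_model (const unsigned char *lhs,
;                            const unsigned char *rhs, size_t count)
;   {
;     if (count > 64)
;       {
;         size_t chunks = count >> 5;                /* lsrl r12, r2, 5  */
;         while (chunks--)
;           {
;             unsigned mask = compare_32_bytes (lhs, rhs);
;             if (mask != 0)
;               {
;                 unsigned i = __builtin_ctz (mask); /* ffs r13          */
;                 return diff_byte (ld64 (lhs + 8 * i),
;                                   ld64 (rhs + 8 * i));
;               }
;             lhs += 32;
;             rhs += 32;
;           }
;         count &= 0x1f;                             /* bmsk_s r2, r2, 4 */
;       }
;     return tail_compare (lhs, rhs, count);
;   }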