1/* 2 Copyright (c) 2024, Synopsys, Inc. All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 7 1) Redistributions of source code must retain the above copyright notice, 8 this list of conditions and the following disclaimer. 9 10 2) Redistributions in binary form must reproduce the above copyright notice, 11 this list of conditions and the following disclaimer in the documentation 12 and/or other materials provided with the distribution. 13 14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors 15 may be used to endorse or promote products derived from this software 16 without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 POSSIBILITY OF SUCH DAMAGE. 29*/ 30 31#include <picolibc.h> 32 33#include <sys/asm.h> 34 35#if defined (__ARC64_ARCH64__) 36 37; R0: lhs 38; R1: rhs 39; R2: count 40; ret (R0): 41; - lhs < rhs: <0 42; - lhs = rhs: 0 43; - lhs > rhs: >0 44ENTRY (memcmp) 45 cmpl r2, 64 46 bls.d @.L_compare_1_bytes 47 movl r3, r0 ; "r0" will be used as return value 48 ; If one is curious why the code below looks like the way it does, 49 ; there is a documentation at the end of this file. 50 lsrl r12, r2, 5 ; counter for 32-byte chunks 51 xor r13, r13, r13 ; the mask showing inequal registers 52 ldl.ab r4, [r3, +8] 53 ldl.ab r5, [r1, +8] 54.L_compare_32_bytes: 55 ldl.ab r6, [r3, +8] 56 ldl.ab r7, [r1, +8] 57 ldl.ab r8, [r3, +8] 58 ldl.ab r9, [r1, +8] 59 ldl.ab r10, [r3, +8] 60 ldl.ab r11, [r1, +8] 61 xorl.f 0, r4, r5 62 xor.ne r13, r13, 0b0001 63 xorl.f 0, r6, r7 64 xor.ne r13, r13, 0b0010 65 xorl.f 0, r8, r9 66 xor.ne r13, r13, 0b0100 67 xorl.f 0, r10, r11 68 xor.ne r13, r13, 0b1000 69 brne r13, 0, @.L_unequal_find 70 ldl.ab r4, [r3, +8] 71 dbnz.d r12, @.L_compare_32_bytes 72 ldl.ab r5, [r1, +8] 73 ; Adjusting the pointers because of the extra loads in the end 74 subl r1, r1, 8 75 subl r3, r3, 8 76 bmsk_s r2, r2, 4 ; any remaining bytes to compare 77.L_compare_1_bytes: 78 cmp r2, 0 79 jeq.d [blink] 80 xor_s r0, r0, r0 81 ldb.ab r4, [r3, +1] 82 ldb.ab r5, [r1, +1] 832: 84 sub.f r0, r4, r5 85 jne.d [blink] 86 ldb.ab r4, [r3, +1] 87 dbnz.d r2, @2b 88 ldb.ab r5, [r1, +1] ; this load may read beyond the "count". 89 j_s [blink] 90; At this point, we want to find the _first_ comparison that marked the 91; inequality of "lhs" and "rhs". The rest acts like a multiplexer: 92; 93; if r4 was not equal to r5 --> r1=r4, r2=r5 94; if r6 was not equal to r7 --> r1=r6, r2=r7 95; if r8 was not equal to r9 --> r1=r8, r2=r9 96; if r10 was not equal to r11 --> r1=r10, r2=r11 97; find_different_byte(r1, r2) 98; 99; About the "bi [n]" (branch index) instruction: This instruction alters 100; next PC (program counter): 101; 102; next_pc = current_pc + n*4 n*4 is the same as n<<2 103; 104; In other words, it tells the processor to execute the n'th instruction 105; from where we are (assuming all the next instructions are 4 bytes long). 106; 107; We used this to our benefit. We made each "case" (unequal_r4r5, 108; unequal_r5r6, ...) 16 bytes long (power of 2) and fed "bi" an index 109; that is already multiplied by 4 (asl r13, r13, 2). This translates 110; into "bi [n]" jumping to 16-bytes slots. The last slot we did not 111; make 16 bytes long with "nop" because we don't need to address after 112; it. 113.L_unequal_find: 114 ffs r13, r13 115 asl r13, r13, 2 116 bi [r13] 117.L_unequal_r4r5: 118 movl r1, r4 119 b.d @.L_diff_byte_in_regs 120 movl r2, r5 121 nop 122.L_unequal_r6r7: 123 movl r1, r6 124 b.d @.L_diff_byte_in_regs 125 movl r2, r7 126 nop 127.L_unequal_r8r9: 128 movl r1, r8 129 b.d @.L_diff_byte_in_regs 130 movl r2, r9 131 nop 132.L_unequal_r10r11: 133 movl r1, r10 134 movl r2, r11 135 ; fall-through 136; If we're here, that means the two operands are not equal. 137; 1) First we have to get a mask of their inequality through "xor" 138; 2) Then, find the first bit position that they're different: "ffs" 139; 3) Depending on the bit position, we want the whole byte containing 140; that bit, in both operands, to become the very first byte (least 141; significant byte), so that we can subtract one from another. 142; Below is an illustration of bit positions and how much we should 143; shift the numbers right: 144; bit position range : (in binary) | shift right by : (in binary) 145; -------------------+-------------------+----------------+------------ 146; [ 0, 7] : (000000 - 000111) | lsr 0 : 000000 147; [ 8,15] : (001000 - 001111) | lsr 8 : 001000 148; [16,23] : (010000 - 010111) | lsr 16 : 010000 149; [24,31] : (011000 - 011111) | lsr 24 : 011000 150; ... : ... | ... : ... 151; [56,63] : (111000 - 111111) | lsr 56 : 111000 152; We need to ignore the least 3 bits of "position" to get "shift right" 153; amount: "and 0x38, ..." 154; 4) When the bytes are positioned at byte #0, mask out the rest of the 155; bytes and subtract the two operands: lhs - rhs 156.L_diff_byte_in_regs: 157 xorl r0, r1, r2 ; (1) 158 ffsl r0, r0 ; (2) 159 and r0, r0, 0x38 ; (3) 160 lsrl r1, r1, r0 ; (3) 161 lsrl r2, r2, r0 ; (3) 162 bmsk_s r1, r1, 7 ; (4) 163 bmsk_s r2, r2, 7 ; (4) 164 j_s.d [blink] 165 subl r0, r1, r2 ; (4) 166ENDFUNC (memcmp) 167 168; __ARC64_ARCH64__ 169#endif 170 171; The loop at the heart of the "memcmp" function follows some specific 172; logic and has gone through a few optimisation filters. Knowing them 173; will help understand the code better. 174; 175; The comparison logic 176; -------------------- 177; In each loop, we compare 32 bytes of data from "lhs" and "rhs". Those 178; comparisons takes place by using 8 sets of registers: 179; 180; r4 == r5 xor.f 0, r4, r5 lhs[i+0] == rhs[i+0] 181; r6 == r7 xor.f 0, r6, r7 lhs[i+8] == rhs[i+8] 182; r8 == r9 xor.f 0, r8, r9 lhs[i+16] == rhs[i+16] 183; r10 == r11 xor.f 0, r10, r11 lhs[i+24] == rhs[i+32] 184; 185; The idea is to set a corresponding bit in r13 register for each 186; comparison that fails. The relation between the bits and the 187; comparisons are: 188; 189; r13[0..63] = 0 190; r13[0] = 1 if r4 != r5 191; r13[1] = 1 if r6 != r7 192; r13[2] = 1 if r8 != r9 193; r13[3] = 1 if r10 != r11 194; 195; If r13 remains 0, the next possible iteration of the loop begins. 196; If it is not 0 anymore, the algorithm will be interested in the 197; lowest bit that is set to 1. That is achieved by the "ffs" 198; (find first set) instruction. 199; 200; The loop transformation 201; ----------------------- 202; 1) At first, the loop looks like below: 203; 204; .loop 205; ldl.ab r4, [r3, +8] 206; ldl.ab r5, [r1, +8] 207; ... 208; ldl.ab r10, [r3, +8] 209; ldl.ab r11, [r1, +8] 210; xorl.f 0, r4, r5 211; xor.ne r13, r13, 0b0001 212; ... 213; xorl.f 0, r10, r11 214; xor.ne r13, r13, 0b1000 215; brne r13, 0, @.unequal_find 216; dbnz r12, @.loop 217; 218; 2) "dbnz" instruction has a delay slot. To make the code more 219; efficient, we can bring the first 2 instructions of the loop 220; to the end (they will be executed just before the next iteration 221; begins). To make the logic of the program sound, those 2 222; instructions need to be duplicated before the loop start as well: 223; 224; ldl.ab r4, [r3, +8] 225; ldl.ab r5, [r1, +8] 226; .loop 227; ldl.ab r6, [r3, +8] 228; ldl.ab r7, [r1, +8] 229; ... 230; ldl.ab r10, [r3, +8] 231; ldl.ab r11, [r1, +8] 232; xorl.f 0, r4, r5 233; xor.ne r13, r13, 0b0001 234; ... 235; xorl.f 0, r10, r11 236; xor.ne r13, r13, 0b1000 237; brne r13, 0, @.unequal_find 238; ldl.ab r4, [r3, +8] 239; dbnz.d r12, @.loop 240; ldl.ab r5, [r1, +8] 241; 242; There is one more loose end to take care of: At the last iteration 243; of the loop, there is an extra load into r4 and r5 registers while 244; incrementing the pointers (r3 and r1). We have to correct for that 245; after the loop: 246; 247; .loop: 248; .. 249; brne r13, 0, @.unequal_find 250; ldl.ab r4, [r3, +8] 251; dbnz.d r12, @.loop 252; ldl.ab r5, [r1, +8] 253; subl r1, r1, 8 254; subl r3, r3, 8 255; 256; One last remark about NOT filling the delay slot of "brne" with 257; "ldl.ab r4, ...". If the branch is taken, the rest of code that 258; is responsible for finding the differentiating bytes relies that 259; all 8 registers hold the comparison data of the loop. Putting 260; "ldl.ab r4, ..." into the delay slot of "brne ..." would clobber 261; the "r4" register: 262; 263; .loop: 264; .. 265; brne.d r13, 0, @.unequal_find --> this branch might be taken 266; ldl.ab r4, [r3, +8] --> clobbers r4 267; dbnz.d r12, @.loop 268; ldl.ab r5, [r1, +8] 269; 270; Having "ldl.ab r4, ..." between "brne" and "dbnz" as two control flow 271; altering instructions is good enough. 272