1/* 2 Copyright (c) 2024, Synopsys, Inc. All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 7 1) Redistributions of source code must retain the above copyright notice, 8 this list of conditions and the following disclaimer. 9 10 2) Redistributions in binary form must reproduce the above copyright notice, 11 this list of conditions and the following disclaimer in the documentation 12 and/or other materials provided with the distribution. 13 14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors 15 may be used to endorse or promote products derived from this software 16 without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 POSSIBILITY OF SUCH DAMAGE. 29*/ 30 31#include <picolibc.h> 32 33#include <sys/asm.h> 34 35; r0 void* ptr 36; r1 int ch 37; r2 size_t count 38 39#if defined (__ARC64_ARCH32__) 40 41ENTRY (memchr) 42 LSRP.f 0, r2, 4 ; counter for 16-byte chunks 43 beq.d @.L_start_1_byte_search 44 45 ; Filter for 1 byte 46 bmsk r1, r1, 7 47 lsl8 r9, r1 48 49 or r9, r9, r1 50 vpack2hl r1, r9, r9 51 52 ; r1 is now setup with the special 4 byte repetition of the target byte 53 ; We use r1 because we dont have any more registers free inside the main loop 54 ; r9 can be repurposed 55 mov r8, NULL_32DT_1 56 ror r9, r8 57 58 xor r3, r3, r3 59 60.L_search_16_bytes: 61 62#if defined (__ARC64_LL64__) 63 64 ldd.ab r4r5, [r0, +8] 65 ldd.ab r6r7, [r0, +8] 66 67#else 68 69 ld.ab r4, [r0, +4] 70 ld.ab r5, [r0, +4] 71 ld.ab r6, [r0, +4] 72 ld.ab r7, [r0, +4] 73 74#endif 75 76 xor r4, r4, r1 77 xor r5, r5, r1 78 xor r6, r6, r1 79 xor r7, r7, r1 80 81 sub r10, r4, r8 82 sub r11, r5, r8 83 sub r12, r6, r8 84 sub r13, r7, r8 85 86 bic r10, r10, r4 87 bic r11, r11, r5 88 bic r12, r12, r6 89 bic r13, r13, r7 90 91 tst r10, r9 92 bset.ne r3, r3, 4 93 94 tst r11, r9 95 bset.ne r3, r3, 3 96 97 tst r12, r9 98 bset.ne r3, r3, 2 99 100 tst r13, r9 101 bset.ne r3, r3, 1 102 103 ; Break if found 104 brne.d r3, 0, @.L_found_in_16B 105 106 ; Keep going we have more 16 byte chunks 107 sub r2, r2, 16 108 109 brge r2, 16, @.L_search_16_bytes 110 111 ; Reset byte repetition of r1 to 1 single byte 112 bmsk r1, r1, 7 113 114.L_start_1_byte_search: 115 ; Check if r2 is 0 116 breq.d r2, 0, @.L_byte_not_found 117 ldb.ab r10, [r0, +1] 118 119.L_search_1_byte: 120 121 breq r10, r1, @.L_found_byte 122 123 dbnz.d r2, @.L_search_1_byte 124 ldb.ab r10, [r0, +1] 125 126; Byte not found 127.L_byte_not_found: 128 j.d [blink] 129 MOVP r0, 0 130 131.L_found_byte: 132 j_s.d [blink] 133 SUBP r0, r0, 1 134 135.L_found_in_16B: 136 137 fls r5, r3 ; [2] 138 139; Select appropriate register to analyze [4] 140 mov r2, r13 141 142; Point r13 to first NULL byte containing double word [3] 143 sub2 r0, r0, r5 144 145 146 asr.f r3, r3, 3 147 mov.c r2, r12 148 149 asr.f r3, r3, 1 150 mov.c r2, r11 151 152 asr.f r3, r3, 1 153 mov.c r2, r10 154 155 and r2, r2, r9 ; [5] 156 157 ffs r2, r2 ; [6] 158 159 xbfu r2, r2, 0b0111000011 ; [7] 160 161 j.d [blink] 162 add r0, r0, r2 ; [8] 163 164ENDFUNC (memchr) 165 166#else 167 168ENTRY (memchr) 169 lsrl.f 0, r2, 5 ; counter for 32-byte chunks 170 beq.d @.L_start_1_byte_search 171 172 ; Filter for 1 byte 173 bmsk r1, r1, 7 174 lsl8 r9, r1 175 176 or r9, r9, r1 177 178 vpack2hl r1, r9, r9 179 vpack2wl r1, r1, r1 180 181 ; r1 is now setup with the special 4 byte repetition of the target byte 182 ; We use r1 because we dont have any more registers free inside the main loop 183 ; r9 can be repurposed 184 vpack2wl r8, NULL_32DT_1, NULL_32DT_1 185 asll r9, r8, 7 186 187 xorl r3, r3, r3 188 189.L_search_32_bytes: 190 191; Using 128-bit memory operations 192#if defined (__ARC64_M128__) 193 194 lddl.ab r4r5, [r0, +16] 195 lddl.ab r6r7, [r0, +16] 196 197; The 64-bit crunching implementation. 198#elif defined (__ARC64_ARCH64__) 199 200 ldl.ab r4, [r0, +8] 201 ldl.ab r5, [r0, +8] 202 ldl.ab r6, [r0, +8] 203 ldl.ab r7, [r0, +8] 204 205#else 206 # error Unknown configuration 207#endif 208 209 xorl r4, r4, r1 210 xorl r5, r5, r1 211 xorl r6, r6, r1 212 xorl r7, r7, r1 213 214 subl r10, r4, r8 215 subl r11, r5, r8 216 subl r12, r6, r8 217 subl r13, r7, r8 218 219 bicl r10, r10, r4 220 bicl r11, r11, r5 221 bicl r12, r12, r6 222 bicl r13, r13, r7 223 224 tstl r10, r9 225 bset.ne r3, r3, 4 226 227 tstl r11, r9 228 bset.ne r3, r3, 3 229 230 tstl r12, r9 231 bset.ne r3, r3, 2 232 233 tstl r13, r9 234 bset.ne r3, r3, 1 235 236 ; Break if found 237 brne.d r3, 0, @.L_found_in_32B 238 239 ; Keep going we have more 16 byte chunks 240 subl r2, r2, 32 241 brge r2, 32, @.L_search_32_bytes 242 243 ; Reset byte repetition of r1 to 1 single byte 244 bmskl r1, r1, 7 245 246.L_start_1_byte_search: 247 ; Check if r2 is 0 248 breq.d r2, 0, @.L_byte_not_found 249 ldb.ab r10, [r0, +1] 250 251.L_search_1_byte: 252 253 breq r10, r1, @.L_found_byte 254 255 dbnz.d r2, @.L_search_1_byte 256 ldb.ab r10, [r0, +1] 257 258; Byte not found 259.L_byte_not_found: 260 j.d [blink] 261 movl r0, 0 262 263.L_found_byte: 264 j_s.d [blink] 265 subl r0, r0, 1 266 267.L_found_in_32B: 268 269 fls r5, r3 ; [2] 270 271; Select appropriate register to analyze [4] 272 movl r2, r13 273 274; Point r13 to first NULL byte containing double word [3] 275 sub3l r0, r0, r5 276 277 asr.f r3, r3, 3 278 movl.c r2, r12 279 280 asr.f r3, r3, 1 281 movl.c r2, r11 282 283 asr.f r3, r3, 1 284 movl.c r2, r10 285 286 andl r2, r2, r9 ; [5] 287 288 ffsl r2, r2 ; [6] 289 290 xbful r2, r2, 0b0111000011 ; [7] 291 292 j.d [blink] 293 addl r0, r0, r2 ; [8] 294 295ENDFUNC (memchr) 296#endif 297 298;; This code uses a common technique for NULL byte detection inside a word. 299;; Details on this technique can be found in: 300;; (https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord) 301; 302; In sum, this technique allows for detecting a NULL byte inside any given 303; amount of bits by performing the following operation 304; DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080) [0] 305; 306; The code above implements this by setting r8 to a 0x01010101... sequence and 307; r9 to a 0x80808080... sequence of appropriate length 308; As LIMM are 32 bit only, we need to perform MOVHL and ORL [1] operations to 309; have the appropriate 64 bit values in place 310; 311; As we want a specific byte and not a NULL byte, we create in r1 a constant 312; that is made up of the target byte, on each byte position, that we xor with 313; the loaded data to force a NULL byte only if the target byte is present. 314; After that we can use the technique directly 315; 316;; Search is done 32 bytes at a time, either with 64 bit loads or 128 bit loads 317;; If the target byte is detected, the position of the double word is encoded 318;; in r3, which is eventually used to adjust r0 319; 320; r3 is set via bset, which means we can simply use a fls to obtain the first 321; match (or ffs depending on the values in bset) [2]. 322; The reason for starting at 1 and not 0 is so r3 encodes how many double 323; words to go back, and it wouldnt make sense to go back 0 (the byte would be 324; in the next loop iteration). 325; 326; The first step to take is point r0 to the appropriate double word. 327; As the chosen encoded information is how many double words to go back, 328; we can simply multiply r3 by 8 and reduce r0 by that amount [3] 329; 330; Then, we need to place the loaded double word containing the first target byte 331; found, into a "common" register we can operate on later [4]. 332; 333; To do this without any jumps, we can shift r3 and perform a conditional mov 334; based on the carry flag value. 335; The order is very important because the byte can appear in several double 336; words, so we want to analyze from last to first. 337; 338; We can ignore the first asr (which would be asr.f 2, as we started r3 on 1) 339; because if r13 isnt the target byte, r2 will always be overwritten so we can 340; just decide to start at r7, and overwrite it if needed. 341; 342; Now comes the tricky part. In order to obtain the first target byte, we need 343; to understand the NULL byte detection operation. It is explained in depth in 344; the link above but in short, it works by first setting the highest bit of each 345; byte to 1, if the corresponding byte is either 0 or more than 0x80 346; Then, separately, it makes the highest bit of each byte 1, if the byte is 347; less than 0x80. The last step is to AND these two values (this operation is 348; simplified with the SUB, BIC and TST instructions). 349; 350; This means that the evaluated equation result value [5] has zeros for all non 351; zero bytes, except for the NULL bytes (which are the target bytes after the 352; xor). Therefore, we can simply find the first non zero bit (counting from bit 353; 0) which will be inside the position of the first NULL byte. 354; 355; One thing to note, is that ffs oddly returns 31 if no bit is found, setting 356; the zero flag. As r9 is never all 0s at this stage (would mean there is no 357; NULL byte and we wouldnt be here) we dont need to worry about that. [6] 358; 359; We can then convert the bit position into the last byte position by looking 360; into bits 3 to 5, and shifting 3 bits to the right. This can be combined into 361; a single xbful operation. The bottom 000011 represent shift by 3 and the top 362; 0111 represents the mask (3 to 5 shifted by 3 is 0 to 2). We dont need to 363; worry about the case where ffs does not find a bit, because we know for sure 364; there is at least one NULL byte, and therefore one of the highest bits is set 365; to 1 [7] 366; 367; Finally, we can add the NULL/target byte position inside the loaded double 368; word to r0 to obtain the bytes absolute position [8] 369; 370; 371; Some operations are re-ordered such that register dependency is reduced, 372; allowing the CPU to run more instructions in parallel 373; 374