1/* 2 Copyright (c) 2024, Synopsys, Inc. All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 7 1) Redistributions of source code must retain the above copyright notice, 8 this list of conditions and the following disclaimer. 9 10 2) Redistributions in binary form must reproduce the above copyright notice, 11 this list of conditions and the following disclaimer in the documentation 12 and/or other materials provided with the distribution. 13 14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors 15 may be used to endorse or promote products derived from this software 16 without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 POSSIBILITY OF SUCH DAMAGE. 29*/ 30 31#include <picolibc.h> 32 33#include <sys/asm.h> 34 35; Code Brief (more info at the bottom): 36; Searches the provided string, 32 bytes at a time, using 128 bit loads 37; Finds the NULL bytes inside the loaded data 38; Analyzes the first NULL byte containing double word and calculates 39; size appropriately 40; 41; R0 const char* ptr (string to measure) 42; ret (R0): 43; - unsigned (string size) 44; 45 46#if defined (__ARC64_ARCH32__) 47 48ENTRY (strlen) 49 50; Preserve r0 for size calculation when returning 51 mov r13, r0 52 xor r12, r12, r12 53 54; Setup byte detector (more information bellow) [1] 55 mov r8, NULL_32DT_1 56; Set r9 as a copy of r8 for vectorized sub 57 mov r9, r8 58 59 asl r1, r8, 7 60 61.L_4_4B_search: 62 63#if defined (__ARC64_LL64__) 64 65 ldd.ab r2r3, [r13, +8] 66 ldd.ab r4r5, [r13, +8] 67 68#else 69 70 ld.ab r2, [r13, +4] 71 ld.ab r3, [r13, +4] 72 ld.ab r4, [r13, +4] 73 ld.ab r5, [r13, +4] 74 75#endif 76 77; NULL byte position is detected and encoded in r12 [0] [9] 78 79 vsub2 r10, r2, r8 80 vsub2 r6, r4, r8 81 82 bic r10, r10, r2 83 bic r11, r11, r3 84 bic r6, r6, r4 85 bic r7, r7, r5 86 87 tst r10, r1 88 bset.ne r12, r12, 4 89 90 tst r11, r1 91 bset.ne r12, r12, 3 92 93 tst r6, r1 94 bset.ne r12, r12, 2 95 96 tst r7, r1 97 bset.ne r12, r12, 1 98 99 breq.d r12, 0, @.L_4_4B_search 100 101 fls r5, r12 ; [2] 102 103; Point r13 to first NULL byte containing double word [3] 104 sub2 r13, r13, r5 105 106; Select appropriate register to analyze [4] 107 mov r2, r7 108 109 asr.f r12, r12, 3 110 mov.c r2, r6 111 112 asr.f r12, r12, 1 113 mov.c r2, r11 114 115 asr.f r12, r12, 1 116 mov.c r2, r10 117 118; Point r13 to first NULL byte in selected double word 119.L_fix_r13: 120 and r1, r2, r1 ; [5] 121 122 ffs r1, r1 ; [6] 123 124 xbfu r1, r1, 0b0111000011 ; [7] 125 126 add r13, r13, r1 ; [8] 127 128 j_s.d [blink] 129 sub r0, r13, r0 130 131 132ENDFUNC (strlen) 133 134#else 135 136ENTRY (strlen) 137 138; Preserve r0 for size calculation when returning 139 movl r13, r0 140 xor r12, r12, r12 141 142; Setup byte detector (more information bellow) [1] 143 vpack2wl r8, NULL_32DT_1, NULL_32DT_1 144 145 asll r1, r8, 7 146 147.L_4_8B_search: 148 149; Using 128-bit memory operations 150#if defined (__ARC64_M128__) 151 152 lddl.ab r2r3, [r13, +16] 153 lddl.ab r4r5, [r13, +16] 154 155; The 64-bit crunching implementation. 156#elif defined (__ARC64_ARCH64__) 157 158 ldl.ab r2, [r13, +8] 159 ldl.ab r3, [r13, +8] 160 ldl.ab r4, [r13, +8] 161 ldl.ab r5, [r13, +8] 162 163#else 164 # error Unknown configuration 165#endif 166 167; NULL byte position is detected and encoded in r6 [0] [9] 168 subl r10, r2, r8 169 subl r11, r3, r8 170 subl r6, r4, r8 171 subl r7, r5, r8 172 173 bicl r10, r10, r2 174 bicl r11, r11, r3 175 bicl r6, r6, r4 176 bicl r7, r7, r5 177 178 tstl r10, r1 179 bset.ne r12, r12, 4 180 181 tstl r11, r1 182 bset.ne r12, r12, 3 183 184 tstl r6, r1 185 bset.ne r12, r12, 2 186 187 tstl r7, r1 188 bset.ne r12, r12, 1 189 190 breq.d r12, 0, @.L_4_8B_search 191 192 flsl r5, r12 ; [2] 193 194; Point r13 to first NULL byte containing double word [3] 195 sub3l r13, r13, r5 196 197; Select appropriate register to analyze [4] 198 movl r2, r7 199 200 asr.f r12, r12, 3 201 movl.c r2, r6 202 203 asr.f r12, r12, 1 204 movl.c r2, r11 205 206 asr.f r12, r12, 1 207 movl.c r2, r10 208 209; Point r13 to first NULL byte in selected double word 210.L_fix_r13: 211 andl r1, r2, r1 ; [5] 212 213 ffsl r1, r1 ; [6] 214 215 xbful r1, r1, 0b0111000011 ; [7] 216 217 addl r13, r13, r1 ; [8] 218 219 j_s.d [blink] 220 subl r0, r13, r0 221 222 223ENDFUNC (strlen) 224 225#endif 226 227;; This code uses a common technique for NULL byte detection inside a word. 228;; Details on this technique can be found in: 229;; (https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord) 230; 231; In sum, this technique allows for detecting a NULL byte inside any given 232; amount of bits by performing the following operation 233; DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080) [0] 234; 235; The code above implements this by setting r8 to a 236; 0x01010101... sequence and r1 to a 0x80808080... sequence of 237; appropriate length As LIMM are 32 bit only, we need to perform MOVHL 238; and ORL [1] operations to have the appropriate 64 bit values in 239; place 240; 241;; Search is done 32 bytes at a time, either with 64 bit loads or 128 242;; bit loads If a NULL byte is detected, the position of the double 243;; word is encoded in r12, which is then used to adjust r13 244; 245; r12 is set via bset, which means we can simply use a fls to obtain 246; the first match (or ffs depending on the values in bset) [2]. The 247; reason for starting at 1 and not 0 is so r12 encodes how many double 248; words to go back, and it wouldnt make sense to go back 0 (the NULL 249; would be in the next loop iteration). 250; 251; The first step to take is point r13 to the appropriate double word. 252; As the chosen encoded information is how many double words to go 253; back, we can simply multiply r12 by 8 and reduce r13 by that amount 254; [3] 255; 256; Then, we need to place the loaded double word containing the first 257; NULL byte into a "common" register we can operate on later [4]. 258; 259; To do this without any jumps, we can shift r12 and perform a 260; conditional mov based on the carry flag value. The order is very 261; important because the NULL byte can appear in several double words, 262; so we want to analyze from last to first. 263; 264; We can ignore the first asr (which would be asr.f 2, as we started 265; r12 on 1) because if r7 isnt the NULL byte, r2 will always be 266; overwritten so we can just decide to start at r7, and overwrite it 267; if needed. 268; 269; Now comes the tricky part. In order to obtain the first NULL byte, 270; we need to understand the NULL byte detection operation. It is 271; explained in depth in the link above but in short, it works by first 272; setting the highest bit of each byte to 1, if the corresponding byte 273; is either 0 or more than 0x80 Then, separately, it makes the highest 274; bit of each byte 1, if the byte is less than 0x80. The last step is 275; to AND these two values (this operation is simplified with the SUB, 276; BIC and TST instructions). 277; 278; This means that the evaluated equation result value [5] has zeros 279; for all non zero bytes, except for the NULL bytes. Therefore, we can 280; simply find the first non zero bit (counting from bit 0) which will 281; be inside the position of the first NULL byte. 282; 283; One thing to note, is that ffs oddly returns 31 if no bit is found, 284; setting the zero flag. As r9 is never all 0s at this stage (would 285; mean there is no NULL byte and we wouldnt be here) we dont need to 286; worry about that. [6] 287; 288; We can then convert the bit position into the last byte position by 289; looking into bits 3 to 5, and shifting 3 bits to the right. This can 290; be combined into a single xbful operation. The bottom 000011 291; represent shift by 3 and the top 0111 represents the mask (3 to 5 292; shifted by 3 is 0 to 2). We dont need to worry about the case where 293; ffs does not find a bit, because we know for sure there is at least 294; one NULL byte, and therefore one of the highest bits is set to 1 [7] 295; 296; Finally, we can add the NULL byte position inside the loaded double 297; word to r13 and subtract r0 from r13 to obtain the string size [8] 298; 299; 300; Some operations are re-ordered such that register dependency is 301; reduced, allowing the CPU to run more instructions in parallel [9] 302; 303; 304