1/* 2 Copyright (c) 2024, Synopsys, Inc. All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 7 1) Redistributions of source code must retain the above copyright notice, 8 this list of conditions and the following disclaimer. 9 10 2) Redistributions in binary form must reproduce the above copyright notice, 11 this list of conditions and the following disclaimer in the documentation 12 and/or other materials provided with the distribution. 13 14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors 15 may be used to endorse or promote products derived from this software 16 without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 POSSIBILITY OF SUCH DAMAGE. 29*/ 30 31#include <picolibc.h> 32 33#include <sys/asm.h> 34 35 36; r0 char* dest 37; r1 const char* src 38 39; dest and src MUST NOT intercept 40 41; Brief: 42; Perform the same operation as strlen for finding the end of r0 string 43; If r0 and r1 have 44; If 4 byte aligned 45; Do 4 byte search until there are no more 4 byte chunks 46; Then, do 1 byte search 47; Otherwise, 1 byte search until alignment 48; Then, do 4 byte search as previously specified 49; 50;; More in depth description at the end 51; 52; R0 char* dest (destination string) 53; R1 const char* src (source string) 54; ret (R0): 55; - char* (destiantion string) 56; 57 58#if defined (__ARC64_ARCH32__) 59 60ENTRY (strcat) 61; Find end of r0 string 62; ========================== STRLEN CODE START ========================== 63 64; Preserve r0 for size calculation when returning 65 mov r13, r0 66 xor r6, r6, r6 67 68; Setup byte detector (more information below) [1] 69 mov r8, NULL_32DT_1 70 asl r9, r8, 7 71 72.L_4_4B_search: 73 74#if defined (__ARC64_LL64__) 75 76 ldd.ab r2r3, [r13, +8] 77 ldd.ab r4r5, [r13, +8] 78 79#else 80 81 ld.ab r2, [r13, +4] 82 ld.ab r3, [r13, +4] 83 ld.ab r4, [r13, +4] 84 ld.ab r5, [r13, +4] 85 86#endif 87 88; NULL byte position is detected and encoded in r6 [0] [9] 89 sub r10, r2, r8 90 sub r11, r3, r8 91 sub r12, r4, r8 92 sub r7, r5, r8 93 94 bic r10, r10, r2 95 bic r11, r11, r3 96 bic r12, r12, r4 97 bic r7, r7, r5 98 99 tst r10, r9 100 bset.ne r6, r6, 4 101 102 tst r11, r9 103 bset.ne r6, r6, 3 104 105 tst r12, r9 106 bset.ne r6, r6, 2 107 108 tst r7, r9 109 bset.ne r6, r6, 1 110 111 breq.d r6, 0, @.L_4_4B_search 112 113 fls r5, r6 ; [2] 114 115; Point r13 to first NULL byte containing double word [3] 116 sub2 r13, r13, r5 117 118 ; Select appropriate register to analyze [4] 119 mov r2, r7 120 121 asr.f r6, r6, 3 122 mov.c r2, r12 123 124 asr.f r6, r6, 1 125 mov.c r2, r11 126 127 asr.f r6, r6, 1 128 mov.c r2, r10 129 130; Point r13 to first NULL byte in selected double word 131 and r2, r2, r9 ; [5] 132 133 ffs r2, r2 ; [6] 134 135 xbfu r2, r2, 0b0111000011 ; [7] 136 137 add r13, r13, r2 ; [8] 138 139 140; ========================== STRLEN CODE END >|< ========================== 141 142 xor r6, r6, r6 143 144.L_4_4B_search_src: 145 146#if defined (__ARC64_LL64__) 147 148 ldd.ab r2r3, [r1, +8] 149 ldd.ab r4r5, [r1, +8] 150 151#else 152 153 ld.ab r2, [r1, +4] 154 ld.ab r3, [r1, +4] 155 ld.ab r4, [r1, +4] 156 ld.ab r5, [r1, +4] 157 158#endif 159 160; NULL byte position is detected and encoded in r6 [0] [9] 161 sub r10, r2, r8 162 sub r11, r3, r8 163 sub r12, r4, r8 164 sub r7, r5, r8 165 166 bic r10, r10, r2 167 bic r11, r11, r3 168 bic r12, r12, r4 169 bic r7, r7, r5 170 171 tst r10, r9 172 bset.ne r6, r6, 4 173 174 tst r11, r9 175 bset.ne r6, r6, 3 176 177 tst r12, r9 178 bset.ne r6, r6, 2 179 180 tst r7, r9 181 bset.ne r6, r6, 1 182 183 brne r6, 0, @.L_found_in_32B 184 185#if defined (__ARC64_LL64__) 186 187 std.ab r2r3, [r13, +8] 188 std.ab r4r5, [r13, +8] 189 190#else 191 192 st.ab r2, [r13, +4] 193 st.ab r3, [r13, +4] 194 st.ab r4, [r13, +4] 195 st.ab r5, [r13, +4] 196 197#endif 198 199 b @.L_4_4B_search_src 200 201.L_found_in_32B: 202 203 fls r6, r6 ; [2] 204 205; Point r1 to first NULL byte containing double word [3] 206 sub2 r1, r1, r6 207 208;; Store the already loaded data 209 210 ; 4 -> 1 to 3 -> 0 211 ;subl r6, r6, 1 212 213; Invert so the biggest branch is at the end, and we dont need to increase 214; block size 215 ; 3 -> 0 to 0 -> 3 216 ;subl r6, 3, r6 217 218 ; Condense the two subs here 219 rsub r6, r6, 4 220 221 asl r6, r6, 2 222 223; Store double words 224 bi [r6] 225 226 b.d @.L_store_lastL32bits 227 mov r11, r2 228 nop 229 nop 230 231 st.ab r2, [r13, +4] 232 b.d @.L_store_lastL32bits 233 mov r11, r3 234 nop 235 236 st.ab r2, [r13, +4] 237 st.ab r3, [r13, +4] 238 b.d @.L_store_lastL32bits 239 mov r11, r4 240 241 st.ab r2, [r13, +4] 242 st.ab r3, [r13, +4] 243 st.ab r4, [r13, +4] 244 mov r11, r5 245 246; r11 now contains the data to write 247.L_store_lastL32bits: 248 sub r10, r11, r8 249 bic r10, r10, r11 250 and r10, r10, r9 ; [5] 251 252 ffs r2, r10 ; [6] 253 add r2, r2, 1 254 255 xbfu r2, r2, 0b0111000011 ; [7] 256 257 mov r3, -1; Bitmask setup 258 259 ; If the NULL byte is in byte 3 (starting from the right) 260 ; we want to store 8-3 bytes 261 rsub r2, r2, 8 262 asl r2, r2, 3 263 264 ; According to the target byte, setup masks 265 lsr r3, r3, r2 266 not r4, r3 267 268 ; Obtain relevant data from destination 269 ld r10, [r13] 270 271 ; Get which data from dest is not to be overwritten and OR it 272 ; with the relevant data to write 273 and r3, r3, r11 274 and r4, r4, r10 275 276 or r3, r3, r4 277 278 j_s.d [blink] 279 st.ab r3, [r13, +4] 280 281 282 283ENDFUNC (strcat) 284 285#else 286 287ENTRY (strcat) 288; Find end of r0 string 289; ========================== STRLEN CODE START ========================== 290 291; Preserve r0 for size calculation when returning 292 movl r13, r0 293 xorl r6, r6, r6 294 295; Setup byte detector (more information below) [1] 296 vpack2wl r8, NULL_32DT_1, NULL_32DT_1 297 asll r9, r8, 7 298 299.L_4_8B_search: 300 301; Using 128-bit memory operations 302#if defined (__ARC64_M128__) 303 304 lddl.ab r2r3, [r13, +16] 305 lddl.ab r4r5, [r13, +16] 306 307; The 64-bit crunching implementation. 308#elif defined (__ARC64_ARCH64__) 309 310 ldl.ab r2, [r13, +8] 311 ldl.ab r3, [r13, +8] 312 ldl.ab r4, [r13, +8] 313 ldl.ab r5, [r13, +8] 314 315#else 316# error Unknown configuration 317#endif 318 319; NULL byte position is detected and encoded in r6 [0] [9] 320 subl r10, r2, r8 321 subl r11, r3, r8 322 subl r12, r4, r8 323 subl r7, r5, r8 324 325 bicl r10, r10, r2 326 bicl r11, r11, r3 327 bicl r12, r12, r4 328 bicl r7, r7, r5 329 330 tstl r10, r9 331 bset.ne r6, r6, 4 332 333 tstl r11, r9 334 bset.ne r6, r6, 3 335 336 tstl r12, r9 337 bset.ne r6, r6, 2 338 339 tstl r7, r9 340 bset.ne r6, r6, 1 341 342 breq.d r6, 0, @.L_4_8B_search 343 344 fls r5, r6 ; [2] 345 346; Point r13 to first NULL byte containing double word [3] 347 sub3l r13, r13, r5 348 349 ; Select appropriate register to analyze [4] 350 MOVP r2, r7 351 352 asr.f r6, r6, 3 353 MOVP.c r2, r12 354 355 asr.f r6, r6, 1 356 MOVP.c r2, r11 357 358 asr.f r6, r6, 1 359 MOVP.c r2, r10 360 361; Point r13 to first NULL byte in selected double word 362 andl r2, r2, r9 ; [5] 363 364 ffsl r2, r2 ; [6] 365 366 xbful r2, r2, 0b0111000011 ; [7] 367 368 addl r13, r13, r2 ; [8] 369 370 371; ========================== STRLEN CODE END >|< ========================== 372 373 xorl r6, r6, r6 374 375.L_4_8B_search_src: 376#if defined (__ARC64_M128__) 377 378 lddl.ab r2r3, [r1, +16] 379 lddl.ab r4r5, [r1, +16] 380 381#elif defined (__ARC64_ARCH64__) 382 383 ldl.ab r2, [r1, +8] 384 ldl.ab r3, [r1, +8] 385 ldl.ab r4, [r1, +8] 386 ldl.ab r5, [r1, +8] 387 388#else 389 # error Unknown configuration 390#endif 391 392; NULL byte position is detected and encoded in r6 [0] [9] 393 subl r10, r2, r8 394 subl r11, r3, r8 395 subl r12, r4, r8 396 subl r7, r5, r8 397 398 bicl r10, r10, r2 399 bicl r11, r11, r3 400 bicl r12, r12, r4 401 bicl r7, r7, r5 402 403 tstl r10, r9 404 bset.ne r6, r6, 4 405 406 tstl r11, r9 407 bset.ne r6, r6, 3 408 409 tstl r12, r9 410 bset.ne r6, r6, 2 411 412 tstl r7, r9 413 bset.ne r6, r6, 1 414 415 brne r6, 0, @.L_found_in_32B 416 417#if defined (__ARC64_M128__) 418 419 stdl.ab r2r3, [r13, +16] 420 stdl.ab r4r5, [r13, +16] 421 422#elif defined (__ARC64_ARCH64__) 423 424 stl.ab r2, [r13, +8] 425 stl.ab r3, [r13, +8] 426 stl.ab r4, [r13, +8] 427 stl.ab r5, [r13, +8] 428 429#else 430# error Unknown configuration 431#endif 432 433 b @.L_4_8B_search_src 434 435.L_found_in_32B: 436 437 fls r6, r6 ; [2] 438 439; Point r1 to first NULL byte containing double word [3] 440 sub3l r1, r1, r6 441 442;; Store the already loaded data 443 444 ; 4 -> 1 to 3 -> 0 445 ;subl r6, r6, 1 446 447; Invert so the biggest branch is at the end, and we dont need to increase 448; block size 449 ; 3 -> 0 to 0 -> 3 450 ;subl r6, 3, r6 451 452 ; Condense the two subs here 453 rsubl r6, r6, 4 454 455 asll r6, r6, 2 456 457; Store double words 458 bi [r6] 459 460 b.d @.L_store_lastL64bits 461 MOVP r11, r2 462 nop 463 nop 464 465 stl.ab r2, [r13, +8] 466 b.d @.L_store_lastL64bits 467 MOVP r11, r3 468 nop 469 470 stl.ab r2, [r13, +8] 471 stl.ab r3, [r13, +8] 472 b.d @.L_store_lastL64bits 473 MOVP r11, r4 474 475 stl.ab r2, [r13, +8] 476 stl.ab r3, [r13, +8] 477 stl.ab r4, [r13, +8] 478 MOVP r11, r5 479 480; r11 now contains the data to write 481.L_store_lastL64bits: 482 subl r10, r11, r8 483 bicl r10, r10, r11 484 485 andl r10, r10, r9 ; [5] 486 487 ffsl r2, r10 ; [6] 488 addl r2, r2, 1 489 490 xbful r2, r2, 0b0111000011 ; [7] 491 492 movl r3, -1; Bitmask setup 493 494 ; If the NULL byte is in byte 3 (starting from the right) 495 ; we want to store 8-3 bytes 496 rsubl r2, r2, 8 497 asl r2, r2, 3 498 499 ; According to the target byte, setup masks 500 lsrl r3, r3, r2 501 notl r4, r3 502 503 ; Obtain relevant data from destination 504 ldl r10, [r13] 505 506 ; Get which data from dest is not to be overwritten and OR it 507 ; with the relevant data to write 508 andl r3, r3, r11 509 andl r4, r4, r10 510 511 orl r3, r3, r4 512 513 j_s.d [blink] 514 stl.ab r3, [r13, +8] 515 516 517ENDFUNC (strcat) 518 519#endif 520 521;; This code uses a common technique for NULL byte detection inside a word. 522;; Details on this technique can be found in: 523;; (https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord) 524; 525; In sum, this technique allows for detecting a NULL byte inside any given 526; amount of bits by performing the following operation 527; DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080) [0] 528; 529; The code above implements this by setting r8 to a 0x01010101... sequence and 530; r9 to a 0x80808080... sequence of appropriate length 531; As LIMM are 32 bit only, we need to perform MOVHL and ORL [1] operations to 532; have the appropriate 64 bit values in place 533; 534;; Search is done 32 bytes at a time, either with 64 bit loads or 128 bit loads 535;; If a NULL byte is detected, the position of the double word is encoded 536;; in r6, which is then used to adjust r13 to the exact byte 537; 538; r6 is set via bset, which means we can simply use a fls to obtain the first 539; match (or ffs depending on the values in bset) [2]. 540; The reason for starting at 1 and not 0 is so r6 encodes how many double 541; words to go back, and it wouldnt make sense to go back 0 (the NULL would be 542; in the next loop iteration). 543; 544; The first step to take is point r13 to the appropriate double word. 545; As the chosen encoded information is how many double words to go back, 546; we can simply multiply r6 by 8 and reduce r13 by that amount [3] 547; 548; Then, we need to place the loaded double word containing the first NULL byte 549; into a "common" register we can operate on later [4]. 550; 551; To do this without any jumps, we can shift r6 and perform a conditional mov 552; based on the carry flag value. 553; The order is very important because the NULL byte can appear in several 554; double words, so we want to analyze from last to first. 555; 556; We can ignore the first asr (which would be asr.f 2, as we started r6 on 1) 557; because if r7 isnt the NULL byte, r2 will always be overwritten so we can 558; just decide to start at r7, and overwrite it if needed. 559; 560; Now comes the tricky part. In order to obtain the first NULL byte, we need to 561; understand the NULL byte detection operation. It is explained in depth in the 562; link above but in short, it works by first setting the highest bit of each 563; byte to 1, if the corresponding byte is either 0 or less than 0x80 564; Then, separately, it makes the highest bit of each byte 1, if the byte is 565; less than 0x80. The last step is to and these two values (this operation is 566; simplified with the subl, bicl and tst instructions). 567; 568; This means that the evaluated equation result value [5] has zeros for all non 569; zero bytes, except for the NULL bytes. Therefore, we can simply find the 570; first non zero bit (counting from bit 0) which will be inside the position of 571; the first NULL byte. 572; 573; One thing to note, is that ffs oddly returns 31 if no bit is found, setting 574; the zero flag. As r9 is never all 0s at this stage (would mean there is no 575; NULL byte and we wouldnt be here) we dont need to worry about that. [6] 576; 577; We can then convert the bit position into the last byte position by looking 578; into bits 3 to 5, and shifting 3 bits to the right. This can be combined into 579; a single xbful operation. The bottom 000011 represent shift by 3 and the top 580; 0111 represents the mask (3 to 5 shifted by 3 is 0 to 2). We dont need to worry 581; about the case where ffs does not find a bit, because we know for sure there is 582; at least one NULL byte, and therefore one of the highest bits is set to 1 [7] 583; 584; Finally, we can add the NULL byte position inside the loaded double word to 585; r13 and subtract r0 from r13 to obtain the string size [8] 586; 587; Some operations are re-ordered such that register dependency is reduced, 588; allowing the CPU to run more instructions in parallel [9] 589; 590; 591; Some data was already read, and needs to be stored following the same read 592; order. To do this, we need to make the 593; 594; 595