/*
   Copyright (c) 2024, Synopsys, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

   1) Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

   2) Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.
*/

#include <picolibc.h>

#include <sys/asm.h>

; This file contains variants of the same function with different
; instructions.  Exactly one of them is selected by the preprocessor
; conditionals below.  The 64-bit variant (the first #elif branch)
; documents the shared computed-branch ("bi") tail handling in detail.
; ---------------------------------------------------------------------
; Variant 1: 128-bit memory operations (lddl/stdl).
;
;   In:      r0 = dest, r1 = source, r2 = byte count
;   Out:     r0 = dest (r0 is only read; r3 is the moving dest cursor)
;   Scratch: r1, r2, r3, r4r5, r6r7, r8r9, r10r11, r12, flags
; ---------------------------------------------------------------------
#if defined (__ARC64_M128__)

ENTRY (memcpy)
        lsrl.f  r12, r2, 6              ; r12 = count / 64, Z set if count < 64
        beq.d   @.L_m128_tail
        movl    r3, r0                  ; (delay slot) r3 = dest cursor

.L_m128_copy64:                         ; main loop, 64 bytes per iteration
        lddl.ab r4r5, [r1, +16]
        lddl.ab r6r7, [r1, +16]
        lddl.ab r8r9, [r1, +16]
        lddl.ab r10r11, [r1, +16]
        stdl.ab r4r5, [r3, +16]
        stdl.ab r6r7, [r3, +16]
        stdl.ab r8r9, [r3, +16]
        dbnz.d  r12, @.L_m128_copy64    ; loop while --r12 != 0
        stdl.ab r10r11, [r3, +16]       ; (delay slot) last store of the round

.L_m128_tail:
        ;; At most 63 bytes are left.  They are handled in two steps:
        ;;   1. copy the whole 16-byte chunks (one, two or three of them);
        ;;   2. cover the final 1..15 bytes with ONE extra 16-byte
        ;;      load/store that overlaps data copied in step 1.
        ;; If fewer than 16 bytes are left there is nothing to overlap
        ;; with, so a plain byte loop is used instead.
        bmsk.f  r2, r2, 5               ; r2 = count % 64
        jeq.d   [blink]                 ; nothing left: return
        lsr.f   r12, r2, 4              ; (delay slot) r12 = whole 16-byte chunks
        beq.d   @.L_m128_bytes          ; no whole chunk: go byte by byte
        xor     r12, r12, 3             ; (delay slot) r12 = 3 - chunks

        ;; Here the chunk count was 1, 2 or 3 (0 branched away above).
        ;; "bi" selects how many of the load/store pairs below are skipped,
        ;; with entry 0 being the one right after the "bi".  That is why
        ;; the count is inverted first:
        ;;
        ;;   chunks | r12 after xor | pairs skipped | bytes copied
        ;;   -------+---------------+---------------+-------------
        ;;      3   |       0       |       0       |      48
        ;;      2   |       1       |       1       |      32
        ;;      1   |       2       |       2       |      16
        ;;
        ;; A pair is two instructions, so the index is doubled.
        asl     r12, r12, 1
        bi      [r12]
        lddl.ab r4r5, [r1, +16]
        stdl.ab r4r5, [r3, +16]
        lddl.ab r6r7, [r1, +16]
        stdl.ab r6r7, [r3, +16]
        lddl.ab r8r9, [r1, +16]
        stdl.ab r8r9, [r3, +16]

        bmsk.f  r2, r2, 3               ; r2 = count % 16
        jeq.d   [blink]                 ; nothing left: return
        subl    r2, r2, 16              ; (delay slot) r2 = remaining - 16 (< 0)

        ;; 1..15 bytes are still missing and at least 16 bytes were copied
        ;; just above.  r2 is now a negative offset: applying it to both
        ;; cursors places a 16-byte window so that it ends exactly on the
        ;; last byte to copy.  The window re-copies some bytes that were
        ;; already transferred together with the ones still missing.
        addl    r3, r3, r2
        lddl    r4r5, [r1, r2]
        j_s.d   [blink]
        stdl    r4r5, [r3]              ; (delay slot) final overlapping store

.L_m128_bytes:
        ;; Fewer than 16 bytes in total are left (r2 = 1..15): copy them
        ;; one at a time, using r2 as the loop counter.
        ldb.ab  r4, [r1, +1]
        dbnz.d  r2, @.L_m128_bytes
        stb.ab  r4, [r3, +1]            ; (delay slot)
        j_s     [blink]
ENDFUNC (memcpy)

; ---------------------------------------------------------------------
; Variant 2: 64-bit loads/stores.
;
; LSRP, MOVP, LD64 and ST64 are not defined in this file; presumably they
; come from <sys/asm.h> and expand to the native pointer-size / 64-bit
; instructions of the selected configuration.
;
;   In:      r0 = dest, r1 = source, r2 = byte count
;   Out:     r0 = dest (r0 is only read; r3 is the moving dest cursor)
;   Scratch: r1, r2, r3, r4r5, r6r7, r8r9, r10r11, r12, flags
; ---------------------------------------------------------------------
#elif defined (__ARC64_ARCH64__) \
      || (defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__))

ENTRY (memcpy)
        LSRP.f  r12, r2, 5              ; r12 = count / 32, Z set if count < 32
        beq.d   @.L_w64_tail
        MOVP    r3, r0                  ; (delay slot) r3 = dest cursor

.L_w64_copy32:                          ; main loop, 32 bytes per iteration
        LD64.ab r4, [r1, +8]
        LD64.ab r6, [r1, +8]
        LD64.ab r8, [r1, +8]
        LD64.ab r10, [r1, +8]
        ST64.ab r4, [r3, +8]
        ST64.ab r6, [r3, +8]
        ST64.ab r8, [r3, +8]
        dbnz.d  r12, @.L_w64_copy32     ; loop while --r12 != 0
        ST64.ab r10, [r3, +8]           ; (delay slot) last store of the round
        bmsk_s  r2, r2, 4               ; keep only count % 32

; From here on r2 < 32.  Its five low bits say what is still to be copied:
;
;     bit:     b4   b3   b2   b1   b0
;     bytes:   16    8    4    2    1
;
; b2, b1 and b0 are tested one by one.  The pair (b4,b3) is turned into a
; "bi" index by three instructions placed in the delay slots of those
; tests (so they execute whether or not the branch is taken):
;
;     lsr r12, r2, 3     r12 = (b4,b3)
;     xor r12, r12, 3    flip both bits
;     asl r12, r12, 1    double the index
;
;     (b4,b3) | bytes to copy | index after xor
;     --------+---------------+----------------
;       00b   |       0       |     3 (11b)
;       01b   |       8       |     2 (10b)
;       10b   |      16       |     1 (01b)
;       11b   |      24       |     0 (00b)
;
; The bits are flipped because index 0 must run all three 8-byte copies
; that follow the "bi".  The index is doubled because "bi" jumps at
; boundaries of 4 while each load/store pair is 8 bytes of code.
.L_w64_tail:
        bbit0.d r2, 2, @.L_w64_no4      ; b2 clear: no 4-byte copy
        lsr     r12, r2, 3              ; (delay slot) r12 = (b4,b3)
        ld.ab   r4, [r1, 4]
        st.ab   r4, [r3, 4]
.L_w64_no4:
        bbit0.d r2, 1, @.L_w64_no2      ; b1 clear: no 2-byte copy
        xor     r12, r12, 3             ; (delay slot) flip (b4,b3)
        ldh.ab  r4, [r1, 2]
        sth.ab  r4, [r3, 2]
.L_w64_no2:
        bbit0.d r2, 0, @.L_w64_dispatch ; b0 clear: no 1-byte copy
        asl     r12, r12, 1             ; (delay slot) double the index
        ldb.ab  r4, [r1, 1]
        stb.ab  r4, [r3, 1]

.L_w64_dispatch:
        bi      [r12]                   ; skip 0, 1, 2 or 3 of the pairs below
        LD64.ab r4, [r1, 8]
        ST64.ab r4, [r3, 8]
        LD64.ab r4, [r1, 8]
        ST64.ab r4, [r3, 8]
        LD64.ab r4, [r1, 8]
        ST64.ab r4, [r3, 8]

        j_s     [blink]
ENDFUNC (memcpy)

; ---------------------------------------------------------------------
; Variant 3: 32-bit loads/stores only.
;
;   In:      r0 = dest, r1 = source, r2 = byte count
;   Out:     r0 = dest (r0 is only read; r3 is the moving dest cursor)
;   Scratch: r1, r2, r3, r4, r5, r6, r7, r11, flags
; ---------------------------------------------------------------------
#elif defined (__ARC64_ARCH32__)

ENTRY (memcpy)
        lsr.f   r11, r2, 4              ; r11 = count / 16, Z set if count < 16
        beq.d   @.L_w32_tail
        mov     r3, r0                  ; (delay slot) r3 = dest cursor

.L_w32_copy16:                          ; main loop, 16 bytes per iteration
        ld.ab   r4, [r1, 4]
        ld.ab   r5, [r1, 4]
        ld.ab   r6, [r1, 4]
        ld.ab   r7, [r1, 4]
        st.ab   r4, [r3, 4]
        st.ab   r5, [r3, 4]
        st.ab   r6, [r3, 4]
        dbnz.d  r11, @.L_w32_copy16     ; loop while --r11 != 0
        st.ab   r7, [r3, 4]             ; (delay slot) last store of the round
        bmsk_s  r2, r2, 3               ; keep only count % 16

        ;; From here on r2 < 16.  Bits 1 and 0 select a 2-byte and a 1-byte
        ;; copy.  Bits (3,2) give the number of 4-byte words left (0..3);
        ;; that number is flipped (xor 3) and doubled (asl 1) to form the
        ;; "bi" index, so that index 0 runs all three word copies and the
        ;; highest index skips them all.
.L_w32_tail:
        bbit0.d r2, 1, @.L_w32_no2      ; bit 1 clear: no 2-byte copy
        lsr     r11, r2, 2              ; (delay slot) r11 = words left
        ldh.ab  r4, [r1, 2]
        sth.ab  r4, [r3, 2]
.L_w32_no2:
        bbit0.d r2, 0, @.L_w32_dispatch ; bit 0 clear: no 1-byte copy
        xor     r11, r11, 3             ; (delay slot) flip the word count
        ldb.ab  r4, [r1, 1]
        stb.ab  r4, [r3, 1]

.L_w32_dispatch:
        asl     r11, r11, 1             ; each load/store pair is two slots
        bi      [r11]
        ld.ab   r4, [r1, 4]
        st.ab   r4, [r3, 4]
        ld.ab   r4, [r1, 4]
        st.ab   r4, [r3, 4]
        ld      r4, [r1]                ; last word: cursors need no update
        st      r4, [r3]

        j_s     [blink]
ENDFUNC (memcpy)

#else
# error Unknown configuration
#endif