/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <picolibc.h>

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memcpy-stub.c  */
#else
#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.
*/

ENTRY_ALIAS (memmove)
ENTRY (memcpy)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
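	/* The overlap test below computes dstin - src; compared unsigned
	   against count, it is true only when the destination starts inside
	   the source buffer (dst > src and dst - src < count), the one case
	   where a forward copy would overwrite source bytes before they are
	   read.  When dst < src the difference wraps to a value >= count, so
	   the forward path is taken, which is safe for that direction of
	   overlap.  A difference of zero means src == dst and there is
	   nothing to do.  */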
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (memcpy)
#endif