/* memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memcpy-stub.c */
#else
#include "asmdefs.h"

/* Register roles.  x0-x2 are the AAPCS64 argument registers
   (dst, src, n); x3-x5 are derived pointers.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
/* G/H alias the argument/scratch registers above.  They are only used
   on paths where count, dst, src and srcend are already dead (the
   97..128 byte case and the backwards-copy tail).  */
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
/* tmp1 aliases E_l; the two are never live at the same time.  */
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

ENTRY_ALIAS (memmove)
ENTRY (memcpy)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	/* 16..32 bytes: two 16-byte copies, one from each end of the
	   buffer; they may overlap in the middle, which is harmless since
	   all loads complete before any store.  */
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)	/* Bit 3 clear -> count < 8.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)	/* Bit 2 clear -> count < 4.  */
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	/* count is 1..3 here.  Copy the first, middle and last bytes;
	   for count < 3 some of these overlap, which is harmless.  */
	lsr	tmp1, count, 1		/* tmp1 = count / 2 (0 or 1).  */
	ldrb	A_lw, [src]		/* First byte.  */
	ldrb	C_lw, [srcend, -1]	/* Last byte.  */
	ldrb	B_lw, [src, tmp1]	/* Middle byte.  */
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	/* Load 32 bytes from each end up front; all loads are issued
	   before any store, so overlapping buffers are still safe.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	/* 33..64 bytes: the two 32-byte halves may overlap.  */
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	/* 97..128 bytes: also copy 32 more bytes from the end.  G/H
	   overwrite count/dst/src/srcend, which are all dead here (the
	   remaining code only reads dstin and dstend).  */
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap: an unsigned
	   (dstin - src) < count means dst lies inside [src, src+count),
	   so a forward copy would clobber unread source bytes.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)		/* src == dst: nothing to do.  */
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	sub	src, src, tmp1		/* Keep src - dst distance intact.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	/* Prime the software pipeline: the loop stores the previous
	   iteration's loads while issuing the next iteration's.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1	/* Keep srcend - dstend distance.  */
	sub	count, count, tmp1
	/* Prime the pipeline, mirroring the forward case from the end.  */
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1	/* dstend now 16-byte aligned.  */
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.
	   G_l/G_h reuse count/dst, both dead at this point.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (memcpy)
#endif