1/* Copyright (c) 2012-2013, Linaro Limited 2 All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of the Linaro nor the 12 names of its contributors may be used to endorse or promote products 13 derived from this software without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26 27/* 28 * Copyright (c) 2015 ARM Ltd 29 * All rights reserved. 30 * 31 * Redistribution and use in source and binary forms, with or without 32 * modification, are permitted provided that the following conditions 33 * are met: 34 * 1. Redistributions of source code must retain the above copyright 35 * notice, this list of conditions and the following disclaimer. 36 * 2. Redistributions in binary form must reproduce the above copyright 37 * notice, this list of conditions and the following disclaimer in the 38 * documentation and/or other materials provided with the distribution. 39 * 3. The name of the company may not be used to endorse or promote 40 * products derived from this software without specific prior written 41 * permission. 42 * 43 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED 44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 45 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 46 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 48 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 49 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 50 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 51 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 52 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 53 */ 54 55/* Assumptions: 56 * 57 * ARMv8-a, AArch64, unaligned accesses. 58 * 59 */ 60 61#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) 62/* See memcpy-stub.c */ 63#else 64 65#define dstin x0 66#define src x1 67#define count x2 68#define dst x3 69#define srcend x4 70#define dstend x5 71#define A_l x6 72#define A_lw w6 73#define A_h x7 74#define A_hw w7 75#define B_l x8 76#define B_lw w8 77#define B_h x9 78#define C_l x10 79#define C_h x11 80#define D_l x12 81#define D_h x13 82#define E_l src 83#define E_h count 84#define F_l srcend 85#define F_h dst 86#define tmp1 x9 87 88#define L(l) .L ## l 89 90 .macro def_fn f p2align=0 91 .text 92 .p2align \p2align 93 .global \f 94 .type \f, %function 95\f: 96 .endm 97 98/* Copies are split into 3 main cases: small copies of up to 16 bytes, 99 medium copies of 17..96 bytes which are fully unrolled. Large copies 100 of more than 96 bytes align the destination and use an unrolled loop 101 processing 64 bytes per iteration. 102 Small and medium copies read all data before writing, allowing any 103 kind of overlap, and memmove tailcalls memcpy for these cases as 104 well as non-overlapping copies. 105*/ 106 107def_fn memcpy p2align=6 108 prfm PLDL1KEEP, [src] 109 add srcend, src, count 110 add dstend, dstin, count 111 cmp count, 16 112 b.ls L(copy16) 113 cmp count, 96 114 b.hi L(copy_long) 115 116 /* Medium copies: 17..96 bytes. */ 117 sub tmp1, count, 1 118 ldp A_l, A_h, [src] 119 tbnz tmp1, 6, L(copy96) 120 ldp D_l, D_h, [srcend, -16] 121 tbz tmp1, 5, 1f 122 ldp B_l, B_h, [src, 16] 123 ldp C_l, C_h, [srcend, -32] 124 stp B_l, B_h, [dstin, 16] 125 stp C_l, C_h, [dstend, -32] 1261: 127 stp A_l, A_h, [dstin] 128 stp D_l, D_h, [dstend, -16] 129 ret 130 131 .p2align 4 132 /* Small copies: 0..16 bytes. */ 133L(copy16): 134 cmp count, 8 135 b.lo 1f 136 ldr A_l, [src] 137 ldr A_h, [srcend, -8] 138 str A_l, [dstin] 139 str A_h, [dstend, -8] 140 ret 141 .p2align 4 1421: 143 tbz count, 2, 1f 144 ldr A_lw, [src] 145 ldr A_hw, [srcend, -4] 146 str A_lw, [dstin] 147 str A_hw, [dstend, -4] 148 ret 149 150 /* Copy 0..3 bytes. Use a branchless sequence that copies the same 151 byte 3 times if count==1, or the 2nd byte twice if count==2. */ 1521: 153 cbz count, 2f 154 lsr tmp1, count, 1 155 ldrb A_lw, [src] 156 ldrb A_hw, [srcend, -1] 157 ldrb B_lw, [src, tmp1] 158 strb A_lw, [dstin] 159 strb B_lw, [dstin, tmp1] 160 strb A_hw, [dstend, -1] 1612: ret 162 163 .p2align 4 164 /* Copy 64..96 bytes. Copy 64 bytes from the start and 165 32 bytes from the end. */ 166L(copy96): 167 ldp B_l, B_h, [src, 16] 168 ldp C_l, C_h, [src, 32] 169 ldp D_l, D_h, [src, 48] 170 ldp E_l, E_h, [srcend, -32] 171 ldp F_l, F_h, [srcend, -16] 172 stp A_l, A_h, [dstin] 173 stp B_l, B_h, [dstin, 16] 174 stp C_l, C_h, [dstin, 32] 175 stp D_l, D_h, [dstin, 48] 176 stp E_l, E_h, [dstend, -32] 177 stp F_l, F_h, [dstend, -16] 178 ret 179 180 /* Align DST to 16 byte alignment so that we don't cross cache line 181 boundaries on both loads and stores. There are at least 96 bytes 182 to copy, so copy 16 bytes unaligned and then align. The loop 183 copies 64 bytes per iteration and prefetches one iteration ahead. */ 184 185 .p2align 4 186L(copy_long): 187 and tmp1, dstin, 15 188 bic dst, dstin, 15 189 ldp D_l, D_h, [src] 190 sub src, src, tmp1 191 add count, count, tmp1 /* Count is now 16 too large. */ 192 ldp A_l, A_h, [src, 16] 193 stp D_l, D_h, [dstin] 194 ldp B_l, B_h, [src, 32] 195 ldp C_l, C_h, [src, 48] 196 ldp D_l, D_h, [src, 64]! 197 subs count, count, 128 + 16 /* Test and readjust count. */ 198 b.ls 2f 1991: 200 stp A_l, A_h, [dst, 16] 201 ldp A_l, A_h, [src, 16] 202 stp B_l, B_h, [dst, 32] 203 ldp B_l, B_h, [src, 32] 204 stp C_l, C_h, [dst, 48] 205 ldp C_l, C_h, [src, 48] 206 stp D_l, D_h, [dst, 64]! 207 ldp D_l, D_h, [src, 64]! 208 subs count, count, 64 209 b.hi 1b 210 211 /* Write the last full set of 64 bytes. The remainder is at most 64 212 bytes, so it is safe to always copy 64 bytes from the end even if 213 there is just 1 byte left. */ 2142: 215 ldp E_l, E_h, [srcend, -64] 216 stp A_l, A_h, [dst, 16] 217 ldp A_l, A_h, [srcend, -48] 218 stp B_l, B_h, [dst, 32] 219 ldp B_l, B_h, [srcend, -32] 220 stp C_l, C_h, [dst, 48] 221 ldp C_l, C_h, [srcend, -16] 222 stp D_l, D_h, [dst, 64] 223 stp E_l, E_h, [dstend, -64] 224 stp A_l, A_h, [dstend, -48] 225 stp B_l, B_h, [dstend, -32] 226 stp C_l, C_h, [dstend, -16] 227 ret 228 229 .size memcpy, . - memcpy 230#endif 231