/* Copyright (c) 2012-2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
 *    Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memset-stub.c  */
#else

/* Register aliases.  Arguments arrive per AAPCS64: x0 = dstin, w1 = val,
   x2 = count.  x3-x7 are caller-saved scratch.  */
#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define tmp1	x5
#define tmp1w	w5
#define tmp2	x6
#define tmp2w	w6
#define zva_len x7
#define zva_lenw w7

#define L(l) .L ## l

	/* Emit the standard preamble for an exported function F, aligned
	   to 2^p2align bytes.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* void *memset (void *dstin, int val, size_t count)

   In:       x0 = dstin, w1 = val (only the low byte is used), x2 = count.
   Out:      x0 = dstin (x0 is never written, so the return value is free).
   Clobbers: x1-x7, v0, flags.

   Strategy: sizes up to 96 bytes are handled with (possibly overlapping)
   stores from both ends of the buffer, so no alignment or loop is needed.
   Larger fills run an aligned stp loop, and zero fills of >= 256 bytes
   use the DC ZVA cache-zeroing instruction when it is available.  */

def_fn memset p2align=6

	dup	v0.16B, valw		/* Replicate the fill byte into all 16 lanes.  */
	add	dstend, dstin, count	/* dstend = one past the last byte.  */

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]		/* val = fill byte replicated 8 times.  */

	/* Set 0..15 bytes.  Test count bits 3/2/1/0 and store from both
	   ends; the two stores of each pair may overlap.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes (cmp count, 16 / b.hs admits count == 16).  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)	/* Bit 6 set => count is 64..96.  */
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop
L(set_long):
	and	valw, valw, 255		/* Keep only the fill byte for the zero test.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16 bytes.  */
	str	q0, [dstin]		/* Head store covers the unaligned prefix.  */
	cmp	count, 256
	/* If count >= 256 (C set from the cmp), compare valw with 0;
	   otherwise force NE.  So the branch below is taken only for
	   zero fills of at least 256 bytes.  */
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
L(tail64):
	subs	count, count, 64
	b.hi	1b
	/* Up to 64 bytes remain; cover them with two (possibly
	   overlapping) stores from the end.  */
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(try_zva):
	mrs	tmp1, dczid_el0		/* Read the DC ZVA block description.  */
	tbnz	tmp1w, 4, L(no_zva)	/* Bit 4 (DZP) set => DC ZVA prohibited.  */
	and	tmp1w, tmp1w, 15	/* Low 4 bits encode the block size.  */
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63		/* Align dst down to the 64-byte ZVA block.  */
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
1:	dc	zva, dst		/* Zero one aligned 64-byte block.  */
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	/* Store the first 128 bytes explicitly, then DC ZVA whole
	   128-byte aligned blocks, then patch up the tail with stp.  */
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127		/* Align dst down to the 128-byte ZVA block.  */
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	/* Generic ZVA block size: zva_len = 4 << DCZID_EL0[3:0] bytes.  */
L(zva_other):
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
	cmp	count, tmp1
	blo	L(no_zva)		/* Too small to be worth aligning; plain loop.  */

	sub	tmp2, zva_len, 1	/* Alignment mask.  */
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
	/* stp up to the first ZVA-aligned address.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst		/* Zero one aligned zva_len block.  */
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
4:	add	count, count, zva_len	/* Undo the loop bias.  */
	sub	dst, dst, 32		/* Bias dst for tail loop.  */
	b	L(tail64)

	.size	memset, . - memset
#endif