/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memcmp-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

/* int memcmp (const void *src1, const void *src2, size_t limit)
 *
 * In:    x0 = src1, x1 = src2, x2 = limit (number of bytes to compare).
 * Out:   w0 = 0 when the two regions are equal; otherwise negative when
 *        the first differing byte of src1 is lower than that of src2
 *        (bytes compared as unsigned values), positive when it is higher.
 *        Most paths return exactly -1 or 1; the final single-byte
 *        compare in L(less2) returns the byte difference instead.
 * Clobbers: x0-x8, v0-v5 (only on the 160+ byte path), NZCV flags.
 * Leaf function: no stack is used.
 */

/* Argument and result registers.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

/* Scratch registers.  data1/data3 hold bytes from src1, data2/data4 the
   corresponding bytes from src2.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define data3		x5
#define data3w		w5
#define data4		x6
#define data4w		w6
/* tmp shares x6 with data4; it is only written once data4 is dead.  */
#define tmp		x6
/* One past the last byte of each buffer.  */
#define src1end		x7
#define src2end		x8


ENTRY (memcmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	/* limit >= 16: compare the first 16 bytes.  */
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	/* When limit == 16 the first ccmp forces NZCV = 0 (i.e. NE), which
	   the second ccmp propagates, so the branch below is always taken
	   and L(return2) yields the final answer (0 if the data is equal).
	   Otherwise the branch is taken only if the 16 bytes differ.  */
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	/* 17..32 bytes: one (overlapping) compare of the last 16 bytes.  */
	cmp	limit, 32
	b.ls	L(last_bytes)
	/* 160 bytes or more: use the 64-byte SIMD loop.  */
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

	/* Invariant at L(loop32): bytes [src1, src1 + 16) have already been
	   found equal and limit = number of bytes beyond src1 + 32.  */
	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	/* At most 16 bytes left past offset 32: the tail compare covers
	   them.  */
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
	/* Also entered from the SIMD loop with limit = src1end - src1.  */
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  The 16-byte load
	   may overlap bytes that were already compared equal.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
	/* Select the first pair if it differs, otherwise the second pair.  */
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* Little-endian: byte-reverse so that the byte at the lowest
	   address becomes the most significant one and the unsigned
	   integer compare matches byte-by-byte order.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne		/* 1 if different, 0 if equal.  */
	cneg	result, result, lo	/* Negate if data1 < data2 (unsigned).  */
	ret

	/* limit < 16: dispatch on the bits of limit.  */
	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	/* 8..15 bytes: first 8 and last 8 bytes (possibly overlapping).  */
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	/* 4..7 bytes: first 4 and last 4 bytes (possibly overlapping).  The
	   32-bit loads zero the upper halves of the 64-bit registers.  */
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	/* 2..3 bytes: compare the first halfword.  */
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	/* Writing result overwrites src1 (both are register 0); only
	   src1end/src2end are used from here on.  */
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	/* Odd length: the last byte is still to be compared.  */
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

L(loop_align):
	/* Compare bytes 16..31 so that the first 32 bytes are known equal
	   before the pointers are moved forward.  */
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  tmp is in [-16, -1],
	   so both pointers advance by 1..16 bytes and src2 becomes 16-byte
	   aligned.  limit is then biased by -(64 + 16); the bias is undone
	   after the loop.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	/* Each iteration compares the 64 bytes at [src + 16, src + 80) and
	   advances both pointers by 64 (writeback on the last load pair).  */
	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	/* Flags from this subs are consumed by the ccmp at the end of the
	   loop; no instruction in between modifies NZCV.  */
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	/* Fold the four 16-byte XOR results with pairwise maximum down to
	   8 bytes in d0: each byte is non-zero iff the corresponding
	   8-byte group of the 64-byte block contains a difference.  */
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	/* Continue only while bytes remain (HI from subs) and tmp == 0;
	   when no bytes remain the ccmp forces NE to leave the loop.  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	/* After rev, leading zero bits rounded down to a multiple of 8 give
	   8 * (index of first non-zero summary byte), i.e. the byte offset
	   of the first differing 8-byte group within the block.  The block
	   started 48 bytes before the updated pointers.  */
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* The two 8-byte groups are known to differ: return 1 or -1.  */
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (memcmp)
#endif