/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

#include <picolibc.h>

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
/* See memcmp-stub.c */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

/* int memcmp (const void *src1, const void *src2, size_t limit)
 *
 * In:   x0 = src1, x1 = src2, x2 = limit (byte count)
 * Out:  w0 < 0 / == 0 / > 0 as src1 compares lower / equal / higher
 *
 * Strategy:
 *   limit < 16          -> scalar path with overlapping head/tail loads
 *   16 <= limit < 160   -> 32-bytes-per-iteration LDP loop
 *   limit >= 160        -> align src2, 64-bytes-per-iteration NEON loop
 */

/* Register name aliases.  NOTE: tmp aliases data4 (both x6); tmp is only
   used where data4 is dead.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define data3		x5
#define data3w		w5
#define data4		x6
#define data4w		w6
#define tmp		x6
#define src1end		x7
#define src2end		x8


ENTRY (memcmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	/* Compare the first 16 bytes.  The ccmp chain is conditioned on the
	   flags from "cmp limit, 16" above: the branch is taken if the two
	   16-byte blocks differ, or if limit == 16 — in the latter case
	   these bytes are also the last bytes, and L(return2) finishes
	   using the already-loaded data.  */
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Keep one-past-the-end pointers so the tail can be compared with
	   negative-offset (possibly overlapping) loads.  */
	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32	/* Bias limit for the 32-byte loop.  */

	.p2align 4
L(loop32):
	/* Compare 32 bytes per iteration using two LDP pairs; the first 16
	   bytes were already handled, hence the +16/+32 offsets.  */
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq	/* NE iff either doubleword differs.  */
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	/* Keep the first differing doubleword pair (or the second pair if
	   the first is equal) for the final byte-order comparison.  */
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* On little-endian, byte-reverse so the lowest-addressed (first in
	   memory order) difference lands in the most significant bits.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, else 1, ...  */
	cneg	result, result, lo	/* ... negated to -1 if src1 < src2.  */
	ret

	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	/* 8 <= limit < 16: two overlapping 8-byte loads cover everything.  */
	tbz	limit, 3, L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	/* 4 <= limit < 8: two overlapping 4-byte loads.  */
	tbz	limit, 2, L(less4)
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	/* limit < 4: test bit 1 (a 2-byte prefix), then bit 0 (last byte).  */
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w	/* Byte difference is the result.  */
L(return_zero):
	ret

L(loop_align):
	/* Large case (limit >= 160).  First compare bytes 16-31 so we can
	   then skip up to 16 bytes while aligning src2 below.  */
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16		/* tmp in [-16, -1].  */
	sub	src2, src2, tmp		/* Round src2 up to 16-byte alignment.  */
	add	limit, limit, tmp	/* Drop the skipped (already compared) bytes.  */
	sub	src1, src1, tmp		/* Keep src1 in step with src2.  */
	sub	limit, limit, 64 + 16	/* Bias for the 64-byte loop's subs test.  */

	.p2align 4
L(loop64):
	/* 64 bytes per iteration: XOR corresponding 16-byte blocks (nonzero
	   iff they differ) and fold the four diff vectors with pairwise
	   unsigned maxima down to a single 64-bit value in d0.  After the
	   reduction, byte k of d0 is nonzero iff source bytes 8k..8k+7 of
	   the 64-byte window differ.  */
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!		/* Post-index: advance src1 by 64.  */
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b	/* Collapse to 8 bytes.  */
	fmov	tmp, d0
	/* If HI (bytes remain after the subs), loop while tmp == 0;
	   otherwise force NE (nzcv = 0) so the loop exits.  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16	/* Undo the bias applied above.  */
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp		/* First nonzero byte -> most significant.  */
	clz	tmp, tmp		/* 8 * byte index (plus bits inside the byte).  */
	bic	tmp, tmp, 7		/* Floor to a multiple of 8 = window byte offset.  */
	sub	tmp, tmp, 48		/* Window begins 48 bytes before updated src1/src2.  */
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo	/* +1 if src1 > src2, -1 if lower.  */
	ret

END (memcmp)
#endif