1/* 2 * memrchr - find last character in a memory zone. 3 * 4 * Copyright (c) 2020-2022, Arm Limited. 5 * SPDX-License-Identifier: MIT 6 */ 7 8/* Assumptions: 9 * 10 * ARMv8-a, AArch64, Advanced SIMD. 11 * MTE compatible. 12 */ 13 14#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) 15/* See memrchr-stub.c */ 16#else 17#include "asmdefs.h" 18 19#define srcin x0 20#define chrin w1 21#define cntin x2 22#define result x0 23 24#define src x3 25#define cntrem x4 26#define synd x5 27#define shift x6 28#define tmp x7 29#define end x8 30#define endm1 x9 31 32#define vrepchr v0 33#define qdata q1 34#define vdata v1 35#define vhas_chr v2 36#define vend v3 37#define dend d3 38 39/* 40 Core algorithm: 41 For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits 42 per byte. We take 4 bits of every comparison byte with shift right and narrow 43 by 4 instruction. Since the bits in the nibble mask reflect the order in 44 which things occur in the original string, counting leading zeros identifies 45 exactly which byte matched. */ 46 47ENTRY (memrchr) 48 PTR_ARG (0) 49 add end, srcin, cntin 50 sub endm1, end, 1 51 bic src, endm1, 15 52 cbz cntin, L(nomatch) 53 ld1 {vdata.16b}, [src] 54 dup vrepchr.16b, chrin 55 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b 56 neg shift, end, lsl 2 57 shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ 58 fmov synd, dend 59 lsl synd, synd, shift 60 cbz synd, L(start_loop) 61 62 clz synd, synd 63 sub result, endm1, synd, lsr 2 64 cmp cntin, synd, lsr 2 65 csel result, result, xzr, hi 66 ret 67 68 nop 69L(start_loop): 70 subs cntrem, src, srcin 71 b.ls L(nomatch) 72 73 /* Make sure that it won't overread by a 16-byte chunk */ 74 sub cntrem, cntrem, 1 75 tbz cntrem, 4, L(loop32_2) 76 add src, src, 16 77 78 .p2align 5 79L(loop32): 80 ldr qdata, [src, -32]! 81 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b 82 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ 83 fmov synd, dend 84 cbnz synd, L(end) 85 86L(loop32_2): 87 ldr qdata, [src, -16] 88 subs cntrem, cntrem, 32 89 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b 90 b.lo L(end_2) 91 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ 92 fmov synd, dend 93 cbz synd, L(loop32) 94L(end_2): 95 sub src, src, 16 96L(end): 97 shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ 98 fmov synd, dend 99 100 add tmp, src, 15 101#ifdef __AARCH64EB__ 102 rbit synd, synd 103#endif 104 clz synd, synd 105 sub tmp, tmp, synd, lsr 2 106 cmp tmp, srcin 107 csel result, tmp, xzr, hs 108 ret 109 110L(nomatch): 111 mov result, 0 112 ret 113 114END (memrchr) 115#endif 116