/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include <picolibc.h>

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
/* See memrchr-stub.c */
#else
#include "asmdefs.h"

/* Argument and result registers (AAPCS64).  */
#define srcin		x0	/* In: start of the buffer to search.  */
#define chrin		w1	/* In: character to look for.  */
#define cntin		x2	/* In: buffer length in bytes.  */
#define result		x0	/* Out: pointer to last match, or NULL.  */

/* Scratch registers.  */
#define src		x3	/* 16-byte-aligned load pointer; walks downward.  */
#define cntrem		x4	/* Bytes remaining below `src` within the buffer.  */
#define synd		x5	/* Nibble syndrome: 4 bits per compared byte.  */
#define shift		x6	/* Bit count used to discard matches past `end`.  */
#define tmp		x7	/* Candidate result pointer in the loop exit path.  */
#define end		x8	/* srcin + cntin: one past the last byte.  */
#define endm1		x9	/* Address of the last byte of the buffer.  */

/* Vector registers.  */
#define vrepchr		v0	/* chrin replicated into all 16 byte lanes.  */
#define qdata		q1	/* 128-bit view of the loaded chunk.  */
#define vdata		v1	/* Byte-lane view of the loaded chunk.  */
#define vhas_chr	v2	/* Per-byte compare result: 0xff on match, else 0.  */
#define vend		v3	/* Narrowed/reduced compare result.  */
#define dend		d3	/* Low 64 bits of vend, transferred to synd via fmov.  */

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched. */

ENTRY (memrchr)
	PTR_ARG (0)
	add	end, srcin, cntin		/* end = one past the last byte.  */
	sub	endm1, end, 1			/* endm1 = address of the last byte.  */
	bic	src, endm1, 15			/* Align down to the chunk holding it.  */
	cbz	cntin, L(nomatch)		/* Zero-length buffer: no match.  */
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2		/* Nibble-bits to shift out past `end`.  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift		/* Discard matches beyond the buffer end.  */
	cbz	synd, L(start_loop)

	/* Match in the first (highest) chunk: locate its byte offset from the
	   end via clz, and NULL the result if it falls before srcin.  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2	/* synd>>2 = bytes back from endm1.  */
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi		/* hi: match lies inside [srcin, end).  */
	ret

	nop					/* Padding (presumably for loop alignment).  */
L(start_loop):
	subs	cntrem, src, srcin		/* Bytes of the buffer below `src`.  */
	b.ls	L(nomatch)			/* Nothing below the aligned chunk.  */

	/* Make sure that it won't overread by a 16-byte chunk */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)		/* Bit 4 clear: enter at the single-chunk step.  */
	add	src, src, 16

	.p2align 5
L(loop32):
	ldr	qdata, [src, -32]!		/* Step back 32, load the upper chunk of the pair.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)			/* Any match in this chunk?  */

L(loop32_2):
	ldr	qdata, [src, -16]		/* Load the lower chunk of the pair.  */
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)			/* Fewer than 32 bytes left: final chunk.  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16			/* Point `src` at the chunk just compared.  */
L(end):
	/* Rebuild the per-byte nibble syndrome for the candidate chunk; a zero
	   syndrome (no match) makes clz return 64 and the `hs` check fail.  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15			/* Address of the chunk's last byte.  */
#ifdef __AARCH64EB__
	rbit	synd, synd			/* Big-endian: reverse so clz finds the last byte.  */
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2		/* Back up by bytes-from-the-end.  */
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs		/* NULL if the match precedes srcin.  */
	ret

L(nomatch):
	mov	result, 0			/* No occurrence found: return NULL.  */
	ret

END (memrchr)
#endif