1/*
2 * memrchr - find last character in a memory zone.
3 *
4 * Copyright (c) 2020-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)
15/* See memrchr-stub.c */
16#else
17#include "asmdefs.h"
18
19#define srcin		x0
20#define chrin		w1
21#define cntin		x2
22#define result		x0
23
24#define src		x3
25#define cntrem		x4
26#define synd		x5
27#define shift		x6
28#define	tmp		x7
29#define end		x8
30#define endm1		x9
31
32#define vrepchr		v0
33#define qdata		q1
34#define vdata		v1
35#define vhas_chr	v2
36#define vend		v3
37#define dend		d3
38
/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte using the shift-right-and-
   narrow-by-4 (SHRN) instruction. Since the bits in the nibble mask reflect the
   order of the bytes in the original memory block, counting leading zeros
   identifies exactly which byte matched.  */
46
/* void *memrchr (const void *srcin, int chrin, size_t cntin)
   Return a pointer to the last occurrence of (unsigned char) chrin in the
   first cntin bytes of srcin, or NULL if not found.

   Register roles (aliases defined above):
     srcin/x0   input pointer (x0 is also the result on return)
     chrin/w1   byte value to search for
     cntin/x2   number of bytes to search
     src/x3     current 16-byte-aligned chunk pointer (walks downwards)
     cntrem/x4  bytes remaining between srcin and src
     synd/x5    64-bit nibble-mask syndrome of the vector comparison
     end/x8     srcin + cntin (one past the last byte)
     endm1/x9   end - 1 (address of the last byte).  */
ENTRY (memrchr)
	PTR_ARG (0)			/* Pointer-arg fixup macro from
					   asmdefs.h (presumably zero-extends
					   x0 for ILP32 — see that header).  */
	add	end, srcin, cntin	/* end = one past the last byte.  */
	sub	endm1, end, 1		/* endm1 = address of last byte.  */
	bic	src, endm1, 15		/* Align down to the 16-byte chunk
					   containing the last byte.  */
	cbz	cntin, L(nomatch)	/* Empty buffer: no match.  */
	ld1	{vdata.16b}, [src]	/* Load the chunk holding the last
					   byte (aligned, so it stays within
					   one 16-byte granule — see the MTE
					   note in the file header).  */
	dup	vrepchr.16b, chrin	/* Replicate the search byte.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Build the nibble mask (4 bits per byte) and shift out the bits that
	   belong to bytes at or beyond `end`.  Since src is 16-byte aligned,
	   -(end << 2) mod 64 equals 64 - 4*(end - src), i.e. exactly the
	   number of high mask bits that correspond to out-of-range bytes.  */
	neg	shift, end, lsl 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* Discard matches past the end.  */
	cbz	synd, L(start_loop)	/* No match in the final chunk.  */

	/* Match in the final chunk: the leading-zero count of the syndrome,
	   divided by 4, is the byte distance back from endm1.  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2	/* Match inside the buffer?  */
	csel	result, result, xzr, hi	/* Return NULL if it lies before
					   srcin (distance >= cntin).  */
	ret

	nop				/* Padding before the loop entry.  */
L(start_loop):
	subs	cntrem, src, srcin	/* Bytes left below the aligned chunk;
					   flags set for the branch below.  */
	b.ls	L(nomatch)		/* src <= srcin: nothing left.  */

	/* Make sure that it won't overread by a 16-byte chunk */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)	/* Odd number of 16-byte chunks
					   remaining: enter the loop at the
					   single-chunk step.  */
	add	src, src, 16		/* Bias src so the pre-decrement load
					   below starts at the right chunk.  */

	.p2align 5
	/* Main loop: scan 32 bytes per iteration, moving backwards.  */
L(loop32):
	ldr	qdata, [src, -32]!	/* src -= 32, then load 16 bytes.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)		/* Any match in this chunk?  */

L(loop32_2):
	ldr	qdata, [src, -16]	/* Second chunk of the pair (src is
					   not decremented here).  */
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)		/* Buffer exhausted: this chunk is the
					   last one to inspect.  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)		/* No match: keep scanning back.  */
L(end_2):
	sub	src, src, 16		/* Point src at the chunk whose
					   comparison result is in vhas_chr.  */
L(end):
	/* Recompute the position-preserving nibble mask for the matching
	   chunk (the loop used umaxp, which only detects a match).  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* tmp = last byte of this chunk.  */
#ifdef __AARCH64EB__
	rbit	synd, synd		/* Big-endian: nibble order in the
					   mask is reversed; undo it.  */
#endif
	clz	synd, synd		/* 4 * byte distance back from the
					   top of the chunk.  */
	sub	tmp, tmp, synd, lsr 2	/* Candidate match address.  */
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* NULL if the match precedes srcin
					   (match fell in padding below the
					   start of the buffer).  */
	ret

L(nomatch):
	mov	result, 0		/* Not found: return NULL.  */
	ret

END (memrchr)
115#endif
116