1/*
2 * memrchr - find last character in a memory zone.
3 *
4 * Copyright (c) 2020-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include <picolibc.h>
15
16#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
17/* See memrchr-stub.c */
18#else
19#include "asmdefs.h"
20
/* Arguments and return value (AAPCS64: x0-x2 in, x0 out).
   Comments are stripped by the preprocessor before macro definition.  */
#define srcin		x0	/* in: start of the buffer */
#define chrin		w1	/* in: character to search for (low byte) */
#define cntin		x2	/* in: buffer length in bytes */
#define result		x0	/* out: pointer to match, or NULL; aliases srcin */

/* Scratch registers (all caller-saved under AAPCS64).  */
#define src		x3	/* current 16-byte-aligned load address */
#define cntrem		x4	/* bytes still unchecked below src */
#define synd		x5	/* syndrome: 4 bits per compared byte */
#define shift		x6	/* left shift that drops out-of-range nibbles */
#define	tmp		x7	/* scratch for the final address computation */
#define end		x8	/* one past the last byte of the buffer */
#define endm1		x9	/* address of the last byte of the buffer */

/* NEON registers (caller-saved).  */
#define vrepchr		v0	/* chrin replicated into all 16 lanes */
#define qdata		q1	/* 128-bit view of the loaded chunk */
#define vdata		v1	/* vector view of the loaded chunk */
#define vhas_chr	v2	/* per-byte compare result (0x00 or 0xff) */
#define vend		v3	/* narrowed syndrome vector */
#define dend		d3	/* low 64 bits of vend, moved to synd */
40
41/*
42   Core algorithm:
43   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
44   per byte. We take 4 bits of every comparison byte with shift right and narrow
45   by 4 instruction. Since the bits in the nibble mask reflect the order in
46   which things occur in the original string, counting leading zeros identifies
47   exactly which byte matched.  */
48
/* void *memrchr (const void *s, int c, size_t n)
   ABI:   AAPCS64 — x0 = s, w1 = c, x2 = n; returns in x0.
   Out:   pointer to the LAST occurrence of (char) c in s[0..n-1], or NULL.
   Clobbers: x3-x9, v0-v3, NZCV flags.

   Strategy: scan backwards from the end in 16-byte NEON chunks, two chunks
   per unrolled loop iteration.  All loads are 16-byte aligned, so the
   routine never touches a granule outside the buffer's pages (MTE safe);
   bytes read beyond either end of the buffer are masked or range-checked
   before they can influence the result.  */
ENTRY (memrchr)
	PTR_ARG (0)
	add	end, srcin, cntin	/* end = one past the last byte */
	sub	endm1, end, 1		/* endm1 = address of the last byte */
	bic	src, endm1, 15		/* aligned chunk containing last byte */
	cbz	cntin, L(nomatch)	/* empty buffer -> NULL */
	ld1	{vdata.16b}, [src]	/* aligned load; may cover bytes past end */
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2	/* low 6 bits = 64 - 4*(end % 16) */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* drop nibbles for bytes at/after end */
	cbz	synd, L(start_loop)

	/* Match in the final (possibly partial) chunk.  After the shift the
	   nibble for the last valid byte sits at the top, so clz counts in
	   quarter-bit units: synd >> 2 = byte distance back from endm1.  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2	/* aligned load may also cover bytes
					   BELOW the buffer start... */
	csel	result, result, xzr, hi	/* ...reject a match before s */
	ret

	nop				/* padding before the loop entry */
L(start_loop):
	subs	cntrem, src, srcin	/* bytes still unchecked below src */
	b.ls	L(nomatch)		/* first chunk already covered it all */

	/* Make sure that it won't overread by a 16-byte chunk */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)	/* odd chunk count: enter the loop at
					   its second half */
	add	src, src, 16		/* bias src for the pre-decrement load */

	.p2align 5
L(loop32):
	ldr	qdata, [src, -32]!	/* pre-index: step back two chunks */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)		/* match somewhere in this chunk */

L(loop32_2):
	ldr	qdata, [src, -16]	/* no writeback: src still names the
					   chunk above this one */
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)		/* count exhausted: treat this chunk
					   as final regardless of match */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16		/* point src at the chunk just loaded */
L(end):
	/* Rebuild the 4-bit-per-byte syndrome so clz can locate the highest
	   matching byte inside the chunk at src.  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* address of the chunk's last byte */
#ifdef __AARCH64EB__
	rbit	synd, synd		/* big-endian: restore byte order in
					   the syndrome before clz */
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2	/* tmp = address of the last match */
	cmp	tmp, srcin		/* the lowest chunk may extend below
					   the buffer start... */
	csel	result, tmp, xzr, hs	/* ...reject a match before s */
	ret

L(nomatch):
	mov	result, 0		/* NULL */
	ret

END (memrchr)
117#endif
118