1/* memcmp - compare memory
2 *
3 * Copyright (c) 2013-2022, Arm Limited.
4 * SPDX-License-Identifier: MIT
5 */
6
7#include <picolibc.h>
8
9#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
10/* See memcmp-stub.c  */
11#else
12
13/* Assumptions:
14 *
15 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
16 */
17
18#include "asmdefs.h"
19
20#define src1	x0
21#define src2	x1
22#define limit	x2
23#define result	w0
24
25#define data1	x3
26#define data1w	w3
27#define data2	x4
28#define data2w	w4
29#define data3	x5
30#define data3w	w5
31#define data4	x6
32#define data4w	w6
33#define tmp	x6
34#define src1end	x7
35#define src2end	x8
36
37
ENTRY (memcmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* Inputs: src1 (x0), src2 (x1), limit (x2) = byte count.
	   Strategy: sizes < 16 use overlapping small loads; 16..159 use a
	   scalar 32-bytes-per-iteration loop; >= 160 uses a NEON
	   64-bytes-per-iteration loop.  Tails are handled with overlapping
	   unaligned loads relative to src1end/src2end (one past the end).  */
	cmp	limit, 16
	b.lo	L(less16)
	/* Compare the first 16 bytes of each buffer.  */
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	/* Flags still hold limit vs 16.  If limit != 16 (NE), conditionally
	   compare data1/data2; if limit == 16 the ccmp chain instead forces
	   NE (nzcv = 0), so we branch to L(return2), which computes the final
	   result directly from the 16 bytes just loaded.  */
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* One-past-the-end pointers for the overlapping tail loads.  */
	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)		/* 17-32 bytes: tail load covers the rest.  */
	cmp	limit, 160
	b.hs	L(loop_align)		/* Large inputs: NEON loop.  */
	sub	limit, limit, 32	/* Bias limit by the 32 bytes handled per step.  */

	.p2align 4
	/* Scalar loop: 32 bytes per iteration at offsets 16 and 32; bytes
	   [0, 16) were already compared above.  */
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq	/* NE if either 8-byte pair differs.  */
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  These loads may
	   overlap bytes already compared; that is harmless.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	/* Select the first differing 8-byte pair, or the second pair if the
	   first is equal (covers the all-equal case too).  */
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* Byte-reverse so an unsigned word compare matches memcmp's
	   byte-by-byte order on little-endian.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, else 1...  */
	cneg	result, result, lo	/* ...negated to -1 if data1 < data2.  */
	ret

	.p2align 4
	/* limit < 16: dispatch on the bits of limit, using pairs of
	   overlapping loads that together cover the whole buffer.  */
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	/* 8-15 bytes: two overlapping 8-byte loads.  */
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	/* 4-7 bytes: two overlapping 4-byte loads.  */
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	/* 2-3 bytes: compare the first two bytes here, the last byte below.  */
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)	/* 0 bytes left: return 0.  */
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	/* Byte difference is itself a valid memcmp result.  */
	sub	result, data1w, data2w
L(return_zero):
	ret

L(loop_align):
	/* Compare bytes [16, 32) before aligning.  */
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  tmp = (src2 & 15) - 16
	   is in [-16, -1], so src2 is rounded up to the next 16-byte boundary;
	   src1 moves by the same amount (it may stay unaligned) and limit
	   shrinks to match.  Some already-compared bytes are re-examined,
	   which is harmless.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	/* Bias limit for the 64-byte step plus the offset-16 addressing.  */
	sub	limit, limit, 64 + 16

	.p2align 4
	/* NEON loop: compare 64 bytes per iteration.  XOR each 16-byte pair,
	   then UMAXP-reduce the four XOR results down to one 64-bit value in
	   d0, which is nonzero iff any byte differed.  The offset-64 loads
	   pre-increment src1/src2 by 64 (writeback).  */
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	/* If more data remains (HI from the subs), test tmp for zero;
	   otherwise force NE so the loop exits.  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16	/* Undo the bias applied above.  */
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  tmp
	   holds one reduced byte per 8-byte chunk of the 64-byte block, which
	   after writeback spans [src1 - 48, src1 + 16).  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp		/* Bit index of the first nonzero chunk.  */
	bic	tmp, tmp, 7		/* Round down to an 8-byte boundary...  */
	sub	tmp, tmp, 48		/* ...and rebase to the block start.  */
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* The chunks are known to differ: result is -1 or 1.  */
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (memcmp)
197#endif
198