/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memcmp-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

18#define src1	x0
19#define src2	x1
20#define limit	x2
21#define result	w0
22
23#define data1	x3
24#define data1w	w3
25#define data2	x4
26#define data2w	w4
27#define data3	x5
28#define data3w	w5
29#define data4	x6
30#define data4w	w6
31#define tmp	x6
32#define src1end	x7
33#define src2end	x8
34
36ENTRY (memcmp)
37	PTR_ARG (0)
38	PTR_ARG (1)
39	SIZE_ARG (2)
40
41	cmp	limit, 16
42	b.lo	L(less16)
43	ldp	data1, data3, [src1]
44	ldp	data2, data4, [src2]
45	ccmp	data1, data2, 0, ne
46	ccmp	data3, data4, 0, eq
47	b.ne	L(return2)
48
49	add	src1end, src1, limit
50	add	src2end, src2, limit
51	cmp	limit, 32
52	b.ls	L(last_bytes)
53	cmp	limit, 160
54	b.hs	L(loop_align)
55	sub	limit, limit, 32
56
57	.p2align 4
58L(loop32):
59	ldp	data1, data3, [src1, 16]
60	ldp	data2, data4, [src2, 16]
61	cmp	data1, data2
62	ccmp	data3, data4, 0, eq
63	b.ne	L(return2)
64	cmp	limit, 16
65	b.ls	L(last_bytes)
66
67	ldp	data1, data3, [src1, 32]
68	ldp	data2, data4, [src2, 32]
69	cmp	data1, data2
70	ccmp	data3, data4, 0, eq
71	b.ne	L(return2)
72	add	src1, src1, 32
73	add	src2, src2, 32
74L(last64):
75	subs	limit, limit, 32
76	b.hi	L(loop32)
77
78	/* Compare last 1-16 bytes using unaligned access.  */
79L(last_bytes):
80	ldp	data1, data3, [src1end, -16]
81	ldp	data2, data4, [src2end, -16]
82L(return2):
83	cmp	data1, data2
84	csel	data1, data1, data3, ne
85	csel	data2, data2, data4, ne
86
87	/* Compare data bytes and set return value to 0, -1 or 1.  */
88L(return):
89#ifndef __AARCH64EB__
90	rev	data1, data1
91	rev	data2, data2
92#endif
93	cmp	data1, data2
94	cset	result, ne
95	cneg	result, result, lo
96	ret
97
98	.p2align 4
99L(less16):
100	add	src1end, src1, limit
101	add	src2end, src2, limit
102	tbz	limit, 3, L(less8)
103	ldr	data1, [src1]
104	ldr	data2, [src2]
105	ldr	data3, [src1end, -8]
106	ldr	data4, [src2end, -8]
107	b	L(return2)
108
109	.p2align 4
110L(less8):
111	tbz	limit, 2, L(less4)
112	ldr	data1w, [src1]
113	ldr	data2w, [src2]
114	ldr	data3w, [src1end, -4]
115	ldr	data4w, [src2end, -4]
116	b	L(return2)
117
118L(less4):
119	tbz	limit, 1, L(less2)
120	ldrh	data1w, [src1]
121	ldrh	data2w, [src2]
122	cmp	data1w, data2w
123	b.ne	L(return)
124L(less2):
125	mov	result, 0
126	tbz	limit, 0, L(return_zero)
127	ldrb	data1w, [src1end, -1]
128	ldrb	data2w, [src2end, -1]
129	sub	result, data1w, data2w
130L(return_zero):
131	ret
132
133L(loop_align):
134	ldp	data1, data3, [src1, 16]
135	ldp	data2, data4, [src2, 16]
136	cmp	data1, data2
137	ccmp	data3, data4, 0, eq
138	b.ne	L(return2)
139
140	/* Align src2 and adjust src1, src2 and limit.  */
141	and	tmp, src2, 15
142	sub	tmp, tmp, 16
143	sub	src2, src2, tmp
144	add	limit, limit, tmp
145	sub	src1, src1, tmp
146	sub	limit, limit, 64 + 16
147
148	.p2align 4
149L(loop64):
150	ldr	q0, [src1, 16]
151	ldr	q1, [src2, 16]
152	subs	limit, limit, 64
153	ldr	q2, [src1, 32]
154	ldr	q3, [src2, 32]
155	eor	v0.16b, v0.16b, v1.16b
156	eor	v1.16b, v2.16b, v3.16b
157	ldr	q2, [src1, 48]
158	ldr	q3, [src2, 48]
159	umaxp	v0.16b, v0.16b, v1.16b
160	ldr	q4, [src1, 64]!
161	ldr	q5, [src2, 64]!
162	eor	v1.16b, v2.16b, v3.16b
163	eor	v2.16b, v4.16b, v5.16b
164	umaxp	v1.16b, v1.16b, v2.16b
165	umaxp	v0.16b, v0.16b, v1.16b
166	umaxp	v0.16b, v0.16b, v0.16b
167	fmov	tmp, d0
168	ccmp	tmp, 0, 0, hi
169	b.eq	L(loop64)
170
171	/* If equal, process last 1-64 bytes using scalar loop.  */
172	add	limit, limit, 64 + 16
173	cbz	tmp, L(last64)
174
175	/* Determine the 8-byte aligned offset of the first difference.  */
176#ifdef __AARCH64EB__
177	rev16	tmp, tmp
178#endif
179	rev	tmp, tmp
180	clz	tmp, tmp
181	bic	tmp, tmp, 7
182	sub	tmp, tmp, 48
183	ldr	data1, [src1, tmp]
184	ldr	data2, [src2, tmp]
185#ifndef __AARCH64EB__
186	rev	data1, data1
187	rev	data2, data2
188#endif
189	mov	result, 1
190	cmp	data1, data2
191	cneg	result, result, lo
192	ret
193
194END (memcmp)
#endif
