/* Copyright (c) 2012-2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */
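
/* Strategy: 0..15 byte sets use overlapping scalar stores, 16..96
   byte sets overlapping SIMD stores, and larger sets an unrolled
   store loop, switching to DC ZVA cache zeroing for zero fills of
   at least 256 bytes when the hardware permits it.  */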

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memset-stub.c  */
#else

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define tmp1	x5
#define tmp1w	w5
#define tmp2	x6
#define tmp2w	w6
#define zva_len x7
#define zva_lenw w7

#define L(l) .L ## l

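/* def_fn f p2align=N: emit f as a global function symbol in .text,
   aligned on a 2^N byte boundary.  */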
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memset p2align=6

	dup	v0.16B, valw
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]
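	/* v0 has the fill byte in all 16 lanes; val now holds it
	   replicated in all 8 bytes for the scalar stores below.  */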

	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret
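
	/* Example: for count == 12, the two 8-byte stores above cover
	   bytes 0..7 and 4..11; the overlap is harmless and avoids a
	   branch on the exact length.  */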

	/* Set 16..96 bytes.  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret
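
	/* For count == 16 the two 16-byte stores fully overlap; counts
	   of 32..63 take two further overlapping stores; 64..96 branch
	   to set96 below.  */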

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret
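
	/* The 64 bytes stored from the start and the 32 from the end
	   overlap whenever count < 96 and meet exactly at count == 96.  */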

	.p2align 3
	nop
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 256
	ccmp	valw, 0, 0, cs
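	/* EQ now means count >= 256 and the fill byte is zero.  DC ZVA
	   can only write zeroes, so only that case tries the ZVA path.  */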
	b.eq	L(try_zva)
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
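	/* DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits
	   3:0 (BS) hold log2 of the block size in 4-byte words, so
	   BS == 4 is a 64-byte block and BS == 5 a 128-byte block.  */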
L(try_zva):
	mrs	tmp1, dczid_el0
	tbnz	tmp1w, 4, L(no_zva)
	and	tmp1w, tmp1w, 15
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret
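
	/* The 128+64+64 bias makes the loop exit with 64..128 bytes
	   still to set, all inside the buffer, so the four trailing
	   stores finish without writing past dstend.  */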

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret
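
	/* The loop exits with at most 128 bytes left, which the four
	   trailing stores ending at dstend always cover.  */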

L(zva_other):
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
	cmp	count, tmp1
	blo	L(no_zva)
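
	/* zva_len = 4 << BS converts the BS field to bytes.  Fall back
	   to plain stores when count cannot cover the worst-case bytes
	   written while aligning (zva_len + 64).  */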

	sub	tmp2, zva_len, 1
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
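	/* Fewer than zva_len bytes remain; restore count, rebias dst,
	   and reuse the no_zva tail loop to finish.  */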
4:	add	count, count, zva_len
	sub	dst, dst, 32		/* Bias dst for tail loop.  */
	b	L(tail64)

	.size	memset, . - memset
#endif