/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memset-stub.c  */
#else
#include "asmdefs.h"

#define dstin	x0	/* Destination (returned unchanged).  */
#define val	x1	/* Fill value.  */
#define valw	w1
#define count	x2	/* Byte count.  */
#define dst	x3	/* Current store pointer.  */
#define dstend	x4	/* One past the end of the buffer.  */
#define zva_val	x5	/* Scratch for the DCZID_EL0 check.  */

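/* The small and medium paths store from both ends of the buffer with
   potentially overlapping stores, so no per-byte tail loop is needed.
   Large zero fills use the DC ZVA cache zeroing instruction.  */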
ENTRY (memset)
	PTR_ARG (0)
	SIZE_ARG (2)

	dup	v0.16B, valw		/* Replicate the fill byte into all 16 lanes.  */
	add	dstend, dstin, count	/* dstend = one past the last byte.  */

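	/* Dispatch on size: more than 96 bytes takes the long path,
	   16..96 bytes the medium path, and 0..15 falls through.  */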
	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]		/* val = 8 copies of the fill byte.  */

	/* Set 0..15 bytes.  Test one bit of count at a time and store
	   from both ends; the stores may overlap.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	.p2align 4
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)	/* Bit 6 set means 64..96 bytes.  */
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

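	/* More than 96 bytes: align the destination and store 64 bytes
	   per iteration.  Zero fills of at least 160 bytes use DC ZVA
	   when the ZVA block size is 64 bytes.  */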
	.p2align 4
L(set_long):
	and	valw, valw, 255		/* Isolate the fill byte for the zero test.  */
	bic	dst, dstin, 15		/* Align dst down to 16 bytes.  */
	str	q0, [dstin]
	cmp	count, 160
	ccmp	valw, 0, 0, hs		/* NE unless count >= 160 and val == 0.  */
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
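	/* DCZID_EL0 bits 3:0 give log2 of the ZVA block size in words;
	   bit 4 set means DC ZVA is prohibited.  Masking with 31 and
	   comparing against 4 accepts only an enabled 64-byte block.  */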
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
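	/* Fill up to the first 64-byte boundary, then round dst down so
	   every buffer byte below dst + 64 is already written.  */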
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

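	/* Zero one aligned 64-byte block per iteration.  The loop exits
	   with at most 64 bytes left for the trailing stores.  */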
	.p2align 4
L(zva_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva_loop)
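	/* Write the last 64 bytes; these may overlap the zeroed blocks.  */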
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

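	/* Used for non-zero fills, fills below 160 bytes, and cores where
	   DC ZVA is unavailable or the block is not 64 bytes.  */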
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
	subs	count, count, 64
	b.hi	L(no_zva_loop)
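	/* Write the last 64 bytes; these may overlap the loop stores.  */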
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (memset)
#endif