1/*
2 * memset - fill memory with a constant byte
3 *
4 * Copyright (c) 2012-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11 *
12 */
13
14#include <picolibc.h>
15
16#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
17/* See memset-stub.c  */
18#else
19#include "asmdefs.h"
20
21#define dstin	x0
22#define val	x1
23#define valw	w1
24#define count	x2
25#define dst	x3
26#define dstend	x4
27#define zva_val	x5
28
ENTRY (memset)
	PTR_ARG (0)
	SIZE_ARG (2)

	/* Replicate the low byte of the fill value into all 16 byte
	   lanes of v0; memset only uses the least significant byte.  */
	dup	v0.16B, valw
	add	dstend, dstin, count	/* dstend = one past the last byte.  */

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]		/* Fill pattern into a GPR for small stores.  */

	/* Set 0..15 bytes.  Test the size bits of count and use pairs of
	   potentially overlapping stores anchored at both ends of the
	   buffer, so no branchy byte loop is needed.  */
	tbz	count, 3, 1f		/* count < 8?  */
	str	val, [dstin]
	str	val, [dstend, -8]	/* May overlap the first store.  */
	ret
	.p2align 4
1:	tbz	count, 2, 2f		/* count < 4?  */
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f		/* count == 0: nothing to do.  */
	strb	valw, [dstin]		/* Here count is 1, 2 or 3.  */
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  Overlapping 16-byte stores from the start
	   and the end cover the whole range (for count == 16 the head
	   and tail stores coincide exactly).  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)	/* count >= 64: take the 64..96 path.  */
	str	q0, [dstend, -16]
	tbz	count, 5, 1f		/* count < 32: the two stores above suffice.  */
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
L(set_long):
	/* More than 96 bytes.  Use DC ZVA (zero a whole cache-line-sized
	   block) only when the fill byte is zero and count >= 160;
	   otherwise fall through to the plain store loop.  */
	and	valw, valw, 255
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16 bytes.  */
	str	q0, [dstin]		/* Unaligned head store.  */
	cmp	count, 160
	ccmp	valw, 0, 0, hs		/* NE if count < 160 or fill byte != 0.  */
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	/* DCZID_EL0 bits [3:0] = log2 (ZVA size in words), bit 4 = ZVA
	   prohibited; value 4 means ZVA permitted with a 64-byte block.  */
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
	/* Store the rest of the first 64-byte block, then align dst down
	   to 64 so every DC ZVA in the loop targets a fully-owned line.  */
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

	.p2align 4
L(zva_loop):
	add	dst, dst, 64
	dc	zva, dst		/* Zero one whole 64-byte block.  */
	subs	count, count, 64
	b.hi	L(zva_loop)
	/* Finish with an overlapping 64-byte tail from the end.  */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(no_zva):
	/* Plain store loop: 64 bytes of aligned stp stores per iteration,
	   finishing with an overlapping 64-byte tail from the end.  */
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!	/* Pre-index writeback: dst += 64.  */
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (memset)
122#endif
123