/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

#if __ARM_NEON && __ARM_ARCH_ISA_A64

#include <arm_neon.h>

#else

#include <stdint.h>


/* ----------------------------------------------------------------------------
 *  Integer
 * -------------------------------------------------------------------------- */

typedef struct { int16_t e[4]; } int16x4_t;

typedef struct { int16_t e[8]; } int16x8_t;
typedef struct { int32_t e[4]; } int32x4_t;
typedef struct { int64_t e[2]; } int64x2_t;


/**
 * Load / Store
 */

__attribute__((unused))
static int16x4_t vld1_s16(const int16_t *p)
{
    return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
}


/**
 * Arithmetic
 */

__attribute__((unused))
static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ { a.e[0] * b.e[0], a.e[1] * b.e[1],
                          a.e[2] * b.e[2], a.e[3] * b.e[3] } };
}

__attribute__((unused))
static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ {
        r.e[0] + a.e[0] * b.e[0], r.e[1] + a.e[1] * b.e[1],
        r.e[2] + a.e[2] * b.e[2], r.e[3] + a.e[3] * b.e[3] } };
}

__attribute__((unused))
static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
    /* Pairwise add the 32-bit lanes of `b`, widening to 64 bits,
     * and accumulate into the 64-bit lanes of `a`. */
    int64x2_t r;

    r.e[0] = a.e[0] + ((int64_t)b.e[0] + b.e[1]);
    r.e[1] = a.e[1] + ((int64_t)b.e[2] + b.e[3]);

    return r;
}


/**
 * Reduce
 */

__attribute__((unused))
static int32_t vaddvq_s32(int32x4_t v)
{
    return v.e[0] + v.e[1] + v.e[2] + v.e[3];
}

__attribute__((unused))
static int64_t vaddvq_s64(int64x2_t v)
{
    return v.e[0] + v.e[1];
}


/**
 * Manipulation
 */

__attribute__((unused))
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
{
    /* Extract 4 lanes from the concatenation a:b, starting at lane `n`
     * (0 <= n <= 3; `n` must be a compile-time constant on real NEON). */
    int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
                    b.e[0], b.e[1], b.e[2], b.e[3] };

    return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
}

__attribute__((unused))
static int32x4_t vmovq_n_s32(int32_t v)
{
    return (int32x4_t){ { v, v, v, v } };
}

__attribute__((unused))
static int64x2_t vmovq_n_s64(int64_t v)
{
    return (int64x2_t){ { v, v } };
}
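
/*
 * Usage sketch (illustrative, not part of the NEON API): the integer
 * intrinsics above typically chain into a widening dot product. The helper
 * name `neon_example_dot_s16` is hypothetical, and `n` is assumed to be a
 * multiple of 8.
 */
__attribute__((unused))
static int64_t neon_example_dot_s16(const int16_t *a, const int16_t *b, int n)
{
    int64x2_t acc = vmovq_n_s64(0);

    for (int i = 0; i < n; i += 8) {
        int32x4_t p = vmull_s16(vld1_s16(a + i), vld1_s16(b + i));
        p = vmlal_s16(p, vld1_s16(a + i + 4), vld1_s16(b + i + 4));
        acc = vpadalq_s32(acc, p);
    }

    return vaddvq_s64(acc);
}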


/* ----------------------------------------------------------------------------
 *  Floating Point
 * -------------------------------------------------------------------------- */

typedef struct { float e[2]; } float32x2_t;
typedef struct { float e[4]; } float32x4_t;

typedef struct { float32x2_t val[2]; } float32x2x2_t;
typedef struct { float32x4_t val[2]; } float32x4x2_t;


/**
 * Load / Store
 */

__attribute__((unused))
static float32x2_t vld1_f32(const float *p)
{
    return (float32x2_t){ { p[0], p[1] } };
}

__attribute__((unused))
static float32x4_t vld1q_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
}

__attribute__((unused))
static float32x4_t vld1q_dup_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
}

__attribute__((unused))
static float32x2x2_t vld2_f32(const float *p)
{
    /* De-interleaving load: even elements go to val[0], odd to val[1]. */
    return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
                            .val[1] = { { p[1], p[3] } } };
}

__attribute__((unused))
static float32x4x2_t vld2q_f32(const float *p)
{
    return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
                            .val[1] = { { p[1], p[3], p[5], p[7] } } };
}

__attribute__((unused))
static void vst1_f32(float *p, float32x2_t v)
{
    p[0] = v.e[0], p[1] = v.e[1];
}

__attribute__((unused))
static void vst1q_f32(float *p, float32x4_t v)
{
    p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
}

/**
 * Arithmetic
 *
 * Note: vfma(a, b, c) computes a + b * c and vfms(a, b, c) computes
 * a - b * c, matching AArch64 FMLA / FMLS. Unlike the hardware
 * instructions, these emulations round the product and the sum
 * separately rather than fusing them.
 */

__attribute__((unused))
static float32x2_t vneg_f32(float32x2_t a)
{
    return (float32x2_t){ { -a.e[0], -a.e[1] } };
}

__attribute__((unused))
static float32x4_t vnegq_f32(float32x4_t a)
{
    return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
}

__attribute__((unused))
static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
                            a.e[2] + b.e[2], a.e[3] + b.e[3] } };
}

__attribute__((unused))
static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
                            a.e[2] - b.e[2], a.e[3] - b.e[3] } };
}

__attribute__((unused))
static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
}

__attribute__((unused))
static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
        a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
}

__attribute__((unused))
static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
}

__attribute__((unused))
static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
        a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
}
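
/*
 * Usage sketch (illustrative): y[i] = b[i] + a * x[i] over four lanes,
 * combining the loads above with vfmaq_f32. The helper name
 * `neon_example_axpy4` is hypothetical.
 */
__attribute__((unused))
static void neon_example_axpy4(float *y, float a, const float *x, const float *b)
{
    vst1q_f32(y, vfmaq_f32(vld1q_f32(b), vld1q_dup_f32(&a), vld1q_f32(x)));
}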


/**
 * Manipulation
 */

__attribute__((unused))
static float32x2_t vcreate_f32(uint64_t u)
{
    /* Reinterpret the 64 bits as two floats; a union avoids the
     * strict-aliasing violation of casting `&u` to `float *`. */
    union { uint64_t u; float f[2]; } x = { .u = u };
    return (float32x2_t){ { x.f[0], x.f[1] } };
}

__attribute__((unused))
static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
{
    return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
}

__attribute__((unused))
static float32x2_t vget_low_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[0], a.e[1] } };
}

__attribute__((unused))
static float32x2_t vget_high_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[2], a.e[3] } };
}

__attribute__((unused))
static float32x4_t vmovq_n_f32(float v)
{
    return (float32x4_t){ { v, v, v, v } };
}

__attribute__((unused))
static float32x2_t vrev64_f32(float32x2_t v)
{
    /* Reverse the two elements of the 64-bit vector. */
    return (float32x2_t){ { v.e[1], v.e[0] } };
}

__attribute__((unused))
static float32x4_t vrev64q_f32(float32x4_t v)
{
    /* Reverse the elements within each 64-bit half of the vector. */
    return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
}

__attribute__((unused))
static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
{
    /* Even lanes of a and b. */
    return (float32x2_t){ { a.e[0], b.e[0] } };
}

__attribute__((unused))
static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
{
    /* Odd lanes of a and b. */
    return (float32x2_t){ { a.e[1], b.e[1] } };
}

__attribute__((unused))
static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
}

__attribute__((unused))
static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
}

__attribute__((unused))
static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
{
    /* Interleave the low halves of a and b. */
    return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
}

__attribute__((unused))
static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
{
    /* Interleave the high halves of a and b. */
    return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
}
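
/*
 * Usage sketch (illustrative): multiply two pairs of interleaved complex
 * numbers, de-interleaving with vld2_f32 and re-interleaving with the
 * vtrn1/vtrn2 pair. `neon_example_cmul2` is a hypothetical helper; `x`,
 * `y` and `out` each hold two complex values as (re, im) pairs.
 */
__attribute__((unused))
static void neon_example_cmul2(float *out, const float *x, const float *y)
{
    float32x2x2_t xv = vld2_f32(x);   /* val[0] = real parts, val[1] = imag */
    float32x2x2_t yv = vld2_f32(y);
    float32x2_t zero = vcreate_f32(0);

    /* re = xr*yr - xi*yi,  im = xr*yi + xi*yr */
    float32x2_t re = vfms_f32(vfma_f32(zero, xv.val[0], yv.val[0]),
                              xv.val[1], yv.val[1]);
    float32x2_t im = vfma_f32(vfma_f32(zero, xv.val[0], yv.val[1]),
                              xv.val[1], yv.val[0]);

    vst1_f32(out, vtrn1_f32(re, im));      /* first complex product  */
    vst1_f32(out + 2, vtrn2_f32(re, im));  /* second complex product */
}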


#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */