1
2 /* ----------------------------------------------------------------------
3 * Project: CMSIS DSP Library
4 * Title: arm_fir_f16.c
5 * Description: Floating-point FIR filter processing function
6 *
7 * $Date: 23 April 2021
8 * $Revision: V1.9.0
9 *
10 * Target Processor: Cortex-M and Cortex-A cores
11 * -------------------------------------------------------------------- */
12 /*
13 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14 *
15 * SPDX-License-Identifier: Apache-2.0
16 *
17 * Licensed under the Apache License, Version 2.0 (the License); you may
18 * not use this file except in compliance with the License.
19 * You may obtain a copy of the License at
20 *
21 * www.apache.org/licenses/LICENSE-2.0
22 *
23 * Unless required by applicable law or agreed to in writing, software
24 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26 * See the License for the specific language governing permissions and
27 * limitations under the License.
28 */
29
30 #include "dsp/filtering_functions_f16.h"
31
32 #if defined(ARM_FLOAT16_SUPPORTED)
33 /**
34 @ingroup groupFilters
35 */
36
37
38 /**
39 @addtogroup FIR
40 @{
41 */
42
/**
  @brief         Processing function for the half-precision floating-point (f16) FIR filter.
  @param[in]     S          points to an instance of the floating-point FIR filter structure
  @param[in]     pSrc       points to the block of input data
  @param[out]    pDst       points to the block of output data
  @param[in]     blockSize  number of samples to process
 */
50
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52
/*
 * Number of coefficients consumed per pass by the generic Helium kernel.
 * The name still says F32 although this file works on f16 data; it is kept
 * unchanged because the rest of the file refers to it under this name.
 */
#define FIR_F32_MAX_COEF_BLK      8

/*
 * Computes 8 consecutive FIR outputs for a filter of nbTaps coefficients.
 *
 *   pSamples : pointer to the first history sample of the 8-output window
 *   c        : array holding at least nbTaps coefficients
 *   nbTaps   : number of taps to accumulate
 *
 * The macro reads and writes two float16x8_t variables that MUST exist in
 * the caller's scope:
 *   vecIn0   : scratch vector for the loaded samples
 *   vecAcc0  : receives the 8 accumulated outputs
 *
 * Each tap loads 8 samples starting at pSamples[tap], so the macro reads
 * pSamples[0 .. nbTaps + 6].
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement, and
 * every argument is parenthesized so that expressions can be passed safely.
 */
#define FIR_F16_CORE(pSamples, c, nbTaps)                                  \
    do {                                                                   \
        vecAcc0 = vdupq_n_f16(0.0f16);                                     \
        for (int tapIdx = 0; tapIdx < (nbTaps); tapIdx++) {                \
            vecIn0 = vld1q(&(pSamples)[tapIdx]);                           \
            vecAcc0 = vfmaq(vecAcc0, vecIn0, (c)[tapIdx]);                 \
        }                                                                  \
    } while (0)
61
62 #define NB_TAPS 4
arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,const float16_t * __restrict pSrc,float16_t * __restrict pDst,uint32_t blockSize)63 __STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,
64 const float16_t * __restrict pSrc,
65 float16_t * __restrict pDst, uint32_t blockSize)
66 {
67 float16_t *pState = S->pState; /* State pointer */
68 const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
69 float16_t *pStateCur; /* Points to the current sample of the state */
70 const float16_t *pSamples; /* Temporary pointer to the sample buffer */
71 float16_t *pOutput; /* Temporary pointer to the output buffer */
72 const float16_t *pTempSrc; /* Temporary pointer to the source data */
73 float16_t *pTempDest; /* Temporary pointer to the destination buffer */
74 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
75 int32_t blkCnt;
76 float16x8_t vecIn0;
77 float16x8_t vecAcc0;
78 float16_t c[NB_TAPS];
79
80
81 /*
82 * pState points to state array which contains previous frame (numTaps - 1) samples
83 * pStateCur points to the location where the new input data should be written
84 */
85 pStateCur = &(pState[(numTaps - 1u)]);
86 /*
87 * Copy new data into state so that we obtain a continuous sample buffer
88 * containing both the tail end of the old data and the new data.
89 */
90 pSamples = pState;
91 pTempSrc = pSrc;
92 pOutput = pDst;
93
94 for (int i = 0; i < NB_TAPS; i++)
95 c[i] = pCoeffs[i];
96
97 blkCnt = blockSize >> 3;
98 while (blkCnt > 0) {
99 /*
100 * Save 8 input samples in the history buffer
101 */
102 vst1q(pStateCur, vld1q(pTempSrc));
103 pStateCur += 8;
104 pTempSrc += 8;
105
106 FIR_F16_CORE(pSamples, c, NB_TAPS);
107
108 vst1q(pOutput, vecAcc0);
109
110 pOutput += 8;
111 pSamples += 8;
112
113 blkCnt--;
114 }
115
116 blkCnt = blockSize & 7;
117 if (blkCnt)
118 {
119 mve_pred16_t p0 = vctp16q(blkCnt);
120
121 vst1q(pStateCur, vld1q(pTempSrc));
122 pStateCur += 8;
123 pTempSrc += 8;
124
125 FIR_F16_CORE(pSamples, c, NB_TAPS);
126
127 vstrhq_p_f16(pOutput, vecAcc0, p0);
128 }
129
130 /*
131 * Copy the samples back into the history buffer start
132 */
133 pTempSrc = &pState[blockSize];
134 pTempDest = pState;
135
136 blkCnt = numTaps >> 3;
137 while (blkCnt > 0) {
138 vst1q(pTempDest, vld1q(pTempSrc));
139 pTempSrc += 8;
140 pTempDest += 8;
141 blkCnt--;
142 }
143 blkCnt = numTaps & 7;
144 if (blkCnt > 0) {
145 mve_pred16_t p0 = vctp16q(blkCnt);
146 vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
147 }
148
149 }
150 #undef NB_TAPS
151
152 #define NB_TAPS 8
arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,const float16_t * __restrict pSrc,float16_t * __restrict pDst,uint32_t blockSize)153 __STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,
154 const float16_t * __restrict pSrc,
155 float16_t * __restrict pDst, uint32_t blockSize)
156 {
157 float16_t *pState = S->pState; /* State pointer */
158 const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
159 float16_t *pStateCur; /* Points to the current sample of the state */
160 const float16_t *pSamples; /* Temporary pointer to the sample buffer */
161 float16_t *pOutput; /* Temporary pointer to the output buffer */
162 const float16_t *pTempSrc; /* Temporary pointer to the source data */
163 float16_t *pTempDest; /* Temporary pointer to the destination buffer */
164 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
165 int32_t blkCnt;
166 float16x8_t vecIn0;
167 float16x8_t vecAcc0;
168 float16_t c[NB_TAPS];
169
170
171 /*
172 * pState points to state array which contains previous frame (numTaps - 1) samples
173 * pStateCur points to the location where the new input data should be written
174 */
175 pStateCur = &(pState[(numTaps - 1u)]);
176 /*
177 * Copy new data into state so that we obtain a continuous sample buffer
178 * containing both the tail end of the old data and the new data.
179 */
180 pSamples = pState;
181 pTempSrc = pSrc;
182 pOutput = pDst;
183
184 for (int i = 0; i < NB_TAPS; i++)
185 c[i] = pCoeffs[i];
186
187 blkCnt = blockSize >> 3;
188 while (blkCnt > 0) {
189 /*
190 * Save 8 input samples in the history buffer
191 */
192 vst1q(pStateCur, vld1q(pTempSrc));
193 pStateCur += 8;
194 pTempSrc += 8;
195
196 FIR_F16_CORE(pSamples, c, NB_TAPS);
197
198 vst1q(pOutput, vecAcc0);
199
200 pOutput += 8;
201 pSamples += 8;
202
203 blkCnt--;
204 }
205
206 blkCnt = blockSize & 7;
207 if (blkCnt)
208 {
209 mve_pred16_t p0 = vctp16q(blkCnt);
210
211 vst1q(pStateCur, vld1q(pTempSrc));
212 pStateCur += 8;
213 pTempSrc += 8;
214
215 FIR_F16_CORE(pSamples, c, NB_TAPS);
216
217 vstrhq_p_f16(pOutput, vecAcc0, p0);
218 }
219
220 /*
221 * Copy the samples back into the history buffer start
222 */
223 pTempSrc = &pState[blockSize];
224 pTempDest = pState;
225
226 blkCnt = numTaps >> 3;
227 while (blkCnt > 0) {
228 vst1q(pTempDest, vld1q(pTempSrc));
229 pTempSrc += 8;
230 pTempDest += 8;
231 blkCnt--;
232 }
233 blkCnt = numTaps & 7;
234 if (blkCnt > 0) {
235 mve_pred16_t p0 = vctp16q(blkCnt);
236 vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
237 }
238
239 }
240 #undef NB_TAPS
241
arm_fir_f16(const arm_fir_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)242 ARM_DSP_ATTRIBUTE void arm_fir_f16(const arm_fir_instance_f16 * S,
243 const float16_t * pSrc,
244 float16_t * pDst,
245 uint32_t blockSize)
246 {
247 float16_t *pRefStatePtr = S->pState + ARM_ROUND_UP(blockSize, 8);
248 float16_t *pState = pRefStatePtr ; /* State pointer */
249 const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
250 const float16_t *pSamples; /* Temporary pointer to the sample buffer */
251 float16_t *pOutput; /* Temporary pointer to the output buffer */
252 const float16_t *pTempSrc; /* Temporary pointer to the source data */
253 float16_t *pTempDest; /* Temporary pointer to the destination buffer */
254 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
255 uint32_t blkCnt;
256 float16_t c0, c1, c2, c3;
257 float16_t c4, c5, c6, c7;
258
259 /*
260 * [1 to 8 taps] specialized routines
261 */
262 if (numTaps <= 4) {
263 arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
264 return;
265 } else if (numTaps <= 8) {
266 arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
267 return;
268 }
269
270 pTempSrc = pSrc;
271 pTempDest = &(pState[(numTaps - 1u)]);
272 int cnt = blockSize;
273 do {
274 mve_pred16_t p0 = vctp16q(cnt);
275 vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
276 pTempDest += 8;
277 pTempSrc += 8;
278 cnt -= 8;
279 } while (cnt > 0);
280
281 float16_t *partial_accu_ptr = S->pState;
282
283 pSamples = pState;
284 c0 = *pCoeffs++;
285 c1 = *pCoeffs++;
286 c2 = *pCoeffs++;
287 c3 = *pCoeffs++;
288 c4 = *pCoeffs++;
289 c5 = *pCoeffs++;
290 c6 = *pCoeffs++;
291 c7 = *pCoeffs++;
292
293 cnt = blockSize >> 3;
294 while (cnt > 0) {
295 float16x8_t vecAcc0;
296 float16x8_t vecIn0;
297
298 vecIn0 = vld1q(pSamples);
299 vecAcc0 = vmulq(vecIn0, c0);
300 vecIn0 = vld1q(&pSamples[1]);
301 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
302 vecIn0 = vld1q(&pSamples[2]);
303 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
304 vecIn0 = vld1q(&pSamples[3]);
305 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
306 vecIn0 = vld1q(&pSamples[4]);
307 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
308 vecIn0 = vld1q(&pSamples[5]);
309 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
310 vecIn0 = vld1q(&pSamples[6]);
311 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
312 vecIn0 = vld1q(&pSamples[7]);
313 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
314 pSamples += 8;
315 vst1q(partial_accu_ptr, vecAcc0);
316 cnt--;
317 partial_accu_ptr += 8;
318 }
319
320 cnt = blockSize & 7;
321 if (cnt > 0) {
322 float16x8_t vecAcc0;
323 float16x8_t vecIn0;
324
325 mve_pred16_t p0 = vctp16q(cnt);
326
327
328 vecIn0 = vld1q(pSamples);
329 vecAcc0 = vmulq(vecIn0, c0);
330 vecIn0 = vld1q(&pSamples[1]);
331 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
332 vecIn0 = vld1q(&pSamples[2]);
333 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
334 vecIn0 = vld1q(&pSamples[3]);
335 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
336 vecIn0 = vld1q(&pSamples[4]);
337 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
338 vecIn0 = vld1q(&pSamples[5]);
339 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
340 vecIn0 = vld1q(&pSamples[6]);
341 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
342 vecIn0 = vld1q(&pSamples[7]);
343 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
344 vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
345 }
346
347 int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
348 int sample_offset = FIR_F32_MAX_COEF_BLK;
349 while (localTaps > FIR_F32_MAX_COEF_BLK) {
350 c0 = *pCoeffs++;
351 c1 = *pCoeffs++;
352 c2 = *pCoeffs++;
353 c3 = *pCoeffs++;
354 c4 = *pCoeffs++;
355 c5 = *pCoeffs++;
356 c6 = *pCoeffs++;
357 c7 = *pCoeffs++;
358
359 partial_accu_ptr = S->pState;
360 pSamples = pState + sample_offset;
361 int cnt = blockSize >> 3;
362 while (cnt > 0) {
363 float16x8_t vecAcc0;
364 float16x8_t vecIn0;
365
366
367 vecIn0 = vld1q(pSamples);
368 vecAcc0 = vmulq(vecIn0, c0);
369 vecIn0 = vld1q(&pSamples[1]);
370 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
371 vecIn0 = vld1q(&pSamples[2]);
372 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
373 vecIn0 = vld1q(&pSamples[3]);
374 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
375 vecIn0 = vld1q(&pSamples[4]);
376 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
377 vecIn0 = vld1q(&pSamples[5]);
378 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
379 vecIn0 = vld1q(&pSamples[6]);
380 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
381 vecIn0 = vld1q(&pSamples[7]);
382 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
383 pSamples += 8;
384 vecAcc0 += vld1q_f16(partial_accu_ptr);
385 vst1q(partial_accu_ptr, vecAcc0);
386 cnt--;
387 partial_accu_ptr += 8;
388 }
389
390 cnt = blockSize & 7;
391 if (cnt > 0) {
392 float16x8_t vecAcc0;
393 float16x8_t vecIn0;
394
395 mve_pred16_t p0 = vctp16q(cnt);
396
397 vecIn0 = vld1q(pSamples);
398 vecAcc0 = vmulq(vecIn0, c0);
399 vecIn0 = vld1q(&pSamples[1]);
400 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
401 vecIn0 = vld1q(&pSamples[2]);
402 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
403 vecIn0 = vld1q(&pSamples[3]);
404 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
405 vecIn0 = vld1q(&pSamples[4]);
406 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
407 vecIn0 = vld1q(&pSamples[5]);
408 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
409 vecIn0 = vld1q(&pSamples[6]);
410 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
411 vecIn0 = vld1q(&pSamples[7]);
412 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
413 vecAcc0 += vld1q_f16(partial_accu_ptr);
414 vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
415 }
416
417 localTaps -= FIR_F32_MAX_COEF_BLK;
418 sample_offset += FIR_F32_MAX_COEF_BLK;
419 }
420
421 pSamples = pState + sample_offset;
422
423 if (localTaps > 4) {
424 c0 = *pCoeffs++;
425 c1 = *pCoeffs++;
426 c2 = *pCoeffs++;
427 c3 = *pCoeffs++;
428 c4 = *pCoeffs++;
429 c5 = *pCoeffs++;
430 c6 = *pCoeffs++;
431 c7 = *pCoeffs++;
432 pOutput = pDst;
433
434 partial_accu_ptr = S->pState;
435 cnt = blockSize >> 3;
436 while (cnt > 0) {
437 float16x8_t vecAcc0;
438 float16x8_t vecIn0;
439
440 vecIn0 = vld1q(pSamples);
441 vecAcc0 = vmulq(vecIn0, c0);
442 vecIn0 = vld1q(&pSamples[1]);
443 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
444 vecIn0 = vld1q(&pSamples[2]);
445 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
446 vecIn0 = vld1q(&pSamples[3]);
447 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
448 vecIn0 = vld1q(&pSamples[4]);
449 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
450 vecIn0 = vld1q(&pSamples[5]);
451 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
452 vecIn0 = vld1q(&pSamples[6]);
453 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
454 vecIn0 = vld1q(&pSamples[7]);
455 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
456 pSamples += 8;
457 float16x8_t pap = vld1q_f16(partial_accu_ptr);
458 vst1q(pOutput, vecAcc0 + pap);
459 cnt--;
460 partial_accu_ptr += 8;
461 pOutput += 8;
462 }
463
464 cnt = blockSize & 7;
465 if (cnt > 0) {
466 float16x8_t vecAcc0;
467 float16x8_t vecIn0;
468
469 mve_pred16_t p0 = vctp16q(cnt);
470
471 vecIn0 = vld1q(pSamples);
472 vecAcc0 = vmulq(vecIn0, c0);
473 vecIn0 = vld1q(&pSamples[1]);
474 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
475 vecIn0 = vld1q(&pSamples[2]);
476 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
477 vecIn0 = vld1q(&pSamples[3]);
478 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
479 vecIn0 = vld1q(&pSamples[4]);
480 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
481 vecIn0 = vld1q(&pSamples[5]);
482 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
483 vecIn0 = vld1q(&pSamples[6]);
484 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
485 vecIn0 = vld1q(&pSamples[7]);
486 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
487 float16x8_t pap = vld1q_f16(partial_accu_ptr);
488 vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
489 pOutput += cnt;
490 }
491
492 } else {
493 c0 = *pCoeffs++;
494 c1 = *pCoeffs++;
495 c2 = *pCoeffs++;
496 c3 = *pCoeffs++;
497 pOutput = pDst;
498
499 partial_accu_ptr = S->pState;
500 cnt = blockSize >> 3;
501 while (cnt > 0) {
502 float16x8_t vecAcc0;
503 float16x8_t vecIn0;
504
505 vecIn0 = vld1q(pSamples);
506 vecAcc0 = vmulq(vecIn0, c0);
507 vecIn0 = vld1q(&pSamples[1]);
508 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
509 vecIn0 = vld1q(&pSamples[2]);
510 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
511 vecIn0 = vld1q(&pSamples[3]);
512 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
513 pSamples += 8;
514 float16x8_t pap = vld1q_f16(partial_accu_ptr);
515 vst1q(pOutput, vecAcc0 + pap);
516 cnt--;
517 partial_accu_ptr += 8;
518 pOutput += 8;
519 }
520
521 cnt = blockSize & 7;
522 if (cnt > 0) {
523 float16x8_t vecAcc0;
524 float16x8_t vecIn0;
525
526 mve_pred16_t p0 = vctp16q(cnt);
527
528 vecIn0 = vld1q(pSamples);
529 vecAcc0 = vmulq(vecIn0, c0);
530 vecIn0 = vld1q(&pSamples[1]);
531 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
532 vecIn0 = vld1q(&pSamples[2]);
533 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
534 vecIn0 = vld1q(&pSamples[3]);
535 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
536 float16x8_t pap = vld1q_f16(partial_accu_ptr);
537 vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
538 pOutput += cnt;
539 }
540 }
541
542 /*
543 * Copy the samples back into the history buffer start
544 */
545 pTempSrc = &pState[blockSize];
546 pTempDest = pState;
547
548 blkCnt = numTaps >> 3;
549 while (blkCnt > 0U) {
550 vst1q(pTempDest, vld1q(pTempSrc));
551 pTempSrc += 8;
552 pTempDest += 8;
553 blkCnt--;
554 }
555 blkCnt = numTaps & 7;
556 if (blkCnt > 0U) {
557 mve_pred16_t p0 = vctp16q(blkCnt);
558 vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
559 }
560 }
561
562 #else
563
arm_fir_f16(const arm_fir_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)564 ARM_DSP_ATTRIBUTE void arm_fir_f16(
565 const arm_fir_instance_f16 * S,
566 const float16_t * pSrc,
567 float16_t * pDst,
568 uint32_t blockSize)
569 {
570 float16_t *pState = S->pState; /* State pointer */
571 const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
572 float16_t *pStateCurnt; /* Points to the current sample of the state */
573 float16_t *px; /* Temporary pointer for state buffer */
574 const float16_t *pb; /* Temporary pointer for coefficient buffer */
575 _Float16 acc0; /* Accumulator */
576 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
577 uint32_t i, tapCnt, blkCnt; /* Loop counters */
578
579 #if defined (ARM_MATH_LOOPUNROLL)
580 _Float16 acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
581 _Float16 x0, x1, x2, x3, x4, x5, x6, x7; /* Temporary variables to hold state values */
582 _Float16 c0; /* Temporary variable to hold coefficient value */
583 #endif
584
585 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
586 /* pStateCurnt points to the location where the new input data should be written */
587 pStateCurnt = &(S->pState[(numTaps - 1U)]);
588
589 #if defined (ARM_MATH_LOOPUNROLL)
590
591 /* Loop unrolling: Compute 8 output values simultaneously.
592 * The variables acc0 ... acc7 hold output values that are being computed:
593 *
594 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
595 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
596 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
597 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
598 */
599
600 blkCnt = blockSize >> 3U;
601
602 while (blkCnt > 0U)
603 {
604 /* Copy 4 new input samples into the state buffer. */
605 *pStateCurnt++ = *pSrc++;
606 *pStateCurnt++ = *pSrc++;
607 *pStateCurnt++ = *pSrc++;
608 *pStateCurnt++ = *pSrc++;
609
610 /* Set all accumulators to zero */
611 acc0 = 0.0f;
612 acc1 = 0.0f;
613 acc2 = 0.0f;
614 acc3 = 0.0f;
615 acc4 = 0.0f;
616 acc5 = 0.0f;
617 acc6 = 0.0f;
618 acc7 = 0.0f;
619
620 /* Initialize state pointer */
621 px = pState;
622
623 /* Initialize coefficient pointer */
624 pb = pCoeffs;
625
626 /* This is separated from the others to avoid
627 * a call to __aeabi_memmove which would be slower
628 */
629 *pStateCurnt++ = *pSrc++;
630 *pStateCurnt++ = *pSrc++;
631 *pStateCurnt++ = *pSrc++;
632 *pStateCurnt++ = *pSrc++;
633
634 /* Read the first 7 samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
635 x0 = *px++;
636 x1 = *px++;
637 x2 = *px++;
638 x3 = *px++;
639 x4 = *px++;
640 x5 = *px++;
641 x6 = *px++;
642
643 /* Loop unrolling: process 8 taps at a time. */
644 tapCnt = numTaps >> 3U;
645
646 while (tapCnt > 0U)
647 {
648 /* Read the b[numTaps-1] coefficient */
649 c0 = *(pb++);
650
651 /* Read x[n-numTaps-3] sample */
652 x7 = *(px++);
653
654 /* acc0 += b[numTaps-1] * x[n-numTaps] */
655 acc0 += x0 * c0;
656
657 /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
658 acc1 += x1 * c0;
659
660 /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
661 acc2 += x2 * c0;
662
663 /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
664 acc3 += x3 * c0;
665
666 /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
667 acc4 += x4 * c0;
668
669 /* acc1 += b[numTaps-1] * x[n-numTaps-5] */
670 acc5 += x5 * c0;
671
672 /* acc2 += b[numTaps-1] * x[n-numTaps-6] */
673 acc6 += x6 * c0;
674
675 /* acc3 += b[numTaps-1] * x[n-numTaps-7] */
676 acc7 += x7 * c0;
677
678 /* Read the b[numTaps-2] coefficient */
679 c0 = *(pb++);
680
681 /* Read x[n-numTaps-4] sample */
682 x0 = *(px++);
683
684 /* Perform the multiply-accumulate */
685 acc0 += x1 * c0;
686 acc1 += x2 * c0;
687 acc2 += x3 * c0;
688 acc3 += x4 * c0;
689 acc4 += x5 * c0;
690 acc5 += x6 * c0;
691 acc6 += x7 * c0;
692 acc7 += x0 * c0;
693
694 /* Read the b[numTaps-3] coefficient */
695 c0 = *(pb++);
696
697 /* Read x[n-numTaps-5] sample */
698 x1 = *(px++);
699
700 /* Perform the multiply-accumulates */
701 acc0 += x2 * c0;
702 acc1 += x3 * c0;
703 acc2 += x4 * c0;
704 acc3 += x5 * c0;
705 acc4 += x6 * c0;
706 acc5 += x7 * c0;
707 acc6 += x0 * c0;
708 acc7 += x1 * c0;
709
710 /* Read the b[numTaps-4] coefficient */
711 c0 = *(pb++);
712
713 /* Read x[n-numTaps-6] sample */
714 x2 = *(px++);
715
716 /* Perform the multiply-accumulates */
717 acc0 += x3 * c0;
718 acc1 += x4 * c0;
719 acc2 += x5 * c0;
720 acc3 += x6 * c0;
721 acc4 += x7 * c0;
722 acc5 += x0 * c0;
723 acc6 += x1 * c0;
724 acc7 += x2 * c0;
725
726 /* Read the b[numTaps-4] coefficient */
727 c0 = *(pb++);
728
729 /* Read x[n-numTaps-6] sample */
730 x3 = *(px++);
731 /* Perform the multiply-accumulates */
732 acc0 += x4 * c0;
733 acc1 += x5 * c0;
734 acc2 += x6 * c0;
735 acc3 += x7 * c0;
736 acc4 += x0 * c0;
737 acc5 += x1 * c0;
738 acc6 += x2 * c0;
739 acc7 += x3 * c0;
740
741 /* Read the b[numTaps-4] coefficient */
742 c0 = *(pb++);
743
744 /* Read x[n-numTaps-6] sample */
745 x4 = *(px++);
746
747 /* Perform the multiply-accumulates */
748 acc0 += x5 * c0;
749 acc1 += x6 * c0;
750 acc2 += x7 * c0;
751 acc3 += x0 * c0;
752 acc4 += x1 * c0;
753 acc5 += x2 * c0;
754 acc6 += x3 * c0;
755 acc7 += x4 * c0;
756
757 /* Read the b[numTaps-4] coefficient */
758 c0 = *(pb++);
759
760 /* Read x[n-numTaps-6] sample */
761 x5 = *(px++);
762
763 /* Perform the multiply-accumulates */
764 acc0 += x6 * c0;
765 acc1 += x7 * c0;
766 acc2 += x0 * c0;
767 acc3 += x1 * c0;
768 acc4 += x2 * c0;
769 acc5 += x3 * c0;
770 acc6 += x4 * c0;
771 acc7 += x5 * c0;
772
773 /* Read the b[numTaps-4] coefficient */
774 c0 = *(pb++);
775
776 /* Read x[n-numTaps-6] sample */
777 x6 = *(px++);
778
779 /* Perform the multiply-accumulates */
780 acc0 += x7 * c0;
781 acc1 += x0 * c0;
782 acc2 += x1 * c0;
783 acc3 += x2 * c0;
784 acc4 += x3 * c0;
785 acc5 += x4 * c0;
786 acc6 += x5 * c0;
787 acc7 += x6 * c0;
788
789 /* Decrement loop counter */
790 tapCnt--;
791 }
792
793 /* Loop unrolling: Compute remaining outputs */
794 tapCnt = numTaps % 0x8U;
795
796 while (tapCnt > 0U)
797 {
798 /* Read coefficients */
799 c0 = *(pb++);
800
801 /* Fetch 1 state variable */
802 x7 = *(px++);
803
804 /* Perform the multiply-accumulates */
805 acc0 += x0 * c0;
806 acc1 += x1 * c0;
807 acc2 += x2 * c0;
808 acc3 += x3 * c0;
809 acc4 += x4 * c0;
810 acc5 += x5 * c0;
811 acc6 += x6 * c0;
812 acc7 += x7 * c0;
813
814 /* Reuse the present sample states for next sample */
815 x0 = x1;
816 x1 = x2;
817 x2 = x3;
818 x3 = x4;
819 x4 = x5;
820 x5 = x6;
821 x6 = x7;
822
823 /* Decrement loop counter */
824 tapCnt--;
825 }
826
827 /* Advance the state pointer by 8 to process the next group of 8 samples */
828 pState = pState + 8;
829
830 /* The results in the 8 accumulators, store in the destination buffer. */
831 *pDst++ = acc0;
832 *pDst++ = acc1;
833 *pDst++ = acc2;
834 *pDst++ = acc3;
835 *pDst++ = acc4;
836 *pDst++ = acc5;
837 *pDst++ = acc6;
838 *pDst++ = acc7;
839
840
841 /* Decrement loop counter */
842 blkCnt--;
843 }
844
845 /* Loop unrolling: Compute remaining output samples */
846 blkCnt = blockSize % 0x8U;
847
848 #else
849
850 /* Initialize blkCnt with number of taps */
851 blkCnt = blockSize;
852
853 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
854
855 while (blkCnt > 0U)
856 {
857 /* Copy one sample at a time into state buffer */
858 *pStateCurnt++ = *pSrc++;
859
860 /* Set the accumulator to zero */
861 acc0 = 0.0f;
862
863 /* Initialize state pointer */
864 px = pState;
865
866 /* Initialize Coefficient pointer */
867 pb = pCoeffs;
868
869 i = numTaps;
870
871 /* Perform the multiply-accumulates */
872 while (i > 0U)
873 {
874 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
875 acc0 += (_Float16)*px++ * (_Float16)*pb++;
876
877 i--;
878 }
879
880 /* Store result in destination buffer. */
881 *pDst++ = acc0;
882
883 /* Advance state pointer by 1 for the next sample */
884 pState = pState + 1U;
885
886 /* Decrement loop counter */
887 blkCnt--;
888 }
889
890 /* Processing is complete.
891 Now copy the last numTaps - 1 samples to the start of the state buffer.
892 This prepares the state buffer for the next function call. */
893
894 /* Points to the start of the state buffer */
895 pStateCurnt = S->pState;
896
897 #if defined (ARM_MATH_LOOPUNROLL)
898
899 /* Loop unrolling: Compute 4 taps at a time */
900 tapCnt = (numTaps - 1U) >> 2U;
901
902 /* Copy data */
903 while (tapCnt > 0U)
904 {
905 *pStateCurnt++ = *pState++;
906 *pStateCurnt++ = *pState++;
907 *pStateCurnt++ = *pState++;
908 *pStateCurnt++ = *pState++;
909
910 /* Decrement loop counter */
911 tapCnt--;
912 }
913
914 /* Calculate remaining number of copies */
915 tapCnt = (numTaps - 1U) % 0x4U;
916
917 #else
918
919 /* Initialize tapCnt with number of taps */
920 tapCnt = (numTaps - 1U);
921
922 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
923
924 /* Copy remaining data */
925 while (tapCnt > 0U)
926 {
927 *pStateCurnt++ = *pState++;
928
929 /* Decrement loop counter */
930 tapCnt--;
931 }
932
933 }
934
935 #endif /* #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
936 /**
937 * @} end of FIR group
938 */
939
940 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
941