1 
2 /* ----------------------------------------------------------------------
3  * Project:      CMSIS DSP Library
4  * Title:        arm_fir_f16.c
5  * Description:  Floating-point FIR filter processing function
6  *
7  * $Date:        23 April 2021
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/filtering_functions_f16.h"
31 
32 #if defined(ARM_FLOAT16_SUPPORTED)
33 /**
34   @ingroup groupFilters
35  */
36 
37 
38 /**
39   @addtogroup FIR
40   @{
41  */
42 
43 /**
44   @brief         Processing function for floating-point FIR filter.
45   @param[in]     S          points to an instance of the floating-point FIR filter structure
46   @param[in]     pSrc       points to the block of input data
47   @param[out]    pDst       points to the block of output data
48   @param[in]     blockSize  number of samples to process
49  */
50 
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
/*
 * Number of coefficients consumed per accumulation pass by the generic
 * Helium arm_fir_f16() kernel. The identifier says F32 although it is used
 * by the f16 code in this file; the name is kept because that code refers
 * to it.
 */
#define FIR_F32_MAX_COEF_BLK        8

/*
 * FIR_F16_CORE(pSamples, c, nbTaps)
 *
 * Computes 8 consecutive FIR outputs at once:
 *
 *     vecAcc0[lane] = sum over i in [0, nbTaps) of c[i] * pSamples[i + lane]
 *
 * The macro relies on two float16x8_t variables named vecIn0 and vecAcc0
 * being declared in the invoking scope; the result is left in vecAcc0.
 * Each tap issues a full 8-lane load starting at &pSamples[i].
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. inside an unbraced if/else), and the arguments are
 * parenthesised so that expressions can be passed safely.
 */
#define FIR_F16_CORE(pSamples, c, nbTaps)                                  \
    do {                                                                   \
        vecAcc0 = vdupq_n_f16(0.0f16);                                     \
        for (int i = 0; i < (nbTaps); i++) {                               \
            vecIn0 = vld1q(&(pSamples)[i]);                                \
            vecAcc0 = vfmaq(vecAcc0, vecIn0, (c)[i]);                      \
        }                                                                  \
    } while (0)
61 
62 #define NB_TAPS 4
arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,const float16_t * __restrict pSrc,float16_t * __restrict pDst,uint32_t blockSize)63 __STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,
64     const float16_t * __restrict pSrc,
65     float16_t * __restrict pDst, uint32_t blockSize)
66 {
67     float16_t      *pState = S->pState;     /* State pointer */
68     const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
69     float16_t      *pStateCur;              /* Points to the current sample of the state */
70     const float16_t *pSamples;              /* Temporary pointer to the sample buffer */
71     float16_t      *pOutput;                /* Temporary pointer to the output buffer */
72     const float16_t *pTempSrc;              /* Temporary pointer to the source data */
73     float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
74     uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
75     int32_t         blkCnt;
76     float16x8_t         vecIn0;
77     float16x8_t         vecAcc0;
78     float16_t       c[NB_TAPS];
79 
80 
81     /*
82      * pState points to state array which contains previous frame (numTaps - 1) samples
83      * pStateCur points to the location where the new input data should be written
84      */
85     pStateCur = &(pState[(numTaps - 1u)]);
86     /*
87      * Copy new data into state so that we obtain a continuous sample buffer
88      * containing both the tail end of the old data and the new data.
89      */
90     pSamples = pState;
91     pTempSrc = pSrc;
92     pOutput = pDst;
93 
94     for (int i = 0; i < NB_TAPS; i++)
95         c[i] = pCoeffs[i];
96 
97     blkCnt = blockSize >> 3;
98     while (blkCnt > 0) {
99         /*
100          * Save 8 input samples in the history buffer
101          */
102         vst1q(pStateCur, vld1q(pTempSrc));
103         pStateCur += 8;
104         pTempSrc += 8;
105 
106         FIR_F16_CORE(pSamples, c, NB_TAPS);
107 
108         vst1q(pOutput, vecAcc0);
109 
110         pOutput += 8;
111         pSamples += 8;
112 
113         blkCnt--;
114     }
115 
116     blkCnt = blockSize & 7;
117     if (blkCnt)
118     {
119         mve_pred16_t    p0 = vctp16q(blkCnt);
120 
121         vst1q(pStateCur, vld1q(pTempSrc));
122         pStateCur += 8;
123         pTempSrc += 8;
124 
125         FIR_F16_CORE(pSamples, c, NB_TAPS);
126 
127         vstrhq_p_f16(pOutput, vecAcc0, p0);
128     }
129 
130     /*
131      * Copy the samples back into the history buffer start
132      */
133     pTempSrc = &pState[blockSize];
134     pTempDest = pState;
135 
136     blkCnt = numTaps >> 3;
137     while (blkCnt > 0) {
138         vst1q(pTempDest, vld1q(pTempSrc));
139         pTempSrc += 8;
140         pTempDest += 8;
141         blkCnt--;
142     }
143     blkCnt = numTaps & 7;
144     if (blkCnt > 0) {
145         mve_pred16_t    p0 = vctp16q(blkCnt);
146         vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
147     }
148 
149 }
150 #undef NB_TAPS
151 
152 #define NB_TAPS 8
arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,const float16_t * __restrict pSrc,float16_t * __restrict pDst,uint32_t blockSize)153 __STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,
154     const float16_t * __restrict pSrc,
155     float16_t * __restrict pDst, uint32_t blockSize)
156 {
157     float16_t      *pState = S->pState;     /* State pointer */
158     const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
159     float16_t      *pStateCur;              /* Points to the current sample of the state */
160     const float16_t *pSamples;              /* Temporary pointer to the sample buffer */
161     float16_t      *pOutput;                /* Temporary pointer to the output buffer */
162     const float16_t *pTempSrc;              /* Temporary pointer to the source data */
163     float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
164     uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
165     int32_t         blkCnt;
166     float16x8_t         vecIn0;
167     float16x8_t         vecAcc0;
168     float16_t       c[NB_TAPS];
169 
170 
171     /*
172      * pState points to state array which contains previous frame (numTaps - 1) samples
173      * pStateCur points to the location where the new input data should be written
174      */
175     pStateCur = &(pState[(numTaps - 1u)]);
176     /*
177      * Copy new data into state so that we obtain a continuous sample buffer
178      * containing both the tail end of the old data and the new data.
179      */
180     pSamples = pState;
181     pTempSrc = pSrc;
182     pOutput = pDst;
183 
184     for (int i = 0; i < NB_TAPS; i++)
185         c[i] = pCoeffs[i];
186 
187     blkCnt = blockSize >> 3;
188     while (blkCnt > 0) {
189         /*
190          * Save 8 input samples in the history buffer
191          */
192         vst1q(pStateCur, vld1q(pTempSrc));
193         pStateCur += 8;
194         pTempSrc += 8;
195 
196         FIR_F16_CORE(pSamples, c, NB_TAPS);
197 
198         vst1q(pOutput, vecAcc0);
199 
200         pOutput += 8;
201         pSamples += 8;
202 
203         blkCnt--;
204     }
205 
206     blkCnt = blockSize & 7;
207     if (blkCnt)
208     {
209         mve_pred16_t    p0 = vctp16q(blkCnt);
210 
211         vst1q(pStateCur, vld1q(pTempSrc));
212         pStateCur += 8;
213         pTempSrc += 8;
214 
215         FIR_F16_CORE(pSamples, c, NB_TAPS);
216 
217         vstrhq_p_f16(pOutput, vecAcc0, p0);
218     }
219 
220     /*
221      * Copy the samples back into the history buffer start
222      */
223     pTempSrc = &pState[blockSize];
224     pTempDest = pState;
225 
226     blkCnt = numTaps >> 3;
227     while (blkCnt > 0) {
228         vst1q(pTempDest, vld1q(pTempSrc));
229         pTempSrc += 8;
230         pTempDest += 8;
231         blkCnt--;
232     }
233     blkCnt = numTaps & 7;
234     if (blkCnt > 0) {
235         mve_pred16_t    p0 = vctp16q(blkCnt);
236         vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
237     }
238 
239 }
240 #undef NB_TAPS
241 
arm_fir_f16(const arm_fir_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)242 ARM_DSP_ATTRIBUTE void arm_fir_f16(const arm_fir_instance_f16 * S,
243   const float16_t * pSrc,
244   float16_t * pDst,
245   uint32_t blockSize)
246 {
247     float16_t *pRefStatePtr = S->pState + ARM_ROUND_UP(blockSize, 8);
248     float16_t *pState = pRefStatePtr ;      /* State pointer */
249     const float16_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
250     const float16_t *pSamples;  /* Temporary pointer to the sample buffer */
251     float16_t      *pOutput;    /* Temporary pointer to the output buffer */
252     const float16_t *pTempSrc;  /* Temporary pointer to the source data */
253     float16_t      *pTempDest;  /* Temporary pointer to the destination buffer */
254     uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
255     uint32_t        blkCnt;
256     float16_t       c0, c1, c2, c3;
257     float16_t       c4, c5, c6, c7;
258 
259     /*
260      * [1 to 8 taps] specialized routines
261      */
262     if (numTaps <= 4) {
263         arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
264         return;
265     } else if (numTaps <= 8) {
266         arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
267         return;
268     }
269 
270     pTempSrc = pSrc;
271     pTempDest = &(pState[(numTaps - 1u)]);
272     int             cnt = blockSize;
273     do {
274         mve_pred16_t    p0 = vctp16q(cnt);
275         vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
276         pTempDest += 8;
277         pTempSrc += 8;
278         cnt -= 8;
279     } while (cnt > 0);
280 
281     float16_t      *partial_accu_ptr = S->pState;
282 
283     pSamples = pState;
284     c0 = *pCoeffs++;
285     c1 = *pCoeffs++;
286     c2 = *pCoeffs++;
287     c3 = *pCoeffs++;
288     c4 = *pCoeffs++;
289     c5 = *pCoeffs++;
290     c6 = *pCoeffs++;
291     c7 = *pCoeffs++;
292 
293     cnt = blockSize >> 3;
294     while (cnt > 0) {
295         float16x8_t     vecAcc0;
296         float16x8_t     vecIn0;
297 
298         vecIn0 = vld1q(pSamples);
299         vecAcc0 = vmulq(vecIn0, c0);
300         vecIn0 = vld1q(&pSamples[1]);
301         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
302         vecIn0 = vld1q(&pSamples[2]);
303         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
304         vecIn0 = vld1q(&pSamples[3]);
305         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
306         vecIn0 = vld1q(&pSamples[4]);
307         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
308         vecIn0 = vld1q(&pSamples[5]);
309         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
310         vecIn0 = vld1q(&pSamples[6]);
311         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
312         vecIn0 = vld1q(&pSamples[7]);
313         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
314         pSamples += 8;
315         vst1q(partial_accu_ptr, vecAcc0);
316         cnt--;
317         partial_accu_ptr += 8;
318     }
319 
320     cnt = blockSize & 7;
321     if (cnt > 0) {
322         float16x8_t     vecAcc0;
323         float16x8_t     vecIn0;
324 
325         mve_pred16_t p0 = vctp16q(cnt);
326 
327 
328         vecIn0 = vld1q(pSamples);
329         vecAcc0 = vmulq(vecIn0, c0);
330         vecIn0 = vld1q(&pSamples[1]);
331         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
332         vecIn0 = vld1q(&pSamples[2]);
333         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
334         vecIn0 = vld1q(&pSamples[3]);
335         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
336         vecIn0 = vld1q(&pSamples[4]);
337         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
338         vecIn0 = vld1q(&pSamples[5]);
339         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
340         vecIn0 = vld1q(&pSamples[6]);
341         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
342         vecIn0 = vld1q(&pSamples[7]);
343         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
344         vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
345     }
346 
347     int             localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
348     int             sample_offset = FIR_F32_MAX_COEF_BLK;
349     while (localTaps > FIR_F32_MAX_COEF_BLK) {
350         c0 = *pCoeffs++;
351         c1 = *pCoeffs++;
352         c2 = *pCoeffs++;
353         c3 = *pCoeffs++;
354         c4 = *pCoeffs++;
355         c5 = *pCoeffs++;
356         c6 = *pCoeffs++;
357         c7 = *pCoeffs++;
358 
359         partial_accu_ptr = S->pState;
360         pSamples = pState + sample_offset;
361         int  cnt = blockSize >> 3;
362         while (cnt > 0) {
363             float16x8_t     vecAcc0;
364             float16x8_t     vecIn0;
365 
366 
367             vecIn0 = vld1q(pSamples);
368             vecAcc0 = vmulq(vecIn0, c0);
369             vecIn0 = vld1q(&pSamples[1]);
370             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
371             vecIn0 = vld1q(&pSamples[2]);
372             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
373             vecIn0 = vld1q(&pSamples[3]);
374             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
375             vecIn0 = vld1q(&pSamples[4]);
376             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
377             vecIn0 = vld1q(&pSamples[5]);
378             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
379             vecIn0 = vld1q(&pSamples[6]);
380             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
381             vecIn0 = vld1q(&pSamples[7]);
382             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
383             pSamples += 8;
384             vecAcc0 += vld1q_f16(partial_accu_ptr);
385             vst1q(partial_accu_ptr, vecAcc0);
386             cnt--;
387             partial_accu_ptr += 8;
388         }
389 
390         cnt = blockSize & 7;
391         if (cnt > 0) {
392             float16x8_t     vecAcc0;
393             float16x8_t     vecIn0;
394 
395             mve_pred16_t p0 = vctp16q(cnt);
396 
397             vecIn0 = vld1q(pSamples);
398             vecAcc0 = vmulq(vecIn0, c0);
399             vecIn0 = vld1q(&pSamples[1]);
400             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
401             vecIn0 = vld1q(&pSamples[2]);
402             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
403             vecIn0 = vld1q(&pSamples[3]);
404             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
405             vecIn0 = vld1q(&pSamples[4]);
406             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
407             vecIn0 = vld1q(&pSamples[5]);
408             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
409             vecIn0 = vld1q(&pSamples[6]);
410             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
411             vecIn0 = vld1q(&pSamples[7]);
412             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
413             vecAcc0 += vld1q_f16(partial_accu_ptr);
414             vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
415         }
416 
417         localTaps -= FIR_F32_MAX_COEF_BLK;
418         sample_offset += FIR_F32_MAX_COEF_BLK;
419     }
420 
421     pSamples = pState + sample_offset;
422 
423     if (localTaps > 4) {
424         c0 = *pCoeffs++;
425         c1 = *pCoeffs++;
426         c2 = *pCoeffs++;
427         c3 = *pCoeffs++;
428         c4 = *pCoeffs++;
429         c5 = *pCoeffs++;
430         c6 = *pCoeffs++;
431         c7 = *pCoeffs++;
432         pOutput = pDst;
433 
434         partial_accu_ptr = S->pState;
435         cnt = blockSize >> 3;
436         while (cnt > 0) {
437             float16x8_t     vecAcc0;
438             float16x8_t     vecIn0;
439 
440             vecIn0 = vld1q(pSamples);
441             vecAcc0 = vmulq(vecIn0, c0);
442             vecIn0 = vld1q(&pSamples[1]);
443             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
444             vecIn0 = vld1q(&pSamples[2]);
445             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
446             vecIn0 = vld1q(&pSamples[3]);
447             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
448             vecIn0 = vld1q(&pSamples[4]);
449             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
450             vecIn0 = vld1q(&pSamples[5]);
451             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
452             vecIn0 = vld1q(&pSamples[6]);
453             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
454             vecIn0 = vld1q(&pSamples[7]);
455             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
456             pSamples += 8;
457             float16x8_t     pap = vld1q_f16(partial_accu_ptr);
458             vst1q(pOutput, vecAcc0 + pap);
459             cnt--;
460             partial_accu_ptr += 8;
461             pOutput += 8;
462         }
463 
464         cnt = blockSize & 7;
465         if (cnt > 0) {
466             float16x8_t     vecAcc0;
467             float16x8_t     vecIn0;
468 
469             mve_pred16_t p0 = vctp16q(cnt);
470 
471             vecIn0 = vld1q(pSamples);
472             vecAcc0 = vmulq(vecIn0, c0);
473             vecIn0 = vld1q(&pSamples[1]);
474             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
475             vecIn0 = vld1q(&pSamples[2]);
476             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
477             vecIn0 = vld1q(&pSamples[3]);
478             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
479             vecIn0 = vld1q(&pSamples[4]);
480             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
481             vecIn0 = vld1q(&pSamples[5]);
482             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
483             vecIn0 = vld1q(&pSamples[6]);
484             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
485             vecIn0 = vld1q(&pSamples[7]);
486             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
487             float16x8_t     pap = vld1q_f16(partial_accu_ptr);
488             vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
489             pOutput += cnt;
490         }
491 
492     } else {
493         c0 = *pCoeffs++;
494         c1 = *pCoeffs++;
495         c2 = *pCoeffs++;
496         c3 = *pCoeffs++;
497         pOutput = pDst;
498 
499         partial_accu_ptr = S->pState;
500         cnt = blockSize >> 3;
501         while (cnt > 0) {
502             float16x8_t     vecAcc0;
503             float16x8_t     vecIn0;
504 
505             vecIn0 = vld1q(pSamples);
506             vecAcc0 = vmulq(vecIn0, c0);
507             vecIn0 = vld1q(&pSamples[1]);
508             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
509             vecIn0 = vld1q(&pSamples[2]);
510             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
511             vecIn0 = vld1q(&pSamples[3]);
512             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
513             pSamples += 8;
514             float16x8_t     pap = vld1q_f16(partial_accu_ptr);
515             vst1q(pOutput, vecAcc0 + pap);
516             cnt--;
517             partial_accu_ptr += 8;
518             pOutput += 8;
519         }
520 
521         cnt = blockSize & 7;
522         if (cnt > 0) {
523             float16x8_t     vecAcc0;
524             float16x8_t     vecIn0;
525 
526             mve_pred16_t p0 = vctp16q(cnt);
527 
528             vecIn0 = vld1q(pSamples);
529             vecAcc0 = vmulq(vecIn0, c0);
530             vecIn0 = vld1q(&pSamples[1]);
531             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
532             vecIn0 = vld1q(&pSamples[2]);
533             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
534             vecIn0 = vld1q(&pSamples[3]);
535             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
536             float16x8_t     pap = vld1q_f16(partial_accu_ptr);
537             vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
538             pOutput += cnt;
539         }
540     }
541 
542     /*
543      * Copy the samples back into the history buffer start
544      */
545     pTempSrc = &pState[blockSize];
546     pTempDest = pState;
547 
548     blkCnt = numTaps >> 3;
549     while (blkCnt > 0U) {
550         vst1q(pTempDest, vld1q(pTempSrc));
551         pTempSrc += 8;
552         pTempDest += 8;
553         blkCnt--;
554     }
555     blkCnt = numTaps & 7;
556     if (blkCnt > 0U) {
557         mve_pred16_t    p0 = vctp16q(blkCnt);
558         vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
559     }
560 }
561 
562 #else
563 
arm_fir_f16(const arm_fir_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)564 ARM_DSP_ATTRIBUTE void arm_fir_f16(
565   const arm_fir_instance_f16 * S,
566   const float16_t * pSrc,
567         float16_t * pDst,
568         uint32_t blockSize)
569 {
570         float16_t *pState = S->pState;                 /* State pointer */
571   const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
572         float16_t *pStateCurnt;                        /* Points to the current sample of the state */
573         float16_t *px;                                 /* Temporary pointer for state buffer */
574   const float16_t *pb;                                 /* Temporary pointer for coefficient buffer */
575         _Float16 acc0;                                /* Accumulator */
576         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
577         uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
578 
579 #if defined (ARM_MATH_LOOPUNROLL)
580         _Float16 acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators */
581         _Float16 x0, x1, x2, x3, x4, x5, x6, x7;               /* Temporary variables to hold state values */
582         _Float16 c0;                                           /* Temporary variable to hold coefficient value */
583 #endif
584 
585   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
586   /* pStateCurnt points to the location where the new input data should be written */
587   pStateCurnt = &(S->pState[(numTaps - 1U)]);
588 
589 #if defined (ARM_MATH_LOOPUNROLL)
590 
591   /* Loop unrolling: Compute 8 output values simultaneously.
592    * The variables acc0 ... acc7 hold output values that are being computed:
593    *
594    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
595    *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
596    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
597    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
598    */
599 
600   blkCnt = blockSize >> 3U;
601 
602   while (blkCnt > 0U)
603   {
604     /* Copy 4 new input samples into the state buffer. */
605     *pStateCurnt++ = *pSrc++;
606     *pStateCurnt++ = *pSrc++;
607     *pStateCurnt++ = *pSrc++;
608     *pStateCurnt++ = *pSrc++;
609 
610     /* Set all accumulators to zero */
611     acc0 = 0.0f;
612     acc1 = 0.0f;
613     acc2 = 0.0f;
614     acc3 = 0.0f;
615     acc4 = 0.0f;
616     acc5 = 0.0f;
617     acc6 = 0.0f;
618     acc7 = 0.0f;
619 
620     /* Initialize state pointer */
621     px = pState;
622 
623     /* Initialize coefficient pointer */
624     pb = pCoeffs;
625 
626     /* This is separated from the others to avoid
627      * a call to __aeabi_memmove which would be slower
628      */
629     *pStateCurnt++ = *pSrc++;
630     *pStateCurnt++ = *pSrc++;
631     *pStateCurnt++ = *pSrc++;
632     *pStateCurnt++ = *pSrc++;
633 
634     /* Read the first 7 samples from the state buffer:  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
635     x0 = *px++;
636     x1 = *px++;
637     x2 = *px++;
638     x3 = *px++;
639     x4 = *px++;
640     x5 = *px++;
641     x6 = *px++;
642 
643     /* Loop unrolling: process 8 taps at a time. */
644     tapCnt = numTaps >> 3U;
645 
646     while (tapCnt > 0U)
647     {
648       /* Read the b[numTaps-1] coefficient */
649       c0 = *(pb++);
650 
651       /* Read x[n-numTaps-3] sample */
652       x7 = *(px++);
653 
654       /* acc0 +=  b[numTaps-1] * x[n-numTaps] */
655       acc0 += x0 * c0;
656 
657       /* acc1 +=  b[numTaps-1] * x[n-numTaps-1] */
658       acc1 += x1 * c0;
659 
660       /* acc2 +=  b[numTaps-1] * x[n-numTaps-2] */
661       acc2 += x2 * c0;
662 
663       /* acc3 +=  b[numTaps-1] * x[n-numTaps-3] */
664       acc3 += x3 * c0;
665 
666       /* acc4 +=  b[numTaps-1] * x[n-numTaps-4] */
667       acc4 += x4 * c0;
668 
669       /* acc1 +=  b[numTaps-1] * x[n-numTaps-5] */
670       acc5 += x5 * c0;
671 
672       /* acc2 +=  b[numTaps-1] * x[n-numTaps-6] */
673       acc6 += x6 * c0;
674 
675       /* acc3 +=  b[numTaps-1] * x[n-numTaps-7] */
676       acc7 += x7 * c0;
677 
678       /* Read the b[numTaps-2] coefficient */
679       c0 = *(pb++);
680 
681       /* Read x[n-numTaps-4] sample */
682       x0 = *(px++);
683 
684       /* Perform the multiply-accumulate */
685       acc0 += x1 * c0;
686       acc1 += x2 * c0;
687       acc2 += x3 * c0;
688       acc3 += x4 * c0;
689       acc4 += x5 * c0;
690       acc5 += x6 * c0;
691       acc6 += x7 * c0;
692       acc7 += x0 * c0;
693 
694       /* Read the b[numTaps-3] coefficient */
695       c0 = *(pb++);
696 
697       /* Read x[n-numTaps-5] sample */
698       x1 = *(px++);
699 
700       /* Perform the multiply-accumulates */
701       acc0 += x2 * c0;
702       acc1 += x3 * c0;
703       acc2 += x4 * c0;
704       acc3 += x5 * c0;
705       acc4 += x6 * c0;
706       acc5 += x7 * c0;
707       acc6 += x0 * c0;
708       acc7 += x1 * c0;
709 
710       /* Read the b[numTaps-4] coefficient */
711       c0 = *(pb++);
712 
713       /* Read x[n-numTaps-6] sample */
714       x2 = *(px++);
715 
716       /* Perform the multiply-accumulates */
717       acc0 += x3 * c0;
718       acc1 += x4 * c0;
719       acc2 += x5 * c0;
720       acc3 += x6 * c0;
721       acc4 += x7 * c0;
722       acc5 += x0 * c0;
723       acc6 += x1 * c0;
724       acc7 += x2 * c0;
725 
726       /* Read the b[numTaps-4] coefficient */
727       c0 = *(pb++);
728 
729       /* Read x[n-numTaps-6] sample */
730       x3 = *(px++);
731       /* Perform the multiply-accumulates */
732       acc0 += x4 * c0;
733       acc1 += x5 * c0;
734       acc2 += x6 * c0;
735       acc3 += x7 * c0;
736       acc4 += x0 * c0;
737       acc5 += x1 * c0;
738       acc6 += x2 * c0;
739       acc7 += x3 * c0;
740 
741       /* Read the b[numTaps-4] coefficient */
742       c0 = *(pb++);
743 
744       /* Read x[n-numTaps-6] sample */
745       x4 = *(px++);
746 
747       /* Perform the multiply-accumulates */
748       acc0 += x5 * c0;
749       acc1 += x6 * c0;
750       acc2 += x7 * c0;
751       acc3 += x0 * c0;
752       acc4 += x1 * c0;
753       acc5 += x2 * c0;
754       acc6 += x3 * c0;
755       acc7 += x4 * c0;
756 
757       /* Read the b[numTaps-4] coefficient */
758       c0 = *(pb++);
759 
760       /* Read x[n-numTaps-6] sample */
761       x5 = *(px++);
762 
763       /* Perform the multiply-accumulates */
764       acc0 += x6 * c0;
765       acc1 += x7 * c0;
766       acc2 += x0 * c0;
767       acc3 += x1 * c0;
768       acc4 += x2 * c0;
769       acc5 += x3 * c0;
770       acc6 += x4 * c0;
771       acc7 += x5 * c0;
772 
773       /* Read the b[numTaps-4] coefficient */
774       c0 = *(pb++);
775 
776       /* Read x[n-numTaps-6] sample */
777       x6 = *(px++);
778 
779       /* Perform the multiply-accumulates */
780       acc0 += x7 * c0;
781       acc1 += x0 * c0;
782       acc2 += x1 * c0;
783       acc3 += x2 * c0;
784       acc4 += x3 * c0;
785       acc5 += x4 * c0;
786       acc6 += x5 * c0;
787       acc7 += x6 * c0;
788 
789       /* Decrement loop counter */
790       tapCnt--;
791     }
792 
793     /* Loop unrolling: Compute remaining outputs */
794     tapCnt = numTaps % 0x8U;
795 
796     while (tapCnt > 0U)
797     {
798       /* Read coefficients */
799       c0 = *(pb++);
800 
801       /* Fetch 1 state variable */
802       x7 = *(px++);
803 
804       /* Perform the multiply-accumulates */
805       acc0 += x0 * c0;
806       acc1 += x1 * c0;
807       acc2 += x2 * c0;
808       acc3 += x3 * c0;
809       acc4 += x4 * c0;
810       acc5 += x5 * c0;
811       acc6 += x6 * c0;
812       acc7 += x7 * c0;
813 
814       /* Reuse the present sample states for next sample */
815       x0 = x1;
816       x1 = x2;
817       x2 = x3;
818       x3 = x4;
819       x4 = x5;
820       x5 = x6;
821       x6 = x7;
822 
823       /* Decrement loop counter */
824       tapCnt--;
825     }
826 
827     /* Advance the state pointer by 8 to process the next group of 8 samples */
828     pState = pState + 8;
829 
830     /* The results in the 8 accumulators, store in the destination buffer. */
831     *pDst++ = acc0;
832     *pDst++ = acc1;
833     *pDst++ = acc2;
834     *pDst++ = acc3;
835     *pDst++ = acc4;
836     *pDst++ = acc5;
837     *pDst++ = acc6;
838     *pDst++ = acc7;
839 
840 
841     /* Decrement loop counter */
842     blkCnt--;
843   }
844 
845   /* Loop unrolling: Compute remaining output samples */
846   blkCnt = blockSize % 0x8U;
847 
848 #else
849 
850   /* Initialize blkCnt with number of taps */
851   blkCnt = blockSize;
852 
853 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
854 
855   while (blkCnt > 0U)
856   {
857     /* Copy one sample at a time into state buffer */
858     *pStateCurnt++ = *pSrc++;
859 
860     /* Set the accumulator to zero */
861     acc0 = 0.0f;
862 
863     /* Initialize state pointer */
864     px = pState;
865 
866     /* Initialize Coefficient pointer */
867     pb = pCoeffs;
868 
869     i = numTaps;
870 
871     /* Perform the multiply-accumulates */
872     while (i > 0U)
873     {
874       /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
875       acc0 += (_Float16)*px++ * (_Float16)*pb++;
876 
877       i--;
878     }
879 
880     /* Store result in destination buffer. */
881     *pDst++ = acc0;
882 
883     /* Advance state pointer by 1 for the next sample */
884     pState = pState + 1U;
885 
886     /* Decrement loop counter */
887     blkCnt--;
888   }
889 
890   /* Processing is complete.
891      Now copy the last numTaps - 1 samples to the start of the state buffer.
892      This prepares the state buffer for the next function call. */
893 
894   /* Points to the start of the state buffer */
895   pStateCurnt = S->pState;
896 
897 #if defined (ARM_MATH_LOOPUNROLL)
898 
899   /* Loop unrolling: Compute 4 taps at a time */
900   tapCnt = (numTaps - 1U) >> 2U;
901 
902   /* Copy data */
903   while (tapCnt > 0U)
904   {
905     *pStateCurnt++ = *pState++;
906     *pStateCurnt++ = *pState++;
907     *pStateCurnt++ = *pState++;
908     *pStateCurnt++ = *pState++;
909 
910     /* Decrement loop counter */
911     tapCnt--;
912   }
913 
914   /* Calculate remaining number of copies */
915   tapCnt = (numTaps - 1U) % 0x4U;
916 
917 #else
918 
919   /* Initialize tapCnt with number of taps */
920   tapCnt = (numTaps - 1U);
921 
922 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
923 
924   /* Copy remaining data */
925   while (tapCnt > 0U)
926   {
927     *pStateCurnt++ = *pState++;
928 
929     /* Decrement loop counter */
930     tapCnt--;
931   }
932 
933 }
934 
#endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
936 /**
937 * @} end of FIR group
938 */
939 
940 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
941