1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_f32.c
4  * Description:  Floating-point FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup FIR Finite Impulse Response (FIR) Filters
37 
38   This set of functions implements Finite Impulse Response (FIR) filters
39   for Q7, Q15, Q31, and floating-point data types.  Fast versions of Q15 and Q31 are also provided.
40   The functions operate on blocks of input and output data and each call to the function processes
  <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
  <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
43 
44   @par           Algorithm
45                    The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
46                    Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
47   <pre>
48       y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
49   </pre>
50   @par
51                    \image html FIR.GIF "Finite Impulse Response filter"
52   @par
53                    <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
54                    Coefficients are stored in time reversed order.
55   @par
56   <pre>
      {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
58   </pre>
59   @par
60                    <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
61                    Samples in the state buffer are stored in the following order.
62   @par
63   <pre>
      {x[n-numTaps+1], x[n-numTaps+2], x[n-numTaps+3], x[n-numTaps+4]....x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])}
65   </pre>
66   @par
67                    Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
68                    The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
69                    to be avoided and yields a significant speed improvement.
70                    The state variables are updated after each block of data is processed; the coefficients are untouched.
71 
72   @par           Instance Structure
73                    The coefficients and state variables for a filter are stored together in an instance data structure.
74                    A separate instance structure must be defined for each filter.
75                    Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
76                    There are separate instance structure declarations for each of the 4 supported data types.
77 
78   @par           Initialization Functions
79                    There is also an associated initialization function for each data type.
80                    The initialization function performs the following operations:
81                    - Sets the values of the internal structure fields.
82                    - Zeros out the values in the state buffer.
                   To do this manually without calling the init function, assign the following subfields of the instance structure:
84                    numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
85   @par
86                    Use of the initialization function is optional.
87                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
88                    To place an instance structure into a const data section, the instance structure must be manually initialized.
89                    Set the values in the state buffer to zeros before static initialization.
90                    The code below statically initializes each of the 4 different data type filter instance structures
91   <pre>
92       arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
93       arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
94       arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
95       arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};
96   </pre>
97                    where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
98                    <code>pCoeffs</code> is the address of the coefficient buffer.
99   @par          Initialization of Helium version
100                  For Helium version the array of coefficients must be padded with zero to contain
101                  a full number of lanes.
102 
103                  The array length L must be a multiple of x. L = x * a :
104                  - x is 4  for f32
105                  - x is 4  for q31
106                  - x is 4  for f16 (so managed like the f32 version and not like the q15 one)
107                  - x is 8  for q15
108                  - x is 16 for q7
109 
110                  The additional coefficients
111                  (x * a - numTaps) must be set to 0.
112                  numTaps is still set to its right value in the init function. It means that
113                  the implementation may require to read more coefficients due to the vectorization and
114                  to avoid having to manage too many different cases in the code.
115 
116 
117   @par          Helium state buffer
118                  The state buffer must contain some additional temporary data
119                  used during the computation but which is not the state of the FIR.
120                  The first A samples are temporary data.
121                  The remaining samples are the state of the FIR filter.
122   @par
123                  So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
124                  - A is blockSize for f32
125                  - A is 8*ceil(blockSize/8) for f16
126                  - A is 8*ceil(blockSize/4) for q31
127                  - A is 0 for other datatypes (q15 and q7)
128 
129 
130   @par           Fixed-Point Behavior
131                    Care must be taken when using the fixed-point versions of the FIR filter functions.
132                    In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
133                    Refer to the function specific documentation below for usage guidelines.
134 
135  */
136 
137 /**
138   @addtogroup FIR
139   @{
140  */
141 
142 /**
143   @brief         Processing function for floating-point FIR filter.
144   @param[in]     S          points to an instance of the floating-point FIR filter structure
145   @param[in]     pSrc       points to the block of input data
146   @param[out]    pDst       points to the block of output data
147   @param[in]     blockSize  number of samples to process
148   @return        none
149  */
150 
151 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
152 
/* Number of coefficients consumed by one pass of the generic MVE kernel. */
#define FIR_F32_MAX_COEF_BLK        8

/*
 * Compute four consecutive FIR outputs over nbTaps taps.
 *
 *   pSamples : pointer to the oldest sample contributing to the first output
 *   c        : array holding at least nbTaps coefficients
 *   nbTaps   : number of multiply-accumulate steps to perform
 *
 * The result is left in vecAcc0; vecIn0 is used as a temporary.  Both must be
 * float32x4_t variables visible at the expansion site.
 *
 * The body is wrapped in do { } while (0) so that the expansion behaves as a
 * single statement, and the arguments are parenthesized.
 */
#define FIR_F32_CORE(pSamples, c, nbTaps)                                  \
    do {                                                                   \
        vecAcc0 = vdupq_n_f32(0.0f);                                       \
        for (int i = 0; i < (nbTaps); i++) {                               \
            vecIn0 = vld1q(&(pSamples)[i]);                                \
            vecAcc0 = vfmaq(vecAcc0, vecIn0, (c)[i]);                      \
        }                                                                  \
    } while (0)
161 
162 
163 #define NB_TAPS 4
arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)164 __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
165   const float32_t * __restrict pSrc,
166   float32_t * __restrict pDst, uint32_t blockSize)
167 {
168     float32_t *pRefStatePtr = S->pState + blockSize;
169     float32_t      *pState = pRefStatePtr; /* State pointer */
170     const float32_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
171     float32_t      *pStateCur;  /* Points to the current sample of the state */
172     const float32_t *pSamples;  /* Temporary pointer to the sample buffer */
173     float32_t      *pOutput;    /* Temporary pointer to the output buffer */
174     const float32_t *pTempSrc;  /* Temporary pointer to the source data */
175     float32_t      *pTempDest;  /* Temporary pointer to the destination buffer */
176     uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
177     int32_t         blkCnt;
178     float32x4_t         vecIn0;
179     float32x4_t         vecAcc0;
180     float32_t       c[NB_TAPS];
181     const float32_t *pCoeffsCur = pCoeffs;
182 
183     /*
184      * pState points to state array which contains previous frame (numTaps - 1) samples
185      * pStateCur points to the location where the new input data should be written
186      */
187     pStateCur = &(pState[(numTaps - 1u)]);
188     pTempSrc = pSrc;
189 
190     pSamples = pState;
191     pOutput = pDst;
192 
193     for (int i = 0; i < NB_TAPS; i++)
194         c[i] = *pCoeffsCur++;
195 
196     blkCnt = blockSize >> 2;
197     while (blkCnt > 0) {
198         /*
199          * Save 4 input samples in the history buffer
200          */
201         vst1q(pStateCur, vld1q(pTempSrc));
202         pStateCur += 4;
203         pTempSrc += 4;
204 
205         FIR_F32_CORE(pSamples, c, NB_TAPS);
206 
207         vst1q(pOutput, vecAcc0);
208 
209         pOutput += 4;
210         pSamples += 4;
211 
212         blkCnt--;
213     }
214 
215     blkCnt = blockSize & 3;
216     if (blkCnt)
217     {
218         mve_pred16_t    p0 = vctp32q(blkCnt);
219 
220         vst1q(pStateCur, vld1q(pTempSrc));
221         pStateCur += 4;
222         pTempSrc += 4;
223 
224         FIR_F32_CORE(pSamples, c, NB_TAPS);
225 
226         vstrwq_p_f32(pOutput, vecAcc0, p0);
227     }
228 
229     /*
230      * Copy the samples back into the history buffer start
231      */
232     pTempSrc = &pState[blockSize];
233     pTempDest = pState;
234 
235     blkCnt = numTaps - 1;
236     do {
237         mve_pred16_t    p = vctp32q(blkCnt);
238 
239         vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
240         pTempSrc += 4;
241         pTempDest += 4;
242         blkCnt -= 4;
243     }
244     while (blkCnt > 0);
245 }
246 #undef NB_TAPS
247 
arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)248 __STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
249   const float32_t * __restrict pSrc,
250   float32_t * __restrict pDst, uint32_t blockSize)
251 {
252     float32_t *pRefStatePtr = S->pState + blockSize;
253     float32_t *pState = pRefStatePtr;      /* State pointer */
254     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
255     const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
256     const float32_t *pTempSrc;          /* Temporary pointer to the source data */
257     float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
258     uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
259     int32_t  blkCnt;
260     float32_t c0, c1, c2, c3;
261     float32_t c4, c5, c6, c7;
262 
263 
264     pTempSrc = pSrc;
265     pTempDest = &(pState[(numTaps - 1u)]);
266     int cnt = blockSize;
267     do {
268         mve_pred16_t p0 = vctp32q(cnt);
269         vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
270         pTempDest += 4;
271         pTempSrc += 4;
272         cnt -= 4;
273     } while(cnt > 0);
274 
275 
276 
277     pSamples = pState;
278     c0 = *pCoeffs++;
279     c1 = *pCoeffs++;
280     c2 = *pCoeffs++;
281     c3 = *pCoeffs++;
282     c4 = *pCoeffs++;
283     c5 = *pCoeffs++;
284     c6 = *pCoeffs++;
285     c7 = *pCoeffs++;
286 
287     cnt = blockSize >> 2;
288     while(cnt > 0)
289     {
290         float32x4_t vecAcc0;
291         float32x4_t vecIn0;
292 
293         vecIn0 = vld1q(pSamples);
294         vecAcc0 = vmulq(vecIn0, c0);
295         vecIn0 = vld1q(&pSamples[1]);
296         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
297         vecIn0 = vld1q(&pSamples[2]);
298         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
299         vecIn0 = vld1q(&pSamples[3]);
300         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
301         vecIn0 = vld1q(&pSamples[4]);
302         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
303         vecIn0 = vld1q(&pSamples[5]);
304         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
305         vecIn0 = vld1q(&pSamples[6]);
306         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
307         vecIn0 = vld1q(&pSamples[7]);
308         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
309         pSamples += 4;
310         vst1q(pDst, vecAcc0);
311         cnt--;
312         pDst += 4;
313     }
314 
315     cnt = blockSize & 3;
316     if (cnt > 0)
317     {
318         float32x4_t vecAcc0;
319         float32x4_t vecIn0;
320 
321         mve_pred16_t p0 = vctp32q(cnt);
322 
323         vecIn0 = vld1q(pSamples);
324         vecAcc0 = vmulq(vecIn0, c0);
325         vecIn0 = vld1q(&pSamples[1]);
326         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
327         vecIn0 = vld1q(&pSamples[2]);
328         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
329         vecIn0 = vld1q(&pSamples[3]);
330         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
331         vecIn0 = vld1q(&pSamples[4]);
332         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
333         vecIn0 = vld1q(&pSamples[5]);
334         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
335         vecIn0 = vld1q(&pSamples[6]);
336         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
337         vecIn0 = vld1q(&pSamples[7]);
338         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
339         vstrwq_p_f32(pDst, vecAcc0,p0);
340     }
341 
342 
343     /*
344      * Copy the samples back into the history buffer start
345      */
346     pTempSrc = &pState[blockSize];
347     pTempDest = pState;
348     blkCnt = numTaps;
349     while (blkCnt > 0)
350     {
351         *pTempDest++ = *pTempSrc++;
352         blkCnt--;
353     }
354 }
355 
356 
357 
/*
 * Helium (MVE) implementation of arm_fir_f32.
 *
 * Filters with up to 8 taps are dispatched to the specialized kernels.
 * Longer filters are processed in passes of up to FIR_F32_MAX_COEF_BLK
 * coefficients over the whole block: every pass except the last accumulates
 * into the temporary area at the start of S->pState, and the last pass adds
 * that partial sum to its own result and writes pDst.
 *
 * Coefficients are read in groups of 8 (or 4 for the last group when at most
 * 4 taps remain), so S->pCoeffs must be zero-padded as required for the
 * Helium version.
 */
void arm_fir_f32(
const arm_fir_instance_f32 * S,
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
    /*
       S->pState is the partial accumulator area (blockSize values)
       S->pState + blockSize is the FIR state
    */
    float32_t *pRefStatePtr = S->pState + blockSize;
    float32_t *pState = pRefStatePtr ;      /* State pointer */
    const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
    const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
    const float32_t *pTempSrc;          /* Temporary pointer to the source data */
    float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
    uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
    uint32_t  blkCnt;
    float32_t c0, c1, c2, c3;
    float32_t c4, c5, c6, c7;

    /*
     * [1 to 8 taps] specialized routines
     */
    if (numTaps <= 4)
    {
        arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 8)
    {
        arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
        return;
    }

    /*
     * Append the new input samples after the (numTaps - 1) history samples.
     * Load and store share the same predicate so the last partial vector
     * does not read past the end of pSrc.
     */
    pTempSrc = pSrc;
    pTempDest = &(pState[(numTaps - 1u)]);
    int cnt = blockSize;
    do {
        mve_pred16_t p0 = vctp32q(cnt);
        vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p0), p0);
        pTempDest += 4;
        pTempSrc += 4;
        cnt -= 4;
    } while(cnt > 0);

    float32_t *partial_accu_ptr = S->pState;

    /*
     * First pass: coefficients 0..7, result stored (not accumulated) into
     * the partial accumulator area.
     */
    pSamples = pState;
    c0 = *pCoeffs++;
    c1 = *pCoeffs++;
    c2 = *pCoeffs++;
    c3 = *pCoeffs++;
    c4 = *pCoeffs++;
    c5 = *pCoeffs++;
    c6 = *pCoeffs++;
    c7 = *pCoeffs++;

    cnt = blockSize >> 2;
    while(cnt > 0) {
        float32x4_t vecAcc0;
        float32x4_t vecIn0;

        vecIn0 = vld1q(pSamples);
        vecAcc0 = vmulq(vecIn0, c0);
        vecIn0 = vld1q(&pSamples[1]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
        vecIn0 = vld1q(&pSamples[2]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
        vecIn0 = vld1q(&pSamples[3]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
        vecIn0 = vld1q(&pSamples[4]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
        vecIn0 = vld1q(&pSamples[5]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
        vecIn0 = vld1q(&pSamples[6]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
        vecIn0 = vld1q(&pSamples[7]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
        pSamples += 4;
        vst1q(partial_accu_ptr, vecAcc0);
        cnt--;
        partial_accu_ptr += 4;
    }

    cnt = blockSize & 3;
    if (cnt > 0)
    {
        float32x4_t vecAcc0;
        float32x4_t vecIn0;

        mve_pred16_t p0 = vctp32q(cnt);

        vecIn0 = vld1q(pSamples);
        vecAcc0 = vmulq(vecIn0, c0);
        vecIn0 = vld1q(&pSamples[1]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
        vecIn0 = vld1q(&pSamples[2]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
        vecIn0 = vld1q(&pSamples[3]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
        vecIn0 = vld1q(&pSamples[4]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
        vecIn0 = vld1q(&pSamples[5]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
        vecIn0 = vld1q(&pSamples[6]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
        vecIn0 = vld1q(&pSamples[7]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
        vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
    }

    /*
     * Intermediate passes: 8 coefficients each, accumulated into the partial
     * accumulator area, while more than 8 taps remain.
     */
    int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
    int sample_offset = FIR_F32_MAX_COEF_BLK;
    while (localTaps > FIR_F32_MAX_COEF_BLK) {
        c0 = *pCoeffs++;
        c1 = *pCoeffs++;
        c2 = *pCoeffs++;
        c3 = *pCoeffs++;
        c4 = *pCoeffs++;
        c5 = *pCoeffs++;
        c6 = *pCoeffs++;
        c7 = *pCoeffs++;

        partial_accu_ptr = S->pState;
        pSamples = pState + sample_offset;
        cnt = blockSize >> 2;
        while(cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;

            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
            pSamples += 4;
            vecAcc0 += vld1q_f32(partial_accu_ptr);
            vst1q(partial_accu_ptr, vecAcc0);
            cnt--;
            partial_accu_ptr += 4;
        }

        cnt = blockSize & 3;
        if (cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;

            mve_pred16_t p0 = vctp32q(cnt);

            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
            vecAcc0 += vld1q_f32(partial_accu_ptr);
            vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
        }

        localTaps -= FIR_F32_MAX_COEF_BLK;
        sample_offset += FIR_F32_MAX_COEF_BLK;
    }

    /*
     * Last pass: 1 to 8 taps remain.  The result is added to the partial
     * accumulator and written to pDst.
     */
    pSamples = pState + sample_offset;

    if (localTaps > 4) {
        /* 5 to 8 remaining taps: use a group of 8 coefficients */
        c0 = *pCoeffs++;
        c1 = *pCoeffs++;
        c2 = *pCoeffs++;
        c3 = *pCoeffs++;
        c4 = *pCoeffs++;
        c5 = *pCoeffs++;
        c6 = *pCoeffs++;
        c7 = *pCoeffs++;
        pOutput = pDst;

        partial_accu_ptr = S->pState;
        cnt = blockSize  >> 2;
        while(cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;

            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
            pSamples += 4;
            float32x4_t pap = vld1q_f32(partial_accu_ptr);
            vst1q(pOutput, vecAcc0+pap);
            cnt--;
            partial_accu_ptr += 4;
            pOutput += 4;
        }

        cnt = blockSize  & 3;
        if (cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;

            mve_pred16_t p0 = vctp32q(cnt);

            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
            float32x4_t pap = vld1q_f32(partial_accu_ptr);
            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
        }
    }
    else {
        /* 1 to 4 remaining taps: use a group of 4 coefficients */
        c0 = *pCoeffs++;
        c1 = *pCoeffs++;
        c2 = *pCoeffs++;
        c3 = *pCoeffs++;
        pOutput = pDst;

        partial_accu_ptr = S->pState;
        cnt = blockSize >> 2;
        while(cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;

            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            pSamples += 4;
            float32x4_t pap = vld1q_f32(partial_accu_ptr);
            vst1q(pOutput, vecAcc0+pap);
            cnt--;
            partial_accu_ptr += 4;
            pOutput += 4;
        }

        cnt = blockSize & 3;
        if (cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;

            mve_pred16_t p0 = vctp32q(cnt);

            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            float32x4_t pap = vld1q_f32(partial_accu_ptr);
            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
        }
    }

    /*
     * Copy the last (numTaps - 1) samples back to the start of the history
     * buffer.  numTaps is greater than 8 on this path, so numTaps - 1 cannot
     * wrap.  The tail load is predicated so nothing past the newest sample
     * is read.
     */
    pTempSrc = &pRefStatePtr[blockSize];
    pTempDest = pRefStatePtr;

    blkCnt = (numTaps - 1u) >> 2;
    while (blkCnt > 0)
    {
        vst1q(pTempDest, vld1q(pTempSrc));
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt--;
    }
    blkCnt = (numTaps - 1u) & 3;
    if (blkCnt > 0)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p0), p0);
    }
}
686 
687 #else
688 #if defined(ARM_MATH_NEON)
689 
/*
 * NEON implementation of arm_fir_f32.
 *
 * The state buffer S->pState holds the (numTaps - 1) previous samples followed
 * by the samples of the current block.  Outputs are produced 8 at a time with
 * two 4-lane accumulators; remaining outputs (blockSize & 7) are computed one
 * at a time with scalar code.
 */
void arm_fir_f32(
const arm_fir_instance_f32 * S,
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
   float32_t *pState = S->pState;                 /* State pointer */
   const float32_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
   float32_t *pStateCurnt;                        /* Points to the current sample of the state */
   float32_t *px;                                 /* Temporary pointers for state buffer */
   const float32_t *pb;                           /* Temporary pointers for coefficient buffer */
   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */

   float32x4_t accv0,accv1,samples0,samples1,x0,x1,x2,xa,xb,b;
   float32_t acc;

   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
   /* pStateCurnt points to the location where the new input data should be written */
   pStateCurnt = &(S->pState[(numTaps - 1U)]);

   /* Loop unrolling: 8 outputs per iteration */
   blkCnt = blockSize >> 3;

   while (blkCnt > 0U)
   {
      /* Copy 8 samples at a time into state buffers */
      samples0 = vld1q_f32(pSrc);
      vst1q_f32(pStateCurnt,samples0);

      pStateCurnt += 4;
      pSrc += 4 ;

      samples1 = vld1q_f32(pSrc);
      vst1q_f32(pStateCurnt,samples1);

      pStateCurnt += 4;
      pSrc += 4 ;

      /* Set the accumulators to zero */
      accv0 = vdupq_n_f32(0);
      accv1 = vdupq_n_f32(0);

      /* Initialize state pointer */
      px = pState;

      /* Initialize coefficient pointer */
      pb = pCoeffs;

      /* Loop unrolling: 4 taps per iteration */
      i = numTaps >> 2;

      /* Perform the multiply-accumulates */
      x0 = vld1q_f32(px);
      x1 = vld1q_f32(px + 4);

      while(i > 0)
      {
         /* Four taps: each accumulator lane receives pb[k] * px[lane + k], k = 0..3.
            x0/x1/x2 form a sliding window of 12 samples; vextq_f32 extracts the
            window shifted by k samples. */
         /* NOTE(review): when numTaps is a multiple of 4, the last lane of this
            load in the final iteration appears to lie one sample past the newest
            stored sample; it is not used by the computation. */
         x2 = vld1q_f32(px + 8);
         b = vld1q_f32(pb);
         xa = x0;
         xb = x1;
         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0));
         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0));

         xa = vextq_f32(x0,x1,1);
         xb = vextq_f32(x1,x2,1);

         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1));
         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1));

         xa = vextq_f32(x0,x1,2);
         xb = vextq_f32(x1,x2,2);

         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2));
         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2));

         xa = vextq_f32(x0,x1,3);
         xb = vextq_f32(x1,x2,3);

         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3));
         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3));

         pb += 4;
         x0 = x1;
         x1 = x2;
         px += 4;
         i--;

      }

      /* Tail: remaining numTaps & 3 taps.
         x2 is only needed when 2 or 3 taps remain, so it is loaded inside
         those cases; loading it unconditionally would read samples beyond
         the ones stored above without using them. */
      i = numTaps & 3;

      /* Perform the multiply-accumulates */
      switch(i)
      {
         case 3:
         {
           x2 = vld1q_f32(px + 8);

           accv0 = vmlaq_n_f32(accv0,x0,*pb);
           accv1 = vmlaq_n_f32(accv1,x1,*pb);

           pb++;

           xa = vextq_f32(x0,x1,1);
           xb = vextq_f32(x1,x2,1);

           accv0 = vmlaq_n_f32(accv0,xa,*pb);
           accv1 = vmlaq_n_f32(accv1,xb,*pb);

           pb++;

           xa = vextq_f32(x0,x1,2);
           xb = vextq_f32(x1,x2,2);

           accv0 = vmlaq_n_f32(accv0,xa,*pb);
           accv1 = vmlaq_n_f32(accv1,xb,*pb);

         }
         break;
         case 2:
         {
           x2 = vld1q_f32(px + 8);

           accv0 = vmlaq_n_f32(accv0,x0,*pb);
           accv1 = vmlaq_n_f32(accv1,x1,*pb);

           pb++;

           xa = vextq_f32(x0,x1,1);
           xb = vextq_f32(x1,x2,1);

           accv0 = vmlaq_n_f32(accv0,xa,*pb);
           accv1 = vmlaq_n_f32(accv1,xb,*pb);

         }
         break;
         case 1:
         {

           accv0 = vmlaq_n_f32(accv0,x0,*pb);
           accv1 = vmlaq_n_f32(accv1,x1,*pb);

         }
         break;
         default:
         break;
      }

      /* The result is stored in the destination buffer. */
      vst1q_f32(pDst,accv0);
      pDst += 4;
      vst1q_f32(pDst,accv1);
      pDst += 4;

      /* Advance state pointer by 8 for the next 8 samples */
      pState = pState + 8;

      blkCnt--;
   }

   /* Tail: remaining blockSize & 7 outputs, one at a time */
   blkCnt = blockSize & 0x7;

   while (blkCnt > 0U)
   {
      /* Copy one sample at a time into state buffer */
      *pStateCurnt++ = *pSrc++;

      /* Set the accumulator to zero */
      acc = 0.0f;

      /* Initialize state pointer */
      px = pState;

      /* Initialize Coefficient pointer */
      pb = pCoeffs;

      i = numTaps;

      /* Perform the multiply-accumulates */
      do
      {
         /* acc = pb[0] * px[0] + pb[1] * px[1] + ... + pb[numTaps-1] * px[numTaps-1] */
         acc += *px++ * *pb++;
         i--;

      } while (i > 0U);

      /* The result is stored in the destination buffer. */
      *pDst++ = acc;

      /* Advance state pointer by 1 for the next sample */
      pState = pState + 1;

      blkCnt--;
   }

   /* Processing is complete.
   ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
   ** This prepares the state buffer for the next function call. */

   /* Points to the start of the state buffer */
   pStateCurnt = S->pState;

   /* Copy (numTaps - 1) values */
   tapCnt = numTaps - 1U;

   /* Copy data */
   while (tapCnt > 0U)
   {
      *pStateCurnt++ = *pState++;

      /* Decrement the loop counter */
      tapCnt--;
   }

}
908 #else
/*
 * Scalar (non-MVE, non-NEON) implementation of the floating-point FIR filter.
 *
 *   S          points to the filter instance; S->numTaps, S->pCoeffs and
 *              S->pState are read
 *   pSrc       points to blockSize input samples
 *   pDst       points to blockSize output samples
 *   blockSize  number of samples to process in this call
 *
 * The state buffer holds the (numTaps - 1) samples kept from the previous
 * call, followed by the samples of the current block. This function therefore
 * writes S->pState[numTaps - 1] ... S->pState[numTaps + blockSize - 2].
 *
 * Each output sample is computed as
 *
 *     pDst[n] = sum for k in [0, numTaps) of pCoeffs[k] * state[n + k]
 *
 * Before returning, the (numTaps - 1) most recent samples are moved to the
 * start of the state buffer so that the next call can continue the stream.
 *
 * NOTE(review): (numTaps - 1U) wraps around when numTaps == 0; the code
 * assumes numTaps >= 1 -- confirm that callers guarantee this.
 */
void arm_fir_f32(
  const arm_fir_instance_f32 * S,
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
{
        float32_t *pState = S->pState;                 /* Start of the sample window for the next output */
  const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
        float32_t *pStateCurnt;                        /* Where the next input sample is written in the state buffer */
        float32_t *px;                                 /* Running pointer into the state buffer */
  const float32_t *pb;                                 /* Running pointer into the coefficient buffer */
        float32_t acc0;                                /* Accumulator */
        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients */
        uint32_t i, tapCnt, blkCnt;                    /* Loop counters */

#if defined (ARM_MATH_LOOPUNROLL)
        float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators for outputs 1..7 of a group of 8 */
        float32_t x0, x1, x2, x3, x4, x5, x6, x7;               /* Rotating window of state samples */
        float32_t c0;                                           /* Current coefficient */
#endif

  /* S->pState starts with the (numTaps - 1) samples of the previous call;
   * new input samples are appended right after them. */
  pStateCurnt = &(S->pState[(numTaps - 1U)]);

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: compute 8 output values per iteration.
   * With pState pointing at the oldest sample needed by the first output,
   * accumulator j (j = 0..7) ends up holding
   *
   *    accj = pCoeffs[0] * pState[j] + pCoeffs[1] * pState[j + 1] + ...
   *           + pCoeffs[numTaps - 1] * pState[j + numTaps - 1]
   */

  blkCnt = blockSize >> 3U;

  while (blkCnt > 0U)
  {
    /* Copy the first 4 of the 8 new input samples into the state buffer. */
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;

    /* Set all accumulators to zero */
    acc0 = 0.0f;
    acc1 = 0.0f;
    acc2 = 0.0f;
    acc3 = 0.0f;
    acc4 = 0.0f;
    acc5 = 0.0f;
    acc6 = 0.0f;
    acc7 = 0.0f;

    /* Initialize state pointer */
    px = pState;

    /* Initialize coefficient pointer */
    pb = pCoeffs;

    /* Copy the remaining 4 of the 8 new input samples.
     * This is separated from the first 4 copies to avoid
     * a call to __aeabi_memmove which would be slower
     */
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;

    /* Preload the first 7 samples of the window; the 8th is fetched inside the tap loop. */
    x0 = *px++;
    x1 = *px++;
    x2 = *px++;
    x3 = *px++;
    x4 = *px++;
    x5 = *px++;
    x6 = *px++;

    /* Loop unrolling: process 8 taps at a time. */
    tapCnt = numTaps >> 3U;

    while (tapCnt > 0U)
    {
      /* Each of the 8 steps below reads one coefficient, loads one new state
       * sample into the register that is no longer needed, and updates all
       * 8 accumulators. The register roles rotate by one at every step, so
       * no register-to-register moves are required. */

      /* Tap 1 of 8: read coefficient, load next sample into x7 */
      c0 = *(pb++);
      x7 = *(px++);

      acc0 += x0 * c0;
      acc1 += x1 * c0;
      acc2 += x2 * c0;
      acc3 += x3 * c0;
      acc4 += x4 * c0;
      acc5 += x5 * c0;
      acc6 += x6 * c0;
      acc7 += x7 * c0;

      /* Tap 2 of 8: read coefficient, load next sample into x0 */
      c0 = *(pb++);
      x0 = *(px++);

      acc0 += x1 * c0;
      acc1 += x2 * c0;
      acc2 += x3 * c0;
      acc3 += x4 * c0;
      acc4 += x5 * c0;
      acc5 += x6 * c0;
      acc6 += x7 * c0;
      acc7 += x0 * c0;

      /* Tap 3 of 8: read coefficient, load next sample into x1 */
      c0 = *(pb++);
      x1 = *(px++);

      acc0 += x2 * c0;
      acc1 += x3 * c0;
      acc2 += x4 * c0;
      acc3 += x5 * c0;
      acc4 += x6 * c0;
      acc5 += x7 * c0;
      acc6 += x0 * c0;
      acc7 += x1 * c0;

      /* Tap 4 of 8: read coefficient, load next sample into x2 */
      c0 = *(pb++);
      x2 = *(px++);

      acc0 += x3 * c0;
      acc1 += x4 * c0;
      acc2 += x5 * c0;
      acc3 += x6 * c0;
      acc4 += x7 * c0;
      acc5 += x0 * c0;
      acc6 += x1 * c0;
      acc7 += x2 * c0;

      /* Tap 5 of 8: read coefficient, load next sample into x3 */
      c0 = *(pb++);
      x3 = *(px++);

      acc0 += x4 * c0;
      acc1 += x5 * c0;
      acc2 += x6 * c0;
      acc3 += x7 * c0;
      acc4 += x0 * c0;
      acc5 += x1 * c0;
      acc6 += x2 * c0;
      acc7 += x3 * c0;

      /* Tap 6 of 8: read coefficient, load next sample into x4 */
      c0 = *(pb++);
      x4 = *(px++);

      acc0 += x5 * c0;
      acc1 += x6 * c0;
      acc2 += x7 * c0;
      acc3 += x0 * c0;
      acc4 += x1 * c0;
      acc5 += x2 * c0;
      acc6 += x3 * c0;
      acc7 += x4 * c0;

      /* Tap 7 of 8: read coefficient, load next sample into x5 */
      c0 = *(pb++);
      x5 = *(px++);

      acc0 += x6 * c0;
      acc1 += x7 * c0;
      acc2 += x0 * c0;
      acc3 += x1 * c0;
      acc4 += x2 * c0;
      acc5 += x3 * c0;
      acc6 += x4 * c0;
      acc7 += x5 * c0;

      /* Tap 8 of 8: read coefficient, load next sample into x6.
       * Afterwards x0..x6 again hold the 7 samples the next iteration starts from. */
      c0 = *(pb++);
      x6 = *(px++);

      acc0 += x7 * c0;
      acc1 += x0 * c0;
      acc2 += x1 * c0;
      acc3 += x2 * c0;
      acc4 += x3 * c0;
      acc5 += x4 * c0;
      acc6 += x5 * c0;
      acc7 += x6 * c0;

      /* Decrement loop counter */
      tapCnt--;
    }

    /* Process the remaining (numTaps % 8) taps one at a time */
    tapCnt = numTaps % 0x8U;

    while (tapCnt > 0U)
    {
      /* Read one coefficient */
      c0 = *(pb++);

      /* Fetch one state sample */
      x7 = *(px++);

      /* Perform the multiply-accumulates */
      acc0 += x0 * c0;
      acc1 += x1 * c0;
      acc2 += x2 * c0;
      acc3 += x3 * c0;
      acc4 += x4 * c0;
      acc5 += x5 * c0;
      acc6 += x6 * c0;
      acc7 += x7 * c0;

      /* Slide the sample window by one position for the next tap */
      x0 = x1;
      x1 = x2;
      x2 = x3;
      x3 = x4;
      x4 = x5;
      x5 = x6;
      x6 = x7;

      /* Decrement loop counter */
      tapCnt--;
    }

    /* Advance the state pointer by 8 to process the next group of 8 samples */
    pState = pState + 8;

    /* Store the 8 accumulated results in the destination buffer. */
    *pDst++ = acc0;
    *pDst++ = acc1;
    *pDst++ = acc2;
    *pDst++ = acc3;
    *pDst++ = acc4;
    *pDst++ = acc5;
    *pDst++ = acc6;
    *pDst++ = acc7;


    /* Decrement loop counter */
    blkCnt--;
  }

  /* Remaining (blockSize % 8) output samples are computed one at a time below */
  blkCnt = blockSize % 0x8U;

#else

  /* No unrolling: every output sample is computed by the loop below */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* Copy one sample at a time into state buffer */
    *pStateCurnt++ = *pSrc++;

    /* Set the accumulator to zero */
    acc0 = 0.0f;

    /* Initialize state pointer */
    px = pState;

    /* Initialize Coefficient pointer */
    pb = pCoeffs;

    i = numTaps;

    /* Perform the multiply-accumulates */
    while (i > 0U)
    {
      /* acc0 = pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1] */
      acc0 += *px++ * *pb++;

      i--;
    }

    /* Store result in destination buffer. */
    *pDst++ = acc0;

    /* Advance state pointer by 1 for the next sample */
    pState = pState + 1U;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Processing is complete.
     Now copy the last numTaps - 1 samples to the start of the state buffer.
     This prepares the state buffer for the next function call. */

  /* Points to the start of the state buffer */
  pStateCurnt = S->pState;

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: copy 4 state samples at a time */
  tapCnt = (numTaps - 1U) >> 2U;

  /* Copy data */
  while (tapCnt > 0U)
  {
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;

    /* Decrement loop counter */
    tapCnt--;
  }

  /* Calculate remaining number of copies */
  tapCnt = (numTaps - 1U) % 0x4U;

#else

  /* Initialize tapCnt with the number of state samples to keep (numTaps - 1) */
  tapCnt = (numTaps - 1U);

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* Copy remaining data */
  while (tapCnt > 0U)
  {
    *pStateCurnt++ = *pState++;

    /* Decrement loop counter */
    tapCnt--;
  }

}
1279 
1280 #endif /* #if defined(ARM_MATH_NEON) */
1281 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1282 
1283 /**
1284 * @} end of FIR group
1285 */
1286