1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_f32.c
4  * Description:  Floating-point FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup FIR Finite Impulse Response (FIR) Filters
37 
38   This set of functions implements Finite Impulse Response (FIR) filters
39   for Q7, Q15, Q31, and floating-point data types.  Fast versions of Q15 and Q31 are also provided.
  The functions operate on blocks of input and output data and each call to the function processes
  <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
  <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
43 
44   @par           Algorithm
45                    The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
46                    Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
47   <pre>
48       y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
49   </pre>
50   @par
51                    \image html FIR.GIF "Finite Impulse Response filter"
52   @par
53                    <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
54                    Coefficients are stored in time reversed order.
55   @par
56   <pre>
      {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
58   </pre>
59   @par
60                    <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
61                    Samples in the state buffer are stored in the following order.
62   @par
63   <pre>
      {x[n-numTaps+1], x[n-numTaps+2], x[n-numTaps+3], ..., x[n-1], x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])}
65   </pre>
66 
67   @par
68                    Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
69                    The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
70                    to be avoided and yields a significant speed improvement.
71                    The state variables are updated after each block of data is processed; the coefficients are untouched.
72 
73   @par           Instance Structure
74                    The coefficients and state variables for a filter are stored together in an instance data structure.
75                    A separate instance structure must be defined for each filter.
76                    Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
77                    There are separate instance structure declarations for each of the 4 supported data types.
78 
79   @par           Initialization Functions
80                    There is also an associated initialization function for each data type.
81                    The initialization function performs the following operations:
82                    - Sets the values of the internal structure fields.
83                    - Zeros out the values in the state buffer.
                   To do this manually without calling the init function, assign the following subfields of the instance structure:
85                    numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
86 
87   @par
88                    Use of the initialization function is optional.
89                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
90                    To place an instance structure into a const data section, the instance structure must be manually initialized.
91                    Set the values in the state buffer to zeros before static initialization.
92                    The code below statically initializes each of the 4 different data type filter instance structures
93   <pre>
94       arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
95       arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
96       arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
97       arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};
98   </pre>
99                    where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
100                    <code>pCoeffs</code> is the address of the coefficient buffer.
101 
102   @par          Initialization of Helium version
103                  For Helium version the array of coefficients must be padded with zero to contain
104                  a full number of lanes.
105 
106                  The array length L must be a multiple of x. L = x * a :
107                  - x is 4  for f32
108                  - x is 4  for q31
109                  - x is 4  for f16 (so managed like the f32 version and not like the q15 one)
110                  - x is 8  for q15
111                  - x is 16 for q7
112 
113                  The additional coefficients
114                  (x * a - numTaps) must be set to 0.
115                  numTaps is still set to its right value in the init function. It means that
116                  the implementation may require to read more coefficients due to the vectorization and
117                  to avoid having to manage too many different cases in the code.
118 
119   @par          Helium state buffer
120                  The state buffer must contain some additional temporary data
121                  used during the computation but which is not the state of the FIR.
122                  The first A samples are temporary data.
123                  The remaining samples are the state of the FIR filter.
124 
125   @par
126                  So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
127                  - A is blockSize for f32
128                  - A is 8*ceil(blockSize/8) for f16
129                  - A is 8*ceil(blockSize/4) for q31
130                  - A is 0 for other datatypes (q15 and q7)
131 
132 
133   @par           Fixed-Point Behavior
134                    Care must be taken when using the fixed-point versions of the FIR filter functions.
135                    In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
136                    Refer to the function specific documentation below for usage guidelines.
137 
138  */
139 
140 /**
141   @addtogroup FIR
142   @{
143  */
144 
145 /**
146   @brief         Processing function for floating-point FIR filter.
147   @param[in]     S          points to an instance of the floating-point FIR filter structure
148   @param[in]     pSrc       points to the block of input data
149   @param[out]    pDst       points to the block of output data
150   @param[in]     blockSize  number of samples to process
151  */
152 
153 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
154 
/* Number of coefficients handled per pass by the Helium kernels below. */
#define FIR_F32_MAX_COEF_BLK        8

/*
 * Multiply-accumulate core producing four consecutive outputs:
 *
 *     vecAcc0 = sum over t in [0, nbTaps) of c[t] * pSamples[t .. t+3]
 *
 * The caller must declare two float32x4_t variables named vecAcc0 and
 * vecIn0; vecAcc0 holds the result and vecIn0 is used as scratch.
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe under an unbraced if/else), and every argument is
 * parenthesized.
 */
#define FIR_F32_CORE(pSamples, c, nbTaps)                                  \
    do {                                                                   \
        vecAcc0 = vdupq_n_f32(0.0f);                                       \
        for (int firTap_ = 0; firTap_ < (nbTaps); firTap_++) {             \
            vecIn0 = vld1q(&(pSamples)[firTap_]);                          \
            vecAcc0 = vfmaq(vecAcc0, vecIn0, (c)[firTap_]);                \
        }                                                                  \
    } while (0)
163 
164 
165 #define NB_TAPS 4
arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)166 __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
167   const float32_t * __restrict pSrc,
168   float32_t * __restrict pDst, uint32_t blockSize)
169 {
170     float32_t *pRefStatePtr = S->pState + blockSize;
171     float32_t      *pState = pRefStatePtr; /* State pointer */
172     const float32_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
173     float32_t      *pStateCur;  /* Points to the current sample of the state */
174     const float32_t *pSamples;  /* Temporary pointer to the sample buffer */
175     float32_t      *pOutput;    /* Temporary pointer to the output buffer */
176     const float32_t *pTempSrc;  /* Temporary pointer to the source data */
177     float32_t      *pTempDest;  /* Temporary pointer to the destination buffer */
178     uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
179     int32_t         blkCnt;
180     float32x4_t         vecIn0;
181     float32x4_t         vecAcc0;
182     float32_t       c[NB_TAPS];
183     const float32_t *pCoeffsCur = pCoeffs;
184 
185     /*
186      * pState points to state array which contains previous frame (numTaps - 1) samples
187      * pStateCur points to the location where the new input data should be written
188      */
189     pStateCur = &(pState[(numTaps - 1u)]);
190     pTempSrc = pSrc;
191 
192     pSamples = pState;
193     pOutput = pDst;
194 
195     for (int i = 0; i < NB_TAPS; i++)
196         c[i] = *pCoeffsCur++;
197 
198     blkCnt = blockSize >> 2;
199     while (blkCnt > 0) {
200         /*
201          * Save 4 input samples in the history buffer
202          */
203         vst1q(pStateCur, vld1q(pTempSrc));
204         pStateCur += 4;
205         pTempSrc += 4;
206 
207         FIR_F32_CORE(pSamples, c, NB_TAPS);
208 
209         vst1q(pOutput, vecAcc0);
210 
211         pOutput += 4;
212         pSamples += 4;
213 
214         blkCnt--;
215     }
216 
217     blkCnt = blockSize & 3;
218     if (blkCnt)
219     {
220         mve_pred16_t    p0 = vctp32q(blkCnt);
221 
222         vst1q(pStateCur, vld1q(pTempSrc));
223         pStateCur += 4;
224         pTempSrc += 4;
225 
226         FIR_F32_CORE(pSamples, c, NB_TAPS);
227 
228         vstrwq_p_f32(pOutput, vecAcc0, p0);
229     }
230 
231     /*
232      * Copy the samples back into the history buffer start
233      */
234     pTempSrc = &pState[blockSize];
235     pTempDest = pState;
236 
237     blkCnt = numTaps - 1;
238     do {
239         mve_pred16_t    p = vctp32q(blkCnt);
240 
241         vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
242         pTempSrc += 4;
243         pTempDest += 4;
244         blkCnt -= 4;
245     }
246     while (blkCnt > 0);
247 }
248 #undef NB_TAPS
249 
arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)250 __STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
251   const float32_t * __restrict pSrc,
252   float32_t * __restrict pDst, uint32_t blockSize)
253 {
254     float32_t *pRefStatePtr = S->pState + blockSize;
255     float32_t *pState = pRefStatePtr;      /* State pointer */
256     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
257     const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
258     const float32_t *pTempSrc;          /* Temporary pointer to the source data */
259     float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
260     uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
261     int32_t  blkCnt;
262     float32_t c0, c1, c2, c3;
263     float32_t c4, c5, c6, c7;
264 
265 
266     pTempSrc = pSrc;
267     pTempDest = &(pState[(numTaps - 1u)]);
268     int cnt = blockSize;
269     do {
270         mve_pred16_t p0 = vctp32q(cnt);
271         vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
272         pTempDest += 4;
273         pTempSrc += 4;
274         cnt -= 4;
275     } while(cnt > 0);
276 
277 
278 
279     pSamples = pState;
280     c0 = *pCoeffs++;
281     c1 = *pCoeffs++;
282     c2 = *pCoeffs++;
283     c3 = *pCoeffs++;
284     c4 = *pCoeffs++;
285     c5 = *pCoeffs++;
286     c6 = *pCoeffs++;
287     c7 = *pCoeffs++;
288 
289     cnt = blockSize >> 2;
290     while(cnt > 0)
291     {
292         float32x4_t vecAcc0;
293         float32x4_t vecIn0;
294 
295         vecIn0 = vld1q(pSamples);
296         vecAcc0 = vmulq(vecIn0, c0);
297         vecIn0 = vld1q(&pSamples[1]);
298         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
299         vecIn0 = vld1q(&pSamples[2]);
300         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
301         vecIn0 = vld1q(&pSamples[3]);
302         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
303         vecIn0 = vld1q(&pSamples[4]);
304         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
305         vecIn0 = vld1q(&pSamples[5]);
306         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
307         vecIn0 = vld1q(&pSamples[6]);
308         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
309         vecIn0 = vld1q(&pSamples[7]);
310         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
311         pSamples += 4;
312         vst1q(pDst, vecAcc0);
313         cnt--;
314         pDst += 4;
315     }
316 
317     cnt = blockSize & 3;
318     if (cnt > 0)
319     {
320         float32x4_t vecAcc0;
321         float32x4_t vecIn0;
322 
323         mve_pred16_t p0 = vctp32q(cnt);
324 
325         vecIn0 = vld1q(pSamples);
326         vecAcc0 = vmulq(vecIn0, c0);
327         vecIn0 = vld1q(&pSamples[1]);
328         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
329         vecIn0 = vld1q(&pSamples[2]);
330         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
331         vecIn0 = vld1q(&pSamples[3]);
332         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
333         vecIn0 = vld1q(&pSamples[4]);
334         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
335         vecIn0 = vld1q(&pSamples[5]);
336         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
337         vecIn0 = vld1q(&pSamples[6]);
338         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
339         vecIn0 = vld1q(&pSamples[7]);
340         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
341         vstrwq_p_f32(pDst, vecAcc0,p0);
342     }
343 
344 
345     /*
346      * Copy the samples back into the history buffer start
347      */
348     pTempSrc = &pState[blockSize];
349     pTempDest = pState;
350     blkCnt = numTaps;
351     while (blkCnt > 0)
352     {
353         *pTempDest++ = *pTempSrc++;
354         blkCnt--;
355     }
356 }
357 
358 
359 
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)360 ARM_DSP_ATTRIBUTE void arm_fir_f32(
361 const arm_fir_instance_f32 * S,
362 const float32_t * pSrc,
363 float32_t * pDst,
364 uint32_t blockSize)
365 {
366     /*
367        S->pState is the arm_fir_partial_accu
368        S->pState + blockSize is the FIR state
369     */
370     float32_t *pRefStatePtr = S->pState + blockSize;
371     float32_t *pState = pRefStatePtr ;      /* State pointer */
372     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
373     const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
374     float32_t *pOutput;                 /* Temporary pointer to the output buffer */
375     const float32_t *pTempSrc;          /* Temporary pointer to the source data */
376     float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
377     uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
378     uint32_t  blkCnt;
379     float32_t c0, c1, c2, c3;
380     float32_t c4, c5, c6, c7;
381 
382     /*
383      * [1 to 8 taps] specialized routines
384      */
385     if (numTaps <= 4)
386     {
387         arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
388         return;
389     }
390     else if (numTaps <= 8)
391     {
392         arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
393         return;
394     }
395 
396     pTempSrc = pSrc;
397     pTempDest = &(pState[(numTaps - 1u)]);
398     int cnt = blockSize;
399     do {
400         mve_pred16_t p0 = vctp32q(cnt);
401         vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
402         pTempDest += 4;
403         pTempSrc += 4;
404         cnt -= 4;
405     } while(cnt > 0);
406 
407     float32_t *partial_accu_ptr = S->pState;
408 
409     pSamples = pState;
410     c0 = *pCoeffs++;
411     c1 = *pCoeffs++;
412     c2 = *pCoeffs++;
413     c3 = *pCoeffs++;
414     c4 = *pCoeffs++;
415     c5 = *pCoeffs++;
416     c6 = *pCoeffs++;
417     c7 = *pCoeffs++;
418 
419     cnt = blockSize >> 2;
420     while(cnt > 0) {
421         float32x4_t vecAcc0;
422         float32x4_t vecIn0;
423 
424         vecIn0 = vld1q(pSamples);
425         vecAcc0 = vmulq(vecIn0, c0);
426         vecIn0 = vld1q(&pSamples[1]);
427         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
428         vecIn0 = vld1q(&pSamples[2]);
429         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
430         vecIn0 = vld1q(&pSamples[3]);
431         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
432         vecIn0 = vld1q(&pSamples[4]);
433         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
434         vecIn0 = vld1q(&pSamples[5]);
435         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
436         vecIn0 = vld1q(&pSamples[6]);
437         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
438         vecIn0 = vld1q(&pSamples[7]);
439         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
440         pSamples += 4;
441         vst1q(partial_accu_ptr, vecAcc0);
442         cnt--;
443         partial_accu_ptr += 4;
444     }
445 
446     cnt = blockSize & 3;
447     if (cnt > 0)
448     {
449         float32x4_t vecAcc0;
450         float32x4_t vecIn0;
451 
452         mve_pred16_t p0 = vctp32q(cnt);
453 
454         vecIn0 = vld1q(pSamples);
455         vecAcc0 = vmulq(vecIn0, c0);
456         vecIn0 = vld1q(&pSamples[1]);
457         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
458         vecIn0 = vld1q(&pSamples[2]);
459         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
460         vecIn0 = vld1q(&pSamples[3]);
461         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
462         vecIn0 = vld1q(&pSamples[4]);
463         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
464         vecIn0 = vld1q(&pSamples[5]);
465         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
466         vecIn0 = vld1q(&pSamples[6]);
467         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
468         vecIn0 = vld1q(&pSamples[7]);
469         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
470         vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
471     }
472 
473     int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
474     int sample_offset = FIR_F32_MAX_COEF_BLK;
475     while (localTaps > FIR_F32_MAX_COEF_BLK) {
476         c0 = *pCoeffs++;
477         c1 = *pCoeffs++;
478         c2 = *pCoeffs++;
479         c3 = *pCoeffs++;
480         c4 = *pCoeffs++;
481         c5 = *pCoeffs++;
482         c6 = *pCoeffs++;
483         c7 = *pCoeffs++;
484 
485         partial_accu_ptr = S->pState;
486         pSamples = pState + sample_offset;
487         int cnt = blockSize >> 2;
488         while(cnt > 0) {
489             float32x4_t vecAcc0;
490             float32x4_t vecIn0;
491 
492             vecIn0 = vld1q(pSamples);
493             vecAcc0 = vmulq(vecIn0, c0);
494             vecIn0 = vld1q(&pSamples[1]);
495             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
496             vecIn0 = vld1q(&pSamples[2]);
497             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
498             vecIn0 = vld1q(&pSamples[3]);
499             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
500             vecIn0 = vld1q(&pSamples[4]);
501             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
502             vecIn0 = vld1q(&pSamples[5]);
503             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
504             vecIn0 = vld1q(&pSamples[6]);
505             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
506             vecIn0 = vld1q(&pSamples[7]);
507             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
508             pSamples += 4;
509             vecAcc0 += vld1q_f32(partial_accu_ptr);
510             vst1q(partial_accu_ptr, vecAcc0);
511             cnt--;
512             partial_accu_ptr += 4;
513         }
514 
515         cnt = blockSize & 3;
516         if (cnt > 0) {
517             float32x4_t vecAcc0;
518             float32x4_t vecIn0;
519 
520             mve_pred16_t p0 = vctp32q(cnt);
521 
522             vecIn0 = vld1q(pSamples);
523             vecAcc0 = vmulq(vecIn0, c0);
524             vecIn0 = vld1q(&pSamples[1]);
525             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
526             vecIn0 = vld1q(&pSamples[2]);
527             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
528             vecIn0 = vld1q(&pSamples[3]);
529             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
530             vecIn0 = vld1q(&pSamples[4]);
531             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
532             vecIn0 = vld1q(&pSamples[5]);
533             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
534             vecIn0 = vld1q(&pSamples[6]);
535             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
536             vecIn0 = vld1q(&pSamples[7]);
537             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
538             vecAcc0 += vld1q_f32(partial_accu_ptr);
539             vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
540         }
541 
542         localTaps -= FIR_F32_MAX_COEF_BLK;
543         sample_offset += FIR_F32_MAX_COEF_BLK;
544     }
545 
546     pSamples = pState + sample_offset;
547 
548     if (localTaps > 4) {
549         c0 = *pCoeffs++;
550         c1 = *pCoeffs++;
551         c2 = *pCoeffs++;
552         c3 = *pCoeffs++;
553         c4 = *pCoeffs++;
554         c5 = *pCoeffs++;
555         c6 = *pCoeffs++;
556         c7 = *pCoeffs++;
557         pOutput = pDst;
558 
559         partial_accu_ptr = S->pState;
560         cnt = blockSize  >> 2;
561         while(cnt > 0) {
562             float32x4_t vecAcc0;
563             float32x4_t vecIn0;
564 
565             vecIn0 = vld1q(pSamples);
566             vecAcc0 = vmulq(vecIn0, c0);
567             vecIn0 = vld1q(&pSamples[1]);
568             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
569             vecIn0 = vld1q(&pSamples[2]);
570             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
571             vecIn0 = vld1q(&pSamples[3]);
572             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
573             vecIn0 = vld1q(&pSamples[4]);
574             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
575             vecIn0 = vld1q(&pSamples[5]);
576             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
577             vecIn0 = vld1q(&pSamples[6]);
578             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
579             vecIn0 = vld1q(&pSamples[7]);
580             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
581             pSamples += 4;
582             float32x4_t pap = vld1q_f32(partial_accu_ptr);
583             vst1q(pOutput, vecAcc0+pap);
584             cnt--;
585             partial_accu_ptr += 4;
586             pOutput += 4;
587         }
588 
589         cnt = blockSize  & 3;
590         if (cnt > 0) {
591             float32x4_t vecAcc0;
592             float32x4_t vecIn0;
593 
594             mve_pred16_t p0 = vctp32q(cnt);
595 
596             vecIn0 = vld1q(pSamples);
597             vecAcc0 = vmulq(vecIn0, c0);
598             vecIn0 = vld1q(&pSamples[1]);
599             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
600             vecIn0 = vld1q(&pSamples[2]);
601             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
602             vecIn0 = vld1q(&pSamples[3]);
603             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
604             vecIn0 = vld1q(&pSamples[4]);
605             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
606             vecIn0 = vld1q(&pSamples[5]);
607             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
608             vecIn0 = vld1q(&pSamples[6]);
609             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
610             vecIn0 = vld1q(&pSamples[7]);
611             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
612             float32x4_t pap = vld1q_f32(partial_accu_ptr);
613             vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
614             pOutput += cnt;
615         }
616     }
617     else {
618         c0 = *pCoeffs++;
619         c1 = *pCoeffs++;
620         c2 = *pCoeffs++;
621         c3 = *pCoeffs++;
622         pOutput = pDst;
623 
624         partial_accu_ptr = S->pState;
625         cnt = blockSize >> 2;
626         while(cnt > 0) {
627             float32x4_t vecAcc0;
628             float32x4_t vecIn0;
629 
630             vecIn0 = vld1q(pSamples);
631             vecAcc0 = vmulq(vecIn0, c0);
632             vecIn0 = vld1q(&pSamples[1]);
633             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
634             vecIn0 = vld1q(&pSamples[2]);
635             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
636             vecIn0 = vld1q(&pSamples[3]);
637             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
638             pSamples += 4;
639             float32x4_t pap = vld1q_f32(partial_accu_ptr);
640             vst1q(pOutput, vecAcc0+pap);
641             cnt--;
642             partial_accu_ptr += 4;
643             pOutput += 4;
644         }
645 
646         cnt = blockSize & 3;
647         if (cnt > 0) {
648             float32x4_t vecAcc0;
649             float32x4_t vecIn0;
650 
651             mve_pred16_t p0 = vctp32q(cnt);
652 
653             vecIn0 = vld1q(pSamples);
654             vecAcc0 = vmulq(vecIn0, c0);
655             vecIn0 = vld1q(&pSamples[1]);
656             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
657             vecIn0 = vld1q(&pSamples[2]);
658             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
659             vecIn0 = vld1q(&pSamples[3]);
660             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
661             float32x4_t pap = vld1q_f32(partial_accu_ptr);
662             vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
663             pOutput += cnt;
664         }
665     }
666 
667     /*
668      * Copy the samples back into the history buffer start
669      */
670     pTempSrc = &pRefStatePtr[blockSize];
671     pTempDest = pRefStatePtr;
672 
673     blkCnt = numTaps >> 2;
674     while (blkCnt > 0)
675     {
676         vst1q(pTempDest, vld1q(pTempSrc));
677         pTempSrc += 4;
678         pTempDest += 4;
679         blkCnt--;
680     }
681     blkCnt = numTaps & 3;
682     if (blkCnt > 0)
683     {
684         mve_pred16_t p0 = vctp32q(blkCnt);
685         vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
686     }
687 }
688 
689 #else
690 #if defined(ARM_MATH_NEON)
691 
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)692 ARM_DSP_ATTRIBUTE void arm_fir_f32(
693 const arm_fir_instance_f32 * S,
694 const float32_t * pSrc,
695 float32_t * pDst,
696 uint32_t blockSize)
697 {
698    float32_t *pState = S->pState;                 /* State pointer */
699    const float32_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
700    float32_t *pStateCurnt;                        /* Points to the current sample of the state */
701    float32_t *px;                                 /* Temporary pointers for state buffer */
702    const float32_t *pb;                           /* Temporary pointers for coefficient buffer */
703    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
704    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
705 
706    float32x4_t accv0,accv1,samples0,samples1,x0,x1,x2,xa,xb,b;
707    float32_t acc;
708 
709    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
710    /* pStateCurnt points to the location where the new input data should be written */
711    pStateCurnt = &(S->pState[(numTaps - 1U)]);
712 
713    /* Loop unrolling */
714    blkCnt = blockSize >> 3;
715 
716    while (blkCnt > 0U)
717    {
718       /* Copy 8 samples at a time into state buffers */
719       samples0 = vld1q_f32(pSrc);
720       vst1q_f32(pStateCurnt,samples0);
721 
722       pStateCurnt += 4;
723       pSrc += 4 ;
724 
725       samples1 = vld1q_f32(pSrc);
726       vst1q_f32(pStateCurnt,samples1);
727 
728       pStateCurnt += 4;
729       pSrc += 4 ;
730 
731       /* Set the accumulators to zero */
732       accv0 = vdupq_n_f32(0);
733       accv1 = vdupq_n_f32(0);
734 
735       /* Initialize state pointer */
736       px = pState;
737 
738       /* Initialize coefficient pointer */
739       pb = pCoeffs;
740 
741       /* Loop unroling */
742       i = numTaps >> 2;
743 
744       /* Perform the multiply-accumulates */
745       x0 = vld1q_f32(px);
746       x1 = vld1q_f32(px + 4);
747 
748       while(i > 0)
749       {
750          /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
751          x2 = vld1q_f32(px + 8);
752          b = vld1q_f32(pb);
753          xa = x0;
754          xb = x1;
755          accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0));
756          accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0));
757 
758          xa = vextq_f32(x0,x1,1);
759          xb = vextq_f32(x1,x2,1);
760 
761          accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1));
762          accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1));
763 
764          xa = vextq_f32(x0,x1,2);
765          xb = vextq_f32(x1,x2,2);
766 
767          accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2));
768          accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2));
769 
770          xa = vextq_f32(x0,x1,3);
771          xb = vextq_f32(x1,x2,3);
772 
773          accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3));
774          accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3));
775 
776          pb += 4;
777          x0 = x1;
778          x1 = x2;
779          px += 4;
780          i--;
781 
782       }
783 
784       /* Tail */
785       i = numTaps & 3;
786       x2 = vld1q_f32(px + 8);
787 
788       /* Perform the multiply-accumulates */
789       switch(i)
790       {
791          case 3:
792          {
793            accv0 = vmlaq_n_f32(accv0,x0,*pb);
794            accv1 = vmlaq_n_f32(accv1,x1,*pb);
795 
796            pb++;
797 
798            xa = vextq_f32(x0,x1,1);
799            xb = vextq_f32(x1,x2,1);
800 
801            accv0 = vmlaq_n_f32(accv0,xa,*pb);
802            accv1 = vmlaq_n_f32(accv1,xb,*pb);
803 
804            pb++;
805 
806            xa = vextq_f32(x0,x1,2);
807            xb = vextq_f32(x1,x2,2);
808 
809            accv0 = vmlaq_n_f32(accv0,xa,*pb);
810            accv1 = vmlaq_n_f32(accv1,xb,*pb);
811 
812          }
813          break;
814          case 2:
815          {
816            accv0 = vmlaq_n_f32(accv0,x0,*pb);
817            accv1 = vmlaq_n_f32(accv1,x1,*pb);
818 
819            pb++;
820 
821            xa = vextq_f32(x0,x1,1);
822            xb = vextq_f32(x1,x2,1);
823 
824            accv0 = vmlaq_n_f32(accv0,xa,*pb);
825            accv1 = vmlaq_n_f32(accv1,xb,*pb);
826 
827          }
828          break;
829          case 1:
830          {
831 
832            accv0 = vmlaq_n_f32(accv0,x0,*pb);
833            accv1 = vmlaq_n_f32(accv1,x1,*pb);
834 
835          }
836          break;
837          default:
838          break;
839       }
840 
841       /* The result is stored in the destination buffer. */
842       vst1q_f32(pDst,accv0);
843       pDst += 4;
844       vst1q_f32(pDst,accv1);
845       pDst += 4;
846 
847       /* Advance state pointer by 8 for the next 8 samples */
848       pState = pState + 8;
849 
850       blkCnt--;
851    }
852 
853    /* Tail */
854    blkCnt = blockSize & 0x7;
855 
856    while (blkCnt > 0U)
857    {
858       /* Copy one sample at a time into state buffer */
859       *pStateCurnt++ = *pSrc++;
860 
861       /* Set the accumulator to zero */
862       acc = 0.0f;
863 
864       /* Initialize state pointer */
865       px = pState;
866 
867       /* Initialize Coefficient pointer */
868       pb = pCoeffs;
869 
870       i = numTaps;
871 
872       /* Perform the multiply-accumulates */
873       do
874       {
875          /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
876          acc += *px++ * *pb++;
877          i--;
878 
879       } while (i > 0U);
880 
881       /* The result is stored in the destination buffer. */
882       *pDst++ = acc;
883 
884       /* Advance state pointer by 1 for the next sample */
885       pState = pState + 1;
886 
887       blkCnt--;
888    }
889 
890    /* Processing is complete.
891    ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
892    ** This prepares the state buffer for the next function call. */
893 
894    /* Points to the start of the state buffer */
895    pStateCurnt = S->pState;
896 
897    /* Copy numTaps number of values */
898    tapCnt = numTaps - 1U;
899 
900    /* Copy data */
901    while (tapCnt > 0U)
902    {
903       *pStateCurnt++ = *pState++;
904 
905       /* Decrement the loop counter */
906       tapCnt--;
907    }
908 
909 }
910 #else
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)911 ARM_DSP_ATTRIBUTE void arm_fir_f32(
912   const arm_fir_instance_f32 * S,
913   const float32_t * pSrc,
914         float32_t * pDst,
915         uint32_t blockSize)
916 {
917         float32_t *pState = S->pState;                 /* State pointer */
918   const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
919         float32_t *pStateCurnt;                        /* Points to the current sample of the state */
920         float32_t *px;                                 /* Temporary pointer for state buffer */
921   const float32_t *pb;                                 /* Temporary pointer for coefficient buffer */
922         float32_t acc0;                                /* Accumulator */
923         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
924         uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
925 
926 #if defined (ARM_MATH_LOOPUNROLL)
927         float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators */
928         float32_t x0, x1, x2, x3, x4, x5, x6, x7;               /* Temporary variables to hold state values */
929         float32_t c0;                                           /* Temporary variable to hold coefficient value */
930 #endif
931 
932   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
933   /* pStateCurnt points to the location where the new input data should be written */
934   pStateCurnt = &(S->pState[(numTaps - 1U)]);
935 
936 #if defined (ARM_MATH_LOOPUNROLL)
937 
938   /* Loop unrolling: Compute 8 output values simultaneously.
939    * The variables acc0 ... acc7 hold output values that are being computed:
940    *
941    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
942    *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
943    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
944    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
945    */
946 
947   blkCnt = blockSize >> 3U;
948 
949   while (blkCnt > 0U)
950   {
951     /* Copy 4 new input samples into the state buffer. */
952     *pStateCurnt++ = *pSrc++;
953     *pStateCurnt++ = *pSrc++;
954     *pStateCurnt++ = *pSrc++;
955     *pStateCurnt++ = *pSrc++;
956 
957     /* Set all accumulators to zero */
958     acc0 = 0.0f;
959     acc1 = 0.0f;
960     acc2 = 0.0f;
961     acc3 = 0.0f;
962     acc4 = 0.0f;
963     acc5 = 0.0f;
964     acc6 = 0.0f;
965     acc7 = 0.0f;
966 
967     /* Initialize state pointer */
968     px = pState;
969 
970     /* Initialize coefficient pointer */
971     pb = pCoeffs;
972 
973     /* This is separated from the others to avoid
974      * a call to __aeabi_memmove which would be slower
975      */
976     *pStateCurnt++ = *pSrc++;
977     *pStateCurnt++ = *pSrc++;
978     *pStateCurnt++ = *pSrc++;
979     *pStateCurnt++ = *pSrc++;
980 
981     /* Read the first 7 samples from the state buffer:  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
982     x0 = *px++;
983     x1 = *px++;
984     x2 = *px++;
985     x3 = *px++;
986     x4 = *px++;
987     x5 = *px++;
988     x6 = *px++;
989 
990     /* Loop unrolling: process 8 taps at a time. */
991     tapCnt = numTaps >> 3U;
992 
993     while (tapCnt > 0U)
994     {
995       /* Read the b[numTaps-1] coefficient */
996       c0 = *(pb++);
997 
998       /* Read x[n-numTaps-3] sample */
999       x7 = *(px++);
1000 
1001       /* acc0 +=  b[numTaps-1] * x[n-numTaps] */
1002       acc0 += x0 * c0;
1003 
1004       /* acc1 +=  b[numTaps-1] * x[n-numTaps-1] */
1005       acc1 += x1 * c0;
1006 
1007       /* acc2 +=  b[numTaps-1] * x[n-numTaps-2] */
1008       acc2 += x2 * c0;
1009 
1010       /* acc3 +=  b[numTaps-1] * x[n-numTaps-3] */
1011       acc3 += x3 * c0;
1012 
1013       /* acc4 +=  b[numTaps-1] * x[n-numTaps-4] */
1014       acc4 += x4 * c0;
1015 
1016       /* acc1 +=  b[numTaps-1] * x[n-numTaps-5] */
1017       acc5 += x5 * c0;
1018 
1019       /* acc2 +=  b[numTaps-1] * x[n-numTaps-6] */
1020       acc6 += x6 * c0;
1021 
1022       /* acc3 +=  b[numTaps-1] * x[n-numTaps-7] */
1023       acc7 += x7 * c0;
1024 
1025       /* Read the b[numTaps-2] coefficient */
1026       c0 = *(pb++);
1027 
1028       /* Read x[n-numTaps-4] sample */
1029       x0 = *(px++);
1030 
1031       /* Perform the multiply-accumulate */
1032       acc0 += x1 * c0;
1033       acc1 += x2 * c0;
1034       acc2 += x3 * c0;
1035       acc3 += x4 * c0;
1036       acc4 += x5 * c0;
1037       acc5 += x6 * c0;
1038       acc6 += x7 * c0;
1039       acc7 += x0 * c0;
1040 
1041       /* Read the b[numTaps-3] coefficient */
1042       c0 = *(pb++);
1043 
1044       /* Read x[n-numTaps-5] sample */
1045       x1 = *(px++);
1046 
1047       /* Perform the multiply-accumulates */
1048       acc0 += x2 * c0;
1049       acc1 += x3 * c0;
1050       acc2 += x4 * c0;
1051       acc3 += x5 * c0;
1052       acc4 += x6 * c0;
1053       acc5 += x7 * c0;
1054       acc6 += x0 * c0;
1055       acc7 += x1 * c0;
1056 
1057       /* Read the b[numTaps-4] coefficient */
1058       c0 = *(pb++);
1059 
1060       /* Read x[n-numTaps-6] sample */
1061       x2 = *(px++);
1062 
1063       /* Perform the multiply-accumulates */
1064       acc0 += x3 * c0;
1065       acc1 += x4 * c0;
1066       acc2 += x5 * c0;
1067       acc3 += x6 * c0;
1068       acc4 += x7 * c0;
1069       acc5 += x0 * c0;
1070       acc6 += x1 * c0;
1071       acc7 += x2 * c0;
1072 
1073       /* Read the b[numTaps-4] coefficient */
1074       c0 = *(pb++);
1075 
1076       /* Read x[n-numTaps-6] sample */
1077       x3 = *(px++);
1078       /* Perform the multiply-accumulates */
1079       acc0 += x4 * c0;
1080       acc1 += x5 * c0;
1081       acc2 += x6 * c0;
1082       acc3 += x7 * c0;
1083       acc4 += x0 * c0;
1084       acc5 += x1 * c0;
1085       acc6 += x2 * c0;
1086       acc7 += x3 * c0;
1087 
1088       /* Read the b[numTaps-4] coefficient */
1089       c0 = *(pb++);
1090 
1091       /* Read x[n-numTaps-6] sample */
1092       x4 = *(px++);
1093 
1094       /* Perform the multiply-accumulates */
1095       acc0 += x5 * c0;
1096       acc1 += x6 * c0;
1097       acc2 += x7 * c0;
1098       acc3 += x0 * c0;
1099       acc4 += x1 * c0;
1100       acc5 += x2 * c0;
1101       acc6 += x3 * c0;
1102       acc7 += x4 * c0;
1103 
1104       /* Read the b[numTaps-4] coefficient */
1105       c0 = *(pb++);
1106 
1107       /* Read x[n-numTaps-6] sample */
1108       x5 = *(px++);
1109 
1110       /* Perform the multiply-accumulates */
1111       acc0 += x6 * c0;
1112       acc1 += x7 * c0;
1113       acc2 += x0 * c0;
1114       acc3 += x1 * c0;
1115       acc4 += x2 * c0;
1116       acc5 += x3 * c0;
1117       acc6 += x4 * c0;
1118       acc7 += x5 * c0;
1119 
1120       /* Read the b[numTaps-4] coefficient */
1121       c0 = *(pb++);
1122 
1123       /* Read x[n-numTaps-6] sample */
1124       x6 = *(px++);
1125 
1126       /* Perform the multiply-accumulates */
1127       acc0 += x7 * c0;
1128       acc1 += x0 * c0;
1129       acc2 += x1 * c0;
1130       acc3 += x2 * c0;
1131       acc4 += x3 * c0;
1132       acc5 += x4 * c0;
1133       acc6 += x5 * c0;
1134       acc7 += x6 * c0;
1135 
1136       /* Decrement loop counter */
1137       tapCnt--;
1138     }
1139 
1140     /* Loop unrolling: Compute remaining outputs */
1141     tapCnt = numTaps % 0x8U;
1142 
1143     while (tapCnt > 0U)
1144     {
1145       /* Read coefficients */
1146       c0 = *(pb++);
1147 
1148       /* Fetch 1 state variable */
1149       x7 = *(px++);
1150 
1151       /* Perform the multiply-accumulates */
1152       acc0 += x0 * c0;
1153       acc1 += x1 * c0;
1154       acc2 += x2 * c0;
1155       acc3 += x3 * c0;
1156       acc4 += x4 * c0;
1157       acc5 += x5 * c0;
1158       acc6 += x6 * c0;
1159       acc7 += x7 * c0;
1160 
1161       /* Reuse the present sample states for next sample */
1162       x0 = x1;
1163       x1 = x2;
1164       x2 = x3;
1165       x3 = x4;
1166       x4 = x5;
1167       x5 = x6;
1168       x6 = x7;
1169 
1170       /* Decrement loop counter */
1171       tapCnt--;
1172     }
1173 
1174     /* Advance the state pointer by 8 to process the next group of 8 samples */
1175     pState = pState + 8;
1176 
1177     /* The results in the 8 accumulators, store in the destination buffer. */
1178     *pDst++ = acc0;
1179     *pDst++ = acc1;
1180     *pDst++ = acc2;
1181     *pDst++ = acc3;
1182     *pDst++ = acc4;
1183     *pDst++ = acc5;
1184     *pDst++ = acc6;
1185     *pDst++ = acc7;
1186 
1187 
1188     /* Decrement loop counter */
1189     blkCnt--;
1190   }
1191 
1192   /* Loop unrolling: Compute remaining output samples */
1193   blkCnt = blockSize % 0x8U;
1194 
1195 #else
1196 
1197   /* Initialize blkCnt with number of taps */
1198   blkCnt = blockSize;
1199 
1200 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1201 
1202   while (blkCnt > 0U)
1203   {
1204     /* Copy one sample at a time into state buffer */
1205     *pStateCurnt++ = *pSrc++;
1206 
1207     /* Set the accumulator to zero */
1208     acc0 = 0.0f;
1209 
1210     /* Initialize state pointer */
1211     px = pState;
1212 
1213     /* Initialize Coefficient pointer */
1214     pb = pCoeffs;
1215 
1216     i = numTaps;
1217 
1218     /* Perform the multiply-accumulates */
1219     while (i > 0U)
1220     {
1221       /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
1222       acc0 += *px++ * *pb++;
1223 
1224       i--;
1225     }
1226 
1227     /* Store result in destination buffer. */
1228     *pDst++ = acc0;
1229 
1230     /* Advance state pointer by 1 for the next sample */
1231     pState = pState + 1U;
1232 
1233     /* Decrement loop counter */
1234     blkCnt--;
1235   }
1236 
1237   /* Processing is complete.
1238      Now copy the last numTaps - 1 samples to the start of the state buffer.
1239      This prepares the state buffer for the next function call. */
1240 
1241   /* Points to the start of the state buffer */
1242   pStateCurnt = S->pState;
1243 
1244 #if defined (ARM_MATH_LOOPUNROLL)
1245 
1246   /* Loop unrolling: Compute 4 taps at a time */
1247   tapCnt = (numTaps - 1U) >> 2U;
1248 
1249   /* Copy data */
1250   while (tapCnt > 0U)
1251   {
1252     *pStateCurnt++ = *pState++;
1253     *pStateCurnt++ = *pState++;
1254     *pStateCurnt++ = *pState++;
1255     *pStateCurnt++ = *pState++;
1256 
1257     /* Decrement loop counter */
1258     tapCnt--;
1259   }
1260 
1261   /* Calculate remaining number of copies */
1262   tapCnt = (numTaps - 1U) % 0x4U;
1263 
1264 #else
1265 
1266   /* Initialize tapCnt with number of taps */
1267   tapCnt = (numTaps - 1U);
1268 
1269 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1270 
1271   /* Copy remaining data */
1272   while (tapCnt > 0U)
1273   {
1274     *pStateCurnt++ = *pState++;
1275 
1276     /* Decrement loop counter */
1277     tapCnt--;
1278   }
1279 
1280 }
1281 
1282 #endif /* #if defined(ARM_MATH_NEON) */
1283 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1284 
1285 /**
1286 * @} end of FIR group
1287 */
1288