1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_decimate_f32.c
4  * Description:  FIR decimation for floating-point sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator
37 
38   These functions combine an FIR filter together with a decimator.
39   They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion.
40   Conceptually, the functions are equivalent to the block diagram below:
41   \image html FIRDecimator.gif "Components included in the FIR Decimator functions"
42   When decimating by a factor of <code>M</code>, the signal should be prefiltered by a lowpass filter with a normalized
43   cutoff frequency of <code>1/M</code> in order to prevent aliasing distortion.
44   The user of the function is responsible for providing the filter coefficients.
45 
46   The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner.
47   Instead of calculating all of the FIR filter outputs and discarding <code>M-1</code> out of every <code>M</code>, only the
48   samples output by the decimator are computed.
49   The functions operate on blocks of input and output data.
50   <code>pSrc</code> points to an array of <code>blockSize</code> input values and
51   <code>pDst</code> points to an array of <code>blockSize/M</code> output values.
52   In order to have an integer number of output samples <code>blockSize</code>
53   must always be a multiple of the decimation factor <code>M</code>.
54 
55   The library provides separate functions for Q15, Q31 and floating-point data types.
56 
57   @par           Algorithm:
58                    The FIR portion of the algorithm uses the standard form filter:
59   <pre>
60       y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
61   </pre>
62                    where, <code>b[n]</code> are the filter coefficients.
63   @par
64                    The <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
65                    Coefficients are stored in time reversed order.
66   @par
67   <pre>
68       {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
69   </pre>
70   @par
71                    <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
72                    Samples in the state buffer are stored oldest first, where <code>x[n]</code> is the first sample of the current block:
73   @par
74   <pre>
75       {x[n-numTaps+1], x[n-numTaps+2], ..., x[n-1], x[n], x[n+1], ..., x[n+blockSize-1]}
76   </pre>
77                    The state variables are updated after each block of data is processed, the coefficients are untouched.
78 
79   @par           Instance Structure
80                    The coefficients and state variables for a filter are stored together in an instance data structure.
81                    A separate instance structure must be defined for each filter.
82                    Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
83                    There are separate instance structure declarations for each of the 3 supported data types.
84 
85  @par            Initialization Functions
86                    There is also an associated initialization function for each data type.
87                    The initialization function performs the following operations:
88                    - Sets the values of the internal structure fields.
89                    - Zeros out the values in the state buffer.
90                    - Checks to make sure that the size of the input is a multiple of the decimation factor.
91                    To do this manually without calling the init function, assign the following fields of the instance structure:
92                    numTaps, pCoeffs, M (decimation factor), pState. Also set all of the values in pState to zero.
93   @par
94                    Use of the initialization function is optional.
95                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
96                    To place an instance structure into a const data section, the instance structure must be manually initialized.
97                    The code below statically initializes each of the 3 different data type filter instance structures
98   <pre>
99       arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};
100       arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};
101       arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};
102   </pre>
103                    where <code>M</code> is the decimation factor; <code>numTaps</code> is the number of filter coefficients in the filter;
104                    <code>pCoeffs</code> is the address of the coefficient buffer;
105                    <code>pState</code> is the address of the state buffer.
106                    Be sure to set the values in the state buffer to zeros when doing static initialization.
107 
108   @par           Fixed-Point Behavior
109                    Care must be taken when using the fixed-point versions of the FIR decimate filter functions.
110                    In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
111                    Refer to the function specific documentation below for usage guidelines.
112  */
113 
114 /**
115   @addtogroup FIR_decimate
116   @{
117  */
118 
119 /**
120   @brief         Processing function for floating-point FIR decimator.
121   @param[in]     S         points to an instance of the floating-point FIR decimator structure
122   @param[in]     pSrc      points to the block of input data
123   @param[out]    pDst      points to the block of output data
124   @param[in]     blockSize number of input samples to process
125  */
126 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
127 
128 #include "arm_helium_utils.h"
129 
arm_fir_decimate_f32(const arm_fir_decimate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)130 ARM_DSP_ATTRIBUTE void arm_fir_decimate_f32(
131   const arm_fir_decimate_instance_f32 * S,
132   const float32_t * pSrc,
133   float32_t * pDst,
134   uint32_t blockSize)
135 {
136     float32_t *pState = S->pState;  /* State pointer */
137     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
138     float32_t *pStateCurnt;     /* Points to the current sample of the state */
139     const float32_t *px, *pb;         /* Temporary pointers for state and coefficient buffers */
140     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
141     uint32_t  i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;   /* Loop counters */
142     uint32_t  blkCntN4;
143     const float32_t *px0, *px1, *px2, *px3;
144     f32x4_t accv = { 0 }, acc0v, acc1v, acc2v, acc3v;
145     f32x4_t x0v, x1v, x2v, x3v;
146     f32x4_t c0v;
147 
148     /*
149      * S->pState buffer contains previous frame (numTaps - 1) samples
150      * pStateCurnt points to the location where the new input data should be written
151      */
152     pStateCurnt = S->pState + (numTaps - 1U);
153     /*
154      * Total number of output samples to be computed
155      */
156     blkCnt = outBlockSize / 4;
157     blkCntN4 = outBlockSize - (4 * blkCnt);
158 
159     while (blkCnt > 0U)
160     {
161         /*
162          * Copy 4 * decimation factor number of new input samples into the state buffer
163          */
164         i = (4 * S->M) >> 2;
165         do
166         {
167             vst1q(pStateCurnt, vld1q((const float32_t *)pSrc));
168             pSrc += 4;
169             pStateCurnt += 4;
170             i--;
171         }
172         while (i > 0U);
173 
174         /*
175          * Set accumulators to zero
176          */
177         acc0v = vdupq_n_f32(0.0f);
178         acc1v = vdupq_n_f32(0.0f);
179         acc2v = vdupq_n_f32(0.0f);
180         acc3v = vdupq_n_f32(0.0f);
181 
182         /*
183          * Initialize state pointer for all the samples
184          */
185         px0 = pState;
186         px1 = pState + S->M;
187         px2 = pState + 2 * S->M;
188         px3 = pState + 3 * S->M;
189         /*
190          * Initialize coeff pointer
191          */
192         pb = pCoeffs;
193         /*
194          * Loop unrolling.  Process 4 taps at a time.
195          */
196         tapCnt = numTaps >> 2;
197         /*
198          * Loop over the number of taps.  Unroll by a factor of 4.
199          * Repeat until we've computed numTaps-4 coefficients.
200          */
201         while (tapCnt > 0U)
202         {
203             /*
204              * Read the b[numTaps-1] coefficient
205              */
206             c0v = vld1q((const float32_t *)pb);
207             pb += 4;
208             /*
209              * Read x[n-numTaps-1] sample for acc0
210              */
211             x0v = vld1q(px0);
212             x1v = vld1q(px1);
213             x2v = vld1q(px2);
214             x3v = vld1q(px3);
215             px0 += 4;
216             px1 += 4;
217             px2 += 4;
218             px3 += 4;
219 
220             acc0v = vfmaq(acc0v, x0v, c0v);
221             acc1v = vfmaq(acc1v, x1v, c0v);
222             acc2v = vfmaq(acc2v, x2v, c0v);
223             acc3v = vfmaq(acc3v, x3v, c0v);
224             /*
225              * Decrement the loop counter
226              */
227             tapCnt--;
228         }
229 
230         /*
231          * If the filter length is not a multiple of 4, compute the remaining filter taps
232          * should be tail predicated
233          */
234         tapCnt = numTaps % 0x4U;
235         if (tapCnt > 0U)
236         {
237             mve_pred16_t p0 = vctp32q(tapCnt);
238             /*
239              * Read the b[numTaps-1] coefficient
240              */
241             c0v = vldrwq_z_f32(pb, p0);
242             pb += 4;
243             /*
244              * Read x[n-numTaps-1] sample for acc0
245              */
246             x0v = vld1q(px0);
247             x1v = vld1q(px1);
248             x2v = vld1q(px2);
249             x3v = vld1q(px3);
250             px0 += 4;
251             px1 += 4;
252             px2 += 4;
253             px3 += 4;
254 
255             acc0v = vfmaq_f32(acc0v, x0v, c0v);
256             acc1v = vfmaq_f32(acc1v, x1v, c0v);
257             acc2v = vfmaq_f32(acc2v, x2v, c0v);
258             acc3v = vfmaq_f32(acc3v, x3v, c0v);
259         }
260 
261         /* reduction */
262         accv[0] = vecAddAcrossF32Mve(acc0v);
263         accv[1] = vecAddAcrossF32Mve(acc1v);
264         accv[2] = vecAddAcrossF32Mve(acc2v);
265         accv[3] = vecAddAcrossF32Mve(acc3v);
266 
267         /*
268          * Advance the state pointer by the decimation factor
269          * to process the next group of decimation factor number samples
270          */
271         pState = pState + 4 * S->M;
272         /*
273          * The result is in the accumulator, store in the destination buffer.
274          */
275         vst1q(pDst, accv);
276         pDst += 4;
277 
278         /*
279          * Decrement the loop counter
280          */
281         blkCnt--;
282     }
283 
284     while (blkCntN4 > 0U)
285     {
286         /*
287          * Copy decimation factor number of new input samples into the state buffer
288          */
289         i = S->M;
290         do
291         {
292             *pStateCurnt++ = *pSrc++;
293         }
294         while (--i);
295         /*
296          * Set accumulator to zero
297          */
298         acc0v = vdupq_n_f32(0.0f);
299         /*
300          * Initialize state pointer
301          */
302         px = pState;
303         /*
304          * Initialize coeff pointer
305          */
306         pb = pCoeffs;
307         /*
308          * Loop unrolling.  Process 4 taps at a time.
309          */
310         tapCnt = numTaps >> 2;
311         /*
312          * Loop over the number of taps.  Unroll by a factor of 4.
313          * Repeat until we've computed numTaps-4 coefficients.
314          */
315         while (tapCnt > 0U)
316         {
317             c0v = vldrwq_f32(pb);
318             x0v = vldrwq_f32(px);
319             pb += 4;
320             px += 4;
321             acc0v = vfmaq_f32(acc0v, x0v, c0v);
322             /*
323              * Decrement the loop counter
324              */
325             tapCnt--;
326         }
327         tapCnt = numTaps % 0x4U;
328         if (tapCnt > 0U)
329         {
330             mve_pred16_t p0 = vctp32q(tapCnt);
331             c0v = vldrwq_z_f32(pb, p0);
332             x0v = vldrwq_f32(px);
333             acc0v = vfmaq_f32(acc0v, x0v, c0v);
334         }
335         accv[0] = vecAddAcrossF32Mve(acc0v);
336 
337         /*
338          * Advance the state pointer by the decimation factor
339          * * to process the next group of decimation factor number samples
340          */
341         pState = pState + S->M;
342         /*
343          * The result is in the accumulator, store in the destination buffer.
344          */
345         *pDst++ = accv[0];
346         /*
347          * Decrement the loop counter
348          */
349         blkCntN4--;
350     }
351 
352     /*
353      * Processing is complete.
354      * Now copy the last numTaps - 1 samples to the start of the state buffer.
355      * This prepares the state buffer for the next function call.
356      */
357 
358     pStateCurnt = S->pState;
359     blkCnt =(numTaps - 1) >> 2;
360     while (blkCnt > 0U)
361     {
362         vst1q(pStateCurnt, vldrwq_f32(pState));
363         pState += 4;
364         pStateCurnt += 4;
365         blkCnt--;
366     }
367     blkCnt = (numTaps - 1) & 3;
368     if (blkCnt > 0U)
369     {
370         mve_pred16_t p0 = vctp32q(blkCnt);
371         vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
372     }
373 }
374 #else
375 #if defined(ARM_MATH_NEON)
arm_fir_decimate_f32(const arm_fir_decimate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)376 ARM_DSP_ATTRIBUTE void arm_fir_decimate_f32(
377   const arm_fir_decimate_instance_f32 * S,
378   const float32_t * pSrc,
379   float32_t * pDst,
380   uint32_t blockSize)
381 {
382   float32_t *pState = S->pState;                 /* State pointer */
383   const float32_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
384   float32_t *pStateCurnt;                        /* Points to the current sample of the state */
385   float32_t *px;                                 /* Temporary pointer for state buffer */
386   const float32_t *pb;                           /* Temporary pointer for coefficient buffer */
387   float32_t sum0;                                /* Accumulator */
388   float32_t x0, c0;                              /* Temporary variables to hold state and coefficient values */
389   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
390   uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
391 
392   uint32_t blkCntN4;
393   float32_t *px0, *px1, *px2, *px3;
394   float32_t x1, x2, x3;
395 
396   float32x4_t accv,acc0v,acc1v,acc2v,acc3v;
397   float32x4_t x0v, x1v, x2v, x3v;
398   float32x4_t c0v;
399   float32x2_t temp;
400   float32x4_t sum0v;
401 
402   /* S->pState buffer contains previous frame (numTaps - 1) samples */
403   /* pStateCurnt points to the location where the new input data should be written */
404   pStateCurnt = S->pState + (numTaps - 1U);
405 
406   /* Total number of output samples to be computed */
407   blkCnt = outBlockSize / 4;
408   blkCntN4 = outBlockSize - (4 * blkCnt);
409 
410   while (blkCnt > 0U)
411   {
412     /* Copy 4 * decimation factor number of new input samples into the state buffer */
413     i = 4 * S->M;
414 
415     do
416     {
417       *pStateCurnt++ = *pSrc++;
418 
419     } while (--i);
420 
421     /* Set accumulators to zero */
422     acc0v = vdupq_n_f32(0.0);
423     acc1v = vdupq_n_f32(0.0);
424     acc2v = vdupq_n_f32(0.0);
425     acc3v = vdupq_n_f32(0.0);
426 
427     /* Initialize state pointer for all the samples */
428     px0 = pState;
429     px1 = pState + S->M;
430     px2 = pState + 2 * S->M;
431     px3 = pState + 3 * S->M;
432 
433     /* Initialize coeff pointer */
434     pb = pCoeffs;
435 
436     /* Process 4 taps at a time. */
437     tapCnt = numTaps >> 2;
438 
439     /* Loop over the number of taps.
440      ** Repeat until we've computed numTaps-4 coefficients. */
441 
442     while (tapCnt > 0U)
443     {
444       /* Read the b[numTaps-1] coefficient */
445       c0v = vld1q_f32(pb);
446       pb += 4;
447 
448       /* Read x[n-numTaps-1] sample for acc0 */
449       x0v = vld1q_f32(px0);
450       x1v = vld1q_f32(px1);
451       x2v = vld1q_f32(px2);
452       x3v = vld1q_f32(px3);
453 
454       px0 += 4;
455       px1 += 4;
456       px2 += 4;
457       px3 += 4;
458 
459       acc0v = vmlaq_f32(acc0v, x0v, c0v);
460       acc1v = vmlaq_f32(acc1v, x1v, c0v);
461       acc2v = vmlaq_f32(acc2v, x2v, c0v);
462       acc3v = vmlaq_f32(acc3v, x3v, c0v);
463 
464       /* Decrement the loop counter */
465       tapCnt--;
466     }
467 
468     temp = vpadd_f32(vget_low_f32(acc0v),vget_high_f32(acc0v));
469     accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,0);
470 
471     temp = vpadd_f32(vget_low_f32(acc1v),vget_high_f32(acc1v));
472     accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,1);
473 
474     temp = vpadd_f32(vget_low_f32(acc2v),vget_high_f32(acc2v));
475     accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,2);
476 
477     temp = vpadd_f32(vget_low_f32(acc3v),vget_high_f32(acc3v));
478     accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,3);
479 
480     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
481     tapCnt = numTaps % 0x4U;
482 
483     while (tapCnt > 0U)
484     {
485       /* Read coefficients */
486       c0 = *(pb++);
487 
488       /* Fetch  state variables for acc0, acc1, acc2, acc3 */
489       x0 = *(px0++);
490       x1 = *(px1++);
491       x2 = *(px2++);
492       x3 = *(px3++);
493 
494       /* Perform the multiply-accumulate */
495       accv = vsetq_lane_f32(vgetq_lane_f32(accv, 0) + x0 * c0,accv,0);
496       accv = vsetq_lane_f32(vgetq_lane_f32(accv, 1) + x1 * c0,accv,1);
497       accv = vsetq_lane_f32(vgetq_lane_f32(accv, 2) + x2 * c0,accv,2);
498       accv = vsetq_lane_f32(vgetq_lane_f32(accv, 3) + x3 * c0,accv,3);
499 
500       /* Decrement the loop counter */
501       tapCnt--;
502     }
503 
504     /* Advance the state pointer by the decimation factor
505      * to process the next group of decimation factor number samples */
506     pState = pState + 4 * S->M;
507 
508     /* The result is in the accumulator, store in the destination buffer. */
509     vst1q_f32(pDst,accv);
510     pDst += 4;
511 
512     /* Decrement the loop counter */
513     blkCnt--;
514   }
515 
516   while (blkCntN4 > 0U)
517   {
518     /* Copy decimation factor number of new input samples into the state buffer */
519     i = S->M;
520 
521     do
522     {
523       *pStateCurnt++ = *pSrc++;
524 
525     } while (--i);
526 
527     /* Set accumulator to zero */
528     sum0v =  vdupq_n_f32(0.0);
529 
530     /* Initialize state pointer */
531     px = pState;
532 
533     /* Initialize coeff pointer */
534     pb = pCoeffs;
535 
536     /* Process 4 taps at a time. */
537     tapCnt = numTaps >> 2;
538 
539     /* Loop over the number of taps.
540      ** Repeat until we've computed numTaps-4 coefficients. */
541     while (tapCnt > 0U)
542     {
543       c0v = vld1q_f32(pb);
544       pb += 4;
545 
546       x0v = vld1q_f32(px);
547       px += 4;
548 
549       sum0v = vmlaq_f32(sum0v, x0v, c0v);
550 
551       /* Decrement the loop counter */
552       tapCnt--;
553     }
554 
555     temp = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
556     sum0 = vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1);
557 
558     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
559     tapCnt = numTaps % 0x4U;
560 
561     while (tapCnt > 0U)
562     {
563       /* Read coefficients */
564       c0 = *(pb++);
565 
566       /* Fetch 1 state variable */
567       x0 = *(px++);
568 
569       /* Perform the multiply-accumulate */
570       sum0 += x0 * c0;
571 
572       /* Decrement the loop counter */
573       tapCnt--;
574     }
575 
576     /* Advance the state pointer by the decimation factor
577      * to process the next group of decimation factor number samples */
578     pState = pState + S->M;
579 
580     /* The result is in the accumulator, store in the destination buffer. */
581     *pDst++ = sum0;
582 
583     /* Decrement the loop counter */
584     blkCntN4--;
585   }
586 
587   /* Processing is complete.
588    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
589    ** This prepares the state buffer for the next function call. */
590 
591   /* Points to the start of the state buffer */
592   pStateCurnt = S->pState;
593 
594   i = (numTaps - 1U) >> 2;
595 
596   /* Copy data */
597   while (i > 0U)
598   {
599     sum0v = vld1q_f32(pState);
600     vst1q_f32(pStateCurnt,sum0v);
601     pState += 4;
602     pStateCurnt += 4;
603 
604     /* Decrement the loop counter */
605     i--;
606   }
607 
608   i = (numTaps - 1U) % 0x04U;
609 
610   /* Copy data */
611   while (i > 0U)
612   {
613     *pStateCurnt++ = *pState++;
614 
615     /* Decrement the loop counter */
616     i--;
617   }
618 }
619 #else
arm_fir_decimate_f32(const arm_fir_decimate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)620 ARM_DSP_ATTRIBUTE void arm_fir_decimate_f32(
621   const arm_fir_decimate_instance_f32 * S,
622   const float32_t * pSrc,
623         float32_t * pDst,
624         uint32_t blockSize)
625 {
626         float32_t *pState = S->pState;                 /* State pointer */
627   const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
628         float32_t *pStateCur;                          /* Points to the current sample of the state */
629         float32_t *px0;                                /* Temporary pointer for state buffer */
630   const float32_t *pb;                                 /* Temporary pointer for coefficient buffer */
631         float32_t x0, c0;                              /* Temporary variables to hold state and coefficient values */
632         float32_t acc0;                                /* Accumulator */
633         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
634         uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
635 
636 #if defined (ARM_MATH_LOOPUNROLL)
637         float32_t *px1, *px2, *px3;
638         float32_t x1, x2, x3;
639         float32_t acc1, acc2, acc3;
640 #endif
641 
642   /* S->pState buffer contains previous frame (numTaps - 1) samples */
643   /* pStateCur points to the location where the new input data should be written */
644   pStateCur = S->pState + (numTaps - 1U);
645 
646 #if defined (ARM_MATH_LOOPUNROLL)
647 
648     /* Loop unrolling: Compute 4 samples at a time */
649   blkCnt = outBlockSize >> 2U;
650 
651   /* Samples loop unrolled by 4 */
652   while (blkCnt > 0U)
653   {
654     /* Copy 4 * decimation factor number of new input samples into the state buffer */
655     i = S->M * 4;
656 
657     do
658     {
659       *pStateCur++ = *pSrc++;
660 
661     } while (--i);
662 
663     /* Set accumulators to zero */
664     acc0 = 0.0f;
665     acc1 = 0.0f;
666     acc2 = 0.0f;
667     acc3 = 0.0f;
668 
669     /* Initialize state pointer for all the samples */
670     px0 = pState;
671     px1 = pState + S->M;
672     px2 = pState + 2 * S->M;
673     px3 = pState + 3 * S->M;
674 
675     /* Initialize coeff pointer */
676     pb = pCoeffs;
677 
678     /* Loop unrolling: Compute 4 taps at a time */
679     tapCnt = numTaps >> 2U;
680 
681     while (tapCnt > 0U)
682     {
683       /* Read the b[numTaps-1] coefficient */
684       c0 = *(pb++);
685 
686       /* Read x[n-numTaps-1] sample for acc0 */
687       x0 = *(px0++);
688       /* Read x[n-numTaps-1] sample for acc1 */
689       x1 = *(px1++);
690       /* Read x[n-numTaps-1] sample for acc2 */
691       x2 = *(px2++);
692       /* Read x[n-numTaps-1] sample for acc3 */
693       x3 = *(px3++);
694 
695       /* Perform the multiply-accumulate */
696       acc0 += x0 * c0;
697       acc1 += x1 * c0;
698       acc2 += x2 * c0;
699       acc3 += x3 * c0;
700 
701       /* Read the b[numTaps-2] coefficient */
702       c0 = *(pb++);
703 
704       /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
705       x0 = *(px0++);
706       x1 = *(px1++);
707       x2 = *(px2++);
708       x3 = *(px3++);
709 
710       /* Perform the multiply-accumulate */
711       acc0 += x0 * c0;
712       acc1 += x1 * c0;
713       acc2 += x2 * c0;
714       acc3 += x3 * c0;
715 
716       /* Read the b[numTaps-3] coefficient */
717       c0 = *(pb++);
718 
719       /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
720       x0 = *(px0++);
721       x1 = *(px1++);
722       x2 = *(px2++);
723       x3 = *(px3++);
724 
725       /* Perform the multiply-accumulate */
726       acc0 += x0 * c0;
727       acc1 += x1 * c0;
728       acc2 += x2 * c0;
729       acc3 += x3 * c0;
730 
731       /* Read the b[numTaps-4] coefficient */
732       c0 = *(pb++);
733 
734       /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
735       x0 = *(px0++);
736       x1 = *(px1++);
737       x2 = *(px2++);
738       x3 = *(px3++);
739 
740       /* Perform the multiply-accumulate */
741       acc0 += x0 * c0;
742       acc1 += x1 * c0;
743       acc2 += x2 * c0;
744       acc3 += x3 * c0;
745 
746       /* Decrement loop counter */
747       tapCnt--;
748     }
749 
750     /* Loop unrolling: Compute remaining taps */
751     tapCnt = numTaps % 0x4U;
752 
753     while (tapCnt > 0U)
754     {
755       /* Read coefficients */
756       c0 = *(pb++);
757 
758       /* Fetch state variables for acc0, acc1, acc2, acc3 */
759       x0 = *(px0++);
760       x1 = *(px1++);
761       x2 = *(px2++);
762       x3 = *(px3++);
763 
764       /* Perform the multiply-accumulate */
765       acc0 += x0 * c0;
766       acc1 += x1 * c0;
767       acc2 += x2 * c0;
768       acc3 += x3 * c0;
769 
770       /* Decrement loop counter */
771       tapCnt--;
772     }
773 
774     /* Advance the state pointer by the decimation factor
775      * to process the next group of decimation factor number samples */
776     pState = pState + S->M * 4;
777 
778     /* The result is in the accumulator, store in the destination buffer. */
779     *pDst++ = acc0;
780     *pDst++ = acc1;
781     *pDst++ = acc2;
782     *pDst++ = acc3;
783 
784     /* Decrement loop counter */
785     blkCnt--;
786   }
787 
788   /* Loop unrolling: Compute remaining samples */
789   blkCnt = outBlockSize % 0x4U;
790 
791 #else
792 
793   /* Initialize blkCnt with number of samples */
794   blkCnt = outBlockSize;
795 
796 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
797 
798   while (blkCnt > 0U)
799   {
800     /* Copy decimation factor number of new input samples into the state buffer */
801     i = S->M;
802 
803     do
804     {
805       *pStateCur++ = *pSrc++;
806 
807     } while (--i);
808 
809     /* Set accumulator to zero */
810     acc0 = 0.0f;
811 
812     /* Initialize state pointer */
813     px0 = pState;
814 
815     /* Initialize coeff pointer */
816     pb = pCoeffs;
817 
818 #if defined (ARM_MATH_LOOPUNROLL)
819 
820     /* Loop unrolling: Compute 4 taps at a time */
821     tapCnt = numTaps >> 2U;
822 
823     while (tapCnt > 0U)
824     {
825       /* Read the b[numTaps-1] coefficient */
826       c0 = *pb++;
827 
828       /* Read x[n-numTaps-1] sample */
829       x0 = *px0++;
830 
831       /* Perform the multiply-accumulate */
832       acc0 += x0 * c0;
833 
834       /* Read the b[numTaps-2] coefficient */
835       c0 = *pb++;
836 
837       /* Read x[n-numTaps-2] sample */
838       x0 = *px0++;
839 
840       /* Perform the multiply-accumulate */
841       acc0 += x0 * c0;
842 
843       /* Read the b[numTaps-3] coefficient */
844       c0 = *pb++;
845 
846       /* Read x[n-numTaps-3] sample */
847       x0 = *px0++;
848 
849       /* Perform the multiply-accumulate */
850       acc0 += x0 * c0;
851 
852       /* Read the b[numTaps-4] coefficient */
853       c0 = *pb++;
854 
855       /* Read x[n-numTaps-4] sample */
856       x0 = *px0++;
857 
858       /* Perform the multiply-accumulate */
859       acc0 += x0 * c0;
860 
861       /* Decrement loop counter */
862       tapCnt--;
863     }
864 
865     /* Loop unrolling: Compute remaining taps */
866     tapCnt = numTaps % 0x4U;
867 
868 #else
869 
870     /* Initialize tapCnt with number of taps */
871     tapCnt = numTaps;
872 
873 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
874 
875     while (tapCnt > 0U)
876     {
877       /* Read coefficients */
878       c0 = *pb++;
879 
880       /* Fetch 1 state variable */
881       x0 = *px0++;
882 
883       /* Perform the multiply-accumulate */
884       acc0 += x0 * c0;
885 
886       /* Decrement loop counter */
887       tapCnt--;
888     }
889 
890     /* Advance the state pointer by the decimation factor
891      * to process the next group of decimation factor number samples */
892     pState = pState + S->M;
893 
894     /* The result is in the accumulator, store in the destination buffer. */
895     *pDst++ = acc0;
896 
897     /* Decrement loop counter */
898     blkCnt--;
899   }
900 
901   /* Processing is complete.
902      Now copy the last numTaps - 1 samples to the satrt of the state buffer.
903      This prepares the state buffer for the next function call. */
904 
905   /* Points to the start of the state buffer */
906   pStateCur = S->pState;
907 
908 #if defined (ARM_MATH_LOOPUNROLL)
909 
910   /* Loop unrolling: Compute 4 taps at a time */
911   tapCnt = (numTaps - 1U) >> 2U;
912 
913   /* Copy data */
914   while (tapCnt > 0U)
915   {
916     *pStateCur++ = *pState++;
917     *pStateCur++ = *pState++;
918     *pStateCur++ = *pState++;
919     *pStateCur++ = *pState++;
920 
921     /* Decrement loop counter */
922     tapCnt--;
923   }
924 
925   /* Loop unrolling: Compute remaining taps */
926   tapCnt = (numTaps - 1U) % 0x04U;
927 
928 #else
929 
930   /* Initialize tapCnt with number of taps */
931   tapCnt = (numTaps - 1U);
932 
933 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
934 
935   /* Copy data */
936   while (tapCnt > 0U)
937   {
938     *pStateCur++ = *pState++;
939 
940     /* Decrement loop counter */
941     tapCnt--;
942   }
943 
944 }
945 #endif /* #if defined(ARM_MATH_NEON) */
946 
947 #endif /*defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
948 /**
949   @} end of FIR_decimate group
950  */
951