1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_decimate_q15.c
4  * Description:  Q15 FIR Decimator
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_decimate
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 FIR decimator.
42   @param[in]     S          points to an instance of the Q15 FIR decimator structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of input samples to process per call
46 
47   @par           Scaling and Overflow Behavior
48                    The function is implemented using a 64-bit internal accumulator.
49                    Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
50                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
51                    There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52                    After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
53                    Lastly, the accumulator is saturated to yield a result in 1.15 format.
54 
55  @remark
56                    Refer to \ref arm_fir_decimate_fast_q15() for a faster but less precise implementation of this function.
57  */
58 
59 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
60 
61 #include "arm_helium_utils.h"
62 
arm_fir_decimate_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)63 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q15(
64   const arm_fir_decimate_instance_q15 * S,
65   const q15_t * pSrc,
66         q15_t * pDst,
67         uint32_t blockSize)
68 {
69     q15_t    *pState = S->pState;   /* State pointer */
70     const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
71     q15_t    *pStateCurnt;      /* Points to the current sample of the state */
72     const q15_t    *px, *pb;          /* Temporary pointers for state and coefficient buffers */
73     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
74     uint32_t  i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;   /* Loop counters */
75     uint32_t  blkCntN4;
76     const q15_t  *px0, *px1, *px2, *px3;
77     q63_t     acc0v, acc1v, acc2v, acc3v;
78     q15x8_t x0v, x1v, x2v, x3v;
79     q15x8_t c0v;
80 
81     /*
82      * S->pState buffer contains previous frame (numTaps - 1) samples
83      * pStateCurnt points to the location where the new input data should be written
84      */
85     pStateCurnt = S->pState + (numTaps - 1U);
86     /*
87      * Total number of output samples to be computed
88      */
89     blkCnt = outBlockSize / 4;
90     blkCntN4 = outBlockSize - (4 * blkCnt);
91 
92     while (blkCnt > 0U)
93     {
94         /*
95          * Need extra temp variables as 4 * S->M is not necessarily a multiple of 8
96          * and cause final tail predicated post incremented pointers to jump ahead
97          */
98         const q15_t      *pSrcTmp = pSrc;
99         q15_t      *pStateCurntTmp = pStateCurnt;
100 
101         /*
102          * Copy 4 * decimation factor number of new input samples into the state buffer
103          */
104         i = (4 * S->M) >> 3;
105         while (i > 0U)
106         {
107             vstrhq_s16(pStateCurntTmp, vldrhq_s16(pSrcTmp));
108             pSrcTmp += 8;
109             pStateCurntTmp += 8;
110             i--;
111         }
112         i = (4 * S->M) & 7;
113         if (i > 0U)
114         {
115             mve_pred16_t p0 = vctp16q(i);
116             vstrhq_p_s16(pStateCurntTmp, vldrhq_s16(pSrcTmp), p0);
117         }
118 
119         pSrc += (4 * S->M);
120         pStateCurnt += (4 * S->M);
121 
122         /*
123          * Clear all accumulators
124          */
125         acc0v = 0LL;
126         acc1v = 0LL;
127         acc2v = 0LL;
128         acc3v = 0LL;
129         /*
130          * Initialize state pointer for all the samples
131          */
132         px0 = pState;
133         px1 = pState + S->M;
134         px2 = pState + 2 * S->M;
135         px3 = pState + 3 * S->M;
136         /*
137          * Initialize coeff. pointer
138          */
139         pb = pCoeffs;
140 
141         tapCnt = numTaps >> 3;
142         /*
143          * Loop over the number of taps.  Unroll by a factor of 4.
144          * Repeat until we've computed numTaps-4 coefficients.
145          */
146         while (tapCnt > 0U)
147         {
148             /*
149              * Read the b[numTaps-1] coefficient
150              */
151             c0v = vldrhq_s16(pb);
152             pb += 8;
153             /*
154              * Read x[n-numTaps-1] sample for acc0
155              */
156             x0v = vld1q(px0);
157             x1v = vld1q(px1);
158             x2v = vld1q(px2);
159             x3v = vld1q(px3);
160             px0 += 8;
161             px1 += 8;
162             px2 += 8;
163             px3 += 8;
164 
165             acc0v = vmlaldavaq(acc0v, x0v, c0v);
166             acc1v = vmlaldavaq(acc1v, x1v, c0v);
167             acc2v = vmlaldavaq(acc2v, x2v, c0v);
168             acc3v = vmlaldavaq(acc3v, x3v, c0v);
169             /*
170              * Decrement the loop counter
171              */
172             tapCnt--;
173         }
174 
175         /*
176          * If the filter length is not a multiple of 4, compute the remaining filter taps
177          * should be tail predicated
178          */
179         tapCnt = numTaps & 7;
180         if (tapCnt > 0U)
181         {
182             mve_pred16_t p0 = vctp16q(tapCnt);
183             /*
184              * Read the b[numTaps-1] coefficient
185              */
186             c0v = vldrhq_z_s16(pb, p0);
187             pb += 8;
188             /*
189              * Read x[n-numTaps-1] sample for acc0
190              */
191             x0v = vld1q(px0);
192             x1v = vld1q(px1);
193             x2v = vld1q(px2);
194             x3v = vld1q(px3);
195             px0 += 8;
196             px1 += 8;
197             px2 += 8;
198             px3 += 8;
199 
200             acc0v = vmlaldavaq(acc0v, x0v, c0v);
201             acc1v = vmlaldavaq(acc1v, x1v, c0v);
202             acc2v = vmlaldavaq(acc2v, x2v, c0v);
203             acc3v = vmlaldavaq(acc3v, x3v, c0v);
204         }
205 
206         acc0v = asrl(acc0v, 15);
207         acc1v = asrl(acc1v, 15);
208         acc2v = asrl(acc2v, 15);
209         acc3v = asrl(acc3v, 15);
210         /*
211          * store in the destination buffer.
212          */
213         *pDst++ = (q15_t) __SSAT((q31_t) acc0v, 16);
214         *pDst++ = (q15_t) __SSAT((q31_t) acc1v, 16);;
215         *pDst++ = (q15_t) __SSAT((q31_t) acc2v, 16);;
216         *pDst++ = (q15_t) __SSAT((q31_t) acc3v, 16);;
217 
218         /*
219          * Advance the state pointer by the decimation factor
220          * to process the next group of decimation factor number samples
221          */
222         pState = pState + 4 * S->M;
223         /*
224          * Decrement the loop counter
225          */
226         blkCnt--;
227     }
228 
229     while (blkCntN4 > 0U)
230     {
231         /*
232          * Copy decimation factor number of new input samples into the state buffer
233          */
234         i = S->M;
235         do
236         {
237             *pStateCurnt++ = *pSrc++;
238         }
239         while (--i);
240         /*
241          * Set accumulator to zero
242          */
243         acc0v = 0LL;
244         /*
245          * Initialize state pointer
246          */
247         px = pState;
248         /*
249          * Initialize coeff. pointer
250          */
251         pb = pCoeffs;
252 
253         tapCnt = numTaps >> 3;
254         while (tapCnt > 0U)
255         {
256             c0v = vldrhq_s16(pb);
257             x0v = vldrhq_s16(px);
258             pb += 8;
259             px += 8;
260             acc0v = vmlaldavaq(acc0v, x0v, c0v);
261             /*
262              * Decrement the loop counter
263              */
264             tapCnt--;
265         }
266 
267         tapCnt = numTaps & 7;
268         if (tapCnt > 0U)
269         {
270             mve_pred16_t p0 = vctp16q(tapCnt);
271             c0v = vldrhq_z_s16(pb, p0);
272             x0v = vldrhq_z_s16(px, p0);
273             acc0v = vmlaldavaq_p(acc0v, x0v, c0v, p0);
274         }
275 
276         acc0v = asrl(acc0v, 15);
277 
278         /*
279          * Advance the state pointer by the decimation factor
280          * to process the next group of decimation factor number samples
281          */
282         pState = pState + S->M;
283         /*
284          * The result is in the accumulator, store in the destination buffer.
285          */
286         *pDst++ = (q15_t) __SSAT((q31_t) acc0v, 16);
287         /*
288          * Decrement the loop counter
289          */
290         blkCntN4--;
291     }
292 
293     /*
294      * Processing is complete.
295      * Now copy the last numTaps - 1 samples to the start of the state buffer.
296      * This prepares the state buffer for the next function call.
297      */
298 
299     pStateCurnt = S->pState;
300     blkCnt = (numTaps - 1) >> 3;
301     while (blkCnt > 0U)
302     {
303         vstrhq_s16(pStateCurnt, vldrhq_s16(pState));
304         pState += 8;
305         pStateCurnt += 8;
306         blkCnt--;
307     }
308     blkCnt = (numTaps - 1) & 7;
309     if (blkCnt > 0U)
310     {
311         mve_pred16_t p0 = vctp16q(blkCnt);
312         vstrhq_p_s16(pStateCurnt, vldrhq_s16(pState), p0);
313     }
314 }
315 #else
316 #if defined (ARM_MATH_DSP)
317 
arm_fir_decimate_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)318 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q15(
319   const arm_fir_decimate_instance_q15 * S,
320   const q15_t * pSrc,
321         q15_t * pDst,
322         uint32_t blockSize)
323 {
324         q15_t *pState = S->pState;                     /* State pointer */
325   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
326         q15_t *pStateCur;                              /* Points to the current sample of the state */
327         q15_t *px;                                     /* Temporary pointer for state buffer */
328   const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
329         q31_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
330         q63_t sum0;                                    /* Accumulators */
331         q63_t acc0, acc1;
332         q15_t *px0, *px1;
333         uint32_t blkCntN3;
334         uint32_t numTaps = S->numTaps;                 /* Number of taps */
335         uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
336 
337 #if defined (ARM_MATH_LOOPUNROLL)
338         q31_t c1;                                      /* Temporary variables to hold state and coefficient values */
339 #endif
340 
341   /* S->pState buffer contains previous frame (numTaps - 1) samples */
342   /* pStateCur points to the location where the new input data should be written */
343   pStateCur = S->pState + (numTaps - 1U);
344 
345   /* Total number of output samples to be computed */
346   blkCnt = outBlockSize / 2;
347   blkCntN3 = outBlockSize - (2 * blkCnt);
348 
349   while (blkCnt > 0U)
350   {
351     /* Copy 2 * decimation factor number of new input samples into the state buffer */
352     i = S->M * 2;
353 
354     do
355     {
356       *pStateCur++ = *pSrc++;
357 
358     } while (--i);
359 
360     /* Set accumulator to zero */
361     acc0 = 0;
362     acc1 = 0;
363 
364     /* Initialize state pointer for all the samples */
365     px0 = pState;
366     px1 = pState + S->M;
367 
368     /* Initialize coeff pointer */
369     pb = pCoeffs;
370 
371 #if defined (ARM_MATH_LOOPUNROLL)
372 
373     /* Loop unrolling: Compute 4 taps at a time */
374     tapCnt = numTaps >> 2U;
375 
376     while (tapCnt > 0U)
377     {
378       /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
379       c0 = read_q15x2_ia ((q15_t **) &pb);
380 
381       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
382       x0 = read_q15x2_ia (&px0);
383       x1 = read_q15x2_ia (&px1);
384 
385       /* Perform the multiply-accumulate */
386       acc0 = __SMLALD(x0, c0, acc0);
387       acc1 = __SMLALD(x1, c0, acc1);
388 
389       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
390       c0 = read_q15x2_ia ((q15_t **) &pb);
391 
392       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
393       x0 = read_q15x2_ia (&px0);
394       x1 = read_q15x2_ia (&px1);
395 
396       /* Perform the multiply-accumulate */
397       acc0 = __SMLALD(x0, c0, acc0);
398       acc1 = __SMLALD(x1, c0, acc1);
399 
400       /* Decrement loop counter */
401       tapCnt--;
402     }
403 
404     /* Loop unrolling: Compute remaining taps */
405     tapCnt = numTaps % 0x4U;
406 
407 #else
408 
409     /* Initialize tapCnt with number of taps */
410     tapCnt = numTaps;
411 
412 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
413 
414     while (tapCnt > 0U)
415     {
416       /* Read coefficients */
417       c0 = *pb++;
418 
419       /* Fetch state variables for acc0, acc1 */
420       x0 = *px0++;
421       x1 = *px1++;
422 
423       /* Perform the multiply-accumulate */
424       acc0 = __SMLALD(x0, c0, acc0);
425       acc1 = __SMLALD(x1, c0, acc1);
426 
427       /* Decrement loop counter */
428       tapCnt--;
429     }
430 
431     /* Advance the state pointer by the decimation factor
432      * to process the next group of decimation factor number samples */
433     pState = pState + S->M * 2;
434 
435     /* Store filter output, smlad returns the values in 2.14 format */
436     /* so downsacle by 15 to get output in 1.15 */
437     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
438     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
439 
440     /* Decrement loop counter */
441     blkCnt--;
442   }
443 
444   while (blkCntN3 > 0U)
445   {
446     /* Copy decimation factor number of new input samples into the state buffer */
447     i = S->M;
448 
449     do
450     {
451       *pStateCur++ = *pSrc++;
452 
453     } while (--i);
454 
455     /* Set accumulator to zero */
456     sum0 = 0;
457 
458     /* Initialize state pointer */
459     px = pState;
460 
461     /* Initialize coeff pointer */
462     pb = pCoeffs;
463 
464 #if defined (ARM_MATH_LOOPUNROLL)
465 
466     /* Loop unrolling: Compute 4 taps at a time */
467     tapCnt = numTaps >> 2U;
468 
469     while (tapCnt > 0U)
470     {
471       /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
472       c0 = read_q15x2_ia ((q15_t **) &pb);
473 
474       /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
475       x0 = read_q15x2_ia (&px);
476 
477       /* Read the b[numTaps-3] and b[numTaps-4] coefficients */
478       c1 = read_q15x2_ia ((q15_t **) &pb);
479 
480       /* Perform the multiply-accumulate */
481       sum0 = __SMLALD(x0, c0, sum0);
482 
483       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
484       x0 = read_q15x2_ia (&px);
485 
486       /* Perform the multiply-accumulate */
487       sum0 = __SMLALD(x0, c1, sum0);
488 
489       /* Decrement loop counter */
490       tapCnt--;
491     }
492 
493     /* Loop unrolling: Compute remaining taps */
494     tapCnt = numTaps % 0x4U;
495 
496 #else
497 
498     /* Initialize tapCnt with number of taps */
499     tapCnt = numTaps;
500 
501 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
502 
503     while (tapCnt > 0U)
504     {
505       /* Read coefficients */
506       c0 = *pb++;
507 
508       /* Fetch 1 state variable */
509       x0 = *px++;
510 
511       /* Perform the multiply-accumulate */
512       sum0 = __SMLALD(x0, c0, sum0);
513 
514       /* Decrement loop counter */
515       tapCnt--;
516     }
517 
518     /* Advance the state pointer by the decimation factor
519      * to process the next group of decimation factor number samples */
520     pState = pState + S->M;
521 
522     /* Store filter output, smlad returns the values in 2.14 format */
523     /* so downsacle by 15 to get output in 1.15 */
524     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
525 
526     /* Decrement loop counter */
527     blkCntN3--;
528   }
529 
530   /* Processing is complete.
531      Now copy the last numTaps - 1 samples to the satrt of the state buffer.
532      This prepares the state buffer for the next function call. */
533 
534   /* Points to the start of the state buffer */
535   pStateCur = S->pState;
536   i = (numTaps - 1U) >> 2U;
537 
538   /* copy data */
539   while (i > 0U)
540   {
541     write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
542     write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
543 
544     /* Decrement loop counter */
545     i--;
546   }
547 
548   i = (numTaps - 1U) % 0x04U;
549 
550   /* Copy data */
551   while (i > 0U)
552   {
553     *pStateCur++ = *pState++;
554 
555     /* Decrement loop counter */
556     i--;
557   }
558 
559 }
560 
561 #else /* #if defined (ARM_MATH_DSP) */
562 
arm_fir_decimate_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)563 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q15(
564   const arm_fir_decimate_instance_q15 * S,
565   const q15_t * pSrc,
566         q15_t * pDst,
567         uint32_t blockSize)
568 {
569         q15_t *pState = S->pState;                     /* State pointer */
570   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
571         q15_t *pStateCur;                              /* Points to the current sample of the state */
572         q15_t *px;                                     /* Temporary pointer for state buffer */
573   const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
574         q15_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
575         q63_t sum0;                                    /* Accumulators */
576         q63_t acc0, acc1;
577         q15_t *px0, *px1;
578         uint32_t blkCntN3;
579         uint32_t numTaps = S->numTaps;                 /* Number of taps */
580         uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
581 
582 
583   /* S->pState buffer contains previous frame (numTaps - 1) samples */
584   /* pStateCur points to the location where the new input data should be written */
585   pStateCur = S->pState + (numTaps - 1U);
586 
587   /* Total number of output samples to be computed */
588   blkCnt = outBlockSize / 2;
589   blkCntN3 = outBlockSize - (2 * blkCnt);
590 
591   while (blkCnt > 0U)
592   {
593     /* Copy 2 * decimation factor number of new input samples into the state buffer */
594     i = S->M * 2;
595 
596     do
597     {
598       *pStateCur++ = *pSrc++;
599 
600     } while (--i);
601 
602     /* Set accumulator to zero */
603     acc0 = 0;
604     acc1 = 0;
605 
606     /* Initialize state pointer */
607     px0 = pState;
608     px1 = pState + S->M;
609 
610     /* Initialize coeff pointer */
611     pb = pCoeffs;
612 
613 #if defined (ARM_MATH_LOOPUNROLL)
614 
615     /* Loop unrolling: Compute 4 taps at a time */
616     tapCnt = numTaps >> 2U;
617 
618     while (tapCnt > 0U)
619     {
620       /* Read the Read b[numTaps-1] coefficients */
621       c0 = *pb++;
622 
623       /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
624       x0 = *px0++;
625       x1 = *px1++;
626 
627       /* Perform the multiply-accumulate */
628       acc0 += x0 * c0;
629       acc1 += x1 * c0;
630 
631       /* Read the b[numTaps-2] coefficient */
632       c0 = *pb++;
633 
634       /* Read x[n-numTaps-2] for sample 0 and sample 1 */
635       x0 = *px0++;
636       x1 = *px1++;
637 
638       /* Perform the multiply-accumulate */
639       acc0 += x0 * c0;
640       acc1 += x1 * c0;
641 
642       /* Read the b[numTaps-3] coefficients */
643       c0 = *pb++;
644 
645       /* Read x[n-numTaps-3] for sample 0 and sample 1 */
646       x0 = *px0++;
647       x1 = *px1++;
648 
649       /* Perform the multiply-accumulate */
650       acc0 += x0 * c0;
651       acc1 += x1 * c0;
652 
653       /* Read the b[numTaps-4] coefficient */
654       c0 = *pb++;
655 
656       /* Read x[n-numTaps-4] for sample 0 and sample 1 */
657       x0 = *px0++;
658       x1 = *px1++;
659 
660       /* Perform the multiply-accumulate */
661       acc0 += x0 * c0;
662       acc1 += x1 * c0;
663 
664       /* Decrement the loop counter */
665       tapCnt--;
666     }
667 
668     /* Loop unrolling: Compute remaining taps */
669     tapCnt = numTaps % 0x4U;
670 
671 #else
672 
673     /* Initialize tapCnt with number of taps */
674     tapCnt = numTaps;
675 
676 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
677 
678     while (tapCnt > 0U)
679     {
680       /* Read coefficients */
681       c0 = *pb++;
682 
683       /* Fetch 1 state variable */
684       x0 = *px0++;
685       x1 = *px1++;
686 
687       /* Perform the multiply-accumulate */
688       acc0 += x0 * c0;
689       acc1 += x1 * c0;
690 
691       /* Decrement the loop counter */
692       tapCnt--;
693     }
694 
695     /* Advance the state pointer by the decimation factor
696      * to process the next group of decimation factor number samples */
697     pState = pState + S->M * 2;
698 
699     /* Store filter output, smlad returns the values in 2.14 format */
700     /* so downsacle by 15 to get output in 1.15 */
701 
702     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
703     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
704 
705     /* Decrement loop counter */
706     blkCnt--;
707   }
708 
709   while (blkCntN3 > 0U)
710   {
711     /* Copy decimation factor number of new input samples into the state buffer */
712     i = S->M;
713 
714     do
715     {
716       *pStateCur++ = *pSrc++;
717 
718     } while (--i);
719 
720     /* Set accumulator to zero */
721     sum0 = 0;
722 
723     /* Initialize state pointer */
724     px = pState;
725 
726     /* Initialize coeff pointer */
727     pb = pCoeffs;
728 
729 #if defined (ARM_MATH_LOOPUNROLL)
730 
731     /* Loop unrolling: Compute 4 taps at a time */
732     tapCnt = numTaps >> 2U;
733 
734     while (tapCnt > 0U)
735     {
736       /* Read the b[numTaps-1] coefficient */
737       c0 = *pb++;
738 
739       /* Read x[n-numTaps-1] sample */
740       x0 = *px++;
741 
742       /* Perform the multiply-accumulate */
743       sum0 += x0 * c0;
744 
745       /* Read the b[numTaps-2] coefficient */
746       c0 = *pb++;
747 
748       /* Read x[n-numTaps-2] sample */
749       x0 = *px++;
750 
751       /* Perform the multiply-accumulate */
752       sum0 += x0 * c0;
753 
754       /* Read the b[numTaps-3] coefficient */
755       c0 = *pb++;
756 
757       /* Read x[n-numTaps-3] sample */
758       x0 = *px++;
759 
760       /* Perform the multiply-accumulate */
761       sum0 += x0 * c0;
762 
763       /* Read the b[numTaps-4] coefficient */
764       c0 = *pb++;
765 
766       /* Read x[n-numTaps-4] sample */
767       x0 = *px++;
768 
769       /* Perform the multiply-accumulate */
770       sum0 += x0 * c0;
771 
772       /* Decrement loop counter */
773       tapCnt--;
774     }
775 
776     /* Loop unrolling: Compute remaining taps */
777     tapCnt = numTaps % 0x4U;
778 
779 #else
780 
781     /* Initialize tapCnt with number of taps */
782     tapCnt = numTaps;
783 
784 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
785 
786     while (tapCnt > 0U)
787     {
788       /* Read coefficients */
789       c0 = *pb++;
790 
791       /* Fetch 1 state variable */
792       x0 = *px++;
793 
794       /* Perform the multiply-accumulate */
795       sum0 += x0 * c0;
796 
797       /* Decrement the loop counter */
798       tapCnt--;
799     }
800 
801     /* Advance the state pointer by the decimation factor
802      * to process the next group of decimation factor number samples */
803     pState = pState + S->M;
804 
805     /* Store filter output, smlad returns the values in 2.14 format */
806     /* so downsacle by 15 to get output in 1.15 */
807     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
808 
809     /* Decrement loop counter */
810     blkCntN3--;
811   }
812 
813   /* Processing is complete.
814    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
815    ** This prepares the state buffer for the next function call. */
816 
817   /* Points to the start of the state buffer */
818   pStateCur = S->pState;
819 
820   i = (numTaps - 1U) >> 2U;
821 
822   /* copy data */
823   while (i > 0U)
824   {
825     *pStateCur++ = *pState++;
826     *pStateCur++ = *pState++;
827     *pStateCur++ = *pState++;
828     *pStateCur++ = *pState++;
829 
830     /* Decrement loop counter */
831     i--;
832   }
833 
834   i = (numTaps - 1U) % 0x04U;
835 
836   /* copy data */
837   while (i > 0U)
838   {
839     *pStateCur++ = *pState++;
840 
841     /* Decrement loop counter */
842     i--;
843   }
844 }
845 
846 #endif /* #if defined (ARM_MATH_DSP) */
847 #endif /* defined(ARM_MATH_MVEI) */
848 /**
849   @} end of FIR_decimate group
850  */
851