1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_q7.c
4  * Description:  Q7 FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR
37   @{
38  */
39 
40 /**
41   @brief         Processing function for Q7 FIR filter.
42   @param[in]     S          points to an instance of the Q7 FIR filter structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    The function is implemented using a 32-bit internal accumulator.
49                    Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
50                    The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
51                    There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52                    The accumulator is converted to 18.7 format by discarding the low 7 bits.
                   Finally, the result is saturated to 1.7 format.
54  */
55 
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
57 
/*
 * FIR_Q7_CORE - compute nbAcc consecutive Q7 FIR outputs.
 *
 *   pOutput    output pointer lvalue; written and post-incremented once per output
 *   nbAcc      number of outputs to produce
 *   nbVecTaps  number of 16-lane coefficient vectors to accumulate
 *   pSample    start of the sample window of the first output; output k reads
 *              16 * nbVecTaps samples starting at pSample + k
 *   vecCoeffs  array of at least nbVecTaps preloaded coefficient vectors
 *
 * The expansion relies on a q7x16_t variable named vecIn0 being declared in
 * the enclosing scope (it is used as the load temporary).
 *
 * Each output is accumulated in a 32-bit scalar, shifted right by 7 and
 * saturated to 8 bits. A scalar accumulator is used (rather than a small
 * array indexed by the output number) so that nbAcc is not limited by an
 * internal array size.
 */
#define FIR_Q7_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)         \
        for (int outIdx = 0; outIdx < (nbAcc); outIdx++) {                 \
            const q7_t     *firWin = &(pSample)[outIdx];                   \
            q31_t           firAcc = 0;                                    \
                                                                           \
            for (int tapVec = 0; tapVec < (nbVecTaps); tapVec++) {         \
                vecIn0 = vld1q(firWin + 16 * tapVec);                      \
                firAcc = vmladavaq(firAcc, vecIn0, (vecCoeffs)[tapVec]);   \
            }                                                              \
            *(pOutput)++ = (q7_t) __SSAT((firAcc >> 7U), 8);               \
        }
70 
/*
 * FIR_Q7_MAIN_CORE - shared body of the fixed-size MVE kernels.
 *
 * Must be expanded inside a function that provides S, pSrc, pDst and
 * blockSize, with NBVECTAPS set to the number of 16-lane coefficient
 * vectors the kernel uses. The body:
 *   1. loads NBVECTAPS coefficient vectors from S->pCoeffs once;
 *   2. for every group of 4 inputs, appends them to the state buffer behind
 *      the (numTaps - 1) history samples and emits 4 outputs;
 *   3. processes the remaining (blockSize & 3) samples one at a time;
 *   4. moves the last (numTaps - 1) samples to the start of the state buffer
 *      using predicated 16-byte transfers.
 */
#define FIR_Q7_MAIN_CORE()                                                                  \
{                                                                                           \
    q7_t * const   stateBase = S->pState;          /* Start of the state buffer */          \
    const q7_t    *coefPtr = S->pCoeffs;           /* Filter coefficients */                \
    const uint32_t tapCount = S->numTaps;          /* Number of filter coefficients */      \
    q7_t          *histWr = &(stateBase[(tapCount - 1u)]); /* Where new inputs are stored */\
    const q7_t    *inPtr = pSrc;                   /* Next input sample to consume */       \
    const q7_t    *window = stateBase;             /* Sample window of the next output */   \
    q7_t          *outPtr = pDst;                  /* Next output sample to write */        \
    const q7_t    *moveSrc;                        /* History move: read position */        \
    q7_t          *moveDst;                        /* History move: write position */       \
    int32_t        left;                           /* History move: samples still to move */\
    q7x16_t        vecIn0;                         /* Load temporary used by FIR_Q7_CORE */ \
    q7x16_t        coefVecs[NBVECTAPS];            /* Preloaded coefficient vectors */      \
                                                                                            \
    for (int v = 0; v < NBVECTAPS; v++) {                                                   \
        coefVecs[v] = vldrbq_s8(coefPtr + 16 * v);                                          \
    }                                                                                       \
                                                                                            \
    /* Groups of 4 samples: 4 bytes in (widening load, narrowing store), 4 outputs */       \
    for (int32_t quad = blockSize >> 2; quad > 0; quad--) {                                 \
        vstrbq_s32(histWr, vldrbq_s32(inPtr));                                              \
        histWr += 4;                                                                        \
        inPtr += 4;                                                                         \
                                                                                            \
        FIR_Q7_CORE(outPtr, 4, NBVECTAPS, window, coefVecs);                                \
        window += 4;                                                                        \
    }                                                                                       \
                                                                                            \
    /* Tail: the 0..3 samples that did not fill a group of 4 */                             \
    int32_t        leftover = blockSize & 3;                                                \
                                                                                            \
    for (int32_t k = 0; k < leftover; k++) {                                                \
        *histWr++ = *inPtr++;                                                               \
    }                                                                                       \
                                                                                            \
    FIR_Q7_CORE(outPtr, leftover, NBVECTAPS, window, coefVecs);                             \
                                                                                            \
    /* Move the newest (numTaps - 1) samples to the front for the next call. */             \
    /* vctp8q() enables only the lanes that are still to be moved.           */             \
    moveSrc = &stateBase[blockSize];                                                        \
    moveDst = stateBase;                                                                    \
    left = tapCount - 1;                                                                    \
    do {                                                                                    \
        mve_pred16_t    lanes = vctp8q(left);                                               \
                                                                                            \
        vstrbq_p_s8(moveDst, vldrbq_z_s8(moveSrc, lanes), lanes);                           \
        moveSrc += 16;                                                                      \
        moveDst += 16;                                                                      \
        left -= 16;                                                                         \
    }                                                                                       \
    while (left > 0);                                                                       \
}
141 
142 
/*
 * MVE kernel selected by arm_fir_q7() for filters of 49 to 64 taps.
 * FIR_Q7_MAIN_CORE loads NBVECTAPS (= 4) vectors, i.e. 64 coefficients, from
 * S->pCoeffs regardless of S->numTaps.
 * NOTE(review): entries past numTaps are presumably zero padding - confirm
 * against the init function's documentation.
 */
static void arm_fir_q7_49_64_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    /* Constants read by FIR_Q7_MAIN_CORE; scoped to this function. */
    enum { NBTAPS = 64, NBVECTAPS = NBTAPS / 16 };

    FIR_Q7_MAIN_CORE();
}
153 
154 
/*
 * MVE kernel selected by arm_fir_q7() for filters of 33 to 48 taps.
 * FIR_Q7_MAIN_CORE loads NBVECTAPS (= 3) vectors, i.e. 48 coefficients, from
 * S->pCoeffs regardless of S->numTaps.
 * NOTE(review): entries past numTaps are presumably zero padding - confirm
 * against the init function's documentation.
 */
static void arm_fir_q7_33_48_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    /* Constants read by FIR_Q7_MAIN_CORE; scoped to this function. */
    enum { NBTAPS = 48, NBVECTAPS = NBTAPS / 16 };

    FIR_Q7_MAIN_CORE();
}
165 
/*
 * MVE kernel selected by arm_fir_q7() for filters of 17 to 32 taps.
 * FIR_Q7_MAIN_CORE loads NBVECTAPS (= 2) vectors, i.e. 32 coefficients, from
 * S->pCoeffs regardless of S->numTaps.
 * NOTE(review): entries past numTaps are presumably zero padding - confirm
 * against the init function's documentation.
 */
static void arm_fir_q7_17_32_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    /* Constants read by FIR_Q7_MAIN_CORE; scoped to this function. */
    enum { NBTAPS = 32, NBVECTAPS = NBTAPS / 16 };

    FIR_Q7_MAIN_CORE();
}
176 
177 
/*
 * MVE kernel selected by arm_fir_q7() for filters of 1 to 16 taps.
 * FIR_Q7_MAIN_CORE loads NBVECTAPS (= 1) vector, i.e. 16 coefficients, from
 * S->pCoeffs regardless of S->numTaps.
 * NOTE(review): entries past numTaps are presumably zero padding - confirm
 * against the init function's documentation.
 */
static void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    /* Constants read by FIR_Q7_MAIN_CORE; scoped to this function. */
    enum { NBTAPS = 16, NBVECTAPS = NBTAPS / 16 };

    FIR_Q7_MAIN_CORE();
}
188 
arm_fir_q7(const arm_fir_instance_q7 * S,const q7_t * pSrc,q7_t * pDst,uint32_t blockSize)189 ARM_DSP_ATTRIBUTE void arm_fir_q7(
190   const arm_fir_instance_q7 * S,
191   const q7_t * pSrc,
192         q7_t * pDst,
193         uint32_t blockSize)
194 {
195     q7_t     *pState = S->pState;   /* State pointer */
196     const q7_t     *pCoeffs = S->pCoeffs; /* Coefficient pointer */
197     q7_t     *pStateCur;        /* Points to the current sample of the state */
198     const q7_t     *pSamples;         /* Temporary pointer to the sample buffer */
199     q7_t     *pOutput;          /* Temporary pointer to the output buffer */
200     const q7_t     *pTempSrc;         /* Temporary pointer to the source data */
201     q7_t     *pTempDest;        /* Temporary pointer to the destination buffer */
202     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
203     uint32_t  blkCnt;
204     q7x16_t  vecIn0;
205     uint32_t  tapsBlkCnt = (numTaps + 15) / 16;
206     q31_t     acc0, acc1, acc2, acc3;
207     q7x16_t  vecCoeffs;
208 
209     if (numTaps <= 16)
210     {
211         /*
212          * [1 to 16 taps] specialized routine
213          */
214         arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
215         return;
216     }
217     else if (numTaps <= 32)
218     {
219         /*
220          * [17 to 32 taps] specialized routine
221          */
222         arm_fir_q7_17_32_mve(S, pSrc, pDst, blockSize);
223         return;
224     }
225     else if (numTaps <= 48)
226     {
227         /*
228          * [33 to 48 taps] specialized routine
229          */
230         arm_fir_q7_33_48_mve(S, pSrc, pDst, blockSize);
231         return;
232     }
233     else if (numTaps <= 64)
234     {
235         /*
236          * [49 to 64 taps] specialized routine
237          */
238         arm_fir_q7_49_64_mve(S, pSrc, pDst, blockSize);
239         return;
240     }
241 
242     /*
243      * pState points to state array which contains previous frame (numTaps - 1) samples
244      * pStateCur points to the location where the new input data should be written
245      */
246     pStateCur   = &(pState[(numTaps - 1u)]);
247     pSamples    = pState;
248     pTempSrc    = pSrc;
249     pOutput     = pDst;
250     blkCnt      = blockSize >> 2;
251 
252     /*
253      * outer samples loop
254      */
255     while (blkCnt > 0U)
256     {
257         const q7_t     *pCoeffsTmp = pCoeffs;
258         const q7_t     *pSamplesTmp = pSamples;
259 
260         acc0 = 0;
261         acc1 = 0;
262         acc2 = 0;
263         acc3 = 0;
264         /*
265          * Save 16 input samples in the history buffer
266          */
267         vst1q(pStateCur, vld1q(pTempSrc));
268         pStateCur += 16;
269         pTempSrc += 16;
270 
271         /*
272          * inner coefficients loop
273          */
274         int       i = tapsBlkCnt;
275         while (i > 0)
276         {
277             /*
278              * load 16 coefs
279              */
280             vecCoeffs = *(q7x16_t *) pCoeffsTmp;
281 
282             vecIn0 = vld1q(pSamplesTmp);
283             acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
284 
285             vecIn0 = vld1q(&pSamplesTmp[1]);
286             acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
287 
288             vecIn0 = vld1q(&pSamplesTmp[2]);
289             acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
290 
291             vecIn0 = vld1q(&pSamplesTmp[3]);
292             acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
293 
294             pSamplesTmp += 16;
295             pCoeffsTmp += 16;
296             /*
297              * Decrement the taps block loop counter
298              */
299             i--;
300         }
301         /*
302          * Store the 1.7 format filter output in destination buffer
303          */
304         *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
305         *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
306         *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
307         *pOutput++ = (q7_t) __SSAT((acc3 >> 7U), 8);
308 
309         pSamples += 4;
310         /*
311          * Decrement the sample block loop counter
312          */
313         blkCnt--;
314     }
315 
316     uint32_t  residual = blockSize & 3;
317     switch (residual)
318     {
319     case 3:
320         {
321             const q7_t     *pCoeffsTmp = pCoeffs;
322             const q7_t     *pSamplesTmp = pSamples;
323 
324             acc0 = 0;
325             acc1 = 0;
326             acc2 = 0;
327             /*
328              * Save 16 input samples in the history buffer
329              */
330             vst1q(pStateCur, vld1q(pTempSrc));
331             pStateCur += 16;
332             pTempSrc += 16;
333 
334             int       i = tapsBlkCnt;
335             while (i > 0)
336             {
337                 vecCoeffs = *(q7x16_t *) pCoeffsTmp;
338 
339                 vecIn0 = vld1q(pSamplesTmp);
340                 acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
341 
342                 vecIn0 = vld1q(&pSamplesTmp[4]);
343                 acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
344 
345                 vecIn0 = vld1q(&pSamplesTmp[8]);
346                 acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
347 
348                 pSamplesTmp += 16;
349                 pCoeffsTmp += 16;
350                 i--;
351             }
352 
353             *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
354             *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
355             *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
356         }
357         break;
358 
359     case 2:
360         {
361             const q7_t     *pCoeffsTmp = pCoeffs;
362             const q7_t     *pSamplesTmp = pSamples;
363 
364             acc0 = 0;
365             acc1 = 0;
366             /*
367              * Save 16 input samples in the history buffer
368              */
369             vst1q(pStateCur, vld1q(pTempSrc));
370             pStateCur += 16;
371             pTempSrc += 16;
372 
373             int       i = tapsBlkCnt;
374             while (i > 0)
375             {
376                 vecCoeffs = *(q7x16_t *) pCoeffsTmp;
377 
378                 vecIn0 = vld1q(pSamplesTmp);
379                 acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
380 
381                 vecIn0 = vld1q(&pSamplesTmp[4]);
382                 acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
383 
384                 pSamplesTmp += 16;
385                 pCoeffsTmp += 16;
386                 i--;
387             }
388 
389             *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
390             *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
391         }
392         break;
393 
394     case 1:
395         {
396             const q7_t     *pCoeffsTmp = pCoeffs;
397             const q7_t     *pSamplesTmp = pSamples;
398 
399             acc0 = 0;
400             /*
401              * Save 16 input samples in the history buffer
402              */
403             vst1q(pStateCur, vld1q(pTempSrc));
404             pStateCur += 16;
405             pTempSrc += 16;
406 
407             int       i = tapsBlkCnt;
408             while (i > 0)
409             {
410                 vecCoeffs = *(q7x16_t *) pCoeffsTmp;
411 
412                 vecIn0 = vld1q(pSamplesTmp);
413                 acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
414 
415                 pSamplesTmp += 16;
416                 pCoeffsTmp += 16;
417                 i--;
418             }
419             *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
420         }
421         break;
422     }
423 
424     /*
425      * Copy the samples back into the history buffer start
426      */
427     pTempSrc = &pState[blockSize];
428     pTempDest = pState;
429 
430     blkCnt = numTaps >> 4;
431     while (blkCnt > 0U)
432     {
433         vst1q(pTempDest, vld1q(pTempSrc));
434         pTempSrc += 16;
435         pTempDest += 16;
436         blkCnt--;
437     }
438     blkCnt = numTaps & 0xF;
439     if (blkCnt > 0U)
440     {
441         mve_pred16_t p0 = vctp8q(blkCnt);
442         vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
443     }
444 }
445 #else
arm_fir_q7(const arm_fir_instance_q7 * S,const q7_t * pSrc,q7_t * pDst,uint32_t blockSize)446 ARM_DSP_ATTRIBUTE void arm_fir_q7(
447   const arm_fir_instance_q7 * S,
448   const q7_t * pSrc,
449         q7_t * pDst,
450         uint32_t blockSize)
451 {
452         q7_t *pState = S->pState;                      /* State pointer */
453   const q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer */
454         q7_t *pStateCurnt;                             /* Points to the current sample of the state */
455         q7_t *px;                                      /* Temporary pointer for state buffer */
456   const q7_t *pb;                                      /* Temporary pointer for coefficient buffer */
457         q31_t acc0;                                    /* Accumulators */
458         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
459         uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
460 
461 #if defined (ARM_MATH_LOOPUNROLL)
462         q31_t acc1, acc2, acc3;                        /* Accumulators */
463         q7_t x0, x1, x2, x3, c0;                       /* Temporary variables to hold state */
464 #endif
465 
466   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
467   /* pStateCurnt points to the location where the new input data should be written */
468   pStateCurnt = &(S->pState[(numTaps - 1U)]);
469 
470 #if defined (ARM_MATH_LOOPUNROLL)
471 
472   /* Loop unrolling: Compute 4 output values simultaneously.
473    * The variables acc0 ... acc3 hold output values that are being computed:
474    *
475    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
476    *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
477    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
478    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
479    */
480   blkCnt = blockSize >> 2U;
481 
482   while (blkCnt > 0U)
483   {
484     /* Copy 4 new input samples into the state buffer. */
485     *pStateCurnt++ = *pSrc++;
486     *pStateCurnt++ = *pSrc++;
487     *pStateCurnt++ = *pSrc++;
488     *pStateCurnt++ = *pSrc++;
489 
490     /* Set all accumulators to zero */
491     acc0 = 0;
492     acc1 = 0;
493     acc2 = 0;
494     acc3 = 0;
495 
496     /* Initialize state pointer */
497     px = pState;
498 
499     /* Initialize coefficient pointer */
500     pb = pCoeffs;
501 
502     /* Read the first 3 samples from the state buffer:
503      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
504     x0 = *px++;
505     x1 = *px++;
506     x2 = *px++;
507 
508     /* Loop unrolling. Process 4 taps at a time. */
509     tapCnt = numTaps >> 2U;
510 
511     /* Loop over the number of taps.  Unroll by a factor of 4.
512        Repeat until we've computed numTaps-4 coefficients. */
513     while (tapCnt > 0U)
514     {
515       /* Read the b[numTaps] coefficient */
516       c0 = *pb;
517 
518       /* Read x[n-numTaps-3] sample */
519       x3 = *px;
520 
521       /* acc0 +=  b[numTaps] * x[n-numTaps] */
522       acc0 += ((q15_t) x0 * c0);
523 
524       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
525       acc1 += ((q15_t) x1 * c0);
526 
527       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
528       acc2 += ((q15_t) x2 * c0);
529 
530       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
531       acc3 += ((q15_t) x3 * c0);
532 
533       /* Read the b[numTaps-1] coefficient */
534       c0 = *(pb + 1U);
535 
536       /* Read x[n-numTaps-4] sample */
537       x0 = *(px + 1U);
538 
539       /* Perform the multiply-accumulates */
540       acc0 += ((q15_t) x1 * c0);
541       acc1 += ((q15_t) x2 * c0);
542       acc2 += ((q15_t) x3 * c0);
543       acc3 += ((q15_t) x0 * c0);
544 
545       /* Read the b[numTaps-2] coefficient */
546       c0 = *(pb + 2U);
547 
548       /* Read x[n-numTaps-5] sample */
549       x1 = *(px + 2U);
550 
551       /* Perform the multiply-accumulates */
552       acc0 += ((q15_t) x2 * c0);
553       acc1 += ((q15_t) x3 * c0);
554       acc2 += ((q15_t) x0 * c0);
555       acc3 += ((q15_t) x1 * c0);
556 
557       /* Read the b[numTaps-3] coefficients */
558       c0 = *(pb + 3U);
559 
560       /* Read x[n-numTaps-6] sample */
561       x2 = *(px + 3U);
562 
563       /* Perform the multiply-accumulates */
564       acc0 += ((q15_t) x3 * c0);
565       acc1 += ((q15_t) x0 * c0);
566       acc2 += ((q15_t) x1 * c0);
567       acc3 += ((q15_t) x2 * c0);
568 
569       /* update coefficient pointer */
570       pb += 4U;
571       px += 4U;
572 
573       /* Decrement loop counter */
574       tapCnt--;
575     }
576 
577     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
578     tapCnt = numTaps % 0x4U;
579 
580     while (tapCnt > 0U)
581     {
582       /* Read coefficients */
583       c0 = *(pb++);
584 
585       /* Fetch 1 state variable */
586       x3 = *(px++);
587 
588       /* Perform the multiply-accumulates */
589       acc0 += ((q15_t) x0 * c0);
590       acc1 += ((q15_t) x1 * c0);
591       acc2 += ((q15_t) x2 * c0);
592       acc3 += ((q15_t) x3 * c0);
593 
594       /* Reuse the present sample states for next sample */
595       x0 = x1;
596       x1 = x2;
597       x2 = x3;
598 
599       /* Decrement loop counter */
600       tapCnt--;
601     }
602 
603     /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31
604        Then store the 4 outputs in the destination buffer. */
605     acc0 = __SSAT((acc0 >> 7U), 8);
606     *pDst++ = acc0;
607     acc1 = __SSAT((acc1 >> 7U), 8);
608     *pDst++ = acc1;
609     acc2 = __SSAT((acc2 >> 7U), 8);
610     *pDst++ = acc2;
611     acc3 = __SSAT((acc3 >> 7U), 8);
612     *pDst++ = acc3;
613 
614     /* Advance the state pointer by 4 to process the next group of 4 samples */
615     pState = pState + 4U;
616 
617     /* Decrement loop counter */
618     blkCnt--;
619   }
620 
621   /* Loop unrolling: Compute remaining output samples */
622   blkCnt = blockSize % 0x4U;
623 
624 #else
625 
626   /* Initialize blkCnt with number of taps */
627   blkCnt = blockSize;
628 
629 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
630 
631   while (blkCnt > 0U)
632   {
633     /* Copy one sample at a time into state buffer */
634     *pStateCurnt++ = *pSrc++;
635 
636     /* Set the accumulator to zero */
637     acc0 = 0;
638 
639     /* Initialize state pointer */
640     px = pState;
641 
642     /* Initialize Coefficient pointer */
643     pb = pCoeffs;
644 
645     i = numTaps;
646 
647     /* Perform the multiply-accumulates */
648     while (i > 0U)
649     {
650       acc0 += (q15_t) * (px++) * (*(pb++));
651       i--;
652     }
653 
654     /* The result is in 2.14 format. Convert to 1.7
655        Then store the output in the destination buffer. */
656     *pDst++ = __SSAT((acc0 >> 7U), 8);
657 
658     /* Advance state pointer by 1 for the next sample */
659     pState = pState + 1U;
660 
661     /* Decrement loop counter */
662     blkCnt--;
663   }
664 
665   /* Processing is complete.
666      Now copy the last numTaps - 1 samples to the start of the state buffer.
667      This prepares the state buffer for the next function call. */
668 
669   /* Points to the start of the state buffer */
670   pStateCurnt = S->pState;
671 
672 #if defined (ARM_MATH_LOOPUNROLL)
673 
674   /* Loop unrolling: Compute 4 taps at a time */
675   tapCnt = (numTaps - 1U) >> 2U;
676 
677   /* Copy data */
678   while (tapCnt > 0U)
679   {
680     *pStateCurnt++ = *pState++;
681     *pStateCurnt++ = *pState++;
682     *pStateCurnt++ = *pState++;
683     *pStateCurnt++ = *pState++;
684 
685     /* Decrement loop counter */
686     tapCnt--;
687   }
688 
689   /* Calculate remaining number of copies */
690   tapCnt = (numTaps - 1U) % 0x4U;
691 
692 #else
693 
694   /* Initialize tapCnt with number of taps */
695   tapCnt = (numTaps - 1U);
696 
697 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
698 
699   /* Copy remaining data */
700   while (tapCnt > 0U)
701   {
702     *pStateCurnt++ = *pState++;
703 
704     /* Decrement the loop counter */
705     tapCnt--;
706   }
707 
708 }
709 #endif /* defined(ARM_MATH_MVEI) */
710 
711 /**
712   @} end of FIR group
713  */
714