1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_q15.c
4  * Description:  Q15 FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 FIR filter.
42   @param[in]     S          points to an instance of the Q15 FIR filter structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    The function is implemented using a 64-bit internal accumulator.
49                    Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
50                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
51                    There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52                    After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
53                    Lastly, the accumulator is saturated to yield a result in 1.15 format.
54 
55   @remark
56                    Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function.
57  */
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59 
/*
 * Narrow a 64-bit accumulator to the value stored as a q15 output.
 *
 * The accumulator is passed to sqrshrl_sat48() with a shift count of
 * -(32 - shift); the result is then shifted right by 32 and masked to
 * 32 bits. Call sites cast the value to q15_t.
 *
 * NOTE(review): per the Arm MVE intrinsics reference a negative count makes
 * SQRSHRL a saturating left shift (48-bit saturation), which would make this
 * a saturating "acc >> shift" to 16 bits - confirm against the ACLE docs.
 *
 * Both arguments are parenthesized so that compound expressions such as
 * "14 + 1" expand correctly.
 */
#define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48((acc), -(32 - (shift))) >> 32) & 0xffffffff)
61 
62 
/*
 * Compute nbAcc consecutive FIR outputs.
 *
 *   pOutput    output pointer; one q15 value is written per output and the
 *              pointer is post-incremented
 *   nbAcc      number of outputs to produce
 *   nbVecTaps  number of 8-lane coefficient vectors to accumulate
 *   pSample    oldest sample of the window of the first output; output j
 *              uses the window starting at pSample[j]
 *   vecCoeffs  array of nbVecTaps coefficient vectors
 *
 * Each output uses a single 64-bit accumulator that is narrowed with
 * MVE_ASRL_SAT16(acc, 15).
 *
 * The expansion site must provide a q15x8_t variable named vecIn0, which is
 * used as scratch for the sample vector.
 */
#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)        \
        for (int j = 0; j < (nbAcc); j++) {                                \
            const q15_t    *pSmp = &(pSample)[j];                          \
            q63_t           acc = 0;                                       \
                                                                           \
            for (int i = 0; i < (nbVecTaps); i++) {                        \
                vecIn0 = vld1q(pSmp + 8 * i);                              \
                acc = vmlaldavaq(acc, vecIn0, (vecCoeffs)[i]);             \
            }                                                              \
            *(pOutput)++ = (q15_t) MVE_ASRL_SAT16(acc, 15);                \
        }
75 
/*
 * Body shared by the fixed-size MVE FIR kernels.
 *
 * Expects S, pSrc, pDst and blockSize to be in scope at the expansion site,
 * and NBVECTAPS to be defined as the number of 8-lane coefficient vectors.
 *
 * Steps:
 *   1. load NBVECTAPS coefficient vectors once;
 *   2. for every group of 4 inputs, append them to the state buffer and
 *      produce 4 outputs with FIR_Q15_CORE;
 *   3. handle the remaining (blockSize & 3) inputs the same way, one sample
 *      at a time for the copy;
 *   4. move the last (numTaps - 1) samples back to the start of the state
 *      buffer for the next call.
 *
 * NOTE(review): step 1 always reads 8 * NBVECTAPS coefficients regardless of
 * S->numTaps, so the coefficient array presumably has to be padded to that
 * length - confirm against the CMSIS-DSP FIR documentation.
 */
#define FIR_Q15_MAIN_CORE() \
{ \
    q15_t * const   pBase = S->pState;          /* start of the state buffer */ \
    const uint32_t  tapCount = S->numTaps;      /* filter length */ \
    const int32_t   tailCount = blockSize & 3;  /* outputs left after the groups of 4 */ \
    q15_t          *pHist = pBase + (tapCount - 1u); /* where new input is appended */ \
    const q15_t    *pIn = pSrc;                 /* next input sample to consume */ \
    const q15_t    *pWin = pBase;               /* oldest sample of the current window */ \
    q15_t          *pOut = pDst;                /* next output slot */ \
    q15x8_t         vecIn0;                     /* scratch vector used by FIR_Q15_CORE */ \
    q15x8_t         coefVec[NBVECTAPS];         /* coefficients, 8 per vector */ \
 \
    for (int k = 0; k < NBVECTAPS; k++) { \
        coefVec[k] = vldrhq_s16(S->pCoeffs + 8 * k); \
    } \
 \
    /* Groups of 4 outputs. */ \
    for (int32_t grp = blockSize >> 2; grp > 0; grp--) { \
        /* Append 4 input samples to the history. */ \
        vstrhq_s32(pHist, vldrhq_s32(pIn)); \
        pHist += 4; \
        pIn += 4; \
 \
        FIR_Q15_CORE(pOut, 4, NBVECTAPS, pWin, coefVec); \
        pWin += 4; \
    } \
 \
    /* Tail: fewer than 4 inputs remain. */ \
    for (int k = 0; k < tailCount; k++) { \
        pHist[k] = pIn[k]; \
    } \
    FIR_Q15_CORE(pOut, tailCount, NBVECTAPS, pWin, coefVec); \
 \
    /* Move the most recent (numTaps - 1) samples to the buffer start. */ \
    { \
        const q15_t    *pFrom = pBase + blockSize; \
        q15_t          *pTo = pBase; \
        const uint32_t  keep = tapCount - 1; \
        const int32_t   keepTail = keep & 7; \
 \
        for (int32_t v = keep >> 3; v > 0; v--) { \
            vstrhq_s16(pTo, vldrhq_s16(pFrom)); \
            pFrom += 8; \
            pTo += 8; \
        } \
        if (keepTail > 0) { \
            /* Predicate limits the last load/store to keepTail lanes. */ \
            mve_pred16_t p = vctp16q(keepTail); \
            vstrhq_p_s16(pTo, vldrhq_z_s16(pFrom, p), p); \
        } \
    } \
}
151 
/*
 * MVE kernel using four 8-lane coefficient vectors (32 coefficient slots).
 * arm_fir_q15() selects it when (numTaps + 7) >> 3 == 4.
 */
static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,
  const q15_t * __restrict pSrc,
  q15_t * __restrict pDst, uint32_t blockSize)
{
    /* FIR_Q15_MAIN_CORE sizes its coefficient array from NBVECTAPS. */
    #define NBVECTAPS (32 / 8)
    FIR_Q15_MAIN_CORE();
    #undef NBVECTAPS
}
162 
/*
 * MVE kernel using three 8-lane coefficient vectors (24 coefficient slots).
 * arm_fir_q15() selects it when (numTaps + 7) >> 3 == 3.
 */
static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,
  const q15_t * __restrict pSrc,
  q15_t * __restrict pDst, uint32_t blockSize)
{
    /* FIR_Q15_MAIN_CORE sizes its coefficient array from NBVECTAPS. */
    #define NBVECTAPS (24 / 8)
    FIR_Q15_MAIN_CORE();
    #undef NBVECTAPS
}
173 
174 
/*
 * MVE kernel using two 8-lane coefficient vectors (16 coefficient slots).
 * arm_fir_q15() selects it when (numTaps + 7) >> 3 == 2.
 */
static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,
  const q15_t * __restrict pSrc,
  q15_t * __restrict pDst, uint32_t blockSize)
{
    /* FIR_Q15_MAIN_CORE sizes its coefficient array from NBVECTAPS. */
    #define NBVECTAPS (16 / 8)
    FIR_Q15_MAIN_CORE();
    #undef NBVECTAPS
}
185 
/*
 * MVE kernel using a single 8-lane coefficient vector (8 coefficient slots).
 * arm_fir_q15() selects it when (numTaps + 7) >> 3 == 1.
 */
static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,
  const q15_t * __restrict pSrc,
  q15_t * __restrict pDst, uint32_t blockSize)
{
    /* FIR_Q15_MAIN_CORE sizes its coefficient array from NBVECTAPS. */
    #define NBVECTAPS (8 / 8)
    FIR_Q15_MAIN_CORE();
    #undef NBVECTAPS
}
196 
197 
arm_fir_q15(const arm_fir_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)198 ARM_DSP_ATTRIBUTE void arm_fir_q15(
199   const arm_fir_instance_q15 * S,
200   const q15_t * pSrc,
201         q15_t * pDst,
202         uint32_t blockSize)
203 {
204     q15_t    *pState = S->pState;   /* State pointer */
205     const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
206     q15_t    *pStateCur;        /* Points to the current sample of the state */
207     const q15_t    *pSamples;         /* Temporary pointer to the sample buffer */
208     q15_t    *pOutput;          /* Temporary pointer to the output buffer */
209     const q15_t    *pTempSrc;         /* Temporary pointer to the source data */
210     q15_t    *pTempDest;        /* Temporary pointer to the destination buffer */
211     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
212     uint32_t  blkCnt;
213     q15x8_t vecIn0;
214     uint32_t  tapsBlkCnt = (numTaps + 7) / 8;
215     q63_t     acc0, acc1, acc2, acc3;
216 
217 
218 int32_t nbTaps = (numTaps + 7) >> 3;
219 
220 switch(nbTaps) {
221 
222     case 1:
223         arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
224         return;
225     case 2:
226         arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
227         return;
228     case 3:
229         arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
230         return;
231     case 4:
232         arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
233         return;
234     }
235     /*
236      * pState points to state array which contains previous frame (numTaps - 1) samples
237      * pStateCur points to the location where the new input data should be written
238      */
239     pStateCur   = &(pState[(numTaps - 1u)]);
240     pTempSrc    = pSrc;
241     pSamples    = pState;
242     pOutput     = pDst;
243     blkCnt      = blockSize >> 2;
244 
245     while (blkCnt > 0U)
246     {
247         const q15_t    *pCoeffsTmp = pCoeffs;
248         const q15_t    *pSamplesTmp = pSamples;
249 
250         acc0 = 0LL;
251         acc1 = 0LL;
252         acc2 = 0LL;
253         acc3 = 0LL;
254 
255         /*
256          * Save 8 input samples in the history buffer
257          */
258         vst1q(pStateCur, vld1q(pTempSrc));
259         pStateCur += 8;
260         pTempSrc += 8;
261 
262         int       i = tapsBlkCnt;
263         while (i > 0)
264         {
265             /*
266              * load 8 coefs
267              */
268             q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
269 
270             vecIn0 = vld1q(pSamplesTmp);
271             acc0 =  vmlaldavaq(acc0, vecIn0, vecCoeffs);
272 
273             vecIn0 = vld1q(&pSamplesTmp[1]);
274             acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
275 
276             vecIn0 = vld1q(&pSamplesTmp[2]);
277             acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
278 
279             vecIn0 = vld1q(&pSamplesTmp[3]);
280             acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
281 
282             pSamplesTmp += 8;
283             pCoeffsTmp += 8;
284             /*
285              * Decrement the taps block loop counter
286              */
287             i--;
288         }
289 
290         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
291         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
292         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
293         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
294 
295         pSamples += 4;
296         /*
297          * Decrement the sample block loop counter
298          */
299         blkCnt--;
300     }
301 
302     uint32_t  residual = blockSize & 3;
303     switch (residual)
304     {
305     case 3:
306         {
307             const q15_t    *pCoeffsTmp = pCoeffs;
308             const q15_t    *pSamplesTmp = pSamples;
309 
310             acc0 = 0LL;
311             acc1 = 0LL;
312             acc2 = 0LL;
313 
314             /*
315              * Save 8 input samples in the history buffer
316              */
317             *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
318             pStateCur += 8;
319             pTempSrc += 8;
320 
321             int       i = tapsBlkCnt;
322             while (i > 0)
323             {
324                 /*
325                  * load 8 coefs
326                  */
327                 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
328 
329                 vecIn0 = vld1q(pSamplesTmp);
330                 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
331 
332                 vecIn0 = vld1q(&pSamplesTmp[2]);
333                 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
334 
335                 vecIn0 = vld1q(&pSamplesTmp[4]);
336                 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
337 
338                 pSamplesTmp += 8;
339                 pCoeffsTmp += 8;
340                 /*
341                  * Decrement the taps block loop counter
342                  */
343                 i--;
344             }
345 
346             acc0 = asrl(acc0, 15);
347             acc1 = asrl(acc1, 15);
348             acc2 = asrl(acc2, 15);
349 
350             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
351             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
352             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
353         }
354         break;
355 
356     case 2:
357         {
358             const q15_t    *pCoeffsTmp = pCoeffs;
359             const q15_t    *pSamplesTmp = pSamples;
360 
361             acc0 = 0LL;
362             acc1 = 0LL;
363             /*
364              * Save 8 input samples in the history buffer
365              */
366             vst1q(pStateCur, vld1q(pTempSrc));
367             pStateCur += 8;
368             pTempSrc += 8;
369 
370             int       i = tapsBlkCnt;
371             while (i > 0)
372             {
373                 /*
374                  * load 8 coefs
375                  */
376                 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
377 
378                 vecIn0 = vld1q(pSamplesTmp);
379                 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
380 
381                 vecIn0 = vld1q(&pSamplesTmp[2]);
382                 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
383 
384                 pSamplesTmp += 8;
385                 pCoeffsTmp += 8;
386                 /*
387                  * Decrement the taps block loop counter
388                  */
389                 i--;
390             }
391 
392             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
393             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
394         }
395         break;
396 
397     case 1:
398         {
399             const q15_t    *pCoeffsTmp = pCoeffs;
400             const q15_t    *pSamplesTmp = pSamples;
401 
402             acc0 = 0LL;
403 
404             /*
405              * Save 8 input samples in the history buffer
406              */
407             vst1q(pStateCur, vld1q(pTempSrc));
408             pStateCur += 8;
409             pTempSrc += 8;
410 
411             int       i = tapsBlkCnt;
412             while (i > 0)
413             {
414                 /*
415                  * load 8 coefs
416                  */
417                 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
418 
419                 vecIn0 = vld1q(pSamplesTmp);
420                 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
421 
422                 pSamplesTmp += 8;
423                 pCoeffsTmp += 8;
424                 /*
425                  * Decrement the taps block loop counter
426                  */
427                 i--;
428             }
429 
430             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
431         }
432         break;
433     }
434 
435     /*
436      * Copy the samples back into the history buffer start
437      */
438     pTempSrc = &pState[blockSize];
439     pTempDest = pState;
440 
441     blkCnt = numTaps >> 3;
442     while (blkCnt > 0U)
443     {
444         vst1q(pTempDest, vld1q(pTempSrc));
445         pTempSrc += 8;
446         pTempDest += 8;
447         blkCnt--;
448     }
449     blkCnt = numTaps & 7;
450     if (blkCnt > 0U)
451     {
452         mve_pred16_t p0 = vctp16q(blkCnt);
453         vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
454     }
455 }
456 
457 #else
arm_fir_q15(const arm_fir_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)458 ARM_DSP_ATTRIBUTE void arm_fir_q15(
459   const arm_fir_instance_q15 * S,
460   const q15_t * pSrc,
461         q15_t * pDst,
462         uint32_t blockSize)
463 {
464         q15_t *pState = S->pState;                     /* State pointer */
465   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
466         q15_t *pStateCurnt;                            /* Points to the current sample of the state */
467         q15_t *px;                                     /* Temporary pointer for state buffer */
468   const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
469         q63_t acc0;                                    /* Accumulators */
470         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
471         uint32_t tapCnt, blkCnt;                       /* Loop counters */
472 
473 #if defined (ARM_MATH_LOOPUNROLL)
474         q63_t acc1, acc2, acc3;                        /* Accumulators */
475         q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
476 #endif
477 
478   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
479   /* pStateCurnt points to the location where the new input data should be written */
480   pStateCurnt = &(S->pState[(numTaps - 1U)]);
481 
482 #if defined (ARM_MATH_LOOPUNROLL)
483 
484   /* Loop unrolling: Compute 4 output values simultaneously.
485    * The variables acc0 ... acc3 hold output values that are being computed:
486    *
487    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
488    *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
489    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
490    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
491    */
492   blkCnt = blockSize >> 2U;
493 
494   while (blkCnt > 0U)
495   {
496     /* Copy 4 new input samples into the state buffer. */
497     *pStateCurnt++ = *pSrc++;
498     *pStateCurnt++ = *pSrc++;
499     *pStateCurnt++ = *pSrc++;
500     *pStateCurnt++ = *pSrc++;
501 
502     /* Set all accumulators to zero */
503     acc0 = 0;
504     acc1 = 0;
505     acc2 = 0;
506     acc3 = 0;
507 
508     /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
509     px = pState;
510 
511     /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
512     pb = pCoeffs;
513 
514     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
515     x0 = read_q15x2_ia (&px);
516 
517     /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
518     x2 = read_q15x2_ia (&px);
519 
520     /* Loop over the number of taps.  Unroll by a factor of 4.
521        Repeat until we've computed numTaps-(numTaps%4) coefficients. */
522     tapCnt = numTaps >> 2U;
523 
524     while (tapCnt > 0U)
525     {
526       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
527       c0 = read_q15x2_ia (&pb);
528 
529       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
530       acc0 = __SMLALD(x0, c0, acc0);
531 
532       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
533       acc2 = __SMLALD(x2, c0, acc2);
534 
535       /* pack  x[n-N-1] and x[n-N-2] */
536 #ifndef ARM_MATH_BIG_ENDIAN
537       x1 = __PKHBT(x2, x0, 0);
538 #else
539       x1 = __PKHBT(x0, x2, 0);
540 #endif
541 
542       /* Read state x[n-N-4], x[n-N-5] */
543       x0 = read_q15x2_ia (&px);
544 
545       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
546       acc1 = __SMLALDX(x1, c0, acc1);
547 
548       /* pack  x[n-N-3] and x[n-N-4] */
549 #ifndef ARM_MATH_BIG_ENDIAN
550       x1 = __PKHBT(x0, x2, 0);
551 #else
552       x1 = __PKHBT(x2, x0, 0);
553 #endif
554 
555       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
556       acc3 = __SMLALDX(x1, c0, acc3);
557 
558       /* Read coefficients b[N-2], b[N-3] */
559       c0 = read_q15x2_ia (&pb);
560 
561       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
562       acc0 = __SMLALD(x2, c0, acc0);
563 
564       /* Read state x[n-N-6], x[n-N-7] with offset */
565       x2 = read_q15x2_ia (&px);
566 
567       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
568       acc2 = __SMLALD(x0, c0, acc2);
569 
570       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
571       acc1 = __SMLALDX(x1, c0, acc1);
572 
573       /* pack  x[n-N-5] and x[n-N-6] */
574 #ifndef ARM_MATH_BIG_ENDIAN
575       x1 = __PKHBT(x2, x0, 0);
576 #else
577       x1 = __PKHBT(x0, x2, 0);
578 #endif
579 
580       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
581       acc3 = __SMLALDX(x1, c0, acc3);
582 
583       /* Decrement tap count */
584       tapCnt--;
585     }
586 
587     /* If the filter length is not a multiple of 4, compute the remaining filter taps.
588        This is always be 2 taps since the filter length is even. */
589     if ((numTaps & 0x3U) != 0U)
590     {
591       /* Read last two coefficients */
592       c0 = read_q15x2_ia (&pb);
593 
594       /* Perform the multiply-accumulates */
595       acc0 = __SMLALD(x0, c0, acc0);
596       acc2 = __SMLALD(x2, c0, acc2);
597 
598       /* pack state variables */
599 #ifndef ARM_MATH_BIG_ENDIAN
600       x1 = __PKHBT(x2, x0, 0);
601 #else
602       x1 = __PKHBT(x0, x2, 0);
603 #endif
604 
605       /* Read last state variables */
606       x0 = read_q15x2 (px);
607 
608       /* Perform the multiply-accumulates */
609       acc1 = __SMLALDX(x1, c0, acc1);
610 
611       /* pack state variables */
612 #ifndef ARM_MATH_BIG_ENDIAN
613       x1 = __PKHBT(x0, x2, 0);
614 #else
615       x1 = __PKHBT(x2, x0, 0);
616 #endif
617 
618       /* Perform the multiply-accumulates */
619       acc3 = __SMLALDX(x1, c0, acc3);
620     }
621 
622     /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
623        Then store the 4 outputs in the destination buffer. */
624 #ifndef ARM_MATH_BIG_ENDIAN
625     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
626     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
627 #else
628     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
629     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
630 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
631 
632     /* Advance the state pointer by 4 to process the next group of 4 samples */
633     pState = pState + 4U;
634 
635     /* Decrement loop counter */
636     blkCnt--;
637   }
638 
639   /* Loop unrolling: Compute remaining output samples */
640   blkCnt = blockSize % 0x4U;
641 
642 #else
643 
644   /* Initialize blkCnt with number of taps */
645   blkCnt = blockSize;
646 
647 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
648 
649   while (blkCnt > 0U)
650   {
651     /* Copy two samples into state buffer */
652     *pStateCurnt++ = *pSrc++;
653 
654     /* Set the accumulator to zero */
655     acc0 = 0;
656 
657     /* Use SIMD to hold states and coefficients */
658     px = pState;
659     pb = pCoeffs;
660 
661     tapCnt = numTaps >> 1U;
662 
663     while (tapCnt > 0U)
664     {
665       acc0 += (q31_t) *px++ * *pb++;
666 	    acc0 += (q31_t) *px++ * *pb++;
667 
668       tapCnt--;
669     }
670 
671 
672     /* The result is in 2.30 format. Convert to 1.15 with saturation.
673        Then store the output in the destination buffer. */
674     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
675 
676     /* Advance state pointer by 1 for the next sample */
677     pState = pState + 1U;
678 
679     /* Decrement loop counter */
680     blkCnt--;
681   }
682 
683   /* Processing is complete.
684      Now copy the last numTaps - 1 samples to the start of the state buffer.
685      This prepares the state buffer for the next function call. */
686 
687   /* Points to the start of the state buffer */
688   pStateCurnt = S->pState;
689 
690 #if defined (ARM_MATH_LOOPUNROLL)
691 
692   /* Loop unrolling: Compute 4 taps at a time */
693   tapCnt = (numTaps - 1U) >> 2U;
694 
695   /* Copy data */
696   while (tapCnt > 0U)
697   {
698     *pStateCurnt++ = *pState++;
699     *pStateCurnt++ = *pState++;
700     *pStateCurnt++ = *pState++;
701     *pStateCurnt++ = *pState++;
702 
703     /* Decrement loop counter */
704     tapCnt--;
705   }
706 
707   /* Calculate remaining number of copies */
708   tapCnt = (numTaps - 1U) % 0x4U;
709 
710 #else
711 
712   /* Initialize tapCnt with number of taps */
713   tapCnt = (numTaps - 1U);
714 
715 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
716 
717   /* Copy remaining data */
718   while (tapCnt > 0U)
719   {
720     *pStateCurnt++ = *pState++;
721 
722     /* Decrement loop counter */
723     tapCnt--;
724   }
725 
726 }
727 #endif /* defined(ARM_MATH_MVEI) */
728 
729 /**
730   @} end of FIR group
731  */
732