1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_interpolate_q15.c
4  * Description:  Q15 FIR interpolation
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_Interpolate
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 FIR interpolator.
42   @param[in]     S          points to an instance of the Q15 FIR interpolator structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of input samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    The function is implemented using a 64-bit internal accumulator.
49                    Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
50                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
51                    There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52                    After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
53                    Lastly, the accumulator is saturated to yield a result in 1.15 format.
54  */
55 
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
57 
58 #include "arm_helium_utils.h"
arm_fir_interpolate_q15(const arm_fir_interpolate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)59 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_q15(
60   const arm_fir_interpolate_instance_q15 * S,
61   const q15_t * pSrc,
62         q15_t * pDst,
63         uint32_t blockSize)
64 {
65     q15_t    *pState = S->pState;   /* State pointer */
66     const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
67     q15_t    *pStateCurnt;      /* Points to the current sample of the state */
68     const q15_t    *ptr1, *ptr2;      /* Temporary pointers for state and coefficient buffers */
69 
70     uint32_t  i, blkCnt;        /* Loop counters */
71     uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
72     uint16_t  strides[8] = {
73         0, 1 * S->L, 2 * S->L, 3 * S->L,
74         4 * S->L, 5 * S->L, 6 * S->L, 7 * S->L
75         };
76     uint16x8_t vec_strides0 = *(uint16x8_t *) strides;
77     uint16x8_t vec_strides1 = vec_strides0 + 1;
78     uint16x8_t vec_strides2 = vec_strides0 + 2;
79     uint16x8_t vec_strides3 = vec_strides0 + 3;
80     q15x8_t vecState, vecCoef;
81 
82     /*
83      * S->pState buffer contains previous frame (phaseLen - 1) samples
84      * pStateCurnt points to the location where the new input data should be written
85      */
86     pStateCurnt = S->pState + ((q15_t) phaseLen - 1);
87     /*
88      * Total number of intput samples
89      */
90     blkCnt = blockSize;
91     /*
92      * Loop over the blockSize.
93      */
94     while (blkCnt > 0U)
95     {
96         /*
97          * Copy new input sample into the state buffer
98          */
99         *pStateCurnt++ = *pSrc++;
100         /*
101          * Loop over the Interpolation factor.
102          */
103         i = S->L;
104         while (i > 0U)
105         {
106             /*
107              * Initialize state pointer
108              */
109             ptr1 = pState;
110             if (i >= 4)
111             {
112                 /*
113                  * Initialize coefficient pointer
114                  */
115                 ptr2 = pCoeffs + (i - 1 - 3U);
116 
117                 q63_t     acc0 = 0LL;
118                 q63_t     acc1 = 0LL;
119                 q63_t     acc2 = 0LL;
120                 q63_t     acc3 = 0LL;
121 
122                 uint32_t  tapCnt = phaseLen >> 3;
123                 while (tapCnt > 0U)
124                 {
125                     vecState = vldrhq_s16(ptr1);
126 
127                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides3);
128                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
129 
130                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
131                     acc1 = vmlaldavaq(acc1, vecState, vecCoef);
132 
133                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
134                     acc2 = vmlaldavaq(acc2, vecState, vecCoef);
135 
136                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
137                     acc3 = vmlaldavaq(acc3, vecState, vecCoef);
138 
139                     ptr1 += 8;
140                     ptr2 = ptr2 + S->L * 8;
141                     tapCnt--;
142                 }
143                 tapCnt = phaseLen & 7;
144                 if (tapCnt > 0U)
145                 {
146                     mve_pred16_t p0 = vctp16q(tapCnt);
147 
148                     vecState = vldrhq_z_s16(ptr1, p0);
149 
150                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides3, p0);
151                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
152 
153                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
154                     acc1 = vmlaldavaq(acc1, vecState, vecCoef);
155 
156                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
157                     acc2 = vmlaldavaq(acc2, vecState, vecCoef);
158 
159                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
160                     acc3 = vmlaldavaq(acc3, vecState, vecCoef);
161                 }
162 
163                 acc0 = asrl(acc0, 15);
164                 acc1 = asrl(acc1, 15);
165                 acc2 = asrl(acc2, 15);
166                 acc3 = asrl(acc3, 15);
167 
168                 *pDst++ = (q15_t) __SSAT(acc0, 16);
169                 *pDst++ = (q15_t) __SSAT(acc1, 16);
170                 *pDst++ = (q15_t) __SSAT(acc2, 16);
171                 *pDst++ = (q15_t) __SSAT(acc3, 16);
172                 i -= 4;
173             }
174             else if (i >= 3)
175             {
176                 /*
177                  * Initialize coefficient pointer
178                  */
179                 ptr2 = pCoeffs + (i - 1U - 2);
180 
181                 q63_t     acc0 = 0LL;
182                 q63_t     acc1 = 0LL;
183                 q63_t     acc2 = 0LL;
184 
185                 uint32_t  tapCnt = phaseLen >> 3;
186                 while (tapCnt > 0U)
187                 {
188                     vecState = vldrhq_s16(ptr1);
189 
190                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
191                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
192 
193                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
194                     acc1 = vmlaldavaq(acc1, vecState, vecCoef);
195 
196                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
197                     acc2 = vmlaldavaq(acc2, vecState, vecCoef);
198 
199                     ptr1 += 8;
200                     ptr2 = ptr2 + S->L * 8;
201                     tapCnt--;
202                 }
203                 tapCnt = phaseLen & 7;
204                 if (tapCnt > 0U)
205                 {
206                     mve_pred16_t p0 = vctp16q(tapCnt);
207 
208                     vecState = vldrhq_z_s16(ptr1, p0);
209 
210                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
211                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
212 
213                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
214                     acc1 = vmlaldavaq(acc1, vecState, vecCoef);
215 
216                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
217                     acc2 = vmlaldavaq(acc2, vecState, vecCoef);
218                 }
219 
220                 acc0 = asrl(acc0, 15);
221                 acc1 = asrl(acc1, 15);
222                 acc2 = asrl(acc2, 15);
223 
224                 *pDst++ = (q15_t) __SSAT(acc0, 16);;
225                 *pDst++ = (q15_t) __SSAT(acc1, 16);;
226                 *pDst++ = (q15_t) __SSAT(acc2, 16);;
227                 i -= 3;
228             }
229             else if (i >= 2)
230             {
231                 /*
232                  * Initialize coefficient pointer
233                  */
234                 ptr2 = pCoeffs + (i - 1U - 1);
235 
236                 q63_t     acc0 = 0LL;
237                 q63_t     acc1 = 0LL;
238 
239                 uint32_t  tapCnt = phaseLen >> 3;
240                 while (tapCnt > 0U)
241                 {
242                     vecState = vldrhq_s16(ptr1);
243 
244                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
245                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
246 
247                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
248                     acc1 = vmlaldavaq(acc1, vecState, vecCoef);
249 
250                     ptr1 += 8;
251                     ptr2 = ptr2 + S->L * 8;
252                     tapCnt--;
253                 }
254                 tapCnt = phaseLen & 7;
255                 if (tapCnt > 0U)
256                 {
257                     mve_pred16_t p0 = vctp16q(tapCnt);
258 
259                     vecState = vldrhq_z_s16(ptr1, p0);
260 
261                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
262                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
263 
264                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
265                     acc1 = vmlaldavaq(acc1, vecState, vecCoef);
266                 }
267 
268                 acc0 = asrl(acc0, 15);
269                 acc1 = asrl(acc1, 15);
270 
271                 *pDst++ = (q15_t) __SSAT(acc0, 16);
272                 *pDst++ = (q15_t) __SSAT(acc1, 16);
273                 i -= 2;
274             }
275             else
276             {
277                 /*
278                  * Initialize coefficient pointer
279                  */
280                 ptr2 = pCoeffs + (i - 1U);
281 
282                 q63_t     acc0 = 0LL;
283 
284                 uint32_t  tapCnt = phaseLen >> 3;
285                 while (tapCnt > 0U)
286                 {
287                     vecState = vldrhq_s16(ptr1);
288                     vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
289 
290                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
291 
292                     ptr1 += 8;
293                     ptr2 = ptr2 + S->L * 8;
294                     tapCnt--;
295                 }
296                 tapCnt = phaseLen & 7;
297                 if (tapCnt > 0U)
298                 {
299                     mve_pred16_t p0 = vctp16q(tapCnt);
300 
301                     vecState = vldrhq_z_s16(ptr1, p0);
302                     vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
303                     acc0 = vmlaldavaq(acc0, vecState, vecCoef);
304                 }
305 
306                 acc0 = asrl(acc0, 15);
307                 *pDst++ = (q15_t) __SSAT(acc0, 16);
308                 /*
309                  * Decrement the loop counter
310                  */
311                 i--;
312             }
313         }
314         /*
315          * Advance the state pointer by 1
316          * * to process the next group of interpolation factor number samples
317          */
318         pState = pState + 1;
319         /*
320          * Decrement the loop counter
321          */
322         blkCnt--;
323     }
324 
325     /*
326      * Processing is complete.
327      * Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
328      * This prepares the state buffer for the next function call.
329      */
330 
331     /*
332      * Points to the start of the state buffer
333      */
334     pStateCurnt = S->pState;
335     blkCnt = (phaseLen - 1U) >> 3;
336     while (blkCnt > 0U)
337     {
338         vstrhq_s16(pStateCurnt, vldrhq_s16(pState));
339         pState += 8;
340         pStateCurnt += 8;
341         blkCnt--;
342     }
343     blkCnt = (phaseLen - 1U) & 7;
344     if (blkCnt > 0U)
345     {
346         mve_pred16_t p0 = vctp16q(blkCnt);
347         vstrhq_p_s16(pStateCurnt, vldrhq_s16(pState), p0);
348     }
349 }
350 #else
arm_fir_interpolate_q15(const arm_fir_interpolate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)351 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_q15(
352   const arm_fir_interpolate_instance_q15 * S,
353   const q15_t * pSrc,
354         q15_t * pDst,
355         uint32_t blockSize)
356 {
357 #if (1)
358 //#if !defined(ARM_MATH_CM0_FAMILY)
359 
360         q15_t *pState = S->pState;                     /* State pointer */
361   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
362         q15_t *pStateCur;                              /* Points to the current sample of the state */
363         q15_t *ptr1;                                   /* Temporary pointer for state buffer */
364   const q15_t *ptr2;                                   /* Temporary pointer for coefficient buffer */
365         q63_t sum0;                                    /* Accumulators */
366         uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
367         uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
368         uint32_t j;
369 
370 #if defined (ARM_MATH_LOOPUNROLL)
371         q63_t acc0, acc1, acc2, acc3;
372         q15_t x0, x1, x2, x3;
373         q15_t c0, c1, c2, c3;
374 #endif
375 
376   /* S->pState buffer contains previous frame (phaseLen - 1) samples */
377   /* pStateCur points to the location where the new input data should be written */
378   pStateCur = S->pState + (phaseLen - 1U);
379 
380 #if defined (ARM_MATH_LOOPUNROLL)
381 
382   /* Loop unrolling: Compute 4 outputs at a time */
383   blkCnt = blockSize >> 2U;
384 
385   while (blkCnt > 0U)
386   {
387     /* Copy new input sample into the state buffer */
388     *pStateCur++ = *pSrc++;
389     *pStateCur++ = *pSrc++;
390     *pStateCur++ = *pSrc++;
391     *pStateCur++ = *pSrc++;
392 
393     /* Address modifier index of coefficient buffer */
394     j = 1U;
395 
396     /* Loop over the Interpolation factor. */
397     i = (S->L);
398 
399     while (i > 0U)
400     {
401       /* Set accumulator to zero */
402       acc0 = 0;
403       acc1 = 0;
404       acc2 = 0;
405       acc3 = 0;
406 
407       /* Initialize state pointer */
408       ptr1 = pState;
409 
410       /* Initialize coefficient pointer */
411       ptr2 = pCoeffs + (S->L - j);
412 
413       /* Loop over the polyPhase length. Unroll by a factor of 4.
414          Repeat until we've computed numTaps-(4*S->L) coefficients. */
415       tapCnt = phaseLen >> 2U;
416 
417       x0 = *(ptr1++);
418       x1 = *(ptr1++);
419       x2 = *(ptr1++);
420 
421       while (tapCnt > 0U)
422       {
423         /* Read the input sample */
424         x3 = *(ptr1++);
425 
426         /* Read the coefficient */
427         c0 = *(ptr2);
428 
429         /* Perform the multiply-accumulate */
430         acc0 += (q63_t) x0 * c0;
431         acc1 += (q63_t) x1 * c0;
432         acc2 += (q63_t) x2 * c0;
433         acc3 += (q63_t) x3 * c0;
434 
435         /* Read the coefficient */
436         c1 = *(ptr2 + S->L);
437 
438         /* Read the input sample */
439         x0 = *(ptr1++);
440 
441         /* Perform the multiply-accumulate */
442         acc0 += (q63_t) x1 * c1;
443         acc1 += (q63_t) x2 * c1;
444         acc2 += (q63_t) x3 * c1;
445         acc3 += (q63_t) x0 * c1;
446 
447         /* Read the coefficient */
448         c2 = *(ptr2 + S->L * 2);
449 
450         /* Read the input sample */
451         x1 = *(ptr1++);
452 
453         /* Perform the multiply-accumulate */
454         acc0 += (q63_t) x2 * c2;
455         acc1 += (q63_t) x3 * c2;
456         acc2 += (q63_t) x0 * c2;
457         acc3 += (q63_t) x1 * c2;
458 
459         /* Read the coefficient */
460         c3 = *(ptr2 + S->L * 3);
461 
462         /* Read the input sample */
463         x2 = *(ptr1++);
464 
465         /* Perform the multiply-accumulate */
466         acc0 += (q63_t) x3 * c3;
467         acc1 += (q63_t) x0 * c3;
468         acc2 += (q63_t) x1 * c3;
469         acc3 += (q63_t) x2 * c3;
470 
471 
472         /* Upsampling is done by stuffing L-1 zeros between each sample.
473          * So instead of multiplying zeros with coefficients,
474          * Increment the coefficient pointer by interpolation factor times. */
475         ptr2 += 4 * S->L;
476 
477         /* Decrement loop counter */
478         tapCnt--;
479       }
480 
481       /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
482       tapCnt = phaseLen % 0x4U;
483 
484       while (tapCnt > 0U)
485       {
486         /* Read the input sample */
487         x3 = *(ptr1++);
488 
489         /* Read the coefficient */
490         c0 = *(ptr2);
491 
492         /* Perform the multiply-accumulate */
493         acc0 += (q63_t) x0 * c0;
494         acc1 += (q63_t) x1 * c0;
495         acc2 += (q63_t) x2 * c0;
496         acc3 += (q63_t) x3 * c0;
497 
498         /* Increment the coefficient pointer by interpolation factor times. */
499         ptr2 += S->L;
500 
501         /* update states for next sample processing */
502         x0 = x1;
503         x1 = x2;
504         x2 = x3;
505 
506         /* Decrement loop counter */
507         tapCnt--;
508       }
509 
510       /* The result is in the accumulator, store in the destination buffer. */
511       *(pDst           ) = (q15_t) (__SSAT((acc0 >> 15), 16));
512       *(pDst +     S->L) = (q15_t) (__SSAT((acc1 >> 15), 16));
513       *(pDst + 2 * S->L) = (q15_t) (__SSAT((acc2 >> 15), 16));
514       *(pDst + 3 * S->L) = (q15_t) (__SSAT((acc3 >> 15), 16));
515 
516       pDst++;
517 
518       /* Increment the address modifier index of coefficient buffer */
519       j++;
520 
521       /* Decrement loop counter */
522       i--;
523     }
524 
525     /* Advance the state pointer by 1
526      * to process the next group of interpolation factor number samples */
527     pState = pState + 4;
528 
529     pDst += S->L * 3;
530 
531     /* Decrement loop counter */
532     blkCnt--;
533   }
534 
535   /* Loop unrolling: Compute remaining outputs */
536   blkCnt = blockSize % 0x4U;
537 
538 #else
539 
540   /* Initialize blkCnt with number of samples */
541   blkCnt = blockSize;
542 
543 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
544 
545   while (blkCnt > 0U)
546   {
547     /* Copy new input sample into the state buffer */
548     *pStateCur++ = *pSrc++;
549 
550     /* Address modifier index of coefficient buffer */
551     j = 1U;
552 
553     /* Loop over the Interpolation factor. */
554     i = S->L;
555     while (i > 0U)
556     {
557       /* Set accumulator to zero */
558       sum0 = 0;
559 
560       /* Initialize state pointer */
561       ptr1 = pState;
562 
563       /* Initialize coefficient pointer */
564       ptr2 = pCoeffs + (S->L - j);
565 
566       /* Loop over the polyPhase length.
567          Repeat until we've computed numTaps-(4*S->L) coefficients. */
568 
569 #if defined (ARM_MATH_LOOPUNROLL)
570 
571      /* Loop unrolling: Compute 4 outputs at a time */
572       tapCnt = phaseLen >> 2U;
573 
574       while (tapCnt > 0U)
575       {
576         /* Perform the multiply-accumulate */
577         sum0 += (q63_t) *ptr1++ * *ptr2;
578 
579         /* Upsampling is done by stuffing L-1 zeros between each sample.
580          * So instead of multiplying zeros with coefficients,
581          * Increment the coefficient pointer by interpolation factor times. */
582         ptr2 += S->L;
583 
584         sum0 += (q63_t) *ptr1++ * *ptr2;
585         ptr2 += S->L;
586 
587         sum0 += (q63_t) *ptr1++ * *ptr2;
588         ptr2 += S->L;
589 
590         sum0 += (q63_t) *ptr1++ * *ptr2;
591         ptr2 += S->L;
592 
593         /* Decrement loop counter */
594         tapCnt--;
595       }
596 
597       /* Loop unrolling: Compute remaining outputs */
598       tapCnt = phaseLen % 0x4U;
599 
600 #else
601 
602       /* Initialize tapCnt with number of samples */
603       tapCnt = phaseLen;
604 
605 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
606 
607       while (tapCnt > 0U)
608       {
609         /* Perform the multiply-accumulate */
610         sum0 += (q63_t) *ptr1++ * *ptr2;
611 
612         /* Upsampling is done by stuffing L-1 zeros between each sample.
613          * So instead of multiplying zeros with coefficients,
614          * Increment the coefficient pointer by interpolation factor times. */
615         ptr2 += S->L;
616 
617         /* Decrement loop counter */
618         tapCnt--;
619       }
620 
621       /* The result is in the accumulator, store in the destination buffer. */
622       *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
623 
624       /* Increment the address modifier index of coefficient buffer */
625       j++;
626 
627       /* Decrement the loop counter */
628       i--;
629     }
630 
631     /* Advance the state pointer by 1
632      * to process the next group of interpolation factor number samples */
633     pState = pState + 1;
634 
635     /* Decrement the loop counter */
636     blkCnt--;
637   }
638 
639   /* Processing is complete.
640      Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
641      This prepares the state buffer for the next function call. */
642 
643   /* Points to the start of the state buffer */
644   pStateCur = S->pState;
645 
646 #if defined (ARM_MATH_LOOPUNROLL)
647 
648   /* Loop unrolling: Compute 4 outputs at a time */
649   tapCnt = (phaseLen - 1U) >> 2U;
650 
651   /* copy data */
652   while (tapCnt > 0U)
653   {
654     write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
655     write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
656 
657     /* Decrement loop counter */
658     tapCnt--;
659   }
660 
661   /* Loop unrolling: Compute remaining outputs */
662   tapCnt = (phaseLen - 1U) % 0x04U;
663 
664 #else
665 
666     /* Initialize tapCnt with number of samples */
667     tapCnt = (phaseLen - 1U);
668 
669 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
670 
671   /* Copy data */
672   while (tapCnt > 0U)
673   {
674     *pStateCur++ = *pState++;
675 
676     /* Decrement loop counter */
677     tapCnt--;
678   }
679 
680 #else
681 /* alternate version for CM0_FAMILY */
682 
683         q15_t *pState = S->pState;                     /* State pointer */
684   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
685         q15_t *pStateCur;                              /* Points to the current sample of the state */
686         q15_t *ptr1;                                   /* Temporary pointer for state buffer */
687   const q15_t *ptr2;                                   /* Temporary pointer for coefficient buffer */
688         q63_t sum0;                                    /* Accumulators */
689         uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
690         uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
691 
692   /* S->pState buffer contains previous frame (phaseLen - 1) samples */
693   /* pStateCur points to the location where the new input data should be written */
694   pStateCur = S->pState + (phaseLen - 1U);
695 
696   /* Total number of intput samples */
697   blkCnt = blockSize;
698 
699   /* Loop over the blockSize. */
700   while (blkCnt > 0U)
701   {
702     /* Copy new input sample into the state buffer */
703     *pStateCur++ = *pSrc++;
704 
705     /* Loop over the Interpolation factor. */
706     i = S->L;
707 
708     while (i > 0U)
709     {
710       /* Set accumulator to zero */
711       sum0 = 0;
712 
713       /* Initialize state pointer */
714       ptr1 = pState;
715 
716       /* Initialize coefficient pointer */
717       ptr2 = pCoeffs + (i - 1U);
718 
719       /* Loop over the polyPhase length */
720       tapCnt = phaseLen;
721 
722       while (tapCnt > 0U)
723       {
724         /* Perform the multiply-accumulate */
725         sum0 += ((q63_t) *ptr1++ * *ptr2);
726 
727         /* Increment the coefficient pointer by interpolation factor times. */
728         ptr2 += S->L;
729 
730         /* Decrement the loop counter */
731         tapCnt--;
732       }
733 
734       /* Store the result after converting to 1.15 format in the destination buffer. */
735       *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
736 
737       /* Decrement loop counter */
738       i--;
739     }
740 
741     /* Advance the state pointer by 1
742      * to process the next group of interpolation factor number samples */
743     pState = pState + 1;
744 
745     /* Decrement loop counter */
746     blkCnt--;
747   }
748 
749   /* Processing is complete.
750    ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
751    ** This prepares the state buffer for the next function call. */
752 
753   /* Points to the start of the state buffer */
754   pStateCur = S->pState;
755 
756   tapCnt = phaseLen - 1U;
757 
758   /* Copy data */
759   while (tapCnt > 0U)
760   {
761     *pStateCur++ = *pState++;
762 
763     /* Decrement loop counter */
764     tapCnt--;
765   }
766 
767 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
768 
769 }
770 #endif /* defined(ARM_MATH_MVEI)*/
771 /**
772   @} end of FIR_Interpolate group
773  */
774