1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_interpolate_q31.c
4  * Description:  Q31 FIR interpolation
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_Interpolate
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q31 FIR interpolator.
42   @param[in]     S          points to an instance of the Q31 FIR interpolator structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of input samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    The function is implemented using an internal 64-bit accumulator.
49                    The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50                    Thus, if the accumulator result overflows it wraps around rather than clip.
51                    In order to avoid overflows completely the input signal must be scaled down by <code>1/(numTaps/L)</code>.
52                    since <code>numTaps/L</code> additions occur per output sample.
53                    After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
54  */
55 
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
57 
58 #include "arm_helium_utils.h"
arm_fir_interpolate_q31(const arm_fir_interpolate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)59 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_q31(
60   const arm_fir_interpolate_instance_q31 * S,
61   const q31_t * pSrc,
62         q31_t * pDst,
63         uint32_t blockSize)
64 {
65     q31_t    *pState = S->pState;   /* State pointer */
66     const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
67     q31_t    *pStateCurnt;      /* Points to the current sample of the state */
68     const q31_t    *ptr1, *ptr2;      /* Temporary pointers for state and coefficient buffers */
69 
70     uint32_t  i, blkCnt;        /* Loop counters */
71     uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
72     uint32_t  strides[4] = { 0, 1 * S->L, 2 * S->L, 3 * S->L };
73     uint32x4_t vec_strides0 =  vld1q_u32(strides);
74     uint32x4_t vec_strides1 = vec_strides0 + 1;
75     uint32x4_t vec_strides2 = vec_strides0 + 2;
76     uint32x4_t vec_strides3 = vec_strides0 + 3;
77     q31x4_t vecState, vecCoef;
78 
79     /*
80      * S->pState buffer contains previous frame (phaseLen - 1) samples
81      * pStateCurnt points to the location where the new input data should be written
82      */
83     pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
84     /*
85      * Total number of intput samples
86      */
87     blkCnt = blockSize;
88     /*
89      * Loop over the blockSize.
90      */
91     while (blkCnt > 0U)
92     {
93         /*
94          * Copy new input sample into the state buffer
95          */
96         *pStateCurnt++ = *pSrc++;
97         /*
98          * Loop over the Interpolation factor.
99          */
100         i = S->L;
101         while (i > 0U)
102         {
103             /*
104              * Initialize state pointer
105              */
106             ptr1 = pState;
107             if (i >= 4)
108             {
109                 /*
110                  * Initialize coefficient pointer
111                  */
112                 ptr2 = pCoeffs + (i - 1 - 3U);
113 
114                 q63_t     acc0 = 0LL;
115                 q63_t     acc1 = 0LL;
116                 q63_t     acc2 = 0LL;
117                 q63_t     acc3 = 0LL;
118 
119                 uint32_t  tapCnt = phaseLen >> 2;
120                 while (tapCnt > 0U)
121                 {
122                     vecState = vldrwq_s32(ptr1);
123 
124                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides3);
125                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
126 
127                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides2);
128                     acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
129 
130                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides1);
131                     acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
132 
133                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
134                     acc3 = vrmlaldavhaq(acc3, vecState, vecCoef);
135 
136                     ptr1 += 4;
137                     ptr2 = ptr2 + S->L * 4;
138                     tapCnt--;
139                 }
140                 tapCnt = phaseLen & 3;
141                 if (tapCnt > 0U)
142                 {
143                     mve_pred16_t p0 = vctp32q(tapCnt);
144 
145                     vecState = vldrwq_z_s32(ptr1, p0);
146 
147                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides3, p0);
148                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
149 
150                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides2, p0);
151                     acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
152 
153                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides1, p0);
154                     acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
155 
156                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
157                     acc3 = vrmlaldavhaq(acc3, vecState, vecCoef);
158                 }
159 
160                 acc0 = asrl(acc0, 31 - 8);
161                 acc1 = asrl(acc1, 31 - 8);
162                 acc2 = asrl(acc2, 31 - 8);
163                 acc3 = asrl(acc3, 31 - 8);
164 
165                 *pDst++ = (q31_t) acc0;
166                 *pDst++ = (q31_t) acc1;
167                 *pDst++ = (q31_t) acc2;
168                 *pDst++ = (q31_t) acc3;
169                 i -= 4;
170             }
171             else if (i >= 3)
172             {
173                 /*
174                  * Initialize coefficient pointer
175                  */
176                 ptr2 = pCoeffs + (i - 1U - 2);
177 
178                 q63_t     acc0 = 0LL;
179                 q63_t     acc1 = 0LL;
180                 q63_t     acc2 = 0LL;
181 
182                 uint32_t  tapCnt = phaseLen >> 2;
183                 while (tapCnt > 0U)
184                 {
185                     vecState = vldrwq_s32(ptr1);
186 
187                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides2);
188                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
189 
190                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides1);
191                     acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
192 
193                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
194                     acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
195 
196                     ptr1 += 4;
197                     ptr2 = ptr2 + S->L * 4;
198                     tapCnt--;
199                 }
200                 tapCnt = phaseLen & 3;
201                 if (tapCnt > 0U)
202                 {
203                     mve_pred16_t p0 = vctp32q(tapCnt);
204 
205                     vecState = vldrwq_z_s32(ptr1, p0);
206 
207                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides2, p0);
208                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
209 
210                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides1, p0);
211                     acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
212 
213                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
214                     acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
215                 }
216 
217                 acc0 = asrl(acc0, 31 - 8);
218                 acc1 = asrl(acc1, 31 - 8);
219                 acc2 = asrl(acc2, 31 - 8);
220 
221                 *pDst++ = (q31_t) acc0;
222                 *pDst++ = (q31_t) acc1;
223                 *pDst++ = (q31_t) acc2;
224                 i -= 3;
225             }
226             else if (i >= 2)
227             {
228                 /*
229                  * Initialize coefficient pointer
230                  */
231                 ptr2 = pCoeffs + (i - 1U - 1);
232 
233                 q63_t     acc0 = 0LL;
234                 q63_t     acc1 = 0LL;
235 
236                 uint32_t  tapCnt = phaseLen >> 2;
237                 while (tapCnt > 0U)
238                 {
239                     vecState = vldrwq_s32(ptr1);
240 
241                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides1);
242                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
243 
244                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
245                     acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
246 
247                     ptr1 += 4;
248                     ptr2 = ptr2 + S->L * 4;
249                     tapCnt--;
250                 }
251                 tapCnt = phaseLen & 3;
252                 if (tapCnt > 0U)
253                 {
254                     mve_pred16_t p0 = vctp32q(tapCnt);
255 
256                     vecState = vldrwq_z_s32(ptr1, p0);
257 
258                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides1, p0);
259                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
260 
261                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
262                     acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
263                 }
264 
265                 acc0 = asrl(acc0, 31 - 8);
266                 acc1 = asrl(acc1, 31 - 8);
267 
268                 *pDst++ = (q31_t) acc0;
269                 *pDst++ = (q31_t) acc1;
270                 i -= 2;
271             }
272             else
273             {
274                 /*
275                  * Initialize coefficient pointer
276                  */
277                 ptr2 = pCoeffs + (i - 1U);
278 
279                 q63_t     acc0 = 0LL;
280 
281                 uint32_t  tapCnt = phaseLen >> 2;
282                 while (tapCnt > 0U)
283                 {
284                     vecState = vldrwq_s32(ptr1);
285                     vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
286 
287                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
288 
289                     ptr1 += 4;
290                     ptr2 = ptr2 + S->L * 4;
291                     tapCnt--;
292                 }
293                 tapCnt = phaseLen & 3;
294                 if (tapCnt > 0U)
295                 {
296                     mve_pred16_t p0 = vctp32q(tapCnt);
297 
298                     vecState = vldrwq_z_s32(ptr1, p0);
299                     vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
300                     acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
301                 }
302 
303                 acc0 = asrl(acc0, 31 - 8);
304                 *pDst++ = (q31_t) acc0;
305                 /*
306                  * Decrement the loop counter
307                  */
308                 i--;
309             }
310         }
311         /*
312          * Advance the state pointer by 1
313          * * to process the next group of interpolation factor number samples
314          */
315         pState = pState + 1;
316         /*
317          * Decrement the loop counter
318          */
319         blkCnt--;
320     }
321 
322     /*
323      * Processing is complete.
324      * ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
325      * ** This prepares the state buffer for the next function call.
326      */
327 
328     /*
329      * Points to the start of the state buffer
330      */
331     pStateCurnt = S->pState;
332     blkCnt = (phaseLen - 1U) >> 2;
333     while (blkCnt > 0U)
334     {
335         vst1q(pStateCurnt, vldrwq_s32(pState));
336         pState += 4;
337         pStateCurnt += 4;
338         blkCnt--;
339     }
340     blkCnt = (phaseLen - 1U) & 3;
341     if (blkCnt > 0U)
342     {
343         mve_pred16_t p0 = vctp32q(blkCnt);
344         vstrwq_p_s32(pStateCurnt, vldrwq_s32(pState), p0);
345     }
346 }
347 #else
arm_fir_interpolate_q31(const arm_fir_interpolate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)348 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_q31(
349   const arm_fir_interpolate_instance_q31 * S,
350   const q31_t * pSrc,
351         q31_t * pDst,
352         uint32_t blockSize)
353 {
354 #if (1)
355 //#if !defined(ARM_MATH_CM0_FAMILY)
356 
357         q31_t *pState = S->pState;                     /* State pointer */
358   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
359         q31_t *pStateCur;                              /* Points to the current sample of the state */
360         q31_t *ptr1;                                   /* Temporary pointer for state buffer */
361   const q31_t *ptr2;                                   /* Temporary pointer for coefficient buffer */
362         q63_t sum0;                                    /* Accumulators */
363         uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
364         uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
365         uint32_t j;
366 
367 #if defined (ARM_MATH_LOOPUNROLL)
368         q63_t acc0, acc1, acc2, acc3;
369         q31_t x0, x1, x2, x3;
370         q31_t c0, c1, c2, c3;
371 #endif
372 
373   /* S->pState buffer contains previous frame (phaseLen - 1) samples */
374   /* pStateCur points to the location where the new input data should be written */
375   pStateCur = S->pState + (phaseLen - 1U);
376 
377 #if defined (ARM_MATH_LOOPUNROLL)
378 
379   /* Loop unrolling: Compute 4 outputs at a time */
380   blkCnt = blockSize >> 2U;
381 
382   while (blkCnt > 0U)
383   {
384     /* Copy new input sample into the state buffer */
385     *pStateCur++ = *pSrc++;
386     *pStateCur++ = *pSrc++;
387     *pStateCur++ = *pSrc++;
388     *pStateCur++ = *pSrc++;
389 
390     /* Address modifier index of coefficient buffer */
391     j = 1U;
392 
393     /* Loop over the Interpolation factor. */
394     i = (S->L);
395 
396     while (i > 0U)
397     {
398       /* Set accumulator to zero */
399       acc0 = 0;
400       acc1 = 0;
401       acc2 = 0;
402       acc3 = 0;
403 
404       /* Initialize state pointer */
405       ptr1 = pState;
406 
407       /* Initialize coefficient pointer */
408       ptr2 = pCoeffs + (S->L - j);
409 
410       /* Loop over the polyPhase length. Unroll by a factor of 4.
411          Repeat until we've computed numTaps-(4*S->L) coefficients. */
412       tapCnt = phaseLen >> 2U;
413 
414       x0 = *(ptr1++);
415       x1 = *(ptr1++);
416       x2 = *(ptr1++);
417 
418       while (tapCnt > 0U)
419       {
420         /* Read the input sample */
421         x3 = *(ptr1++);
422 
423         /* Read the coefficient */
424         c0 = *(ptr2);
425 
426         /* Perform the multiply-accumulate */
427         acc0 += (q63_t) x0 * c0;
428         acc1 += (q63_t) x1 * c0;
429         acc2 += (q63_t) x2 * c0;
430         acc3 += (q63_t) x3 * c0;
431 
432         /* Read the coefficient */
433         c1 = *(ptr2 + S->L);
434 
435         /* Read the input sample */
436         x0 = *(ptr1++);
437 
438         /* Perform the multiply-accumulate */
439         acc0 += (q63_t) x1 * c1;
440         acc1 += (q63_t) x2 * c1;
441         acc2 += (q63_t) x3 * c1;
442         acc3 += (q63_t) x0 * c1;
443 
444         /* Read the coefficient */
445         c2 = *(ptr2 + S->L * 2);
446 
447         /* Read the input sample */
448         x1 = *(ptr1++);
449 
450         /* Perform the multiply-accumulate */
451         acc0 += (q63_t) x2 * c2;
452         acc1 += (q63_t) x3 * c2;
453         acc2 += (q63_t) x0 * c2;
454         acc3 += (q63_t) x1 * c2;
455 
456         /* Read the coefficient */
457         c3 = *(ptr2 + S->L * 3);
458 
459         /* Read the input sample */
460         x2 = *(ptr1++);
461 
462         /* Perform the multiply-accumulate */
463         acc0 += (q63_t) x3 * c3;
464         acc1 += (q63_t) x0 * c3;
465         acc2 += (q63_t) x1 * c3;
466         acc3 += (q63_t) x2 * c3;
467 
468 
469         /* Upsampling is done by stuffing L-1 zeros between each sample.
470          * So instead of multiplying zeros with coefficients,
471          * Increment the coefficient pointer by interpolation factor times. */
472         ptr2 += 4 * S->L;
473 
474         /* Decrement loop counter */
475         tapCnt--;
476       }
477 
478       /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
479       tapCnt = phaseLen % 0x4U;
480 
481       while (tapCnt > 0U)
482       {
483         /* Read the input sample */
484         x3 = *(ptr1++);
485 
486         /* Read the coefficient */
487         c0 = *(ptr2);
488 
489         /* Perform the multiply-accumulate */
490         acc0 += (q63_t) x0 * c0;
491         acc1 += (q63_t) x1 * c0;
492         acc2 += (q63_t) x2 * c0;
493         acc3 += (q63_t) x3 * c0;
494 
495         /* Increment the coefficient pointer by interpolation factor times. */
496         ptr2 += S->L;
497 
498         /* update states for next sample processing */
499         x0 = x1;
500         x1 = x2;
501         x2 = x3;
502 
503         /* Decrement loop counter */
504         tapCnt--;
505       }
506 
507       /* The result is in the accumulator, store in the destination buffer. */
508       *(pDst           ) = (q31_t) (acc0 >> 31);
509       *(pDst +     S->L) = (q31_t) (acc1 >> 31);
510       *(pDst + 2 * S->L) = (q31_t) (acc2 >> 31);
511       *(pDst + 3 * S->L) = (q31_t) (acc3 >> 31);
512 
513       pDst++;
514 
515       /* Increment the address modifier index of coefficient buffer */
516       j++;
517 
518       /* Decrement loop counter */
519       i--;
520     }
521 
522     /* Advance the state pointer by 1
523      * to process the next group of interpolation factor number samples */
524     pState = pState + 4;
525 
526     pDst += S->L * 3;
527 
528     /* Decrement loop counter */
529     blkCnt--;
530   }
531 
532   /* Loop unrolling: Compute remaining outputs */
533   blkCnt = blockSize % 0x4U;
534 
535 #else
536 
537   /* Initialize blkCnt with number of samples */
538   blkCnt = blockSize;
539 
540 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
541 
542   while (blkCnt > 0U)
543   {
544     /* Copy new input sample into the state buffer */
545     *pStateCur++ = *pSrc++;
546 
547     /* Address modifier index of coefficient buffer */
548     j = 1U;
549 
550     /* Loop over the Interpolation factor. */
551     i = S->L;
552     while (i > 0U)
553     {
554       /* Set accumulator to zero */
555       sum0 = 0;
556 
557       /* Initialize state pointer */
558       ptr1 = pState;
559 
560       /* Initialize coefficient pointer */
561       ptr2 = pCoeffs + (S->L - j);
562 
563       /* Loop over the polyPhase length.
564          Repeat until we've computed numTaps-(4*S->L) coefficients. */
565 
566 #if defined (ARM_MATH_LOOPUNROLL)
567 
568      /* Loop unrolling: Compute 4 outputs at a time */
569       tapCnt = phaseLen >> 2U;
570 
571       while (tapCnt > 0U)
572       {
573         /* Perform the multiply-accumulate */
574         sum0 += (q63_t) *ptr1++ * *ptr2;
575 
576         /* Upsampling is done by stuffing L-1 zeros between each sample.
577          * So instead of multiplying zeros with coefficients,
578          * Increment the coefficient pointer by interpolation factor times. */
579         ptr2 += S->L;
580 
581         sum0 += (q63_t) *ptr1++ * *ptr2;
582         ptr2 += S->L;
583 
584         sum0 += (q63_t) *ptr1++ * *ptr2;
585         ptr2 += S->L;
586 
587         sum0 += (q63_t) *ptr1++ * *ptr2;
588         ptr2 += S->L;
589 
590         /* Decrement loop counter */
591         tapCnt--;
592       }
593 
594       /* Loop unrolling: Compute remaining outputs */
595       tapCnt = phaseLen % 0x4U;
596 
597 #else
598 
599       /* Initialize tapCnt with number of samples */
600       tapCnt = phaseLen;
601 
602 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
603 
604       while (tapCnt > 0U)
605       {
606         /* Perform the multiply-accumulate */
607         sum0 += (q63_t) *ptr1++ * *ptr2;
608 
609         /* Upsampling is done by stuffing L-1 zeros between each sample.
610          * So instead of multiplying zeros with coefficients,
611          * Increment the coefficient pointer by interpolation factor times. */
612         ptr2 += S->L;
613 
614         /* Decrement loop counter */
615         tapCnt--;
616       }
617 
618       /* The result is in the accumulator, store in the destination buffer. */
619       *pDst++ = (q31_t) (sum0 >> 31);
620 
621       /* Increment the address modifier index of coefficient buffer */
622       j++;
623 
624       /* Decrement the loop counter */
625       i--;
626     }
627 
628     /* Advance the state pointer by 1
629      * to process the next group of interpolation factor number samples */
630     pState = pState + 1;
631 
632     /* Decrement the loop counter */
633     blkCnt--;
634   }
635 
636   /* Processing is complete.
637      Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
638      This prepares the state buffer for the next function call. */
639 
640   /* Points to the start of the state buffer */
641   pStateCur = S->pState;
642 
643 #if defined (ARM_MATH_LOOPUNROLL)
644 
645   /* Loop unrolling: Compute 4 outputs at a time */
646   tapCnt = (phaseLen - 1U) >> 2U;
647 
648   /* copy data */
649   while (tapCnt > 0U)
650   {
651     *pStateCur++ = *pState++;
652     *pStateCur++ = *pState++;
653     *pStateCur++ = *pState++;
654     *pStateCur++ = *pState++;
655 
656     /* Decrement loop counter */
657     tapCnt--;
658   }
659 
660   /* Loop unrolling: Compute remaining outputs */
661   tapCnt = (phaseLen - 1U) % 0x04U;
662 
663 #else
664 
665     /* Initialize tapCnt with number of samples */
666     tapCnt = (phaseLen - 1U);
667 
668 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
669 
670   /* Copy data */
671   while (tapCnt > 0U)
672   {
673     *pStateCur++ = *pState++;
674 
675     /* Decrement loop counter */
676     tapCnt--;
677   }
678 
679 #else
680 /* alternate version for CM0_FAMILY */
681 
682         q31_t *pState = S->pState;                     /* State pointer */
683   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
684         q31_t *pStateCur;                              /* Points to the current sample of the state */
685         q31_t *ptr1;                                   /* Temporary pointer for state buffer */
686   const q31_t *ptr2;                                   /* Temporary pointer for coefficient buffer */
687         q63_t sum0;                                    /* Accumulators */
688         uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
689         uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
690 
691   /* S->pState buffer contains previous frame (phaseLen - 1) samples */
692   /* pStateCur points to the location where the new input data should be written */
693   pStateCur = S->pState + (phaseLen - 1U);
694 
695   /* Total number of intput samples */
696   blkCnt = blockSize;
697 
698   /* Loop over the blockSize. */
699   while (blkCnt > 0U)
700   {
701     /* Copy new input sample into the state buffer */
702     *pStateCur++ = *pSrc++;
703 
704     /* Loop over the Interpolation factor. */
705     i = S->L;
706 
707     while (i > 0U)
708     {
709       /* Set accumulator to zero */
710       sum0 = 0;
711 
712       /* Initialize state pointer */
713       ptr1 = pState;
714 
715       /* Initialize coefficient pointer */
716       ptr2 = pCoeffs + (i - 1U);
717 
718       /* Loop over the polyPhase length */
719       tapCnt = phaseLen;
720 
721       while (tapCnt > 0U)
722       {
723         /* Perform the multiply-accumulate */
724         sum0 += ((q63_t) *ptr1++ * *ptr2);
725 
726         /* Increment the coefficient pointer by interpolation factor times. */
727         ptr2 += S->L;
728 
729         /* Decrement the loop counter */
730         tapCnt--;
731       }
732 
733       /* The result is in the accumulator, store in the destination buffer. */
734       *pDst++ = (q31_t) (sum0 >> 31);
735 
736       /* Decrement loop counter */
737       i--;
738     }
739 
740     /* Advance the state pointer by 1
741      * to process the next group of interpolation factor number samples */
742     pState = pState + 1;
743 
744     /* Decrement loop counter */
745     blkCnt--;
746   }
747 
748   /* Processing is complete.
749    ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
750    ** This prepares the state buffer for the next function call. */
751 
752   /* Points to the start of the state buffer */
753   pStateCur = S->pState;
754 
755   tapCnt = phaseLen - 1U;
756 
757   /* Copy data */
758   while (tapCnt > 0U)
759   {
760     *pStateCur++ = *pState++;
761 
762     /* Decrement loop counter */
763     tapCnt--;
764   }
765 
766 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
767 
768 }
769 #endif /* defined(ARM_MATH_MVEI) */
770 /**
771   @} end of FIR_Interpolate group
772  */
773