1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_interpolate_f32.c
4  * Description:  Floating-point FIR interpolation sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
33 
34   These functions combine an upsampler (zero stuffer) and an FIR filter.
35   They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images.
36   Conceptually, the functions are equivalent to the block diagram below:
37   \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions"
38   After upsampling by a factor of <code>L</code>, the signal should be filtered by a lowpass filter with a normalized
39   cutoff frequency of <code>1/L</code> in order to eliminate high frequency copies of the spectrum.
40   The user of the function is responsible for providing the filter coefficients.
41 
42   The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner.
43   The upsampler inserts <code>L-1</code> zeros between each sample.
44   Instead of multiplying by these zero values, the FIR filter is designed to skip them.
45   This leads to an efficient implementation without any wasted effort.
46   The functions operate on blocks of input and output data.
47   <code>pSrc</code> points to an array of <code>blockSize</code> input values and
48   <code>pDst</code> points to an array of <code>blockSize*L</code> output values.
49 
50   The library provides separate functions for Q15, Q31, and floating-point data types.
51 
52   @par           Algorithm
53                    The functions use a polyphase filter structure:
54   <pre>
55       y[n] = b[0] * x[n] + b[L]   * x[n-1] + ... + b[L*(phaseLength-1)] * x[n-phaseLength+1]
56       y[n+1] = b[1] * x[n] + b[L+1] * x[n-1] + ... + b[L*(phaseLength-1)+1] * x[n-phaseLength+1]
57       ...
58       y[n+(L-1)] = b[L-1] * x[n] + b[2*L-1] * x[n-1] + ....+ b[L*(phaseLength-1)+(L-1)] * x[n-phaseLength+1]
59   </pre>
60                    This approach is more efficient than straightforward upsample-then-filter algorithms.
61                    With this method the computation is reduced by a factor of <code>1/L</code> when compared to using a standard FIR filter.
62   @par
63                    <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
64                    <code>numTaps</code> must be a multiple of the interpolation factor <code>L</code> and this is checked by the
65                    initialization functions.
66                    Internally, the function divides the FIR filter's impulse response into shorter filters of length
67                    <code>phaseLength=numTaps/L</code>.
68                    Coefficients are stored in time reversed order.
69   <pre>
70       {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
71   </pre>
72   @par
73                    <code>pState</code> points to a state array of size <code>blockSize + phaseLength - 1</code>.
74                    Samples in the state buffer are stored in the order:
75   <pre>
76      {x[n-phaseLength+1], x[n-phaseLength], x[n-phaseLength-1], x[n-phaseLength-2]....x[0], x[1], ..., x[blockSize-1]}
77   </pre>
78   @par
79                    The state variables are updated after each block of data is processed, the coefficients are untouched.
80 
81   @par           Instance Structure
82                    The coefficients and state variables for a filter are stored together in an instance data structure.
83                    A separate instance structure must be defined for each filter.
84                    Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
85                    There are separate instance structure declarations for each of the 3 supported data types.
86 
87   @par           Initialization Functions
88                    There is also an associated initialization function for each data type.
89                    The initialization function performs the following operations:
90                    - Sets the values of the internal structure fields.
91                    - Zeros out the values in the state buffer.
92                    - Checks to make sure that the length of the filter is a multiple of the interpolation factor.
93                    To do this manually without calling the init function, assign the follow subfields of the instance structure:
94                    L (interpolation factor), pCoeffs, phaseLength (numTaps / L), pState. Also set all of the values in pState to zero.
95   @par
96                    Use of the initialization function is optional.
97                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
98                    To place an instance structure into a const data section, the instance structure must be manually initialized.
99                    The code below statically initializes each of the 3 different data type filter instance structures
100   <pre>
101       arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};
102       arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};
103       arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};
104   </pre>
105   @par
106                    where <code>L</code> is the interpolation factor; <code>phaseLength=numTaps/L</code> is the
107                    length of each of the shorter FIR filters used internally,
108                    <code>pCoeffs</code> is the address of the coefficient buffer;
109                    <code>pState</code> is the address of the state buffer.
110                    Be sure to set the values in the state buffer to zeros when doing static initialization.
111 
112   @par           Fixed-Point Behavior
113                    Care must be taken when using the fixed-point versions of the FIR interpolate filter functions.
114                    In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
115                    Refer to the function specific documentation below for usage guidelines.
116  */
117 
118 /**
119   @addtogroup FIR_Interpolate
120   @{
121  */
122 
123 /**
124   @brief         Processing function for floating-point FIR interpolator.
125   @param[in]     S          points to an instance of the floating-point FIR interpolator structure
126   @param[in]     pSrc       points to the block of input data
127   @param[out]    pDst       points to the block of output data
128   @param[in]     blockSize  number of input samples to process
129  */
130 
131 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
132 
133 #include "arm_helium_utils.h"
134 
arm_fir_interpolate2_f32_mve(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)135 static void arm_fir_interpolate2_f32_mve(
136   const arm_fir_interpolate_instance_f32 * S,
137   const float32_t * pSrc,
138   float32_t * pDst,
139   uint32_t blockSize)
140 {
141     float32_t *pState = S->pState;  /* State pointer */
142     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
143     float32_t *pStateCurnt;     /* Points to the current sample of the state */
144     const float32_t *ptr1, *ptr2;     /* Temporary pointers for state and coefficient buffers */
145     uint32_t  tapCnt;
146     uint32_t  blkCnt;           /* Loop counters */
147     uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
148     uint32_t  strides[4] = { 0, 1 * 2, 2 * 2, 3 * 2 };
149     uint32x4_t vec_strides0 = vld1q_u32(strides);
150     uint32x4_t vec_strides1 = vec_strides0 + 1;
151     f32x4_t acc0, acc1;
152 
153     /*
154      * S->pState buffer contains previous frame (phaseLen - 1) samples
155      * pStateCurnt points to the location where the new input data should be written
156      */
157     pStateCurnt = S->pState + (phaseLen - 1U);
158     /*
159      * Total number of intput samples
160      */
161     blkCnt = blockSize;
162     /*
163      * Loop over the blockSize.
164      */
165     while (blkCnt > 0U)
166     {
167         /*
168          * Copy new input sample into the state buffer
169          */
170         *pStateCurnt++ = *pSrc++;
171         /*
172          * Initialize state pointer
173          */
174         ptr1 = pState;
175 
176         acc0 = vdupq_n_f32(0.0f);
177         acc1 = vdupq_n_f32(0.0f);
178         /*
179          * Initialize coefficient pointer
180          */
181         ptr2 = pCoeffs;
182 
183         tapCnt = phaseLen >> 2;
184         while (tapCnt > 0U)
185         {
186             f32x4_t vecCoef, vecState;
187 
188             vecState = vldrwq_f32(ptr1);
189 
190             vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides1);
191             acc1 = vfmaq_f32(acc1, vecState, vecCoef);
192 
193             vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides0);
194             acc0 = vfmaq_f32(acc0, vecState, vecCoef);
195 
196             ptr2 += 4 * 2;
197             ptr1 += 4;
198             /*
199              * Decrement the loop counter
200              */
201             tapCnt--;
202         }
203 
204         tapCnt = phaseLen & 3;
205         if (tapCnt > 0U)
206         {
207             mve_pred16_t p0 = vctp32q(tapCnt);
208             f32x4_t vecCoef, vecState;
209 
210             vecState = vldrwq_z_f32(ptr1, p0);
211 
212             vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides1, p0);
213             acc1 = vfmaq_f32(acc1, vecState, vecCoef);
214             vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides0, p0);
215             acc0 = vfmaq_f32(acc0, vecState, vecCoef);
216 
217         }
218         *pDst++ = vecAddAcrossF32Mve(acc1);
219         *pDst++ = vecAddAcrossF32Mve(acc0);
220 
221         /*
222          * Advance the state pointer by 1
223          * * to process the next group of interpolation factor number samples
224          */
225         pState = pState + 1;
226         /*
227          * Decrement the loop counter
228          */
229         blkCnt--;
230     }
231 
232     /*
233      * Processing is complete.
234      * ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
235      * ** This prepares the state buffer for the next function call.
236      */
237 
238     /*
239      * Points to the start of the state buffer
240      */
241     pStateCurnt = S->pState;
242     blkCnt = (phaseLen - 1U) >> 2;
243     while (blkCnt > 0U)
244     {
245         vst1q(pStateCurnt, vldrwq_f32(pState));
246         pState += 4;
247         pStateCurnt += 4;
248         blkCnt--;
249     }
250     blkCnt = (phaseLen - 1U) & 3;
251     if (blkCnt > 0U)
252     {
253         mve_pred16_t p0 = vctp32q(blkCnt);
254         vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
255     }
256 }
257 
arm_fir_interpolate_f32(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)258 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_f32(
259   const arm_fir_interpolate_instance_f32 * S,
260   const float32_t * pSrc,
261   float32_t * pDst,
262   uint32_t blockSize)
263 {
264     float32_t *pState = S->pState;  /* State pointer */
265     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
266     float32_t *pStateCurnt;     /* Points to the current sample of the state */
267     const float32_t *ptr1, *ptr2;     /* Temporary pointers for state and coefficient buffers */
268     uint32_t  tapCnt;
269     uint32_t  i, blkCnt;        /* Loop counters */
270     uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
271     uint32_t  strides[4] = { 0, 1 * S->L, 2 * S->L, 3 * S->L };
272     uint32_t  stridesM[4] = { 4, 3, 2, 1 };
273     uint32x4_t vec_stridesM =  vld1q_u32(stridesM);
274     uint32x4_t vec_strides =  vld1q_u32(strides);
275     f32x4_t acc;
276 
277 
278     if ( S->L == 2 ) {
279         arm_fir_interpolate2_f32_mve(S, pSrc, pDst, blockSize);
280         return;
281     }
282 
283     /*
284      * S->pState buffer contains previous frame (phaseLen - 1) samples
285      */
286     /*
287      * pStateCurnt points to the location where the new input data should be written
288      */
289     pStateCurnt = S->pState + (phaseLen - 1U);
290     /*
291      * Total number of intput samples
292      */
293     blkCnt = blockSize;
294     /*
295      * Loop over the blockSize.
296      */
297     while (blkCnt > 0U)
298     {
299         /*
300          * Copy new input sample into the state buffer
301          */
302         *pStateCurnt++ = *pSrc++;
303         /*
304          * Loop over the Interpolation factor.
305          */
306         i = S->L;
307         while (i > 0U)
308         {
309             /*
310              * Initialize state pointer
311              */
312             ptr1 = pState;
313             if (i >= 4)
314             {
315                 float32_t state0, state1, state2, state3;
316                 acc = vdupq_n_f32(0.0f);
317                 /*
318                  * Initialize coefficient pointer
319                  */
320                 ptr2 = pCoeffs + (i - 1U) - 4;
321                 tapCnt = phaseLen >> 2;
322                 while (tapCnt > 0U)
323                 {
324                     f32x4_t vecCoef;
325                     const float32_t *pCoef = ptr2;
326 
327                     state0 = ptr1[0];
328                     state1 = ptr1[1];
329                     state2 = ptr1[2];
330                     state3 = ptr1[3];
331                     ptr1 += 4;
332 
333                     vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
334                     pCoef += S->L;
335                     acc = vfmaq_n_f32(acc, vecCoef, state0);
336 
337                     vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
338                     pCoef += S->L;
339                     acc = vfmaq_n_f32(acc, vecCoef, state1);
340 
341                     vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
342                     pCoef += S->L;
343                     acc = vfmaq_n_f32(acc, vecCoef, state2);
344 
345                     vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
346                     pCoef += S->L;
347                     acc = vfmaq_n_f32(acc, vecCoef, state3);
348 
349                     ptr2 = ptr2 + 4 * S->L;
350                     /*
351                      * Decrement the loop counter
352                      */
353                     tapCnt--;
354                 }
355 
356                 tapCnt = phaseLen & 3;
357                 if (tapCnt > 0U)
358                 {
359                     mve_pred16_t p0 = vctp32q(tapCnt);
360                     f32x4_t vecCoef;
361                     const float32_t *pCoef = ptr2;
362 
363                     state0 = ptr1[0];
364                     state1 = ptr1[1];
365                     state2 = ptr1[2];
366                     state3 = ptr1[3];
367 
368                     vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
369                     pCoef += S->L;
370                     acc = vfmaq_n_f32(acc, vecCoef, state0);
371 
372                     vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
373                     pCoef += S->L;
374                     acc = vfmaq_n_f32(acc, vecCoef, state1);
375 
376                     vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
377                     pCoef += S->L;
378                     acc = vfmaq_n_f32(acc, vecCoef, state2);
379 
380                     vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
381                     pCoef += S->L;
382                     acc = vfmaq_n_f32(acc, vecCoef, state3);
383                 }
384 
385                 vst1q(pDst,  acc);
386                 pDst += 4;
387                 i -= 4;
388             }
389             else
390             {
391                 acc = vdupq_n_f32(0.0f);
392                 /*
393                  * Initialize coefficient pointer
394                  */
395                 ptr2 = pCoeffs + (i - 1U);
396 
397                 tapCnt = phaseLen >> 2;
398                 while (tapCnt > 0U)
399                 {
400                     f32x4_t vecCoef, vecState;
401 
402                     vecState = vldrwq_f32(ptr1);
403                     ptr1 += 4;
404 
405                     vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides);
406                     ptr2 += 4 * S->L;
407                     acc = vfmaq_f32(acc, vecState, vecCoef);
408                     /*
409                      * Decrement the loop counter
410                      */
411                     tapCnt--;
412                 }
413 
414                 tapCnt = phaseLen & 3;
415                 if (tapCnt > 0U)
416                 {
417                     mve_pred16_t p0 = vctp32q(tapCnt);
418                     f32x4_t vecCoef, vecState;
419 
420                     vecState = vldrwq_z_f32(ptr1, p0);
421 
422                     vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides, p0);
423                     acc = vfmaq_f32(acc, vecState, vecCoef);
424                 }
425                 *pDst++ = vecAddAcrossF32Mve(acc);
426                 /*
427                  * Decrement the loop counter
428                  */
429                 i--;
430             }
431         }
432 
433         /*
434          * Advance the state pointer by 1
435          * * to process the next group of interpolation factor number samples
436          */
437         pState = pState + 1;
438         /*
439          * Decrement the loop counter
440          */
441         blkCnt--;
442     }
443 
444     /*
445      * Processing is complete.
446      * ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
447      * ** This prepares the state buffer for the next function call.
448      */
449 
450     /*
451      * Points to the start of the state buffer
452      */
453     pStateCurnt = S->pState;
454     blkCnt = (phaseLen - 1U) >> 2;
455     while (blkCnt > 0U)
456     {
457         vst1q(pStateCurnt, vldrwq_f32(pState));
458         pState += 4;
459         pStateCurnt += 4;
460         blkCnt--;
461     }
462     blkCnt = (phaseLen - 1U) & 3;
463     if (blkCnt > 0U)
464     {
465         mve_pred16_t p0 = vctp32q(blkCnt);
466         vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
467     }
468 }
469 
470 #else
471 #if defined(ARM_MATH_NEON)
arm_fir_interpolate_f32(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)472 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_f32(
473   const arm_fir_interpolate_instance_f32 * S,
474   const float32_t * pSrc,
475   float32_t * pDst,
476   uint32_t blockSize)
477 {
478   float32_t *pState = S->pState;                 /* State pointer */
479   const float32_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
480   float32_t *pStateCurnt;                        /* Points to the current sample of the state */
481   float32_t *ptr1;                               /* Temporary pointers for state buffer */
482   const float32_t *ptr2;                         /* Temporary pointers for coefficient buffer */
483   float32_t sum0;                                /* Accumulators */
484   float32_t c0;                              /* Temporary variables to hold state and coefficient values */
485   uint32_t i, blkCnt, j;                         /* Loop counters */
486   uint16_t phaseLen = S->phaseLength, tapCnt;    /* Length of each polyphase filter component */
487   uint32_t blkCntN4;
488   float32_t c1, c2, c3;
489 
490   float32x4_t sum0v;
491   float32x4_t accV0,accV1;
492   float32x4_t x0v,x1v,x2v,xa,xb;
493   float32x2_t tempV;
494 
495   /* S->pState buffer contains previous frame (phaseLen - 1) samples */
496   /* pStateCurnt points to the location where the new input data should be written */
497   pStateCurnt = S->pState + (phaseLen - 1U);
498 
499   /* Initialise  blkCnt */
500   blkCnt = blockSize >> 3;
501   blkCntN4 = blockSize & 7;
502 
503   /* Loop unrolling */
504   while (blkCnt > 0U)
505   {
506     /* Copy new input samples into the state buffer */
507     sum0v = vld1q_f32(pSrc);
508     vst1q_f32(pStateCurnt,sum0v);
509     pSrc += 4;
510     pStateCurnt += 4;
511 
512     sum0v = vld1q_f32(pSrc);
513     vst1q_f32(pStateCurnt,sum0v);
514     pSrc += 4;
515     pStateCurnt += 4;
516 
517     /* Address modifier index of coefficient buffer */
518     j = 1U;
519 
520     /* Loop over the Interpolation factor. */
521     i = (S->L);
522 
523     while (i > 0U)
524     {
525       /* Set accumulator to zero */
526       accV0 = vdupq_n_f32(0.0);
527       accV1 = vdupq_n_f32(0.0);
528 
529       /* Initialize state pointer */
530       ptr1 = pState;
531 
532       /* Initialize coefficient pointer */
533       ptr2 = pCoeffs + (S->L - j);
534 
535       /* Loop over the polyPhase length. Unroll by a factor of 4.
536        ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
537       tapCnt = phaseLen >> 2U;
538 
539       x0v = vld1q_f32(ptr1);
540       x1v = vld1q_f32(ptr1 + 4);
541 
542       while (tapCnt > 0U)
543       {
544         /* Read the input samples */
545         x2v = vld1q_f32(ptr1 + 8);
546 
547         /* Read the coefficients */
548         c0 = *(ptr2);
549 
550         /* Perform the multiply-accumulate */
551         accV0 = vmlaq_n_f32(accV0,x0v,c0);
552         accV1 = vmlaq_n_f32(accV1,x1v,c0);
553 
554         /* Read the coefficients, inputs and perform multiply-accumulate */
555         c1 = *(ptr2 + S->L);
556 
557         xa = vextq_f32(x0v,x1v,1);
558         xb = vextq_f32(x1v,x2v,1);
559 
560         accV0 = vmlaq_n_f32(accV0,xa,c1);
561         accV1 = vmlaq_n_f32(accV1,xb,c1);
562 
563         /* Read the coefficients, inputs and perform multiply-accumulate */
564         c2 = *(ptr2 + S->L * 2);
565 
566         xa = vextq_f32(x0v,x1v,2);
567         xb = vextq_f32(x1v,x2v,2);
568 
569         accV0 = vmlaq_n_f32(accV0,xa,c2);
570         accV1 = vmlaq_n_f32(accV1,xb,c2);
571 
572         /* Read the coefficients, inputs and perform multiply-accumulate */
573         c3 = *(ptr2 + S->L * 3);
574 
575         xa = vextq_f32(x0v,x1v,3);
576         xb = vextq_f32(x1v,x2v,3);
577 
578         accV0 = vmlaq_n_f32(accV0,xa,c3);
579         accV1 = vmlaq_n_f32(accV1,xb,c3);
580 
581         /* Upsampling is done by stuffing L-1 zeros between each sample.
582          * So instead of multiplying zeros with coefficients,
583          * Increment the coefficient pointer by interpolation factor times. */
584         ptr2 += 4 * S->L;
585         ptr1 += 4;
586         x0v = x1v;
587         x1v = x2v;
588 
589         /* Decrement the loop counter */
590         tapCnt--;
591       }
592 
593       /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
594       tapCnt = phaseLen % 0x4U;
595 
596       x2v = vld1q_f32(ptr1 + 8);
597 
598       switch (tapCnt)
599       {
600         case 3:
601              c0 = *(ptr2);
602              accV0 = vmlaq_n_f32(accV0,x0v,c0);
603              accV1 = vmlaq_n_f32(accV1,x1v,c0);
604              ptr2 += S->L;
605 
606              c0 = *(ptr2);
607 
608              xa = vextq_f32(x0v,x1v,1);
609              xb = vextq_f32(x1v,x2v,1);
610 
611              accV0 = vmlaq_n_f32(accV0,xa,c0);
612              accV1 = vmlaq_n_f32(accV1,xb,c0);
613              ptr2 += S->L;
614 
615              c0 = *(ptr2);
616 
617              xa = vextq_f32(x0v,x1v,2);
618              xb = vextq_f32(x1v,x2v,2);
619 
620              accV0 = vmlaq_n_f32(accV0,xa,c0);
621              accV1 = vmlaq_n_f32(accV1,xb,c0);
622              ptr2 += S->L;
623 
624         break;
625 
626         case 2:
627              c0 = *(ptr2);
628              accV0 = vmlaq_n_f32(accV0,x0v,c0);
629              accV1 = vmlaq_n_f32(accV1,x1v,c0);
630              ptr2 += S->L;
631 
632              c0 = *(ptr2);
633 
634              xa = vextq_f32(x0v,x1v,1);
635              xb = vextq_f32(x1v,x2v,1);
636 
637              accV0 = vmlaq_n_f32(accV0,xa,c0);
638              accV1 = vmlaq_n_f32(accV1,xb,c0);
639              ptr2 += S->L;
640 
641         break;
642 
643         case 1:
644              c0 = *(ptr2);
645              accV0 = vmlaq_n_f32(accV0,x0v,c0);
646              accV1 = vmlaq_n_f32(accV1,x1v,c0);
647              ptr2 += S->L;
648 
649         break;
650 
651         default:
652         break;
653 
654       }
655 
656       /* The result is in the accumulator, store in the destination buffer. */
657       *pDst = vgetq_lane_f32(accV0, 0);
658       *(pDst + S->L) = vgetq_lane_f32(accV0, 1);
659       *(pDst + 2 * S->L) = vgetq_lane_f32(accV0, 2);
660       *(pDst + 3 * S->L) = vgetq_lane_f32(accV0, 3);
661 
662       *(pDst + 4 * S->L) = vgetq_lane_f32(accV1, 0);
663       *(pDst + 5 * S->L) = vgetq_lane_f32(accV1, 1);
664       *(pDst + 6 * S->L) = vgetq_lane_f32(accV1, 2);
665       *(pDst + 7 * S->L) = vgetq_lane_f32(accV1, 3);
666 
667       pDst++;
668 
669       /* Increment the address modifier index of coefficient buffer */
670       j++;
671 
672       /* Decrement the loop counter */
673       i--;
674     }
675 
676     /* Advance the state pointer by 1
677      * to process the next group of interpolation factor number samples */
678     pState = pState + 8;
679 
680     pDst += S->L * 7;
681 
682     /* Decrement the loop counter */
683     blkCnt--;
684   }
685 
686   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
687    ** No loop unrolling is used. */
688 
689   while (blkCntN4 > 0U)
690   {
691     /* Copy new input sample into the state buffer */
692     *pStateCurnt++ = *pSrc++;
693 
694     /* Address modifier index of coefficient buffer */
695     j = 1U;
696 
697     /* Loop over the Interpolation factor. */
698     i = S->L;
699 
700     while (i > 0U)
701     {
702       /* Set accumulator to zero */
703       sum0v = vdupq_n_f32(0.0);
704 
705       /* Initialize state pointer */
706       ptr1 = pState;
707 
708       /* Initialize coefficient pointer */
709       ptr2 = pCoeffs + (S->L - j);
710 
711       /* Loop over the polyPhase length. Unroll by a factor of 4.
712        ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
713       tapCnt = phaseLen >> 2U;
714 
715       while (tapCnt > 0U)
716       {
717         /* Read the coefficient */
718         x1v = vsetq_lane_f32(*(ptr2),x1v,0);
719 
720         /* Upsampling is done by stuffing L-1 zeros between each sample.
721          * So instead of multiplying zeros with coefficients,
722          * Increment the coefficient pointer by interpolation factor times. */
723         ptr2 += S->L;
724 
725         /* Read the input sample */
726         x0v = vld1q_f32(ptr1);
727         ptr1 += 4;
728 
729         /* Read the coefficient */
730         x1v = vsetq_lane_f32(*(ptr2),x1v,1);
731 
732         /* Increment the coefficient pointer by interpolation factor times. */
733         ptr2 += S->L;
734 
735         /* Read the coefficient */
736         x1v = vsetq_lane_f32(*(ptr2),x1v,2);
737 
738         /* Increment the coefficient pointer by interpolation factor times. */
739         ptr2 += S->L;
740 
741         /* Read the coefficient */
742         x1v = vsetq_lane_f32(*(ptr2),x1v,3);
743 
744         /* Increment the coefficient pointer by interpolation factor times. */
745         ptr2 += S->L;
746 
747         sum0v = vmlaq_f32(sum0v,x0v,x1v);
748 
749         /* Decrement the loop counter */
750         tapCnt--;
751       }
752 
753       tempV = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
754       sum0 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
755 
756       /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
757       tapCnt = phaseLen % 0x4U;
758 
759       while (tapCnt > 0U)
760       {
761         /* Perform the multiply-accumulate */
762         sum0 += *(ptr1++) * (*ptr2);
763 
764         /* Increment the coefficient pointer by interpolation factor times. */
765         ptr2 += S->L;
766 
767         /* Decrement the loop counter */
768         tapCnt--;
769       }
770 
771       /* The result is in the accumulator, store in the destination buffer. */
772       *pDst++ = sum0;
773 
774       /* Increment the address modifier index of coefficient buffer */
775       j++;
776 
777       /* Decrement the loop counter */
778       i--;
779     }
780 
781     /* Advance the state pointer by 1
782      * to process the next group of interpolation factor number samples */
783     pState = pState + 1;
784 
785     /* Decrement the loop counter */
786     blkCntN4--;
787   }
788 
789   /* Processing is complete.
790    ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
791    ** This prepares the state buffer for the next function call. */
792 
793   /* Points to the start of the state buffer */
794   pStateCurnt = S->pState;
795 
796   tapCnt = (phaseLen - 1U) >> 2U;
797 
798   /* Copy data */
799   while (tapCnt > 0U)
800   {
801     sum0v = vld1q_f32(pState);
802     vst1q_f32(pStateCurnt,sum0v);
803     pState += 4;
804     pStateCurnt += 4;
805 
806     /* Decrement the loop counter */
807     tapCnt--;
808   }
809 
810   tapCnt = (phaseLen - 1U) % 0x04U;
811 
812   /* copy data */
813   while (tapCnt > 0U)
814   {
815     *pStateCurnt++ = *pState++;
816 
817     /* Decrement the loop counter */
818     tapCnt--;
819   }
820 
821 }
822 #else
823 
arm_fir_interpolate_f32(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)824 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_f32(
825   const arm_fir_interpolate_instance_f32 * S,
826   const float32_t * pSrc,
827         float32_t * pDst,
828         uint32_t blockSize)
829 {
830 #if (1)
831 //#if !defined(ARM_MATH_CM0_FAMILY)
832 
833         float32_t *pState = S->pState;                 /* State pointer */
834   const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
835         float32_t *pStateCur;                          /* Points to the current sample of the state */
836         float32_t *ptr1;                               /* Temporary pointer for state buffer */
837   const float32_t *ptr2;                               /* Temporary pointer for coefficient buffer */
838         float32_t sum0;                                /* Accumulators */
839         uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
840         uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
841         uint32_t j;
842 
843 #if defined (ARM_MATH_LOOPUNROLL)
844         float32_t acc0, acc1, acc2, acc3;
845         float32_t x0, x1, x2, x3;
846         float32_t c0, c1, c2, c3;
847 #endif
848 
849   /* S->pState buffer contains previous frame (phaseLen - 1) samples */
850   /* pStateCur points to the location where the new input data should be written */
851   pStateCur = S->pState + (phaseLen - 1U);
852 
853 #if defined (ARM_MATH_LOOPUNROLL)
854 
855   /* Loop unrolling: Compute 4 outputs at a time */
856   blkCnt = blockSize >> 2U;
857 
858   while (blkCnt > 0U)
859   {
860     /* Copy new input sample into the state buffer */
861     *pStateCur++ = *pSrc++;
862     *pStateCur++ = *pSrc++;
863     *pStateCur++ = *pSrc++;
864     *pStateCur++ = *pSrc++;
865 
866     /* Address modifier index of coefficient buffer */
867     j = 1U;
868 
869     /* Loop over the Interpolation factor. */
870     i = (S->L);
871 
872     while (i > 0U)
873     {
874       /* Set accumulator to zero */
875       acc0 = 0.0f;
876       acc1 = 0.0f;
877       acc2 = 0.0f;
878       acc3 = 0.0f;
879 
880       /* Initialize state pointer */
881       ptr1 = pState;
882 
883       /* Initialize coefficient pointer */
884       ptr2 = pCoeffs + (S->L - j);
885 
886       /* Loop over the polyPhase length. Unroll by a factor of 4.
887          Repeat until we've computed numTaps-(4*S->L) coefficients. */
888       tapCnt = phaseLen >> 2U;
889 
890       x0 = *(ptr1++);
891       x1 = *(ptr1++);
892       x2 = *(ptr1++);
893 
894       while (tapCnt > 0U)
895       {
896         /* Read the input sample */
897         x3 = *(ptr1++);
898 
899         /* Read the coefficient */
900         c0 = *(ptr2);
901 
902         /* Perform the multiply-accumulate */
903         acc0 += x0 * c0;
904         acc1 += x1 * c0;
905         acc2 += x2 * c0;
906         acc3 += x3 * c0;
907 
908         /* Read the coefficient */
909         c1 = *(ptr2 + S->L);
910 
911         /* Read the input sample */
912         x0 = *(ptr1++);
913 
914         /* Perform the multiply-accumulate */
915         acc0 += x1 * c1;
916         acc1 += x2 * c1;
917         acc2 += x3 * c1;
918         acc3 += x0 * c1;
919 
920         /* Read the coefficient */
921         c2 = *(ptr2 + S->L * 2);
922 
923         /* Read the input sample */
924         x1 = *(ptr1++);
925 
926         /* Perform the multiply-accumulate */
927         acc0 += x2 * c2;
928         acc1 += x3 * c2;
929         acc2 += x0 * c2;
930         acc3 += x1 * c2;
931 
932         /* Read the coefficient */
933         c3 = *(ptr2 + S->L * 3);
934 
935         /* Read the input sample */
936         x2 = *(ptr1++);
937 
938         /* Perform the multiply-accumulate */
939         acc0 += x3 * c3;
940         acc1 += x0 * c3;
941         acc2 += x1 * c3;
942         acc3 += x2 * c3;
943 
944 
945         /* Upsampling is done by stuffing L-1 zeros between each sample.
946          * So instead of multiplying zeros with coefficients,
947          * Increment the coefficient pointer by interpolation factor times. */
948         ptr2 += 4 * S->L;
949 
950         /* Decrement loop counter */
951         tapCnt--;
952       }
953 
954       /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
955       tapCnt = phaseLen % 0x4U;
956 
957       while (tapCnt > 0U)
958       {
959         /* Read the input sample */
960         x3 = *(ptr1++);
961 
962         /* Read the coefficient */
963         c0 = *(ptr2);
964 
965         /* Perform the multiply-accumulate */
966         acc0 += x0 * c0;
967         acc1 += x1 * c0;
968         acc2 += x2 * c0;
969         acc3 += x3 * c0;
970 
971         /* Increment the coefficient pointer by interpolation factor times. */
972         ptr2 += S->L;
973 
974         /* update states for next sample processing */
975         x0 = x1;
976         x1 = x2;
977         x2 = x3;
978 
979         /* Decrement loop counter */
980         tapCnt--;
981       }
982 
983       /* The result is in the accumulator, store in the destination buffer. */
984       *(pDst           ) = acc0;
985       *(pDst +     S->L) = acc1;
986       *(pDst + 2 * S->L) = acc2;
987       *(pDst + 3 * S->L) = acc3;
988 
989       pDst++;
990 
991       /* Increment the address modifier index of coefficient buffer */
992       j++;
993 
994       /* Decrement loop counter */
995       i--;
996     }
997 
998     /* Advance the state pointer by 1
999      * to process the next group of interpolation factor number samples */
1000     pState = pState + 4;
1001 
1002     pDst += S->L * 3;
1003 
1004     /* Decrement loop counter */
1005     blkCnt--;
1006   }
1007 
1008   /* Loop unrolling: Compute remaining outputs */
1009   blkCnt = blockSize % 0x4U;
1010 
1011 #else
1012 
1013   /* Initialize blkCnt with number of samples */
1014   blkCnt = blockSize;
1015 
1016 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1017 
1018   while (blkCnt > 0U)
1019   {
1020     /* Copy new input sample into the state buffer */
1021     *pStateCur++ = *pSrc++;
1022 
1023     /* Address modifier index of coefficient buffer */
1024     j = 1U;
1025 
1026     /* Loop over the Interpolation factor. */
1027     i = S->L;
1028 
1029     while (i > 0U)
1030     {
1031       /* Set accumulator to zero */
1032       sum0 = 0.0f;
1033 
1034       /* Initialize state pointer */
1035       ptr1 = pState;
1036 
1037       /* Initialize coefficient pointer */
1038       ptr2 = pCoeffs + (S->L - j);
1039 
1040       /* Loop over the polyPhase length.
1041          Repeat until we've computed numTaps-(4*S->L) coefficients. */
1042 
1043 #if defined (ARM_MATH_LOOPUNROLL)
1044 
1045      /* Loop unrolling: Compute 4 outputs at a time */
1046       tapCnt = phaseLen >> 2U;
1047 
1048       while (tapCnt > 0U)
1049       {
1050         /* Perform the multiply-accumulate */
1051         sum0 += *ptr1++ * *ptr2;
1052 
1053         /* Upsampling is done by stuffing L-1 zeros between each sample.
1054          * So instead of multiplying zeros with coefficients,
1055          * Increment the coefficient pointer by interpolation factor times. */
1056         ptr2 += S->L;
1057 
1058         sum0 += *ptr1++ * *ptr2;
1059         ptr2 += S->L;
1060 
1061         sum0 += *ptr1++ * *ptr2;
1062         ptr2 += S->L;
1063 
1064         sum0 += *ptr1++ * *ptr2;
1065         ptr2 += S->L;
1066 
1067         /* Decrement loop counter */
1068         tapCnt--;
1069       }
1070 
1071       /* Loop unrolling: Compute remaining outputs */
1072       tapCnt = phaseLen % 0x4U;
1073 
1074 #else
1075 
1076       /* Initialize tapCnt with number of samples */
1077       tapCnt = phaseLen;
1078 
1079 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1080 
1081       while (tapCnt > 0U)
1082       {
1083         /* Perform the multiply-accumulate */
1084         sum0 += *ptr1++ * *ptr2;
1085 
1086         /* Upsampling is done by stuffing L-1 zeros between each sample.
1087          * So instead of multiplying zeros with coefficients,
1088          * Increment the coefficient pointer by interpolation factor times. */
1089         ptr2 += S->L;
1090 
1091         /* Decrement loop counter */
1092         tapCnt--;
1093       }
1094 
1095       /* The result is in the accumulator, store in the destination buffer. */
1096       *pDst++ = sum0;
1097 
1098       /* Increment the address modifier index of coefficient buffer */
1099       j++;
1100 
1101       /* Decrement the loop counter */
1102       i--;
1103     }
1104 
1105     /* Advance the state pointer by 1
1106      * to process the next group of interpolation factor number samples */
1107     pState = pState + 1;
1108 
1109     /* Decrement the loop counter */
1110     blkCnt--;
1111   }
1112 
1113   /* Processing is complete.
1114      Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
1115      This prepares the state buffer for the next function call. */
1116 
1117   /* Points to the start of the state buffer */
1118   pStateCur = S->pState;
1119 
1120 #if defined (ARM_MATH_LOOPUNROLL)
1121 
1122   /* Loop unrolling: Compute 4 outputs at a time */
1123   tapCnt = (phaseLen - 1U) >> 2U;
1124 
1125   /* copy data */
1126   while (tapCnt > 0U)
1127   {
1128     *pStateCur++ = *pState++;
1129     *pStateCur++ = *pState++;
1130     *pStateCur++ = *pState++;
1131     *pStateCur++ = *pState++;
1132 
1133     /* Decrement loop counter */
1134     tapCnt--;
1135   }
1136 
1137   /* Loop unrolling: Compute remaining outputs */
1138   tapCnt = (phaseLen - 1U) % 0x04U;
1139 
1140 #else
1141 
1142     /* Initialize tapCnt with number of samples */
1143     tapCnt = (phaseLen - 1U);
1144 
1145 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1146 
1147   /* Copy data */
1148   while (tapCnt > 0U)
1149   {
1150     *pStateCur++ = *pState++;
1151 
1152     /* Decrement loop counter */
1153     tapCnt--;
1154   }
1155 
1156 #else
1157 /* alternate version for CM0_FAMILY */
1158 
1159         float32_t *pState = S->pState;                 /* State pointer */
1160   const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
1161         float32_t *pStateCur;                        /* Points to the current sample of the state */
1162         float32_t *ptr1;                               /* Temporary pointer for state buffer */
1163   const float32_t *ptr2;                               /* Temporary pointer for coefficient buffer */
1164         float32_t sum0;                                /* Accumulators */
1165         uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
1166         uint32_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
1167 
1168   /* S->pState buffer contains previous frame (phaseLen - 1) samples */
1169   /* pStateCur points to the location where the new input data should be written */
1170   pStateCur = S->pState + (phaseLen - 1U);
1171 
1172   /* Total number of intput samples */
1173   blkCnt = blockSize;
1174 
1175   /* Loop over the blockSize. */
1176   while (blkCnt > 0U)
1177   {
1178     /* Copy new input sample into the state buffer */
1179     *pStateCur++ = *pSrc++;
1180 
1181     /* Loop over the Interpolation factor. */
1182     i = S->L;
1183 
1184     while (i > 0U)
1185     {
1186       /* Set accumulator to zero */
1187       sum0 = 0.0f;
1188 
1189       /* Initialize state pointer */
1190       ptr1 = pState;
1191 
1192       /* Initialize coefficient pointer */
1193       ptr2 = pCoeffs + (i - 1U);
1194 
1195       /* Loop over the polyPhase length */
1196       tapCnt = phaseLen;
1197 
1198       while (tapCnt > 0U)
1199       {
1200         /* Perform the multiply-accumulate */
1201         sum0 += *ptr1++ * *ptr2;
1202 
1203         /* Increment the coefficient pointer by interpolation factor times. */
1204         ptr2 += S->L;
1205 
1206         /* Decrement the loop counter */
1207         tapCnt--;
1208       }
1209 
1210       /* The result is in the accumulator, store in the destination buffer. */
1211       *pDst++ = sum0;
1212 
1213       /* Decrement loop counter */
1214       i--;
1215     }
1216 
1217     /* Advance the state pointer by 1
1218      * to process the next group of interpolation factor number samples */
1219     pState = pState + 1;
1220 
1221     /* Decrement loop counter */
1222     blkCnt--;
1223   }
1224 
1225   /* Processing is complete.
1226    ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
1227    ** This prepares the state buffer for the next function call. */
1228 
1229   /* Points to the start of the state buffer */
1230   pStateCur = S->pState;
1231 
1232   tapCnt = phaseLen - 1U;
1233 
1234   /* Copy data */
1235   while (tapCnt > 0U)
1236   {
1237     *pStateCur++ = *pState++;
1238 
1239     /* Decrement loop counter */
1240     tapCnt--;
1241   }
1242 
1243 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
1244 
1245 }
1246 
1247 #endif /* #if defined(ARM_MATH_NEON) */
1248 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1249 /**
1250   @} end of FIR_Interpolate group
1251  */
1252