1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_interpolate_f32.c
4 * Description: Floating-point FIR interpolation sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
33
34 These functions combine an upsampler (zero stuffer) and an FIR filter.
35 They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images.
36 Conceptually, the functions are equivalent to the block diagram below:
37 \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions"
38 After upsampling by a factor of <code>L</code>, the signal should be filtered by a lowpass filter with a normalized
39 cutoff frequency of <code>1/L</code> in order to eliminate high frequency copies of the spectrum.
40 The user of the function is responsible for providing the filter coefficients.
41
42 The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner.
43 The upsampler inserts <code>L-1</code> zeros between each sample.
44 Instead of multiplying by these zero values, the FIR filter is designed to skip them.
45 This leads to an efficient implementation without any wasted effort.
46 The functions operate on blocks of input and output data.
47 <code>pSrc</code> points to an array of <code>blockSize</code> input values and
48 <code>pDst</code> points to an array of <code>blockSize*L</code> output values.
49
50 The library provides separate functions for Q15, Q31, and floating-point data types.
51
52 @par Algorithm
53 The functions use a polyphase filter structure:
54 <pre>
55 y[n] = b[0] * x[n] + b[L] * x[n-1] + ... + b[L*(phaseLength-1)] * x[n-phaseLength+1]
56 y[n+1] = b[1] * x[n] + b[L+1] * x[n-1] + ... + b[L*(phaseLength-1)+1] * x[n-phaseLength+1]
57 ...
58 y[n+(L-1)] = b[L-1] * x[n] + b[2*L-1] * x[n-1] + ....+ b[L*(phaseLength-1)+(L-1)] * x[n-phaseLength+1]
59 </pre>
60 This approach is more efficient than straightforward upsample-then-filter algorithms.
61 With this method the computation is reduced by a factor of <code>1/L</code> when compared to using a standard FIR filter.
62 @par
63 <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
64 <code>numTaps</code> must be a multiple of the interpolation factor <code>L</code> and this is checked by the
65 initialization functions.
66 Internally, the function divides the FIR filter's impulse response into shorter filters of length
67 <code>phaseLength=numTaps/L</code>.
68 Coefficients are stored in time reversed order.
69 <pre>
70 {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
71 </pre>
72 @par
73 <code>pState</code> points to a state array of size <code>blockSize + phaseLength - 1</code>.
74 Samples in the state buffer are stored in the order:
75 <pre>
76 {x[n-phaseLength+1], x[n-phaseLength], x[n-phaseLength-1], x[n-phaseLength-2]....x[0], x[1], ..., x[blockSize-1]}
77 </pre>
78 @par
79 The state variables are updated after each block of data is processed, the coefficients are untouched.
80
81 @par Instance Structure
82 The coefficients and state variables for a filter are stored together in an instance data structure.
83 A separate instance structure must be defined for each filter.
84 Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
85 There are separate instance structure declarations for each of the 3 supported data types.
86
87 @par Initialization Functions
88 There is also an associated initialization function for each data type.
89 The initialization function performs the following operations:
90 - Sets the values of the internal structure fields.
91 - Zeros out the values in the state buffer.
92 - Checks to make sure that the length of the filter is a multiple of the interpolation factor.
93 To do this manually without calling the init function, assign the follow subfields of the instance structure:
94 L (interpolation factor), pCoeffs, phaseLength (numTaps / L), pState. Also set all of the values in pState to zero.
95 @par
96 Use of the initialization function is optional.
97 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
98 To place an instance structure into a const data section, the instance structure must be manually initialized.
99 The code below statically initializes each of the 3 different data type filter instance structures
100 <pre>
101 arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};
102 arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};
103 arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};
104 </pre>
105 @par
106 where <code>L</code> is the interpolation factor; <code>phaseLength=numTaps/L</code> is the
107 length of each of the shorter FIR filters used internally,
108 <code>pCoeffs</code> is the address of the coefficient buffer;
109 <code>pState</code> is the address of the state buffer.
110 Be sure to set the values in the state buffer to zeros when doing static initialization.
111
112 @par Fixed-Point Behavior
113 Care must be taken when using the fixed-point versions of the FIR interpolate filter functions.
114 In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
115 Refer to the function specific documentation below for usage guidelines.
116 */
117
118 /**
119 @addtogroup FIR_Interpolate
120 @{
121 */
122
123 /**
124 @brief Processing function for floating-point FIR interpolator.
125 @param[in] S points to an instance of the floating-point FIR interpolator structure
126 @param[in] pSrc points to the block of input data
127 @param[out] pDst points to the block of output data
128 @param[in] blockSize number of input samples to process
129 */
130
131 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
132
133 #include "arm_helium_utils.h"
134
arm_fir_interpolate2_f32_mve(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)135 static void arm_fir_interpolate2_f32_mve(
136 const arm_fir_interpolate_instance_f32 * S,
137 const float32_t * pSrc,
138 float32_t * pDst,
139 uint32_t blockSize)
140 {
141 float32_t *pState = S->pState; /* State pointer */
142 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
143 float32_t *pStateCurnt; /* Points to the current sample of the state */
144 const float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
145 uint32_t tapCnt;
146 uint32_t blkCnt; /* Loop counters */
147 uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
148 uint32_t strides[4] = { 0, 1 * 2, 2 * 2, 3 * 2 };
149 uint32x4_t vec_strides0 = vld1q_u32(strides);
150 uint32x4_t vec_strides1 = vec_strides0 + 1;
151 f32x4_t acc0, acc1;
152
153 /*
154 * S->pState buffer contains previous frame (phaseLen - 1) samples
155 * pStateCurnt points to the location where the new input data should be written
156 */
157 pStateCurnt = S->pState + (phaseLen - 1U);
158 /*
159 * Total number of intput samples
160 */
161 blkCnt = blockSize;
162 /*
163 * Loop over the blockSize.
164 */
165 while (blkCnt > 0U)
166 {
167 /*
168 * Copy new input sample into the state buffer
169 */
170 *pStateCurnt++ = *pSrc++;
171 /*
172 * Initialize state pointer
173 */
174 ptr1 = pState;
175
176 acc0 = vdupq_n_f32(0.0f);
177 acc1 = vdupq_n_f32(0.0f);
178 /*
179 * Initialize coefficient pointer
180 */
181 ptr2 = pCoeffs;
182
183 tapCnt = phaseLen >> 2;
184 while (tapCnt > 0U)
185 {
186 f32x4_t vecCoef, vecState;
187
188 vecState = vldrwq_f32(ptr1);
189
190 vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides1);
191 acc1 = vfmaq_f32(acc1, vecState, vecCoef);
192
193 vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides0);
194 acc0 = vfmaq_f32(acc0, vecState, vecCoef);
195
196 ptr2 += 4 * 2;
197 ptr1 += 4;
198 /*
199 * Decrement the loop counter
200 */
201 tapCnt--;
202 }
203
204 tapCnt = phaseLen & 3;
205 if (tapCnt > 0U)
206 {
207 mve_pred16_t p0 = vctp32q(tapCnt);
208 f32x4_t vecCoef, vecState;
209
210 vecState = vldrwq_z_f32(ptr1, p0);
211
212 vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides1, p0);
213 acc1 = vfmaq_f32(acc1, vecState, vecCoef);
214 vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides0, p0);
215 acc0 = vfmaq_f32(acc0, vecState, vecCoef);
216
217 }
218 *pDst++ = vecAddAcrossF32Mve(acc1);
219 *pDst++ = vecAddAcrossF32Mve(acc0);
220
221 /*
222 * Advance the state pointer by 1
223 * * to process the next group of interpolation factor number samples
224 */
225 pState = pState + 1;
226 /*
227 * Decrement the loop counter
228 */
229 blkCnt--;
230 }
231
232 /*
233 * Processing is complete.
234 * ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
235 * ** This prepares the state buffer for the next function call.
236 */
237
238 /*
239 * Points to the start of the state buffer
240 */
241 pStateCurnt = S->pState;
242 blkCnt = (phaseLen - 1U) >> 2;
243 while (blkCnt > 0U)
244 {
245 vst1q(pStateCurnt, vldrwq_f32(pState));
246 pState += 4;
247 pStateCurnt += 4;
248 blkCnt--;
249 }
250 blkCnt = (phaseLen - 1U) & 3;
251 if (blkCnt > 0U)
252 {
253 mve_pred16_t p0 = vctp32q(blkCnt);
254 vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
255 }
256 }
257
arm_fir_interpolate_f32(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)258 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_f32(
259 const arm_fir_interpolate_instance_f32 * S,
260 const float32_t * pSrc,
261 float32_t * pDst,
262 uint32_t blockSize)
263 {
264 float32_t *pState = S->pState; /* State pointer */
265 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
266 float32_t *pStateCurnt; /* Points to the current sample of the state */
267 const float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
268 uint32_t tapCnt;
269 uint32_t i, blkCnt; /* Loop counters */
270 uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
271 uint32_t strides[4] = { 0, 1 * S->L, 2 * S->L, 3 * S->L };
272 uint32_t stridesM[4] = { 4, 3, 2, 1 };
273 uint32x4_t vec_stridesM = vld1q_u32(stridesM);
274 uint32x4_t vec_strides = vld1q_u32(strides);
275 f32x4_t acc;
276
277
278 if ( S->L == 2 ) {
279 arm_fir_interpolate2_f32_mve(S, pSrc, pDst, blockSize);
280 return;
281 }
282
283 /*
284 * S->pState buffer contains previous frame (phaseLen - 1) samples
285 */
286 /*
287 * pStateCurnt points to the location where the new input data should be written
288 */
289 pStateCurnt = S->pState + (phaseLen - 1U);
290 /*
291 * Total number of intput samples
292 */
293 blkCnt = blockSize;
294 /*
295 * Loop over the blockSize.
296 */
297 while (blkCnt > 0U)
298 {
299 /*
300 * Copy new input sample into the state buffer
301 */
302 *pStateCurnt++ = *pSrc++;
303 /*
304 * Loop over the Interpolation factor.
305 */
306 i = S->L;
307 while (i > 0U)
308 {
309 /*
310 * Initialize state pointer
311 */
312 ptr1 = pState;
313 if (i >= 4)
314 {
315 float32_t state0, state1, state2, state3;
316 acc = vdupq_n_f32(0.0f);
317 /*
318 * Initialize coefficient pointer
319 */
320 ptr2 = pCoeffs + (i - 1U) - 4;
321 tapCnt = phaseLen >> 2;
322 while (tapCnt > 0U)
323 {
324 f32x4_t vecCoef;
325 const float32_t *pCoef = ptr2;
326
327 state0 = ptr1[0];
328 state1 = ptr1[1];
329 state2 = ptr1[2];
330 state3 = ptr1[3];
331 ptr1 += 4;
332
333 vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
334 pCoef += S->L;
335 acc = vfmaq_n_f32(acc, vecCoef, state0);
336
337 vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
338 pCoef += S->L;
339 acc = vfmaq_n_f32(acc, vecCoef, state1);
340
341 vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
342 pCoef += S->L;
343 acc = vfmaq_n_f32(acc, vecCoef, state2);
344
345 vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
346 pCoef += S->L;
347 acc = vfmaq_n_f32(acc, vecCoef, state3);
348
349 ptr2 = ptr2 + 4 * S->L;
350 /*
351 * Decrement the loop counter
352 */
353 tapCnt--;
354 }
355
356 tapCnt = phaseLen & 3;
357 if (tapCnt > 0U)
358 {
359 mve_pred16_t p0 = vctp32q(tapCnt);
360 f32x4_t vecCoef;
361 const float32_t *pCoef = ptr2;
362
363 state0 = ptr1[0];
364 state1 = ptr1[1];
365 state2 = ptr1[2];
366 state3 = ptr1[3];
367
368 vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
369 pCoef += S->L;
370 acc = vfmaq_n_f32(acc, vecCoef, state0);
371
372 vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
373 pCoef += S->L;
374 acc = vfmaq_n_f32(acc, vecCoef, state1);
375
376 vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
377 pCoef += S->L;
378 acc = vfmaq_n_f32(acc, vecCoef, state2);
379
380 vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
381 pCoef += S->L;
382 acc = vfmaq_n_f32(acc, vecCoef, state3);
383 }
384
385 vst1q(pDst, acc);
386 pDst += 4;
387 i -= 4;
388 }
389 else
390 {
391 acc = vdupq_n_f32(0.0f);
392 /*
393 * Initialize coefficient pointer
394 */
395 ptr2 = pCoeffs + (i - 1U);
396
397 tapCnt = phaseLen >> 2;
398 while (tapCnt > 0U)
399 {
400 f32x4_t vecCoef, vecState;
401
402 vecState = vldrwq_f32(ptr1);
403 ptr1 += 4;
404
405 vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides);
406 ptr2 += 4 * S->L;
407 acc = vfmaq_f32(acc, vecState, vecCoef);
408 /*
409 * Decrement the loop counter
410 */
411 tapCnt--;
412 }
413
414 tapCnt = phaseLen & 3;
415 if (tapCnt > 0U)
416 {
417 mve_pred16_t p0 = vctp32q(tapCnt);
418 f32x4_t vecCoef, vecState;
419
420 vecState = vldrwq_z_f32(ptr1, p0);
421
422 vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides, p0);
423 acc = vfmaq_f32(acc, vecState, vecCoef);
424 }
425 *pDst++ = vecAddAcrossF32Mve(acc);
426 /*
427 * Decrement the loop counter
428 */
429 i--;
430 }
431 }
432
433 /*
434 * Advance the state pointer by 1
435 * * to process the next group of interpolation factor number samples
436 */
437 pState = pState + 1;
438 /*
439 * Decrement the loop counter
440 */
441 blkCnt--;
442 }
443
444 /*
445 * Processing is complete.
446 * ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
447 * ** This prepares the state buffer for the next function call.
448 */
449
450 /*
451 * Points to the start of the state buffer
452 */
453 pStateCurnt = S->pState;
454 blkCnt = (phaseLen - 1U) >> 2;
455 while (blkCnt > 0U)
456 {
457 vst1q(pStateCurnt, vldrwq_f32(pState));
458 pState += 4;
459 pStateCurnt += 4;
460 blkCnt--;
461 }
462 blkCnt = (phaseLen - 1U) & 3;
463 if (blkCnt > 0U)
464 {
465 mve_pred16_t p0 = vctp32q(blkCnt);
466 vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
467 }
468 }
469
470 #else
471 #if defined(ARM_MATH_NEON)
arm_fir_interpolate_f32(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)472 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_f32(
473 const arm_fir_interpolate_instance_f32 * S,
474 const float32_t * pSrc,
475 float32_t * pDst,
476 uint32_t blockSize)
477 {
478 float32_t *pState = S->pState; /* State pointer */
479 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
480 float32_t *pStateCurnt; /* Points to the current sample of the state */
481 float32_t *ptr1; /* Temporary pointers for state buffer */
482 const float32_t *ptr2; /* Temporary pointers for coefficient buffer */
483 float32_t sum0; /* Accumulators */
484 float32_t c0; /* Temporary variables to hold state and coefficient values */
485 uint32_t i, blkCnt, j; /* Loop counters */
486 uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
487 uint32_t blkCntN4;
488 float32_t c1, c2, c3;
489
490 float32x4_t sum0v;
491 float32x4_t accV0,accV1;
492 float32x4_t x0v,x1v,x2v,xa,xb;
493 float32x2_t tempV;
494
495 /* S->pState buffer contains previous frame (phaseLen - 1) samples */
496 /* pStateCurnt points to the location where the new input data should be written */
497 pStateCurnt = S->pState + (phaseLen - 1U);
498
499 /* Initialise blkCnt */
500 blkCnt = blockSize >> 3;
501 blkCntN4 = blockSize & 7;
502
503 /* Loop unrolling */
504 while (blkCnt > 0U)
505 {
506 /* Copy new input samples into the state buffer */
507 sum0v = vld1q_f32(pSrc);
508 vst1q_f32(pStateCurnt,sum0v);
509 pSrc += 4;
510 pStateCurnt += 4;
511
512 sum0v = vld1q_f32(pSrc);
513 vst1q_f32(pStateCurnt,sum0v);
514 pSrc += 4;
515 pStateCurnt += 4;
516
517 /* Address modifier index of coefficient buffer */
518 j = 1U;
519
520 /* Loop over the Interpolation factor. */
521 i = (S->L);
522
523 while (i > 0U)
524 {
525 /* Set accumulator to zero */
526 accV0 = vdupq_n_f32(0.0);
527 accV1 = vdupq_n_f32(0.0);
528
529 /* Initialize state pointer */
530 ptr1 = pState;
531
532 /* Initialize coefficient pointer */
533 ptr2 = pCoeffs + (S->L - j);
534
535 /* Loop over the polyPhase length. Unroll by a factor of 4.
536 ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
537 tapCnt = phaseLen >> 2U;
538
539 x0v = vld1q_f32(ptr1);
540 x1v = vld1q_f32(ptr1 + 4);
541
542 while (tapCnt > 0U)
543 {
544 /* Read the input samples */
545 x2v = vld1q_f32(ptr1 + 8);
546
547 /* Read the coefficients */
548 c0 = *(ptr2);
549
550 /* Perform the multiply-accumulate */
551 accV0 = vmlaq_n_f32(accV0,x0v,c0);
552 accV1 = vmlaq_n_f32(accV1,x1v,c0);
553
554 /* Read the coefficients, inputs and perform multiply-accumulate */
555 c1 = *(ptr2 + S->L);
556
557 xa = vextq_f32(x0v,x1v,1);
558 xb = vextq_f32(x1v,x2v,1);
559
560 accV0 = vmlaq_n_f32(accV0,xa,c1);
561 accV1 = vmlaq_n_f32(accV1,xb,c1);
562
563 /* Read the coefficients, inputs and perform multiply-accumulate */
564 c2 = *(ptr2 + S->L * 2);
565
566 xa = vextq_f32(x0v,x1v,2);
567 xb = vextq_f32(x1v,x2v,2);
568
569 accV0 = vmlaq_n_f32(accV0,xa,c2);
570 accV1 = vmlaq_n_f32(accV1,xb,c2);
571
572 /* Read the coefficients, inputs and perform multiply-accumulate */
573 c3 = *(ptr2 + S->L * 3);
574
575 xa = vextq_f32(x0v,x1v,3);
576 xb = vextq_f32(x1v,x2v,3);
577
578 accV0 = vmlaq_n_f32(accV0,xa,c3);
579 accV1 = vmlaq_n_f32(accV1,xb,c3);
580
581 /* Upsampling is done by stuffing L-1 zeros between each sample.
582 * So instead of multiplying zeros with coefficients,
583 * Increment the coefficient pointer by interpolation factor times. */
584 ptr2 += 4 * S->L;
585 ptr1 += 4;
586 x0v = x1v;
587 x1v = x2v;
588
589 /* Decrement the loop counter */
590 tapCnt--;
591 }
592
593 /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
594 tapCnt = phaseLen % 0x4U;
595
596 x2v = vld1q_f32(ptr1 + 8);
597
598 switch (tapCnt)
599 {
600 case 3:
601 c0 = *(ptr2);
602 accV0 = vmlaq_n_f32(accV0,x0v,c0);
603 accV1 = vmlaq_n_f32(accV1,x1v,c0);
604 ptr2 += S->L;
605
606 c0 = *(ptr2);
607
608 xa = vextq_f32(x0v,x1v,1);
609 xb = vextq_f32(x1v,x2v,1);
610
611 accV0 = vmlaq_n_f32(accV0,xa,c0);
612 accV1 = vmlaq_n_f32(accV1,xb,c0);
613 ptr2 += S->L;
614
615 c0 = *(ptr2);
616
617 xa = vextq_f32(x0v,x1v,2);
618 xb = vextq_f32(x1v,x2v,2);
619
620 accV0 = vmlaq_n_f32(accV0,xa,c0);
621 accV1 = vmlaq_n_f32(accV1,xb,c0);
622 ptr2 += S->L;
623
624 break;
625
626 case 2:
627 c0 = *(ptr2);
628 accV0 = vmlaq_n_f32(accV0,x0v,c0);
629 accV1 = vmlaq_n_f32(accV1,x1v,c0);
630 ptr2 += S->L;
631
632 c0 = *(ptr2);
633
634 xa = vextq_f32(x0v,x1v,1);
635 xb = vextq_f32(x1v,x2v,1);
636
637 accV0 = vmlaq_n_f32(accV0,xa,c0);
638 accV1 = vmlaq_n_f32(accV1,xb,c0);
639 ptr2 += S->L;
640
641 break;
642
643 case 1:
644 c0 = *(ptr2);
645 accV0 = vmlaq_n_f32(accV0,x0v,c0);
646 accV1 = vmlaq_n_f32(accV1,x1v,c0);
647 ptr2 += S->L;
648
649 break;
650
651 default:
652 break;
653
654 }
655
656 /* The result is in the accumulator, store in the destination buffer. */
657 *pDst = vgetq_lane_f32(accV0, 0);
658 *(pDst + S->L) = vgetq_lane_f32(accV0, 1);
659 *(pDst + 2 * S->L) = vgetq_lane_f32(accV0, 2);
660 *(pDst + 3 * S->L) = vgetq_lane_f32(accV0, 3);
661
662 *(pDst + 4 * S->L) = vgetq_lane_f32(accV1, 0);
663 *(pDst + 5 * S->L) = vgetq_lane_f32(accV1, 1);
664 *(pDst + 6 * S->L) = vgetq_lane_f32(accV1, 2);
665 *(pDst + 7 * S->L) = vgetq_lane_f32(accV1, 3);
666
667 pDst++;
668
669 /* Increment the address modifier index of coefficient buffer */
670 j++;
671
672 /* Decrement the loop counter */
673 i--;
674 }
675
676 /* Advance the state pointer by 1
677 * to process the next group of interpolation factor number samples */
678 pState = pState + 8;
679
680 pDst += S->L * 7;
681
682 /* Decrement the loop counter */
683 blkCnt--;
684 }
685
686 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
687 ** No loop unrolling is used. */
688
689 while (blkCntN4 > 0U)
690 {
691 /* Copy new input sample into the state buffer */
692 *pStateCurnt++ = *pSrc++;
693
694 /* Address modifier index of coefficient buffer */
695 j = 1U;
696
697 /* Loop over the Interpolation factor. */
698 i = S->L;
699
700 while (i > 0U)
701 {
702 /* Set accumulator to zero */
703 sum0v = vdupq_n_f32(0.0);
704
705 /* Initialize state pointer */
706 ptr1 = pState;
707
708 /* Initialize coefficient pointer */
709 ptr2 = pCoeffs + (S->L - j);
710
711 /* Loop over the polyPhase length. Unroll by a factor of 4.
712 ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
713 tapCnt = phaseLen >> 2U;
714
715 while (tapCnt > 0U)
716 {
717 /* Read the coefficient */
718 x1v = vsetq_lane_f32(*(ptr2),x1v,0);
719
720 /* Upsampling is done by stuffing L-1 zeros between each sample.
721 * So instead of multiplying zeros with coefficients,
722 * Increment the coefficient pointer by interpolation factor times. */
723 ptr2 += S->L;
724
725 /* Read the input sample */
726 x0v = vld1q_f32(ptr1);
727 ptr1 += 4;
728
729 /* Read the coefficient */
730 x1v = vsetq_lane_f32(*(ptr2),x1v,1);
731
732 /* Increment the coefficient pointer by interpolation factor times. */
733 ptr2 += S->L;
734
735 /* Read the coefficient */
736 x1v = vsetq_lane_f32(*(ptr2),x1v,2);
737
738 /* Increment the coefficient pointer by interpolation factor times. */
739 ptr2 += S->L;
740
741 /* Read the coefficient */
742 x1v = vsetq_lane_f32(*(ptr2),x1v,3);
743
744 /* Increment the coefficient pointer by interpolation factor times. */
745 ptr2 += S->L;
746
747 sum0v = vmlaq_f32(sum0v,x0v,x1v);
748
749 /* Decrement the loop counter */
750 tapCnt--;
751 }
752
753 tempV = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
754 sum0 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
755
756 /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
757 tapCnt = phaseLen % 0x4U;
758
759 while (tapCnt > 0U)
760 {
761 /* Perform the multiply-accumulate */
762 sum0 += *(ptr1++) * (*ptr2);
763
764 /* Increment the coefficient pointer by interpolation factor times. */
765 ptr2 += S->L;
766
767 /* Decrement the loop counter */
768 tapCnt--;
769 }
770
771 /* The result is in the accumulator, store in the destination buffer. */
772 *pDst++ = sum0;
773
774 /* Increment the address modifier index of coefficient buffer */
775 j++;
776
777 /* Decrement the loop counter */
778 i--;
779 }
780
781 /* Advance the state pointer by 1
782 * to process the next group of interpolation factor number samples */
783 pState = pState + 1;
784
785 /* Decrement the loop counter */
786 blkCntN4--;
787 }
788
789 /* Processing is complete.
790 ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
791 ** This prepares the state buffer for the next function call. */
792
793 /* Points to the start of the state buffer */
794 pStateCurnt = S->pState;
795
796 tapCnt = (phaseLen - 1U) >> 2U;
797
798 /* Copy data */
799 while (tapCnt > 0U)
800 {
801 sum0v = vld1q_f32(pState);
802 vst1q_f32(pStateCurnt,sum0v);
803 pState += 4;
804 pStateCurnt += 4;
805
806 /* Decrement the loop counter */
807 tapCnt--;
808 }
809
810 tapCnt = (phaseLen - 1U) % 0x04U;
811
812 /* copy data */
813 while (tapCnt > 0U)
814 {
815 *pStateCurnt++ = *pState++;
816
817 /* Decrement the loop counter */
818 tapCnt--;
819 }
820
821 }
822 #else
823
arm_fir_interpolate_f32(const arm_fir_interpolate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)824 ARM_DSP_ATTRIBUTE void arm_fir_interpolate_f32(
825 const arm_fir_interpolate_instance_f32 * S,
826 const float32_t * pSrc,
827 float32_t * pDst,
828 uint32_t blockSize)
829 {
830 #if (1)
831 //#if !defined(ARM_MATH_CM0_FAMILY)
832
833 float32_t *pState = S->pState; /* State pointer */
834 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
835 float32_t *pStateCur; /* Points to the current sample of the state */
836 float32_t *ptr1; /* Temporary pointer for state buffer */
837 const float32_t *ptr2; /* Temporary pointer for coefficient buffer */
838 float32_t sum0; /* Accumulators */
839 uint32_t i, blkCnt, tapCnt; /* Loop counters */
840 uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
841 uint32_t j;
842
843 #if defined (ARM_MATH_LOOPUNROLL)
844 float32_t acc0, acc1, acc2, acc3;
845 float32_t x0, x1, x2, x3;
846 float32_t c0, c1, c2, c3;
847 #endif
848
849 /* S->pState buffer contains previous frame (phaseLen - 1) samples */
850 /* pStateCur points to the location where the new input data should be written */
851 pStateCur = S->pState + (phaseLen - 1U);
852
853 #if defined (ARM_MATH_LOOPUNROLL)
854
855 /* Loop unrolling: Compute 4 outputs at a time */
856 blkCnt = blockSize >> 2U;
857
858 while (blkCnt > 0U)
859 {
860 /* Copy new input sample into the state buffer */
861 *pStateCur++ = *pSrc++;
862 *pStateCur++ = *pSrc++;
863 *pStateCur++ = *pSrc++;
864 *pStateCur++ = *pSrc++;
865
866 /* Address modifier index of coefficient buffer */
867 j = 1U;
868
869 /* Loop over the Interpolation factor. */
870 i = (S->L);
871
872 while (i > 0U)
873 {
874 /* Set accumulator to zero */
875 acc0 = 0.0f;
876 acc1 = 0.0f;
877 acc2 = 0.0f;
878 acc3 = 0.0f;
879
880 /* Initialize state pointer */
881 ptr1 = pState;
882
883 /* Initialize coefficient pointer */
884 ptr2 = pCoeffs + (S->L - j);
885
886 /* Loop over the polyPhase length. Unroll by a factor of 4.
887 Repeat until we've computed numTaps-(4*S->L) coefficients. */
888 tapCnt = phaseLen >> 2U;
889
890 x0 = *(ptr1++);
891 x1 = *(ptr1++);
892 x2 = *(ptr1++);
893
894 while (tapCnt > 0U)
895 {
896 /* Read the input sample */
897 x3 = *(ptr1++);
898
899 /* Read the coefficient */
900 c0 = *(ptr2);
901
902 /* Perform the multiply-accumulate */
903 acc0 += x0 * c0;
904 acc1 += x1 * c0;
905 acc2 += x2 * c0;
906 acc3 += x3 * c0;
907
908 /* Read the coefficient */
909 c1 = *(ptr2 + S->L);
910
911 /* Read the input sample */
912 x0 = *(ptr1++);
913
914 /* Perform the multiply-accumulate */
915 acc0 += x1 * c1;
916 acc1 += x2 * c1;
917 acc2 += x3 * c1;
918 acc3 += x0 * c1;
919
920 /* Read the coefficient */
921 c2 = *(ptr2 + S->L * 2);
922
923 /* Read the input sample */
924 x1 = *(ptr1++);
925
926 /* Perform the multiply-accumulate */
927 acc0 += x2 * c2;
928 acc1 += x3 * c2;
929 acc2 += x0 * c2;
930 acc3 += x1 * c2;
931
932 /* Read the coefficient */
933 c3 = *(ptr2 + S->L * 3);
934
935 /* Read the input sample */
936 x2 = *(ptr1++);
937
938 /* Perform the multiply-accumulate */
939 acc0 += x3 * c3;
940 acc1 += x0 * c3;
941 acc2 += x1 * c3;
942 acc3 += x2 * c3;
943
944
945 /* Upsampling is done by stuffing L-1 zeros between each sample.
946 * So instead of multiplying zeros with coefficients,
947 * Increment the coefficient pointer by interpolation factor times. */
948 ptr2 += 4 * S->L;
949
950 /* Decrement loop counter */
951 tapCnt--;
952 }
953
954 /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
955 tapCnt = phaseLen % 0x4U;
956
957 while (tapCnt > 0U)
958 {
959 /* Read the input sample */
960 x3 = *(ptr1++);
961
962 /* Read the coefficient */
963 c0 = *(ptr2);
964
965 /* Perform the multiply-accumulate */
966 acc0 += x0 * c0;
967 acc1 += x1 * c0;
968 acc2 += x2 * c0;
969 acc3 += x3 * c0;
970
971 /* Increment the coefficient pointer by interpolation factor times. */
972 ptr2 += S->L;
973
974 /* update states for next sample processing */
975 x0 = x1;
976 x1 = x2;
977 x2 = x3;
978
979 /* Decrement loop counter */
980 tapCnt--;
981 }
982
983 /* The result is in the accumulator, store in the destination buffer. */
984 *(pDst ) = acc0;
985 *(pDst + S->L) = acc1;
986 *(pDst + 2 * S->L) = acc2;
987 *(pDst + 3 * S->L) = acc3;
988
989 pDst++;
990
991 /* Increment the address modifier index of coefficient buffer */
992 j++;
993
994 /* Decrement loop counter */
995 i--;
996 }
997
998 /* Advance the state pointer by 1
999 * to process the next group of interpolation factor number samples */
1000 pState = pState + 4;
1001
1002 pDst += S->L * 3;
1003
1004 /* Decrement loop counter */
1005 blkCnt--;
1006 }
1007
1008 /* Loop unrolling: Compute remaining outputs */
1009 blkCnt = blockSize % 0x4U;
1010
1011 #else
1012
1013 /* Initialize blkCnt with number of samples */
1014 blkCnt = blockSize;
1015
1016 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1017
1018 while (blkCnt > 0U)
1019 {
1020 /* Copy new input sample into the state buffer */
1021 *pStateCur++ = *pSrc++;
1022
1023 /* Address modifier index of coefficient buffer */
1024 j = 1U;
1025
1026 /* Loop over the Interpolation factor. */
1027 i = S->L;
1028
1029 while (i > 0U)
1030 {
1031 /* Set accumulator to zero */
1032 sum0 = 0.0f;
1033
1034 /* Initialize state pointer */
1035 ptr1 = pState;
1036
1037 /* Initialize coefficient pointer */
1038 ptr2 = pCoeffs + (S->L - j);
1039
1040 /* Loop over the polyPhase length.
1041 Repeat until we've computed numTaps-(4*S->L) coefficients. */
1042
1043 #if defined (ARM_MATH_LOOPUNROLL)
1044
1045 /* Loop unrolling: Compute 4 outputs at a time */
1046 tapCnt = phaseLen >> 2U;
1047
1048 while (tapCnt > 0U)
1049 {
1050 /* Perform the multiply-accumulate */
1051 sum0 += *ptr1++ * *ptr2;
1052
1053 /* Upsampling is done by stuffing L-1 zeros between each sample.
1054 * So instead of multiplying zeros with coefficients,
1055 * Increment the coefficient pointer by interpolation factor times. */
1056 ptr2 += S->L;
1057
1058 sum0 += *ptr1++ * *ptr2;
1059 ptr2 += S->L;
1060
1061 sum0 += *ptr1++ * *ptr2;
1062 ptr2 += S->L;
1063
1064 sum0 += *ptr1++ * *ptr2;
1065 ptr2 += S->L;
1066
1067 /* Decrement loop counter */
1068 tapCnt--;
1069 }
1070
1071 /* Loop unrolling: Compute remaining outputs */
1072 tapCnt = phaseLen % 0x4U;
1073
1074 #else
1075
1076 /* Initialize tapCnt with number of samples */
1077 tapCnt = phaseLen;
1078
1079 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1080
1081 while (tapCnt > 0U)
1082 {
1083 /* Perform the multiply-accumulate */
1084 sum0 += *ptr1++ * *ptr2;
1085
1086 /* Upsampling is done by stuffing L-1 zeros between each sample.
1087 * So instead of multiplying zeros with coefficients,
1088 * Increment the coefficient pointer by interpolation factor times. */
1089 ptr2 += S->L;
1090
1091 /* Decrement loop counter */
1092 tapCnt--;
1093 }
1094
1095 /* The result is in the accumulator, store in the destination buffer. */
1096 *pDst++ = sum0;
1097
1098 /* Increment the address modifier index of coefficient buffer */
1099 j++;
1100
1101 /* Decrement the loop counter */
1102 i--;
1103 }
1104
1105 /* Advance the state pointer by 1
1106 * to process the next group of interpolation factor number samples */
1107 pState = pState + 1;
1108
1109 /* Decrement the loop counter */
1110 blkCnt--;
1111 }
1112
1113 /* Processing is complete.
1114 Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
1115 This prepares the state buffer for the next function call. */
1116
1117 /* Points to the start of the state buffer */
1118 pStateCur = S->pState;
1119
1120 #if defined (ARM_MATH_LOOPUNROLL)
1121
1122 /* Loop unrolling: Compute 4 outputs at a time */
1123 tapCnt = (phaseLen - 1U) >> 2U;
1124
1125 /* copy data */
1126 while (tapCnt > 0U)
1127 {
1128 *pStateCur++ = *pState++;
1129 *pStateCur++ = *pState++;
1130 *pStateCur++ = *pState++;
1131 *pStateCur++ = *pState++;
1132
1133 /* Decrement loop counter */
1134 tapCnt--;
1135 }
1136
1137 /* Loop unrolling: Compute remaining outputs */
1138 tapCnt = (phaseLen - 1U) % 0x04U;
1139
1140 #else
1141
1142 /* Initialize tapCnt with number of samples */
1143 tapCnt = (phaseLen - 1U);
1144
1145 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1146
1147 /* Copy data */
1148 while (tapCnt > 0U)
1149 {
1150 *pStateCur++ = *pState++;
1151
1152 /* Decrement loop counter */
1153 tapCnt--;
1154 }
1155
1156 #else
1157 /* alternate version for CM0_FAMILY */
1158
1159 float32_t *pState = S->pState; /* State pointer */
1160 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
1161 float32_t *pStateCur; /* Points to the current sample of the state */
1162 float32_t *ptr1; /* Temporary pointer for state buffer */
1163 const float32_t *ptr2; /* Temporary pointer for coefficient buffer */
1164 float32_t sum0; /* Accumulators */
1165 uint32_t i, blkCnt, tapCnt; /* Loop counters */
1166 uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
1167
1168 /* S->pState buffer contains previous frame (phaseLen - 1) samples */
1169 /* pStateCur points to the location where the new input data should be written */
1170 pStateCur = S->pState + (phaseLen - 1U);
1171
1172 /* Total number of intput samples */
1173 blkCnt = blockSize;
1174
1175 /* Loop over the blockSize. */
1176 while (blkCnt > 0U)
1177 {
1178 /* Copy new input sample into the state buffer */
1179 *pStateCur++ = *pSrc++;
1180
1181 /* Loop over the Interpolation factor. */
1182 i = S->L;
1183
1184 while (i > 0U)
1185 {
1186 /* Set accumulator to zero */
1187 sum0 = 0.0f;
1188
1189 /* Initialize state pointer */
1190 ptr1 = pState;
1191
1192 /* Initialize coefficient pointer */
1193 ptr2 = pCoeffs + (i - 1U);
1194
1195 /* Loop over the polyPhase length */
1196 tapCnt = phaseLen;
1197
1198 while (tapCnt > 0U)
1199 {
1200 /* Perform the multiply-accumulate */
1201 sum0 += *ptr1++ * *ptr2;
1202
1203 /* Increment the coefficient pointer by interpolation factor times. */
1204 ptr2 += S->L;
1205
1206 /* Decrement the loop counter */
1207 tapCnt--;
1208 }
1209
1210 /* The result is in the accumulator, store in the destination buffer. */
1211 *pDst++ = sum0;
1212
1213 /* Decrement loop counter */
1214 i--;
1215 }
1216
1217 /* Advance the state pointer by 1
1218 * to process the next group of interpolation factor number samples */
1219 pState = pState + 1;
1220
1221 /* Decrement loop counter */
1222 blkCnt--;
1223 }
1224
1225 /* Processing is complete.
1226 ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
1227 ** This prepares the state buffer for the next function call. */
1228
1229 /* Points to the start of the state buffer */
1230 pStateCur = S->pState;
1231
1232 tapCnt = phaseLen - 1U;
1233
1234 /* Copy data */
1235 while (tapCnt > 0U)
1236 {
1237 *pStateCur++ = *pState++;
1238
1239 /* Decrement loop counter */
1240 tapCnt--;
1241 }
1242
1243 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
1244
1245 }
1246
1247 #endif /* #if defined(ARM_MATH_NEON) */
1248 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1249 /**
1250 @} end of FIR_Interpolate group
1251 */
1252