1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_decimate_f32.c
4 * Description: FIR decimation for floating-point sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator
37
38 These functions combine an FIR filter together with a decimator.
39 They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion.
40 Conceptually, the functions are equivalent to the block diagram below:
41 \image html FIRDecimator.gif "Components included in the FIR Decimator functions"
42 When decimating by a factor of <code>M</code>, the signal should be prefiltered by a lowpass filter with a normalized
43 cutoff frequency of <code>1/M</code> in order to prevent aliasing distortion.
44 The user of the function is responsible for providing the filter coefficients.
45
46 The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner.
47 Instead of calculating all of the FIR filter outputs and discarding <code>M-1</code> out of every <code>M</code>, only the
48 samples output by the decimator are computed.
49 The functions operate on blocks of input and output data.
50 <code>pSrc</code> points to an array of <code>blockSize</code> input values and
51 <code>pDst</code> points to an array of <code>blockSize/M</code> output values.
52 In order to have an integer number of output samples <code>blockSize</code>
53 must always be a multiple of the decimation factor <code>M</code>.
54
55 The library provides separate functions for Q15, Q31 and floating-point data types.
56
57 @par Algorithm:
58 The FIR portion of the algorithm uses the standard form filter:
59 <pre>
60 y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
61 </pre>
62 where, <code>b[n]</code> are the filter coefficients.
63 @par
64 The <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
65 Coefficients are stored in time reversed order.
66 @par
67 <pre>
      {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
69 </pre>
70 @par
71 <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
72 Samples in the state buffer are stored in the order:
73 @par
74 <pre>
      {x[n-numTaps+1], x[n-numTaps+2], ..., x[n-1], x[n], x[n+1], ..., x[n+blockSize-1]}
76 </pre>
77 The state variables are updated after each block of data is processed, the coefficients are untouched.
78
79 @par Instance Structure
80 The coefficients and state variables for a filter are stored together in an instance data structure.
81 A separate instance structure must be defined for each filter.
82 Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
83 There are separate instance structure declarations for each of the 3 supported data types.
84
85 @par Initialization Functions
86 There is also an associated initialization function for each data type.
87 The initialization function performs the following operations:
88 - Sets the values of the internal structure fields.
89 - Zeros out the values in the state buffer.
90 - Checks to make sure that the size of the input is a multiple of the decimation factor.
    To do this manually without calling the init function, assign the following subfields of the instance structure:
92 numTaps, pCoeffs, M (decimation factor), pState. Also set all of the values in pState to zero.
93 @par
94 Use of the initialization function is optional.
95 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
96 To place an instance structure into a const data section, the instance structure must be manually initialized.
97 The code below statically initializes each of the 3 different data type filter instance structures
98 <pre>
99 arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};
100 arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};
101 arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};
102 </pre>
103 where <code>M</code> is the decimation factor; <code>numTaps</code> is the number of filter coefficients in the filter;
104 <code>pCoeffs</code> is the address of the coefficient buffer;
105 <code>pState</code> is the address of the state buffer.
106 Be sure to set the values in the state buffer to zeros when doing static initialization.
107
108 @par Fixed-Point Behavior
109 Care must be taken when using the fixed-point versions of the FIR decimate filter functions.
110 In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
111 Refer to the function specific documentation below for usage guidelines.
112 */
113
114 /**
115 @addtogroup FIR_decimate
116 @{
117 */
118
119 /**
120 @brief Processing function for floating-point FIR decimator.
121 @param[in] S points to an instance of the floating-point FIR decimator structure
122 @param[in] pSrc points to the block of input data
123 @param[out] pDst points to the block of output data
124 @param[in] blockSize number of input samples to process
125 */
126 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
127
128 #include "arm_helium_utils.h"
129
arm_fir_decimate_f32(const arm_fir_decimate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)130 ARM_DSP_ATTRIBUTE void arm_fir_decimate_f32(
131 const arm_fir_decimate_instance_f32 * S,
132 const float32_t * pSrc,
133 float32_t * pDst,
134 uint32_t blockSize)
135 {
136 float32_t *pState = S->pState; /* State pointer */
137 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
138 float32_t *pStateCurnt; /* Points to the current sample of the state */
139 const float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
140 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
141 uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
142 uint32_t blkCntN4;
143 const float32_t *px0, *px1, *px2, *px3;
144 f32x4_t accv = { 0 }, acc0v, acc1v, acc2v, acc3v;
145 f32x4_t x0v, x1v, x2v, x3v;
146 f32x4_t c0v;
147
148 /*
149 * S->pState buffer contains previous frame (numTaps - 1) samples
150 * pStateCurnt points to the location where the new input data should be written
151 */
152 pStateCurnt = S->pState + (numTaps - 1U);
153 /*
154 * Total number of output samples to be computed
155 */
156 blkCnt = outBlockSize / 4;
157 blkCntN4 = outBlockSize - (4 * blkCnt);
158
159 while (blkCnt > 0U)
160 {
161 /*
162 * Copy 4 * decimation factor number of new input samples into the state buffer
163 */
164 i = (4 * S->M) >> 2;
165 do
166 {
167 vst1q(pStateCurnt, vld1q((const float32_t *)pSrc));
168 pSrc += 4;
169 pStateCurnt += 4;
170 i--;
171 }
172 while (i > 0U);
173
174 /*
175 * Set accumulators to zero
176 */
177 acc0v = vdupq_n_f32(0.0f);
178 acc1v = vdupq_n_f32(0.0f);
179 acc2v = vdupq_n_f32(0.0f);
180 acc3v = vdupq_n_f32(0.0f);
181
182 /*
183 * Initialize state pointer for all the samples
184 */
185 px0 = pState;
186 px1 = pState + S->M;
187 px2 = pState + 2 * S->M;
188 px3 = pState + 3 * S->M;
189 /*
190 * Initialize coeff pointer
191 */
192 pb = pCoeffs;
193 /*
194 * Loop unrolling. Process 4 taps at a time.
195 */
196 tapCnt = numTaps >> 2;
197 /*
198 * Loop over the number of taps. Unroll by a factor of 4.
199 * Repeat until we've computed numTaps-4 coefficients.
200 */
201 while (tapCnt > 0U)
202 {
203 /*
204 * Read the b[numTaps-1] coefficient
205 */
206 c0v = vld1q((const float32_t *)pb);
207 pb += 4;
208 /*
209 * Read x[n-numTaps-1] sample for acc0
210 */
211 x0v = vld1q(px0);
212 x1v = vld1q(px1);
213 x2v = vld1q(px2);
214 x3v = vld1q(px3);
215 px0 += 4;
216 px1 += 4;
217 px2 += 4;
218 px3 += 4;
219
220 acc0v = vfmaq(acc0v, x0v, c0v);
221 acc1v = vfmaq(acc1v, x1v, c0v);
222 acc2v = vfmaq(acc2v, x2v, c0v);
223 acc3v = vfmaq(acc3v, x3v, c0v);
224 /*
225 * Decrement the loop counter
226 */
227 tapCnt--;
228 }
229
230 /*
231 * If the filter length is not a multiple of 4, compute the remaining filter taps
232 * should be tail predicated
233 */
234 tapCnt = numTaps % 0x4U;
235 if (tapCnt > 0U)
236 {
237 mve_pred16_t p0 = vctp32q(tapCnt);
238 /*
239 * Read the b[numTaps-1] coefficient
240 */
241 c0v = vldrwq_z_f32(pb, p0);
242 pb += 4;
243 /*
244 * Read x[n-numTaps-1] sample for acc0
245 */
246 x0v = vld1q(px0);
247 x1v = vld1q(px1);
248 x2v = vld1q(px2);
249 x3v = vld1q(px3);
250 px0 += 4;
251 px1 += 4;
252 px2 += 4;
253 px3 += 4;
254
255 acc0v = vfmaq_f32(acc0v, x0v, c0v);
256 acc1v = vfmaq_f32(acc1v, x1v, c0v);
257 acc2v = vfmaq_f32(acc2v, x2v, c0v);
258 acc3v = vfmaq_f32(acc3v, x3v, c0v);
259 }
260
261 /* reduction */
262 accv[0] = vecAddAcrossF32Mve(acc0v);
263 accv[1] = vecAddAcrossF32Mve(acc1v);
264 accv[2] = vecAddAcrossF32Mve(acc2v);
265 accv[3] = vecAddAcrossF32Mve(acc3v);
266
267 /*
268 * Advance the state pointer by the decimation factor
269 * to process the next group of decimation factor number samples
270 */
271 pState = pState + 4 * S->M;
272 /*
273 * The result is in the accumulator, store in the destination buffer.
274 */
275 vst1q(pDst, accv);
276 pDst += 4;
277
278 /*
279 * Decrement the loop counter
280 */
281 blkCnt--;
282 }
283
284 while (blkCntN4 > 0U)
285 {
286 /*
287 * Copy decimation factor number of new input samples into the state buffer
288 */
289 i = S->M;
290 do
291 {
292 *pStateCurnt++ = *pSrc++;
293 }
294 while (--i);
295 /*
296 * Set accumulator to zero
297 */
298 acc0v = vdupq_n_f32(0.0f);
299 /*
300 * Initialize state pointer
301 */
302 px = pState;
303 /*
304 * Initialize coeff pointer
305 */
306 pb = pCoeffs;
307 /*
308 * Loop unrolling. Process 4 taps at a time.
309 */
310 tapCnt = numTaps >> 2;
311 /*
312 * Loop over the number of taps. Unroll by a factor of 4.
313 * Repeat until we've computed numTaps-4 coefficients.
314 */
315 while (tapCnt > 0U)
316 {
317 c0v = vldrwq_f32(pb);
318 x0v = vldrwq_f32(px);
319 pb += 4;
320 px += 4;
321 acc0v = vfmaq_f32(acc0v, x0v, c0v);
322 /*
323 * Decrement the loop counter
324 */
325 tapCnt--;
326 }
327 tapCnt = numTaps % 0x4U;
328 if (tapCnt > 0U)
329 {
330 mve_pred16_t p0 = vctp32q(tapCnt);
331 c0v = vldrwq_z_f32(pb, p0);
332 x0v = vldrwq_f32(px);
333 acc0v = vfmaq_f32(acc0v, x0v, c0v);
334 }
335 accv[0] = vecAddAcrossF32Mve(acc0v);
336
337 /*
338 * Advance the state pointer by the decimation factor
339 * * to process the next group of decimation factor number samples
340 */
341 pState = pState + S->M;
342 /*
343 * The result is in the accumulator, store in the destination buffer.
344 */
345 *pDst++ = accv[0];
346 /*
347 * Decrement the loop counter
348 */
349 blkCntN4--;
350 }
351
352 /*
353 * Processing is complete.
354 * Now copy the last numTaps - 1 samples to the start of the state buffer.
355 * This prepares the state buffer for the next function call.
356 */
357
358 pStateCurnt = S->pState;
359 blkCnt =(numTaps - 1) >> 2;
360 while (blkCnt > 0U)
361 {
362 vst1q(pStateCurnt, vldrwq_f32(pState));
363 pState += 4;
364 pStateCurnt += 4;
365 blkCnt--;
366 }
367 blkCnt = (numTaps - 1) & 3;
368 if (blkCnt > 0U)
369 {
370 mve_pred16_t p0 = vctp32q(blkCnt);
371 vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
372 }
373 }
374 #else
375 #if defined(ARM_MATH_NEON)
arm_fir_decimate_f32(const arm_fir_decimate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)376 ARM_DSP_ATTRIBUTE void arm_fir_decimate_f32(
377 const arm_fir_decimate_instance_f32 * S,
378 const float32_t * pSrc,
379 float32_t * pDst,
380 uint32_t blockSize)
381 {
382 float32_t *pState = S->pState; /* State pointer */
383 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
384 float32_t *pStateCurnt; /* Points to the current sample of the state */
385 float32_t *px; /* Temporary pointer for state buffer */
386 const float32_t *pb; /* Temporary pointer for coefficient buffer */
387 float32_t sum0; /* Accumulator */
388 float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
389 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
390 uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
391
392 uint32_t blkCntN4;
393 float32_t *px0, *px1, *px2, *px3;
394 float32_t x1, x2, x3;
395
396 float32x4_t accv,acc0v,acc1v,acc2v,acc3v;
397 float32x4_t x0v, x1v, x2v, x3v;
398 float32x4_t c0v;
399 float32x2_t temp;
400 float32x4_t sum0v;
401
402 /* S->pState buffer contains previous frame (numTaps - 1) samples */
403 /* pStateCurnt points to the location where the new input data should be written */
404 pStateCurnt = S->pState + (numTaps - 1U);
405
406 /* Total number of output samples to be computed */
407 blkCnt = outBlockSize / 4;
408 blkCntN4 = outBlockSize - (4 * blkCnt);
409
410 while (blkCnt > 0U)
411 {
412 /* Copy 4 * decimation factor number of new input samples into the state buffer */
413 i = 4 * S->M;
414
415 do
416 {
417 *pStateCurnt++ = *pSrc++;
418
419 } while (--i);
420
421 /* Set accumulators to zero */
422 acc0v = vdupq_n_f32(0.0);
423 acc1v = vdupq_n_f32(0.0);
424 acc2v = vdupq_n_f32(0.0);
425 acc3v = vdupq_n_f32(0.0);
426
427 /* Initialize state pointer for all the samples */
428 px0 = pState;
429 px1 = pState + S->M;
430 px2 = pState + 2 * S->M;
431 px3 = pState + 3 * S->M;
432
433 /* Initialize coeff pointer */
434 pb = pCoeffs;
435
436 /* Process 4 taps at a time. */
437 tapCnt = numTaps >> 2;
438
439 /* Loop over the number of taps.
440 ** Repeat until we've computed numTaps-4 coefficients. */
441
442 while (tapCnt > 0U)
443 {
444 /* Read the b[numTaps-1] coefficient */
445 c0v = vld1q_f32(pb);
446 pb += 4;
447
448 /* Read x[n-numTaps-1] sample for acc0 */
449 x0v = vld1q_f32(px0);
450 x1v = vld1q_f32(px1);
451 x2v = vld1q_f32(px2);
452 x3v = vld1q_f32(px3);
453
454 px0 += 4;
455 px1 += 4;
456 px2 += 4;
457 px3 += 4;
458
459 acc0v = vmlaq_f32(acc0v, x0v, c0v);
460 acc1v = vmlaq_f32(acc1v, x1v, c0v);
461 acc2v = vmlaq_f32(acc2v, x2v, c0v);
462 acc3v = vmlaq_f32(acc3v, x3v, c0v);
463
464 /* Decrement the loop counter */
465 tapCnt--;
466 }
467
468 temp = vpadd_f32(vget_low_f32(acc0v),vget_high_f32(acc0v));
469 accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,0);
470
471 temp = vpadd_f32(vget_low_f32(acc1v),vget_high_f32(acc1v));
472 accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,1);
473
474 temp = vpadd_f32(vget_low_f32(acc2v),vget_high_f32(acc2v));
475 accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,2);
476
477 temp = vpadd_f32(vget_low_f32(acc3v),vget_high_f32(acc3v));
478 accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,3);
479
480 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
481 tapCnt = numTaps % 0x4U;
482
483 while (tapCnt > 0U)
484 {
485 /* Read coefficients */
486 c0 = *(pb++);
487
488 /* Fetch state variables for acc0, acc1, acc2, acc3 */
489 x0 = *(px0++);
490 x1 = *(px1++);
491 x2 = *(px2++);
492 x3 = *(px3++);
493
494 /* Perform the multiply-accumulate */
495 accv = vsetq_lane_f32(vgetq_lane_f32(accv, 0) + x0 * c0,accv,0);
496 accv = vsetq_lane_f32(vgetq_lane_f32(accv, 1) + x1 * c0,accv,1);
497 accv = vsetq_lane_f32(vgetq_lane_f32(accv, 2) + x2 * c0,accv,2);
498 accv = vsetq_lane_f32(vgetq_lane_f32(accv, 3) + x3 * c0,accv,3);
499
500 /* Decrement the loop counter */
501 tapCnt--;
502 }
503
504 /* Advance the state pointer by the decimation factor
505 * to process the next group of decimation factor number samples */
506 pState = pState + 4 * S->M;
507
508 /* The result is in the accumulator, store in the destination buffer. */
509 vst1q_f32(pDst,accv);
510 pDst += 4;
511
512 /* Decrement the loop counter */
513 blkCnt--;
514 }
515
516 while (blkCntN4 > 0U)
517 {
518 /* Copy decimation factor number of new input samples into the state buffer */
519 i = S->M;
520
521 do
522 {
523 *pStateCurnt++ = *pSrc++;
524
525 } while (--i);
526
527 /* Set accumulator to zero */
528 sum0v = vdupq_n_f32(0.0);
529
530 /* Initialize state pointer */
531 px = pState;
532
533 /* Initialize coeff pointer */
534 pb = pCoeffs;
535
536 /* Process 4 taps at a time. */
537 tapCnt = numTaps >> 2;
538
539 /* Loop over the number of taps.
540 ** Repeat until we've computed numTaps-4 coefficients. */
541 while (tapCnt > 0U)
542 {
543 c0v = vld1q_f32(pb);
544 pb += 4;
545
546 x0v = vld1q_f32(px);
547 px += 4;
548
549 sum0v = vmlaq_f32(sum0v, x0v, c0v);
550
551 /* Decrement the loop counter */
552 tapCnt--;
553 }
554
555 temp = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
556 sum0 = vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1);
557
558 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
559 tapCnt = numTaps % 0x4U;
560
561 while (tapCnt > 0U)
562 {
563 /* Read coefficients */
564 c0 = *(pb++);
565
566 /* Fetch 1 state variable */
567 x0 = *(px++);
568
569 /* Perform the multiply-accumulate */
570 sum0 += x0 * c0;
571
572 /* Decrement the loop counter */
573 tapCnt--;
574 }
575
576 /* Advance the state pointer by the decimation factor
577 * to process the next group of decimation factor number samples */
578 pState = pState + S->M;
579
580 /* The result is in the accumulator, store in the destination buffer. */
581 *pDst++ = sum0;
582
583 /* Decrement the loop counter */
584 blkCntN4--;
585 }
586
587 /* Processing is complete.
588 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
589 ** This prepares the state buffer for the next function call. */
590
591 /* Points to the start of the state buffer */
592 pStateCurnt = S->pState;
593
594 i = (numTaps - 1U) >> 2;
595
596 /* Copy data */
597 while (i > 0U)
598 {
599 sum0v = vld1q_f32(pState);
600 vst1q_f32(pStateCurnt,sum0v);
601 pState += 4;
602 pStateCurnt += 4;
603
604 /* Decrement the loop counter */
605 i--;
606 }
607
608 i = (numTaps - 1U) % 0x04U;
609
610 /* Copy data */
611 while (i > 0U)
612 {
613 *pStateCurnt++ = *pState++;
614
615 /* Decrement the loop counter */
616 i--;
617 }
618 }
619 #else
arm_fir_decimate_f32(const arm_fir_decimate_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)620 ARM_DSP_ATTRIBUTE void arm_fir_decimate_f32(
621 const arm_fir_decimate_instance_f32 * S,
622 const float32_t * pSrc,
623 float32_t * pDst,
624 uint32_t blockSize)
625 {
626 float32_t *pState = S->pState; /* State pointer */
627 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
628 float32_t *pStateCur; /* Points to the current sample of the state */
629 float32_t *px0; /* Temporary pointer for state buffer */
630 const float32_t *pb; /* Temporary pointer for coefficient buffer */
631 float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
632 float32_t acc0; /* Accumulator */
633 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
634 uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
635
636 #if defined (ARM_MATH_LOOPUNROLL)
637 float32_t *px1, *px2, *px3;
638 float32_t x1, x2, x3;
639 float32_t acc1, acc2, acc3;
640 #endif
641
642 /* S->pState buffer contains previous frame (numTaps - 1) samples */
643 /* pStateCur points to the location where the new input data should be written */
644 pStateCur = S->pState + (numTaps - 1U);
645
646 #if defined (ARM_MATH_LOOPUNROLL)
647
648 /* Loop unrolling: Compute 4 samples at a time */
649 blkCnt = outBlockSize >> 2U;
650
651 /* Samples loop unrolled by 4 */
652 while (blkCnt > 0U)
653 {
654 /* Copy 4 * decimation factor number of new input samples into the state buffer */
655 i = S->M * 4;
656
657 do
658 {
659 *pStateCur++ = *pSrc++;
660
661 } while (--i);
662
663 /* Set accumulators to zero */
664 acc0 = 0.0f;
665 acc1 = 0.0f;
666 acc2 = 0.0f;
667 acc3 = 0.0f;
668
669 /* Initialize state pointer for all the samples */
670 px0 = pState;
671 px1 = pState + S->M;
672 px2 = pState + 2 * S->M;
673 px3 = pState + 3 * S->M;
674
675 /* Initialize coeff pointer */
676 pb = pCoeffs;
677
678 /* Loop unrolling: Compute 4 taps at a time */
679 tapCnt = numTaps >> 2U;
680
681 while (tapCnt > 0U)
682 {
683 /* Read the b[numTaps-1] coefficient */
684 c0 = *(pb++);
685
686 /* Read x[n-numTaps-1] sample for acc0 */
687 x0 = *(px0++);
688 /* Read x[n-numTaps-1] sample for acc1 */
689 x1 = *(px1++);
690 /* Read x[n-numTaps-1] sample for acc2 */
691 x2 = *(px2++);
692 /* Read x[n-numTaps-1] sample for acc3 */
693 x3 = *(px3++);
694
695 /* Perform the multiply-accumulate */
696 acc0 += x0 * c0;
697 acc1 += x1 * c0;
698 acc2 += x2 * c0;
699 acc3 += x3 * c0;
700
701 /* Read the b[numTaps-2] coefficient */
702 c0 = *(pb++);
703
704 /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
705 x0 = *(px0++);
706 x1 = *(px1++);
707 x2 = *(px2++);
708 x3 = *(px3++);
709
710 /* Perform the multiply-accumulate */
711 acc0 += x0 * c0;
712 acc1 += x1 * c0;
713 acc2 += x2 * c0;
714 acc3 += x3 * c0;
715
716 /* Read the b[numTaps-3] coefficient */
717 c0 = *(pb++);
718
719 /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
720 x0 = *(px0++);
721 x1 = *(px1++);
722 x2 = *(px2++);
723 x3 = *(px3++);
724
725 /* Perform the multiply-accumulate */
726 acc0 += x0 * c0;
727 acc1 += x1 * c0;
728 acc2 += x2 * c0;
729 acc3 += x3 * c0;
730
731 /* Read the b[numTaps-4] coefficient */
732 c0 = *(pb++);
733
734 /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
735 x0 = *(px0++);
736 x1 = *(px1++);
737 x2 = *(px2++);
738 x3 = *(px3++);
739
740 /* Perform the multiply-accumulate */
741 acc0 += x0 * c0;
742 acc1 += x1 * c0;
743 acc2 += x2 * c0;
744 acc3 += x3 * c0;
745
746 /* Decrement loop counter */
747 tapCnt--;
748 }
749
750 /* Loop unrolling: Compute remaining taps */
751 tapCnt = numTaps % 0x4U;
752
753 while (tapCnt > 0U)
754 {
755 /* Read coefficients */
756 c0 = *(pb++);
757
758 /* Fetch state variables for acc0, acc1, acc2, acc3 */
759 x0 = *(px0++);
760 x1 = *(px1++);
761 x2 = *(px2++);
762 x3 = *(px3++);
763
764 /* Perform the multiply-accumulate */
765 acc0 += x0 * c0;
766 acc1 += x1 * c0;
767 acc2 += x2 * c0;
768 acc3 += x3 * c0;
769
770 /* Decrement loop counter */
771 tapCnt--;
772 }
773
774 /* Advance the state pointer by the decimation factor
775 * to process the next group of decimation factor number samples */
776 pState = pState + S->M * 4;
777
778 /* The result is in the accumulator, store in the destination buffer. */
779 *pDst++ = acc0;
780 *pDst++ = acc1;
781 *pDst++ = acc2;
782 *pDst++ = acc3;
783
784 /* Decrement loop counter */
785 blkCnt--;
786 }
787
788 /* Loop unrolling: Compute remaining samples */
789 blkCnt = outBlockSize % 0x4U;
790
791 #else
792
793 /* Initialize blkCnt with number of samples */
794 blkCnt = outBlockSize;
795
796 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
797
798 while (blkCnt > 0U)
799 {
800 /* Copy decimation factor number of new input samples into the state buffer */
801 i = S->M;
802
803 do
804 {
805 *pStateCur++ = *pSrc++;
806
807 } while (--i);
808
809 /* Set accumulator to zero */
810 acc0 = 0.0f;
811
812 /* Initialize state pointer */
813 px0 = pState;
814
815 /* Initialize coeff pointer */
816 pb = pCoeffs;
817
818 #if defined (ARM_MATH_LOOPUNROLL)
819
820 /* Loop unrolling: Compute 4 taps at a time */
821 tapCnt = numTaps >> 2U;
822
823 while (tapCnt > 0U)
824 {
825 /* Read the b[numTaps-1] coefficient */
826 c0 = *pb++;
827
828 /* Read x[n-numTaps-1] sample */
829 x0 = *px0++;
830
831 /* Perform the multiply-accumulate */
832 acc0 += x0 * c0;
833
834 /* Read the b[numTaps-2] coefficient */
835 c0 = *pb++;
836
837 /* Read x[n-numTaps-2] sample */
838 x0 = *px0++;
839
840 /* Perform the multiply-accumulate */
841 acc0 += x0 * c0;
842
843 /* Read the b[numTaps-3] coefficient */
844 c0 = *pb++;
845
846 /* Read x[n-numTaps-3] sample */
847 x0 = *px0++;
848
849 /* Perform the multiply-accumulate */
850 acc0 += x0 * c0;
851
852 /* Read the b[numTaps-4] coefficient */
853 c0 = *pb++;
854
855 /* Read x[n-numTaps-4] sample */
856 x0 = *px0++;
857
858 /* Perform the multiply-accumulate */
859 acc0 += x0 * c0;
860
861 /* Decrement loop counter */
862 tapCnt--;
863 }
864
865 /* Loop unrolling: Compute remaining taps */
866 tapCnt = numTaps % 0x4U;
867
868 #else
869
870 /* Initialize tapCnt with number of taps */
871 tapCnt = numTaps;
872
873 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
874
875 while (tapCnt > 0U)
876 {
877 /* Read coefficients */
878 c0 = *pb++;
879
880 /* Fetch 1 state variable */
881 x0 = *px0++;
882
883 /* Perform the multiply-accumulate */
884 acc0 += x0 * c0;
885
886 /* Decrement loop counter */
887 tapCnt--;
888 }
889
890 /* Advance the state pointer by the decimation factor
891 * to process the next group of decimation factor number samples */
892 pState = pState + S->M;
893
894 /* The result is in the accumulator, store in the destination buffer. */
895 *pDst++ = acc0;
896
897 /* Decrement loop counter */
898 blkCnt--;
899 }
900
901 /* Processing is complete.
902 Now copy the last numTaps - 1 samples to the satrt of the state buffer.
903 This prepares the state buffer for the next function call. */
904
905 /* Points to the start of the state buffer */
906 pStateCur = S->pState;
907
908 #if defined (ARM_MATH_LOOPUNROLL)
909
910 /* Loop unrolling: Compute 4 taps at a time */
911 tapCnt = (numTaps - 1U) >> 2U;
912
913 /* Copy data */
914 while (tapCnt > 0U)
915 {
916 *pStateCur++ = *pState++;
917 *pStateCur++ = *pState++;
918 *pStateCur++ = *pState++;
919 *pStateCur++ = *pState++;
920
921 /* Decrement loop counter */
922 tapCnt--;
923 }
924
925 /* Loop unrolling: Compute remaining taps */
926 tapCnt = (numTaps - 1U) % 0x04U;
927
928 #else
929
930 /* Initialize tapCnt with number of taps */
931 tapCnt = (numTaps - 1U);
932
933 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
934
935 /* Copy data */
936 while (tapCnt > 0U)
937 {
938 *pStateCur++ = *pState++;
939
940 /* Decrement loop counter */
941 tapCnt--;
942 }
943
944 }
945 #endif /* #if defined(ARM_MATH_NEON) */
946
947 #endif /*defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
948 /**
949 @} end of FIR_decimate group
950 */
951