1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_decimate_q15.c
4 * Description: Q15 FIR Decimator
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_decimate
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q15 FIR decimator.
42 @param[in] S points to an instance of the Q15 FIR decimator structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of input samples to process per call
46
47 @par Scaling and Overflow Behavior
48 The function is implemented using a 64-bit internal accumulator.
49 Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
50 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
51 There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52 After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
53 Lastly, the accumulator is saturated to yield a result in 1.15 format.
54
55 @remark
56 Refer to \ref arm_fir_decimate_fast_q15() for a faster but less precise implementation of this function.
57 */
58
59 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
60
61 #include "arm_helium_utils.h"
62
arm_fir_decimate_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)63 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q15(
64 const arm_fir_decimate_instance_q15 * S,
65 const q15_t * pSrc,
66 q15_t * pDst,
67 uint32_t blockSize)
68 {
69 q15_t *pState = S->pState; /* State pointer */
70 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
71 q15_t *pStateCurnt; /* Points to the current sample of the state */
72 const q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
73 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
74 uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
75 uint32_t blkCntN4;
76 const q15_t *px0, *px1, *px2, *px3;
77 q63_t acc0v, acc1v, acc2v, acc3v;
78 q15x8_t x0v, x1v, x2v, x3v;
79 q15x8_t c0v;
80
81 /*
82 * S->pState buffer contains previous frame (numTaps - 1) samples
83 * pStateCurnt points to the location where the new input data should be written
84 */
85 pStateCurnt = S->pState + (numTaps - 1U);
86 /*
87 * Total number of output samples to be computed
88 */
89 blkCnt = outBlockSize / 4;
90 blkCntN4 = outBlockSize - (4 * blkCnt);
91
92 while (blkCnt > 0U)
93 {
94 /*
95 * Need extra temp variables as 4 * S->M is not necessarily a multiple of 8
96 * and cause final tail predicated post incremented pointers to jump ahead
97 */
98 const q15_t *pSrcTmp = pSrc;
99 q15_t *pStateCurntTmp = pStateCurnt;
100
101 /*
102 * Copy 4 * decimation factor number of new input samples into the state buffer
103 */
104 i = (4 * S->M) >> 3;
105 while (i > 0U)
106 {
107 vstrhq_s16(pStateCurntTmp, vldrhq_s16(pSrcTmp));
108 pSrcTmp += 8;
109 pStateCurntTmp += 8;
110 i--;
111 }
112 i = (4 * S->M) & 7;
113 if (i > 0U)
114 {
115 mve_pred16_t p0 = vctp16q(i);
116 vstrhq_p_s16(pStateCurntTmp, vldrhq_s16(pSrcTmp), p0);
117 }
118
119 pSrc += (4 * S->M);
120 pStateCurnt += (4 * S->M);
121
122 /*
123 * Clear all accumulators
124 */
125 acc0v = 0LL;
126 acc1v = 0LL;
127 acc2v = 0LL;
128 acc3v = 0LL;
129 /*
130 * Initialize state pointer for all the samples
131 */
132 px0 = pState;
133 px1 = pState + S->M;
134 px2 = pState + 2 * S->M;
135 px3 = pState + 3 * S->M;
136 /*
137 * Initialize coeff. pointer
138 */
139 pb = pCoeffs;
140
141 tapCnt = numTaps >> 3;
142 /*
143 * Loop over the number of taps. Unroll by a factor of 4.
144 * Repeat until we've computed numTaps-4 coefficients.
145 */
146 while (tapCnt > 0U)
147 {
148 /*
149 * Read the b[numTaps-1] coefficient
150 */
151 c0v = vldrhq_s16(pb);
152 pb += 8;
153 /*
154 * Read x[n-numTaps-1] sample for acc0
155 */
156 x0v = vld1q(px0);
157 x1v = vld1q(px1);
158 x2v = vld1q(px2);
159 x3v = vld1q(px3);
160 px0 += 8;
161 px1 += 8;
162 px2 += 8;
163 px3 += 8;
164
165 acc0v = vmlaldavaq(acc0v, x0v, c0v);
166 acc1v = vmlaldavaq(acc1v, x1v, c0v);
167 acc2v = vmlaldavaq(acc2v, x2v, c0v);
168 acc3v = vmlaldavaq(acc3v, x3v, c0v);
169 /*
170 * Decrement the loop counter
171 */
172 tapCnt--;
173 }
174
175 /*
176 * If the filter length is not a multiple of 4, compute the remaining filter taps
177 * should be tail predicated
178 */
179 tapCnt = numTaps & 7;
180 if (tapCnt > 0U)
181 {
182 mve_pred16_t p0 = vctp16q(tapCnt);
183 /*
184 * Read the b[numTaps-1] coefficient
185 */
186 c0v = vldrhq_z_s16(pb, p0);
187 pb += 8;
188 /*
189 * Read x[n-numTaps-1] sample for acc0
190 */
191 x0v = vld1q(px0);
192 x1v = vld1q(px1);
193 x2v = vld1q(px2);
194 x3v = vld1q(px3);
195 px0 += 8;
196 px1 += 8;
197 px2 += 8;
198 px3 += 8;
199
200 acc0v = vmlaldavaq(acc0v, x0v, c0v);
201 acc1v = vmlaldavaq(acc1v, x1v, c0v);
202 acc2v = vmlaldavaq(acc2v, x2v, c0v);
203 acc3v = vmlaldavaq(acc3v, x3v, c0v);
204 }
205
206 acc0v = asrl(acc0v, 15);
207 acc1v = asrl(acc1v, 15);
208 acc2v = asrl(acc2v, 15);
209 acc3v = asrl(acc3v, 15);
210 /*
211 * store in the destination buffer.
212 */
213 *pDst++ = (q15_t) __SSAT((q31_t) acc0v, 16);
214 *pDst++ = (q15_t) __SSAT((q31_t) acc1v, 16);;
215 *pDst++ = (q15_t) __SSAT((q31_t) acc2v, 16);;
216 *pDst++ = (q15_t) __SSAT((q31_t) acc3v, 16);;
217
218 /*
219 * Advance the state pointer by the decimation factor
220 * to process the next group of decimation factor number samples
221 */
222 pState = pState + 4 * S->M;
223 /*
224 * Decrement the loop counter
225 */
226 blkCnt--;
227 }
228
229 while (blkCntN4 > 0U)
230 {
231 /*
232 * Copy decimation factor number of new input samples into the state buffer
233 */
234 i = S->M;
235 do
236 {
237 *pStateCurnt++ = *pSrc++;
238 }
239 while (--i);
240 /*
241 * Set accumulator to zero
242 */
243 acc0v = 0LL;
244 /*
245 * Initialize state pointer
246 */
247 px = pState;
248 /*
249 * Initialize coeff. pointer
250 */
251 pb = pCoeffs;
252
253 tapCnt = numTaps >> 3;
254 while (tapCnt > 0U)
255 {
256 c0v = vldrhq_s16(pb);
257 x0v = vldrhq_s16(px);
258 pb += 8;
259 px += 8;
260 acc0v = vmlaldavaq(acc0v, x0v, c0v);
261 /*
262 * Decrement the loop counter
263 */
264 tapCnt--;
265 }
266
267 tapCnt = numTaps & 7;
268 if (tapCnt > 0U)
269 {
270 mve_pred16_t p0 = vctp16q(tapCnt);
271 c0v = vldrhq_z_s16(pb, p0);
272 x0v = vldrhq_z_s16(px, p0);
273 acc0v = vmlaldavaq_p(acc0v, x0v, c0v, p0);
274 }
275
276 acc0v = asrl(acc0v, 15);
277
278 /*
279 * Advance the state pointer by the decimation factor
280 * to process the next group of decimation factor number samples
281 */
282 pState = pState + S->M;
283 /*
284 * The result is in the accumulator, store in the destination buffer.
285 */
286 *pDst++ = (q15_t) __SSAT((q31_t) acc0v, 16);
287 /*
288 * Decrement the loop counter
289 */
290 blkCntN4--;
291 }
292
293 /*
294 * Processing is complete.
295 * Now copy the last numTaps - 1 samples to the start of the state buffer.
296 * This prepares the state buffer for the next function call.
297 */
298
299 pStateCurnt = S->pState;
300 blkCnt = (numTaps - 1) >> 3;
301 while (blkCnt > 0U)
302 {
303 vstrhq_s16(pStateCurnt, vldrhq_s16(pState));
304 pState += 8;
305 pStateCurnt += 8;
306 blkCnt--;
307 }
308 blkCnt = (numTaps - 1) & 7;
309 if (blkCnt > 0U)
310 {
311 mve_pred16_t p0 = vctp16q(blkCnt);
312 vstrhq_p_s16(pStateCurnt, vldrhq_s16(pState), p0);
313 }
314 }
315 #else
316 #if defined (ARM_MATH_DSP)
317
arm_fir_decimate_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)318 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q15(
319 const arm_fir_decimate_instance_q15 * S,
320 const q15_t * pSrc,
321 q15_t * pDst,
322 uint32_t blockSize)
323 {
324 q15_t *pState = S->pState; /* State pointer */
325 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
326 q15_t *pStateCur; /* Points to the current sample of the state */
327 q15_t *px; /* Temporary pointer for state buffer */
328 const q15_t *pb; /* Temporary pointer for coefficient buffer */
329 q31_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
330 q63_t sum0; /* Accumulators */
331 q63_t acc0, acc1;
332 q15_t *px0, *px1;
333 uint32_t blkCntN3;
334 uint32_t numTaps = S->numTaps; /* Number of taps */
335 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
336
337 #if defined (ARM_MATH_LOOPUNROLL)
338 q31_t c1; /* Temporary variables to hold state and coefficient values */
339 #endif
340
341 /* S->pState buffer contains previous frame (numTaps - 1) samples */
342 /* pStateCur points to the location where the new input data should be written */
343 pStateCur = S->pState + (numTaps - 1U);
344
345 /* Total number of output samples to be computed */
346 blkCnt = outBlockSize / 2;
347 blkCntN3 = outBlockSize - (2 * blkCnt);
348
349 while (blkCnt > 0U)
350 {
351 /* Copy 2 * decimation factor number of new input samples into the state buffer */
352 i = S->M * 2;
353
354 do
355 {
356 *pStateCur++ = *pSrc++;
357
358 } while (--i);
359
360 /* Set accumulator to zero */
361 acc0 = 0;
362 acc1 = 0;
363
364 /* Initialize state pointer for all the samples */
365 px0 = pState;
366 px1 = pState + S->M;
367
368 /* Initialize coeff pointer */
369 pb = pCoeffs;
370
371 #if defined (ARM_MATH_LOOPUNROLL)
372
373 /* Loop unrolling: Compute 4 taps at a time */
374 tapCnt = numTaps >> 2U;
375
376 while (tapCnt > 0U)
377 {
378 /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
379 c0 = read_q15x2_ia ((q15_t **) &pb);
380
381 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
382 x0 = read_q15x2_ia (&px0);
383 x1 = read_q15x2_ia (&px1);
384
385 /* Perform the multiply-accumulate */
386 acc0 = __SMLALD(x0, c0, acc0);
387 acc1 = __SMLALD(x1, c0, acc1);
388
389 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
390 c0 = read_q15x2_ia ((q15_t **) &pb);
391
392 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
393 x0 = read_q15x2_ia (&px0);
394 x1 = read_q15x2_ia (&px1);
395
396 /* Perform the multiply-accumulate */
397 acc0 = __SMLALD(x0, c0, acc0);
398 acc1 = __SMLALD(x1, c0, acc1);
399
400 /* Decrement loop counter */
401 tapCnt--;
402 }
403
404 /* Loop unrolling: Compute remaining taps */
405 tapCnt = numTaps % 0x4U;
406
407 #else
408
409 /* Initialize tapCnt with number of taps */
410 tapCnt = numTaps;
411
412 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
413
414 while (tapCnt > 0U)
415 {
416 /* Read coefficients */
417 c0 = *pb++;
418
419 /* Fetch state variables for acc0, acc1 */
420 x0 = *px0++;
421 x1 = *px1++;
422
423 /* Perform the multiply-accumulate */
424 acc0 = __SMLALD(x0, c0, acc0);
425 acc1 = __SMLALD(x1, c0, acc1);
426
427 /* Decrement loop counter */
428 tapCnt--;
429 }
430
431 /* Advance the state pointer by the decimation factor
432 * to process the next group of decimation factor number samples */
433 pState = pState + S->M * 2;
434
435 /* Store filter output, smlad returns the values in 2.14 format */
436 /* so downsacle by 15 to get output in 1.15 */
437 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
438 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
439
440 /* Decrement loop counter */
441 blkCnt--;
442 }
443
444 while (blkCntN3 > 0U)
445 {
446 /* Copy decimation factor number of new input samples into the state buffer */
447 i = S->M;
448
449 do
450 {
451 *pStateCur++ = *pSrc++;
452
453 } while (--i);
454
455 /* Set accumulator to zero */
456 sum0 = 0;
457
458 /* Initialize state pointer */
459 px = pState;
460
461 /* Initialize coeff pointer */
462 pb = pCoeffs;
463
464 #if defined (ARM_MATH_LOOPUNROLL)
465
466 /* Loop unrolling: Compute 4 taps at a time */
467 tapCnt = numTaps >> 2U;
468
469 while (tapCnt > 0U)
470 {
471 /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
472 c0 = read_q15x2_ia ((q15_t **) &pb);
473
474 /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
475 x0 = read_q15x2_ia (&px);
476
477 /* Read the b[numTaps-3] and b[numTaps-4] coefficients */
478 c1 = read_q15x2_ia ((q15_t **) &pb);
479
480 /* Perform the multiply-accumulate */
481 sum0 = __SMLALD(x0, c0, sum0);
482
483 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
484 x0 = read_q15x2_ia (&px);
485
486 /* Perform the multiply-accumulate */
487 sum0 = __SMLALD(x0, c1, sum0);
488
489 /* Decrement loop counter */
490 tapCnt--;
491 }
492
493 /* Loop unrolling: Compute remaining taps */
494 tapCnt = numTaps % 0x4U;
495
496 #else
497
498 /* Initialize tapCnt with number of taps */
499 tapCnt = numTaps;
500
501 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
502
503 while (tapCnt > 0U)
504 {
505 /* Read coefficients */
506 c0 = *pb++;
507
508 /* Fetch 1 state variable */
509 x0 = *px++;
510
511 /* Perform the multiply-accumulate */
512 sum0 = __SMLALD(x0, c0, sum0);
513
514 /* Decrement loop counter */
515 tapCnt--;
516 }
517
518 /* Advance the state pointer by the decimation factor
519 * to process the next group of decimation factor number samples */
520 pState = pState + S->M;
521
522 /* Store filter output, smlad returns the values in 2.14 format */
523 /* so downsacle by 15 to get output in 1.15 */
524 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
525
526 /* Decrement loop counter */
527 blkCntN3--;
528 }
529
530 /* Processing is complete.
531 Now copy the last numTaps - 1 samples to the satrt of the state buffer.
532 This prepares the state buffer for the next function call. */
533
534 /* Points to the start of the state buffer */
535 pStateCur = S->pState;
536 i = (numTaps - 1U) >> 2U;
537
538 /* copy data */
539 while (i > 0U)
540 {
541 write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
542 write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
543
544 /* Decrement loop counter */
545 i--;
546 }
547
548 i = (numTaps - 1U) % 0x04U;
549
550 /* Copy data */
551 while (i > 0U)
552 {
553 *pStateCur++ = *pState++;
554
555 /* Decrement loop counter */
556 i--;
557 }
558
559 }
560
561 #else /* #if defined (ARM_MATH_DSP) */
562
arm_fir_decimate_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)563 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q15(
564 const arm_fir_decimate_instance_q15 * S,
565 const q15_t * pSrc,
566 q15_t * pDst,
567 uint32_t blockSize)
568 {
569 q15_t *pState = S->pState; /* State pointer */
570 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
571 q15_t *pStateCur; /* Points to the current sample of the state */
572 q15_t *px; /* Temporary pointer for state buffer */
573 const q15_t *pb; /* Temporary pointer for coefficient buffer */
574 q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
575 q63_t sum0; /* Accumulators */
576 q63_t acc0, acc1;
577 q15_t *px0, *px1;
578 uint32_t blkCntN3;
579 uint32_t numTaps = S->numTaps; /* Number of taps */
580 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
581
582
583 /* S->pState buffer contains previous frame (numTaps - 1) samples */
584 /* pStateCur points to the location where the new input data should be written */
585 pStateCur = S->pState + (numTaps - 1U);
586
587 /* Total number of output samples to be computed */
588 blkCnt = outBlockSize / 2;
589 blkCntN3 = outBlockSize - (2 * blkCnt);
590
591 while (blkCnt > 0U)
592 {
593 /* Copy 2 * decimation factor number of new input samples into the state buffer */
594 i = S->M * 2;
595
596 do
597 {
598 *pStateCur++ = *pSrc++;
599
600 } while (--i);
601
602 /* Set accumulator to zero */
603 acc0 = 0;
604 acc1 = 0;
605
606 /* Initialize state pointer */
607 px0 = pState;
608 px1 = pState + S->M;
609
610 /* Initialize coeff pointer */
611 pb = pCoeffs;
612
613 #if defined (ARM_MATH_LOOPUNROLL)
614
615 /* Loop unrolling: Compute 4 taps at a time */
616 tapCnt = numTaps >> 2U;
617
618 while (tapCnt > 0U)
619 {
620 /* Read the Read b[numTaps-1] coefficients */
621 c0 = *pb++;
622
623 /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
624 x0 = *px0++;
625 x1 = *px1++;
626
627 /* Perform the multiply-accumulate */
628 acc0 += x0 * c0;
629 acc1 += x1 * c0;
630
631 /* Read the b[numTaps-2] coefficient */
632 c0 = *pb++;
633
634 /* Read x[n-numTaps-2] for sample 0 and sample 1 */
635 x0 = *px0++;
636 x1 = *px1++;
637
638 /* Perform the multiply-accumulate */
639 acc0 += x0 * c0;
640 acc1 += x1 * c0;
641
642 /* Read the b[numTaps-3] coefficients */
643 c0 = *pb++;
644
645 /* Read x[n-numTaps-3] for sample 0 and sample 1 */
646 x0 = *px0++;
647 x1 = *px1++;
648
649 /* Perform the multiply-accumulate */
650 acc0 += x0 * c0;
651 acc1 += x1 * c0;
652
653 /* Read the b[numTaps-4] coefficient */
654 c0 = *pb++;
655
656 /* Read x[n-numTaps-4] for sample 0 and sample 1 */
657 x0 = *px0++;
658 x1 = *px1++;
659
660 /* Perform the multiply-accumulate */
661 acc0 += x0 * c0;
662 acc1 += x1 * c0;
663
664 /* Decrement the loop counter */
665 tapCnt--;
666 }
667
668 /* Loop unrolling: Compute remaining taps */
669 tapCnt = numTaps % 0x4U;
670
671 #else
672
673 /* Initialize tapCnt with number of taps */
674 tapCnt = numTaps;
675
676 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
677
678 while (tapCnt > 0U)
679 {
680 /* Read coefficients */
681 c0 = *pb++;
682
683 /* Fetch 1 state variable */
684 x0 = *px0++;
685 x1 = *px1++;
686
687 /* Perform the multiply-accumulate */
688 acc0 += x0 * c0;
689 acc1 += x1 * c0;
690
691 /* Decrement the loop counter */
692 tapCnt--;
693 }
694
695 /* Advance the state pointer by the decimation factor
696 * to process the next group of decimation factor number samples */
697 pState = pState + S->M * 2;
698
699 /* Store filter output, smlad returns the values in 2.14 format */
700 /* so downsacle by 15 to get output in 1.15 */
701
702 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
703 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
704
705 /* Decrement loop counter */
706 blkCnt--;
707 }
708
709 while (blkCntN3 > 0U)
710 {
711 /* Copy decimation factor number of new input samples into the state buffer */
712 i = S->M;
713
714 do
715 {
716 *pStateCur++ = *pSrc++;
717
718 } while (--i);
719
720 /* Set accumulator to zero */
721 sum0 = 0;
722
723 /* Initialize state pointer */
724 px = pState;
725
726 /* Initialize coeff pointer */
727 pb = pCoeffs;
728
729 #if defined (ARM_MATH_LOOPUNROLL)
730
731 /* Loop unrolling: Compute 4 taps at a time */
732 tapCnt = numTaps >> 2U;
733
734 while (tapCnt > 0U)
735 {
736 /* Read the b[numTaps-1] coefficient */
737 c0 = *pb++;
738
739 /* Read x[n-numTaps-1] sample */
740 x0 = *px++;
741
742 /* Perform the multiply-accumulate */
743 sum0 += x0 * c0;
744
745 /* Read the b[numTaps-2] coefficient */
746 c0 = *pb++;
747
748 /* Read x[n-numTaps-2] sample */
749 x0 = *px++;
750
751 /* Perform the multiply-accumulate */
752 sum0 += x0 * c0;
753
754 /* Read the b[numTaps-3] coefficient */
755 c0 = *pb++;
756
757 /* Read x[n-numTaps-3] sample */
758 x0 = *px++;
759
760 /* Perform the multiply-accumulate */
761 sum0 += x0 * c0;
762
763 /* Read the b[numTaps-4] coefficient */
764 c0 = *pb++;
765
766 /* Read x[n-numTaps-4] sample */
767 x0 = *px++;
768
769 /* Perform the multiply-accumulate */
770 sum0 += x0 * c0;
771
772 /* Decrement loop counter */
773 tapCnt--;
774 }
775
776 /* Loop unrolling: Compute remaining taps */
777 tapCnt = numTaps % 0x4U;
778
779 #else
780
781 /* Initialize tapCnt with number of taps */
782 tapCnt = numTaps;
783
784 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
785
786 while (tapCnt > 0U)
787 {
788 /* Read coefficients */
789 c0 = *pb++;
790
791 /* Fetch 1 state variable */
792 x0 = *px++;
793
794 /* Perform the multiply-accumulate */
795 sum0 += x0 * c0;
796
797 /* Decrement the loop counter */
798 tapCnt--;
799 }
800
801 /* Advance the state pointer by the decimation factor
802 * to process the next group of decimation factor number samples */
803 pState = pState + S->M;
804
805 /* Store filter output, smlad returns the values in 2.14 format */
806 /* so downsacle by 15 to get output in 1.15 */
807 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
808
809 /* Decrement loop counter */
810 blkCntN3--;
811 }
812
813 /* Processing is complete.
814 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
815 ** This prepares the state buffer for the next function call. */
816
817 /* Points to the start of the state buffer */
818 pStateCur = S->pState;
819
820 i = (numTaps - 1U) >> 2U;
821
822 /* copy data */
823 while (i > 0U)
824 {
825 *pStateCur++ = *pState++;
826 *pStateCur++ = *pState++;
827 *pStateCur++ = *pState++;
828 *pStateCur++ = *pState++;
829
830 /* Decrement loop counter */
831 i--;
832 }
833
834 i = (numTaps - 1U) % 0x04U;
835
836 /* copy data */
837 while (i > 0U)
838 {
839 *pStateCur++ = *pState++;
840
841 /* Decrement loop counter */
842 i--;
843 }
844 }
845
846 #endif /* #if defined (ARM_MATH_DSP) */
847 #endif /* defined(ARM_MATH_MVEI) */
848 /**
849 @} end of FIR_decimate group
850 */
851