1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_decimate_q31.c
4 * Description: Q31 FIR Decimator
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_decimate
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q31 FIR decimator.
42 @param[in] S points to an instance of the Q31 FIR decimator structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of input samples to process
46
47 @par Scaling and Overflow Behavior
48 The function is implemented using an internal 64-bit accumulator.
49 The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50 Thus, if the accumulator result overflows it wraps around rather than clip.
51 In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
52 After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
53
54 @remark
55 Refer to \ref arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function.
56 */
57
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59
60 #include "arm_helium_utils.h"
61
arm_fir_decimate_q31(const arm_fir_decimate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)62 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q31(
63 const arm_fir_decimate_instance_q31 * S,
64 const q31_t * pSrc,
65 q31_t * pDst,
66 uint32_t blockSize)
67 {
68 q31_t *pState = S->pState; /* State pointer */
69 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
70 q31_t *pStateCurnt; /* Points to the current sample of the state */
71 const q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
72 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
73 uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
74 uint32_t blkCntN4;
75 const q31_t *px0, *px1, *px2, *px3;
76 q63_t acc0v, acc1v, acc2v, acc3v;
77 q31x4_t x0v, x1v, x2v, x3v;
78 q31x4_t c0v;
79
80 /*
81 * S->pState buffer contains previous frame (numTaps - 1) samples
82 * pStateCurnt points to the location where the new input data should be written
83 */
84 pStateCurnt = S->pState + (numTaps - 1U);
85 /*
86 * Total number of output samples to be computed
87 */
88 blkCnt = outBlockSize / 4;
89 blkCntN4 = outBlockSize - (4 * blkCnt);
90
91 while (blkCnt > 0U)
92 {
93 /*
94 * Copy 4 * decimation factor number of new input samples into the state buffer
95 */
96 i = (4 * S->M) >> 2;
97 do
98 {
99 vst1q(pStateCurnt, vldrwq_s32(pSrc));
100 pSrc += 4;
101 pStateCurnt += 4;
102 i--;
103 }
104 while (i > 0U);
105
106 /*
107 * Clear all accumulators
108 */
109 acc0v = 0LL;
110 acc1v = 0LL;
111 acc2v = 0LL;
112 acc3v = 0LL;
113 /*
114 * Initialize state pointer for all the samples
115 */
116 px0 = pState;
117 px1 = pState + S->M;
118 px2 = pState + 2 * S->M;
119 px3 = pState + 3 * S->M;
120 /*
121 * Initialize coeff. pointer
122 */
123 pb = pCoeffs;
124 /*
125 * Loop unrolling. Process 4 taps at a time.
126 */
127 tapCnt = numTaps >> 2;
128 /*
129 * Loop over the number of taps. Unroll by a factor of 4.
130 * Repeat until we've computed numTaps-4 coefficients.
131 */
132 while (tapCnt > 0U)
133 {
134 /*
135 * Read the b[numTaps-1] coefficient
136 */
137 c0v = vldrwq_s32(pb);
138 pb += 4;
139 /*
140 * Read x[n-numTaps-1] sample for acc0
141 */
142 x0v = vld1q(px0);
143 x1v = vld1q(px1);
144 x2v = vld1q(px2);
145 x3v = vld1q(px3);
146 px0 += 4;
147 px1 += 4;
148 px2 += 4;
149 px3 += 4;
150
151 acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
152 acc1v = vrmlaldavhaq(acc1v, x1v, c0v);
153 acc2v = vrmlaldavhaq(acc2v, x2v, c0v);
154 acc3v = vrmlaldavhaq(acc3v, x3v, c0v);
155 /*
156 * Decrement the loop counter
157 */
158 tapCnt--;
159 }
160
161 /*
162 * If the filter length is not a multiple of 4, compute the remaining filter taps
163 * should be tail predicated
164 */
165 tapCnt = numTaps % 0x4U;
166 if (tapCnt > 0U)
167 {
168 mve_pred16_t p0 = vctp32q(tapCnt);
169 /*
170 * Read the b[numTaps-1] coefficient
171 */
172 c0v = vldrwq_z_s32(pb, p0);
173 pb += 4;
174 /*
175 * Read x[n-numTaps-1] sample for acc0
176 */
177 x0v = vld1q(px0);
178 x1v = vld1q(px1);
179 x2v = vld1q(px2);
180 x3v = vld1q(px3);
181 px0 += 4;
182 px1 += 4;
183 px2 += 4;
184 px3 += 4;
185
186 acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
187 acc1v = vrmlaldavhaq(acc1v, x1v, c0v);
188 acc2v = vrmlaldavhaq(acc2v, x2v, c0v);
189 acc3v = vrmlaldavhaq(acc3v, x3v, c0v);
190 }
191
192 acc0v = asrl(acc0v, 31 - 8);
193 acc1v = asrl(acc1v, 31 - 8);
194 acc2v = asrl(acc2v, 31 - 8);
195 acc3v = asrl(acc3v, 31 - 8);
196 /*
197 * store in the destination buffer.
198 */
199 *pDst++ = (q31_t) acc0v;
200 *pDst++ = (q31_t) acc1v;
201 *pDst++ = (q31_t) acc2v;
202 *pDst++ = (q31_t) acc3v;
203
204 /*
205 * Advance the state pointer by the decimation factor
206 * to process the next group of decimation factor number samples
207 */
208 pState = pState + 4 * S->M;
209 /*
210 * Decrement the loop counter
211 */
212 blkCnt--;
213 }
214
215 while (blkCntN4 > 0U)
216 {
217 /*
218 * Copy decimation factor number of new input samples into the state buffer
219 */
220 i = S->M;
221 do
222 {
223 *pStateCurnt++ = *pSrc++;
224 }
225 while (--i);
226 /*
227 * Set accumulator to zero
228 */
229 acc0v = 0LL;
230 /*
231 * Initialize state pointer
232 */
233 px = pState;
234 /*
235 * Initialize coeff. pointer
236 */
237 pb = pCoeffs;
238 /*
239 * Loop unrolling. Process 4 taps at a time.
240 */
241 tapCnt = numTaps >> 2;
242 /*
243 * Loop over the number of taps. Unroll by a factor of 4.
244 * Repeat until we've computed numTaps-4 coefficients.
245 */
246 while (tapCnt > 0U)
247 {
248 c0v = vldrwq_s32(pb);
249 x0v = vldrwq_s32(px);
250 pb += 4;
251 px += 4;
252 acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
253 /*
254 * Decrement the loop counter
255 */
256 tapCnt--;
257 }
258 tapCnt = numTaps % 0x4U;
259 if (tapCnt > 0U)
260 {
261 mve_pred16_t p0 = vctp32q(tapCnt);
262 c0v = vldrwq_z_s32(pb, p0);
263 x0v = vldrwq_z_s32(px, p0);
264 acc0v = vrmlaldavhaq_p(acc0v, x0v, c0v, p0);
265 }
266 acc0v = asrl(acc0v, 31 - 8);
267
268 /*
269 * Advance the state pointer by the decimation factor
270 * * to process the next group of decimation factor number samples
271 */
272 pState = pState + S->M;
273 /*
274 * The result is in the accumulator, store in the destination buffer.
275 */
276 *pDst++ = (q31_t) acc0v;
277 /*
278 * Decrement the loop counter
279 */
280 blkCntN4--;
281 }
282
283 /*
284 * Processing is complete.
285 * Now copy the last numTaps - 1 samples to the start of the state buffer.
286 * This prepares the state buffer for the next function call.
287 */
288 pStateCurnt = S->pState;
289 blkCnt = (numTaps - 1) >> 2;
290 while (blkCnt > 0U)
291 {
292 vst1q(pStateCurnt, vldrwq_s32(pState));
293 pState += 4;
294 pStateCurnt += 4;
295 blkCnt--;
296 }
297 blkCnt = (numTaps - 1) & 3;
298 if (blkCnt > 0U)
299 {
300 mve_pred16_t p0 = vctp32q(blkCnt);
301 vstrwq_p_s32(pStateCurnt, vldrwq_s32(pState), p0);
302 }
303 }
304 #else
arm_fir_decimate_q31(const arm_fir_decimate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)305 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q31(
306 const arm_fir_decimate_instance_q31 * S,
307 const q31_t * pSrc,
308 q31_t * pDst,
309 uint32_t blockSize)
310 {
311 q31_t *pState = S->pState; /* State pointer */
312 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
313 q31_t *pStateCur; /* Points to the current sample of the state */
314 q31_t *px0; /* Temporary pointer for state buffer */
315 const q31_t *pb; /* Temporary pointer for coefficient buffer */
316 q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
317 q63_t acc0; /* Accumulator */
318 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
319 uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
320
321 #if defined (ARM_MATH_LOOPUNROLL)
322 q31_t *px1, *px2, *px3;
323 q31_t x1, x2, x3;
324 q63_t acc1, acc2, acc3;
325 #endif
326
327 /* S->pState buffer contains previous frame (numTaps - 1) samples */
328 /* pStateCur points to the location where the new input data should be written */
329 pStateCur = S->pState + (numTaps - 1U);
330
331 #if defined (ARM_MATH_LOOPUNROLL)
332
333 /* Loop unrolling: Compute 4 samples at a time */
334 blkCnt = outBlockSize >> 2U;
335
336 /* Samples loop unrolled by 4 */
337 while (blkCnt > 0U)
338 {
339 /* Copy 4 * decimation factor number of new input samples into the state buffer */
340 i = S->M * 4;
341
342 do
343 {
344 *pStateCur++ = *pSrc++;
345
346 } while (--i);
347
348 /* Set accumulators to zero */
349 acc0 = 0;
350 acc1 = 0;
351 acc2 = 0;
352 acc3 = 0;
353
354 /* Initialize state pointer for all the samples */
355 px0 = pState;
356 px1 = pState + S->M;
357 px2 = pState + 2 * S->M;
358 px3 = pState + 3 * S->M;
359
360 /* Initialize coeff pointer */
361 pb = pCoeffs;
362
363 /* Loop unrolling: Compute 4 taps at a time */
364 tapCnt = numTaps >> 2U;
365
366 while (tapCnt > 0U)
367 {
368 /* Read the b[numTaps-1] coefficient */
369 c0 = *(pb++);
370
371 /* Read x[n-numTaps-1] sample for acc0 */
372 x0 = *(px0++);
373 /* Read x[n-numTaps-1] sample for acc1 */
374 x1 = *(px1++);
375 /* Read x[n-numTaps-1] sample for acc2 */
376 x2 = *(px2++);
377 /* Read x[n-numTaps-1] sample for acc3 */
378 x3 = *(px3++);
379
380 /* Perform the multiply-accumulate */
381 acc0 += (q63_t) x0 * c0;
382 acc1 += (q63_t) x1 * c0;
383 acc2 += (q63_t) x2 * c0;
384 acc3 += (q63_t) x3 * c0;
385
386 /* Read the b[numTaps-2] coefficient */
387 c0 = *(pb++);
388
389 /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
390 x0 = *(px0++);
391 x1 = *(px1++);
392 x2 = *(px2++);
393 x3 = *(px3++);
394
395 /* Perform the multiply-accumulate */
396 acc0 += (q63_t) x0 * c0;
397 acc1 += (q63_t) x1 * c0;
398 acc2 += (q63_t) x2 * c0;
399 acc3 += (q63_t) x3 * c0;
400
401 /* Read the b[numTaps-3] coefficient */
402 c0 = *(pb++);
403
404 /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
405 x0 = *(px0++);
406 x1 = *(px1++);
407 x2 = *(px2++);
408 x3 = *(px3++);
409
410 /* Perform the multiply-accumulate */
411 acc0 += (q63_t) x0 * c0;
412 acc1 += (q63_t) x1 * c0;
413 acc2 += (q63_t) x2 * c0;
414 acc3 += (q63_t) x3 * c0;
415
416 /* Read the b[numTaps-4] coefficient */
417 c0 = *(pb++);
418
419 /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
420 x0 = *(px0++);
421 x1 = *(px1++);
422 x2 = *(px2++);
423 x3 = *(px3++);
424
425 /* Perform the multiply-accumulate */
426 acc0 += (q63_t) x0 * c0;
427 acc1 += (q63_t) x1 * c0;
428 acc2 += (q63_t) x2 * c0;
429 acc3 += (q63_t) x3 * c0;
430
431 /* Decrement loop counter */
432 tapCnt--;
433 }
434
435 /* Loop unrolling: Compute remaining taps */
436 tapCnt = numTaps % 0x4U;
437
438 while (tapCnt > 0U)
439 {
440 /* Read coefficients */
441 c0 = *(pb++);
442
443 /* Fetch state variables for acc0, acc1, acc2, acc3 */
444 x0 = *(px0++);
445 x1 = *(px1++);
446 x2 = *(px2++);
447 x3 = *(px3++);
448
449 /* Perform the multiply-accumulate */
450 acc0 += (q63_t) x0 * c0;
451 acc1 += (q63_t) x1 * c0;
452 acc2 += (q63_t) x2 * c0;
453 acc3 += (q63_t) x3 * c0;
454
455 /* Decrement loop counter */
456 tapCnt--;
457 }
458
459 /* Advance the state pointer by the decimation factor
460 * to process the next group of decimation factor number samples */
461 pState = pState + S->M * 4;
462
463 /* The result is in the accumulator, store in the destination buffer. */
464 *pDst++ = (q31_t) (acc0 >> 31);
465 *pDst++ = (q31_t) (acc1 >> 31);
466 *pDst++ = (q31_t) (acc2 >> 31);
467 *pDst++ = (q31_t) (acc3 >> 31);
468
469 /* Decrement loop counter */
470 blkCnt--;
471 }
472
473 /* Loop unrolling: Compute remaining samples */
474 blkCnt = outBlockSize % 0x4U;
475
476 #else
477
478 /* Initialize blkCnt with number of samples */
479 blkCnt = outBlockSize;
480
481 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
482
483 while (blkCnt > 0U)
484 {
485 /* Copy decimation factor number of new input samples into the state buffer */
486 i = S->M;
487
488 do
489 {
490 *pStateCur++ = *pSrc++;
491
492 } while (--i);
493
494 /* Set accumulator to zero */
495 acc0 = 0;
496
497 /* Initialize state pointer */
498 px0 = pState;
499
500 /* Initialize coeff pointer */
501 pb = pCoeffs;
502
503 #if defined (ARM_MATH_LOOPUNROLL)
504
505 /* Loop unrolling: Compute 4 taps at a time */
506 tapCnt = numTaps >> 2U;
507
508 while (tapCnt > 0U)
509 {
510 /* Read the b[numTaps-1] coefficient */
511 c0 = *pb++;
512
513 /* Read x[n-numTaps-1] sample */
514 x0 = *px0++;
515
516 /* Perform the multiply-accumulate */
517 acc0 += (q63_t) x0 * c0;
518
519 /* Read the b[numTaps-2] coefficient */
520 c0 = *pb++;
521
522 /* Read x[n-numTaps-2] sample */
523 x0 = *px0++;
524
525 /* Perform the multiply-accumulate */
526 acc0 += (q63_t) x0 * c0;
527
528 /* Read the b[numTaps-3] coefficient */
529 c0 = *pb++;
530
531 /* Read x[n-numTaps-3] sample */
532 x0 = *px0++;
533
534 /* Perform the multiply-accumulate */
535 acc0 += (q63_t) x0 * c0;
536
537 /* Read the b[numTaps-4] coefficient */
538 c0 = *pb++;
539
540 /* Read x[n-numTaps-4] sample */
541 x0 = *px0++;
542
543 /* Perform the multiply-accumulate */
544 acc0 += (q63_t) x0 * c0;
545
546 /* Decrement loop counter */
547 tapCnt--;
548 }
549
550 /* Loop unrolling: Compute remaining taps */
551 tapCnt = numTaps % 0x4U;
552
553 #else
554
555 /* Initialize tapCnt with number of taps */
556 tapCnt = numTaps;
557
558 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
559
560 while (tapCnt > 0U)
561 {
562 /* Read coefficients */
563 c0 = *pb++;
564
565 /* Fetch 1 state variable */
566 x0 = *px0++;
567
568 /* Perform the multiply-accumulate */
569 acc0 += (q63_t) x0 * c0;
570
571 /* Decrement loop counter */
572 tapCnt--;
573 }
574
575 /* Advance the state pointer by the decimation factor
576 * to process the next group of decimation factor number samples */
577 pState = pState + S->M;
578
579 /* The result is in the accumulator, store in the destination buffer. */
580 *pDst++ = (q31_t) (acc0 >> 31);
581
582 /* Decrement loop counter */
583 blkCnt--;
584 }
585
586 /* Processing is complete.
587 Now copy the last numTaps - 1 samples to the satrt of the state buffer.
588 This prepares the state buffer for the next function call. */
589
590 /* Points to the start of the state buffer */
591 pStateCur = S->pState;
592
593 #if defined (ARM_MATH_LOOPUNROLL)
594
595 /* Loop unrolling: Compute 4 taps at a time */
596 tapCnt = (numTaps - 1U) >> 2U;
597
598 /* Copy data */
599 while (tapCnt > 0U)
600 {
601 *pStateCur++ = *pState++;
602 *pStateCur++ = *pState++;
603 *pStateCur++ = *pState++;
604 *pStateCur++ = *pState++;
605
606 /* Decrement loop counter */
607 tapCnt--;
608 }
609
610 /* Loop unrolling: Compute remaining taps */
611 tapCnt = (numTaps - 1U) % 0x04U;
612
613 #else
614
615 /* Initialize tapCnt with number of taps */
616 tapCnt = (numTaps - 1U);
617
618 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
619
620 /* Copy data */
621 while (tapCnt > 0U)
622 {
623 *pStateCur++ = *pState++;
624
625 /* Decrement loop counter */
626 tapCnt--;
627 }
628
629 }
630 #endif /* defined(ARM_MATH_MVEI) */
631 /**
632 @} end of FIR_decimate group
633 */
634