1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_decimate_q31.c
4  * Description:  Q31 FIR Decimator
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_decimate
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q31 FIR decimator.
42   @param[in]     S          points to an instance of the Q31 FIR decimator structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of input samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    The function is implemented using an internal 64-bit accumulator.
49                    The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50                    Thus, if the accumulator result overflows it wraps around rather than clip.
51                    In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
52                    After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
53 
54  @remark
55                    Refer to \ref arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function.
56  */
57 
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59 
60 #include "arm_helium_utils.h"
61 
arm_fir_decimate_q31(const arm_fir_decimate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)62 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q31(
63   const arm_fir_decimate_instance_q31 * S,
64   const q31_t * pSrc,
65         q31_t * pDst,
66         uint32_t blockSize)
67 {
68     q31_t    *pState = S->pState;   /* State pointer */
69     const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
70     q31_t    *pStateCurnt;      /* Points to the current sample of the state */
71     const q31_t    *px, *pb;          /* Temporary pointers for state and coefficient buffers */
72     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
73     uint32_t  i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;   /* Loop counters */
74     uint32_t  blkCntN4;
75     const q31_t    *px0, *px1, *px2, *px3;
76     q63_t     acc0v, acc1v, acc2v, acc3v;
77     q31x4_t x0v, x1v, x2v, x3v;
78     q31x4_t c0v;
79 
80     /*
81      * S->pState buffer contains previous frame (numTaps - 1) samples
82      * pStateCurnt points to the location where the new input data should be written
83      */
84     pStateCurnt = S->pState + (numTaps - 1U);
85     /*
86      * Total number of output samples to be computed
87      */
88     blkCnt = outBlockSize / 4;
89     blkCntN4 = outBlockSize - (4 * blkCnt);
90 
91     while (blkCnt > 0U)
92     {
93         /*
94          * Copy 4 * decimation factor number of new input samples into the state buffer
95          */
96         i = (4 * S->M) >> 2;
97         do
98         {
99             vst1q(pStateCurnt, vldrwq_s32(pSrc));
100             pSrc += 4;
101             pStateCurnt += 4;
102             i--;
103         }
104         while (i > 0U);
105 
106         /*
107          * Clear all accumulators
108          */
109         acc0v = 0LL;
110         acc1v = 0LL;
111         acc2v = 0LL;
112         acc3v = 0LL;
113         /*
114          * Initialize state pointer for all the samples
115          */
116         px0 = pState;
117         px1 = pState + S->M;
118         px2 = pState + 2 * S->M;
119         px3 = pState + 3 * S->M;
120         /*
121          * Initialize coeff. pointer
122          */
123         pb = pCoeffs;
124         /*
125          * Loop unrolling.  Process 4 taps at a time.
126          */
127         tapCnt = numTaps >> 2;
128         /*
129          * Loop over the number of taps.  Unroll by a factor of 4.
130          * Repeat until we've computed numTaps-4 coefficients.
131          */
132         while (tapCnt > 0U)
133         {
134             /*
135              * Read the b[numTaps-1] coefficient
136              */
137             c0v = vldrwq_s32(pb);
138             pb += 4;
139             /*
140              * Read x[n-numTaps-1] sample for acc0
141              */
142             x0v = vld1q(px0);
143             x1v = vld1q(px1);
144             x2v = vld1q(px2);
145             x3v = vld1q(px3);
146             px0 += 4;
147             px1 += 4;
148             px2 += 4;
149             px3 += 4;
150 
151             acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
152             acc1v = vrmlaldavhaq(acc1v, x1v, c0v);
153             acc2v = vrmlaldavhaq(acc2v, x2v, c0v);
154             acc3v = vrmlaldavhaq(acc3v, x3v, c0v);
155             /*
156              * Decrement the loop counter
157              */
158             tapCnt--;
159         }
160 
161         /*
162          * If the filter length is not a multiple of 4, compute the remaining filter taps
163          * should be tail predicated
164          */
165         tapCnt = numTaps % 0x4U;
166         if (tapCnt > 0U)
167         {
168             mve_pred16_t p0 = vctp32q(tapCnt);
169             /*
170              * Read the b[numTaps-1] coefficient
171              */
172             c0v = vldrwq_z_s32(pb, p0);
173             pb += 4;
174             /*
175              * Read x[n-numTaps-1] sample for acc0
176              */
177             x0v = vld1q(px0);
178             x1v = vld1q(px1);
179             x2v = vld1q(px2);
180             x3v = vld1q(px3);
181             px0 += 4;
182             px1 += 4;
183             px2 += 4;
184             px3 += 4;
185 
186             acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
187             acc1v = vrmlaldavhaq(acc1v, x1v, c0v);
188             acc2v = vrmlaldavhaq(acc2v, x2v, c0v);
189             acc3v = vrmlaldavhaq(acc3v, x3v, c0v);
190         }
191 
192         acc0v = asrl(acc0v, 31 - 8);
193         acc1v = asrl(acc1v, 31 - 8);
194         acc2v = asrl(acc2v, 31 - 8);
195         acc3v = asrl(acc3v, 31 - 8);
196         /*
197          * store in the destination buffer.
198          */
199         *pDst++ = (q31_t) acc0v;
200         *pDst++ = (q31_t) acc1v;
201         *pDst++ = (q31_t) acc2v;
202         *pDst++ = (q31_t) acc3v;
203 
204         /*
205          * Advance the state pointer by the decimation factor
206          * to process the next group of decimation factor number samples
207          */
208         pState = pState + 4 * S->M;
209         /*
210          * Decrement the loop counter
211          */
212         blkCnt--;
213     }
214 
215     while (blkCntN4 > 0U)
216     {
217         /*
218          * Copy decimation factor number of new input samples into the state buffer
219          */
220         i = S->M;
221         do
222         {
223             *pStateCurnt++ = *pSrc++;
224         }
225         while (--i);
226         /*
227          * Set accumulator to zero
228          */
229         acc0v = 0LL;
230         /*
231          * Initialize state pointer
232          */
233         px = pState;
234         /*
235          * Initialize coeff. pointer
236          */
237         pb = pCoeffs;
238         /*
239          * Loop unrolling.  Process 4 taps at a time.
240          */
241         tapCnt = numTaps >> 2;
242         /*
243          * Loop over the number of taps.  Unroll by a factor of 4.
244          * Repeat until we've computed numTaps-4 coefficients.
245          */
246         while (tapCnt > 0U)
247         {
248             c0v = vldrwq_s32(pb);
249             x0v = vldrwq_s32(px);
250             pb += 4;
251             px += 4;
252             acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
253             /*
254              * Decrement the loop counter
255              */
256             tapCnt--;
257         }
258         tapCnt = numTaps % 0x4U;
259         if (tapCnt > 0U)
260         {
261             mve_pred16_t p0 = vctp32q(tapCnt);
262             c0v = vldrwq_z_s32(pb, p0);
263             x0v = vldrwq_z_s32(px, p0);
264             acc0v = vrmlaldavhaq_p(acc0v, x0v, c0v, p0);
265         }
266         acc0v = asrl(acc0v, 31 - 8);
267 
268         /*
269          * Advance the state pointer by the decimation factor
270          * * to process the next group of decimation factor number samples
271          */
272         pState = pState + S->M;
273         /*
274          * The result is in the accumulator, store in the destination buffer.
275          */
276         *pDst++ = (q31_t) acc0v;
277         /*
278          * Decrement the loop counter
279          */
280         blkCntN4--;
281     }
282 
283     /*
284      * Processing is complete.
285      * Now copy the last numTaps - 1 samples to the start of the state buffer.
286      * This prepares the state buffer for the next function call.
287      */
288     pStateCurnt = S->pState;
289     blkCnt = (numTaps - 1) >> 2;
290     while (blkCnt > 0U)
291     {
292         vst1q(pStateCurnt, vldrwq_s32(pState));
293         pState += 4;
294         pStateCurnt += 4;
295         blkCnt--;
296     }
297     blkCnt = (numTaps - 1) & 3;
298     if (blkCnt > 0U)
299     {
300         mve_pred16_t p0 = vctp32q(blkCnt);
301         vstrwq_p_s32(pStateCurnt, vldrwq_s32(pState), p0);
302     }
303 }
304 #else
arm_fir_decimate_q31(const arm_fir_decimate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)305 ARM_DSP_ATTRIBUTE void arm_fir_decimate_q31(
306   const arm_fir_decimate_instance_q31 * S,
307   const q31_t * pSrc,
308         q31_t * pDst,
309         uint32_t blockSize)
310 {
311         q31_t *pState = S->pState;                     /* State pointer */
312   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
313         q31_t *pStateCur;                              /* Points to the current sample of the state */
314         q31_t *px0;                                    /* Temporary pointer for state buffer */
315   const q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
316         q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
317         q63_t acc0;                                    /* Accumulator */
318         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
319         uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
320 
321 #if defined (ARM_MATH_LOOPUNROLL)
322         q31_t *px1, *px2, *px3;
323         q31_t x1, x2, x3;
324         q63_t acc1, acc2, acc3;
325 #endif
326 
327   /* S->pState buffer contains previous frame (numTaps - 1) samples */
328   /* pStateCur points to the location where the new input data should be written */
329   pStateCur = S->pState + (numTaps - 1U);
330 
331 #if defined (ARM_MATH_LOOPUNROLL)
332 
333     /* Loop unrolling: Compute 4 samples at a time */
334   blkCnt = outBlockSize >> 2U;
335 
336   /* Samples loop unrolled by 4 */
337   while (blkCnt > 0U)
338   {
339     /* Copy 4 * decimation factor number of new input samples into the state buffer */
340     i = S->M * 4;
341 
342     do
343     {
344       *pStateCur++ = *pSrc++;
345 
346     } while (--i);
347 
348     /* Set accumulators to zero */
349     acc0 = 0;
350     acc1 = 0;
351     acc2 = 0;
352     acc3 = 0;
353 
354     /* Initialize state pointer for all the samples */
355     px0 = pState;
356     px1 = pState + S->M;
357     px2 = pState + 2 * S->M;
358     px3 = pState + 3 * S->M;
359 
360     /* Initialize coeff pointer */
361     pb = pCoeffs;
362 
363     /* Loop unrolling: Compute 4 taps at a time */
364     tapCnt = numTaps >> 2U;
365 
366     while (tapCnt > 0U)
367     {
368       /* Read the b[numTaps-1] coefficient */
369       c0 = *(pb++);
370 
371       /* Read x[n-numTaps-1] sample for acc0 */
372       x0 = *(px0++);
373       /* Read x[n-numTaps-1] sample for acc1 */
374       x1 = *(px1++);
375       /* Read x[n-numTaps-1] sample for acc2 */
376       x2 = *(px2++);
377       /* Read x[n-numTaps-1] sample for acc3 */
378       x3 = *(px3++);
379 
380       /* Perform the multiply-accumulate */
381       acc0 += (q63_t) x0 * c0;
382       acc1 += (q63_t) x1 * c0;
383       acc2 += (q63_t) x2 * c0;
384       acc3 += (q63_t) x3 * c0;
385 
386       /* Read the b[numTaps-2] coefficient */
387       c0 = *(pb++);
388 
389       /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
390       x0 = *(px0++);
391       x1 = *(px1++);
392       x2 = *(px2++);
393       x3 = *(px3++);
394 
395       /* Perform the multiply-accumulate */
396       acc0 += (q63_t) x0 * c0;
397       acc1 += (q63_t) x1 * c0;
398       acc2 += (q63_t) x2 * c0;
399       acc3 += (q63_t) x3 * c0;
400 
401       /* Read the b[numTaps-3] coefficient */
402       c0 = *(pb++);
403 
404       /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
405       x0 = *(px0++);
406       x1 = *(px1++);
407       x2 = *(px2++);
408       x3 = *(px3++);
409 
410       /* Perform the multiply-accumulate */
411       acc0 += (q63_t) x0 * c0;
412       acc1 += (q63_t) x1 * c0;
413       acc2 += (q63_t) x2 * c0;
414       acc3 += (q63_t) x3 * c0;
415 
416       /* Read the b[numTaps-4] coefficient */
417       c0 = *(pb++);
418 
419       /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
420       x0 = *(px0++);
421       x1 = *(px1++);
422       x2 = *(px2++);
423       x3 = *(px3++);
424 
425       /* Perform the multiply-accumulate */
426       acc0 += (q63_t) x0 * c0;
427       acc1 += (q63_t) x1 * c0;
428       acc2 += (q63_t) x2 * c0;
429       acc3 += (q63_t) x3 * c0;
430 
431       /* Decrement loop counter */
432       tapCnt--;
433     }
434 
435     /* Loop unrolling: Compute remaining taps */
436     tapCnt = numTaps % 0x4U;
437 
438     while (tapCnt > 0U)
439     {
440       /* Read coefficients */
441       c0 = *(pb++);
442 
443       /* Fetch state variables for acc0, acc1, acc2, acc3 */
444       x0 = *(px0++);
445       x1 = *(px1++);
446       x2 = *(px2++);
447       x3 = *(px3++);
448 
449       /* Perform the multiply-accumulate */
450       acc0 += (q63_t) x0 * c0;
451       acc1 += (q63_t) x1 * c0;
452       acc2 += (q63_t) x2 * c0;
453       acc3 += (q63_t) x3 * c0;
454 
455       /* Decrement loop counter */
456       tapCnt--;
457     }
458 
459     /* Advance the state pointer by the decimation factor
460      * to process the next group of decimation factor number samples */
461     pState = pState + S->M * 4;
462 
463     /* The result is in the accumulator, store in the destination buffer. */
464     *pDst++ = (q31_t) (acc0 >> 31);
465     *pDst++ = (q31_t) (acc1 >> 31);
466     *pDst++ = (q31_t) (acc2 >> 31);
467     *pDst++ = (q31_t) (acc3 >> 31);
468 
469     /* Decrement loop counter */
470     blkCnt--;
471   }
472 
473   /* Loop unrolling: Compute remaining samples */
474   blkCnt = outBlockSize % 0x4U;
475 
476 #else
477 
478   /* Initialize blkCnt with number of samples */
479   blkCnt = outBlockSize;
480 
481 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
482 
483   while (blkCnt > 0U)
484   {
485     /* Copy decimation factor number of new input samples into the state buffer */
486     i = S->M;
487 
488     do
489     {
490       *pStateCur++ = *pSrc++;
491 
492     } while (--i);
493 
494     /* Set accumulator to zero */
495     acc0 = 0;
496 
497     /* Initialize state pointer */
498     px0 = pState;
499 
500     /* Initialize coeff pointer */
501     pb = pCoeffs;
502 
503 #if defined (ARM_MATH_LOOPUNROLL)
504 
505     /* Loop unrolling: Compute 4 taps at a time */
506     tapCnt = numTaps >> 2U;
507 
508     while (tapCnt > 0U)
509     {
510       /* Read the b[numTaps-1] coefficient */
511       c0 = *pb++;
512 
513       /* Read x[n-numTaps-1] sample */
514       x0 = *px0++;
515 
516       /* Perform the multiply-accumulate */
517       acc0 += (q63_t) x0 * c0;
518 
519       /* Read the b[numTaps-2] coefficient */
520       c0 = *pb++;
521 
522       /* Read x[n-numTaps-2] sample */
523       x0 = *px0++;
524 
525       /* Perform the multiply-accumulate */
526       acc0 += (q63_t) x0 * c0;
527 
528       /* Read the b[numTaps-3] coefficient */
529       c0 = *pb++;
530 
531       /* Read x[n-numTaps-3] sample */
532       x0 = *px0++;
533 
534       /* Perform the multiply-accumulate */
535       acc0 += (q63_t) x0 * c0;
536 
537       /* Read the b[numTaps-4] coefficient */
538       c0 = *pb++;
539 
540       /* Read x[n-numTaps-4] sample */
541       x0 = *px0++;
542 
543       /* Perform the multiply-accumulate */
544       acc0 += (q63_t) x0 * c0;
545 
546       /* Decrement loop counter */
547       tapCnt--;
548     }
549 
550     /* Loop unrolling: Compute remaining taps */
551     tapCnt = numTaps % 0x4U;
552 
553 #else
554 
555     /* Initialize tapCnt with number of taps */
556     tapCnt = numTaps;
557 
558 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
559 
560     while (tapCnt > 0U)
561     {
562       /* Read coefficients */
563       c0 = *pb++;
564 
565       /* Fetch 1 state variable */
566       x0 = *px0++;
567 
568       /* Perform the multiply-accumulate */
569       acc0 += (q63_t) x0 * c0;
570 
571       /* Decrement loop counter */
572       tapCnt--;
573     }
574 
575     /* Advance the state pointer by the decimation factor
576      * to process the next group of decimation factor number samples */
577     pState = pState + S->M;
578 
579     /* The result is in the accumulator, store in the destination buffer. */
580     *pDst++ = (q31_t) (acc0 >> 31);
581 
582     /* Decrement loop counter */
583     blkCnt--;
584   }
585 
586   /* Processing is complete.
587      Now copy the last numTaps - 1 samples to the satrt of the state buffer.
588      This prepares the state buffer for the next function call. */
589 
590   /* Points to the start of the state buffer */
591   pStateCur = S->pState;
592 
593 #if defined (ARM_MATH_LOOPUNROLL)
594 
595   /* Loop unrolling: Compute 4 taps at a time */
596   tapCnt = (numTaps - 1U) >> 2U;
597 
598   /* Copy data */
599   while (tapCnt > 0U)
600   {
601     *pStateCur++ = *pState++;
602     *pStateCur++ = *pState++;
603     *pStateCur++ = *pState++;
604     *pStateCur++ = *pState++;
605 
606     /* Decrement loop counter */
607     tapCnt--;
608   }
609 
610   /* Loop unrolling: Compute remaining taps */
611   tapCnt = (numTaps - 1U) % 0x04U;
612 
613 #else
614 
615   /* Initialize tapCnt with number of taps */
616   tapCnt = (numTaps - 1U);
617 
618 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
619 
620   /* Copy data */
621   while (tapCnt > 0U)
622   {
623     *pStateCur++ = *pState++;
624 
625     /* Decrement loop counter */
626     tapCnt--;
627   }
628 
629 }
630 #endif /* defined(ARM_MATH_MVEI) */
631 /**
632   @} end of FIR_decimate group
633  */
634