1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_decimate_fast_q31.c
4  * Description:  Fast Q31 FIR Decimator
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_decimate
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q31 FIR decimator (fast variant).
42   @param[in]     S          points to an instance of the Q31 FIR decimator structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    This function is optimized for speed at the expense of fixed-point precision and overflow protection.
49                    The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
50                    These intermediate results are added to a 2.30 accumulator.
51                    Finally, the accumulator is saturated and converted to a 1.31 result.
52                    The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
53                    In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
54 
55   @remark
56                    Refer to \ref arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
57                    Both the slow and the fast versions use the same instance structure.
58                    Use function \ref arm_fir_decimate_init_q31() to initialize the filter structure.
59  */
60 
arm_fir_decimate_fast_q31(const arm_fir_decimate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)61 void arm_fir_decimate_fast_q31(
62   const arm_fir_decimate_instance_q31 * S,
63   const q31_t * pSrc,
64         q31_t * pDst,
65         uint32_t blockSize)
66 {
67         q31_t *pState = S->pState;                     /* State pointer */
68   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
69         q31_t *pStateCur;                              /* Points to the current sample of the state */
70         q31_t *px0;                                    /* Temporary pointer for state buffer */
71   const q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
72         q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
73         q63_t acc0;                                    /* Accumulator */
74         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
75         uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
76 
77 #if defined (ARM_MATH_LOOPUNROLL)
78         q31_t *px1, *px2, *px3;
79         q31_t x1, x2, x3;
80         q63_t acc1, acc2, acc3;
81 #endif
82 
83   /* S->pState buffer contains previous frame (numTaps - 1) samples */
84   /* pStateCur points to the location where the new input data should be written */
85   pStateCur = S->pState + (numTaps - 1U);
86 
87 #if defined (ARM_MATH_LOOPUNROLL)
88 
89     /* Loop unrolling: Compute 4 samples at a time */
90   blkCnt = outBlockSize >> 2U;
91 
92   /* Samples loop unrolled by 4 */
93   while (blkCnt > 0U)
94   {
95     /* Copy 4 * decimation factor number of new input samples into the state buffer */
96     i = S->M * 4;
97 
98     do
99     {
100       *pStateCur++ = *pSrc++;
101 
102     } while (--i);
103 
104     /* Set accumulators to zero */
105     acc0 = 0;
106     acc1 = 0;
107     acc2 = 0;
108     acc3 = 0;
109 
110     /* Initialize state pointer for all the samples */
111     px0 = pState;
112     px1 = pState + S->M;
113     px2 = pState + 2 * S->M;
114     px3 = pState + 3 * S->M;
115 
116     /* Initialize coeff pointer */
117     pb = pCoeffs;
118 
119     /* Loop unrolling: Compute 4 taps at a time */
120     tapCnt = numTaps >> 2U;
121 
122     while (tapCnt > 0U)
123     {
124       /* Read the b[numTaps-1] coefficient */
125       c0 = *(pb++);
126 
127       /* Read x[n-numTaps-1] sample for acc0 */
128       x0 = *(px0++);
129       /* Read x[n-numTaps-1] sample for acc1 */
130       x1 = *(px1++);
131       /* Read x[n-numTaps-1] sample for acc2 */
132       x2 = *(px2++);
133       /* Read x[n-numTaps-1] sample for acc3 */
134       x3 = *(px3++);
135 
136       /* Perform the multiply-accumulate */
137       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
138       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
139       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
140       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
141 
142       /* Read the b[numTaps-2] coefficient */
143       c0 = *(pb++);
144 
145       /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
146       x0 = *(px0++);
147       x1 = *(px1++);
148       x2 = *(px2++);
149       x3 = *(px3++);
150 
151       /* Perform the multiply-accumulate */
152       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
153       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
154       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
155       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
156 
157       /* Read the b[numTaps-3] coefficient */
158       c0 = *(pb++);
159 
160       /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
161       x0 = *(px0++);
162       x1 = *(px1++);
163       x2 = *(px2++);
164       x3 = *(px3++);
165 
166       /* Perform the multiply-accumulate */
167       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
168       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
169       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
170       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
171 
172       /* Read the b[numTaps-4] coefficient */
173       c0 = *(pb++);
174 
175       /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
176       x0 = *(px0++);
177       x1 = *(px1++);
178       x2 = *(px2++);
179       x3 = *(px3++);
180 
181       /* Perform the multiply-accumulate */
182       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
183       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
184       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
185       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
186 
187       /* Decrement loop counter */
188       tapCnt--;
189     }
190 
191     /* Loop unrolling: Compute remaining taps */
192     tapCnt = numTaps % 0x4U;
193 
194     while (tapCnt > 0U)
195     {
196       /* Read coefficients */
197       c0 = *(pb++);
198 
199       /* Fetch state variables for acc0, acc1, acc2, acc3 */
200       x0 = *(px0++);
201       x1 = *(px1++);
202       x2 = *(px2++);
203       x3 = *(px3++);
204 
205       /* Perform the multiply-accumulate */
206       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
207       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
208       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
209       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
210 
211       /* Decrement loop counter */
212       tapCnt--;
213     }
214 
215     /* Advance the state pointer by the decimation factor
216      * to process the next group of decimation factor number samples */
217     pState = pState + S->M * 4;
218 
219     /* The result is in the accumulator, store in the destination buffer. */
220     *pDst++ = (q31_t) (acc0 << 1);
221     *pDst++ = (q31_t) (acc1 << 1);
222     *pDst++ = (q31_t) (acc2 << 1);
223     *pDst++ = (q31_t) (acc3 << 1);
224 
225     /* Decrement loop counter */
226     blkCnt--;
227   }
228 
229   /* Loop unrolling: Compute remaining samples */
230   blkCnt = outBlockSize % 0x4U;
231 
232 #else
233 
234   /* Initialize blkCnt with number of samples */
235   blkCnt = outBlockSize;
236 
237 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
238 
239   while (blkCnt > 0U)
240   {
241     /* Copy decimation factor number of new input samples into the state buffer */
242     i = S->M;
243 
244     do
245     {
246       *pStateCur++ = *pSrc++;
247 
248     } while (--i);
249 
250     /* Set accumulator to zero */
251     acc0 = 0;
252 
253     /* Initialize state pointer */
254     px0 = pState;
255 
256     /* Initialize coeff pointer */
257     pb = pCoeffs;
258 
259 #if defined (ARM_MATH_LOOPUNROLL)
260 
261     /* Loop unrolling: Compute 4 taps at a time */
262     tapCnt = numTaps >> 2U;
263 
264     while (tapCnt > 0U)
265     {
266       /* Read the b[numTaps-1] coefficient */
267       c0 = *pb++;
268 
269       /* Read x[n-numTaps-1] sample */
270       x0 = *px0++;
271 
272       /* Perform the multiply-accumulate */
273       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
274 
275       /* Read the b[numTaps-2] coefficient */
276       c0 = *pb++;
277 
278       /* Read x[n-numTaps-2] sample */
279       x0 = *px0++;
280 
281       /* Perform the multiply-accumulate */
282       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
283 
284       /* Read the b[numTaps-3] coefficient */
285       c0 = *pb++;
286 
287       /* Read x[n-numTaps-3] sample */
288       x0 = *px0++;
289 
290       /* Perform the multiply-accumulate */
291       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
292 
293       /* Read the b[numTaps-4] coefficient */
294       c0 = *pb++;
295 
296       /* Read x[n-numTaps-4] sample */
297       x0 = *px0++;
298 
299       /* Perform the multiply-accumulate */
300       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
301 
302       /* Decrement loop counter */
303       tapCnt--;
304     }
305 
306     /* Loop unrolling: Compute remaining taps */
307     tapCnt = numTaps % 0x4U;
308 
309 #else
310 
311     /* Initialize tapCnt with number of taps */
312     tapCnt = numTaps;
313 
314 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
315 
316     while (tapCnt > 0U)
317     {
318       /* Read coefficients */
319       c0 = *pb++;
320 
321       /* Fetch 1 state variable */
322       x0 = *px0++;
323 
324       /* Perform the multiply-accumulate */
325       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
326 
327       /* Decrement loop counter */
328       tapCnt--;
329     }
330 
331     /* Advance the state pointer by the decimation factor
332      * to process the next group of decimation factor number samples */
333     pState = pState + S->M;
334 
335     /* The result is in the accumulator, store in the destination buffer. */
336     *pDst++ = (q31_t) (acc0 << 1);
337 
338     /* Decrement loop counter */
339     blkCnt--;
340   }
341 
342   /* Processing is complete.
343      Now copy the last numTaps - 1 samples to the satrt of the state buffer.
344      This prepares the state buffer for the next function call. */
345 
346   /* Points to the start of the state buffer */
347   pStateCur = S->pState;
348 
349 #if defined (ARM_MATH_LOOPUNROLL)
350 
351   /* Loop unrolling: Compute 4 taps at a time */
352   tapCnt = (numTaps - 1U) >> 2U;
353 
354   /* Copy data */
355   while (tapCnt > 0U)
356   {
357     *pStateCur++ = *pState++;
358     *pStateCur++ = *pState++;
359     *pStateCur++ = *pState++;
360     *pStateCur++ = *pState++;
361 
362     /* Decrement loop counter */
363     tapCnt--;
364   }
365 
366   /* Loop unrolling: Compute remaining taps */
367   tapCnt = (numTaps - 1U) % 0x04U;
368 
369 #else
370 
371   /* Initialize tapCnt with number of taps */
372   tapCnt = (numTaps - 1U);
373 
374 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
375 
376   /* Copy data */
377   while (tapCnt > 0U)
378   {
379     *pStateCur++ = *pState++;
380 
381     /* Decrement loop counter */
382     tapCnt--;
383   }
384 
385 }
386 
387 /**
388   @} end of FIR_decimate group
389  */
390