1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_fast_q31.c
4  * Description:  Processing function for the Q31 Fast FIR filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q31 FIR filter (fast version).
42   @param[in]     S          points to an instance of the Q31 structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    This function is optimized for speed at the expense of fixed-point precision and overflow protection.
49                    The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
50                    These intermediate results are added to a 2.30 accumulator.
51                    Finally, the accumulator is saturated and converted to a 1.31 result.
52                    The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
53                    In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
54 
55  @remark
56                    Refer to \ref arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.  Both the slow and the fast versions use the same instance structure.
57                    Use function \ref arm_fir_init_q31() to initialize the filter structure.
58  */
59 
arm_fir_fast_q31(const arm_fir_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)60 void arm_fir_fast_q31(
61   const arm_fir_instance_q31 * S,
62   const q31_t * pSrc,
63         q31_t * pDst,
64         uint32_t blockSize)
65 {
66         q31_t *pState = S->pState;                     /* State pointer */
67   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
68         q31_t *pStateCurnt;                            /* Points to the current sample of the state */
69         q31_t *px;                                     /* Temporary pointer for state buffer */
70   const q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
71         q31_t acc0;                                    /* Accumulators */
72         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
73         uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
74 
75 #if defined (ARM_MATH_LOOPUNROLL)
76         q31_t acc1, acc2, acc3;                        /* Accumulators */
77         q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
78 #endif
79 
80   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
81   /* pStateCurnt points to the location where the new input data should be written */
82   pStateCurnt = &(S->pState[(numTaps - 1U)]);
83 
84 #if defined (ARM_MATH_LOOPUNROLL)
85 
86   /* Loop unrolling: Compute 4 output values simultaneously.
87    * The variables acc0 ... acc3 hold output values that are being computed:
88    *
89    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
90    *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
91    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
92    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
93    */
94   blkCnt = blockSize >> 2U;
95 
96   while (blkCnt > 0U)
97   {
98     /* Copy 4 new input samples into the state buffer. */
99     *pStateCurnt++ = *pSrc++;
100     *pStateCurnt++ = *pSrc++;
101     *pStateCurnt++ = *pSrc++;
102     *pStateCurnt++ = *pSrc++;
103 
104     /* Set all accumulators to zero */
105     acc0 = 0;
106     acc1 = 0;
107     acc2 = 0;
108     acc3 = 0;
109 
110     /* Initialize state pointer */
111     px = pState;
112 
113     /* Initialize coefficient pointer */
114     pb = pCoeffs;
115 
116     /* Read the first 3 samples from the state buffer:
117      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
118     x0 = *px++;
119     x1 = *px++;
120     x2 = *px++;
121 
122     /* Loop unrolling. Process 4 taps at a time. */
123     tapCnt = numTaps >> 2U;
124 
125     /* Loop over the number of taps.  Unroll by a factor of 4.
126        Repeat until we've computed numTaps-4 coefficients. */
127     while (tapCnt > 0U)
128     {
129       /* Read the b[numTaps] coefficient */
130       c0 = *pb;
131 
132       /* Read x[n-numTaps-3] sample */
133       x3 = *px;
134 
135       /* acc0 +=  b[numTaps] * x[n-numTaps] */
136       multAcc_32x32_keep32_R(acc0, x0, c0);
137 
138       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
139       multAcc_32x32_keep32_R(acc1, x1, c0);
140 
141       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
142       multAcc_32x32_keep32_R(acc2, x2, c0);
143 
144       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
145       multAcc_32x32_keep32_R(acc3, x3, c0);
146 
147       /* Read the b[numTaps-1] coefficient */
148       c0 = *(pb + 1U);
149 
150       /* Read x[n-numTaps-4] sample */
151       x0 = *(px + 1U);
152 
153       /* Perform the multiply-accumulates */
154       multAcc_32x32_keep32_R(acc0, x1, c0);
155       multAcc_32x32_keep32_R(acc1, x2, c0);
156       multAcc_32x32_keep32_R(acc2, x3, c0);
157       multAcc_32x32_keep32_R(acc3, x0, c0);
158 
159       /* Read the b[numTaps-2] coefficient */
160       c0 = *(pb + 2U);
161 
162       /* Read x[n-numTaps-5] sample */
163       x1 = *(px + 2U);
164 
165       /* Perform the multiply-accumulates */
166       multAcc_32x32_keep32_R(acc0, x2, c0);
167       multAcc_32x32_keep32_R(acc1, x3, c0);
168       multAcc_32x32_keep32_R(acc2, x0, c0);
169       multAcc_32x32_keep32_R(acc3, x1, c0);
170 
171       /* Read the b[numTaps-3] coefficients */
172       c0 = *(pb + 3U);
173 
174       /* Read x[n-numTaps-6] sample */
175       x2 = *(px + 3U);
176 
177       /* Perform the multiply-accumulates */
178       multAcc_32x32_keep32_R(acc0, x3, c0);
179       multAcc_32x32_keep32_R(acc1, x0, c0);
180       multAcc_32x32_keep32_R(acc2, x1, c0);
181       multAcc_32x32_keep32_R(acc3, x2, c0);
182 
183       /* update coefficient pointer */
184       pb += 4U;
185       px += 4U;
186 
187       /* Decrement loop counter */
188       tapCnt--;
189     }
190 
191     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
192     tapCnt = numTaps % 0x4U;
193 
194     while (tapCnt > 0U)
195     {
196       /* Read coefficients */
197       c0 = *(pb++);
198 
199       /* Fetch 1 state variable */
200       x3 = *(px++);
201 
202       /* Perform the multiply-accumulates */
203       multAcc_32x32_keep32_R(acc0, x0, c0);
204       multAcc_32x32_keep32_R(acc1, x1, c0);
205       multAcc_32x32_keep32_R(acc2, x2, c0);
206       multAcc_32x32_keep32_R(acc3, x3, c0);
207 
208       /* Reuse the present sample states for next sample */
209       x0 = x1;
210       x1 = x2;
211       x2 = x3;
212 
213       /* Decrement loop counter */
214       tapCnt--;
215     }
216 
217     /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31
218        Then store the 4 outputs in the destination buffer. */
219     *pDst++ = (q31_t) (acc0 << 1);
220     *pDst++ = (q31_t) (acc1 << 1);
221     *pDst++ = (q31_t) (acc2 << 1);
222     *pDst++ = (q31_t) (acc3 << 1);
223 
224     /* Advance the state pointer by 4 to process the next group of 4 samples */
225     pState = pState + 4U;
226 
227     /* Decrement loop counter */
228     blkCnt--;
229   }
230 
231   /* Loop unrolling: Compute remaining output samples */
232   blkCnt = blockSize % 0x4U;
233 
234 #else
235 
236   /* Initialize blkCnt with number of taps */
237   blkCnt = blockSize;
238 
239 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
240 
241   while (blkCnt > 0U)
242   {
243     /* Copy one sample at a time into state buffer */
244     *pStateCurnt++ = *pSrc++;
245 
246     /* Set the accumulator to zero */
247     acc0 = 0;
248 
249     /* Initialize state pointer */
250     px = pState;
251 
252     /* Initialize Coefficient pointer */
253     pb = pCoeffs;
254 
255     i = numTaps;
256 
257     /* Perform the multiply-accumulates */
258     do
259     {
260       multAcc_32x32_keep32_R(acc0, (*px++), (*pb++));
261       i--;
262     } while (i > 0U);
263 
264     /* The result is in 2.30 format. Convert to 1.31
265        Then store the output in the destination buffer. */
266     *pDst++ = (q31_t) (acc0 << 1);
267 
268     /* Advance state pointer by 1 for the next sample */
269     pState = pState + 1U;
270 
271     /* Decrement loop counter */
272     blkCnt--;
273   }
274 
275   /* Processing is complete.
276      Now copy the last numTaps - 1 samples to the start of the state buffer.
277      This prepares the state buffer for the next function call. */
278 
279   /* Points to the start of the state buffer */
280   pStateCurnt = S->pState;
281 
282 #if defined (ARM_MATH_LOOPUNROLL)
283 
284   /* Loop unrolling: Compute 4 taps at a time */
285   tapCnt = (numTaps - 1U) >> 2U;
286 
287   /* Copy data */
288   while (tapCnt > 0U)
289   {
290     *pStateCurnt++ = *pState++;
291     *pStateCurnt++ = *pState++;
292     *pStateCurnt++ = *pState++;
293     *pStateCurnt++ = *pState++;
294 
295     /* Decrement loop counter */
296     tapCnt--;
297   }
298 
299   /* Calculate remaining number of copies */
300   tapCnt = (numTaps - 1U) % 0x4U;
301 
302 #else
303 
304   /* Initialize tapCnt with number of taps */
305   tapCnt = (numTaps - 1U);
306 
307 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
308 
309   /* Copy remaining data */
310   while (tapCnt > 0U)
311   {
312     *pStateCurnt++ = *pState++;
313 
314     /* Decrement the loop counter */
315     tapCnt--;
316   }
317 
318 }
319 /**
320   @} end of FIR group
321  */
322