1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df1_fast_q31.c
4  * Description:  Processing function for the Q31 Fast Biquad cascade DirectFormI(DF1) filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup BiquadCascadeDF1
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q31 Biquad cascade filter (fast variant).
42   @param[in]     S         points to an instance of the Q31 Biquad cascade structure
43   @param[in]     pSrc      points to the block of input data
44   @param[out]    pDst      points to the block of output data
45   @param[in]     blockSize number of samples to process per call
46 
47   @par           Scaling and Overflow Behavior
48                    This function is optimized for speed at the expense of fixed-point precision and overflow protection.
49                    The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
50                    These intermediate results are added to a 2.30 accumulator.
51                    Finally, the accumulator is saturated and converted to a 1.31 result.
52                    The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
53                    In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
54                    arm_biquad_cascade_df1_init_q31() to initialize filter structure.
55   @remark
56                    Refer to \ref arm_biquad_cascade_df1_q31() for a slower implementation of this function
57                    which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
58                    Use the function \ref arm_biquad_cascade_df1_init_q31() to initialize the filter structure.
59  */
60 
arm_biquad_cascade_df1_fast_q31(const arm_biquad_casd_df1_inst_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)61 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_fast_q31(
62   const arm_biquad_casd_df1_inst_q31 * S,
63   const q31_t * pSrc,
64         q31_t * pDst,
65         uint32_t blockSize)
66 {
67   const q31_t *pIn = pSrc;                             /* Source pointer */
68         q31_t *pOut = pDst;                            /* Destination pointer */
69         q31_t *pState = S->pState;                     /* pState pointer */
70   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
71         q31_t acc = 0;                                 /* Accumulator */
72         q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
73         q31_t Xn1, Xn2, Yn1, Yn2;                      /* Filter pState variables */
74         q31_t Xn;                                      /* Temporary input */
75         int32_t shift = (int32_t) S->postShift + 1;    /* Shift to be applied to the output */
76         uint32_t sample, stage = S->numStages;         /* Loop counters */
77 
78   do
79   {
80     /* Reading the coefficients */
81     b0 = *pCoeffs++;
82     b1 = *pCoeffs++;
83     b2 = *pCoeffs++;
84     a1 = *pCoeffs++;
85     a2 = *pCoeffs++;
86 
87     /* Reading the pState values */
88     Xn1 = pState[0];
89     Xn2 = pState[1];
90     Yn1 = pState[2];
91     Yn2 = pState[3];
92 
93 #if defined (ARM_MATH_LOOPUNROLL)
94 
95     /* Apply loop unrolling and compute 4 output values simultaneously. */
96     /* Variables acc ... acc3 hold output values that are being computed:
97      *
98      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
99      */
100 
101     /* Loop unrolling: Compute 4 outputs at a time */
102     sample = blockSize >> 2U;
103 
104     while (sample > 0U)
105     {
106       /* Read the input */
107       Xn = *pIn;
108 
109       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
110       /* acc =  b0 * x[n] */
111       /* acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
112       mult_32x32_keep32_R(acc, b1, Xn1);
113       /* acc +=  b1 * x[n-1] */
114       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
115       multAcc_32x32_keep32_R(acc, b0, Xn);
116       /* acc +=  b[2] * x[n-2] */
117       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
118       multAcc_32x32_keep32_R(acc, b2, Xn2);
119       /* acc +=  a1 * y[n-1] */
120       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
121       multAcc_32x32_keep32_R(acc, a1, Yn1);
122       /* acc +=  a2 * y[n-2] */
123       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
124       multAcc_32x32_keep32_R(acc, a2, Yn2);
125 
126       /* The result is converted to 1.31 , Yn2 variable is reused */
127       Yn2 = acc << shift;
128 
129       /* Read the second input */
130       Xn2 = *(pIn + 1U);
131 
132       /* Store the output in the destination buffer. */
133       *pOut = Yn2;
134 
135       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
136       /* acc =  b0 * x[n] */
137       /* acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
138       mult_32x32_keep32_R(acc, b0, Xn2);
139       /* acc +=  b1 * x[n-1] */
140       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
141       multAcc_32x32_keep32_R(acc, b1, Xn);
142       /* acc +=  b[2] * x[n-2] */
143       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
144       multAcc_32x32_keep32_R(acc, b2, Xn1);
145       /* acc +=  a1 * y[n-1] */
146       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
147       multAcc_32x32_keep32_R(acc, a1, Yn2);
148       /* acc +=  a2 * y[n-2] */
149       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
150       multAcc_32x32_keep32_R(acc, a2, Yn1);
151 
152       /* The result is converted to 1.31, Yn1 variable is reused  */
153       Yn1 = acc << shift;
154 
155       /* Read the third input  */
156       Xn1 = *(pIn + 2U);
157 
158       /* Store the output in the destination buffer. */
159       *(pOut + 1U) = Yn1;
160 
161       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
162       /* acc =  b0 * x[n] */
163       /* acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
164       mult_32x32_keep32_R(acc, b0, Xn1);
165       /* acc +=  b1 * x[n-1] */
166       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
167       multAcc_32x32_keep32_R(acc, b1, Xn2);
168       /* acc +=  b[2] * x[n-2] */
169       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
170       multAcc_32x32_keep32_R(acc, b2, Xn);
171       /* acc +=  a1 * y[n-1] */
172       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
173       multAcc_32x32_keep32_R(acc, a1, Yn1);
174       /* acc +=  a2 * y[n-2] */
175       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
176       multAcc_32x32_keep32_R(acc, a2, Yn2);
177 
178       /* The result is converted to 1.31, Yn2 variable is reused  */
179       Yn2 = acc << shift;
180 
181       /* Read the forth input */
182       Xn = *(pIn + 3U);
183 
184       /* Store the output in the destination buffer. */
185       *(pOut + 2U) = Yn2;
186       pIn += 4U;
187 
188       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
189       /* acc =  b0 * x[n] */
190       /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
191       mult_32x32_keep32_R(acc, b0, Xn);
192       /* acc +=  b1 * x[n-1] */
193       /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
194       multAcc_32x32_keep32_R(acc, b1, Xn1);
195       /* acc +=  b[2] * x[n-2] */
196       /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
197       multAcc_32x32_keep32_R(acc, b2, Xn2);
198       /* acc +=  a1 * y[n-1] */
199       /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
200       multAcc_32x32_keep32_R(acc, a1, Yn2);
201       /* acc +=  a2 * y[n-2] */
202       /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
203       multAcc_32x32_keep32_R(acc, a2, Yn1);
204 
205       /* Every time after the output is computed state should be updated. */
206       /* The states should be updated as:  */
207       /* Xn2 = Xn1 */
208       Xn2 = Xn1;
209 
210       /* The result is converted to 1.31, Yn1 variable is reused  */
211       Yn1 = acc << shift;
212 
213       /* Xn1 = Xn */
214       Xn1 = Xn;
215 
216       /* Store the output in the destination buffer. */
217       *(pOut + 3U) = Yn1;
218       pOut += 4U;
219 
220       /* decrement loop counter */
221       sample--;
222     }
223 
224     /* Loop unrolling: Compute remaining outputs */
225     sample = (blockSize & 0x3U);
226 
227 #else
228 
229     /* Initialize blkCnt with number of samples */
230     sample = blockSize;
231 
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233 
234     while (sample > 0U)
235     {
236       /* Read the input */
237       Xn = *pIn++;
238 
239       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
240       /* acc =  b0 * x[n] */
241       /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
242       mult_32x32_keep32_R(acc, b0, Xn);
243       /* acc +=  b1 * x[n-1] */
244       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
245       multAcc_32x32_keep32_R(acc, b1, Xn1);
246       /* acc +=  b[2] * x[n-2] */
247       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
248       multAcc_32x32_keep32_R(acc, b2, Xn2);
249       /* acc +=  a1 * y[n-1] */
250       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
251       multAcc_32x32_keep32_R(acc, a1, Yn1);
252       /* acc +=  a2 * y[n-2] */
253       /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
254       multAcc_32x32_keep32_R(acc, a2, Yn2);
255 
256       /* The result is converted to 1.31  */
257       acc = acc << shift;
258 
259       /* Every time after the output is computed state should be updated. */
260       /* The states should be updated as:  */
261       /* Xn2 = Xn1 */
262       /* Xn1 = Xn  */
263       /* Yn2 = Yn1 */
264       /* Yn1 = acc */
265       Xn2 = Xn1;
266       Xn1 = Xn;
267       Yn2 = Yn1;
268       Yn1 = acc;
269 
270       /* Store the output in the destination buffer. */
271       *pOut++ = acc;
272 
273       /* decrement loop counter */
274       sample--;
275     }
276 
277     /* The first stage goes from the input buffer to the output buffer. */
278     /* Subsequent stages occur in-place in the output buffer */
279     pIn = pDst;
280 
281     /* Reset to destination pointer */
282     pOut = pDst;
283 
284     /* Store the updated state variables back into the pState array */
285     *pState++ = Xn1;
286     *pState++ = Xn2;
287     *pState++ = Yn1;
288     *pState++ = Yn2;
289 
290   } while (--stage);
291 }
292 
293 /**
294   @} end of BiquadCascadeDF1 group
295  */
296