1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df1_fast_q31.c
4 * Description: Processing function for the Q31 Fast Biquad cascade DirectFormI(DF1) filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup BiquadCascadeDF1
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q31 Biquad cascade filter (fast variant).
42 @param[in] S points to an instance of the Q31 Biquad cascade structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process per call
46
47 @par Scaling and Overflow Behavior
48 This function is optimized for speed at the expense of fixed-point precision and overflow protection.
49 The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
50 These intermediate results are added to a 2.30 accumulator.
51 Finally, the accumulator is saturated and converted to a 1.31 result.
52 The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
53 In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
54 arm_biquad_cascade_df1_init_q31() to initialize filter structure.
55 @remark
56 Refer to \ref arm_biquad_cascade_df1_q31() for a slower implementation of this function
57 which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
58 Use the function \ref arm_biquad_cascade_df1_init_q31() to initialize the filter structure.
59 */
60
arm_biquad_cascade_df1_fast_q31(const arm_biquad_casd_df1_inst_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)61 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_fast_q31(
62 const arm_biquad_casd_df1_inst_q31 * S,
63 const q31_t * pSrc,
64 q31_t * pDst,
65 uint32_t blockSize)
66 {
67 const q31_t *pIn = pSrc; /* Source pointer */
68 q31_t *pOut = pDst; /* Destination pointer */
69 q31_t *pState = S->pState; /* pState pointer */
70 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
71 q31_t acc = 0; /* Accumulator */
72 q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
73 q31_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
74 q31_t Xn; /* Temporary input */
75 int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
76 uint32_t sample, stage = S->numStages; /* Loop counters */
77
78 do
79 {
80 /* Reading the coefficients */
81 b0 = *pCoeffs++;
82 b1 = *pCoeffs++;
83 b2 = *pCoeffs++;
84 a1 = *pCoeffs++;
85 a2 = *pCoeffs++;
86
87 /* Reading the pState values */
88 Xn1 = pState[0];
89 Xn2 = pState[1];
90 Yn1 = pState[2];
91 Yn2 = pState[3];
92
93 #if defined (ARM_MATH_LOOPUNROLL)
94
95 /* Apply loop unrolling and compute 4 output values simultaneously. */
96 /* Variables acc ... acc3 hold output values that are being computed:
97 *
98 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
99 */
100
101 /* Loop unrolling: Compute 4 outputs at a time */
102 sample = blockSize >> 2U;
103
104 while (sample > 0U)
105 {
106 /* Read the input */
107 Xn = *pIn;
108
109 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
110 /* acc = b0 * x[n] */
111 /* acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
112 mult_32x32_keep32_R(acc, b1, Xn1);
113 /* acc += b1 * x[n-1] */
114 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
115 multAcc_32x32_keep32_R(acc, b0, Xn);
116 /* acc += b[2] * x[n-2] */
117 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
118 multAcc_32x32_keep32_R(acc, b2, Xn2);
119 /* acc += a1 * y[n-1] */
120 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
121 multAcc_32x32_keep32_R(acc, a1, Yn1);
122 /* acc += a2 * y[n-2] */
123 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
124 multAcc_32x32_keep32_R(acc, a2, Yn2);
125
126 /* The result is converted to 1.31 , Yn2 variable is reused */
127 Yn2 = acc << shift;
128
129 /* Read the second input */
130 Xn2 = *(pIn + 1U);
131
132 /* Store the output in the destination buffer. */
133 *pOut = Yn2;
134
135 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
136 /* acc = b0 * x[n] */
137 /* acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
138 mult_32x32_keep32_R(acc, b0, Xn2);
139 /* acc += b1 * x[n-1] */
140 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
141 multAcc_32x32_keep32_R(acc, b1, Xn);
142 /* acc += b[2] * x[n-2] */
143 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
144 multAcc_32x32_keep32_R(acc, b2, Xn1);
145 /* acc += a1 * y[n-1] */
146 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
147 multAcc_32x32_keep32_R(acc, a1, Yn2);
148 /* acc += a2 * y[n-2] */
149 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
150 multAcc_32x32_keep32_R(acc, a2, Yn1);
151
152 /* The result is converted to 1.31, Yn1 variable is reused */
153 Yn1 = acc << shift;
154
155 /* Read the third input */
156 Xn1 = *(pIn + 2U);
157
158 /* Store the output in the destination buffer. */
159 *(pOut + 1U) = Yn1;
160
161 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
162 /* acc = b0 * x[n] */
163 /* acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
164 mult_32x32_keep32_R(acc, b0, Xn1);
165 /* acc += b1 * x[n-1] */
166 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
167 multAcc_32x32_keep32_R(acc, b1, Xn2);
168 /* acc += b[2] * x[n-2] */
169 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
170 multAcc_32x32_keep32_R(acc, b2, Xn);
171 /* acc += a1 * y[n-1] */
172 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
173 multAcc_32x32_keep32_R(acc, a1, Yn1);
174 /* acc += a2 * y[n-2] */
175 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
176 multAcc_32x32_keep32_R(acc, a2, Yn2);
177
178 /* The result is converted to 1.31, Yn2 variable is reused */
179 Yn2 = acc << shift;
180
181 /* Read the forth input */
182 Xn = *(pIn + 3U);
183
184 /* Store the output in the destination buffer. */
185 *(pOut + 2U) = Yn2;
186 pIn += 4U;
187
188 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
189 /* acc = b0 * x[n] */
190 /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
191 mult_32x32_keep32_R(acc, b0, Xn);
192 /* acc += b1 * x[n-1] */
193 /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
194 multAcc_32x32_keep32_R(acc, b1, Xn1);
195 /* acc += b[2] * x[n-2] */
196 /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
197 multAcc_32x32_keep32_R(acc, b2, Xn2);
198 /* acc += a1 * y[n-1] */
199 /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
200 multAcc_32x32_keep32_R(acc, a1, Yn2);
201 /* acc += a2 * y[n-2] */
202 /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
203 multAcc_32x32_keep32_R(acc, a2, Yn1);
204
205 /* Every time after the output is computed state should be updated. */
206 /* The states should be updated as: */
207 /* Xn2 = Xn1 */
208 Xn2 = Xn1;
209
210 /* The result is converted to 1.31, Yn1 variable is reused */
211 Yn1 = acc << shift;
212
213 /* Xn1 = Xn */
214 Xn1 = Xn;
215
216 /* Store the output in the destination buffer. */
217 *(pOut + 3U) = Yn1;
218 pOut += 4U;
219
220 /* decrement loop counter */
221 sample--;
222 }
223
224 /* Loop unrolling: Compute remaining outputs */
225 sample = (blockSize & 0x3U);
226
227 #else
228
229 /* Initialize blkCnt with number of samples */
230 sample = blockSize;
231
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233
234 while (sample > 0U)
235 {
236 /* Read the input */
237 Xn = *pIn++;
238
239 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
240 /* acc = b0 * x[n] */
241 /* acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
242 mult_32x32_keep32_R(acc, b0, Xn);
243 /* acc += b1 * x[n-1] */
244 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
245 multAcc_32x32_keep32_R(acc, b1, Xn1);
246 /* acc += b[2] * x[n-2] */
247 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
248 multAcc_32x32_keep32_R(acc, b2, Xn2);
249 /* acc += a1 * y[n-1] */
250 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
251 multAcc_32x32_keep32_R(acc, a1, Yn1);
252 /* acc += a2 * y[n-2] */
253 /* acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
254 multAcc_32x32_keep32_R(acc, a2, Yn2);
255
256 /* The result is converted to 1.31 */
257 acc = acc << shift;
258
259 /* Every time after the output is computed state should be updated. */
260 /* The states should be updated as: */
261 /* Xn2 = Xn1 */
262 /* Xn1 = Xn */
263 /* Yn2 = Yn1 */
264 /* Yn1 = acc */
265 Xn2 = Xn1;
266 Xn1 = Xn;
267 Yn2 = Yn1;
268 Yn1 = acc;
269
270 /* Store the output in the destination buffer. */
271 *pOut++ = acc;
272
273 /* decrement loop counter */
274 sample--;
275 }
276
277 /* The first stage goes from the input buffer to the output buffer. */
278 /* Subsequent stages occur in-place in the output buffer */
279 pIn = pDst;
280
281 /* Reset to destination pointer */
282 pOut = pDst;
283
284 /* Store the updated state variables back into the pState array */
285 *pState++ = Xn1;
286 *pState++ = Xn2;
287 *pState++ = Yn1;
288 *pState++ = Yn2;
289
290 } while (--stage);
291 }
292
293 /**
294 @} end of BiquadCascadeDF1 group
295 */
296