1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_stereo_df2T_f32.c
4 * Description: Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup BiquadCascadeDF2T
37 @{
38 */
39
40 /**
41 @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
42 @param[in] S points to an instance of the filter data structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process
46 */
47 #if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
48 #include "arm_helium_utils.h"
49
arm_biquad_cascade_stereo_df2T_f32(const arm_biquad_cascade_stereo_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)50 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f32(
51 const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
52 const float32_t * pSrc,
53 float32_t * pDst,
54 uint32_t blockSize)
55 {
56 const float32_t *pIn = pSrc; /* source pointer */
57 float32_t *pOut = pDst; /* destination pointer */
58 float32_t *pState = S->pState; /* State pointer */
59 const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
60 float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
61 uint32_t sample, stage = S->numStages; /* loop counters */
62 float32_t scratch[6];
63 uint32x4_t loadIdxVec;
64 f32x4_t aCoeffs, bCoeffs;
65 f32x4_t stateVec0, stateVec1;
66 f32x4_t inVec;
67 uint32_t startIdx = 0;
68
69 /*
70 * {0, 1, 0, 1} generator
71 */
72 loadIdxVec = viwdupq_u32(&startIdx, 2, 1);
73
74 /*
75 * scratch top clearing
76 * layout : [d1a d1b d2a d2b 0 0]
77 */
78 scratch[4] = 0.0f;
79 scratch[5] = 0.0f;
80
81 do
82 {
83 /*
84 * Reading the coefficients
85 */
86 b0 = *pCoeffs++;
87 b1 = *pCoeffs++;
88 b2 = *pCoeffs++;
89 a1 = *pCoeffs++;
90 a2 = *pCoeffs++;
91
92 /*
93 * aCoeffs = {a1 a1 a2 a2}
94 */
95 aCoeffs = vdupq_n_f32(a1);
96 aCoeffs = vsetq_lane(a2, aCoeffs, 2);
97 aCoeffs = vsetq_lane(a2, aCoeffs, 3);
98
99 /*
100 * bCoeffs = {b1 b1 b2 b2}
101 */
102 bCoeffs = vdupq_n_f32(b1);
103 bCoeffs = vsetq_lane(b2, bCoeffs, 2);
104 bCoeffs = vsetq_lane(b2, bCoeffs, 3);
105
106 /*
107 * Reading the state values
108 * Save into scratch
109 */
110 *(f32x4_t *) scratch = *(f32x4_t *) pState;
111
112 sample = blockSize;
113
114 while (sample > 0U)
115 {
116 /*
117 * step 1
118 *
119 * 0 | acc1a = xn1a * b0 + d1a
120 * 1 | acc1b = xn1b * b0 + d1b
121 * 2 | acc1a = xn1a * b0 + d1a
122 * 3 | acc1b = xn1b * b0 + d1b
123 */
124 /*
125 * load {d1a, d1b, d1a, d1b}
126 */
127 stateVec0 = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) scratch, loadIdxVec);
128 /*
129 * load {in0 in1 in0 in1}
130 */
131 inVec = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) pIn, loadIdxVec);
132
133 stateVec0 = vfmaq(stateVec0, inVec, b0);
134 *pOut++ = vgetq_lane(stateVec0, 0);
135 *pOut++ = vgetq_lane(stateVec0, 1);
136
137 /*
138 * step 2
139 *
140 * 0 | d1a = b1 * xn1a + a1 * acc1a + d2a
141 * 1 | d1b = b1 * xn1b + a1 * acc1b + d2b
142 * 2 | d2a = b2 * xn1a + a2 * acc1a + 0
143 * 3 | d2b = b2 * xn1b + a2 * acc1b + 0
144 */
145
146 /*
147 * load {d2a, d2b, 0, 0}
148 */
149 stateVec1 = *(f32x4_t *) & scratch[2];
150 stateVec1 = vfmaq(stateVec1, stateVec0, aCoeffs);
151 stateVec1 = vfmaq(stateVec1, inVec, bCoeffs);
152 *(f32x4_t *) scratch = stateVec1;
153
154 pIn = pIn + 2;
155 sample--;
156 }
157
158 /*
159 * Store the updated state variables back into the state array
160 */
161 vst1q(pState, stateVec1);
162 pState += 4;
163
164 /*
165 * The current stage output is given as the input to the next stage
166 */
167 pIn = pDst;
168 /*
169 * Reset the output working pointer
170 */
171 pOut = pDst;
172 /*
173 * decrement the loop counter
174 */
175 stage--;
176 }
177 while (stage > 0U);
178 }
179
180 #else
181
arm_biquad_cascade_stereo_df2T_f32(const arm_biquad_cascade_stereo_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)182 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f32(
183 const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
184 const float32_t * pSrc,
185 float32_t * pDst,
186 uint32_t blockSize)
187 {
188 const float32_t *pIn = pSrc; /* Source pointer */
189 float32_t *pOut = pDst; /* Destination pointer */
190 float32_t *pState = S->pState; /* State pointer */
191 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
192 float32_t acc1a, acc1b; /* Accumulator */
193 float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
194 float32_t Xn1a, Xn1b; /* Temporary input */
195 float32_t d1a, d2a, d1b, d2b; /* State variables */
196 uint32_t sample, stage = S->numStages; /* Loop counters */
197
198 do
199 {
200 /* Reading the coefficients */
201 b0 = pCoeffs[0];
202 b1 = pCoeffs[1];
203 b2 = pCoeffs[2];
204 a1 = pCoeffs[3];
205 a2 = pCoeffs[4];
206
207 /* Reading the state values */
208 d1a = pState[0];
209 d2a = pState[1];
210 d1b = pState[2];
211 d2b = pState[3];
212
213 pCoeffs += 5U;
214
215 #if defined (ARM_MATH_LOOPUNROLL)
216
217 /* Loop unrolling: Compute 8 outputs at a time */
218 sample = blockSize >> 3U;
219
220 while (sample > 0U) {
221 /* y[n] = b0 * x[n] + d1 */
222 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
223 /* d2 = b2 * x[n] + a2 * y[n] */
224
225 /* 1 */
226 Xn1a = *pIn++; /* Channel a */
227 Xn1b = *pIn++; /* Channel b */
228
229 acc1a = (b0 * Xn1a) + d1a;
230 acc1b = (b0 * Xn1b) + d1b;
231
232 *pOut++ = acc1a;
233 *pOut++ = acc1b;
234
235 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
236 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
237
238 d2a = (b2 * Xn1a) + (a2 * acc1a);
239 d2b = (b2 * Xn1b) + (a2 * acc1b);
240
241 /* 2 */
242 Xn1a = *pIn++; /* Channel a */
243 Xn1b = *pIn++; /* Channel b */
244
245 acc1a = (b0 * Xn1a) + d1a;
246 acc1b = (b0 * Xn1b) + d1b;
247
248 *pOut++ = acc1a;
249 *pOut++ = acc1b;
250
251 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
252 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
253
254 d2a = (b2 * Xn1a) + (a2 * acc1a);
255 d2b = (b2 * Xn1b) + (a2 * acc1b);
256
257 /* 3 */
258 Xn1a = *pIn++; /* Channel a */
259 Xn1b = *pIn++; /* Channel b */
260
261 acc1a = (b0 * Xn1a) + d1a;
262 acc1b = (b0 * Xn1b) + d1b;
263
264 *pOut++ = acc1a;
265 *pOut++ = acc1b;
266
267 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
268 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
269
270 d2a = (b2 * Xn1a) + (a2 * acc1a);
271 d2b = (b2 * Xn1b) + (a2 * acc1b);
272
273 /* 4 */
274 Xn1a = *pIn++; /* Channel a */
275 Xn1b = *pIn++; /* Channel b */
276
277 acc1a = (b0 * Xn1a) + d1a;
278 acc1b = (b0 * Xn1b) + d1b;
279
280 *pOut++ = acc1a;
281 *pOut++ = acc1b;
282
283 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
284 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
285
286 d2a = (b2 * Xn1a) + (a2 * acc1a);
287 d2b = (b2 * Xn1b) + (a2 * acc1b);
288
289 /* 5 */
290 Xn1a = *pIn++; /* Channel a */
291 Xn1b = *pIn++; /* Channel b */
292
293 acc1a = (b0 * Xn1a) + d1a;
294 acc1b = (b0 * Xn1b) + d1b;
295
296 *pOut++ = acc1a;
297 *pOut++ = acc1b;
298
299 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
300 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
301
302 d2a = (b2 * Xn1a) + (a2 * acc1a);
303 d2b = (b2 * Xn1b) + (a2 * acc1b);
304
305 /* 6 */
306 Xn1a = *pIn++; /* Channel a */
307 Xn1b = *pIn++; /* Channel b */
308
309 acc1a = (b0 * Xn1a) + d1a;
310 acc1b = (b0 * Xn1b) + d1b;
311
312 *pOut++ = acc1a;
313 *pOut++ = acc1b;
314
315 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
316 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
317
318 d2a = (b2 * Xn1a) + (a2 * acc1a);
319 d2b = (b2 * Xn1b) + (a2 * acc1b);
320
321 /* 7 */
322 Xn1a = *pIn++; /* Channel a */
323 Xn1b = *pIn++; /* Channel b */
324
325 acc1a = (b0 * Xn1a) + d1a;
326 acc1b = (b0 * Xn1b) + d1b;
327
328 *pOut++ = acc1a;
329 *pOut++ = acc1b;
330
331 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
332 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
333
334 d2a = (b2 * Xn1a) + (a2 * acc1a);
335 d2b = (b2 * Xn1b) + (a2 * acc1b);
336
337 /* 8 */
338 Xn1a = *pIn++; /* Channel a */
339 Xn1b = *pIn++; /* Channel b */
340
341 acc1a = (b0 * Xn1a) + d1a;
342 acc1b = (b0 * Xn1b) + d1b;
343
344 *pOut++ = acc1a;
345 *pOut++ = acc1b;
346
347 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
348 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
349
350 d2a = (b2 * Xn1a) + (a2 * acc1a);
351 d2b = (b2 * Xn1b) + (a2 * acc1b);
352
353 /* decrement loop counter */
354 sample--;
355 }
356
357 /* Loop unrolling: Compute remaining outputs */
358 sample = blockSize & 0x7U;
359
360 #else
361
362 /* Initialize blkCnt with number of samples */
363 sample = blockSize;
364
365 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
366
367 while (sample > 0U) {
368 /* Read the input */
369 Xn1a = *pIn++; /* Channel a */
370 Xn1b = *pIn++; /* Channel b */
371
372 /* y[n] = b0 * x[n] + d1 */
373 acc1a = (b0 * Xn1a) + d1a;
374 acc1b = (b0 * Xn1b) + d1b;
375
376 /* Store the result in the accumulator in the destination buffer. */
377 *pOut++ = acc1a;
378 *pOut++ = acc1b;
379
380 /* Every time after the output is computed state should be updated. */
381 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
382 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
383 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
384
385 /* d2 = b2 * x[n] + a2 * y[n] */
386 d2a = (b2 * Xn1a) + (a2 * acc1a);
387 d2b = (b2 * Xn1b) + (a2 * acc1b);
388
389 /* decrement loop counter */
390 sample--;
391 }
392
393 /* Store the updated state variables back into the state array */
394 pState[0] = d1a;
395 pState[1] = d2a;
396
397 pState[2] = d1b;
398 pState[3] = d2b;
399
400 pState += 4U;
401
402 /* The current stage output is given as the input to the next stage */
403 pIn = pDst;
404
405 /* Reset the output working pointer */
406 pOut = pDst;
407
408 /* Decrement the loop counter */
409 stage--;
410
411 } while (stage > 0U);
412
413 }
414
415 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
416
417 /**
418 @} end of BiquadCascadeDF2T group
419 */
420