1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df1_f32.c
4 * Description: Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
37
38 This set of functions implements arbitrary order recursive (IIR) filters.
39 The filters are implemented as a cascade of second order Biquad sections.
40 The functions support Q15, Q31 and floating-point data types.
41 Fast version of Q15 and Q31 also available.
42
43 The functions operate on blocks of input and output data and each call to the function
44 processes <code>blockSize</code> samples through the filter.
45 <code>pSrc</code> points to the array of input data and
46 <code>pDst</code> points to the array of output data.
47 Both arrays contain <code>blockSize</code> values.
48
49 @par Algorithm
50 Each Biquad stage implements a second order filter using the difference equation:
51 <pre>
52 y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
53 </pre>
54 A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
55 \image html Biquad.gif "Single Biquad filter stage"
56 Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
57 Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
58 Pay careful attention to the sign of the feedback coefficients.
59 Some design tools use the difference equation
60 <pre>
61 y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
62 </pre>
63 In this case the feedback coefficients <code>a1</code> and <code>a2</code>
64 must be negated when used with the CMSIS DSP Library.
65
66 @par
67 Higher order filters are realized as a cascade of second order sections.
68 <code>numStages</code> refers to the number of second order stages used.
69 For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
70 \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
71 A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
72
73 @par
74 The <code>pState</code> points to state variables array.
75 Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
76 The state variables are arranged in the <code>pState</code> array as:
77 <pre>
78 {x[n-1], x[n-2], y[n-1], y[n-2]}
79 </pre>
80
81 @par
82 The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
83 The state array has a total length of <code>4*numStages</code> values.
84 The state variables are updated after each block of data is processed, the coefficients are untouched.
85
86 @par Instance Structure
87 The coefficients and state variables for a filter are stored together in an instance data structure.
88 A separate instance structure must be defined for each filter.
89 Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
90 There are separate instance structure declarations for each of the 3 supported data types.
91
92 @par Init Function
93 There is also an associated initialization function for each data type.
94 The initialization function performs following operations:
95 - Sets the values of the internal structure fields.
96 - Zeros out the values in the state buffer.
97 To do this manually without calling the init function, assign the follow subfields of the instance structure:
98 numStages, pCoeffs, pState. Also set all of the values in pState to zero.
99
100 @par
101 Use of the initialization function is optional.
102 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
103 To place an instance structure into a const data section, the instance structure must be manually initialized.
104 Set the values in the state buffer to zeros before static initialization.
105 The code below statically initializes each of the 3 different data type filter instance structures
106 <pre>
107 arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};
108 arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};
109 arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};
110 </pre>
111 where <code>numStages</code> is the number of Biquad stages in the filter;
112 <code>pState</code> is the address of the state buffer;
113 <code>pCoeffs</code> is the address of the coefficient buffer;
114 <code>postShift</code> shift to be applied.
115
116 @par Fixed-Point Behavior
117 Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
118 Following issues must be considered:
119 - Scaling of coefficients
120 - Filter gain
121 - Overflow and saturation
122
123 @par Scaling of coefficients
124 Filter coefficients are represented as fractional values and
125 coefficients are restricted to lie in the range <code>[-1 +1)</code>.
126 The fixed-point functions have an additional scaling parameter <code>postShift</code>
127 which allow the filter coefficients to exceed the range <code>[+1 -1)</code>.
128 At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
129 \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
130 This essentially scales the filter coefficients by <code>2^postShift</code>.
131 For example, to realize the coefficients
132 <pre>
133 {1.5, -0.8, 1.2, 1.6, -0.9}
134 </pre>
135 set the pCoeffs array to:
136 <pre>
137 {0.75, -0.4, 0.6, 0.8, -0.45}
138 </pre>
139 and set <code>postShift=1</code>
140
141 @par Filter gain
142 The frequency response of a Biquad filter is a function of its coefficients.
143 It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
144 This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
145 To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
146
147 @par Overflow and saturation
148 For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
149 */
150
151 /**
152 @addtogroup BiquadCascadeDF1
153 @{
154 */
155
156 /**
157 @brief Processing function for the floating-point Biquad cascade filter.
158 @param[in] S points to an instance of the floating-point Biquad cascade structure
159 @param[in] pSrc points to the block of input data
160 @param[out] pDst points to the block of output data
161 @param[in] blockSize number of samples to process
162 */
163
164 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
165 #include "arm_helium_utils.h"
arm_biquad_cascade_df1_f32(const arm_biquad_casd_df1_inst_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)166 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f32(
167 const arm_biquad_casd_df1_inst_f32 * S,
168 const float32_t * pSrc,
169 float32_t * pDst,
170 uint32_t blockSize)
171 {
172 const float32_t *pIn = pSrc; /* source pointer */
173 float32_t *pOut = pDst; /* destination pointer */
174 float32_t *pState = S->pState; /* pState pointer */
175 const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
176 float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
177 float32_t lastX, lastY; /* X,Y history for tail handling */
178 float32_t X0, X1, X2, X3 = 0; /* temporary input */
179 f32x4_t coeffs;
180 f32x4_t accVec; /* accumultor vector */
181 uint32_t sample, stage = S->numStages; /* loop counters */
182
183 do
184 {
185 /*
186 * Reading the pState values
187 */
188 Xn1 = pState[0];
189 Xn2 = pState[1];
190 Yn1 = pState[2];
191 Yn2 = pState[3];
192
193 sample = blockSize >> 2U;
194
195 /*
196 * First part of the processing with loop unrolling. Compute 4 outputs at a time.
197 * second loop below computes the remaining 1 to 3 samples.
198 */
199 while (sample > 0U)
200 {
201 X0 = *pIn++;
202 X1 = *pIn++;
203 X2 = *pIn++;
204 X3 = *pIn++;
205
206 coeffs = vld1q(pCoeffs);
207 accVec = vmulq(coeffs, X3);
208
209 coeffs = vld1q(&pCoeffs[4]);
210 accVec = vfmaq(accVec, coeffs, X2);
211
212 coeffs = vld1q(&pCoeffs[8]);
213 accVec = vfmaq(accVec, coeffs, X1);
214
215 coeffs = vld1q(&pCoeffs[12]);
216 accVec = vfmaq(accVec, coeffs, X0);
217
218 coeffs = vld1q(&pCoeffs[16]);
219 accVec = vfmaq(accVec, coeffs, Xn1);
220
221 coeffs = vld1q(&pCoeffs[20]);
222 accVec = vfmaq(accVec, coeffs, Xn2);
223
224 coeffs = vld1q(&pCoeffs[24]);
225 accVec = vfmaq(accVec, coeffs, Yn1);
226
227 coeffs = vld1q(&pCoeffs[28]);
228 accVec = vfmaq(accVec, coeffs, Yn2);
229 /*
230 * Store the result in the accumulator in the destination buffer.
231 */
232 vst1q(pOut, accVec);
233 pOut += 4;
234
235 /*
236 * update recurrence
237 */
238 Xn1 = X3;
239 Xn2 = X2;
240 Yn1 = vgetq_lane(accVec, 3);
241 Yn2 = vgetq_lane(accVec, 2);
242 /*
243 * decrement the loop counter
244 */
245 sample--;
246 }
247
248 /*
249 * If the blockSize is not a multiple of 4,
250 * compute any remaining output samples here.
251 */
252 sample = blockSize & 0x3U;
253 if (sample)
254 {
255 /* save previous X, Y for modulo 1 length case */
256 lastX = X3;
257 lastY = Yn1;
258
259 X0 = *pIn++;
260 X1 = *pIn++;
261 X2 = *pIn++;
262 X3 = *pIn++;
263
264 coeffs = vld1q(pCoeffs);
265 accVec = vmulq(coeffs, X3);
266
267 coeffs = vld1q(&pCoeffs[4]);
268 accVec = vfmaq(accVec, coeffs, X2);
269
270 coeffs = vld1q(&pCoeffs[8]);
271 accVec = vfmaq(accVec, coeffs, X1);
272
273 coeffs = vld1q(&pCoeffs[12]);
274 accVec = vfmaq(accVec, coeffs, X0);
275
276 coeffs = vld1q(&pCoeffs[16]);
277 accVec = vfmaq(accVec, coeffs, Xn1);
278
279 coeffs = vld1q(&pCoeffs[20]);
280 accVec = vfmaq(accVec, coeffs, Xn2);
281
282 coeffs = vld1q(&pCoeffs[24]);
283 accVec = vfmaq(accVec, coeffs, Yn1);
284
285 coeffs = vld1q(&pCoeffs[28]);
286 accVec = vfmaq(accVec, coeffs, Yn2);
287
288 if (sample == 1)
289 {
290 *pOut++ = vgetq_lane(accVec, 0);
291 Xn1 = X0;
292 Xn2 = lastX;
293 Yn1 = vgetq_lane(accVec, 0);
294 Yn2 = lastY;
295 }
296 else if (sample == 2)
297 {
298 *pOut++ = vgetq_lane(accVec, 0);
299 *pOut++ = vgetq_lane(accVec, 1);
300 Xn1 = X1;
301 Xn2 = X0;
302 Yn1 = vgetq_lane(accVec, 1);
303 Yn2 = vgetq_lane(accVec, 0);
304 }
305 else
306 {
307 *pOut++ = vgetq_lane(accVec, 0);
308 *pOut++ = vgetq_lane(accVec, 1);
309 *pOut++ = vgetq_lane(accVec, 2);
310 Xn1 = X2;
311 Xn2 = X1;
312 Yn1 = vgetq_lane(accVec, 2);
313 Yn2 = vgetq_lane(accVec, 1);
314 }
315 }
316 /*
317 * Store the updated state variables back into the pState array
318 */
319 *pState++ = Xn1;
320 *pState++ = Xn2;
321 *pState++ = Yn1;
322 *pState++ = Yn2;
323
324 pCoeffs += sizeof(arm_biquad_mod_coef_f32) / sizeof(float32_t);
325 /*
326 * The first stage goes from the input buffer to the output buffer.
327 * Subsequent numStages occur in-place in the output buffer
328 */
329 pIn = pDst;
330 /*
331 * Reset the output pointer
332 */
333 pOut = pDst;
334 /*
335 * decrement the loop counter
336 */
337 stage--;
338 }
339 while (stage > 0U);
340 }
341 #else
342 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_biquad_cascade_df1_f32(const arm_biquad_casd_df1_inst_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)343 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f32(
344 const arm_biquad_casd_df1_inst_f32 * S,
345 const float32_t * pSrc,
346 float32_t * pDst,
347 uint32_t blockSize)
348 {
349
350 const float32_t *pIn = pSrc; /* source pointer */
351 float32_t *pOut = pDst; /* destination pointer */
352 float32_t *pState = S->pState; /* pState pointer */
353 const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
354 float32_t acc; /* Simulates the accumulator */
355
356 uint32_t sample, stage = S->numStages; /* loop counters */
357
358 float32x4_t Xn;
359 float32x2_t Yn;
360 float32x2_t a;
361 float32x4_t b;
362
363 float32x4_t x,tmp;
364 float32x2_t t;
365 float32x2x2_t y;
366
367 float32_t Xns;
368
369 while (stage > 0U)
370 {
371 /* Reading the coefficients */
372 Xn = vdupq_n_f32(0.0f);
373
374 Xn = vsetq_lane_f32(pState[0],Xn,2);
375 Xn = vsetq_lane_f32(pState[1],Xn,3);
376 Yn = vset_lane_f32(pState[2],Yn,0);
377 Yn = vset_lane_f32(pState[3],Yn,1);
378
379 b = vld1q_f32(pCoeffs);
380 b = vrev64q_f32(b);
381 b = vcombine_f32(vget_high_f32(b), vget_low_f32(b));
382
383 a = vld1_f32(pCoeffs + 3);
384 a = vrev64_f32(a);
385 b = vsetq_lane_f32(0.0f,b,0);
386 pCoeffs += 5;
387
388 /* Reading the pState values */
389
390 /* Apply loop unrolling and compute 4 output values simultaneously. */
391 /* The variable acc hold output values that are being computed:
392 *
393 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
394 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
395 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
396 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
397 */
398
399 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
400 ** a second loop below computes the remaining 1 to 3 samples. */
401 sample = blockSize >> 2U;
402
403 while (sample > 0U)
404 {
405 /* Read the first 4 inputs */
406 x = vld1q_f32(pIn);
407
408 pIn += 4;
409
410 tmp = vextq_f32(Xn, x, 1);
411 t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
412 t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
413 t = vmla_f32(t, a, Yn);
414 t = vpadd_f32(t, t);
415 Yn = vext_f32(Yn, t, 1);
416
417 tmp = vextq_f32(Xn, x, 2);
418 t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
419 t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
420 t = vmla_f32(t, a, Yn);
421 t = vpadd_f32(t, t);
422 Yn = vext_f32(Yn, t, 1);
423
424 y.val[0] = Yn;
425
426 tmp = vextq_f32(Xn, x, 3);
427 t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
428 t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
429 t = vmla_f32(t, a, Yn);
430 t = vpadd_f32(t, t);
431 Yn = vext_f32(Yn, t, 1);
432
433 Xn = x;
434 t = vmul_f32(vget_high_f32(b), vget_high_f32(Xn));
435 t = vmla_f32(t, vget_low_f32(b), vget_low_f32(Xn));
436 t = vmla_f32(t, a, Yn);
437 t = vpadd_f32(t, t);
438 Yn = vext_f32(Yn, t, 1);
439
440 y.val[1] = Yn;
441
442 tmp = vcombine_f32(y.val[0], y.val[1]);
443
444 /* Store the 4 outputs and increment the pointer */
445 vst1q_f32(pOut, tmp);
446 pOut += 4;
447
448 /* Decrement the loop counter */
449 sample--;
450 }
451
452
453 /* If the block size is not a multiple of 4, compute any remaining output samples here.
454 ** No loop unrolling is used. */
455 sample = blockSize & 0x3U;
456
457 while (sample > 0U)
458 {
459 /* Read the input */
460 Xns = *pIn++;
461
462 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
463 acc = (vgetq_lane_f32(b, 1) * vgetq_lane_f32(Xn, 2))
464 + (vgetq_lane_f32(b, 2) * vgetq_lane_f32(Xn, 3))
465 + (vgetq_lane_f32(b, 3) * Xns)
466 + (vget_lane_f32(a, 0) * vget_lane_f32(Yn, 0))
467 + (vget_lane_f32(a, 1) * vget_lane_f32(Yn, 1));
468
469 /* Store the result in the accumulator in the destination buffer. */
470 *pOut++ = acc;
471
472 /* Every time after the output is computed state should be updated. */
473 /* The states should be updated as: */
474 /* Xn2 = Xn1 */
475 /* Xn1 = Xn */
476 /* Yn2 = Yn1 */
477 /* Yn1 = acc */
478 Xn = vsetq_lane_f32(vgetq_lane_f32(Xn, 3),Xn,2);
479 Xn = vsetq_lane_f32(Xns,Xn,3);
480 Yn = vset_lane_f32(vget_lane_f32(Yn, 1),Yn,0);
481 Yn = vset_lane_f32(acc,Yn,1);
482
483 /* Decrement the loop counter */
484 sample--;
485
486 }
487
488 vst1q_f32(pState,vcombine_f32((vget_high_f32(Xn)),(Yn)));
489 pState += 4;
490 /* Store the updated state variables back into the pState array */
491
492 /* The first stage goes from the input buffer to the output buffer. */
493 /* Subsequent numStages occur in-place in the output buffer */
494 pIn = pDst;
495
496 /* Reset the output pointer */
497 pOut = pDst;
498
499 /* Decrement the loop counter */
500 stage--;
501 }
502 }
503
504 #else
arm_biquad_cascade_df1_f32(const arm_biquad_casd_df1_inst_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)505 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f32(
506 const arm_biquad_casd_df1_inst_f32 * S,
507 const float32_t * pSrc,
508 float32_t * pDst,
509 uint32_t blockSize)
510 {
511 const float32_t *pIn = pSrc; /* Source pointer */
512 float32_t *pOut = pDst; /* Destination pointer */
513 float32_t *pState = S->pState; /* pState pointer */
514 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
515 float32_t acc; /* Accumulator */
516 float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
517 float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
518 float32_t Xn; /* Temporary input */
519 uint32_t sample, stage = S->numStages; /* Loop counters */
520
521 do
522 {
523 /* Reading the coefficients */
524 b0 = *pCoeffs++;
525 b1 = *pCoeffs++;
526 b2 = *pCoeffs++;
527 a1 = *pCoeffs++;
528 a2 = *pCoeffs++;
529
530 /* Reading the pState values */
531 Xn1 = pState[0];
532 Xn2 = pState[1];
533 Yn1 = pState[2];
534 Yn2 = pState[3];
535
536 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
537
538 /* Apply loop unrolling and compute 4 output values simultaneously. */
539 /* Variable acc hold output values that are being computed:
540 *
541 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
542 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
543 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
544 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
545 */
546
547 /* Loop unrolling: Compute 4 outputs at a time */
548 sample = blockSize >> 2U;
549
550 while (sample > 0U)
551 {
552 /* Read the first input */
553 Xn = *pIn++;
554
555 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
556 Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
557
558 /* Store output in destination buffer. */
559 *pOut++ = Yn2;
560
561 /* Every time after the output is computed state should be updated. */
562 /* The states should be updated as: */
563 /* Xn2 = Xn1 */
564 /* Xn1 = Xn */
565 /* Yn2 = Yn1 */
566 /* Yn1 = acc */
567
568 /* Read the second input */
569 Xn2 = *pIn++;
570
571 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
572 Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
573
574 /* Store output in destination buffer. */
575 *pOut++ = Yn1;
576
577 /* Every time after the output is computed state should be updated. */
578 /* The states should be updated as: */
579 /* Xn2 = Xn1 */
580 /* Xn1 = Xn */
581 /* Yn2 = Yn1 */
582 /* Yn1 = acc */
583
584 /* Read the third input */
585 Xn1 = *pIn++;
586
587 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
588 Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
589
590 /* Store output in destination buffer. */
591 *pOut++ = Yn2;
592
593 /* Every time after the output is computed state should be updated. */
594 /* The states should be updated as: */
595 /* Xn2 = Xn1 */
596 /* Xn1 = Xn */
597 /* Yn2 = Yn1 */
598 /* Yn1 = acc */
599
600 /* Read the forth input */
601 Xn = *pIn++;
602
603 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
604 Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
605
606 /* Store output in destination buffer. */
607 *pOut++ = Yn1;
608
609 /* Every time after the output is computed state should be updated. */
610 /* The states should be updated as: */
611 /* Xn2 = Xn1 */
612 /* Xn1 = Xn */
613 /* Yn2 = Yn1 */
614 /* Yn1 = acc */
615 Xn2 = Xn1;
616 Xn1 = Xn;
617
618 /* decrement loop counter */
619 sample--;
620 }
621
622 /* Loop unrolling: Compute remaining outputs */
623 sample = blockSize & 0x3U;
624
625 #else
626
627 /* Initialize blkCnt with number of samples */
628 sample = blockSize;
629
630 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
631
632 while (sample > 0U)
633 {
634 /* Read the input */
635 Xn = *pIn++;
636
637 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
638 acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
639
640 /* Store output in destination buffer. */
641 *pOut++ = acc;
642
643 /* Every time after the output is computed state should be updated. */
644 /* The states should be updated as: */
645 /* Xn2 = Xn1 */
646 /* Xn1 = Xn */
647 /* Yn2 = Yn1 */
648 /* Yn1 = acc */
649 Xn2 = Xn1;
650 Xn1 = Xn;
651 Yn2 = Yn1;
652 Yn1 = acc;
653
654 /* decrement loop counter */
655 sample--;
656 }
657
658 /* Store the updated state variables back into the pState array */
659 *pState++ = Xn1;
660 *pState++ = Xn2;
661 *pState++ = Yn1;
662 *pState++ = Yn2;
663
664 /* The first stage goes from the input buffer to the output buffer. */
665 /* Subsequent numStages occur in-place in the output buffer */
666 pIn = pDst;
667
668 /* Reset output pointer */
669 pOut = pDst;
670
671 /* decrement loop counter */
672 stage--;
673
674 } while (stage > 0U);
675
676 }
677
678 #endif /* #if defined(ARM_MATH_NEON) */
679 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
680
681 /**
682 @} end of BiquadCascadeDF1 group
683 */
684