1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df1_f32.c
4  * Description:  Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
37 
38   This set of functions implements arbitrary order recursive (IIR) filters.
39   The filters are implemented as a cascade of second order Biquad sections.
40   The functions support Q15, Q31 and floating-point data types.
41   Fast version of Q15 and Q31 also available.
42 
43   The functions operate on blocks of input and output data and each call to the function
44   processes <code>blockSize</code> samples through the filter.
45   <code>pSrc</code> points to the array of input data and
46   <code>pDst</code> points to the array of output data.
47   Both arrays contain <code>blockSize</code> values.
48 
49   @par           Algorithm
50                    Each Biquad stage implements a second order filter using the difference equation:
51   <pre>
52       y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
53   </pre>
54                   A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
55                   \image html Biquad.gif "Single Biquad filter stage"
56                   Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
57                   Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
58                   Pay careful attention to the sign of the feedback coefficients.
59                   Some design tools use the difference equation
60   <pre>
61       y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
62   </pre>
63                   In this case the feedback coefficients <code>a1</code> and <code>a2</code>
64                   must be negated when used with the CMSIS DSP Library.
65 
66   @par
67                    Higher order filters are realized as a cascade of second order sections.
68                    <code>numStages</code> refers to the number of second order stages used.
69                    For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
70                    \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
71                    A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
72 
73   @par
74                    The <code>pState</code> points to state variables array.
75                    Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
76                    The state variables are arranged in the <code>pState</code> array as:
77   <pre>
78       {x[n-1], x[n-2], y[n-1], y[n-2]}
79   </pre>
80 
81   @par
82                    The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
83                    The state array has a total length of <code>4*numStages</code> values.
84                    The state variables are updated after each block of data is processed, the coefficients are untouched.
85 
86   @par           Instance Structure
87                    The coefficients and state variables for a filter are stored together in an instance data structure.
88                    A separate instance structure must be defined for each filter.
89                    Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
90                    There are separate instance structure declarations for each of the 3 supported data types.
91 
92   @par           Init Function
93                    There is also an associated initialization function for each data type.
94                    The initialization function performs following operations:
95                    - Sets the values of the internal structure fields.
96                    - Zeros out the values in the state buffer.
97                    To do this manually without calling the init function, assign the follow subfields of the instance structure:
98                    numStages, pCoeffs, pState. Also set all of the values in pState to zero.
99 
100   @par
101                    Use of the initialization function is optional.
102                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
103                    To place an instance structure into a const data section, the instance structure must be manually initialized.
104                    Set the values in the state buffer to zeros before static initialization.
105                    The code below statically initializes each of the 3 different data type filter instance structures
106   <pre>
107       arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};
108       arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};
109       arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};
110   </pre>
111                    where <code>numStages</code> is the number of Biquad stages in the filter;
112                    <code>pState</code> is the address of the state buffer;
113                    <code>pCoeffs</code> is the address of the coefficient buffer;
114                    <code>postShift</code> shift to be applied.
115 
116   @par           Fixed-Point Behavior
117                    Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
118                    Following issues must be considered:
119                    - Scaling of coefficients
120                    - Filter gain
121                    - Overflow and saturation
122 
123   @par           Scaling of coefficients
124                    Filter coefficients are represented as fractional values and
125                    coefficients are restricted to lie in the range <code>[-1 +1)</code>.
126                    The fixed-point functions have an additional scaling parameter <code>postShift</code>
127                    which allow the filter coefficients to exceed the range <code>[+1 -1)</code>.
128                    At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
129                    \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
130                    This essentially scales the filter coefficients by <code>2^postShift</code>.
131                    For example, to realize the coefficients
132   <pre>
133      {1.5, -0.8, 1.2, 1.6, -0.9}
134   </pre>
135                    set the pCoeffs array to:
136   <pre>
137      {0.75, -0.4, 0.6, 0.8, -0.45}
138   </pre>
139                    and set <code>postShift=1</code>
140 
141   @par           Filter gain
142                    The frequency response of a Biquad filter is a function of its coefficients.
143                    It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
144                    This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
145                    To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
146 
147   @par           Overflow and saturation
148                    For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
149  */
150 
151 /**
152   @addtogroup BiquadCascadeDF1
153   @{
154  */
155 
156 /**
157   @brief         Processing function for the floating-point Biquad cascade filter.
158   @param[in]     S         points to an instance of the floating-point Biquad cascade structure
159   @param[in]     pSrc      points to the block of input data
160   @param[out]    pDst      points to the block of output data
161   @param[in]     blockSize  number of samples to process
162  */
163 
164 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
165 #include "arm_helium_utils.h"
arm_biquad_cascade_df1_f32(const arm_biquad_casd_df1_inst_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)166 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f32(
167   const arm_biquad_casd_df1_inst_f32 * S,
168   const float32_t * pSrc,
169   float32_t * pDst,
170   uint32_t blockSize)
171 {
172     const float32_t *pIn = pSrc;              /*  source pointer            */
173     float32_t *pOut = pDst;             /*  destination pointer       */
174     float32_t *pState = S->pState;      /*  pState pointer            */
175     const float32_t *pCoeffs = S->pCoeffs;    /*  coefficient pointer       */
176     float32_t Xn1, Xn2, Yn1, Yn2;       /*  Filter pState variables   */
177     float32_t lastX, lastY;             /*  X,Y history for tail handling */
178     float32_t X0, X1, X2, X3 = 0;       /*  temporary input           */
179     f32x4_t coeffs;
180     f32x4_t accVec;                   /* accumultor vector */
181     uint32_t  sample, stage = S->numStages; /*  loop counters             */
182 
183     do
184     {
185         /*
186          * Reading the pState values
187          */
188         Xn1 = pState[0];
189         Xn2 = pState[1];
190         Yn1 = pState[2];
191         Yn2 = pState[3];
192 
193         sample = blockSize >> 2U;
194 
195         /*
196          * First part of the processing with loop unrolling.  Compute 4 outputs at a time.
197          * second loop below computes the remaining 1 to 3 samples.
198          */
199         while (sample > 0U)
200         {
201             X0 = *pIn++;
202             X1 = *pIn++;
203             X2 = *pIn++;
204             X3 = *pIn++;
205 
206             coeffs = vld1q(pCoeffs);
207             accVec = vmulq(coeffs, X3);
208 
209             coeffs = vld1q(&pCoeffs[4]);
210             accVec = vfmaq(accVec, coeffs, X2);
211 
212             coeffs = vld1q(&pCoeffs[8]);
213             accVec = vfmaq(accVec, coeffs, X1);
214 
215             coeffs = vld1q(&pCoeffs[12]);
216             accVec = vfmaq(accVec, coeffs, X0);
217 
218             coeffs = vld1q(&pCoeffs[16]);
219             accVec = vfmaq(accVec, coeffs, Xn1);
220 
221             coeffs = vld1q(&pCoeffs[20]);
222             accVec = vfmaq(accVec, coeffs, Xn2);
223 
224             coeffs = vld1q(&pCoeffs[24]);
225             accVec = vfmaq(accVec, coeffs, Yn1);
226 
227             coeffs = vld1q(&pCoeffs[28]);
228             accVec = vfmaq(accVec, coeffs, Yn2);
229             /*
230              * Store the result in the accumulator in the destination buffer.
231              */
232             vst1q(pOut, accVec);
233             pOut += 4;
234 
235             /*
236              * update recurrence
237              */
238             Xn1 = X3;
239             Xn2 = X2;
240             Yn1 = vgetq_lane(accVec, 3);
241             Yn2 = vgetq_lane(accVec, 2);
242             /*
243              * decrement the loop counter
244              */
245             sample--;
246         }
247 
248         /*
249          * If the blockSize is not a multiple of 4,
250          * compute any remaining output samples here.
251          */
252         sample = blockSize & 0x3U;
253         if (sample)
254         {
255             /* save previous X, Y for modulo 1 length case */
256             lastX = X3;
257             lastY = Yn1;
258 
259             X0 = *pIn++;
260             X1 = *pIn++;
261             X2 = *pIn++;
262             X3 = *pIn++;
263 
264             coeffs = vld1q(pCoeffs);
265             accVec = vmulq(coeffs, X3);
266 
267             coeffs = vld1q(&pCoeffs[4]);
268             accVec = vfmaq(accVec, coeffs, X2);
269 
270             coeffs = vld1q(&pCoeffs[8]);
271             accVec = vfmaq(accVec, coeffs, X1);
272 
273             coeffs = vld1q(&pCoeffs[12]);
274             accVec = vfmaq(accVec, coeffs, X0);
275 
276             coeffs = vld1q(&pCoeffs[16]);
277             accVec = vfmaq(accVec, coeffs, Xn1);
278 
279             coeffs = vld1q(&pCoeffs[20]);
280             accVec = vfmaq(accVec, coeffs, Xn2);
281 
282             coeffs = vld1q(&pCoeffs[24]);
283             accVec = vfmaq(accVec, coeffs, Yn1);
284 
285             coeffs = vld1q(&pCoeffs[28]);
286             accVec = vfmaq(accVec, coeffs, Yn2);
287 
288             if (sample == 1)
289             {
290                 *pOut++ = vgetq_lane(accVec, 0);
291                 Xn1 = X0;
292                 Xn2 = lastX;
293                 Yn1 = vgetq_lane(accVec, 0);
294                 Yn2 = lastY;
295             }
296             else if (sample == 2)
297             {
298                 *pOut++ = vgetq_lane(accVec, 0);
299                 *pOut++ = vgetq_lane(accVec, 1);
300                 Xn1 = X1;
301                 Xn2 = X0;
302                 Yn1 = vgetq_lane(accVec, 1);
303                 Yn2 = vgetq_lane(accVec, 0);
304             }
305             else
306             {
307                 *pOut++ = vgetq_lane(accVec, 0);
308                 *pOut++ = vgetq_lane(accVec, 1);
309                 *pOut++ = vgetq_lane(accVec, 2);
310                 Xn1 = X2;
311                 Xn2 = X1;
312                 Yn1 = vgetq_lane(accVec, 2);
313                 Yn2 = vgetq_lane(accVec, 1);
314             }
315         }
316         /*
317          * Store the updated state variables back into the pState array
318          */
319         *pState++ = Xn1;
320         *pState++ = Xn2;
321         *pState++ = Yn1;
322         *pState++ = Yn2;
323 
324         pCoeffs += sizeof(arm_biquad_mod_coef_f32) / sizeof(float32_t);
325         /*
326          * The first stage goes from the input buffer to the output buffer.
327          * Subsequent numStages  occur in-place in the output buffer
328          */
329         pIn = pDst;
330         /*
331          * Reset the output pointer
332          */
333         pOut = pDst;
334         /*
335          * decrement the loop counter
336          */
337         stage--;
338     }
339     while (stage > 0U);
340 }
341 #else
342 #if defined(ARM_MATH_NEON)  && !defined(ARM_MATH_AUTOVECTORIZE)
arm_biquad_cascade_df1_f32(const arm_biquad_casd_df1_inst_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)343 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f32(
344   const arm_biquad_casd_df1_inst_f32 * S,
345   const float32_t * pSrc,
346   float32_t * pDst,
347   uint32_t blockSize)
348 {
349 
350   const float32_t *pIn = pSrc;                         /*  source pointer            */
351   float32_t *pOut = pDst;                        /*  destination pointer       */
352   float32_t *pState = S->pState;                 /*  pState pointer            */
353   const float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
354   float32_t acc;                                 /*  Simulates the accumulator */
355 
356   uint32_t sample, stage = S->numStages;         /*  loop counters             */
357 
358   float32x4_t Xn;
359   float32x2_t Yn;
360   float32x2_t a;
361   float32x4_t b;
362 
363   float32x4_t x,tmp;
364   float32x2_t t;
365   float32x2x2_t y;
366 
367   float32_t Xns;
368 
369   while (stage > 0U)
370   {
371     /* Reading the coefficients */
372     Xn = vdupq_n_f32(0.0f);
373 
374     Xn = vsetq_lane_f32(pState[0],Xn,2);
375     Xn = vsetq_lane_f32(pState[1],Xn,3);
376     Yn = vset_lane_f32(pState[2],Yn,0);
377     Yn = vset_lane_f32(pState[3],Yn,1);
378 
379     b = vld1q_f32(pCoeffs);
380     b = vrev64q_f32(b);
381     b = vcombine_f32(vget_high_f32(b), vget_low_f32(b));
382 
383     a = vld1_f32(pCoeffs + 3);
384     a = vrev64_f32(a);
385     b = vsetq_lane_f32(0.0f,b,0);
386     pCoeffs += 5;
387 
388     /* Reading the pState values */
389 
390     /* Apply loop unrolling and compute 4 output values simultaneously. */
391     /*      The variable acc hold output values that are being computed:
392      *
393      *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
394      *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
395      *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
396      *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
397      */
398 
399     /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
400      ** a second loop below computes the remaining 1 to 3 samples. */
401     sample = blockSize >> 2U;
402 
403     while (sample > 0U)
404     {
405       /* Read the first 4 inputs */
406       x = vld1q_f32(pIn);
407 
408       pIn += 4;
409 
410       tmp = vextq_f32(Xn, x, 1);
411       t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
412       t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
413       t = vmla_f32(t, a, Yn);
414       t = vpadd_f32(t, t);
415       Yn = vext_f32(Yn, t, 1);
416 
417       tmp = vextq_f32(Xn, x, 2);
418       t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
419       t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
420       t = vmla_f32(t, a, Yn);
421       t = vpadd_f32(t, t);
422       Yn = vext_f32(Yn, t, 1);
423 
424       y.val[0] = Yn;
425 
426       tmp = vextq_f32(Xn, x, 3);
427       t = vmul_f32(vget_high_f32(b), vget_high_f32(tmp));
428       t = vmla_f32(t, vget_low_f32(b), vget_low_f32(tmp));
429       t = vmla_f32(t, a, Yn);
430       t = vpadd_f32(t, t);
431       Yn = vext_f32(Yn, t, 1);
432 
433       Xn = x;
434       t = vmul_f32(vget_high_f32(b), vget_high_f32(Xn));
435       t = vmla_f32(t, vget_low_f32(b), vget_low_f32(Xn));
436       t = vmla_f32(t, a, Yn);
437       t = vpadd_f32(t, t);
438       Yn = vext_f32(Yn, t, 1);
439 
440       y.val[1] = Yn;
441 
442       tmp = vcombine_f32(y.val[0], y.val[1]);
443 
444       /* Store the 4 outputs and increment the pointer */
445       vst1q_f32(pOut, tmp);
446       pOut += 4;
447 
448       /* Decrement the loop counter */
449       sample--;
450     }
451 
452 
453     /* If the block size is not a multiple of 4, compute any remaining output samples here.
454      ** No loop unrolling is used. */
455     sample = blockSize & 0x3U;
456 
457     while (sample > 0U)
458     {
459       /* Read the input */
460       Xns = *pIn++;
461 
462       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
463       acc =  (vgetq_lane_f32(b, 1) * vgetq_lane_f32(Xn, 2))
464       + (vgetq_lane_f32(b, 2) * vgetq_lane_f32(Xn, 3))
465       + (vgetq_lane_f32(b, 3) * Xns)
466       + (vget_lane_f32(a, 0) * vget_lane_f32(Yn, 0))
467       + (vget_lane_f32(a, 1) * vget_lane_f32(Yn, 1));
468 
469       /* Store the result in the accumulator in the destination buffer. */
470       *pOut++ = acc;
471 
472       /* Every time after the output is computed state should be updated. */
473       /* The states should be updated as:    */
474       /* Xn2 = Xn1   */
475       /* Xn1 = Xn    */
476       /* Yn2 = Yn1   */
477       /* Yn1 = acc   */
478       Xn = vsetq_lane_f32(vgetq_lane_f32(Xn, 3),Xn,2);
479       Xn = vsetq_lane_f32(Xns,Xn,3);
480       Yn = vset_lane_f32(vget_lane_f32(Yn, 1),Yn,0);
481       Yn = vset_lane_f32(acc,Yn,1);
482 
483       /* Decrement the loop counter */
484       sample--;
485 
486     }
487 
488     vst1q_f32(pState,vcombine_f32((vget_high_f32(Xn)),(Yn)));
489     pState += 4;
490     /*  Store the updated state variables back into the pState array */
491 
492     /*  The first stage goes from the input buffer to the output buffer. */
493     /*  Subsequent numStages  occur in-place in the output buffer */
494     pIn = pDst;
495 
496     /* Reset the output pointer */
497     pOut = pDst;
498 
499     /* Decrement the loop counter */
500     stage--;
501   }
502 }
503 
504 #else
arm_biquad_cascade_df1_f32(const arm_biquad_casd_df1_inst_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)505 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f32(
506   const arm_biquad_casd_df1_inst_f32 * S,
507   const float32_t * pSrc,
508         float32_t * pDst,
509         uint32_t blockSize)
510 {
511   const float32_t *pIn = pSrc;                         /* Source pointer */
512         float32_t *pOut = pDst;                        /* Destination pointer */
513         float32_t *pState = S->pState;                 /* pState pointer */
514   const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
515         float32_t acc;                                 /* Accumulator */
516         float32_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
517         float32_t Xn1, Xn2, Yn1, Yn2;                  /* Filter pState variables */
518         float32_t Xn;                                  /* Temporary input */
519         uint32_t sample, stage = S->numStages;         /* Loop counters */
520 
521   do
522   {
523     /* Reading the coefficients */
524     b0 = *pCoeffs++;
525     b1 = *pCoeffs++;
526     b2 = *pCoeffs++;
527     a1 = *pCoeffs++;
528     a2 = *pCoeffs++;
529 
530     /* Reading the pState values */
531     Xn1 = pState[0];
532     Xn2 = pState[1];
533     Yn1 = pState[2];
534     Yn2 = pState[3];
535 
536 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
537 
538     /* Apply loop unrolling and compute 4 output values simultaneously. */
539     /* Variable acc hold output values that are being computed:
540      *
541      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
542      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
543      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
544      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
545      */
546 
547     /* Loop unrolling: Compute 4 outputs at a time */
548     sample = blockSize >> 2U;
549 
550     while (sample > 0U)
551     {
552       /* Read the first input */
553       Xn = *pIn++;
554 
555       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
556       Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
557 
558       /* Store output in destination buffer. */
559       *pOut++ = Yn2;
560 
561       /* Every time after the output is computed state should be updated. */
562       /* The states should be updated as: */
563       /* Xn2 = Xn1 */
564       /* Xn1 = Xn  */
565       /* Yn2 = Yn1 */
566       /* Yn1 = acc */
567 
568       /* Read the second input */
569       Xn2 = *pIn++;
570 
571       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
572       Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
573 
574       /* Store output in destination buffer. */
575       *pOut++ = Yn1;
576 
577       /* Every time after the output is computed state should be updated. */
578       /* The states should be updated as: */
579       /* Xn2 = Xn1 */
580       /* Xn1 = Xn  */
581       /* Yn2 = Yn1 */
582       /* Yn1 = acc */
583 
584       /* Read the third input */
585       Xn1 = *pIn++;
586 
587       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
588       Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
589 
590       /* Store output in destination buffer. */
591       *pOut++ = Yn2;
592 
593       /* Every time after the output is computed state should be updated. */
594       /* The states should be updated as: */
595       /* Xn2 = Xn1 */
596       /* Xn1 = Xn  */
597       /* Yn2 = Yn1 */
598       /* Yn1 = acc */
599 
600       /* Read the forth input */
601       Xn = *pIn++;
602 
603       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
604       Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
605 
606       /* Store output in destination buffer. */
607       *pOut++ = Yn1;
608 
609       /* Every time after the output is computed state should be updated. */
610       /* The states should be updated as: */
611       /* Xn2 = Xn1 */
612       /* Xn1 = Xn  */
613       /* Yn2 = Yn1 */
614       /* Yn1 = acc */
615       Xn2 = Xn1;
616       Xn1 = Xn;
617 
618       /* decrement loop counter */
619       sample--;
620     }
621 
622     /* Loop unrolling: Compute remaining outputs */
623     sample = blockSize & 0x3U;
624 
625 #else
626 
627     /* Initialize blkCnt with number of samples */
628     sample = blockSize;
629 
630 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
631 
632     while (sample > 0U)
633     {
634       /* Read the input */
635       Xn = *pIn++;
636 
637       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
638       acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
639 
640       /* Store output in destination buffer. */
641       *pOut++ = acc;
642 
643       /* Every time after the output is computed state should be updated. */
644       /* The states should be updated as: */
645       /* Xn2 = Xn1 */
646       /* Xn1 = Xn  */
647       /* Yn2 = Yn1 */
648       /* Yn1 = acc */
649       Xn2 = Xn1;
650       Xn1 = Xn;
651       Yn2 = Yn1;
652       Yn1 = acc;
653 
654       /* decrement loop counter */
655       sample--;
656     }
657 
658     /* Store the updated state variables back into the pState array */
659     *pState++ = Xn1;
660     *pState++ = Xn2;
661     *pState++ = Yn1;
662     *pState++ = Yn2;
663 
664     /* The first stage goes from the input buffer to the output buffer. */
665     /* Subsequent numStages occur in-place in the output buffer */
666     pIn = pDst;
667 
668     /* Reset output pointer */
669     pOut = pDst;
670 
671     /* decrement loop counter */
672     stage--;
673 
674   } while (stage > 0U);
675 
676 }
677 
678 #endif /* #if defined(ARM_MATH_NEON) */
679 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
680 
681 /**
682   @} end of BiquadCascadeDF1 group
683  */
684