1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df1_q15.c
4  * Description:  Processing function for the Q15 Biquad cascade DirectFormI(DF1) filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup BiquadCascadeDF1
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 Biquad cascade filter.
42   @param[in]     S         points to an instance of the Q15 Biquad cascade structure
43   @param[in]     pSrc      points to the block of input data
44   @param[out]    pDst      points to the location where the output result is written
45   @param[in]     blockSize number of samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    The function is implemented using a 64-bit internal accumulator.
49                    Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
50                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
51                    There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52                    The accumulator is then shifted by <code>postShift</code> bits to truncate the result to 1.15 format by discarding the low 16 bits.
53                    Finally, the result is saturated to 1.15 format.
54   @remark
55                    Refer to \ref arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter.
56  */
57 
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59 
arm_biquad_cascade_df1_q15(const arm_biquad_casd_df1_inst_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)60 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_q15(
61   const arm_biquad_casd_df1_inst_q15 * S,
62   const q15_t * pSrc,
63         q15_t * pDst,
64         uint32_t blockSize)
65 {
66     const q15_t    *pIn = pSrc; /*  input pointer initialization  */
67     q15_t          *pOut = pDst;        /*  output pointer initialization */
68     int             shift;
69     uint32_t        sample, stages = S->numStages;      /*  loop counters                 */
70     int             postShift = S->postShift;
71     q15x8_t         bCoeffs0, bCoeffs1, bCoeffs2, bCoeffs3;     /*  Coefficients vector           */
72     q15_t          *pState = S->pState; /*  pState pointer initialization */
73     q15x8_t         inVec0;
74     int64_t         acc;
75     const q15_t     *pCoeffs = S->pCoeffs;       /*  coeff pointer initialization  */
76     q31_t           out, out1;
77 
78     shift = (15 - postShift) - 32;
79 
80     do {
81         q15_t           a2 = pCoeffs[5];
82         q15_t           a1 = pCoeffs[4];
83 
84         bCoeffs0 = vdupq_n_s16(0);
85         bCoeffs0[0] = pCoeffs[3];       // b2
86         bCoeffs0[1] = pCoeffs[2];       // b1
87         bCoeffs0[2] = pCoeffs[0];       // b0
88 
89         uint32_t        zero = 0;
90         bCoeffs1 = bCoeffs0;
91         bCoeffs1 = vshlcq_s16(bCoeffs1, &zero, 16);
92 
93         bCoeffs2 = bCoeffs1;
94         bCoeffs2 = vshlcq_s16(bCoeffs2, &zero, 16);
95 
96         bCoeffs3 = bCoeffs2;
97         bCoeffs3 = vshlcq_s16(bCoeffs3, &zero, 16);
98 
99         bCoeffs0[6] = a2;
100         bCoeffs0[7] = a1;
101         bCoeffs1[7] = a2;
102         bCoeffs1[6] = a1;
103 
104         bCoeffs2 =
105             vsetq_lane_s32(vgetq_lane_s32((q31x4_t) bCoeffs0, 3), (q31x4_t) bCoeffs2, 3);
106         bCoeffs3 =
107             vsetq_lane_s32(vgetq_lane_s32((q31x4_t) bCoeffs1, 3), (q31x4_t) bCoeffs3, 3);
108 
109 
110         /* 2 first elements are garbage, will be updated with history */
111         inVec0 = vld1q(pIn - 2);
112         pIn += 2;
113 
114         inVec0[0] = pState[1];
115         inVec0[1] = pState[0];
116         inVec0[6] = pState[3];
117         inVec0[7] = pState[2];
118 
119         acc = vmlaldavq(bCoeffs0, inVec0);
120         acc = sqrshrl_sat48(acc, shift);
121         out1 = (q31_t) ((acc >> 32) & 0xffffffff);
122 
123         inVec0[6] = out1;
124         acc = vmlaldavq(bCoeffs1, inVec0);
125         acc = sqrshrl_sat48(acc, shift);
126         out = (q31_t) ((acc >> 32) & 0xffffffff);
127 
128         inVec0[7] = out;
129         *pOut++ = (q15_t) out1;
130         *pOut++ = (q15_t) out;
131 
132         acc = vmlaldavq(bCoeffs2, inVec0);
133         acc = sqrshrl_sat48(acc, shift);
134         out1 = (q31_t) ((acc >> 32) & 0xffffffff);
135 
136 
137         inVec0[6] = out1;
138         acc = vmlaldavq(bCoeffs3, inVec0);
139         acc = sqrshrl_sat48(acc, shift);
140         out = (q31_t) ((acc >> 32) & 0xffffffff);
141 
142         /*
143          * main loop
144          */
145         sample = (blockSize - 4) >> 2U;
146         /* preload (efficient scheduling) */
147         inVec0 = vld1q(pIn);
148         pIn += 4;
149 
150         /*
151          * Compute 4 outputs at a time.
152          */
153         while (sample > 0U) {
154 
155             inVec0[6] = out1;
156             inVec0[7] = out;
157 
158             /* store */
159             *pOut++ = (q15_t) out1;
160             *pOut++ = (q15_t) out;
161 
162             /*
163              * in       { x0  x1  x2  x3  x4  x5  yn2  yn1 }
164              *                          x
165              * bCoeffs0 { b2  b1  b0   0   0   0   a2  a1 }
166              *
167              */
168             acc = vmlaldavq(bCoeffs0, inVec0);
169             /* shift + saturate to 16 bit */
170             acc = sqrshrl_sat48(acc, shift);
171             out1 = (q31_t) ((acc >> 32) & 0xffffffff);
172             inVec0[6] = out1;
173 
174             /*
175              * in       { x0  x1  x2  x3  x4  x5  y0  yn1 }
176              *                          x
177              * bCoeffs1 {  0  b2  b1  b0   0   0  a1  a2 }
178              */
179             acc = vmlaldavq(bCoeffs1, inVec0);
180             acc = sqrshrl_sat48(acc, shift);
181             out = (q31_t) ((acc >> 32) & 0xffffffff);
182 
183             *pOut++ = (q15_t) out1;
184             *pOut++ = (q15_t) out;
185 
186 
187             inVec0[7] = out;
188             /*
189              * in       { x0  x1  x2  x3  x4  x5  y0  yp1 }
190              *                         x
191              * bCoeffs2 { 0   0   b2  b1  b0   0  a2  a1 }
192              */
193             acc = vmlaldavq(bCoeffs2, inVec0);
194             acc = sqrshrl_sat48(acc, shift);
195             out1 = (q31_t) ((acc >> 32) & 0xffffffff);
196             inVec0[6] = out1;
197 
198             /*
199              * in       { x0  x1  x2  x3  x4  x5  y0  yp1 }
200              *                         x
201              * bCoeffs2 {  0   0   0  b2  b1  b0  a1  a2  }
202              */
203             acc = vmlaldavq(bCoeffs3, inVec0);
204             acc = sqrshrl_sat48(acc, shift);
205             out = (q31_t) ((acc >> 32) & 0xffffffff);
206 
207             inVec0 = vld1q(pIn);
208             pIn += 4;
209 
210             /* decrement the loop counter  */
211             sample--;
212         }
213 
214         *pOut++ = (q15_t) out1;
215         *pOut++ = (q15_t) out;
216 
217         /*
218          * Tail handling
219          */
220         int32_t         loopRemainder = blockSize & 3;
221 
222         if (loopRemainder == 3) {
223             inVec0[6] = out1;
224             inVec0[7] = out;
225 
226             acc = vmlaldavq(bCoeffs0, inVec0);
227             acc = sqrshrl_sat48(acc, shift);
228             out1 = (q31_t) ((acc >> 32) & 0xffffffff);
229             inVec0[6] = out1;
230 
231             acc = vmlaldavq(bCoeffs1, inVec0);
232             acc = sqrshrl_sat48(acc, shift);
233             out = (q31_t) ((acc >> 32) & 0xffffffff);
234 
235             *pOut++ = (q15_t) out1;
236             *pOut++ = (q15_t) out;
237 
238             inVec0[7] = out;
239             acc = vmlaldavq(bCoeffs2, inVec0);
240             acc = sqrshrl_sat48(acc, shift);
241             out1 = (q31_t) ((acc >> 32) & 0xffffffff);
242             *pOut++ = (q15_t) out1;
243 
244             /* Store the updated state variables back into the pState array */
245             pState[0] = vgetq_lane_s16(inVec0, 4);
246             pState[1] = vgetq_lane_s16(inVec0, 3);
247             pState[3] = out;
248             pState[2] = out1;
249 
250         } else if (loopRemainder == 2) {
251             inVec0[6] = out1;
252             inVec0[7] = out;
253 
254             acc = vmlaldavq(bCoeffs0, inVec0);
255             acc = sqrshrl_sat48(acc, shift);
256             out1 = (q31_t) ((acc >> 32) & 0xffffffff);
257             inVec0[6] = out1;
258 
259             acc = vmlaldavq(bCoeffs1, inVec0);
260             acc = sqrshrl_sat48(acc, shift);
261             out = (q31_t) ((acc >> 32) & 0xffffffff);
262 
263             *pOut++ = (q15_t) out1;
264             *pOut++ = (q15_t) out;
265 
266             /* Store the updated state variables back into the pState array */
267             pState[0] = vgetq_lane_s16(inVec0, 3);
268             pState[1] = vgetq_lane_s16(inVec0, 2);
269             pState[3] = out1;
270             pState[2] = out;
271         } else if (loopRemainder == 1) {
272 
273             inVec0[6] = out1;
274             inVec0[7] = out;
275 
276             acc = vmlaldavq(bCoeffs0, inVec0);
277             acc = sqrshrl_sat48(acc, shift);
278             out1 = (q31_t) ((acc >> 32) & 0xffffffff);
279             *pOut++ = (q15_t) out1;
280 
281             /* Store the updated state variables back into the pState array */
282             pState[0] = vgetq_lane_s16(inVec0, 2);
283             pState[1] = vgetq_lane_s16(inVec0, 1);
284             pState[3] = out;
285             pState[2] = out1;
286 
287 
288         } else {
289             /* Store the updated state variables back into the pState array */
290             pState[0] = vgetq_lane_s16(inVec0, 1);
291             pState[1] = vgetq_lane_s16(inVec0, 0);
292             pState[3] = out1;
293             pState[2] = out;
294         }
295 
296         pState += 4;
297         pCoeffs += 6;
298 
299         /*
300          * The first stage goes from the input buffer to the output buffer.
301          * Subsequent stages occur in-place in the output buffer
302          */
303         pIn = pDst;
304         /*
305          * Reset to destination pointer
306          */
307         pOut = pDst;
308     }
309     while (--stages);
310 }
311 #else
arm_biquad_cascade_df1_q15(const arm_biquad_casd_df1_inst_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)312 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_q15(
313   const arm_biquad_casd_df1_inst_q15 * S,
314   const q15_t * pSrc,
315         q15_t * pDst,
316         uint32_t blockSize)
317 {
318 
319 
320 #if defined (ARM_MATH_DSP)
321 
322   const q15_t *pIn = pSrc;                             /* Source pointer */
323         q15_t *pOut = pDst;                            /* Destination pointer */
324         q31_t in;                                      /* Temporary variable to hold input value */
325         q31_t out;                                     /* Temporary variable to hold output value */
326         q31_t b0;                                      /* Temporary variable to hold bo value */
327         q31_t b1, a1;                                  /* Filter coefficients */
328         q31_t state_in, state_out;                     /* Filter state variables */
329         q31_t acc_l, acc_h;
330         q63_t acc;                                     /* Accumulator */
331         q15_t *pState = S->pState;                     /* State pointer */
332   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
333         int32_t lShift = (15 - (int32_t) S->postShift);       /* Post shift */
334         uint32_t sample, stage = (uint32_t) S->numStages;     /* Stage loop counter */
335         int32_t uShift = (32 - lShift);
336 
337   do
338   {
339     /* Read the b0 and 0 coefficients using SIMD  */
340     b0 = read_q15x2_ia ((q15_t **) &pCoeffs);
341 
342     /* Read the b1 and b2 coefficients using SIMD */
343     b1 = read_q15x2_ia ((q15_t **) &pCoeffs);
344 
345     /* Read the a1 and a2 coefficients using SIMD */
346     a1 = read_q15x2_ia ((q15_t **) &pCoeffs);
347 
348     /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
349     state_in = read_q15x2_ia (&pState);
350 
351     /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
352     state_out = read_q15x2_da (&pState);
353 
354     /* Apply loop unrolling and compute 2 output values simultaneously. */
355     /*      The variable acc hold output values that are being computed:
356      *
357      *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
358      *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
359      */
360     sample = blockSize >> 1U;
361 
362     /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
363      ** a second loop below computes the remaining 1 sample. */
364     while (sample > 0U)
365     {
366 
367       /* Read the input */
368       in = read_q15x2_ia ((q15_t **) &pIn);
369 
370       /* out =  b0 * x[n] + 0 * 0 */
371       out = __SMUAD(b0, in);
372 
373       /* acc +=  b1 * x[n-1] +  b2 * x[n-2] + out */
374       acc = __SMLALD(b1, state_in, out);
375       /* acc +=  a1 * y[n-1] +  a2 * y[n-2] */
376       acc = __SMLALD(a1, state_out, acc);
377 
378       /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
379       /* Calc lower part of acc */
380       acc_l = acc & 0xffffffff;
381 
382       /* Calc upper part of acc */
383       acc_h = (acc >> 32) & 0xffffffff;
384 
385       /* Apply shift for lower part of acc and upper part of acc */
386       out = (uint32_t) acc_l >> lShift | acc_h << uShift;
387 
388       out = __SSAT(out, 16);
389 
390       /* Every time after the output is computed state should be updated. */
391       /* The states should be updated as:  */
392       /* Xn2 = Xn1 */
393       /* Xn1 = Xn  */
394       /* Yn2 = Yn1 */
395       /* Yn1 = acc */
396       /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
397       /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
398 
399 #ifndef  ARM_MATH_BIG_ENDIAN
400       state_in  = __PKHBT(in, state_in, 16);
401       state_out = __PKHBT(out, state_out, 16);
402 #else
403       state_in  = __PKHBT(state_in >> 16, (in >> 16), 16);
404       state_out = __PKHBT(state_out >> 16, (out), 16);
405 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
406 
407       /* out =  b0 * x[n] + 0 * 0 */
408       out = __SMUADX(b0, in);
409       /* acc +=  b1 * x[n-1] +  b2 * x[n-2] + out */
410       acc = __SMLALD(b1, state_in, out);
411       /* acc +=  a1 * y[n-1] + a2 * y[n-2] */
412       acc = __SMLALD(a1, state_out, acc);
413 
414       /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
415       /* Calc lower part of acc */
416       acc_l = acc & 0xffffffff;
417 
418       /* Calc upper part of acc */
419       acc_h = (acc >> 32) & 0xffffffff;
420 
421       /* Apply shift for lower part of acc and upper part of acc */
422       out = (uint32_t) acc_l >> lShift | acc_h << uShift;
423 
424       out = __SSAT(out, 16);
425 
426       /* Store the output in the destination buffer. */
427 #ifndef  ARM_MATH_BIG_ENDIAN
428       write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
429 #else
430       write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
431 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
432 
433       /* Every time after the output is computed state should be updated. */
434       /* The states should be updated as:  */
435       /* Xn2 = Xn1 */
436       /* Xn1 = Xn  */
437       /* Yn2 = Yn1 */
438       /* Yn1 = acc */
439       /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
440       /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
441 #ifndef  ARM_MATH_BIG_ENDIAN
442       state_in  = __PKHBT(in >> 16, state_in, 16);
443       state_out = __PKHBT(out, state_out, 16);
444 #else
445       state_in  = __PKHBT(state_in >> 16, in, 16);
446       state_out = __PKHBT(state_out >> 16, out, 16);
447 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
448 
449       /* Decrement loop counter */
450       sample--;
451     }
452 
453     /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
454      ** No loop unrolling is used. */
455 
456     if ((blockSize & 0x1U) != 0U)
457     {
458       /* Read the input */
459       in = *pIn++;
460 
461       /* out =  b0 * x[n] + 0 * 0 */
462 #ifndef  ARM_MATH_BIG_ENDIAN
463       out = __SMUAD(b0, in);
464 #else
465       out = __SMUADX(b0, in);
466 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
467 
468       /* acc =  b1 * x[n-1] + b2 * x[n-2] + out */
469       acc = __SMLALD(b1, state_in, out);
470       /* acc +=  a1 * y[n-1] + a2 * y[n-2] */
471       acc = __SMLALD(a1, state_out, acc);
472 
473       /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
474       /* Calc lower part of acc */
475       acc_l = acc & 0xffffffff;
476 
477       /* Calc upper part of acc */
478       acc_h = (acc >> 32) & 0xffffffff;
479 
480       /* Apply shift for lower part of acc and upper part of acc */
481       out = (uint32_t) acc_l >> lShift | acc_h << uShift;
482 
483       out = __SSAT(out, 16);
484 
485       /* Store the output in the destination buffer. */
486       *pOut++ = (q15_t) out;
487 
488       /* Every time after the output is computed state should be updated. */
489       /* The states should be updated as:  */
490       /* Xn2 = Xn1 */
491       /* Xn1 = Xn  */
492       /* Yn2 = Yn1 */
493       /* Yn1 = acc */
494       /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
495       /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
496 #ifndef  ARM_MATH_BIG_ENDIAN
497       state_in = __PKHBT(in, state_in, 16);
498       state_out = __PKHBT(out, state_out, 16);
499 #else
500       state_in = __PKHBT(state_in >> 16, in, 16);
501       state_out = __PKHBT(state_out >> 16, out, 16);
502 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
503     }
504 
505     /* The first stage goes from the input wire to the output wire.  */
506     /* Subsequent numStages occur in-place in the output wire  */
507     pIn = pDst;
508 
509     /* Reset the output pointer */
510     pOut = pDst;
511 
512     /* Store the updated state variables back into the state array */
513     write_q15x2_ia (&pState, state_in);
514     write_q15x2_ia (&pState, state_out);
515 
516     /* Decrement loop counter */
517     stage--;
518 
519   } while (stage > 0U);
520 
521 #else
522 
523   const q15_t *pIn = pSrc;                             /* Source pointer */
524         q15_t *pOut = pDst;                            /* Destination pointer */
525         q15_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
526         q15_t Xn1, Xn2, Yn1, Yn2;                      /* Filter state variables */
527         q15_t Xn;                                      /* temporary input */
528         q63_t acc;                                     /* Accumulator */
529         int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
530         q15_t *pState = S->pState;                     /* State pointer */
531   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
532         uint32_t sample, stage = (uint32_t) S->numStages;     /* Stage loop counter */
533 
534   do
535   {
536     /* Reading the coefficients */
537     b0 = *pCoeffs++;
538     pCoeffs++;  // skip the 0 coefficient
539     b1 = *pCoeffs++;
540     b2 = *pCoeffs++;
541     a1 = *pCoeffs++;
542     a2 = *pCoeffs++;
543 
544     /* Reading the state values */
545     Xn1 = pState[0];
546     Xn2 = pState[1];
547     Yn1 = pState[2];
548     Yn2 = pState[3];
549 
550     /* The variables acc holds the output value that is computed:
551      *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
552      */
553 
554     sample = blockSize;
555 
556     while (sample > 0U)
557     {
558       /* Read the input */
559       Xn = *pIn++;
560 
561       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
562       /* acc =  b0 * x[n] */
563       acc = (q31_t) b0 *Xn;
564 
565       /* acc +=  b1 * x[n-1] */
566       acc += (q31_t) b1 *Xn1;
567       /* acc +=  b[2] * x[n-2] */
568       acc += (q31_t) b2 *Xn2;
569       /* acc +=  a1 * y[n-1] */
570       acc += (q31_t) a1 *Yn1;
571       /* acc +=  a2 * y[n-2] */
572       acc += (q31_t) a2 *Yn2;
573 
574       /* The result is converted to 1.31  */
575       acc = __SSAT((acc >> shift), 16);
576 
577       /* Every time after the output is computed state should be updated. */
578       /* The states should be updated as:  */
579       /* Xn2 = Xn1 */
580       /* Xn1 = Xn  */
581       /* Yn2 = Yn1 */
582       /* Yn1 = acc */
583       Xn2 = Xn1;
584       Xn1 = Xn;
585       Yn2 = Yn1;
586       Yn1 = (q15_t) acc;
587 
588       /* Store the output in the destination buffer. */
589       *pOut++ = (q15_t) acc;
590 
591       /* decrement the loop counter */
592       sample--;
593     }
594 
595     /*  The first stage goes from the input buffer to the output buffer. */
596     /*  Subsequent stages occur in-place in the output buffer */
597     pIn = pDst;
598 
599     /* Reset to destination pointer */
600     pOut = pDst;
601 
602     /*  Store the updated state variables back into the pState array */
603     *pState++ = Xn1;
604     *pState++ = Xn2;
605     *pState++ = Yn1;
606     *pState++ = Yn2;
607 
608   } while (--stage);
609 
610 #endif /* #if defined (ARM_MATH_DSP) */
611 
612 }
613 #endif /* defined(ARM_MATH_MVEI) */
614 
615 /**
616   @} end of BiquadCascadeDF1 group
617  */
618