1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df1_32x64_q31.c
4  * Description:  High precision Q31 Biquad cascade filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
37 
38   This function implements a high precision Biquad cascade filter which operates on
39   Q31 data values.  The filter coefficients are in 1.31 format and the state variables
40   are in 1.63 format.  The double precision state variables reduce quantization noise
41   in the filter and provide a cleaner output.
42   These filters are particularly useful when implementing filters in which the
43   singularities are close to the unit circle.  This is common for low pass or high
44   pass filters with very low cutoff frequencies.
45 
46   The function operates on blocks of input and output data
47   and each call to the function processes <code>blockSize</code> samples through
48   the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
49   containing <code>blockSize</code> Q31 values.
50 
51   @par           Algorithm
52                    Each Biquad stage implements a second order filter using the difference equation:
53   <pre>
54       y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
55   </pre>
56                    A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
57                    \image html Biquad.gif "Single Biquad filter stage"
58                    Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
59                    Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
60                    Pay careful attention to the sign of the feedback coefficients.
61                    Some design tools use the difference equation
62   <pre>
63       y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
64   </pre>
65                    In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
66   @par
67                    Higher order filters are realized as a cascade of second order sections.
68                    <code>numStages</code> refers to the number of second order stages used.
69                    For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
70                    \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
71                    A 9th order filter would be realized with <code>numStages=5</code> second order stages
72                    with the coefficients for one of the stages configured as a first order filter
73                    (<code>b2=0</code> and <code>a2=0</code>).
74   @par
75                    The <code>pState</code> points to state variables array.
76                    Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
77                    The state variables are arranged in the array as:
78   <pre>
79       {x[n-1], x[n-2], y[n-1], y[n-2]}
80   </pre>
81   @par
82                    The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
83                    The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
84                    The state variables are updated after each block of data is processed, the coefficients are untouched.
85 
86   @par           Instance Structure
87                    The coefficients and state variables for a filter are stored together in an instance data structure.
88                    A separate instance structure must be defined for each filter.
89                    Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
90 
91   @par           Init Function
92                    There is also an associated initialization function which performs the following operations:
93                    - Sets the values of the internal structure fields.
94                    - Zeros out the values in the state buffer.
95                    To do this manually without calling the init function, assign the follow subfields of the instance structure:
96                    numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
97 
98   @par
99                    Use of the initialization function is optional.
100                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
101                    To place an instance structure into a const data section, the instance structure must be manually initialized.
102                    Set the values in the state buffer to zeros before static initialization.
103                    For example, to statically initialize the filter instance structure use
104   <pre>
105       arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
106   </pre>
107                    where <code>numStages</code> is the number of Biquad stages in the filter;
108                    <code>pState</code> is the address of the state buffer;
109                    <code>pCoeffs</code> is the address of the coefficient buffer;
110                    <code>postShift</code> shift to be applied which is described in detail below.
111   @par           Fixed-Point Behavior
112                    Care must be taken while using Biquad Cascade 32x64 filter function.
113                    Following issues must be considered:
114                    - Scaling of coefficients
115                    - Filter gain
116                    - Overflow and saturation
117 
118   @par
119                    Filter coefficients are represented as fractional values and
120                    restricted to lie in the range <code>[-1 +1)</code>.
121                    The processing function has an additional scaling parameter <code>postShift</code>
122                    which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
123                    At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
124                    \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
125                    This essentially scales the filter coefficients by <code>2^postShift</code>.
126                    For example, to realize the coefficients
127   <pre>
128      {1.5, -0.8, 1.2, 1.6, -0.9}
129   </pre>
130                    set the Coefficient array to:
131   <pre>
132      {0.75, -0.4, 0.6, 0.8, -0.45}
133   </pre>
134                    and set <code>postShift=1</code>
135   @par
136                    The second thing to keep in mind is the gain through the filter.
137                    The frequency response of a Biquad filter is a function of its coefficients.
138                    It is possible for the gain through the filter to exceed 1.0 meaning that the
139                    filter increases the amplitude of certain frequencies.
140                    This means that an input signal with amplitude < 1.0 may result in an output > 1.0
141                    and these are saturated or overflowed based on the implementation of the filter.
142                    To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0
143                    or the input signal must be scaled down so that the combination of input and filter are never overflowed.
144   @par
145                    The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
146                    This is described in the function specific documentation below.
147  */
148 
149 /**
150   @addtogroup BiquadCascadeDF1_32x64
151   @{
152  */
153 
154 /**
155   @brief         Processing function for the Q31 Biquad cascade 32x64 filter.
156   @param[in]     S         points to an instance of the high precision Q31 Biquad cascade filter
157   @param[in]     pSrc      points to the block of input data
158   @param[out]    pDst      points to the block of output data
159   @param[in]     blockSize number of samples to process
160 
161   @par           Details
162                    The function is implemented using an internal 64-bit accumulator.
163                    The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
164                    Thus, if the accumulator result overflows it wraps around rather than clip.
165                    In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
166                    After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
167                    1.31 format by discarding the low 32 bits.
168   @par
169                    Two related functions are provided in the CMSIS DSP library.
170                    - \ref arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
171                    - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
172  */
173 
174 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
175 
176 #include "arm_helium_utils.h"
177 
arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)178 static void arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,
179   const q31_t * pSrc,
180         q31_t * pDst,
181         uint32_t blockSize)
182 {
183   const q31_t *pIn = pSrc;                             /* input pointer initialization */
184         q31_t *pOut = pDst;                            /* output pointer initialization */
185         q63_t *pState = S->pState;                     /* state pointer initialization */
186   const q31_t *pCoeffs = S->pCoeffs;                   /* coeff pointer initialization */
187         q63_t acc;                                     /* accumulator */
188         q31_t Xn1, Xn2;                                /* Input Filter state variables */
189         q63_t Yn1, Yn2;                                /* Output Filter state variables */
190         q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
191         q31_t Xn;                                      /* temporary input */
192         int32_t shift = (int32_t) S->postShift + 1;    /* Shift to be applied to the output */
193         uint32_t sample, stage = S->numStages;         /* loop counters */
194         q31_t acc_l, acc_h;                            /* temporary output */
195         uint32_t uShift = ((uint32_t) S->postShift + 1U);
196         uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */
197 
198   do
199   {
200     /* Reading the coefficients */
201     b0 = *pCoeffs++;
202     b1 = *pCoeffs++;
203     b2 = *pCoeffs++;
204     a1 = *pCoeffs++;
205     a2 = *pCoeffs++;
206 
207     /* Reading the state values */
208     Xn1 = (q31_t) (pState[0]);
209     Xn2 = (q31_t) (pState[1]);
210     Yn1 = pState[2];
211     Yn2 = pState[3];
212 
213     /* Initialize blkCnt with number of samples */
214     sample = blockSize;
215 
216     while (sample > 0U)
217     {
218       /* Read the input */
219       Xn = *pIn++;
220 
221       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
222 
223       /* acc =  b0 * x[n] */
224       acc = (q63_t) Xn * b0;
225       /* acc +=  b1 * x[n-1] */
226       acc += (q63_t) Xn1 * b1;
227       /* acc +=  b[2] * x[n-2] */
228       acc += (q63_t) Xn2 * b2;
229       /* acc +=  a1 * y[n-1] */
230       acc += mult32x64(Yn1, a1);
231       /* acc +=  a2 * y[n-2] */
232       acc += mult32x64(Yn2, a2);
233 
234       /* Every time after the output is computed state should be updated. */
235       /* The states should be updated as: */
236       /* Xn2 = Xn1 */
237       /* Xn1 = Xn  */
238       /* Yn2 = Yn1 */
239       /* Yn1 = acc */
240       Xn2 = Xn1;
241       Xn1 = Xn;
242       Yn2 = Yn1;
243 
244       /* The result is converted to 1.63, Yn1 variable is reused  */
245       Yn1 = acc << shift;
246 
247       /* Calc lower part of acc */
248       acc_l = acc & 0xffffffff;
249 
250       /* Calc upper part of acc */
251       acc_h = (acc >> 32) & 0xffffffff;
252 
253       /* Apply shift for lower part of acc and upper part of acc */
254       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
255 
256       /* Store the output in the destination buffer in 1.31 format. */
257       *pOut++ = acc_h;
258       /* Yn1 = acc << shift; */
259 
260       /* Store the output in the destination buffer in 1.31 format. */
261 /*    *pOut++ = (q31_t) (acc >> (32 - shift));  */
262 
263       /* decrement loop counter */
264       sample--;
265     }
266 
267     /* The first stage output is given as input to the second stage. */
268     pIn = pDst;
269 
270     /* Reset to destination buffer working pointer */
271     pOut = pDst;
272 
273     /*  Store the updated state variables back into the pState array */
274     *pState++ = (q63_t) Xn1;
275     *pState++ = (q63_t) Xn2;
276     *pState++ = Yn1;
277     *pState++ = Yn2;
278 
279   } while (--stage);
280 
281 }
282 
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)283 ARM_DSP_ATTRIBUTE void arm_biquad_cas_df1_32x64_q31(
284   const arm_biquad_cas_df1_32x64_ins_q31 * S,
285   const q31_t * pSrc,
286         q31_t * pDst,
287         uint32_t blockSize)
288 {
289     const q31_t *pIn = pSrc;    /*  input pointer initialization  */
290     q31_t    *pOut = pDst;      /*  output pointer initialization */
291     q63_t    *pState = S->pState;   /*  state pointer initialization  */
292     const q31_t    *pCoeffs = S->pCoeffs; /*  coeff pointer initialization  */
293     q31_t     Xn1, Xn2;         /*  Input Filter state variables        */
294     q63_t     Yn1, Yn2;         /*  Output Filter state variables        */
295     q31_t     b0, b1, b2, a1, a2;   /*  Filter coefficients           */
296     int32_t   shift = (int32_t) S->postShift + 1;   /*  Shift to be applied to the output */
297     uint32_t  sample, stage = S->numStages; /*  loop counters                     */
298     q31x4_t vecCoef = { 0 }, vecIn;
299     q63_t     acc;
300 
301     if (blockSize <= 3)
302     {
303       arm_biquad_cas_df1_32x64_q31_scalar(S,pSrc,pDst,blockSize);
304     }
305     else
306     {
307       do
308       {
309           uint32_t  i;
310           /*
311            * Reading the coefficients
312            */
313           b0 = *pCoeffs++;
314           b1 = *pCoeffs++;
315           b2 = *pCoeffs++;
316           a1 = *pCoeffs++;
317           a2 = *pCoeffs++;
318 
319           vecCoef[0] = 0;
320           vecCoef[1] = b2;
321           vecCoef[2] = b1;
322           vecCoef[3] = b0;
323           /*
324            * Reading the state values
325            */
326           Xn1 = pState[0];
327           Xn2 = pState[1];
328           Yn1 = pState[2];
329           Yn2 = pState[3];
330 
331           /*
332            * append history with initial samples
333            */
334           q31_t     hist[6];
335           hist[0] = 0;
336           hist[1] = Xn2;
337           hist[2] = Xn1;
338           hist[3] = pIn[0];
339           hist[4] = pIn[1];
340           hist[5] = pIn[2];
341 
342           const q31_t  *pIn1 = hist;
343           q31x4_t vecIn0 = *(q31x4_t *) & pIn[0];
344           q31x4_t vecIn1 = *(q31x4_t *) & pIn[1];
345           q31x4_t vecIn2 = *(q31x4_t *) & pIn[2];
346 
347           i = 3;
348           do
349           {
350               acc = mult32x64(Yn1, a1);
351               acc += mult32x64(Yn2, a2);
352               Yn2 = Yn1;
353               Yn1 = acc;
354               vecIn = vld1q(pIn1);
355               pIn1 += 1;
356               Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
357               Yn1 = asrl(Yn1, -shift);
358               /*
359                * Store the output in the destination buffer in 1.31 format.
360                */
361               *pOut++ = (q31_t) (Yn1 >> 32);
362           }
363           while (--i);
364 
365           sample = blockSize - 3;
366           pIn1 = pIn + 3;
367 
368           i = sample / 4;
369           while (i > 0U)
370           {
371 
372               acc = mult32x64(Yn1, a1);
373               acc += mult32x64(Yn2, a2);
374               Yn2 = Yn1;
375               Yn1 = acc;
376               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
377               vecIn = vld1q(pIn1);
378               pIn1 += 1;
379               Yn1 = asrl(Yn1, -shift);
380               /*
381                * Store the output in the destination buffer in 1.31 format.
382                */
383               *pOut++ = (q31_t) (Yn1 >> 32);
384 
385               acc = mult32x64(Yn1, a1);
386               acc += mult32x64(Yn2, a2);
387               Yn2 = Yn1;
388               Yn1 = acc;
389               Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
390               vecIn0 = vld1q(pIn1);
391               pIn1 += 1;
392               Yn1 = asrl(Yn1, -shift);
393               *pOut++ = (q31_t) (Yn1 >> 32);
394 
395               acc = mult32x64(Yn1, a1);
396               acc += mult32x64(Yn2, a2);
397               Yn2 = Yn1;
398               Yn1 = acc;
399               Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
400               vecIn1 = vld1q(pIn1);
401               pIn1 += 1;
402               Yn1 = asrl(Yn1, -shift);
403               *pOut++ = (q31_t) (Yn1 >> 32);
404 
405               acc = mult32x64(Yn1, a1);
406               acc += mult32x64(Yn2, a2);
407               Yn2 = Yn1;
408               Yn1 = acc;
409               Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
410               vecIn2 = vld1q(pIn1);
411               pIn1 += 1;
412               Yn1 = asrl(Yn1, -shift);
413               *pOut++ = (q31_t) (Yn1 >> 32);
414               /*
415                * Decrement the loop counter
416                */
417               i--;
418           }
419           /*
420            * save input state
421            */
422           Xn2 = vecIn[2];
423           Xn1 = vecIn[3];
424 
425           int       loopRemainder = blockSize - 3 - 4 * ((blockSize - 3) / 4);
426           if (loopRemainder == 1)
427           {
428               /*
429                * remainder
430                */
431               acc = mult32x64(Yn1, a1);
432               acc += mult32x64(Yn2, a2);
433               Yn2 = Yn1;
434               Yn1 = acc;
435               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
436               Yn1 = asrl(Yn1, -shift);
437               *pOut++ = (q31_t) (Yn1 >> 32);
438               /*
439                * save input state
440                */
441               Xn2 = vecIn0[2];
442               Xn1 = vecIn0[3];
443 
444           }
445           else if (loopRemainder == 2)
446           {
447               acc = mult32x64(Yn1, a1);
448               acc += mult32x64(Yn2, a2);
449               Yn2 = Yn1;
450               Yn1 = acc;
451               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
452               Yn1 = asrl(Yn1, -shift);
453               *pOut++ = (q31_t) (Yn1 >> 32);
454 
455               acc = mult32x64(Yn1, a1);
456               acc += mult32x64(Yn2, a2);
457               Yn2 = Yn1;
458               Yn1 = acc;
459               Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
460               Yn1 = asrl(Yn1, -shift);
461               *pOut++ = (q31_t) (Yn1 >> 32);
462               /*
463                * save input state
464                */
465               Xn2 = vecIn1[2];
466               Xn1 = vecIn1[3];
467 
468           }
469           else if (loopRemainder == 3)
470           {
471               acc = mult32x64(Yn1, a1);
472               acc += mult32x64(Yn2, a2);
473               Yn2 = Yn1;
474               Yn1 = acc;
475               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
476               Yn1 = asrl(Yn1, -shift);
477               *pOut++ = (q31_t) (Yn1 >> 32);
478 
479               acc = mult32x64(Yn1, a1);
480               acc += mult32x64(Yn2, a2);
481               Yn2 = Yn1;
482               Yn1 = acc;
483               Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
484               Yn1 = asrl(Yn1, -shift);
485               *pOut++ = (q31_t) (Yn1 >> 32);
486 
487               acc = mult32x64(Yn1, a1);
488               acc += mult32x64(Yn2, a2);
489               Yn2 = Yn1;
490               Yn1 = acc;
491               Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
492               Yn1 = asrl(Yn1, -shift);
493               *pOut++ = (q31_t) (Yn1 >> 32);
494               /*
495                * save input state
496                */
497               Xn2 = vecIn2[2];
498               Xn1 = vecIn2[3];
499 
500           }
501 
502           /*
503            * The first stage output is given as input to the second stage.
504            */
505           pIn = pDst;
506           /*
507            * Reset to destination buffer working pointer
508            */
509           pOut = pDst;
510           /*
511            * Store the updated state variables back into the pState array
512            */
513           *pState++ = (q63_t) Xn1;
514           *pState++ = (q63_t) Xn2;
515           *pState++ = Yn1;
516           *pState++ = Yn2;
517       }
518       while (--stage);
519     }
520 }
521 #else
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)522 ARM_DSP_ATTRIBUTE void arm_biquad_cas_df1_32x64_q31(
523   const arm_biquad_cas_df1_32x64_ins_q31 * S,
524   const q31_t * pSrc,
525         q31_t * pDst,
526         uint32_t blockSize)
527 {
528   const q31_t *pIn = pSrc;                             /* input pointer initialization */
529         q31_t *pOut = pDst;                            /* output pointer initialization */
530         q63_t *pState = S->pState;                     /* state pointer initialization */
531   const q31_t *pCoeffs = S->pCoeffs;                   /* coeff pointer initialization */
532         q63_t acc;                                     /* accumulator */
533         q31_t Xn1, Xn2;                                /* Input Filter state variables */
534         q63_t Yn1, Yn2;                                /* Output Filter state variables */
535         q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
536         q31_t Xn;                                      /* temporary input */
537         int32_t shift = (int32_t) S->postShift + 1;    /* Shift to be applied to the output */
538         uint32_t sample, stage = S->numStages;         /* loop counters */
539         q31_t acc_l, acc_h;                            /* temporary output */
540         uint32_t uShift = ((uint32_t) S->postShift + 1U);
541         uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */
542 
543   do
544   {
545     /* Reading the coefficients */
546     b0 = *pCoeffs++;
547     b1 = *pCoeffs++;
548     b2 = *pCoeffs++;
549     a1 = *pCoeffs++;
550     a2 = *pCoeffs++;
551 
552     /* Reading the state values */
553     Xn1 = (q31_t) (pState[0]);
554     Xn2 = (q31_t) (pState[1]);
555     Yn1 = pState[2];
556     Yn2 = pState[3];
557 
558 #if defined (ARM_MATH_LOOPUNROLL)
559 
560     /* Apply loop unrolling and compute 4 output values simultaneously. */
561     /* Variable acc hold output value that is being computed and stored in destination buffer
562      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
563      */
564 
565     /* Loop unrolling: Compute 4 outputs at a time */
566     sample = blockSize >> 2U;
567 
568     while (sample > 0U)
569     {
570       /* Read the input */
571       Xn = *pIn++;
572 
573       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
574 
575       /* acc =  b0 * x[n] */
576       acc = (q63_t) Xn * b0;
577 
578       /* acc +=  b1 * x[n-1] */
579       acc += (q63_t) Xn1 * b1;
580 
581       /* acc +=  b[2] * x[n-2] */
582       acc += (q63_t) Xn2 * b2;
583 
584       /* acc +=  a1 * y[n-1] */
585       acc += mult32x64(Yn1, a1);
586 
587       /* acc +=  a2 * y[n-2] */
588       acc += mult32x64(Yn2, a2);
589 
590       /* The result is converted to 1.63 , Yn2 variable is reused */
591       Yn2 = acc << shift;
592 
593       /* Calc lower part of acc */
594       acc_l = acc & 0xffffffff;
595 
596       /* Calc upper part of acc */
597       acc_h = (acc >> 32) & 0xffffffff;
598 
599       /* Apply shift for lower part of acc and upper part of acc */
600       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
601 
602       /* Store the output in the destination buffer in 1.31 format. */
603       *pOut = acc_h;
604 
605       /* Read the second input into Xn2, to reuse the value */
606       Xn2 = *pIn++;
607 
608       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
609 
610       /* acc +=  b1 * x[n-1] */
611       acc = (q63_t) Xn * b1;
612 
613       /* acc =  b0 * x[n] */
614       acc += (q63_t) Xn2 * b0;
615 
616       /* acc +=  b[2] * x[n-2] */
617       acc += (q63_t) Xn1 * b2;
618 
619       /* acc +=  a1 * y[n-1] */
620       acc += mult32x64(Yn2, a1);
621 
622       /* acc +=  a2 * y[n-2] */
623       acc += mult32x64(Yn1, a2);
624 
625       /* The result is converted to 1.63, Yn1 variable is reused */
626       Yn1 = acc << shift;
627 
628       /* Calc lower part of acc */
629       acc_l = acc & 0xffffffff;
630 
631       /* Calc upper part of acc */
632       acc_h = (acc >> 32) & 0xffffffff;
633 
634       /* Apply shift for lower part of acc and upper part of acc */
635       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
636 
637       /* Read the third input into Xn1, to reuse the value */
638       Xn1 = *pIn++;
639 
640       /* The result is converted to 1.31 */
641       /* Store the output in the destination buffer. */
642       *(pOut + 1U) = acc_h;
643 
644       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
645 
646       /* acc =  b0 * x[n] */
647       acc = (q63_t) Xn1 * b0;
648 
649       /* acc +=  b1 * x[n-1] */
650       acc += (q63_t) Xn2 * b1;
651 
652       /* acc +=  b[2] * x[n-2] */
653       acc += (q63_t) Xn * b2;
654 
655       /* acc +=  a1 * y[n-1] */
656       acc += mult32x64(Yn1, a1);
657 
658       /* acc +=  a2 * y[n-2] */
659       acc += mult32x64(Yn2, a2);
660 
661       /* The result is converted to 1.63, Yn2 variable is reused  */
662       Yn2 = acc << shift;
663 
664       /* Calc lower part of acc */
665       acc_l = acc & 0xffffffff;
666 
667       /* Calc upper part of acc */
668       acc_h = (acc >> 32) & 0xffffffff;
669 
670       /* Apply shift for lower part of acc and upper part of acc */
671       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
672 
673       /* Store the output in the destination buffer in 1.31 format. */
674       *(pOut + 2U) = acc_h;
675 
676       /* Read the fourth input into Xn, to reuse the value */
677       Xn = *pIn++;
678 
679       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
680       /* acc =  b0 * x[n] */
681       acc = (q63_t) Xn * b0;
682 
683       /* acc +=  b1 * x[n-1] */
684       acc += (q63_t) Xn1 * b1;
685 
686       /* acc +=  b[2] * x[n-2] */
687       acc += (q63_t) Xn2 * b2;
688 
689       /* acc +=  a1 * y[n-1] */
690       acc += mult32x64(Yn2, a1);
691 
692       /* acc +=  a2 * y[n-2] */
693       acc += mult32x64(Yn1, a2);
694 
695       /* The result is converted to 1.63, Yn1 variable is reused  */
696       Yn1 = acc << shift;
697 
698       /* Calc lower part of acc */
699       acc_l = acc & 0xffffffff;
700 
701       /* Calc upper part of acc */
702       acc_h = (acc >> 32) & 0xffffffff;
703 
704       /* Apply shift for lower part of acc and upper part of acc */
705       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
706 
707       /* Store the output in the destination buffer in 1.31 format. */
708       *(pOut + 3U) = acc_h;
709 
710       /* Every time after the output is computed state should be updated. */
711       /* The states should be updated as: */
712       /* Xn2 = Xn1 */
713       /* Xn1 = Xn  */
714       /* Yn2 = Yn1 */
715       /* Yn1 = acc */
716       Xn2 = Xn1;
717       Xn1 = Xn;
718 
719       /* update output pointer */
720       pOut += 4U;
721 
722       /* decrement loop counter */
723       sample--;
724     }
725 
726     /* Loop unrolling: Compute remaining outputs */
727     sample = blockSize & 0x3U;
728 
729 #else
730 
731     /* Initialize blkCnt with number of samples */
732     sample = blockSize;
733 
734 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
735 
736     while (sample > 0U)
737     {
738       /* Read the input */
739       Xn = *pIn++;
740 
741       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
742 
743       /* acc =  b0 * x[n] */
744       acc = (q63_t) Xn * b0;
745       /* acc +=  b1 * x[n-1] */
746       acc += (q63_t) Xn1 * b1;
747       /* acc +=  b[2] * x[n-2] */
748       acc += (q63_t) Xn2 * b2;
749       /* acc +=  a1 * y[n-1] */
750       acc += mult32x64(Yn1, a1);
751       /* acc +=  a2 * y[n-2] */
752       acc += mult32x64(Yn2, a2);
753 
754       /* Every time after the output is computed state should be updated. */
755       /* The states should be updated as: */
756       /* Xn2 = Xn1 */
757       /* Xn1 = Xn  */
758       /* Yn2 = Yn1 */
759       /* Yn1 = acc */
760       Xn2 = Xn1;
761       Xn1 = Xn;
762       Yn2 = Yn1;
763 
764       /* The result is converted to 1.63, Yn1 variable is reused  */
765       Yn1 = acc << shift;
766 
767       /* Calc lower part of acc */
768       acc_l = acc & 0xffffffff;
769 
770       /* Calc upper part of acc */
771       acc_h = (acc >> 32) & 0xffffffff;
772 
773       /* Apply shift for lower part of acc and upper part of acc */
774       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
775 
776       /* Store the output in the destination buffer in 1.31 format. */
777       *pOut++ = acc_h;
778       /* Yn1 = acc << shift; */
779 
780       /* Store the output in the destination buffer in 1.31 format. */
781 /*    *pOut++ = (q31_t) (acc >> (32 - shift));  */
782 
783       /* decrement loop counter */
784       sample--;
785     }
786 
787     /* The first stage output is given as input to the second stage. */
788     pIn = pDst;
789 
790     /* Reset to destination buffer working pointer */
791     pOut = pDst;
792 
793     /*  Store the updated state variables back into the pState array */
794     *pState++ = (q63_t) Xn1;
795     *pState++ = (q63_t) Xn2;
796     *pState++ = Yn1;
797     *pState++ = Yn2;
798 
799   } while (--stage);
800 
801 }
802 #endif /* defined(ARM_MATH_MVEI) */
803 
804 /**
805   @} end of BiquadCascadeDF1_32x64 group
806  */
807