1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df1_32x64_q31.c
4  * Description:  High precision Q31 Biquad cascade filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
37 
38   This function implements a high precision Biquad cascade filter which operates on
39   Q31 data values.  The filter coefficients are in 1.31 format and the state variables
40   are in 1.63 format.  The double precision state variables reduce quantization noise
41   in the filter and provide a cleaner output.
42   These filters are particularly useful when implementing filters in which the
43   singularities are close to the unit circle.  This is common for low pass or high
44   pass filters with very low cutoff frequencies.
45 
46   The function operates on blocks of input and output data
47   and each call to the function processes <code>blockSize</code> samples through
48   the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
49   containing <code>blockSize</code> Q31 values.
50 
51   @par           Algorithm
52                    Each Biquad stage implements a second order filter using the difference equation:
53   <pre>
54       y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
55   </pre>
56                    A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
57                    \image html Biquad.gif "Single Biquad filter stage"
58                    Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
59                    Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
60                    Pay careful attention to the sign of the feedback coefficients.
61                    Some design tools use the difference equation
62   <pre>
63       y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
64   </pre>
65                    In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
66   @par
67                    Higher order filters are realized as a cascade of second order sections.
68                    <code>numStages</code> refers to the number of second order stages used.
69                    For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
70                    \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
71                    A 9th order filter would be realized with <code>numStages=5</code> second order stages
72                    with the coefficients for one of the stages configured as a first order filter
73                    (<code>b2=0</code> and <code>a2=0</code>).
74   @par
75                    The <code>pState</code> points to state variables array.
76                    Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
77                    The state variables are arranged in the array as:
78   <pre>
79       {x[n-1], x[n-2], y[n-1], y[n-2]}
80   </pre>
81   @par
82                    The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
83                    The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
84                    The state variables are updated after each block of data is processed, the coefficients are untouched.
85 
86   @par           Instance Structure
87                    The coefficients and state variables for a filter are stored together in an instance data structure.
88                    A separate instance structure must be defined for each filter.
89                    Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
90 
91   @par           Init Function
92                    There is also an associated initialization function which performs the following operations:
93                    - Sets the values of the internal structure fields.
94                    - Zeros out the values in the state buffer.
95                    To do this manually without calling the init function, assign the follow subfields of the instance structure:
96                    numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
97 
98   @par
99                    Use of the initialization function is optional.
100                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
101                    To place an instance structure into a const data section, the instance structure must be manually initialized.
102                    Set the values in the state buffer to zeros before static initialization.
103                    For example, to statically initialize the filter instance structure use
104   <pre>
105       arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
106   </pre>
107                    where <code>numStages</code> is the number of Biquad stages in the filter;
108                    <code>pState</code> is the address of the state buffer;
109                    <code>pCoeffs</code> is the address of the coefficient buffer;
110                    <code>postShift</code> shift to be applied which is described in detail below.
111   @par           Fixed-Point Behavior
112                    Care must be taken while using Biquad Cascade 32x64 filter function.
113                    Following issues must be considered:
114                    - Scaling of coefficients
115                    - Filter gain
116                    - Overflow and saturation
117 
118   @par
119                    Filter coefficients are represented as fractional values and
120                    restricted to lie in the range <code>[-1 +1)</code>.
121                    The processing function has an additional scaling parameter <code>postShift</code>
122                    which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
123                    At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
124                    \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
125                    This essentially scales the filter coefficients by <code>2^postShift</code>.
126                    For example, to realize the coefficients
127   <pre>
128      {1.5, -0.8, 1.2, 1.6, -0.9}
129   </pre>
130                    set the Coefficient array to:
131   <pre>
132      {0.75, -0.4, 0.6, 0.8, -0.45}
133   </pre>
134                    and set <code>postShift=1</code>
135   @par
136                    The second thing to keep in mind is the gain through the filter.
137                    The frequency response of a Biquad filter is a function of its coefficients.
138                    It is possible for the gain through the filter to exceed 1.0 meaning that the
139                    filter increases the amplitude of certain frequencies.
140                    This means that an input signal with amplitude < 1.0 may result in an output > 1.0
141                    and these are saturated or overflowed based on the implementation of the filter.
142                    To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0
143                    or the input signal must be scaled down so that the combination of input and filter are never overflowed.
144   @par
145                    The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
146                    This is described in the function specific documentation below.
147  */
148 
149 /**
150   @addtogroup BiquadCascadeDF1_32x64
151   @{
152  */
153 
154 /**
155   @brief         Processing function for the Q31 Biquad cascade 32x64 filter.
156   @param[in]     S         points to an instance of the high precision Q31 Biquad cascade filter
157   @param[in]     pSrc      points to the block of input data
158   @param[out]    pDst      points to the block of output data
159   @param[in]     blockSize number of samples to process
160   @return        none
161 
162   @par           Details
163                    The function is implemented using an internal 64-bit accumulator.
164                    The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
165                    Thus, if the accumulator result overflows it wraps around rather than clip.
166                    In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
167                    After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
168                    1.31 format by discarding the low 32 bits.
169   @par
170                    Two related functions are provided in the CMSIS DSP library.
171                    - \ref arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
172                    - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
173  */
174 
175 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
176 
177 #include "arm_helium_utils.h"
178 
arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)179 static void arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,
180   const q31_t * pSrc,
181         q31_t * pDst,
182         uint32_t blockSize)
183 {
184   const q31_t *pIn = pSrc;                             /* input pointer initialization */
185         q31_t *pOut = pDst;                            /* output pointer initialization */
186         q63_t *pState = S->pState;                     /* state pointer initialization */
187   const q31_t *pCoeffs = S->pCoeffs;                   /* coeff pointer initialization */
188         q63_t acc;                                     /* accumulator */
189         q31_t Xn1, Xn2;                                /* Input Filter state variables */
190         q63_t Yn1, Yn2;                                /* Output Filter state variables */
191         q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
192         q31_t Xn;                                      /* temporary input */
193         int32_t shift = (int32_t) S->postShift + 1;    /* Shift to be applied to the output */
194         uint32_t sample, stage = S->numStages;         /* loop counters */
195         q31_t acc_l, acc_h;                            /* temporary output */
196         uint32_t uShift = ((uint32_t) S->postShift + 1U);
197         uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */
198 
199   do
200   {
201     /* Reading the coefficients */
202     b0 = *pCoeffs++;
203     b1 = *pCoeffs++;
204     b2 = *pCoeffs++;
205     a1 = *pCoeffs++;
206     a2 = *pCoeffs++;
207 
208     /* Reading the state values */
209     Xn1 = (q31_t) (pState[0]);
210     Xn2 = (q31_t) (pState[1]);
211     Yn1 = pState[2];
212     Yn2 = pState[3];
213 
214     /* Initialize blkCnt with number of samples */
215     sample = blockSize;
216 
217     while (sample > 0U)
218     {
219       /* Read the input */
220       Xn = *pIn++;
221 
222       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
223 
224       /* acc =  b0 * x[n] */
225       acc = (q63_t) Xn * b0;
226       /* acc +=  b1 * x[n-1] */
227       acc += (q63_t) Xn1 * b1;
228       /* acc +=  b[2] * x[n-2] */
229       acc += (q63_t) Xn2 * b2;
230       /* acc +=  a1 * y[n-1] */
231       acc += mult32x64(Yn1, a1);
232       /* acc +=  a2 * y[n-2] */
233       acc += mult32x64(Yn2, a2);
234 
235       /* Every time after the output is computed state should be updated. */
236       /* The states should be updated as: */
237       /* Xn2 = Xn1 */
238       /* Xn1 = Xn  */
239       /* Yn2 = Yn1 */
240       /* Yn1 = acc */
241       Xn2 = Xn1;
242       Xn1 = Xn;
243       Yn2 = Yn1;
244 
245       /* The result is converted to 1.63, Yn1 variable is reused  */
246       Yn1 = acc << shift;
247 
248       /* Calc lower part of acc */
249       acc_l = acc & 0xffffffff;
250 
251       /* Calc upper part of acc */
252       acc_h = (acc >> 32) & 0xffffffff;
253 
254       /* Apply shift for lower part of acc and upper part of acc */
255       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
256 
257       /* Store the output in the destination buffer in 1.31 format. */
258       *pOut++ = acc_h;
259       /* Yn1 = acc << shift; */
260 
261       /* Store the output in the destination buffer in 1.31 format. */
262 /*    *pOut++ = (q31_t) (acc >> (32 - shift));  */
263 
264       /* decrement loop counter */
265       sample--;
266     }
267 
268     /* The first stage output is given as input to the second stage. */
269     pIn = pDst;
270 
271     /* Reset to destination buffer working pointer */
272     pOut = pDst;
273 
274     /*  Store the updated state variables back into the pState array */
275     *pState++ = (q63_t) Xn1;
276     *pState++ = (q63_t) Xn2;
277     *pState++ = Yn1;
278     *pState++ = Yn2;
279 
280   } while (--stage);
281 
282 }
283 
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)284 void arm_biquad_cas_df1_32x64_q31(
285   const arm_biquad_cas_df1_32x64_ins_q31 * S,
286   const q31_t * pSrc,
287         q31_t * pDst,
288         uint32_t blockSize)
289 {
290     const q31_t *pIn = pSrc;    /*  input pointer initialization  */
291     q31_t    *pOut = pDst;      /*  output pointer initialization */
292     q63_t    *pState = S->pState;   /*  state pointer initialization  */
293     const q31_t    *pCoeffs = S->pCoeffs; /*  coeff pointer initialization  */
294     q31_t     Xn1, Xn2;         /*  Input Filter state variables        */
295     q63_t     Yn1, Yn2;         /*  Output Filter state variables        */
296     q31_t     b0, b1, b2, a1, a2;   /*  Filter coefficients           */
297     int32_t   shift = (int32_t) S->postShift + 1;   /*  Shift to be applied to the output */
298     uint32_t  sample, stage = S->numStages; /*  loop counters                     */
299     q31x4_t vecCoef, vecIn;
300     q63_t     acc;
301 
302     if (blockSize <= 3)
303     {
304       arm_biquad_cas_df1_32x64_q31_scalar(S,pSrc,pDst,blockSize);
305     }
306     else
307     {
308       do
309       {
310           uint32_t  i;
311           /*
312            * Reading the coefficients
313            */
314           b0 = *pCoeffs++;
315           b1 = *pCoeffs++;
316           b2 = *pCoeffs++;
317           a1 = *pCoeffs++;
318           a2 = *pCoeffs++;
319 
320           vecCoef[0] = 0;
321           vecCoef[1] = b2;
322           vecCoef[2] = b1;
323           vecCoef[3] = b0;
324           /*
325            * Reading the state values
326            */
327           Xn1 = pState[0];
328           Xn2 = pState[1];
329           Yn1 = pState[2];
330           Yn2 = pState[3];
331 
332           /*
333            * append history with initial samples
334            */
335           q31_t     hist[6];
336           hist[0] = 0;
337           hist[1] = Xn2;
338           hist[2] = Xn1;
339           hist[3] = pIn[0];
340           hist[4] = pIn[1];
341           hist[5] = pIn[2];
342 
343           const q31_t  *pIn1 = hist;
344           q31x4_t vecIn0 = *(q31x4_t *) & pIn[0];
345           q31x4_t vecIn1 = *(q31x4_t *) & pIn[1];
346           q31x4_t vecIn2 = *(q31x4_t *) & pIn[2];
347 
348           i = 3;
349           do
350           {
351               acc = mult32x64(Yn1, a1);
352               acc += mult32x64(Yn2, a2);
353               Yn2 = Yn1;
354               Yn1 = acc;
355               vecIn = vld1q(pIn1);
356               pIn1 += 1;
357               Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
358               Yn1 = asrl(Yn1, -shift);
359               /*
360                * Store the output in the destination buffer in 1.31 format.
361                */
362               *pOut++ = (q31_t) (Yn1 >> 32);
363           }
364           while (--i);
365 
366           sample = blockSize - 3;
367           pIn1 = pIn + 3;
368 
369           i = sample / 4;
370           while (i > 0U)
371           {
372 
373               acc = mult32x64(Yn1, a1);
374               acc += mult32x64(Yn2, a2);
375               Yn2 = Yn1;
376               Yn1 = acc;
377               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
378               vecIn = vld1q(pIn1);
379               pIn1 += 1;
380               Yn1 = asrl(Yn1, -shift);
381               /*
382                * Store the output in the destination buffer in 1.31 format.
383                */
384               *pOut++ = (q31_t) (Yn1 >> 32);
385 
386               acc = mult32x64(Yn1, a1);
387               acc += mult32x64(Yn2, a2);
388               Yn2 = Yn1;
389               Yn1 = acc;
390               Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
391               vecIn0 = vld1q(pIn1);
392               pIn1 += 1;
393               Yn1 = asrl(Yn1, -shift);
394               *pOut++ = (q31_t) (Yn1 >> 32);
395 
396               acc = mult32x64(Yn1, a1);
397               acc += mult32x64(Yn2, a2);
398               Yn2 = Yn1;
399               Yn1 = acc;
400               Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
401               vecIn1 = vld1q(pIn1);
402               pIn1 += 1;
403               Yn1 = asrl(Yn1, -shift);
404               *pOut++ = (q31_t) (Yn1 >> 32);
405 
406               acc = mult32x64(Yn1, a1);
407               acc += mult32x64(Yn2, a2);
408               Yn2 = Yn1;
409               Yn1 = acc;
410               Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
411               vecIn2 = vld1q(pIn1);
412               pIn1 += 1;
413               Yn1 = asrl(Yn1, -shift);
414               *pOut++ = (q31_t) (Yn1 >> 32);
415               /*
416                * Decrement the loop counter
417                */
418               i--;
419           }
420           /*
421            * save input state
422            */
423           Xn2 = vecIn[2];
424           Xn1 = vecIn[3];
425 
426           int       loopRemainder = blockSize - 3 - 4 * ((blockSize - 3) / 4);
427           if (loopRemainder == 1)
428           {
429               /*
430                * remainder
431                */
432               acc = mult32x64(Yn1, a1);
433               acc += mult32x64(Yn2, a2);
434               Yn2 = Yn1;
435               Yn1 = acc;
436               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
437               Yn1 = asrl(Yn1, -shift);
438               *pOut++ = (q31_t) (Yn1 >> 32);
439               /*
440                * save input state
441                */
442               Xn2 = vecIn0[2];
443               Xn1 = vecIn0[3];
444 
445           }
446           else if (loopRemainder == 2)
447           {
448               acc = mult32x64(Yn1, a1);
449               acc += mult32x64(Yn2, a2);
450               Yn2 = Yn1;
451               Yn1 = acc;
452               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
453               Yn1 = asrl(Yn1, -shift);
454               *pOut++ = (q31_t) (Yn1 >> 32);
455 
456               acc = mult32x64(Yn1, a1);
457               acc += mult32x64(Yn2, a2);
458               Yn2 = Yn1;
459               Yn1 = acc;
460               Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
461               Yn1 = asrl(Yn1, -shift);
462               *pOut++ = (q31_t) (Yn1 >> 32);
463               /*
464                * save input state
465                */
466               Xn2 = vecIn1[2];
467               Xn1 = vecIn1[3];
468 
469           }
470           else if (loopRemainder == 3)
471           {
472               acc = mult32x64(Yn1, a1);
473               acc += mult32x64(Yn2, a2);
474               Yn2 = Yn1;
475               Yn1 = acc;
476               Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
477               Yn1 = asrl(Yn1, -shift);
478               *pOut++ = (q31_t) (Yn1 >> 32);
479 
480               acc = mult32x64(Yn1, a1);
481               acc += mult32x64(Yn2, a2);
482               Yn2 = Yn1;
483               Yn1 = acc;
484               Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
485               Yn1 = asrl(Yn1, -shift);
486               *pOut++ = (q31_t) (Yn1 >> 32);
487 
488               acc = mult32x64(Yn1, a1);
489               acc += mult32x64(Yn2, a2);
490               Yn2 = Yn1;
491               Yn1 = acc;
492               Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
493               Yn1 = asrl(Yn1, -shift);
494               *pOut++ = (q31_t) (Yn1 >> 32);
495               /*
496                * save input state
497                */
498               Xn2 = vecIn2[2];
499               Xn1 = vecIn2[3];
500 
501           }
502 
503           /*
504            * The first stage output is given as input to the second stage.
505            */
506           pIn = pDst;
507           /*
508            * Reset to destination buffer working pointer
509            */
510           pOut = pDst;
511           /*
512            * Store the updated state variables back into the pState array
513            */
514           *pState++ = (q63_t) Xn1;
515           *pState++ = (q63_t) Xn2;
516           *pState++ = Yn1;
517           *pState++ = Yn2;
518       }
519       while (--stage);
520     }
521 }
522 #else
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)523 void arm_biquad_cas_df1_32x64_q31(
524   const arm_biquad_cas_df1_32x64_ins_q31 * S,
525   const q31_t * pSrc,
526         q31_t * pDst,
527         uint32_t blockSize)
528 {
529   const q31_t *pIn = pSrc;                             /* input pointer initialization */
530         q31_t *pOut = pDst;                            /* output pointer initialization */
531         q63_t *pState = S->pState;                     /* state pointer initialization */
532   const q31_t *pCoeffs = S->pCoeffs;                   /* coeff pointer initialization */
533         q63_t acc;                                     /* accumulator */
534         q31_t Xn1, Xn2;                                /* Input Filter state variables */
535         q63_t Yn1, Yn2;                                /* Output Filter state variables */
536         q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
537         q31_t Xn;                                      /* temporary input */
538         int32_t shift = (int32_t) S->postShift + 1;    /* Shift to be applied to the output */
539         uint32_t sample, stage = S->numStages;         /* loop counters */
540         q31_t acc_l, acc_h;                            /* temporary output */
541         uint32_t uShift = ((uint32_t) S->postShift + 1U);
542         uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */
543 
544   do
545   {
546     /* Reading the coefficients */
547     b0 = *pCoeffs++;
548     b1 = *pCoeffs++;
549     b2 = *pCoeffs++;
550     a1 = *pCoeffs++;
551     a2 = *pCoeffs++;
552 
553     /* Reading the state values */
554     Xn1 = (q31_t) (pState[0]);
555     Xn2 = (q31_t) (pState[1]);
556     Yn1 = pState[2];
557     Yn2 = pState[3];
558 
559 #if defined (ARM_MATH_LOOPUNROLL)
560 
561     /* Apply loop unrolling and compute 4 output values simultaneously. */
562     /* Variable acc hold output value that is being computed and stored in destination buffer
563      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
564      */
565 
566     /* Loop unrolling: Compute 4 outputs at a time */
567     sample = blockSize >> 2U;
568 
569     while (sample > 0U)
570     {
571       /* Read the input */
572       Xn = *pIn++;
573 
574       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
575 
576       /* acc =  b0 * x[n] */
577       acc = (q63_t) Xn * b0;
578 
579       /* acc +=  b1 * x[n-1] */
580       acc += (q63_t) Xn1 * b1;
581 
582       /* acc +=  b[2] * x[n-2] */
583       acc += (q63_t) Xn2 * b2;
584 
585       /* acc +=  a1 * y[n-1] */
586       acc += mult32x64(Yn1, a1);
587 
588       /* acc +=  a2 * y[n-2] */
589       acc += mult32x64(Yn2, a2);
590 
591       /* The result is converted to 1.63 , Yn2 variable is reused */
592       Yn2 = acc << shift;
593 
594       /* Calc lower part of acc */
595       acc_l = acc & 0xffffffff;
596 
597       /* Calc upper part of acc */
598       acc_h = (acc >> 32) & 0xffffffff;
599 
600       /* Apply shift for lower part of acc and upper part of acc */
601       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
602 
603       /* Store the output in the destination buffer in 1.31 format. */
604       *pOut = acc_h;
605 
606       /* Read the second input into Xn2, to reuse the value */
607       Xn2 = *pIn++;
608 
609       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
610 
611       /* acc +=  b1 * x[n-1] */
612       acc = (q63_t) Xn * b1;
613 
614       /* acc =  b0 * x[n] */
615       acc += (q63_t) Xn2 * b0;
616 
617       /* acc +=  b[2] * x[n-2] */
618       acc += (q63_t) Xn1 * b2;
619 
620       /* acc +=  a1 * y[n-1] */
621       acc += mult32x64(Yn2, a1);
622 
623       /* acc +=  a2 * y[n-2] */
624       acc += mult32x64(Yn1, a2);
625 
626       /* The result is converted to 1.63, Yn1 variable is reused */
627       Yn1 = acc << shift;
628 
629       /* Calc lower part of acc */
630       acc_l = acc & 0xffffffff;
631 
632       /* Calc upper part of acc */
633       acc_h = (acc >> 32) & 0xffffffff;
634 
635       /* Apply shift for lower part of acc and upper part of acc */
636       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
637 
638       /* Read the third input into Xn1, to reuse the value */
639       Xn1 = *pIn++;
640 
641       /* The result is converted to 1.31 */
642       /* Store the output in the destination buffer. */
643       *(pOut + 1U) = acc_h;
644 
645       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
646 
647       /* acc =  b0 * x[n] */
648       acc = (q63_t) Xn1 * b0;
649 
650       /* acc +=  b1 * x[n-1] */
651       acc += (q63_t) Xn2 * b1;
652 
653       /* acc +=  b[2] * x[n-2] */
654       acc += (q63_t) Xn * b2;
655 
656       /* acc +=  a1 * y[n-1] */
657       acc += mult32x64(Yn1, a1);
658 
659       /* acc +=  a2 * y[n-2] */
660       acc += mult32x64(Yn2, a2);
661 
662       /* The result is converted to 1.63, Yn2 variable is reused  */
663       Yn2 = acc << shift;
664 
665       /* Calc lower part of acc */
666       acc_l = acc & 0xffffffff;
667 
668       /* Calc upper part of acc */
669       acc_h = (acc >> 32) & 0xffffffff;
670 
671       /* Apply shift for lower part of acc and upper part of acc */
672       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
673 
674       /* Store the output in the destination buffer in 1.31 format. */
675       *(pOut + 2U) = acc_h;
676 
677       /* Read the fourth input into Xn, to reuse the value */
678       Xn = *pIn++;
679 
680       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
681       /* acc =  b0 * x[n] */
682       acc = (q63_t) Xn * b0;
683 
684       /* acc +=  b1 * x[n-1] */
685       acc += (q63_t) Xn1 * b1;
686 
687       /* acc +=  b[2] * x[n-2] */
688       acc += (q63_t) Xn2 * b2;
689 
690       /* acc +=  a1 * y[n-1] */
691       acc += mult32x64(Yn2, a1);
692 
693       /* acc +=  a2 * y[n-2] */
694       acc += mult32x64(Yn1, a2);
695 
696       /* The result is converted to 1.63, Yn1 variable is reused  */
697       Yn1 = acc << shift;
698 
699       /* Calc lower part of acc */
700       acc_l = acc & 0xffffffff;
701 
702       /* Calc upper part of acc */
703       acc_h = (acc >> 32) & 0xffffffff;
704 
705       /* Apply shift for lower part of acc and upper part of acc */
706       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
707 
708       /* Store the output in the destination buffer in 1.31 format. */
709       *(pOut + 3U) = acc_h;
710 
711       /* Every time after the output is computed state should be updated. */
712       /* The states should be updated as: */
713       /* Xn2 = Xn1 */
714       /* Xn1 = Xn  */
715       /* Yn2 = Yn1 */
716       /* Yn1 = acc */
717       Xn2 = Xn1;
718       Xn1 = Xn;
719 
720       /* update output pointer */
721       pOut += 4U;
722 
723       /* decrement loop counter */
724       sample--;
725     }
726 
727     /* Loop unrolling: Compute remaining outputs */
728     sample = blockSize & 0x3U;
729 
730 #else
731 
732     /* Initialize blkCnt with number of samples */
733     sample = blockSize;
734 
735 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
736 
737     while (sample > 0U)
738     {
739       /* Read the input */
740       Xn = *pIn++;
741 
742       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
743 
744       /* acc =  b0 * x[n] */
745       acc = (q63_t) Xn * b0;
746       /* acc +=  b1 * x[n-1] */
747       acc += (q63_t) Xn1 * b1;
748       /* acc +=  b[2] * x[n-2] */
749       acc += (q63_t) Xn2 * b2;
750       /* acc +=  a1 * y[n-1] */
751       acc += mult32x64(Yn1, a1);
752       /* acc +=  a2 * y[n-2] */
753       acc += mult32x64(Yn2, a2);
754 
755       /* Every time after the output is computed state should be updated. */
756       /* The states should be updated as: */
757       /* Xn2 = Xn1 */
758       /* Xn1 = Xn  */
759       /* Yn2 = Yn1 */
760       /* Yn1 = acc */
761       Xn2 = Xn1;
762       Xn1 = Xn;
763       Yn2 = Yn1;
764 
765       /* The result is converted to 1.63, Yn1 variable is reused  */
766       Yn1 = acc << shift;
767 
768       /* Calc lower part of acc */
769       acc_l = acc & 0xffffffff;
770 
771       /* Calc upper part of acc */
772       acc_h = (acc >> 32) & 0xffffffff;
773 
774       /* Apply shift for lower part of acc and upper part of acc */
775       acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
776 
777       /* Store the output in the destination buffer in 1.31 format. */
778       *pOut++ = acc_h;
779       /* Yn1 = acc << shift; */
780 
781       /* Store the output in the destination buffer in 1.31 format. */
782 /*    *pOut++ = (q31_t) (acc >> (32 - shift));  */
783 
784       /* decrement loop counter */
785       sample--;
786     }
787 
788     /* The first stage output is given as input to the second stage. */
789     pIn = pDst;
790 
791     /* Reset to destination buffer working pointer */
792     pOut = pDst;
793 
794     /*  Store the updated state variables back into the pState array */
795     *pState++ = (q63_t) Xn1;
796     *pState++ = (q63_t) Xn2;
797     *pState++ = Yn1;
798     *pState++ = Yn2;
799 
800   } while (--stage);
801 
802 }
803 #endif /* defined(ARM_MATH_MVEI) */
804 
805 /**
806   @} end of BiquadCascadeDF1_32x64 group
807  */
808