1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df1_32x64_q31.c
4 * Description: High precision Q31 Biquad cascade filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
37
38 This function implements a high precision Biquad cascade filter which operates on
39 Q31 data values. The filter coefficients are in 1.31 format and the state variables
40 are in 1.63 format. The double precision state variables reduce quantization noise
41 in the filter and provide a cleaner output.
42 These filters are particularly useful when implementing filters in which the
43 singularities are close to the unit circle. This is common for low pass or high
44 pass filters with very low cutoff frequencies.
45
46 The function operates on blocks of input and output data
47 and each call to the function processes <code>blockSize</code> samples through
48 the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
49 containing <code>blockSize</code> Q31 values.
50
51 @par Algorithm
52 Each Biquad stage implements a second order filter using the difference equation:
53 <pre>
54 y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
55 </pre>
56 A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
57 \image html Biquad.gif "Single Biquad filter stage"
58 Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
59 Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
60 Pay careful attention to the sign of the feedback coefficients.
61 Some design tools use the difference equation
62 <pre>
63 y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
64 </pre>
65 In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
66 @par
67 Higher order filters are realized as a cascade of second order sections.
68 <code>numStages</code> refers to the number of second order stages used.
69 For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
70 \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
71 A 9th order filter would be realized with <code>numStages=5</code> second order stages
72 with the coefficients for one of the stages configured as a first order filter
73 (<code>b2=0</code> and <code>a2=0</code>).
74 @par
75 The <code>pState</code> points to state variables array.
76 Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
77 The state variables are arranged in the array as:
78 <pre>
79 {x[n-1], x[n-2], y[n-1], y[n-2]}
80 </pre>
81 @par
82 The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
83 The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
84 The state variables are updated after each block of data is processed, the coefficients are untouched.
85
86 @par Instance Structure
87 The coefficients and state variables for a filter are stored together in an instance data structure.
88 A separate instance structure must be defined for each filter.
89 Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
90
91 @par Init Function
92 There is also an associated initialization function which performs the following operations:
93 - Sets the values of the internal structure fields.
94 - Zeros out the values in the state buffer.
95 To do this manually without calling the init function, assign the follow subfields of the instance structure:
96 numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
97
98 @par
99 Use of the initialization function is optional.
100 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
101 To place an instance structure into a const data section, the instance structure must be manually initialized.
102 Set the values in the state buffer to zeros before static initialization.
103 For example, to statically initialize the filter instance structure use
104 <pre>
105 arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
106 </pre>
107 where <code>numStages</code> is the number of Biquad stages in the filter;
108 <code>pState</code> is the address of the state buffer;
109 <code>pCoeffs</code> is the address of the coefficient buffer;
110 <code>postShift</code> shift to be applied which is described in detail below.
111 @par Fixed-Point Behavior
112 Care must be taken while using Biquad Cascade 32x64 filter function.
113 Following issues must be considered:
114 - Scaling of coefficients
115 - Filter gain
116 - Overflow and saturation
117
118 @par
119 Filter coefficients are represented as fractional values and
120 restricted to lie in the range <code>[-1 +1)</code>.
121 The processing function has an additional scaling parameter <code>postShift</code>
122 which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
123 At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
124 \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
125 This essentially scales the filter coefficients by <code>2^postShift</code>.
126 For example, to realize the coefficients
127 <pre>
128 {1.5, -0.8, 1.2, 1.6, -0.9}
129 </pre>
130 set the Coefficient array to:
131 <pre>
132 {0.75, -0.4, 0.6, 0.8, -0.45}
133 </pre>
134 and set <code>postShift=1</code>
135 @par
136 The second thing to keep in mind is the gain through the filter.
137 The frequency response of a Biquad filter is a function of its coefficients.
138 It is possible for the gain through the filter to exceed 1.0 meaning that the
139 filter increases the amplitude of certain frequencies.
140 This means that an input signal with amplitude < 1.0 may result in an output > 1.0
141 and these are saturated or overflowed based on the implementation of the filter.
142 To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0
143 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
144 @par
145 The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
146 This is described in the function specific documentation below.
147 */
148
149 /**
150 @addtogroup BiquadCascadeDF1_32x64
151 @{
152 */
153
154 /**
155 @brief Processing function for the Q31 Biquad cascade 32x64 filter.
156 @param[in] S points to an instance of the high precision Q31 Biquad cascade filter
157 @param[in] pSrc points to the block of input data
158 @param[out] pDst points to the block of output data
159 @param[in] blockSize number of samples to process
160 @return none
161
162 @par Details
163 The function is implemented using an internal 64-bit accumulator.
164 The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
165 Thus, if the accumulator result overflows it wraps around rather than clip.
166 In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
167 After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
168 1.31 format by discarding the low 32 bits.
169 @par
170 Two related functions are provided in the CMSIS DSP library.
171 - \ref arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
172 - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
173 */
174
175 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
176
177 #include "arm_helium_utils.h"
178
arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)179 static void arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,
180 const q31_t * pSrc,
181 q31_t * pDst,
182 uint32_t blockSize)
183 {
184 const q31_t *pIn = pSrc; /* input pointer initialization */
185 q31_t *pOut = pDst; /* output pointer initialization */
186 q63_t *pState = S->pState; /* state pointer initialization */
187 const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
188 q63_t acc; /* accumulator */
189 q31_t Xn1, Xn2; /* Input Filter state variables */
190 q63_t Yn1, Yn2; /* Output Filter state variables */
191 q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
192 q31_t Xn; /* temporary input */
193 int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
194 uint32_t sample, stage = S->numStages; /* loop counters */
195 q31_t acc_l, acc_h; /* temporary output */
196 uint32_t uShift = ((uint32_t) S->postShift + 1U);
197 uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
198
199 do
200 {
201 /* Reading the coefficients */
202 b0 = *pCoeffs++;
203 b1 = *pCoeffs++;
204 b2 = *pCoeffs++;
205 a1 = *pCoeffs++;
206 a2 = *pCoeffs++;
207
208 /* Reading the state values */
209 Xn1 = (q31_t) (pState[0]);
210 Xn2 = (q31_t) (pState[1]);
211 Yn1 = pState[2];
212 Yn2 = pState[3];
213
214 /* Initialize blkCnt with number of samples */
215 sample = blockSize;
216
217 while (sample > 0U)
218 {
219 /* Read the input */
220 Xn = *pIn++;
221
222 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
223
224 /* acc = b0 * x[n] */
225 acc = (q63_t) Xn * b0;
226 /* acc += b1 * x[n-1] */
227 acc += (q63_t) Xn1 * b1;
228 /* acc += b[2] * x[n-2] */
229 acc += (q63_t) Xn2 * b2;
230 /* acc += a1 * y[n-1] */
231 acc += mult32x64(Yn1, a1);
232 /* acc += a2 * y[n-2] */
233 acc += mult32x64(Yn2, a2);
234
235 /* Every time after the output is computed state should be updated. */
236 /* The states should be updated as: */
237 /* Xn2 = Xn1 */
238 /* Xn1 = Xn */
239 /* Yn2 = Yn1 */
240 /* Yn1 = acc */
241 Xn2 = Xn1;
242 Xn1 = Xn;
243 Yn2 = Yn1;
244
245 /* The result is converted to 1.63, Yn1 variable is reused */
246 Yn1 = acc << shift;
247
248 /* Calc lower part of acc */
249 acc_l = acc & 0xffffffff;
250
251 /* Calc upper part of acc */
252 acc_h = (acc >> 32) & 0xffffffff;
253
254 /* Apply shift for lower part of acc and upper part of acc */
255 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
256
257 /* Store the output in the destination buffer in 1.31 format. */
258 *pOut++ = acc_h;
259 /* Yn1 = acc << shift; */
260
261 /* Store the output in the destination buffer in 1.31 format. */
262 /* *pOut++ = (q31_t) (acc >> (32 - shift)); */
263
264 /* decrement loop counter */
265 sample--;
266 }
267
268 /* The first stage output is given as input to the second stage. */
269 pIn = pDst;
270
271 /* Reset to destination buffer working pointer */
272 pOut = pDst;
273
274 /* Store the updated state variables back into the pState array */
275 *pState++ = (q63_t) Xn1;
276 *pState++ = (q63_t) Xn2;
277 *pState++ = Yn1;
278 *pState++ = Yn2;
279
280 } while (--stage);
281
282 }
283
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)284 void arm_biquad_cas_df1_32x64_q31(
285 const arm_biquad_cas_df1_32x64_ins_q31 * S,
286 const q31_t * pSrc,
287 q31_t * pDst,
288 uint32_t blockSize)
289 {
290 const q31_t *pIn = pSrc; /* input pointer initialization */
291 q31_t *pOut = pDst; /* output pointer initialization */
292 q63_t *pState = S->pState; /* state pointer initialization */
293 const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
294 q31_t Xn1, Xn2; /* Input Filter state variables */
295 q63_t Yn1, Yn2; /* Output Filter state variables */
296 q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
297 int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
298 uint32_t sample, stage = S->numStages; /* loop counters */
299 q31x4_t vecCoef, vecIn;
300 q63_t acc;
301
302 if (blockSize <= 3)
303 {
304 arm_biquad_cas_df1_32x64_q31_scalar(S,pSrc,pDst,blockSize);
305 }
306 else
307 {
308 do
309 {
310 uint32_t i;
311 /*
312 * Reading the coefficients
313 */
314 b0 = *pCoeffs++;
315 b1 = *pCoeffs++;
316 b2 = *pCoeffs++;
317 a1 = *pCoeffs++;
318 a2 = *pCoeffs++;
319
320 vecCoef[0] = 0;
321 vecCoef[1] = b2;
322 vecCoef[2] = b1;
323 vecCoef[3] = b0;
324 /*
325 * Reading the state values
326 */
327 Xn1 = pState[0];
328 Xn2 = pState[1];
329 Yn1 = pState[2];
330 Yn2 = pState[3];
331
332 /*
333 * append history with initial samples
334 */
335 q31_t hist[6];
336 hist[0] = 0;
337 hist[1] = Xn2;
338 hist[2] = Xn1;
339 hist[3] = pIn[0];
340 hist[4] = pIn[1];
341 hist[5] = pIn[2];
342
343 const q31_t *pIn1 = hist;
344 q31x4_t vecIn0 = *(q31x4_t *) & pIn[0];
345 q31x4_t vecIn1 = *(q31x4_t *) & pIn[1];
346 q31x4_t vecIn2 = *(q31x4_t *) & pIn[2];
347
348 i = 3;
349 do
350 {
351 acc = mult32x64(Yn1, a1);
352 acc += mult32x64(Yn2, a2);
353 Yn2 = Yn1;
354 Yn1 = acc;
355 vecIn = vld1q(pIn1);
356 pIn1 += 1;
357 Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
358 Yn1 = asrl(Yn1, -shift);
359 /*
360 * Store the output in the destination buffer in 1.31 format.
361 */
362 *pOut++ = (q31_t) (Yn1 >> 32);
363 }
364 while (--i);
365
366 sample = blockSize - 3;
367 pIn1 = pIn + 3;
368
369 i = sample / 4;
370 while (i > 0U)
371 {
372
373 acc = mult32x64(Yn1, a1);
374 acc += mult32x64(Yn2, a2);
375 Yn2 = Yn1;
376 Yn1 = acc;
377 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
378 vecIn = vld1q(pIn1);
379 pIn1 += 1;
380 Yn1 = asrl(Yn1, -shift);
381 /*
382 * Store the output in the destination buffer in 1.31 format.
383 */
384 *pOut++ = (q31_t) (Yn1 >> 32);
385
386 acc = mult32x64(Yn1, a1);
387 acc += mult32x64(Yn2, a2);
388 Yn2 = Yn1;
389 Yn1 = acc;
390 Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
391 vecIn0 = vld1q(pIn1);
392 pIn1 += 1;
393 Yn1 = asrl(Yn1, -shift);
394 *pOut++ = (q31_t) (Yn1 >> 32);
395
396 acc = mult32x64(Yn1, a1);
397 acc += mult32x64(Yn2, a2);
398 Yn2 = Yn1;
399 Yn1 = acc;
400 Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
401 vecIn1 = vld1q(pIn1);
402 pIn1 += 1;
403 Yn1 = asrl(Yn1, -shift);
404 *pOut++ = (q31_t) (Yn1 >> 32);
405
406 acc = mult32x64(Yn1, a1);
407 acc += mult32x64(Yn2, a2);
408 Yn2 = Yn1;
409 Yn1 = acc;
410 Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
411 vecIn2 = vld1q(pIn1);
412 pIn1 += 1;
413 Yn1 = asrl(Yn1, -shift);
414 *pOut++ = (q31_t) (Yn1 >> 32);
415 /*
416 * Decrement the loop counter
417 */
418 i--;
419 }
420 /*
421 * save input state
422 */
423 Xn2 = vecIn[2];
424 Xn1 = vecIn[3];
425
426 int loopRemainder = blockSize - 3 - 4 * ((blockSize - 3) / 4);
427 if (loopRemainder == 1)
428 {
429 /*
430 * remainder
431 */
432 acc = mult32x64(Yn1, a1);
433 acc += mult32x64(Yn2, a2);
434 Yn2 = Yn1;
435 Yn1 = acc;
436 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
437 Yn1 = asrl(Yn1, -shift);
438 *pOut++ = (q31_t) (Yn1 >> 32);
439 /*
440 * save input state
441 */
442 Xn2 = vecIn0[2];
443 Xn1 = vecIn0[3];
444
445 }
446 else if (loopRemainder == 2)
447 {
448 acc = mult32x64(Yn1, a1);
449 acc += mult32x64(Yn2, a2);
450 Yn2 = Yn1;
451 Yn1 = acc;
452 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
453 Yn1 = asrl(Yn1, -shift);
454 *pOut++ = (q31_t) (Yn1 >> 32);
455
456 acc = mult32x64(Yn1, a1);
457 acc += mult32x64(Yn2, a2);
458 Yn2 = Yn1;
459 Yn1 = acc;
460 Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
461 Yn1 = asrl(Yn1, -shift);
462 *pOut++ = (q31_t) (Yn1 >> 32);
463 /*
464 * save input state
465 */
466 Xn2 = vecIn1[2];
467 Xn1 = vecIn1[3];
468
469 }
470 else if (loopRemainder == 3)
471 {
472 acc = mult32x64(Yn1, a1);
473 acc += mult32x64(Yn2, a2);
474 Yn2 = Yn1;
475 Yn1 = acc;
476 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
477 Yn1 = asrl(Yn1, -shift);
478 *pOut++ = (q31_t) (Yn1 >> 32);
479
480 acc = mult32x64(Yn1, a1);
481 acc += mult32x64(Yn2, a2);
482 Yn2 = Yn1;
483 Yn1 = acc;
484 Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
485 Yn1 = asrl(Yn1, -shift);
486 *pOut++ = (q31_t) (Yn1 >> 32);
487
488 acc = mult32x64(Yn1, a1);
489 acc += mult32x64(Yn2, a2);
490 Yn2 = Yn1;
491 Yn1 = acc;
492 Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
493 Yn1 = asrl(Yn1, -shift);
494 *pOut++ = (q31_t) (Yn1 >> 32);
495 /*
496 * save input state
497 */
498 Xn2 = vecIn2[2];
499 Xn1 = vecIn2[3];
500
501 }
502
503 /*
504 * The first stage output is given as input to the second stage.
505 */
506 pIn = pDst;
507 /*
508 * Reset to destination buffer working pointer
509 */
510 pOut = pDst;
511 /*
512 * Store the updated state variables back into the pState array
513 */
514 *pState++ = (q63_t) Xn1;
515 *pState++ = (q63_t) Xn2;
516 *pState++ = Yn1;
517 *pState++ = Yn2;
518 }
519 while (--stage);
520 }
521 }
522 #else
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)523 void arm_biquad_cas_df1_32x64_q31(
524 const arm_biquad_cas_df1_32x64_ins_q31 * S,
525 const q31_t * pSrc,
526 q31_t * pDst,
527 uint32_t blockSize)
528 {
529 const q31_t *pIn = pSrc; /* input pointer initialization */
530 q31_t *pOut = pDst; /* output pointer initialization */
531 q63_t *pState = S->pState; /* state pointer initialization */
532 const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
533 q63_t acc; /* accumulator */
534 q31_t Xn1, Xn2; /* Input Filter state variables */
535 q63_t Yn1, Yn2; /* Output Filter state variables */
536 q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
537 q31_t Xn; /* temporary input */
538 int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
539 uint32_t sample, stage = S->numStages; /* loop counters */
540 q31_t acc_l, acc_h; /* temporary output */
541 uint32_t uShift = ((uint32_t) S->postShift + 1U);
542 uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
543
544 do
545 {
546 /* Reading the coefficients */
547 b0 = *pCoeffs++;
548 b1 = *pCoeffs++;
549 b2 = *pCoeffs++;
550 a1 = *pCoeffs++;
551 a2 = *pCoeffs++;
552
553 /* Reading the state values */
554 Xn1 = (q31_t) (pState[0]);
555 Xn2 = (q31_t) (pState[1]);
556 Yn1 = pState[2];
557 Yn2 = pState[3];
558
559 #if defined (ARM_MATH_LOOPUNROLL)
560
561 /* Apply loop unrolling and compute 4 output values simultaneously. */
562 /* Variable acc hold output value that is being computed and stored in destination buffer
563 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
564 */
565
566 /* Loop unrolling: Compute 4 outputs at a time */
567 sample = blockSize >> 2U;
568
569 while (sample > 0U)
570 {
571 /* Read the input */
572 Xn = *pIn++;
573
574 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
575
576 /* acc = b0 * x[n] */
577 acc = (q63_t) Xn * b0;
578
579 /* acc += b1 * x[n-1] */
580 acc += (q63_t) Xn1 * b1;
581
582 /* acc += b[2] * x[n-2] */
583 acc += (q63_t) Xn2 * b2;
584
585 /* acc += a1 * y[n-1] */
586 acc += mult32x64(Yn1, a1);
587
588 /* acc += a2 * y[n-2] */
589 acc += mult32x64(Yn2, a2);
590
591 /* The result is converted to 1.63 , Yn2 variable is reused */
592 Yn2 = acc << shift;
593
594 /* Calc lower part of acc */
595 acc_l = acc & 0xffffffff;
596
597 /* Calc upper part of acc */
598 acc_h = (acc >> 32) & 0xffffffff;
599
600 /* Apply shift for lower part of acc and upper part of acc */
601 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
602
603 /* Store the output in the destination buffer in 1.31 format. */
604 *pOut = acc_h;
605
606 /* Read the second input into Xn2, to reuse the value */
607 Xn2 = *pIn++;
608
609 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
610
611 /* acc += b1 * x[n-1] */
612 acc = (q63_t) Xn * b1;
613
614 /* acc = b0 * x[n] */
615 acc += (q63_t) Xn2 * b0;
616
617 /* acc += b[2] * x[n-2] */
618 acc += (q63_t) Xn1 * b2;
619
620 /* acc += a1 * y[n-1] */
621 acc += mult32x64(Yn2, a1);
622
623 /* acc += a2 * y[n-2] */
624 acc += mult32x64(Yn1, a2);
625
626 /* The result is converted to 1.63, Yn1 variable is reused */
627 Yn1 = acc << shift;
628
629 /* Calc lower part of acc */
630 acc_l = acc & 0xffffffff;
631
632 /* Calc upper part of acc */
633 acc_h = (acc >> 32) & 0xffffffff;
634
635 /* Apply shift for lower part of acc and upper part of acc */
636 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
637
638 /* Read the third input into Xn1, to reuse the value */
639 Xn1 = *pIn++;
640
641 /* The result is converted to 1.31 */
642 /* Store the output in the destination buffer. */
643 *(pOut + 1U) = acc_h;
644
645 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
646
647 /* acc = b0 * x[n] */
648 acc = (q63_t) Xn1 * b0;
649
650 /* acc += b1 * x[n-1] */
651 acc += (q63_t) Xn2 * b1;
652
653 /* acc += b[2] * x[n-2] */
654 acc += (q63_t) Xn * b2;
655
656 /* acc += a1 * y[n-1] */
657 acc += mult32x64(Yn1, a1);
658
659 /* acc += a2 * y[n-2] */
660 acc += mult32x64(Yn2, a2);
661
662 /* The result is converted to 1.63, Yn2 variable is reused */
663 Yn2 = acc << shift;
664
665 /* Calc lower part of acc */
666 acc_l = acc & 0xffffffff;
667
668 /* Calc upper part of acc */
669 acc_h = (acc >> 32) & 0xffffffff;
670
671 /* Apply shift for lower part of acc and upper part of acc */
672 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
673
674 /* Store the output in the destination buffer in 1.31 format. */
675 *(pOut + 2U) = acc_h;
676
677 /* Read the fourth input into Xn, to reuse the value */
678 Xn = *pIn++;
679
680 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
681 /* acc = b0 * x[n] */
682 acc = (q63_t) Xn * b0;
683
684 /* acc += b1 * x[n-1] */
685 acc += (q63_t) Xn1 * b1;
686
687 /* acc += b[2] * x[n-2] */
688 acc += (q63_t) Xn2 * b2;
689
690 /* acc += a1 * y[n-1] */
691 acc += mult32x64(Yn2, a1);
692
693 /* acc += a2 * y[n-2] */
694 acc += mult32x64(Yn1, a2);
695
696 /* The result is converted to 1.63, Yn1 variable is reused */
697 Yn1 = acc << shift;
698
699 /* Calc lower part of acc */
700 acc_l = acc & 0xffffffff;
701
702 /* Calc upper part of acc */
703 acc_h = (acc >> 32) & 0xffffffff;
704
705 /* Apply shift for lower part of acc and upper part of acc */
706 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
707
708 /* Store the output in the destination buffer in 1.31 format. */
709 *(pOut + 3U) = acc_h;
710
711 /* Every time after the output is computed state should be updated. */
712 /* The states should be updated as: */
713 /* Xn2 = Xn1 */
714 /* Xn1 = Xn */
715 /* Yn2 = Yn1 */
716 /* Yn1 = acc */
717 Xn2 = Xn1;
718 Xn1 = Xn;
719
720 /* update output pointer */
721 pOut += 4U;
722
723 /* decrement loop counter */
724 sample--;
725 }
726
727 /* Loop unrolling: Compute remaining outputs */
728 sample = blockSize & 0x3U;
729
730 #else
731
732 /* Initialize blkCnt with number of samples */
733 sample = blockSize;
734
735 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
736
737 while (sample > 0U)
738 {
739 /* Read the input */
740 Xn = *pIn++;
741
742 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
743
744 /* acc = b0 * x[n] */
745 acc = (q63_t) Xn * b0;
746 /* acc += b1 * x[n-1] */
747 acc += (q63_t) Xn1 * b1;
748 /* acc += b[2] * x[n-2] */
749 acc += (q63_t) Xn2 * b2;
750 /* acc += a1 * y[n-1] */
751 acc += mult32x64(Yn1, a1);
752 /* acc += a2 * y[n-2] */
753 acc += mult32x64(Yn2, a2);
754
755 /* Every time after the output is computed state should be updated. */
756 /* The states should be updated as: */
757 /* Xn2 = Xn1 */
758 /* Xn1 = Xn */
759 /* Yn2 = Yn1 */
760 /* Yn1 = acc */
761 Xn2 = Xn1;
762 Xn1 = Xn;
763 Yn2 = Yn1;
764
765 /* The result is converted to 1.63, Yn1 variable is reused */
766 Yn1 = acc << shift;
767
768 /* Calc lower part of acc */
769 acc_l = acc & 0xffffffff;
770
771 /* Calc upper part of acc */
772 acc_h = (acc >> 32) & 0xffffffff;
773
774 /* Apply shift for lower part of acc and upper part of acc */
775 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
776
777 /* Store the output in the destination buffer in 1.31 format. */
778 *pOut++ = acc_h;
779 /* Yn1 = acc << shift; */
780
781 /* Store the output in the destination buffer in 1.31 format. */
782 /* *pOut++ = (q31_t) (acc >> (32 - shift)); */
783
784 /* decrement loop counter */
785 sample--;
786 }
787
788 /* The first stage output is given as input to the second stage. */
789 pIn = pDst;
790
791 /* Reset to destination buffer working pointer */
792 pOut = pDst;
793
794 /* Store the updated state variables back into the pState array */
795 *pState++ = (q63_t) Xn1;
796 *pState++ = (q63_t) Xn2;
797 *pState++ = Yn1;
798 *pState++ = Yn2;
799
800 } while (--stage);
801
802 }
803 #endif /* defined(ARM_MATH_MVEI) */
804
805 /**
806 @} end of BiquadCascadeDF1_32x64 group
807 */
808