1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df1_32x64_q31.c
4 * Description: High precision Q31 Biquad cascade filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
37
38 This function implements a high precision Biquad cascade filter which operates on
39 Q31 data values. The filter coefficients are in 1.31 format and the state variables
40 are in 1.63 format. The double precision state variables reduce quantization noise
41 in the filter and provide a cleaner output.
42 These filters are particularly useful when implementing filters in which the
43 singularities are close to the unit circle. This is common for low pass or high
44 pass filters with very low cutoff frequencies.
45
46 The function operates on blocks of input and output data
47 and each call to the function processes <code>blockSize</code> samples through
48 the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays
49 containing <code>blockSize</code> Q31 values.
50
51 @par Algorithm
52 Each Biquad stage implements a second order filter using the difference equation:
53 <pre>
54 y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
55 </pre>
56 A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
57 \image html Biquad.gif "Single Biquad filter stage"
58 Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
59 Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
60 Pay careful attention to the sign of the feedback coefficients.
61 Some design tools use the difference equation
62 <pre>
63 y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
64 </pre>
65 In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
66 @par
67 Higher order filters are realized as a cascade of second order sections.
68 <code>numStages</code> refers to the number of second order stages used.
69 For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
70 \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
71 A 9th order filter would be realized with <code>numStages=5</code> second order stages
72 with the coefficients for one of the stages configured as a first order filter
73 (<code>b2=0</code> and <code>a2=0</code>).
74 @par
75 The <code>pState</code> points to state variables array.
76 Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code> and each state variable in 1.63 format to improve precision.
77 The state variables are arranged in the array as:
78 <pre>
79 {x[n-1], x[n-2], y[n-1], y[n-2]}
80 </pre>
81 @par
82 The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
83 The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.
84 The state variables are updated after each block of data is processed, the coefficients are untouched.
85
86 @par Instance Structure
87 The coefficients and state variables for a filter are stored together in an instance data structure.
88 A separate instance structure must be defined for each filter.
89 Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
90
91 @par Init Function
92 There is also an associated initialization function which performs the following operations:
93 - Sets the values of the internal structure fields.
94 - Zeros out the values in the state buffer.
95 To do this manually without calling the init function, assign the follow subfields of the instance structure:
96 numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero.
97
98 @par
99 Use of the initialization function is optional.
100 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
101 To place an instance structure into a const data section, the instance structure must be manually initialized.
102 Set the values in the state buffer to zeros before static initialization.
103 For example, to statically initialize the filter instance structure use
104 <pre>
105 arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
106 </pre>
107 where <code>numStages</code> is the number of Biquad stages in the filter;
108 <code>pState</code> is the address of the state buffer;
109 <code>pCoeffs</code> is the address of the coefficient buffer;
110 <code>postShift</code> shift to be applied which is described in detail below.
111 @par Fixed-Point Behavior
112 Care must be taken while using Biquad Cascade 32x64 filter function.
113 Following issues must be considered:
114 - Scaling of coefficients
115 - Filter gain
116 - Overflow and saturation
117
118 @par
119 Filter coefficients are represented as fractional values and
120 restricted to lie in the range <code>[-1 +1)</code>.
121 The processing function has an additional scaling parameter <code>postShift</code>
122 which allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
123 At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
124 \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
125 This essentially scales the filter coefficients by <code>2^postShift</code>.
126 For example, to realize the coefficients
127 <pre>
128 {1.5, -0.8, 1.2, 1.6, -0.9}
129 </pre>
130 set the Coefficient array to:
131 <pre>
132 {0.75, -0.4, 0.6, 0.8, -0.45}
133 </pre>
134 and set <code>postShift=1</code>
135 @par
136 The second thing to keep in mind is the gain through the filter.
137 The frequency response of a Biquad filter is a function of its coefficients.
138 It is possible for the gain through the filter to exceed 1.0 meaning that the
139 filter increases the amplitude of certain frequencies.
140 This means that an input signal with amplitude < 1.0 may result in an output > 1.0
141 and these are saturated or overflowed based on the implementation of the filter.
142 To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0
143 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
144 @par
145 The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
146 This is described in the function specific documentation below.
147 */
148
149 /**
150 @addtogroup BiquadCascadeDF1_32x64
151 @{
152 */
153
154 /**
155 @brief Processing function for the Q31 Biquad cascade 32x64 filter.
156 @param[in] S points to an instance of the high precision Q31 Biquad cascade filter
157 @param[in] pSrc points to the block of input data
158 @param[out] pDst points to the block of output data
159 @param[in] blockSize number of samples to process
160
161 @par Details
162 The function is implemented using an internal 64-bit accumulator.
163 The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
164 Thus, if the accumulator result overflows it wraps around rather than clip.
165 In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
166 After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
167 1.31 format by discarding the low 32 bits.
168 @par
169 Two related functions are provided in the CMSIS DSP library.
170 - \ref arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
171 - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
172 */
173
174 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
175
176 #include "arm_helium_utils.h"
177
arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)178 static void arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,
179 const q31_t * pSrc,
180 q31_t * pDst,
181 uint32_t blockSize)
182 {
183 const q31_t *pIn = pSrc; /* input pointer initialization */
184 q31_t *pOut = pDst; /* output pointer initialization */
185 q63_t *pState = S->pState; /* state pointer initialization */
186 const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
187 q63_t acc; /* accumulator */
188 q31_t Xn1, Xn2; /* Input Filter state variables */
189 q63_t Yn1, Yn2; /* Output Filter state variables */
190 q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
191 q31_t Xn; /* temporary input */
192 int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
193 uint32_t sample, stage = S->numStages; /* loop counters */
194 q31_t acc_l, acc_h; /* temporary output */
195 uint32_t uShift = ((uint32_t) S->postShift + 1U);
196 uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
197
198 do
199 {
200 /* Reading the coefficients */
201 b0 = *pCoeffs++;
202 b1 = *pCoeffs++;
203 b2 = *pCoeffs++;
204 a1 = *pCoeffs++;
205 a2 = *pCoeffs++;
206
207 /* Reading the state values */
208 Xn1 = (q31_t) (pState[0]);
209 Xn2 = (q31_t) (pState[1]);
210 Yn1 = pState[2];
211 Yn2 = pState[3];
212
213 /* Initialize blkCnt with number of samples */
214 sample = blockSize;
215
216 while (sample > 0U)
217 {
218 /* Read the input */
219 Xn = *pIn++;
220
221 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
222
223 /* acc = b0 * x[n] */
224 acc = (q63_t) Xn * b0;
225 /* acc += b1 * x[n-1] */
226 acc += (q63_t) Xn1 * b1;
227 /* acc += b[2] * x[n-2] */
228 acc += (q63_t) Xn2 * b2;
229 /* acc += a1 * y[n-1] */
230 acc += mult32x64(Yn1, a1);
231 /* acc += a2 * y[n-2] */
232 acc += mult32x64(Yn2, a2);
233
234 /* Every time after the output is computed state should be updated. */
235 /* The states should be updated as: */
236 /* Xn2 = Xn1 */
237 /* Xn1 = Xn */
238 /* Yn2 = Yn1 */
239 /* Yn1 = acc */
240 Xn2 = Xn1;
241 Xn1 = Xn;
242 Yn2 = Yn1;
243
244 /* The result is converted to 1.63, Yn1 variable is reused */
245 Yn1 = acc << shift;
246
247 /* Calc lower part of acc */
248 acc_l = acc & 0xffffffff;
249
250 /* Calc upper part of acc */
251 acc_h = (acc >> 32) & 0xffffffff;
252
253 /* Apply shift for lower part of acc and upper part of acc */
254 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
255
256 /* Store the output in the destination buffer in 1.31 format. */
257 *pOut++ = acc_h;
258 /* Yn1 = acc << shift; */
259
260 /* Store the output in the destination buffer in 1.31 format. */
261 /* *pOut++ = (q31_t) (acc >> (32 - shift)); */
262
263 /* decrement loop counter */
264 sample--;
265 }
266
267 /* The first stage output is given as input to the second stage. */
268 pIn = pDst;
269
270 /* Reset to destination buffer working pointer */
271 pOut = pDst;
272
273 /* Store the updated state variables back into the pState array */
274 *pState++ = (q63_t) Xn1;
275 *pState++ = (q63_t) Xn2;
276 *pState++ = Yn1;
277 *pState++ = Yn2;
278
279 } while (--stage);
280
281 }
282
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)283 ARM_DSP_ATTRIBUTE void arm_biquad_cas_df1_32x64_q31(
284 const arm_biquad_cas_df1_32x64_ins_q31 * S,
285 const q31_t * pSrc,
286 q31_t * pDst,
287 uint32_t blockSize)
288 {
289 const q31_t *pIn = pSrc; /* input pointer initialization */
290 q31_t *pOut = pDst; /* output pointer initialization */
291 q63_t *pState = S->pState; /* state pointer initialization */
292 const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
293 q31_t Xn1, Xn2; /* Input Filter state variables */
294 q63_t Yn1, Yn2; /* Output Filter state variables */
295 q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
296 int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
297 uint32_t sample, stage = S->numStages; /* loop counters */
298 q31x4_t vecCoef = { 0 }, vecIn;
299 q63_t acc;
300
301 if (blockSize <= 3)
302 {
303 arm_biquad_cas_df1_32x64_q31_scalar(S,pSrc,pDst,blockSize);
304 }
305 else
306 {
307 do
308 {
309 uint32_t i;
310 /*
311 * Reading the coefficients
312 */
313 b0 = *pCoeffs++;
314 b1 = *pCoeffs++;
315 b2 = *pCoeffs++;
316 a1 = *pCoeffs++;
317 a2 = *pCoeffs++;
318
319 vecCoef[0] = 0;
320 vecCoef[1] = b2;
321 vecCoef[2] = b1;
322 vecCoef[3] = b0;
323 /*
324 * Reading the state values
325 */
326 Xn1 = pState[0];
327 Xn2 = pState[1];
328 Yn1 = pState[2];
329 Yn2 = pState[3];
330
331 /*
332 * append history with initial samples
333 */
334 q31_t hist[6];
335 hist[0] = 0;
336 hist[1] = Xn2;
337 hist[2] = Xn1;
338 hist[3] = pIn[0];
339 hist[4] = pIn[1];
340 hist[5] = pIn[2];
341
342 const q31_t *pIn1 = hist;
343 q31x4_t vecIn0 = *(q31x4_t *) & pIn[0];
344 q31x4_t vecIn1 = *(q31x4_t *) & pIn[1];
345 q31x4_t vecIn2 = *(q31x4_t *) & pIn[2];
346
347 i = 3;
348 do
349 {
350 acc = mult32x64(Yn1, a1);
351 acc += mult32x64(Yn2, a2);
352 Yn2 = Yn1;
353 Yn1 = acc;
354 vecIn = vld1q(pIn1);
355 pIn1 += 1;
356 Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
357 Yn1 = asrl(Yn1, -shift);
358 /*
359 * Store the output in the destination buffer in 1.31 format.
360 */
361 *pOut++ = (q31_t) (Yn1 >> 32);
362 }
363 while (--i);
364
365 sample = blockSize - 3;
366 pIn1 = pIn + 3;
367
368 i = sample / 4;
369 while (i > 0U)
370 {
371
372 acc = mult32x64(Yn1, a1);
373 acc += mult32x64(Yn2, a2);
374 Yn2 = Yn1;
375 Yn1 = acc;
376 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
377 vecIn = vld1q(pIn1);
378 pIn1 += 1;
379 Yn1 = asrl(Yn1, -shift);
380 /*
381 * Store the output in the destination buffer in 1.31 format.
382 */
383 *pOut++ = (q31_t) (Yn1 >> 32);
384
385 acc = mult32x64(Yn1, a1);
386 acc += mult32x64(Yn2, a2);
387 Yn2 = Yn1;
388 Yn1 = acc;
389 Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
390 vecIn0 = vld1q(pIn1);
391 pIn1 += 1;
392 Yn1 = asrl(Yn1, -shift);
393 *pOut++ = (q31_t) (Yn1 >> 32);
394
395 acc = mult32x64(Yn1, a1);
396 acc += mult32x64(Yn2, a2);
397 Yn2 = Yn1;
398 Yn1 = acc;
399 Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
400 vecIn1 = vld1q(pIn1);
401 pIn1 += 1;
402 Yn1 = asrl(Yn1, -shift);
403 *pOut++ = (q31_t) (Yn1 >> 32);
404
405 acc = mult32x64(Yn1, a1);
406 acc += mult32x64(Yn2, a2);
407 Yn2 = Yn1;
408 Yn1 = acc;
409 Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
410 vecIn2 = vld1q(pIn1);
411 pIn1 += 1;
412 Yn1 = asrl(Yn1, -shift);
413 *pOut++ = (q31_t) (Yn1 >> 32);
414 /*
415 * Decrement the loop counter
416 */
417 i--;
418 }
419 /*
420 * save input state
421 */
422 Xn2 = vecIn[2];
423 Xn1 = vecIn[3];
424
425 int loopRemainder = blockSize - 3 - 4 * ((blockSize - 3) / 4);
426 if (loopRemainder == 1)
427 {
428 /*
429 * remainder
430 */
431 acc = mult32x64(Yn1, a1);
432 acc += mult32x64(Yn2, a2);
433 Yn2 = Yn1;
434 Yn1 = acc;
435 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
436 Yn1 = asrl(Yn1, -shift);
437 *pOut++ = (q31_t) (Yn1 >> 32);
438 /*
439 * save input state
440 */
441 Xn2 = vecIn0[2];
442 Xn1 = vecIn0[3];
443
444 }
445 else if (loopRemainder == 2)
446 {
447 acc = mult32x64(Yn1, a1);
448 acc += mult32x64(Yn2, a2);
449 Yn2 = Yn1;
450 Yn1 = acc;
451 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
452 Yn1 = asrl(Yn1, -shift);
453 *pOut++ = (q31_t) (Yn1 >> 32);
454
455 acc = mult32x64(Yn1, a1);
456 acc += mult32x64(Yn2, a2);
457 Yn2 = Yn1;
458 Yn1 = acc;
459 Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
460 Yn1 = asrl(Yn1, -shift);
461 *pOut++ = (q31_t) (Yn1 >> 32);
462 /*
463 * save input state
464 */
465 Xn2 = vecIn1[2];
466 Xn1 = vecIn1[3];
467
468 }
469 else if (loopRemainder == 3)
470 {
471 acc = mult32x64(Yn1, a1);
472 acc += mult32x64(Yn2, a2);
473 Yn2 = Yn1;
474 Yn1 = acc;
475 Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
476 Yn1 = asrl(Yn1, -shift);
477 *pOut++ = (q31_t) (Yn1 >> 32);
478
479 acc = mult32x64(Yn1, a1);
480 acc += mult32x64(Yn2, a2);
481 Yn2 = Yn1;
482 Yn1 = acc;
483 Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
484 Yn1 = asrl(Yn1, -shift);
485 *pOut++ = (q31_t) (Yn1 >> 32);
486
487 acc = mult32x64(Yn1, a1);
488 acc += mult32x64(Yn2, a2);
489 Yn2 = Yn1;
490 Yn1 = acc;
491 Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
492 Yn1 = asrl(Yn1, -shift);
493 *pOut++ = (q31_t) (Yn1 >> 32);
494 /*
495 * save input state
496 */
497 Xn2 = vecIn2[2];
498 Xn1 = vecIn2[3];
499
500 }
501
502 /*
503 * The first stage output is given as input to the second stage.
504 */
505 pIn = pDst;
506 /*
507 * Reset to destination buffer working pointer
508 */
509 pOut = pDst;
510 /*
511 * Store the updated state variables back into the pState array
512 */
513 *pState++ = (q63_t) Xn1;
514 *pState++ = (q63_t) Xn2;
515 *pState++ = Yn1;
516 *pState++ = Yn2;
517 }
518 while (--stage);
519 }
520 }
521 #else
arm_biquad_cas_df1_32x64_q31(const arm_biquad_cas_df1_32x64_ins_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)522 ARM_DSP_ATTRIBUTE void arm_biquad_cas_df1_32x64_q31(
523 const arm_biquad_cas_df1_32x64_ins_q31 * S,
524 const q31_t * pSrc,
525 q31_t * pDst,
526 uint32_t blockSize)
527 {
528 const q31_t *pIn = pSrc; /* input pointer initialization */
529 q31_t *pOut = pDst; /* output pointer initialization */
530 q63_t *pState = S->pState; /* state pointer initialization */
531 const q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
532 q63_t acc; /* accumulator */
533 q31_t Xn1, Xn2; /* Input Filter state variables */
534 q63_t Yn1, Yn2; /* Output Filter state variables */
535 q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
536 q31_t Xn; /* temporary input */
537 int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
538 uint32_t sample, stage = S->numStages; /* loop counters */
539 q31_t acc_l, acc_h; /* temporary output */
540 uint32_t uShift = ((uint32_t) S->postShift + 1U);
541 uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
542
543 do
544 {
545 /* Reading the coefficients */
546 b0 = *pCoeffs++;
547 b1 = *pCoeffs++;
548 b2 = *pCoeffs++;
549 a1 = *pCoeffs++;
550 a2 = *pCoeffs++;
551
552 /* Reading the state values */
553 Xn1 = (q31_t) (pState[0]);
554 Xn2 = (q31_t) (pState[1]);
555 Yn1 = pState[2];
556 Yn2 = pState[3];
557
558 #if defined (ARM_MATH_LOOPUNROLL)
559
560 /* Apply loop unrolling and compute 4 output values simultaneously. */
561 /* Variable acc hold output value that is being computed and stored in destination buffer
562 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
563 */
564
565 /* Loop unrolling: Compute 4 outputs at a time */
566 sample = blockSize >> 2U;
567
568 while (sample > 0U)
569 {
570 /* Read the input */
571 Xn = *pIn++;
572
573 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
574
575 /* acc = b0 * x[n] */
576 acc = (q63_t) Xn * b0;
577
578 /* acc += b1 * x[n-1] */
579 acc += (q63_t) Xn1 * b1;
580
581 /* acc += b[2] * x[n-2] */
582 acc += (q63_t) Xn2 * b2;
583
584 /* acc += a1 * y[n-1] */
585 acc += mult32x64(Yn1, a1);
586
587 /* acc += a2 * y[n-2] */
588 acc += mult32x64(Yn2, a2);
589
590 /* The result is converted to 1.63 , Yn2 variable is reused */
591 Yn2 = acc << shift;
592
593 /* Calc lower part of acc */
594 acc_l = acc & 0xffffffff;
595
596 /* Calc upper part of acc */
597 acc_h = (acc >> 32) & 0xffffffff;
598
599 /* Apply shift for lower part of acc and upper part of acc */
600 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
601
602 /* Store the output in the destination buffer in 1.31 format. */
603 *pOut = acc_h;
604
605 /* Read the second input into Xn2, to reuse the value */
606 Xn2 = *pIn++;
607
608 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
609
610 /* acc += b1 * x[n-1] */
611 acc = (q63_t) Xn * b1;
612
613 /* acc = b0 * x[n] */
614 acc += (q63_t) Xn2 * b0;
615
616 /* acc += b[2] * x[n-2] */
617 acc += (q63_t) Xn1 * b2;
618
619 /* acc += a1 * y[n-1] */
620 acc += mult32x64(Yn2, a1);
621
622 /* acc += a2 * y[n-2] */
623 acc += mult32x64(Yn1, a2);
624
625 /* The result is converted to 1.63, Yn1 variable is reused */
626 Yn1 = acc << shift;
627
628 /* Calc lower part of acc */
629 acc_l = acc & 0xffffffff;
630
631 /* Calc upper part of acc */
632 acc_h = (acc >> 32) & 0xffffffff;
633
634 /* Apply shift for lower part of acc and upper part of acc */
635 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
636
637 /* Read the third input into Xn1, to reuse the value */
638 Xn1 = *pIn++;
639
640 /* The result is converted to 1.31 */
641 /* Store the output in the destination buffer. */
642 *(pOut + 1U) = acc_h;
643
644 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
645
646 /* acc = b0 * x[n] */
647 acc = (q63_t) Xn1 * b0;
648
649 /* acc += b1 * x[n-1] */
650 acc += (q63_t) Xn2 * b1;
651
652 /* acc += b[2] * x[n-2] */
653 acc += (q63_t) Xn * b2;
654
655 /* acc += a1 * y[n-1] */
656 acc += mult32x64(Yn1, a1);
657
658 /* acc += a2 * y[n-2] */
659 acc += mult32x64(Yn2, a2);
660
661 /* The result is converted to 1.63, Yn2 variable is reused */
662 Yn2 = acc << shift;
663
664 /* Calc lower part of acc */
665 acc_l = acc & 0xffffffff;
666
667 /* Calc upper part of acc */
668 acc_h = (acc >> 32) & 0xffffffff;
669
670 /* Apply shift for lower part of acc and upper part of acc */
671 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
672
673 /* Store the output in the destination buffer in 1.31 format. */
674 *(pOut + 2U) = acc_h;
675
676 /* Read the fourth input into Xn, to reuse the value */
677 Xn = *pIn++;
678
679 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
680 /* acc = b0 * x[n] */
681 acc = (q63_t) Xn * b0;
682
683 /* acc += b1 * x[n-1] */
684 acc += (q63_t) Xn1 * b1;
685
686 /* acc += b[2] * x[n-2] */
687 acc += (q63_t) Xn2 * b2;
688
689 /* acc += a1 * y[n-1] */
690 acc += mult32x64(Yn2, a1);
691
692 /* acc += a2 * y[n-2] */
693 acc += mult32x64(Yn1, a2);
694
695 /* The result is converted to 1.63, Yn1 variable is reused */
696 Yn1 = acc << shift;
697
698 /* Calc lower part of acc */
699 acc_l = acc & 0xffffffff;
700
701 /* Calc upper part of acc */
702 acc_h = (acc >> 32) & 0xffffffff;
703
704 /* Apply shift for lower part of acc and upper part of acc */
705 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
706
707 /* Store the output in the destination buffer in 1.31 format. */
708 *(pOut + 3U) = acc_h;
709
710 /* Every time after the output is computed state should be updated. */
711 /* The states should be updated as: */
712 /* Xn2 = Xn1 */
713 /* Xn1 = Xn */
714 /* Yn2 = Yn1 */
715 /* Yn1 = acc */
716 Xn2 = Xn1;
717 Xn1 = Xn;
718
719 /* update output pointer */
720 pOut += 4U;
721
722 /* decrement loop counter */
723 sample--;
724 }
725
726 /* Loop unrolling: Compute remaining outputs */
727 sample = blockSize & 0x3U;
728
729 #else
730
731 /* Initialize blkCnt with number of samples */
732 sample = blockSize;
733
734 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
735
736 while (sample > 0U)
737 {
738 /* Read the input */
739 Xn = *pIn++;
740
741 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
742
743 /* acc = b0 * x[n] */
744 acc = (q63_t) Xn * b0;
745 /* acc += b1 * x[n-1] */
746 acc += (q63_t) Xn1 * b1;
747 /* acc += b[2] * x[n-2] */
748 acc += (q63_t) Xn2 * b2;
749 /* acc += a1 * y[n-1] */
750 acc += mult32x64(Yn1, a1);
751 /* acc += a2 * y[n-2] */
752 acc += mult32x64(Yn2, a2);
753
754 /* Every time after the output is computed state should be updated. */
755 /* The states should be updated as: */
756 /* Xn2 = Xn1 */
757 /* Xn1 = Xn */
758 /* Yn2 = Yn1 */
759 /* Yn1 = acc */
760 Xn2 = Xn1;
761 Xn1 = Xn;
762 Yn2 = Yn1;
763
764 /* The result is converted to 1.63, Yn1 variable is reused */
765 Yn1 = acc << shift;
766
767 /* Calc lower part of acc */
768 acc_l = acc & 0xffffffff;
769
770 /* Calc upper part of acc */
771 acc_h = (acc >> 32) & 0xffffffff;
772
773 /* Apply shift for lower part of acc and upper part of acc */
774 acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
775
776 /* Store the output in the destination buffer in 1.31 format. */
777 *pOut++ = acc_h;
778 /* Yn1 = acc << shift; */
779
780 /* Store the output in the destination buffer in 1.31 format. */
781 /* *pOut++ = (q31_t) (acc >> (32 - shift)); */
782
783 /* decrement loop counter */
784 sample--;
785 }
786
787 /* The first stage output is given as input to the second stage. */
788 pIn = pDst;
789
790 /* Reset to destination buffer working pointer */
791 pOut = pDst;
792
793 /* Store the updated state variables back into the pState array */
794 *pState++ = (q63_t) Xn1;
795 *pState++ = (q63_t) Xn2;
796 *pState++ = Yn1;
797 *pState++ = Yn2;
798
799 } while (--stage);
800
801 }
802 #endif /* defined(ARM_MATH_MVEI) */
803
804 /**
805 @} end of BiquadCascadeDF1_32x64 group
806 */
807