1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df1_q15.c
4 * Description: Processing function for the Q15 Biquad cascade DirectFormI(DF1) filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup BiquadCascadeDF1
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q15 Biquad cascade filter.
42 @param[in] S points to an instance of the Q15 Biquad cascade structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the location where the output result is written
45 @param[in] blockSize number of samples to process
46
47 @par Scaling and Overflow Behavior
48 The function is implemented using a 64-bit internal accumulator.
49 Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
50 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
51 There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52 The accumulator is then shifted by <code>postShift</code> bits to truncate the result to 1.15 format by discarding the low 16 bits.
53 Finally, the result is saturated to 1.15 format.
54 @remark
55 Refer to \ref arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter.
56 */
57
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59
arm_biquad_cascade_df1_q15(const arm_biquad_casd_df1_inst_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)60 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_q15(
61 const arm_biquad_casd_df1_inst_q15 * S,
62 const q15_t * pSrc,
63 q15_t * pDst,
64 uint32_t blockSize)
65 {
66 const q15_t *pIn = pSrc; /* input pointer initialization */
67 q15_t *pOut = pDst; /* output pointer initialization */
68 int shift;
69 uint32_t sample, stages = S->numStages; /* loop counters */
70 int postShift = S->postShift;
71 q15x8_t bCoeffs0, bCoeffs1, bCoeffs2, bCoeffs3; /* Coefficients vector */
72 q15_t *pState = S->pState; /* pState pointer initialization */
73 q15x8_t inVec0;
74 int64_t acc;
75 const q15_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
76 q31_t out, out1;
77
78 shift = (15 - postShift) - 32;
79
80 do {
81 q15_t a2 = pCoeffs[5];
82 q15_t a1 = pCoeffs[4];
83
84 bCoeffs0 = vdupq_n_s16(0);
85 bCoeffs0[0] = pCoeffs[3]; // b2
86 bCoeffs0[1] = pCoeffs[2]; // b1
87 bCoeffs0[2] = pCoeffs[0]; // b0
88
89 uint32_t zero = 0;
90 bCoeffs1 = bCoeffs0;
91 bCoeffs1 = vshlcq_s16(bCoeffs1, &zero, 16);
92
93 bCoeffs2 = bCoeffs1;
94 bCoeffs2 = vshlcq_s16(bCoeffs2, &zero, 16);
95
96 bCoeffs3 = bCoeffs2;
97 bCoeffs3 = vshlcq_s16(bCoeffs3, &zero, 16);
98
99 bCoeffs0[6] = a2;
100 bCoeffs0[7] = a1;
101 bCoeffs1[7] = a2;
102 bCoeffs1[6] = a1;
103
104 bCoeffs2 =
105 vsetq_lane_s32(vgetq_lane_s32((q31x4_t) bCoeffs0, 3), (q31x4_t) bCoeffs2, 3);
106 bCoeffs3 =
107 vsetq_lane_s32(vgetq_lane_s32((q31x4_t) bCoeffs1, 3), (q31x4_t) bCoeffs3, 3);
108
109
110 /* 2 first elements are garbage, will be updated with history */
111 inVec0 = vld1q(pIn - 2);
112 pIn += 2;
113
114 inVec0[0] = pState[1];
115 inVec0[1] = pState[0];
116 inVec0[6] = pState[3];
117 inVec0[7] = pState[2];
118
119 acc = vmlaldavq(bCoeffs0, inVec0);
120 acc = sqrshrl_sat48(acc, shift);
121 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
122
123 inVec0[6] = out1;
124 acc = vmlaldavq(bCoeffs1, inVec0);
125 acc = sqrshrl_sat48(acc, shift);
126 out = (q31_t) ((acc >> 32) & 0xffffffff);
127
128 inVec0[7] = out;
129 *pOut++ = (q15_t) out1;
130 *pOut++ = (q15_t) out;
131
132 acc = vmlaldavq(bCoeffs2, inVec0);
133 acc = sqrshrl_sat48(acc, shift);
134 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
135
136
137 inVec0[6] = out1;
138 acc = vmlaldavq(bCoeffs3, inVec0);
139 acc = sqrshrl_sat48(acc, shift);
140 out = (q31_t) ((acc >> 32) & 0xffffffff);
141
142 /*
143 * main loop
144 */
145 sample = (blockSize - 4) >> 2U;
146 /* preload (efficient scheduling) */
147 inVec0 = vld1q(pIn);
148 pIn += 4;
149
150 /*
151 * Compute 4 outputs at a time.
152 */
153 while (sample > 0U) {
154
155 inVec0[6] = out1;
156 inVec0[7] = out;
157
158 /* store */
159 *pOut++ = (q15_t) out1;
160 *pOut++ = (q15_t) out;
161
162 /*
163 * in { x0 x1 x2 x3 x4 x5 yn2 yn1 }
164 * x
165 * bCoeffs0 { b2 b1 b0 0 0 0 a2 a1 }
166 *
167 */
168 acc = vmlaldavq(bCoeffs0, inVec0);
169 /* shift + saturate to 16 bit */
170 acc = sqrshrl_sat48(acc, shift);
171 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
172 inVec0[6] = out1;
173
174 /*
175 * in { x0 x1 x2 x3 x4 x5 y0 yn1 }
176 * x
177 * bCoeffs1 { 0 b2 b1 b0 0 0 a1 a2 }
178 */
179 acc = vmlaldavq(bCoeffs1, inVec0);
180 acc = sqrshrl_sat48(acc, shift);
181 out = (q31_t) ((acc >> 32) & 0xffffffff);
182
183 *pOut++ = (q15_t) out1;
184 *pOut++ = (q15_t) out;
185
186
187 inVec0[7] = out;
188 /*
189 * in { x0 x1 x2 x3 x4 x5 y0 yp1 }
190 * x
191 * bCoeffs2 { 0 0 b2 b1 b0 0 a2 a1 }
192 */
193 acc = vmlaldavq(bCoeffs2, inVec0);
194 acc = sqrshrl_sat48(acc, shift);
195 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
196 inVec0[6] = out1;
197
198 /*
199 * in { x0 x1 x2 x3 x4 x5 y0 yp1 }
200 * x
201 * bCoeffs2 { 0 0 0 b2 b1 b0 a1 a2 }
202 */
203 acc = vmlaldavq(bCoeffs3, inVec0);
204 acc = sqrshrl_sat48(acc, shift);
205 out = (q31_t) ((acc >> 32) & 0xffffffff);
206
207 inVec0 = vld1q(pIn);
208 pIn += 4;
209
210 /* decrement the loop counter */
211 sample--;
212 }
213
214 *pOut++ = (q15_t) out1;
215 *pOut++ = (q15_t) out;
216
217 /*
218 * Tail handling
219 */
220 int32_t loopRemainder = blockSize & 3;
221
222 if (loopRemainder == 3) {
223 inVec0[6] = out1;
224 inVec0[7] = out;
225
226 acc = vmlaldavq(bCoeffs0, inVec0);
227 acc = sqrshrl_sat48(acc, shift);
228 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
229 inVec0[6] = out1;
230
231 acc = vmlaldavq(bCoeffs1, inVec0);
232 acc = sqrshrl_sat48(acc, shift);
233 out = (q31_t) ((acc >> 32) & 0xffffffff);
234
235 *pOut++ = (q15_t) out1;
236 *pOut++ = (q15_t) out;
237
238 inVec0[7] = out;
239 acc = vmlaldavq(bCoeffs2, inVec0);
240 acc = sqrshrl_sat48(acc, shift);
241 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
242 *pOut++ = (q15_t) out1;
243
244 /* Store the updated state variables back into the pState array */
245 pState[0] = vgetq_lane_s16(inVec0, 4);
246 pState[1] = vgetq_lane_s16(inVec0, 3);
247 pState[3] = out;
248 pState[2] = out1;
249
250 } else if (loopRemainder == 2) {
251 inVec0[6] = out1;
252 inVec0[7] = out;
253
254 acc = vmlaldavq(bCoeffs0, inVec0);
255 acc = sqrshrl_sat48(acc, shift);
256 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
257 inVec0[6] = out1;
258
259 acc = vmlaldavq(bCoeffs1, inVec0);
260 acc = sqrshrl_sat48(acc, shift);
261 out = (q31_t) ((acc >> 32) & 0xffffffff);
262
263 *pOut++ = (q15_t) out1;
264 *pOut++ = (q15_t) out;
265
266 /* Store the updated state variables back into the pState array */
267 pState[0] = vgetq_lane_s16(inVec0, 3);
268 pState[1] = vgetq_lane_s16(inVec0, 2);
269 pState[3] = out1;
270 pState[2] = out;
271 } else if (loopRemainder == 1) {
272
273 inVec0[6] = out1;
274 inVec0[7] = out;
275
276 acc = vmlaldavq(bCoeffs0, inVec0);
277 acc = sqrshrl_sat48(acc, shift);
278 out1 = (q31_t) ((acc >> 32) & 0xffffffff);
279 *pOut++ = (q15_t) out1;
280
281 /* Store the updated state variables back into the pState array */
282 pState[0] = vgetq_lane_s16(inVec0, 2);
283 pState[1] = vgetq_lane_s16(inVec0, 1);
284 pState[3] = out;
285 pState[2] = out1;
286
287
288 } else {
289 /* Store the updated state variables back into the pState array */
290 pState[0] = vgetq_lane_s16(inVec0, 1);
291 pState[1] = vgetq_lane_s16(inVec0, 0);
292 pState[3] = out1;
293 pState[2] = out;
294 }
295
296 pState += 4;
297 pCoeffs += 6;
298
299 /*
300 * The first stage goes from the input buffer to the output buffer.
301 * Subsequent stages occur in-place in the output buffer
302 */
303 pIn = pDst;
304 /*
305 * Reset to destination pointer
306 */
307 pOut = pDst;
308 }
309 while (--stages);
310 }
311 #else
arm_biquad_cascade_df1_q15(const arm_biquad_casd_df1_inst_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)312 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_q15(
313 const arm_biquad_casd_df1_inst_q15 * S,
314 const q15_t * pSrc,
315 q15_t * pDst,
316 uint32_t blockSize)
317 {
318
319
320 #if defined (ARM_MATH_DSP)
321
322 const q15_t *pIn = pSrc; /* Source pointer */
323 q15_t *pOut = pDst; /* Destination pointer */
324 q31_t in; /* Temporary variable to hold input value */
325 q31_t out; /* Temporary variable to hold output value */
326 q31_t b0; /* Temporary variable to hold bo value */
327 q31_t b1, a1; /* Filter coefficients */
328 q31_t state_in, state_out; /* Filter state variables */
329 q31_t acc_l, acc_h;
330 q63_t acc; /* Accumulator */
331 q15_t *pState = S->pState; /* State pointer */
332 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
333 int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
334 uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
335 int32_t uShift = (32 - lShift);
336
337 do
338 {
339 /* Read the b0 and 0 coefficients using SIMD */
340 b0 = read_q15x2_ia ((q15_t **) &pCoeffs);
341
342 /* Read the b1 and b2 coefficients using SIMD */
343 b1 = read_q15x2_ia ((q15_t **) &pCoeffs);
344
345 /* Read the a1 and a2 coefficients using SIMD */
346 a1 = read_q15x2_ia ((q15_t **) &pCoeffs);
347
348 /* Read the input state values from the state buffer: x[n-1], x[n-2] */
349 state_in = read_q15x2_ia (&pState);
350
351 /* Read the output state values from the state buffer: y[n-1], y[n-2] */
352 state_out = read_q15x2_da (&pState);
353
354 /* Apply loop unrolling and compute 2 output values simultaneously. */
355 /* The variable acc hold output values that are being computed:
356 *
357 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
358 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
359 */
360 sample = blockSize >> 1U;
361
362 /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
363 ** a second loop below computes the remaining 1 sample. */
364 while (sample > 0U)
365 {
366
367 /* Read the input */
368 in = read_q15x2_ia ((q15_t **) &pIn);
369
370 /* out = b0 * x[n] + 0 * 0 */
371 out = __SMUAD(b0, in);
372
373 /* acc += b1 * x[n-1] + b2 * x[n-2] + out */
374 acc = __SMLALD(b1, state_in, out);
375 /* acc += a1 * y[n-1] + a2 * y[n-2] */
376 acc = __SMLALD(a1, state_out, acc);
377
378 /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
379 /* Calc lower part of acc */
380 acc_l = acc & 0xffffffff;
381
382 /* Calc upper part of acc */
383 acc_h = (acc >> 32) & 0xffffffff;
384
385 /* Apply shift for lower part of acc and upper part of acc */
386 out = (uint32_t) acc_l >> lShift | acc_h << uShift;
387
388 out = __SSAT(out, 16);
389
390 /* Every time after the output is computed state should be updated. */
391 /* The states should be updated as: */
392 /* Xn2 = Xn1 */
393 /* Xn1 = Xn */
394 /* Yn2 = Yn1 */
395 /* Yn1 = acc */
396 /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
397 /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
398
399 #ifndef ARM_MATH_BIG_ENDIAN
400 state_in = __PKHBT(in, state_in, 16);
401 state_out = __PKHBT(out, state_out, 16);
402 #else
403 state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
404 state_out = __PKHBT(state_out >> 16, (out), 16);
405 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
406
407 /* out = b0 * x[n] + 0 * 0 */
408 out = __SMUADX(b0, in);
409 /* acc += b1 * x[n-1] + b2 * x[n-2] + out */
410 acc = __SMLALD(b1, state_in, out);
411 /* acc += a1 * y[n-1] + a2 * y[n-2] */
412 acc = __SMLALD(a1, state_out, acc);
413
414 /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
415 /* Calc lower part of acc */
416 acc_l = acc & 0xffffffff;
417
418 /* Calc upper part of acc */
419 acc_h = (acc >> 32) & 0xffffffff;
420
421 /* Apply shift for lower part of acc and upper part of acc */
422 out = (uint32_t) acc_l >> lShift | acc_h << uShift;
423
424 out = __SSAT(out, 16);
425
426 /* Store the output in the destination buffer. */
427 #ifndef ARM_MATH_BIG_ENDIAN
428 write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
429 #else
430 write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
431 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
432
433 /* Every time after the output is computed state should be updated. */
434 /* The states should be updated as: */
435 /* Xn2 = Xn1 */
436 /* Xn1 = Xn */
437 /* Yn2 = Yn1 */
438 /* Yn1 = acc */
439 /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
440 /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
441 #ifndef ARM_MATH_BIG_ENDIAN
442 state_in = __PKHBT(in >> 16, state_in, 16);
443 state_out = __PKHBT(out, state_out, 16);
444 #else
445 state_in = __PKHBT(state_in >> 16, in, 16);
446 state_out = __PKHBT(state_out >> 16, out, 16);
447 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
448
449 /* Decrement loop counter */
450 sample--;
451 }
452
453 /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
454 ** No loop unrolling is used. */
455
456 if ((blockSize & 0x1U) != 0U)
457 {
458 /* Read the input */
459 in = *pIn++;
460
461 /* out = b0 * x[n] + 0 * 0 */
462 #ifndef ARM_MATH_BIG_ENDIAN
463 out = __SMUAD(b0, in);
464 #else
465 out = __SMUADX(b0, in);
466 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
467
468 /* acc = b1 * x[n-1] + b2 * x[n-2] + out */
469 acc = __SMLALD(b1, state_in, out);
470 /* acc += a1 * y[n-1] + a2 * y[n-2] */
471 acc = __SMLALD(a1, state_out, acc);
472
473 /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
474 /* Calc lower part of acc */
475 acc_l = acc & 0xffffffff;
476
477 /* Calc upper part of acc */
478 acc_h = (acc >> 32) & 0xffffffff;
479
480 /* Apply shift for lower part of acc and upper part of acc */
481 out = (uint32_t) acc_l >> lShift | acc_h << uShift;
482
483 out = __SSAT(out, 16);
484
485 /* Store the output in the destination buffer. */
486 *pOut++ = (q15_t) out;
487
488 /* Every time after the output is computed state should be updated. */
489 /* The states should be updated as: */
490 /* Xn2 = Xn1 */
491 /* Xn1 = Xn */
492 /* Yn2 = Yn1 */
493 /* Yn1 = acc */
494 /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
495 /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
496 #ifndef ARM_MATH_BIG_ENDIAN
497 state_in = __PKHBT(in, state_in, 16);
498 state_out = __PKHBT(out, state_out, 16);
499 #else
500 state_in = __PKHBT(state_in >> 16, in, 16);
501 state_out = __PKHBT(state_out >> 16, out, 16);
502 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
503 }
504
505 /* The first stage goes from the input wire to the output wire. */
506 /* Subsequent numStages occur in-place in the output wire */
507 pIn = pDst;
508
509 /* Reset the output pointer */
510 pOut = pDst;
511
512 /* Store the updated state variables back into the state array */
513 write_q15x2_ia (&pState, state_in);
514 write_q15x2_ia (&pState, state_out);
515
516 /* Decrement loop counter */
517 stage--;
518
519 } while (stage > 0U);
520
521 #else
522
523 const q15_t *pIn = pSrc; /* Source pointer */
524 q15_t *pOut = pDst; /* Destination pointer */
525 q15_t b0, b1, b2, a1, a2; /* Filter coefficients */
526 q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
527 q15_t Xn; /* temporary input */
528 q63_t acc; /* Accumulator */
529 int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
530 q15_t *pState = S->pState; /* State pointer */
531 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
532 uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
533
534 do
535 {
536 /* Reading the coefficients */
537 b0 = *pCoeffs++;
538 pCoeffs++; // skip the 0 coefficient
539 b1 = *pCoeffs++;
540 b2 = *pCoeffs++;
541 a1 = *pCoeffs++;
542 a2 = *pCoeffs++;
543
544 /* Reading the state values */
545 Xn1 = pState[0];
546 Xn2 = pState[1];
547 Yn1 = pState[2];
548 Yn2 = pState[3];
549
550 /* The variables acc holds the output value that is computed:
551 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
552 */
553
554 sample = blockSize;
555
556 while (sample > 0U)
557 {
558 /* Read the input */
559 Xn = *pIn++;
560
561 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
562 /* acc = b0 * x[n] */
563 acc = (q31_t) b0 *Xn;
564
565 /* acc += b1 * x[n-1] */
566 acc += (q31_t) b1 *Xn1;
567 /* acc += b[2] * x[n-2] */
568 acc += (q31_t) b2 *Xn2;
569 /* acc += a1 * y[n-1] */
570 acc += (q31_t) a1 *Yn1;
571 /* acc += a2 * y[n-2] */
572 acc += (q31_t) a2 *Yn2;
573
574 /* The result is converted to 1.31 */
575 acc = __SSAT((acc >> shift), 16);
576
577 /* Every time after the output is computed state should be updated. */
578 /* The states should be updated as: */
579 /* Xn2 = Xn1 */
580 /* Xn1 = Xn */
581 /* Yn2 = Yn1 */
582 /* Yn1 = acc */
583 Xn2 = Xn1;
584 Xn1 = Xn;
585 Yn2 = Yn1;
586 Yn1 = (q15_t) acc;
587
588 /* Store the output in the destination buffer. */
589 *pOut++ = (q15_t) acc;
590
591 /* decrement the loop counter */
592 sample--;
593 }
594
595 /* The first stage goes from the input buffer to the output buffer. */
596 /* Subsequent stages occur in-place in the output buffer */
597 pIn = pDst;
598
599 /* Reset to destination pointer */
600 pOut = pDst;
601
602 /* Store the updated state variables back into the pState array */
603 *pState++ = Xn1;
604 *pState++ = Xn2;
605 *pState++ = Yn1;
606 *pState++ = Yn2;
607
608 } while (--stage);
609
610 #endif /* #if defined (ARM_MATH_DSP) */
611
612 }
613 #endif /* defined(ARM_MATH_MVEI) */
614
615 /**
616 @} end of BiquadCascadeDF1 group
617 */
618