1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df2T_f32.c
4 * Description: Processing function for floating-point transposed direct form II Biquad cascade filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup BiquadCascadeDF2T
37 @{
38 */
39
40 /**
41 @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
42 @param[in] S points to an instance of the filter data structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process
46 */
47 #if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
48 #include "arm_helium_utils.h"
49
arm_biquad_cascade_df2T_f32(const arm_biquad_cascade_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)50 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
51 const arm_biquad_cascade_df2T_instance_f32 * S,
52 const float32_t * pSrc,
53 float32_t * pDst,
54 uint32_t blockSize)
55 {
56 const float32_t *pIn = pSrc; /* source pointer */
57 float32_t Xn0, Xn1;
58 float32_t acc0, acc1;
59 float32_t *pOut = pDst; /* destination pointer */
60 float32_t *pState = S->pState; /* State pointer */
61 uint32_t sample, stage = S->numStages; /* loop counters */
62 float32_t const *pCurCoeffs = /* coefficient pointer */
63 (float32_t const *) S->pCoeffs;
64 f32x4_t b0Coeffs, a0Coeffs; /* Coefficients vector */
65 f32x4_t b1Coeffs, a1Coeffs; /* Modified coef. vector */
66 f32x4_t state; /* State vector */
67
68 do
69 {
70 /*
71 * temporary carry variable for feeding the 128-bit vector shifter
72 */
73 uint32_t tmp = 0;
74 /*
75 * Reading the coefficients
76 * b0Coeffs = {b0, b1, b2, x}
77 * a0Coeffs = { x, a1, a2, x}
78 */
79 b0Coeffs = vld1q(pCurCoeffs); pCurCoeffs+= 2;
80 a0Coeffs = vld1q(pCurCoeffs); pCurCoeffs+= 3;
81 /*
82 * Reading the state values
83 * state = {d1, d2, 0, 0}
84 */
85 state = *(f32x4_t *) pState;
86 state = vsetq_lane(0.0f, state, 2);
87 state = vsetq_lane(0.0f, state, 3);
88
89 /* b1Coeffs = {b0, b1, b2, x} */
90 /* b1Coeffs = { x, x, a1, a2} */
91 b1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)b0Coeffs, &tmp, 32);
92 a1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)a0Coeffs, &tmp, 32);
93
94 sample = blockSize / 2;
95
96 /* unrolled 2 x */
97 while (sample > 0U)
98 {
99 /*
100 * Read 2 inputs
101 */
102 Xn0 = *pIn++;
103 Xn1 = *pIn++;
104
105 /*
106 * 1st half:
107 * / acc0 \ / b0 \ / d1 \ / 0 \
108 * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
109 * | d2 | | b2 | | 0 | | a2 |
110 * \ x / \ x / \ x / \ x /
111 */
112
113 state = vfmaq(state, b0Coeffs, Xn0);
114 acc0 = vgetq_lane(state, 0);
115 state = vfmaq(state, a0Coeffs, acc0);
116 state = vsetq_lane(0.0f, state, 3);
117
118 /*
119 * 2nd half:
120 * same as 1st half, but all vector elements shifted down.
121 * / x \ / x \ / x \ / x \
122 * | acc1 | = | b0 | * Xn1 + | d1 | + | 0 | x acc1
123 * | d1 | | b1 | | d2 | | a1 |
124 * \ d2 / \ b2 / \ 0 / \ a2 /
125 */
126
127 state = vfmaq(state, b1Coeffs, Xn1);
128 acc1 = vgetq_lane(state, 1);
129 state = vfmaq(state, a1Coeffs, acc1);
130
131 /* move d1, d2 up + clearing */
132 /* expect dual move or long move */
133 state = vsetq_lane(vgetq_lane(state, 2), state, 0);
134 state = vsetq_lane(vgetq_lane(state, 3), state, 1);
135 state = vsetq_lane(0.0f, state, 2);
136 /*
137 * Store the results in the destination buffer.
138 */
139 *pOut++ = acc0;
140 *pOut++ = acc1;
141 /*
142 * decrement the loop counter
143 */
144 sample--;
145 }
146
147 /*
148 * tail handling
149 */
150 if (blockSize & 1)
151 {
152 Xn0 = *pIn++;
153 state = vfmaq(state, b0Coeffs, Xn0);
154 acc0 = vgetq_lane(state, 0);
155
156 state = vfmaq(state, a0Coeffs, acc0);
157 *pOut++ = acc0;
158 *pState++ = vgetq_lane(state, 1);
159 *pState++ = vgetq_lane(state, 2);
160 }
161 else
162 {
163 *pState++ = vgetq_lane(state, 0);
164 *pState++ = vgetq_lane(state, 1);
165 }
166 /*
167 * The current stage output is given as the input to the next stage
168 */
169 pIn = pDst;
170 /*
171 * Reset the output working pointer
172 */
173 pOut = pDst;
174 /*
175 * decrement the loop counter
176 */
177 stage--;
178 }
179 while (stage > 0U);
180 }
181 #else
182 #if defined(ARM_MATH_NEON)
183
arm_biquad_cascade_df2T_f32(const arm_biquad_cascade_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)184 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
185 const arm_biquad_cascade_df2T_instance_f32 * S,
186 const float32_t * pSrc,
187 float32_t * pDst,
188 uint32_t blockSize)
189 {
190 const float32_t *pIn = pSrc; /* source pointer */
191 float32_t *pOut = pDst; /* destination pointer */
192 float32_t *pState = S->pState; /* State pointer */
193 const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
194 float32_t acc1; /* accumulator */
195 float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
196 float32_t Xn1; /* temporary input */
197 float32_t d1, d2; /* state variables */
198 uint32_t sample, stageCnt,stage = S->numStages; /* loop counters */
199
200
201 float32x4_t XnV, YnV;
202 float32x4x2_t dV;
203 float32x4_t zeroV = vdupq_n_f32(0.0);
204 float32x4_t t1,t2,t3,t4,b1V,b2V,a1V,a2V,s;
205
206 /* Loop unrolling. Compute 4 outputs at a time */
207 stageCnt = stage >> 2;
208
209 while (stageCnt > 0U)
210 {
211 /* Reading the coefficients */
212 t1 = vld1q_f32(pCoeffs);
213 pCoeffs += 4;
214
215 t2 = vld1q_f32(pCoeffs);
216 pCoeffs += 4;
217
218 t3 = vld1q_f32(pCoeffs);
219 pCoeffs += 4;
220
221 t4 = vld1q_f32(pCoeffs);
222 pCoeffs += 4;
223
224 b1V = vld1q_f32(pCoeffs);
225 pCoeffs += 4;
226
227 b2V = vld1q_f32(pCoeffs);
228 pCoeffs += 4;
229
230 a1V = vld1q_f32(pCoeffs);
231 pCoeffs += 4;
232
233 a2V = vld1q_f32(pCoeffs);
234 pCoeffs += 4;
235
236 /* Reading the state values */
237 dV = vld2q_f32(pState);
238
239 sample = blockSize;
240
241 while (sample > 0U) {
242 /* y[n] = b0 * x[n] + d1 */
243 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
244 /* d2 = b2 * x[n] + a2 * y[n] */
245
246 XnV = vdupq_n_f32(*pIn++);
247
248 s = dV.val[0];
249 YnV = s;
250
251 s = vextq_f32(zeroV,dV.val[0],3);
252 YnV = vmlaq_f32(YnV, t1, s);
253
254 s = vextq_f32(zeroV,dV.val[0],2);
255 YnV = vmlaq_f32(YnV, t2, s);
256
257 s = vextq_f32(zeroV,dV.val[0],1);
258 YnV = vmlaq_f32(YnV, t3, s);
259
260 YnV = vmlaq_f32(YnV, t4, XnV);
261
262 s = vextq_f32(XnV,YnV,3);
263
264 dV.val[0] = vmlaq_f32(dV.val[1], s, b1V);
265 dV.val[0] = vmlaq_f32(dV.val[0], YnV, a1V);
266
267 dV.val[1] = vmulq_f32(s, b2V);
268 dV.val[1] = vmlaq_f32(dV.val[1], YnV, a2V);
269
270 *pOut++ = vgetq_lane_f32(YnV, 3) ;
271
272 sample--;
273 }
274
275 /* Store the updated state variables back into the state array */
276 vst2q_f32(pState,dV);
277 pState += 8;
278
279 /* The current stage output is given as the input to the next stage */
280 pIn = pDst;
281
282 /*Reset the output working pointer */
283 pOut = pDst;
284
285 /* decrement the loop counter */
286 stageCnt--;
287
288 }
289
290 /* Tail */
291 stageCnt = stage & 3;
292
293 while (stageCnt > 0U)
294 {
295 /* Reading the coefficients */
296 b0 = *pCoeffs++;
297 b1 = *pCoeffs++;
298 b2 = *pCoeffs++;
299 a1 = *pCoeffs++;
300 a2 = *pCoeffs++;
301
302 /*Reading the state values */
303 d1 = pState[0];
304 d2 = pState[1];
305
306 sample = blockSize;
307
308 while (sample > 0U)
309 {
310 /* Read the input */
311 Xn1 = *pIn++;
312
313 /* y[n] = b0 * x[n] + d1 */
314 acc1 = (b0 * Xn1) + d1;
315
316 /* Store the result in the accumulator in the destination buffer. */
317 *pOut++ = acc1;
318
319 /* Every time after the output is computed state should be updated. */
320 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
321 d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;
322
323 /* d2 = b2 * x[n] + a2 * y[n] */
324 d2 = (b2 * Xn1) + (a2 * acc1);
325
326 /* decrement the loop counter */
327 sample--;
328 }
329
330 /* Store the updated state variables back into the state array */
331 *pState++ = d1;
332 *pState++ = d2;
333
334 /* The current stage output is given as the input to the next stage */
335 pIn = pDst;
336
337 /*Reset the output working pointer */
338 pOut = pDst;
339
340 /* decrement the loop counter */
341 stageCnt--;
342 }
343 }
344 #else
345
arm_biquad_cascade_df2T_f32(const arm_biquad_cascade_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)346 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
347 const arm_biquad_cascade_df2T_instance_f32 * S,
348 const float32_t * pSrc,
349 float32_t * pDst,
350 uint32_t blockSize)
351 {
352 const float32_t *pIn = pSrc; /* Source pointer */
353 float32_t *pOut = pDst; /* Destination pointer */
354 float32_t *pState = S->pState; /* State pointer */
355 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
356 float32_t acc1; /* Accumulator */
357 float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
358 float32_t Xn1; /* Temporary input */
359 float32_t d1, d2; /* State variables */
360 uint32_t sample, stage = S->numStages; /* Loop counters */
361
362 do
363 {
364 /* Reading the coefficients */
365 b0 = pCoeffs[0];
366 b1 = pCoeffs[1];
367 b2 = pCoeffs[2];
368 a1 = pCoeffs[3];
369 a2 = pCoeffs[4];
370
371 /* Reading the state values */
372 d1 = pState[0];
373 d2 = pState[1];
374
375 pCoeffs += 5U;
376
377 #if defined (ARM_MATH_LOOPUNROLL)
378
379 /* Loop unrolling: Compute 16 outputs at a time */
380 sample = blockSize >> 4U;
381
382 while (sample > 0U) {
383
384 /* y[n] = b0 * x[n] + d1 */
385 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
386 /* d2 = b2 * x[n] + a2 * y[n] */
387
388 /* 1 */
389 Xn1 = *pIn++;
390
391 acc1 = b0 * Xn1 + d1;
392
393 d1 = b1 * Xn1 + d2;
394 d1 += a1 * acc1;
395
396 d2 = b2 * Xn1;
397 d2 += a2 * acc1;
398
399 *pOut++ = acc1;
400
401 /* 2 */
402 Xn1 = *pIn++;
403
404 acc1 = b0 * Xn1 + d1;
405
406 d1 = b1 * Xn1 + d2;
407 d1 += a1 * acc1;
408
409 d2 = b2 * Xn1;
410 d2 += a2 * acc1;
411
412 *pOut++ = acc1;
413
414 /* 3 */
415 Xn1 = *pIn++;
416
417 acc1 = b0 * Xn1 + d1;
418
419 d1 = b1 * Xn1 + d2;
420 d1 += a1 * acc1;
421
422 d2 = b2 * Xn1;
423 d2 += a2 * acc1;
424
425 *pOut++ = acc1;
426
427 /* 4 */
428 Xn1 = *pIn++;
429
430 acc1 = b0 * Xn1 + d1;
431
432 d1 = b1 * Xn1 + d2;
433 d1 += a1 * acc1;
434
435 d2 = b2 * Xn1;
436 d2 += a2 * acc1;
437
438 *pOut++ = acc1;
439
440 /* 5 */
441 Xn1 = *pIn++;
442
443 acc1 = b0 * Xn1 + d1;
444
445 d1 = b1 * Xn1 + d2;
446 d1 += a1 * acc1;
447
448 d2 = b2 * Xn1;
449 d2 += a2 * acc1;
450
451 *pOut++ = acc1;
452
453 /* 6 */
454 Xn1 = *pIn++;
455
456 acc1 = b0 * Xn1 + d1;
457
458 d1 = b1 * Xn1 + d2;
459 d1 += a1 * acc1;
460
461 d2 = b2 * Xn1;
462 d2 += a2 * acc1;
463
464 *pOut++ = acc1;
465
466 /* 7 */
467 Xn1 = *pIn++;
468
469 acc1 = b0 * Xn1 + d1;
470
471 d1 = b1 * Xn1 + d2;
472 d1 += a1 * acc1;
473
474 d2 = b2 * Xn1;
475 d2 += a2 * acc1;
476
477 *pOut++ = acc1;
478
479 /* 8 */
480 Xn1 = *pIn++;
481
482 acc1 = b0 * Xn1 + d1;
483
484 d1 = b1 * Xn1 + d2;
485 d1 += a1 * acc1;
486
487 d2 = b2 * Xn1;
488 d2 += a2 * acc1;
489
490 *pOut++ = acc1;
491
492 /* 9 */
493 Xn1 = *pIn++;
494
495 acc1 = b0 * Xn1 + d1;
496
497 d1 = b1 * Xn1 + d2;
498 d1 += a1 * acc1;
499
500 d2 = b2 * Xn1;
501 d2 += a2 * acc1;
502
503 *pOut++ = acc1;
504
505 /* 10 */
506 Xn1 = *pIn++;
507
508 acc1 = b0 * Xn1 + d1;
509
510 d1 = b1 * Xn1 + d2;
511 d1 += a1 * acc1;
512
513 d2 = b2 * Xn1;
514 d2 += a2 * acc1;
515
516 *pOut++ = acc1;
517
518 /* 11 */
519 Xn1 = *pIn++;
520
521 acc1 = b0 * Xn1 + d1;
522
523 d1 = b1 * Xn1 + d2;
524 d1 += a1 * acc1;
525
526 d2 = b2 * Xn1;
527 d2 += a2 * acc1;
528
529 *pOut++ = acc1;
530
531 /* 12 */
532 Xn1 = *pIn++;
533
534 acc1 = b0 * Xn1 + d1;
535
536 d1 = b1 * Xn1 + d2;
537 d1 += a1 * acc1;
538
539 d2 = b2 * Xn1;
540 d2 += a2 * acc1;
541
542 *pOut++ = acc1;
543
544 /* 13 */
545 Xn1 = *pIn++;
546
547 acc1 = b0 * Xn1 + d1;
548
549 d1 = b1 * Xn1 + d2;
550 d1 += a1 * acc1;
551
552 d2 = b2 * Xn1;
553 d2 += a2 * acc1;
554
555 *pOut++ = acc1;
556
557 /* 14 */
558 Xn1 = *pIn++;
559
560 acc1 = b0 * Xn1 + d1;
561
562 d1 = b1 * Xn1 + d2;
563 d1 += a1 * acc1;
564
565 d2 = b2 * Xn1;
566 d2 += a2 * acc1;
567
568 *pOut++ = acc1;
569
570 /* 15 */
571 Xn1 = *pIn++;
572
573 acc1 = b0 * Xn1 + d1;
574
575 d1 = b1 * Xn1 + d2;
576 d1 += a1 * acc1;
577
578 d2 = b2 * Xn1;
579 d2 += a2 * acc1;
580
581 *pOut++ = acc1;
582
583 /* 16 */
584 Xn1 = *pIn++;
585
586 acc1 = b0 * Xn1 + d1;
587
588 d1 = b1 * Xn1 + d2;
589 d1 += a1 * acc1;
590
591 d2 = b2 * Xn1;
592 d2 += a2 * acc1;
593
594 *pOut++ = acc1;
595
596 /* decrement loop counter */
597 sample--;
598 }
599
600 /* Loop unrolling: Compute remaining outputs */
601 sample = blockSize & 0xFU;
602
603 #else
604
605 /* Initialize blkCnt with number of samples */
606 sample = blockSize;
607
608 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
609
610 while (sample > 0U) {
611 Xn1 = *pIn++;
612
613 acc1 = b0 * Xn1 + d1;
614
615 d1 = b1 * Xn1 + d2;
616 d1 += a1 * acc1;
617
618 d2 = b2 * Xn1;
619 d2 += a2 * acc1;
620
621 *pOut++ = acc1;
622
623 /* decrement loop counter */
624 sample--;
625 }
626
627 /* Store the updated state variables back into the state array */
628 pState[0] = d1;
629 pState[1] = d2;
630
631 pState += 2U;
632
633 /* The current stage output is given as the input to the next stage */
634 pIn = pDst;
635
636 /* Reset the output working pointer */
637 pOut = pDst;
638
639 /* decrement loop counter */
640 stage--;
641
642 } while (stage > 0U);
643
644 }
645
646 #endif /* #if defined(ARM_MATH_NEON) */
647 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
648
649 /**
650 @} end of BiquadCascadeDF2T group
651 */
652