1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df2T_f32.c
4  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33 */
34 
35 /**
36   @addtogroup BiquadCascadeDF2T
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
42   @param[in]     S         points to an instance of the filter data structure
43   @param[in]     pSrc      points to the block of input data
44   @param[out]    pDst      points to the block of output data
45   @param[in]     blockSize number of samples to process
46  */
47 #if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
48 #include "arm_helium_utils.h"
49 
arm_biquad_cascade_df2T_f32(const arm_biquad_cascade_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)50 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
51   const arm_biquad_cascade_df2T_instance_f32 * S,
52   const float32_t * pSrc,
53         float32_t * pDst,
54         uint32_t blockSize)
55 {
56     const float32_t *pIn = pSrc;                  /*  source pointer            */
57     float32_t Xn0, Xn1;
58     float32_t acc0, acc1;
59     float32_t *pOut = pDst;                 /*  destination pointer       */
60     float32_t *pState = S->pState;          /*  State pointer             */
61     uint32_t  sample, stage = S->numStages; /*  loop counters             */
62     float32_t const *pCurCoeffs =          /*  coefficient pointer       */
63                 (float32_t const *) S->pCoeffs;
64     f32x4_t b0Coeffs, a0Coeffs;           /*  Coefficients vector       */
65     f32x4_t b1Coeffs, a1Coeffs;           /*  Modified coef. vector     */
66     f32x4_t state;                        /*  State vector              */
67 
68     do
69     {
70         /*
71          * temporary carry variable for feeding the 128-bit vector shifter
72          */
73         uint32_t  tmp = 0;
74         /*
75          * Reading the coefficients
76          * b0Coeffs = {b0, b1, b2, x}
77          * a0Coeffs = { x, a1, a2, x}
78          */
79         b0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs+= 2;
80         a0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs+= 3;
81         /*
82          * Reading the state values
83          * state = {d1, d2,  0, 0}
84          */
85         state = *(f32x4_t *) pState;
86         state = vsetq_lane(0.0f, state, 2);
87         state = vsetq_lane(0.0f, state, 3);
88 
89         /* b1Coeffs = {b0, b1, b2, x} */
90         /* b1Coeffs = { x, x, a1, a2} */
91         b1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)b0Coeffs, &tmp, 32);
92         a1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)a0Coeffs, &tmp, 32);
93 
94         sample = blockSize / 2;
95 
96         /* unrolled 2 x */
97         while (sample > 0U)
98         {
99             /*
100              * Read 2 inputs
101              */
102             Xn0 = *pIn++;
103             Xn1 = *pIn++;
104 
105             /*
106              * 1st half:
107              * / acc0 \   / b0 \         / d1 \   / 0  \
108              * |  d1  | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
109              * |  d2  |   | b2 |         | 0  |   | a2 |
110              * \  x   /   \ x  /         \ x  /   \ x  /
111              */
112 
113             state = vfmaq(state, b0Coeffs, Xn0);
114             acc0 = vgetq_lane(state, 0);
115             state = vfmaq(state, a0Coeffs, acc0);
116             state = vsetq_lane(0.0f, state, 3);
117 
118             /*
119              * 2nd half:
120              * same as 1st half, but all vector elements shifted down.
121              * /  x   \   / x  \         / x  \   / x  \
122              * | acc1 | = | b0 | * Xn1 + | d1 | + | 0  | x acc1
123              * |  d1  |   | b1 |         | d2 |   | a1 |
124              * \  d2  /   \ b2 /         \ 0  /   \ a2 /
125              */
126 
127             state = vfmaq(state, b1Coeffs, Xn1);
128             acc1 = vgetq_lane(state, 1);
129             state = vfmaq(state, a1Coeffs, acc1);
130 
131             /* move d1, d2 up + clearing */
132             /* expect dual move or long move */
133             state = vsetq_lane(vgetq_lane(state, 2), state, 0);
134             state = vsetq_lane(vgetq_lane(state, 3), state, 1);
135             state = vsetq_lane(0.0f, state, 2);
136             /*
137              * Store the results in the destination buffer.
138              */
139             *pOut++ = acc0;
140             *pOut++ = acc1;
141             /*
142              * decrement the loop counter
143              */
144             sample--;
145         }
146 
147         /*
148          * tail handling
149          */
150         if (blockSize & 1)
151         {
152             Xn0 = *pIn++;
153             state = vfmaq(state, b0Coeffs, Xn0);
154             acc0 = vgetq_lane(state, 0);
155 
156             state = vfmaq(state, a0Coeffs, acc0);
157             *pOut++ = acc0;
158             *pState++ = vgetq_lane(state, 1);
159             *pState++ = vgetq_lane(state, 2);
160         }
161         else
162         {
163             *pState++ = vgetq_lane(state, 0);
164             *pState++ = vgetq_lane(state, 1);
165         }
166         /*
167          * The current stage output is given as the input to the next stage
168          */
169         pIn = pDst;
170         /*
171          * Reset the output working pointer
172          */
173         pOut = pDst;
174         /*
175          * decrement the loop counter
176          */
177         stage--;
178     }
179     while (stage > 0U);
180 }
181 #else
182 #if defined(ARM_MATH_NEON)
183 
arm_biquad_cascade_df2T_f32(const arm_biquad_cascade_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)184 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
185   const arm_biquad_cascade_df2T_instance_f32 * S,
186   const float32_t * pSrc,
187         float32_t * pDst,
188         uint32_t blockSize)
189 {
190    const float32_t *pIn = pSrc;                   /*  source pointer            */
191    float32_t *pOut = pDst;                        /*  destination pointer       */
192    float32_t *pState = S->pState;                 /*  State pointer             */
193    const float32_t *pCoeffs = S->pCoeffs;         /*  coefficient pointer       */
194    float32_t acc1;                                /*  accumulator               */
195    float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
196    float32_t Xn1;                                 /*  temporary input           */
197    float32_t d1, d2;                              /*  state variables           */
198    uint32_t sample, stageCnt,stage = S->numStages;         /*   loop counters   */
199 
200 
201    float32x4_t XnV, YnV;
202    float32x4x2_t dV;
203    float32x4_t zeroV = vdupq_n_f32(0.0);
204    float32x4_t t1,t2,t3,t4,b1V,b2V,a1V,a2V,s;
205 
206    /* Loop unrolling. Compute 4 outputs at a time */
207    stageCnt = stage >> 2;
208 
209    while (stageCnt > 0U)
210    {
211       /* Reading the coefficients */
212       t1 = vld1q_f32(pCoeffs);
213       pCoeffs += 4;
214 
215       t2 = vld1q_f32(pCoeffs);
216       pCoeffs += 4;
217 
218       t3 = vld1q_f32(pCoeffs);
219       pCoeffs += 4;
220 
221       t4 = vld1q_f32(pCoeffs);
222       pCoeffs += 4;
223 
224       b1V = vld1q_f32(pCoeffs);
225       pCoeffs += 4;
226 
227       b2V = vld1q_f32(pCoeffs);
228       pCoeffs += 4;
229 
230       a1V = vld1q_f32(pCoeffs);
231       pCoeffs += 4;
232 
233       a2V = vld1q_f32(pCoeffs);
234       pCoeffs += 4;
235 
236       /* Reading the state values */
237       dV = vld2q_f32(pState);
238 
239       sample = blockSize;
240 
241       while (sample > 0U) {
242          /* y[n] = b0 * x[n] + d1 */
243          /* d1 = b1 * x[n] + a1 * y[n] + d2 */
244          /* d2 = b2 * x[n] + a2 * y[n] */
245 
246          XnV = vdupq_n_f32(*pIn++);
247 
248          s = dV.val[0];
249          YnV = s;
250 
251          s = vextq_f32(zeroV,dV.val[0],3);
252          YnV = vmlaq_f32(YnV, t1, s);
253 
254          s = vextq_f32(zeroV,dV.val[0],2);
255          YnV = vmlaq_f32(YnV, t2, s);
256 
257          s = vextq_f32(zeroV,dV.val[0],1);
258          YnV = vmlaq_f32(YnV, t3, s);
259 
260          YnV = vmlaq_f32(YnV, t4, XnV);
261 
262          s = vextq_f32(XnV,YnV,3);
263 
264          dV.val[0] = vmlaq_f32(dV.val[1], s, b1V);
265          dV.val[0] = vmlaq_f32(dV.val[0], YnV, a1V);
266 
267          dV.val[1] = vmulq_f32(s, b2V);
268          dV.val[1] = vmlaq_f32(dV.val[1], YnV, a2V);
269 
270          *pOut++ = vgetq_lane_f32(YnV, 3) ;
271 
272          sample--;
273       }
274 
275       /* Store the updated state variables back into the state array */
276       vst2q_f32(pState,dV);
277       pState += 8;
278 
279       /* The current stage output is given as the input to the next stage */
280       pIn = pDst;
281 
282       /*Reset the output working pointer */
283       pOut = pDst;
284 
285       /* decrement the loop counter */
286       stageCnt--;
287 
288    }
289 
290    /* Tail */
291    stageCnt = stage & 3;
292 
293    while (stageCnt > 0U)
294    {
295       /* Reading the coefficients */
296       b0 = *pCoeffs++;
297       b1 = *pCoeffs++;
298       b2 = *pCoeffs++;
299       a1 = *pCoeffs++;
300       a2 = *pCoeffs++;
301 
302       /*Reading the state values */
303       d1 = pState[0];
304       d2 = pState[1];
305 
306       sample = blockSize;
307 
308       while (sample > 0U)
309       {
310          /* Read the input */
311          Xn1 = *pIn++;
312 
313          /* y[n] = b0 * x[n] + d1 */
314          acc1 = (b0 * Xn1) + d1;
315 
316          /* Store the result in the accumulator in the destination buffer. */
317          *pOut++ = acc1;
318 
319          /* Every time after the output is computed state should be updated. */
320          /* d1 = b1 * x[n] + a1 * y[n] + d2 */
321          d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;
322 
323          /* d2 = b2 * x[n] + a2 * y[n] */
324          d2 = (b2 * Xn1) + (a2 * acc1);
325 
326          /* decrement the loop counter */
327          sample--;
328       }
329 
330       /* Store the updated state variables back into the state array */
331       *pState++ = d1;
332       *pState++ = d2;
333 
334       /* The current stage output is given as the input to the next stage */
335       pIn = pDst;
336 
337       /*Reset the output working pointer */
338       pOut = pDst;
339 
340       /* decrement the loop counter */
341       stageCnt--;
342    }
343 }
344 #else
345 
arm_biquad_cascade_df2T_f32(const arm_biquad_cascade_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)346 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
347   const arm_biquad_cascade_df2T_instance_f32 * S,
348   const float32_t * pSrc,
349         float32_t * pDst,
350         uint32_t blockSize)
351 {
352   const float32_t *pIn = pSrc;                         /* Source pointer */
353         float32_t *pOut = pDst;                        /* Destination pointer */
354         float32_t *pState = S->pState;                 /* State pointer */
355   const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
356         float32_t acc1;                                /* Accumulator */
357         float32_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
358         float32_t Xn1;                                 /* Temporary input */
359         float32_t d1, d2;                              /* State variables */
360         uint32_t sample, stage = S->numStages;         /* Loop counters */
361 
362   do
363   {
364      /* Reading the coefficients */
365      b0 = pCoeffs[0];
366      b1 = pCoeffs[1];
367      b2 = pCoeffs[2];
368      a1 = pCoeffs[3];
369      a2 = pCoeffs[4];
370 
371      /* Reading the state values */
372      d1 = pState[0];
373      d2 = pState[1];
374 
375      pCoeffs += 5U;
376 
377 #if defined (ARM_MATH_LOOPUNROLL)
378 
379      /* Loop unrolling: Compute 16 outputs at a time */
380      sample = blockSize >> 4U;
381 
382      while (sample > 0U) {
383 
384        /* y[n] = b0 * x[n] + d1 */
385        /* d1 = b1 * x[n] + a1 * y[n] + d2 */
386        /* d2 = b2 * x[n] + a2 * y[n] */
387 
388 /*  1 */
389        Xn1 = *pIn++;
390 
391        acc1 = b0 * Xn1 + d1;
392 
393        d1 = b1 * Xn1 + d2;
394        d1 += a1 * acc1;
395 
396        d2 = b2 * Xn1;
397        d2 += a2 * acc1;
398 
399        *pOut++ = acc1;
400 
401 /*  2 */
402          Xn1 = *pIn++;
403 
404         acc1 = b0 * Xn1 + d1;
405 
406         d1 = b1 * Xn1 + d2;
407         d1 += a1 * acc1;
408 
409         d2 = b2 * Xn1;
410         d2 += a2 * acc1;
411 
412         *pOut++ = acc1;
413 
414 /*  3 */
415          Xn1 = *pIn++;
416 
417         acc1 = b0 * Xn1 + d1;
418 
419         d1 = b1 * Xn1 + d2;
420         d1 += a1 * acc1;
421 
422         d2 = b2 * Xn1;
423         d2 += a2 * acc1;
424 
425         *pOut++ = acc1;
426 
427 /*  4 */
428          Xn1 = *pIn++;
429 
430         acc1 = b0 * Xn1 + d1;
431 
432         d1 = b1 * Xn1 + d2;
433         d1 += a1 * acc1;
434 
435         d2 = b2 * Xn1;
436         d2 += a2 * acc1;
437 
438         *pOut++ = acc1;
439 
440 /*  5 */
441          Xn1 = *pIn++;
442 
443         acc1 = b0 * Xn1 + d1;
444 
445         d1 = b1 * Xn1 + d2;
446         d1 += a1 * acc1;
447 
448         d2 = b2 * Xn1;
449         d2 += a2 * acc1;
450 
451         *pOut++ = acc1;
452 
453 /*  6 */
454          Xn1 = *pIn++;
455 
456         acc1 = b0 * Xn1 + d1;
457 
458         d1 = b1 * Xn1 + d2;
459         d1 += a1 * acc1;
460 
461         d2 = b2 * Xn1;
462         d2 += a2 * acc1;
463 
464         *pOut++ = acc1;
465 
466 /*  7 */
467          Xn1 = *pIn++;
468 
469         acc1 = b0 * Xn1 + d1;
470 
471         d1 = b1 * Xn1 + d2;
472         d1 += a1 * acc1;
473 
474         d2 = b2 * Xn1;
475         d2 += a2 * acc1;
476 
477         *pOut++ = acc1;
478 
479 /*  8 */
480          Xn1 = *pIn++;
481 
482         acc1 = b0 * Xn1 + d1;
483 
484         d1 = b1 * Xn1 + d2;
485         d1 += a1 * acc1;
486 
487         d2 = b2 * Xn1;
488         d2 += a2 * acc1;
489 
490         *pOut++ = acc1;
491 
492 /*  9 */
493          Xn1 = *pIn++;
494 
495         acc1 = b0 * Xn1 + d1;
496 
497         d1 = b1 * Xn1 + d2;
498         d1 += a1 * acc1;
499 
500         d2 = b2 * Xn1;
501         d2 += a2 * acc1;
502 
503         *pOut++ = acc1;
504 
505 /* 10 */
506          Xn1 = *pIn++;
507 
508         acc1 = b0 * Xn1 + d1;
509 
510         d1 = b1 * Xn1 + d2;
511         d1 += a1 * acc1;
512 
513         d2 = b2 * Xn1;
514         d2 += a2 * acc1;
515 
516         *pOut++ = acc1;
517 
518 /* 11 */
519          Xn1 = *pIn++;
520 
521         acc1 = b0 * Xn1 + d1;
522 
523         d1 = b1 * Xn1 + d2;
524         d1 += a1 * acc1;
525 
526         d2 = b2 * Xn1;
527         d2 += a2 * acc1;
528 
529         *pOut++ = acc1;
530 
531 /* 12 */
532          Xn1 = *pIn++;
533 
534         acc1 = b0 * Xn1 + d1;
535 
536         d1 = b1 * Xn1 + d2;
537         d1 += a1 * acc1;
538 
539         d2 = b2 * Xn1;
540         d2 += a2 * acc1;
541 
542         *pOut++ = acc1;
543 
544 /* 13 */
545          Xn1 = *pIn++;
546 
547         acc1 = b0 * Xn1 + d1;
548 
549         d1 = b1 * Xn1 + d2;
550         d1 += a1 * acc1;
551 
552         d2 = b2 * Xn1;
553         d2 += a2 * acc1;
554 
555         *pOut++ = acc1;
556 
557 /* 14 */
558          Xn1 = *pIn++;
559 
560         acc1 = b0 * Xn1 + d1;
561 
562         d1 = b1 * Xn1 + d2;
563         d1 += a1 * acc1;
564 
565         d2 = b2 * Xn1;
566         d2 += a2 * acc1;
567 
568         *pOut++ = acc1;
569 
570 /* 15 */
571          Xn1 = *pIn++;
572 
573         acc1 = b0 * Xn1 + d1;
574 
575         d1 = b1 * Xn1 + d2;
576         d1 += a1 * acc1;
577 
578         d2 = b2 * Xn1;
579         d2 += a2 * acc1;
580 
581         *pOut++ = acc1;
582 
583 /* 16 */
584          Xn1 = *pIn++;
585 
586         acc1 = b0 * Xn1 + d1;
587 
588         d1 = b1 * Xn1 + d2;
589         d1 += a1 * acc1;
590 
591         d2 = b2 * Xn1;
592         d2 += a2 * acc1;
593 
594         *pOut++ = acc1;
595 
596         /* decrement loop counter */
597         sample--;
598       }
599 
600       /* Loop unrolling: Compute remaining outputs */
601       sample = blockSize & 0xFU;
602 
603 #else
604 
605       /* Initialize blkCnt with number of samples */
606       sample = blockSize;
607 
608 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
609 
610       while (sample > 0U) {
611         Xn1 = *pIn++;
612 
613         acc1 = b0 * Xn1 + d1;
614 
615         d1 = b1 * Xn1 + d2;
616         d1 += a1 * acc1;
617 
618         d2 = b2 * Xn1;
619         d2 += a2 * acc1;
620 
621         *pOut++ = acc1;
622 
623         /* decrement loop counter */
624         sample--;
625       }
626 
627       /* Store the updated state variables back into the state array */
628       pState[0] = d1;
629       pState[1] = d2;
630 
631       pState += 2U;
632 
633       /* The current stage output is given as the input to the next stage */
634       pIn = pDst;
635 
636       /* Reset the output working pointer */
637       pOut = pDst;
638 
639       /* decrement loop counter */
640       stage--;
641 
642    } while (stage > 0U);
643 
644 }
645 
646 #endif /* #if defined(ARM_MATH_NEON) */
647 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
648 
649 /**
650   @} end of BiquadCascadeDF2T group
651  */
652