1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df2T_f64.c
4  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
5  *
6  * $Date:        03 June 2022
7  * $Revision:    V1.9.1
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33 */
34 
35 /**
36   @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
37 
38   This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
39   The filters are implemented as a cascade of second order Biquad sections.
40   These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
41   Only floating-point data is supported.
42 
43   This function operate on blocks of input and output data and each call to the function
44   processes <code>blockSize</code> samples through the filter.
45   <code>pSrc</code> points to the array of input data and
46   <code>pDst</code> points to the array of output data.
47   Both arrays contain <code>blockSize</code> values.
48 
49   @par           Algorithm
50                    Each Biquad stage implements a second order filter using the difference equation:
51   <pre>
52      y[n] = b0 * x[n] + d1
53      d1 = b1 * x[n] + a1 * y[n] + d2
54      d2 = b2 * x[n] + a2 * y[n]
55   </pre>
56                    where d1 and d2 represent the two state values.
57   @par
58                    A Biquad filter using a transposed Direct Form II structure is shown below.
59                    \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
60                    Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
61                    Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
62                    Pay careful attention to the sign of the feedback coefficients.
63                    Some design tools flip the sign of the feedback coefficients:
64   <pre>
65      y[n] = b0 * x[n] + d1;
66      d1 = b1 * x[n] - a1 * y[n] + d2;
67      d2 = b2 * x[n] - a2 * y[n];
68   </pre>
69                    In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
70   @par
71                    Higher order filters are realized as a cascade of second order sections.
72                    <code>numStages</code> refers to the number of second order stages used.
73                    For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
74                    A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
75                    coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
76   @par
77                    <code>pState</code> points to the state variable array.
78                    Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
79                    The state variables are arranged in the <code>pState</code> array as:
80   <pre>
81       {d11, d12, d21, d22, ...}
82   </pre>
83                    where <code>d1x</code> refers to the state variables for the first Biquad and
84                    <code>d2x</code> refers to the state variables for the second Biquad.
85                    The state array has a total length of <code>2*numStages</code> values.
86                    The state variables are updated after each block of data is processed; the coefficients are untouched.
87   @par
88                    The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
89                    The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
90                    That is why the Direct Form I structure supports Q15 and Q31 data types.
91                    The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
92                    Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
93                    The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
94 
95   @par           Instance Structure
96                    The coefficients and state variables for a filter are stored together in an instance data structure.
97                    A separate instance structure must be defined for each filter.
98                    Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
99 
100   @par           Init Functions
101                    There is also an associated initialization function.
102                    The initialization function performs following operations:
103                    - Sets the values of the internal structure fields.
104                    - Zeros out the values in the state buffer.
105                    To do this manually without calling the init function, assign the follow subfields of the instance structure:
106                    numStages, pCoeffs, pState. Also set all of the values in pState to zero.
107   @par
108                    Use of the initialization function is optional except for the vectorized versions (Helium and Neon).
109                    However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
110                    To place an instance structure into a const data section, the instance structure must be manually initialized.
111                    Set the values in the state buffer to zeros before static initialization.
112                    For example, to statically initialize the instance structure use
113   <pre>
114       arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
115       arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
116   </pre>
117                    where <code>numStages</code> is the number of Biquad stages in the filter;
118                    <code>pState</code> is the address of the state buffer.
119                    <code>pCoeffs</code> is the address of the coefficient buffer;
120 
121   @par           Neon version
122                   For Neon version, the function arm_biquad_cascade_df2T_compute_coefs_x must be
123                   used in addition to arm_biquad_cascade_df2T_init_x.
124 
125                   See the documentation of arm_biquad_cascade_df2T_init_x for more details.
126 
127 */
128 
129 /**
130   @addtogroup BiquadCascadeDF2T
131   @{
132  */
133 
134 /**
135  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
136  @param[in]     S         points to an instance of the filter data structure
137  @param[in]     pSrc      points to the block of input data
138  @param[out]    pDst      points to the block of output data
139  @param[in]     blockSize number of samples to process
140  */
141 
142 
143 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
arm_biquad_cascade_df2T_f64(const arm_biquad_cascade_df2T_instance_f64 * S,const float64_t * pSrc,float64_t * pDst,uint32_t blockSize)144 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f64(
145     const arm_biquad_cascade_df2T_instance_f64 * S,
146     const float64_t * pSrc,
147     float64_t * pDst,
148     uint32_t  blockSize)
149 {
150 
151 
152 
153     const float64_t *pIn = pSrc;                  /*  source pointer            */
154     float64_t Xn0;
155     float64_t acc0;
156     float64_t *pOut = pDst;                 /*  destination pointer       */
157     float64_t *pState = S->pState;          /*  State pointer             */
158     uint32_t  sample, stage = S->numStages; /*  loop counters             */
159     float64_t const *pCurCoeffs =          /*  coefficient pointer       */
160     (float64_t const *) S->pCoeffs;
161     float64x2_t b0Coeffs, a0Coeffs;           /*  Coefficients vector       */
162     float64x2_t state;                        /*  State vector*/
163     float64x2_t zeroV = vdupq_n_f64(0);
164 
165     float64_t b0 ;
166 
167 
168     do
169     {
170 
171         /* Reading the coefficients */
172         b0 = *pCurCoeffs++ ;
173         b0Coeffs = vld1q_f64(pCurCoeffs);
174         pCurCoeffs += 2 ;
175         a0Coeffs = vld1q_f64(pCurCoeffs);
176         pCurCoeffs +=2 ;
177 
178         state = vld1q_f64(pState);
179 
180         sample = blockSize >> 1U;
181         while (sample > 0U)
182         {
183 
184             /* y[n] = b0 * x[n] + d1 */
185             /* d1 = b1 * x[n] + a1 * y[n] + d2 */
186             /* d2 = b2 * x[n] + a2 * y[n] */
187 
188             Xn0 = *pIn++ ;
189 
190             /* Calculation of acc0*/
191             acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
192 
193 
194             /*
195              *  final                  initial
196              *  state   b0Coeffs        state  a0Coeffs
197              *   |        |              |        |
198              *   __       __             __       __
199              *  /  \     /  \           /  \     /  \
200              * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
201              * | d2 |   | b2 |         | 0  |   | a2 |
202              *  \__/     \__/           \__/     \__/
203              */
204 
205             /* state -> initial state (see above) */
206 
207             state = vextq_f64(state, zeroV, 1);
208 
209             /* Calculation of final state */
210             state = vfmaq_n_f64(state, b0Coeffs, Xn0);
211             state = vfmaq_n_f64(state, a0Coeffs, acc0);
212 
213             *pOut++ = acc0 ;
214 
215             /* y[n] = b0 * x[n] + d1 */
216             /* d1 = b1 * x[n] + a1 * y[n] + d2 */
217             /* d2 = b2 * x[n] + a2 * y[n] */
218 
219             Xn0 = *pIn++ ;
220 
221             /* Calculation of acc0*/
222             acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
223 
224 
225             /*
226              *  final                  initial
227              *  state   b0Coeffs        state  a0Coeffs
228              *   |        |              |        |
229              *   __       __             __       __
230              *  /  \     /  \           /  \     /  \
231              * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
232              * | d2 |   | b2 |         | 0  |   | a2 |
233              *  \__/     \__/           \__/     \__/
234              */
235 
236             /* state -> initial state (see above) */
237 
238             state = vextq_f64(state, zeroV, 1);
239 
240             /* Calculation of final state */
241             state = vfmaq_n_f64(state, b0Coeffs, Xn0);
242             state = vfmaq_n_f64(state, a0Coeffs, acc0);
243 
244             *pOut++ = acc0;
245             sample--;
246         }
247         sample = blockSize & 1 ;
248         while (sample > 0U)
249         {
250 
251             /* y[n] = b0 * x[n] + d1 */
252             /* d1 = b1 * x[n] + a1 * y[n] + d2 */
253             /* d2 = b2 * x[n] + a2 * y[n] */
254 
255             Xn0 = *pIn++ ;
256 
257             /* Calculation of acc0*/
258             acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
259 
260 
261             /*
262              *  final                  initial
263              *  state   b0Coeffs        state  a0Coeffs
264              *   |        |              |        |
265              *   __       __             __       __
266              *  /  \     /  \           /  \     /  \
267              * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
268              * | d2 |   | b2 |         | 0  |   | a2 |
269              *  \__/     \__/           \__/     \__/
270              */
271 
272             /* state -> initial state (see above) */
273 
274             state = vextq_f64(state, zeroV, 1);
275 
276             /* Calculation of final state */
277             state = vfmaq_n_f64(state, b0Coeffs, Xn0);
278             state = vfmaq_n_f64(state, a0Coeffs, acc0);
279 
280             *pOut++ = acc0 ;
281             sample--;
282         }
283         /* Store the updated state variables back into the state array */
284         pState[0] = vgetq_lane_f64(state, 0);
285         pState[1] = vgetq_lane_f64(state, 1);
286 
287         pState += 2U;
288 
289         /* The current stage output is given as the input to the next stage */
290         pIn = pDst;
291 
292         /* Reset the output working pointer */
293         pOut = pDst;
294 
295         stage--;
296 
297     } while (stage > 0U);
298 
299 }
300 #else
301 
arm_biquad_cascade_df2T_f64(const arm_biquad_cascade_df2T_instance_f64 * S,const float64_t * pSrc,float64_t * pDst,uint32_t blockSize)302 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f64(
303     const arm_biquad_cascade_df2T_instance_f64 * S,
304     const float64_t * pSrc,
305     float64_t * pDst,
306     uint32_t blockSize)
307 {
308 
309     const float64_t *pIn = pSrc;                   /* Source pointer */
310     float64_t *pOut = pDst;                        /* Destination pointer */
311     float64_t *pState = S->pState;                 /* State pointer */
312     const float64_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
313     float64_t acc1;                                /* Accumulator */
314     float64_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
315     float64_t Xn1;                                 /* Temporary input */
316     float64_t d1, d2;                              /* State variables */
317     uint32_t sample, stage = S->numStages;         /* Loop counters */
318 
319 
320     do
321     {
322         /* Reading the coefficients */
323         b0 = pCoeffs[0];
324         b1 = pCoeffs[1];
325         b2 = pCoeffs[2];
326         a1 = pCoeffs[3];
327         a2 = pCoeffs[4];
328 
329         /* Reading the state values */
330         d1 = pState[0];
331         d2 = pState[1];
332 
333         pCoeffs += 5U;
334 
335 #if defined (ARM_MATH_LOOPUNROLL)
336 
337         /* Loop unrolling: Compute 16 outputs at a time */
338         sample = blockSize >> 4U;
339 
340         while (sample > 0U) {
341 
342             /* y[n] = b0 * x[n] + d1 */
343             /* d1 = b1 * x[n] + a1 * y[n] + d2 */
344             /* d2 = b2 * x[n] + a2 * y[n] */
345 
346             /*  1 */
347             Xn1 = *pIn++;
348 
349             acc1 = b0 * Xn1 + d1;
350 
351             d1 = b1 * Xn1 + d2;
352             d1 += a1 * acc1;
353 
354             d2 = b2 * Xn1;
355             d2 += a2 * acc1;
356 
357             *pOut++ = acc1;
358 
359 
360             /*  2 */
361             Xn1 = *pIn++;
362 
363             acc1 = b0 * Xn1 + d1;
364 
365             d1 = b1 * Xn1 + d2;
366             d1 += a1 * acc1;
367 
368             d2 = b2 * Xn1;
369             d2 += a2 * acc1;
370 
371             *pOut++ = acc1;
372 
373             /*  3 */
374             Xn1 = *pIn++;
375 
376             acc1 = b0 * Xn1 + d1;
377 
378             d1 = b1 * Xn1 + d2;
379             d1 += a1 * acc1;
380 
381             d2 = b2 * Xn1;
382             d2 += a2 * acc1;
383 
384             *pOut++ = acc1;
385 
386             /*  4 */
387             Xn1 = *pIn++;
388 
389             acc1 = b0 * Xn1 + d1;
390 
391             d1 = b1 * Xn1 + d2;
392             d1 += a1 * acc1;
393 
394             d2 = b2 * Xn1;
395             d2 += a2 * acc1;
396 
397             *pOut++ = acc1;
398 
399             /*  5 */
400             Xn1 = *pIn++;
401 
402             acc1 = b0 * Xn1 + d1;
403 
404             d1 = b1 * Xn1 + d2;
405             d1 += a1 * acc1;
406 
407             d2 = b2 * Xn1;
408             d2 += a2 * acc1;
409 
410             *pOut++ = acc1;
411 
412             /*  6 */
413             Xn1 = *pIn++;
414 
415             acc1 = b0 * Xn1 + d1;
416 
417             d1 = b1 * Xn1 + d2;
418             d1 += a1 * acc1;
419 
420             d2 = b2 * Xn1;
421             d2 += a2 * acc1;
422 
423             *pOut++ = acc1;
424 
425             /*  7 */
426             Xn1 = *pIn++;
427 
428             acc1 = b0 * Xn1 + d1;
429 
430             d1 = b1 * Xn1 + d2;
431             d1 += a1 * acc1;
432 
433             d2 = b2 * Xn1;
434             d2 += a2 * acc1;
435 
436             *pOut++ = acc1;
437 
438             /*  8 */
439             Xn1 = *pIn++;
440 
441             acc1 = b0 * Xn1 + d1;
442 
443             d1 = b1 * Xn1 + d2;
444             d1 += a1 * acc1;
445 
446             d2 = b2 * Xn1;
447             d2 += a2 * acc1;
448 
449             *pOut++ = acc1;
450 
451             /*  9 */
452             Xn1 = *pIn++;
453 
454             acc1 = b0 * Xn1 + d1;
455 
456             d1 = b1 * Xn1 + d2;
457             d1 += a1 * acc1;
458 
459             d2 = b2 * Xn1;
460             d2 += a2 * acc1;
461 
462             *pOut++ = acc1;
463 
464             /* 10 */
465             Xn1 = *pIn++;
466 
467             acc1 = b0 * Xn1 + d1;
468 
469             d1 = b1 * Xn1 + d2;
470             d1 += a1 * acc1;
471 
472             d2 = b2 * Xn1;
473             d2 += a2 * acc1;
474 
475             *pOut++ = acc1;
476 
477             /* 11 */
478             Xn1 = *pIn++;
479 
480             acc1 = b0 * Xn1 + d1;
481 
482             d1 = b1 * Xn1 + d2;
483             d1 += a1 * acc1;
484 
485             d2 = b2 * Xn1;
486             d2 += a2 * acc1;
487 
488             *pOut++ = acc1;
489 
490             /* 12 */
491             Xn1 = *pIn++;
492 
493             acc1 = b0 * Xn1 + d1;
494 
495             d1 = b1 * Xn1 + d2;
496             d1 += a1 * acc1;
497 
498             d2 = b2 * Xn1;
499             d2 += a2 * acc1;
500 
501             *pOut++ = acc1;
502 
503             /* 13 */
504             Xn1 = *pIn++;
505 
506             acc1 = b0 * Xn1 + d1;
507 
508             d1 = b1 * Xn1 + d2;
509             d1 += a1 * acc1;
510 
511             d2 = b2 * Xn1;
512             d2 += a2 * acc1;
513 
514             *pOut++ = acc1;
515 
516             /* 14 */
517             Xn1 = *pIn++;
518 
519             acc1 = b0 * Xn1 + d1;
520 
521             d1 = b1 * Xn1 + d2;
522             d1 += a1 * acc1;
523 
524             d2 = b2 * Xn1;
525             d2 += a2 * acc1;
526 
527             *pOut++ = acc1;
528 
529             /* 15 */
530             Xn1 = *pIn++;
531 
532             acc1 = b0 * Xn1 + d1;
533 
534             d1 = b1 * Xn1 + d2;
535             d1 += a1 * acc1;
536 
537             d2 = b2 * Xn1;
538             d2 += a2 * acc1;
539 
540             *pOut++ = acc1;
541 
542             /* 16 */
543             Xn1 = *pIn++;
544 
545             acc1 = b0 * Xn1 + d1;
546 
547             d1 = b1 * Xn1 + d2;
548             d1 += a1 * acc1;
549 
550             d2 = b2 * Xn1;
551             d2 += a2 * acc1;
552 
553             *pOut++ = acc1;
554 
555             /* decrement loop counter */
556             sample--;
557         }
558 
559         /* Loop unrolling: Compute remaining outputs */
560         sample = blockSize & 0xFU;
561 
562 #else
563 
564         /* Initialize blkCnt with number of samples */
565         sample = blockSize;
566 
567 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
568 
569         while (sample > 0U) {
570             Xn1 = *pIn++;
571 
572             acc1 = b0 * Xn1 + d1;
573 
574             d1 = b1 * Xn1 + d2;
575             d1 += a1 * acc1;
576 
577             d2 = b2 * Xn1;
578             d2 += a2 * acc1;
579 
580             *pOut++ = acc1;
581 
582             /* decrement loop counter */
583             sample--;
584         }
585 
586         /* Store the updated state variables back into the state array */
587         pState[0] = d1;
588         pState[1] = d2;
589 
590         pState += 2U;
591 
592         /* The current stage output is given as the input to the next stage */
593         pIn = pDst;
594 
595         /* Reset the output working pointer */
596         pOut = pDst;
597 
598         /* decrement loop counter */
599         stage--;
600 
601     } while (stage > 0U);
602 
603 }
604 #endif
605 
606 
607 
608 /**
609  @} end of BiquadCascadeDF2T group
610  */
611