1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_stereo_df2T_f32.c
4  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33 */
34 
35 /**
36   @addtogroup BiquadCascadeDF2T
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
42   @param[in]     S         points to an instance of the filter data structure
43   @param[in]     pSrc      points to the block of input data
44   @param[out]    pDst      points to the block of output data
45   @param[in]     blockSize number of samples to process
46  */
47 #if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
48 #include "arm_helium_utils.h"
49 
arm_biquad_cascade_stereo_df2T_f32(const arm_biquad_cascade_stereo_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)50 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f32(
51   const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
52   const float32_t * pSrc,
53         float32_t * pDst,
54         uint32_t blockSize)
55 {
56     const float32_t *pIn = pSrc;              /*  source pointer            */
57     float32_t *pOut = pDst;             /*  destination pointer       */
58     float32_t *pState = S->pState;      /*  State pointer             */
59     const float32_t *pCoeffs = S->pCoeffs;    /*  coefficient pointer       */
60     float32_t b0, b1, b2, a1, a2;       /*  Filter coefficients       */
61     uint32_t  sample, stage = S->numStages; /*  loop counters           */
62     float32_t scratch[6];
63     uint32x4_t loadIdxVec;
64     f32x4_t aCoeffs, bCoeffs;
65     f32x4_t stateVec0, stateVec1;
66     f32x4_t inVec;
67     uint32_t  startIdx = 0;
68 
69     /*
70      * {0, 1, 0, 1} generator
71      */
72     loadIdxVec = viwdupq_u32(&startIdx, 2, 1);
73 
74     /*
75      * scratch top clearing
76      * layout : [d1a d1b d2a d2b 0 0]
77      */
78     scratch[4] = 0.0f;
79     scratch[5] = 0.0f;
80 
81     do
82     {
83         /*
84          * Reading the coefficients
85          */
86         b0 = *pCoeffs++;
87         b1 = *pCoeffs++;
88         b2 = *pCoeffs++;
89         a1 = *pCoeffs++;
90         a2 = *pCoeffs++;
91 
92         /*
93          * aCoeffs = {a1 a1 a2 a2}
94          */
95         aCoeffs = vdupq_n_f32(a1);
96         aCoeffs = vsetq_lane(a2, aCoeffs, 2);
97         aCoeffs = vsetq_lane(a2, aCoeffs, 3);
98 
99         /*
100          * bCoeffs = {b1 b1 b2 b2}
101          */
102         bCoeffs = vdupq_n_f32(b1);
103         bCoeffs = vsetq_lane(b2, bCoeffs, 2);
104         bCoeffs = vsetq_lane(b2, bCoeffs, 3);
105 
106         /*
107          * Reading the state values
108          * Save into scratch
109          */
110         *(f32x4_t *) scratch = *(f32x4_t *) pState;
111 
112         sample = blockSize;
113 
114         while (sample > 0U)
115         {
116             /*
117              * step 1
118              *
119              * 0   | acc1a = xn1a * b0 + d1a
120              * 1   | acc1b = xn1b * b0 + d1b
121              * 2   | acc1a = xn1a * b0 + d1a
122              * 3   | acc1b = xn1b * b0 + d1b
123              */
124             /*
125              * load {d1a, d1b, d1a, d1b}
126              */
127             stateVec0 = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) scratch, loadIdxVec);
128             /*
129              * load {in0 in1 in0 in1}
130              */
131             inVec = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) pIn, loadIdxVec);
132 
133             stateVec0 = vfmaq(stateVec0, inVec, b0);
134             *pOut++ = vgetq_lane(stateVec0, 0);
135             *pOut++ = vgetq_lane(stateVec0, 1);
136 
137             /*
138              * step 2
139              *
140              * 0  | d1a = b1 * xn1a  +  a1 * acc1a  +  d2a
141              * 1  | d1b = b1 * xn1b  +  a1 * acc1b  +  d2b
142              * 2  | d2a = b2 * xn1a  +  a2 * acc1a  +  0
143              * 3  | d2b = b2 * xn1b  +  a2 * acc1b  +  0
144              */
145 
146             /*
147              * load {d2a, d2b, 0, 0}
148              */
149             stateVec1 = *(f32x4_t *) & scratch[2];
150             stateVec1 = vfmaq(stateVec1, stateVec0, aCoeffs);
151             stateVec1 = vfmaq(stateVec1, inVec, bCoeffs);
152             *(f32x4_t *) scratch = stateVec1;
153 
154             pIn = pIn + 2;
155             sample--;
156         }
157 
158         /*
159          * Store the updated state variables back into the state array
160          */
161         vst1q(pState, stateVec1);
162         pState += 4;
163 
164         /*
165          * The current stage output is given as the input to the next stage
166          */
167         pIn = pDst;
168         /*
169          * Reset the output working pointer
170          */
171         pOut = pDst;
172         /*
173          * decrement the loop counter
174          */
175         stage--;
176     }
177     while (stage > 0U);
178 }
179 
180 #else
181 
arm_biquad_cascade_stereo_df2T_f32(const arm_biquad_cascade_stereo_df2T_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)182 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f32(
183   const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
184   const float32_t * pSrc,
185         float32_t * pDst,
186         uint32_t blockSize)
187 {
188   const float32_t *pIn = pSrc;                         /* Source pointer */
189         float32_t *pOut = pDst;                        /* Destination pointer */
190         float32_t *pState = S->pState;                 /* State pointer */
191   const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
192         float32_t acc1a, acc1b;                        /* Accumulator */
193         float32_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
194         float32_t Xn1a, Xn1b;                          /* Temporary input */
195         float32_t d1a, d2a, d1b, d2b;                  /* State variables */
196         uint32_t sample, stage = S->numStages;         /* Loop counters */
197 
198     do
199     {
200         /* Reading the coefficients */
201         b0 = pCoeffs[0];
202         b1 = pCoeffs[1];
203         b2 = pCoeffs[2];
204         a1 = pCoeffs[3];
205         a2 = pCoeffs[4];
206 
207         /* Reading the state values */
208         d1a = pState[0];
209         d2a = pState[1];
210         d1b = pState[2];
211         d2b = pState[3];
212 
213         pCoeffs += 5U;
214 
215 #if defined (ARM_MATH_LOOPUNROLL)
216 
217     /* Loop unrolling: Compute 8 outputs at a time */
218         sample = blockSize >> 3U;
219 
220         while (sample > 0U) {
221           /* y[n] = b0 * x[n] + d1 */
222           /* d1 = b1 * x[n] + a1 * y[n] + d2 */
223           /* d2 = b2 * x[n] + a2 * y[n] */
224 
225 /*  1 */
226           Xn1a = *pIn++; /* Channel a */
227           Xn1b = *pIn++; /* Channel b */
228 
229           acc1a = (b0 * Xn1a) + d1a;
230           acc1b = (b0 * Xn1b) + d1b;
231 
232           *pOut++ = acc1a;
233           *pOut++ = acc1b;
234 
235           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
236           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
237 
238           d2a = (b2 * Xn1a) + (a2 * acc1a);
239           d2b = (b2 * Xn1b) + (a2 * acc1b);
240 
241 /*  2 */
242           Xn1a = *pIn++; /* Channel a */
243           Xn1b = *pIn++; /* Channel b */
244 
245           acc1a = (b0 * Xn1a) + d1a;
246           acc1b = (b0 * Xn1b) + d1b;
247 
248           *pOut++ = acc1a;
249           *pOut++ = acc1b;
250 
251           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
252           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
253 
254           d2a = (b2 * Xn1a) + (a2 * acc1a);
255           d2b = (b2 * Xn1b) + (a2 * acc1b);
256 
257 /*  3 */
258           Xn1a = *pIn++; /* Channel a */
259           Xn1b = *pIn++; /* Channel b */
260 
261           acc1a = (b0 * Xn1a) + d1a;
262           acc1b = (b0 * Xn1b) + d1b;
263 
264           *pOut++ = acc1a;
265           *pOut++ = acc1b;
266 
267           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
268           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
269 
270           d2a = (b2 * Xn1a) + (a2 * acc1a);
271           d2b = (b2 * Xn1b) + (a2 * acc1b);
272 
273 /*  4 */
274           Xn1a = *pIn++; /* Channel a */
275           Xn1b = *pIn++; /* Channel b */
276 
277           acc1a = (b0 * Xn1a) + d1a;
278           acc1b = (b0 * Xn1b) + d1b;
279 
280           *pOut++ = acc1a;
281           *pOut++ = acc1b;
282 
283           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
284           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
285 
286           d2a = (b2 * Xn1a) + (a2 * acc1a);
287           d2b = (b2 * Xn1b) + (a2 * acc1b);
288 
289 /*  5 */
290           Xn1a = *pIn++; /* Channel a */
291           Xn1b = *pIn++; /* Channel b */
292 
293           acc1a = (b0 * Xn1a) + d1a;
294           acc1b = (b0 * Xn1b) + d1b;
295 
296           *pOut++ = acc1a;
297           *pOut++ = acc1b;
298 
299           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
300           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
301 
302           d2a = (b2 * Xn1a) + (a2 * acc1a);
303           d2b = (b2 * Xn1b) + (a2 * acc1b);
304 
305 /*  6 */
306           Xn1a = *pIn++; /* Channel a */
307           Xn1b = *pIn++; /* Channel b */
308 
309           acc1a = (b0 * Xn1a) + d1a;
310           acc1b = (b0 * Xn1b) + d1b;
311 
312           *pOut++ = acc1a;
313           *pOut++ = acc1b;
314 
315           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
316           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
317 
318           d2a = (b2 * Xn1a) + (a2 * acc1a);
319           d2b = (b2 * Xn1b) + (a2 * acc1b);
320 
321 /*  7 */
322           Xn1a = *pIn++; /* Channel a */
323           Xn1b = *pIn++; /* Channel b */
324 
325           acc1a = (b0 * Xn1a) + d1a;
326           acc1b = (b0 * Xn1b) + d1b;
327 
328           *pOut++ = acc1a;
329           *pOut++ = acc1b;
330 
331           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
332           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
333 
334           d2a = (b2 * Xn1a) + (a2 * acc1a);
335           d2b = (b2 * Xn1b) + (a2 * acc1b);
336 
337 /*  8 */
338           Xn1a = *pIn++; /* Channel a */
339           Xn1b = *pIn++; /* Channel b */
340 
341           acc1a = (b0 * Xn1a) + d1a;
342           acc1b = (b0 * Xn1b) + d1b;
343 
344           *pOut++ = acc1a;
345           *pOut++ = acc1b;
346 
347           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
348           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
349 
350           d2a = (b2 * Xn1a) + (a2 * acc1a);
351           d2b = (b2 * Xn1b) + (a2 * acc1b);
352 
353           /* decrement loop counter */
354           sample--;
355         }
356 
357         /* Loop unrolling: Compute remaining outputs */
358         sample = blockSize & 0x7U;
359 
360 #else
361 
362         /* Initialize blkCnt with number of samples */
363         sample = blockSize;
364 
365 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
366 
367         while (sample > 0U) {
368           /* Read the input */
369           Xn1a = *pIn++; /* Channel a */
370           Xn1b = *pIn++; /* Channel b */
371 
372           /* y[n] = b0 * x[n] + d1 */
373           acc1a = (b0 * Xn1a) + d1a;
374           acc1b = (b0 * Xn1b) + d1b;
375 
376           /* Store the result in the accumulator in the destination buffer. */
377           *pOut++ = acc1a;
378           *pOut++ = acc1b;
379 
380           /* Every time after the output is computed state should be updated. */
381           /* d1 = b1 * x[n] + a1 * y[n] + d2 */
382           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
383           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
384 
385           /* d2 = b2 * x[n] + a2 * y[n] */
386           d2a = (b2 * Xn1a) + (a2 * acc1a);
387           d2b = (b2 * Xn1b) + (a2 * acc1b);
388 
389           /* decrement loop counter */
390           sample--;
391         }
392 
393         /* Store the updated state variables back into the state array */
394         pState[0] = d1a;
395         pState[1] = d2a;
396 
397         pState[2] = d1b;
398         pState[3] = d2b;
399 
400         pState += 4U;
401 
402         /* The current stage output is given as the input to the next stage */
403         pIn = pDst;
404 
405         /* Reset the output working pointer */
406         pOut = pDst;
407 
408         /* Decrement the loop counter */
409         stage--;
410 
411     } while (stage > 0U);
412 
413 }
414 
415 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
416 
417 /**
418   @} end of BiquadCascadeDF2T group
419  */
420