1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_stereo_df2T_f16.c
4  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33   @ingroup groupFilters
34 */
35 
36 /**
37   @addtogroup BiquadCascadeDF2T
38   @{
39  */
40 
41 /**
42   @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
43   @param[in]     S         points to an instance of the filter data structure
44   @param[in]     pSrc      points to the block of input data
45   @param[out]    pDst      points to the block of output data
46   @param[in]     blockSize number of samples to process
47  */
48 
49 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
50 #pragma message "Scalar version of arm_biquad_cascade_stereo_df2T_f16 built. Helium version has build issues with gcc."
51 #endif
52 
53 #if (defined(ARM_MATH_MVE_FLOAT16) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(ARM_DSP_BUILT_WITH_GCC)
arm_biquad_cascade_stereo_df2T_f16(const arm_biquad_cascade_stereo_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)54 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f16(
55   const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
56   const float16_t * pSrc,
57         float16_t * pDst,
58         uint32_t blockSize)
59 {
60     float16_t *pIn = (float16_t *)pSrc;      /*  source pointer            */
61     float16_t *pOut = pDst;     /*  destination pointer       */
62     float16_t *pState = S->pState;  /*  State pointer             */
63     const float16_t *pCoeffs = S->pCoeffs;    /*  coefficient pointer       */
64     float16_t b0, b1, b2, a1, a2;   /*  Filter coefficients       */
65     uint32_t  sample, stage = S->numStages; /*  loop counters             */
66     static const uint16_t idx2[] = {2, 3, 8, 9, 2, 3, 8, 9};
67     f16x8_t aCoeffs, bCoeffs;
68     float16_t scratch[16];
69     uint16x8_t loadIdxVec;
70     uint16x8_t reshufledIdxVec;
71     uint16_t  startIdx = 0;
72     f16x8_t stateVec0, stateVec1;
73     f16x8_t inVec;
74 
75     /*
76      * {0, 1, 0, 1, 0, 1, 0, 1} generator
77      */
78     loadIdxVec = viwdupq_u16(startIdx, 2, 1);
79     reshufledIdxVec = *(uint16x8_t *)&idx2;
80 
81     /*
82      * scratch top clearing
83      * layout : [d1a d1b d2a d2b d1a d1b d2a d2b 0 0]
84      */
85     scratch[8] = (float16_t)0.0;
86     scratch[9] = (float16_t)0.0;
87 
88     do
89     {
90         /*
91          * Reading the coefficients
92          */
93         b0 = *pCoeffs++;
94         b1 = *pCoeffs++;
95         b2 = *pCoeffs++;
96         a1 = *pCoeffs++;
97         a2 = *pCoeffs++;
98 
99         /* aCoeffs = {a1 a1 a2 a2 a1 a1 a2 a2} */
100         aCoeffs = vdupq_n_f16(a1);
101         aCoeffs = vsetq_lane(a2, aCoeffs, 2);
102         aCoeffs = vsetq_lane(a2, aCoeffs, 3);
103         aCoeffs = vsetq_lane(a2, aCoeffs, 6);
104         aCoeffs = vsetq_lane(a2, aCoeffs, 7);
105 
106         /* bCoeffs = {b1 b1 b2 b2 b1 b1 b2 b2} */
107         bCoeffs = vdupq_n_f16(b1);
108         bCoeffs = vsetq_lane(b2, bCoeffs, 2);
109         bCoeffs = vsetq_lane(b2, bCoeffs, 3);
110         bCoeffs = vsetq_lane(b2, bCoeffs, 6);
111         bCoeffs = vsetq_lane(b2, bCoeffs, 7);
112 
113         /*
114          * Reading the state values
115          * Save into scratch
116          */
117         *(f16x8_t *) scratch = *(f16x8_t *) pState;
118 
119         sample = blockSize;
120 
121         while (sample > 0U)
122         {
123             /*
124              * step 1
125              *
126              * 0   | acc1a = xn1a * b0 + d1a
127              * 1   | acc1b = xn1b * b0 + d1b
128              * 2   | acc1a = xn1a * b0 + d1a
129              * 3   | acc1b = xn1b * b0 + d1b
130              * 4   |   <repeat>
131              * 5   |   ...
132              */
133 
134             /*
135              * load {d1a, d1b, d1a, d1b, d1a, d1b, d1a, d1b}
136              */
137             stateVec0 = vldrhq_gather_shifted_offset((float16_t const *) scratch, loadIdxVec);
138             /*
139              * load {in0 in1 in0 in1 in0 in1 in0 in1}
140              */
141             inVec = vldrhq_gather_shifted_offset_f16(pIn, loadIdxVec);
142 
143             stateVec0 = vfmaq(stateVec0, inVec, b0);
144             *pOut++ = vgetq_lane(stateVec0, 0);
145             *pOut++ = vgetq_lane(stateVec0, 1);
146 
147             /*
148              * step 2
149              *
150              * 0  | d1a = b1 * xn1a  +  a1 * acc1a  +  d2a
151              * 1  | d1b = b1 * xn1b  +  a1 * acc1b  +  d2b
152              * 2  | d2a = b2 * xn1a  +  a2 * acc1a  +  0
153              * 3  | d2b = b2 * xn1b  +  a2 * acc1b  +  0
154              * 4  |   <repeat>
155              * 5  |   ...
156              */
157 
158             /*
159              * load {d2a, d2b, 0, 0, d2a, d2b, 0, 0}
160              */
161             stateVec1 = vldrhq_gather_shifted_offset((float16_t const *) scratch, reshufledIdxVec);
162             stateVec1 = vfmaq(stateVec1, stateVec0, aCoeffs);
163             stateVec1 = vfmaq(stateVec1, inVec, bCoeffs);
164             *(f16x8_t *) scratch = stateVec1;
165 
166             pIn = pIn + 2;
167             sample--;
168         }
169 
170         /*
171          * Store the updated state variables back into the state array
172          */
173          *pState++ = vgetq_lane(stateVec1, 0);
174          *pState++ = vgetq_lane(stateVec1, 1);
175          *pState++ = vgetq_lane(stateVec1, 2);
176          *pState++ = vgetq_lane(stateVec1, 3);
177 
178         /*
179          * The current stage input is given as the output to the next stage
180          */
181         pIn = pDst;
182         /*
183          * Reset the output working pointer
184          */
185         pOut = pDst;
186         /*
187          * decrement the loop counter
188          */
189         stage--;
190     }
191     while (stage > 0U);
192 }
193 #else
194 
arm_biquad_cascade_stereo_df2T_f16(const arm_biquad_cascade_stereo_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)195 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f16(
196   const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
197   const float16_t * pSrc,
198         float16_t * pDst,
199         uint32_t blockSize)
200 {
201   const float16_t *pIn = pSrc;                         /* Source pointer */
202         float16_t *pOut = pDst;                        /* Destination pointer */
203         float16_t *pState = S->pState;                 /* State pointer */
204   const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
205         _Float16 acc1a, acc1b;                        /* Accumulator */
206         _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
207         _Float16 Xn1a, Xn1b;                          /* Temporary input */
208         _Float16 d1a, d2a, d1b, d2b;                  /* State variables */
209         uint32_t sample, stage = S->numStages;         /* Loop counters */
210 
211     do
212     {
213         /* Reading the coefficients */
214         b0 = pCoeffs[0];
215         b1 = pCoeffs[1];
216         b2 = pCoeffs[2];
217         a1 = pCoeffs[3];
218         a2 = pCoeffs[4];
219 
220         /* Reading the state values */
221         d1a = pState[0];
222         d2a = pState[1];
223         d1b = pState[2];
224         d2b = pState[3];
225 
226         pCoeffs += 5U;
227 
228 #if defined (ARM_MATH_LOOPUNROLL)
229 
230     /* Loop unrolling: Compute 8 outputs at a time */
231         sample = blockSize >> 3U;
232 
233         while (sample > 0U) {
234           /* y[n] = b0 * x[n] + d1 */
235           /* d1 = b1 * x[n] + a1 * y[n] + d2 */
236           /* d2 = b2 * x[n] + a2 * y[n] */
237 
238 /*  1 */
239           Xn1a = *pIn++; /* Channel a */
240           Xn1b = *pIn++; /* Channel b */
241 
242           acc1a = (b0 * Xn1a) + d1a;
243           acc1b = (b0 * Xn1b) + d1b;
244 
245           *pOut++ = acc1a;
246           *pOut++ = acc1b;
247 
248           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
249           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
250 
251           d2a = (b2 * Xn1a) + (a2 * acc1a);
252           d2b = (b2 * Xn1b) + (a2 * acc1b);
253 
254 /*  2 */
255           Xn1a = *pIn++; /* Channel a */
256           Xn1b = *pIn++; /* Channel b */
257 
258           acc1a = (b0 * Xn1a) + d1a;
259           acc1b = (b0 * Xn1b) + d1b;
260 
261           *pOut++ = acc1a;
262           *pOut++ = acc1b;
263 
264           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
265           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
266 
267           d2a = (b2 * Xn1a) + (a2 * acc1a);
268           d2b = (b2 * Xn1b) + (a2 * acc1b);
269 
270 /*  3 */
271           Xn1a = *pIn++; /* Channel a */
272           Xn1b = *pIn++; /* Channel b */
273 
274           acc1a = (b0 * Xn1a) + d1a;
275           acc1b = (b0 * Xn1b) + d1b;
276 
277           *pOut++ = acc1a;
278           *pOut++ = acc1b;
279 
280           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
281           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
282 
283           d2a = (b2 * Xn1a) + (a2 * acc1a);
284           d2b = (b2 * Xn1b) + (a2 * acc1b);
285 
286 /*  4 */
287           Xn1a = *pIn++; /* Channel a */
288           Xn1b = *pIn++; /* Channel b */
289 
290           acc1a = (b0 * Xn1a) + d1a;
291           acc1b = (b0 * Xn1b) + d1b;
292 
293           *pOut++ = acc1a;
294           *pOut++ = acc1b;
295 
296           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
297           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
298 
299           d2a = (b2 * Xn1a) + (a2 * acc1a);
300           d2b = (b2 * Xn1b) + (a2 * acc1b);
301 
302 /*  5 */
303           Xn1a = *pIn++; /* Channel a */
304           Xn1b = *pIn++; /* Channel b */
305 
306           acc1a = (b0 * Xn1a) + d1a;
307           acc1b = (b0 * Xn1b) + d1b;
308 
309           *pOut++ = acc1a;
310           *pOut++ = acc1b;
311 
312           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
313           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
314 
315           d2a = (b2 * Xn1a) + (a2 * acc1a);
316           d2b = (b2 * Xn1b) + (a2 * acc1b);
317 
318 /*  6 */
319           Xn1a = *pIn++; /* Channel a */
320           Xn1b = *pIn++; /* Channel b */
321 
322           acc1a = (b0 * Xn1a) + d1a;
323           acc1b = (b0 * Xn1b) + d1b;
324 
325           *pOut++ = acc1a;
326           *pOut++ = acc1b;
327 
328           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
329           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
330 
331           d2a = (b2 * Xn1a) + (a2 * acc1a);
332           d2b = (b2 * Xn1b) + (a2 * acc1b);
333 
334 /*  7 */
335           Xn1a = *pIn++; /* Channel a */
336           Xn1b = *pIn++; /* Channel b */
337 
338           acc1a = (b0 * Xn1a) + d1a;
339           acc1b = (b0 * Xn1b) + d1b;
340 
341           *pOut++ = acc1a;
342           *pOut++ = acc1b;
343 
344           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
345           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
346 
347           d2a = (b2 * Xn1a) + (a2 * acc1a);
348           d2b = (b2 * Xn1b) + (a2 * acc1b);
349 
350 /*  8 */
351           Xn1a = *pIn++; /* Channel a */
352           Xn1b = *pIn++; /* Channel b */
353 
354           acc1a = (b0 * Xn1a) + d1a;
355           acc1b = (b0 * Xn1b) + d1b;
356 
357           *pOut++ = acc1a;
358           *pOut++ = acc1b;
359 
360           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
361           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
362 
363           d2a = (b2 * Xn1a) + (a2 * acc1a);
364           d2b = (b2 * Xn1b) + (a2 * acc1b);
365 
366           /* decrement loop counter */
367           sample--;
368         }
369 
370         /* Loop unrolling: Compute remaining outputs */
371         sample = blockSize & 0x7U;
372 
373 #else
374 
375         /* Initialize blkCnt with number of samples */
376         sample = blockSize;
377 
378 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
379 
380         while (sample > 0U) {
381           /* Read the input */
382           Xn1a = *pIn++; /* Channel a */
383           Xn1b = *pIn++; /* Channel b */
384 
385           /* y[n] = b0 * x[n] + d1 */
386           acc1a = (b0 * Xn1a) + d1a;
387           acc1b = (b0 * Xn1b) + d1b;
388 
389           /* Store the result in the accumulator in the destination buffer. */
390           *pOut++ = acc1a;
391           *pOut++ = acc1b;
392 
393           /* Every time after the output is computed state should be updated. */
394           /* d1 = b1 * x[n] + a1 * y[n] + d2 */
395           d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
396           d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
397 
398           /* d2 = b2 * x[n] + a2 * y[n] */
399           d2a = (b2 * Xn1a) + (a2 * acc1a);
400           d2b = (b2 * Xn1b) + (a2 * acc1b);
401 
402           /* decrement loop counter */
403           sample--;
404         }
405 
406         /* Store the updated state variables back into the state array */
407         pState[0] = d1a;
408         pState[1] = d2a;
409 
410         pState[2] = d1b;
411         pState[3] = d2b;
412 
413         pState += 4U;
414 
415         /* The current stage output is given as the input to the next stage */
416         pIn = pDst;
417 
418         /* Reset the output working pointer */
419         pOut = pDst;
420 
421         /* Decrement the loop counter */
422         stage--;
423 
424     } while (stage > 0U);
425 
426 }
427 
428 #endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
429 /**
430   @} end of BiquadCascadeDF2T group
431  */
432 
433 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
434