1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_stereo_df2T_f16.c
4 * Description: Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33 @ingroup groupFilters
34 */
35
36 /**
37 @addtogroup BiquadCascadeDF2T
38 @{
39 */
40
41 /**
42 @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
43 @param[in] S points to an instance of the filter data structure
44 @param[in] pSrc points to the block of input data
45 @param[out] pDst points to the block of output data
46 @param[in] blockSize number of samples to process
47 */
48
49 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
50 #pragma message "Scalar version of arm_biquad_cascade_stereo_df2T_f16 built. Helium version has build issues with gcc."
51 #endif
52
53 #if (defined(ARM_MATH_MVE_FLOAT16) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(ARM_DSP_BUILT_WITH_GCC)
arm_biquad_cascade_stereo_df2T_f16(const arm_biquad_cascade_stereo_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)54 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f16(
55 const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
56 const float16_t * pSrc,
57 float16_t * pDst,
58 uint32_t blockSize)
59 {
60 float16_t *pIn = (float16_t *)pSrc; /* source pointer */
61 float16_t *pOut = pDst; /* destination pointer */
62 float16_t *pState = S->pState; /* State pointer */
63 const float16_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
64 float16_t b0, b1, b2, a1, a2; /* Filter coefficients */
65 uint32_t sample, stage = S->numStages; /* loop counters */
66 static const uint16_t idx2[] = {2, 3, 8, 9, 2, 3, 8, 9};
67 f16x8_t aCoeffs, bCoeffs;
68 float16_t scratch[16];
69 uint16x8_t loadIdxVec;
70 uint16x8_t reshufledIdxVec;
71 uint16_t startIdx = 0;
72 f16x8_t stateVec0, stateVec1;
73 f16x8_t inVec;
74
75 /*
76 * {0, 1, 0, 1, 0, 1, 0, 1} generator
77 */
78 loadIdxVec = viwdupq_u16(startIdx, 2, 1);
79 reshufledIdxVec = *(uint16x8_t *)&idx2;
80
81 /*
82 * scratch top clearing
83 * layout : [d1a d1b d2a d2b d1a d1b d2a d2b 0 0]
84 */
85 scratch[8] = (float16_t)0.0;
86 scratch[9] = (float16_t)0.0;
87
88 do
89 {
90 /*
91 * Reading the coefficients
92 */
93 b0 = *pCoeffs++;
94 b1 = *pCoeffs++;
95 b2 = *pCoeffs++;
96 a1 = *pCoeffs++;
97 a2 = *pCoeffs++;
98
99 /* aCoeffs = {a1 a1 a2 a2 a1 a1 a2 a2} */
100 aCoeffs = vdupq_n_f16(a1);
101 aCoeffs = vsetq_lane(a2, aCoeffs, 2);
102 aCoeffs = vsetq_lane(a2, aCoeffs, 3);
103 aCoeffs = vsetq_lane(a2, aCoeffs, 6);
104 aCoeffs = vsetq_lane(a2, aCoeffs, 7);
105
106 /* bCoeffs = {b1 b1 b2 b2 b1 b1 b2 b2} */
107 bCoeffs = vdupq_n_f16(b1);
108 bCoeffs = vsetq_lane(b2, bCoeffs, 2);
109 bCoeffs = vsetq_lane(b2, bCoeffs, 3);
110 bCoeffs = vsetq_lane(b2, bCoeffs, 6);
111 bCoeffs = vsetq_lane(b2, bCoeffs, 7);
112
113 /*
114 * Reading the state values
115 * Save into scratch
116 */
117 *(f16x8_t *) scratch = *(f16x8_t *) pState;
118
119 sample = blockSize;
120
121 while (sample > 0U)
122 {
123 /*
124 * step 1
125 *
126 * 0 | acc1a = xn1a * b0 + d1a
127 * 1 | acc1b = xn1b * b0 + d1b
128 * 2 | acc1a = xn1a * b0 + d1a
129 * 3 | acc1b = xn1b * b0 + d1b
130 * 4 | <repeat>
131 * 5 | ...
132 */
133
134 /*
135 * load {d1a, d1b, d1a, d1b, d1a, d1b, d1a, d1b}
136 */
137 stateVec0 = vldrhq_gather_shifted_offset((float16_t const *) scratch, loadIdxVec);
138 /*
139 * load {in0 in1 in0 in1 in0 in1 in0 in1}
140 */
141 inVec = vldrhq_gather_shifted_offset_f16(pIn, loadIdxVec);
142
143 stateVec0 = vfmaq(stateVec0, inVec, b0);
144 *pOut++ = vgetq_lane(stateVec0, 0);
145 *pOut++ = vgetq_lane(stateVec0, 1);
146
147 /*
148 * step 2
149 *
150 * 0 | d1a = b1 * xn1a + a1 * acc1a + d2a
151 * 1 | d1b = b1 * xn1b + a1 * acc1b + d2b
152 * 2 | d2a = b2 * xn1a + a2 * acc1a + 0
153 * 3 | d2b = b2 * xn1b + a2 * acc1b + 0
154 * 4 | <repeat>
155 * 5 | ...
156 */
157
158 /*
159 * load {d2a, d2b, 0, 0, d2a, d2b, 0, 0}
160 */
161 stateVec1 = vldrhq_gather_shifted_offset((float16_t const *) scratch, reshufledIdxVec);
162 stateVec1 = vfmaq(stateVec1, stateVec0, aCoeffs);
163 stateVec1 = vfmaq(stateVec1, inVec, bCoeffs);
164 *(f16x8_t *) scratch = stateVec1;
165
166 pIn = pIn + 2;
167 sample--;
168 }
169
170 /*
171 * Store the updated state variables back into the state array
172 */
173 *pState++ = vgetq_lane(stateVec1, 0);
174 *pState++ = vgetq_lane(stateVec1, 1);
175 *pState++ = vgetq_lane(stateVec1, 2);
176 *pState++ = vgetq_lane(stateVec1, 3);
177
178 /*
179 * The current stage input is given as the output to the next stage
180 */
181 pIn = pDst;
182 /*
183 * Reset the output working pointer
184 */
185 pOut = pDst;
186 /*
187 * decrement the loop counter
188 */
189 stage--;
190 }
191 while (stage > 0U);
192 }
193 #else
194
arm_biquad_cascade_stereo_df2T_f16(const arm_biquad_cascade_stereo_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)195 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_stereo_df2T_f16(
196 const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
197 const float16_t * pSrc,
198 float16_t * pDst,
199 uint32_t blockSize)
200 {
201 const float16_t *pIn = pSrc; /* Source pointer */
202 float16_t *pOut = pDst; /* Destination pointer */
203 float16_t *pState = S->pState; /* State pointer */
204 const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
205 _Float16 acc1a, acc1b; /* Accumulator */
206 _Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
207 _Float16 Xn1a, Xn1b; /* Temporary input */
208 _Float16 d1a, d2a, d1b, d2b; /* State variables */
209 uint32_t sample, stage = S->numStages; /* Loop counters */
210
211 do
212 {
213 /* Reading the coefficients */
214 b0 = pCoeffs[0];
215 b1 = pCoeffs[1];
216 b2 = pCoeffs[2];
217 a1 = pCoeffs[3];
218 a2 = pCoeffs[4];
219
220 /* Reading the state values */
221 d1a = pState[0];
222 d2a = pState[1];
223 d1b = pState[2];
224 d2b = pState[3];
225
226 pCoeffs += 5U;
227
228 #if defined (ARM_MATH_LOOPUNROLL)
229
230 /* Loop unrolling: Compute 8 outputs at a time */
231 sample = blockSize >> 3U;
232
233 while (sample > 0U) {
234 /* y[n] = b0 * x[n] + d1 */
235 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
236 /* d2 = b2 * x[n] + a2 * y[n] */
237
238 /* 1 */
239 Xn1a = *pIn++; /* Channel a */
240 Xn1b = *pIn++; /* Channel b */
241
242 acc1a = (b0 * Xn1a) + d1a;
243 acc1b = (b0 * Xn1b) + d1b;
244
245 *pOut++ = acc1a;
246 *pOut++ = acc1b;
247
248 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
249 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
250
251 d2a = (b2 * Xn1a) + (a2 * acc1a);
252 d2b = (b2 * Xn1b) + (a2 * acc1b);
253
254 /* 2 */
255 Xn1a = *pIn++; /* Channel a */
256 Xn1b = *pIn++; /* Channel b */
257
258 acc1a = (b0 * Xn1a) + d1a;
259 acc1b = (b0 * Xn1b) + d1b;
260
261 *pOut++ = acc1a;
262 *pOut++ = acc1b;
263
264 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
265 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
266
267 d2a = (b2 * Xn1a) + (a2 * acc1a);
268 d2b = (b2 * Xn1b) + (a2 * acc1b);
269
270 /* 3 */
271 Xn1a = *pIn++; /* Channel a */
272 Xn1b = *pIn++; /* Channel b */
273
274 acc1a = (b0 * Xn1a) + d1a;
275 acc1b = (b0 * Xn1b) + d1b;
276
277 *pOut++ = acc1a;
278 *pOut++ = acc1b;
279
280 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
281 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
282
283 d2a = (b2 * Xn1a) + (a2 * acc1a);
284 d2b = (b2 * Xn1b) + (a2 * acc1b);
285
286 /* 4 */
287 Xn1a = *pIn++; /* Channel a */
288 Xn1b = *pIn++; /* Channel b */
289
290 acc1a = (b0 * Xn1a) + d1a;
291 acc1b = (b0 * Xn1b) + d1b;
292
293 *pOut++ = acc1a;
294 *pOut++ = acc1b;
295
296 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
297 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
298
299 d2a = (b2 * Xn1a) + (a2 * acc1a);
300 d2b = (b2 * Xn1b) + (a2 * acc1b);
301
302 /* 5 */
303 Xn1a = *pIn++; /* Channel a */
304 Xn1b = *pIn++; /* Channel b */
305
306 acc1a = (b0 * Xn1a) + d1a;
307 acc1b = (b0 * Xn1b) + d1b;
308
309 *pOut++ = acc1a;
310 *pOut++ = acc1b;
311
312 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
313 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
314
315 d2a = (b2 * Xn1a) + (a2 * acc1a);
316 d2b = (b2 * Xn1b) + (a2 * acc1b);
317
318 /* 6 */
319 Xn1a = *pIn++; /* Channel a */
320 Xn1b = *pIn++; /* Channel b */
321
322 acc1a = (b0 * Xn1a) + d1a;
323 acc1b = (b0 * Xn1b) + d1b;
324
325 *pOut++ = acc1a;
326 *pOut++ = acc1b;
327
328 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
329 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
330
331 d2a = (b2 * Xn1a) + (a2 * acc1a);
332 d2b = (b2 * Xn1b) + (a2 * acc1b);
333
334 /* 7 */
335 Xn1a = *pIn++; /* Channel a */
336 Xn1b = *pIn++; /* Channel b */
337
338 acc1a = (b0 * Xn1a) + d1a;
339 acc1b = (b0 * Xn1b) + d1b;
340
341 *pOut++ = acc1a;
342 *pOut++ = acc1b;
343
344 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
345 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
346
347 d2a = (b2 * Xn1a) + (a2 * acc1a);
348 d2b = (b2 * Xn1b) + (a2 * acc1b);
349
350 /* 8 */
351 Xn1a = *pIn++; /* Channel a */
352 Xn1b = *pIn++; /* Channel b */
353
354 acc1a = (b0 * Xn1a) + d1a;
355 acc1b = (b0 * Xn1b) + d1b;
356
357 *pOut++ = acc1a;
358 *pOut++ = acc1b;
359
360 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
361 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
362
363 d2a = (b2 * Xn1a) + (a2 * acc1a);
364 d2b = (b2 * Xn1b) + (a2 * acc1b);
365
366 /* decrement loop counter */
367 sample--;
368 }
369
370 /* Loop unrolling: Compute remaining outputs */
371 sample = blockSize & 0x7U;
372
373 #else
374
375 /* Initialize blkCnt with number of samples */
376 sample = blockSize;
377
378 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
379
380 while (sample > 0U) {
381 /* Read the input */
382 Xn1a = *pIn++; /* Channel a */
383 Xn1b = *pIn++; /* Channel b */
384
385 /* y[n] = b0 * x[n] + d1 */
386 acc1a = (b0 * Xn1a) + d1a;
387 acc1b = (b0 * Xn1b) + d1b;
388
389 /* Store the result in the accumulator in the destination buffer. */
390 *pOut++ = acc1a;
391 *pOut++ = acc1b;
392
393 /* Every time after the output is computed state should be updated. */
394 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
395 d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
396 d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
397
398 /* d2 = b2 * x[n] + a2 * y[n] */
399 d2a = (b2 * Xn1a) + (a2 * acc1a);
400 d2b = (b2 * Xn1b) + (a2 * acc1b);
401
402 /* decrement loop counter */
403 sample--;
404 }
405
406 /* Store the updated state variables back into the state array */
407 pState[0] = d1a;
408 pState[1] = d2a;
409
410 pState[2] = d1b;
411 pState[3] = d2b;
412
413 pState += 4U;
414
415 /* The current stage output is given as the input to the next stage */
416 pIn = pDst;
417
418 /* Reset the output working pointer */
419 pOut = pDst;
420
421 /* Decrement the loop counter */
422 stage--;
423
424 } while (stage > 0U);
425
426 }
427
428 #endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
429 /**
430 @} end of BiquadCascadeDF2T group
431 */
432
433 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
434