1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df2T_f16.c
4 * Description: Processing function for floating-point transposed direct form II Biquad cascade filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33 @ingroup groupFilters
34 */
35
36 /**
37 @addtogroup BiquadCascadeDF2T
38 @{
39 */
40
41 /**
42 @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
43 @param[in] S points to an instance of the filter data structure
44 @param[in] pSrc points to the block of input data
45 @param[out] pDst points to the block of output data
46 @param[in] blockSize number of samples to process
47 */
48
49 #if (defined(ARM_MATH_MVE_FLOAT16) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_biquad_cascade_df2T_f16(const arm_biquad_cascade_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)50 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f16(
51 const arm_biquad_cascade_df2T_instance_f16 * S,
52 const float16_t * pSrc,
53 float16_t * pDst,
54 uint32_t blockSize)
55 {
56 float16_t *pIn = (float16_t *)pSrc; /* source pointer */
57 float16_t Xn0, Xn1;
58 float16_t acc0, acc1;
59 float16_t *pOut = pDst; /* destination pointer */
60 float16_t *pState = S->pState; /* State pointer */
61 uint32_t sample, stage = S->numStages; /* loop counters */
62 float16_t const *pCurCoeffs = /* coefficient pointer */
63 (float16_t const *) S->pCoeffs;
64 f16x8_t b0Coeffs, a0Coeffs; /* Coefficients vector */
65 f16x8_t b1Coeffs, a1Coeffs; /* Modified coef. vector */
66 f16x8_t state; /* State vector */
67
68 do
69 {
70 /*
71 * temporary carry variable for feeding the 128-bit vector shifter
72 */
73 uint32_t tmp = 0;
74 /*
75 * Reading the coefficients
76 * b0Coeffs = {b0, b1, b2, x, x, x, x, x}
77 * a0Coeffs = { x, a1, a2, x, x, x, x, x}
78 */
79 b0Coeffs = vld1q(pCurCoeffs); pCurCoeffs += 2;
80 a0Coeffs = vld1q(pCurCoeffs); pCurCoeffs += 3;
81 /*
82 * Reading the state values
83 * state = {d1, d2, 0, 0, x, x, x, x}
84 */
85 state = *(f16x8_t *) pState;
86 state = vsetq_lane((float16_t)0.0, state, 2);
87 state = vsetq_lane((float16_t)0.0, state, 3);
88
89 /* b1Coeffs = {0, b0, b1, b2, x, x, x, x} */
90 /* b1Coeffs = { x, x, a1, a2, x, x, x, x} */
91 b1Coeffs = (f16x8_t)vshlcq_s16((int16x8_t)b0Coeffs, &tmp, 16);
92 a1Coeffs = (f16x8_t)vshlcq_s16((int16x8_t)a0Coeffs, &tmp, 16);
93
94 sample = blockSize / 2;
95
96 /* unrolled 2 x */
97 while (sample > 0U)
98 {
99 /*
100 * Read 2 inputs
101 */
102 Xn0 = *pIn++;
103 Xn1 = *pIn++;
104
105 /*
106 * 1st half:
107 * / acc1 \ / b0 \ / d1 \ / 0 \
108 * | d1 | | b1 | | d2 | | a1 |
109 * | d2 | | b2 | | 0 | | a2 |
110 * | x | = | x | * Xn1 + | x | + | x | x acc1
111 * ... ... ... ...
112 * \ x / \ x / \ x / \ x /
113 */
114
115 state = vfmaq(state, b0Coeffs, Xn0);
116 acc0 = vgetq_lane(state, 0);
117 state = vfmaq(state, a0Coeffs, acc0);
118 state = vsetq_lane((float16_t)0.0, state, 3);
119
120 /*
121 * 2nd half:
122 * same as 1st half, but all vector elements shifted down.
123 * / x \ / x \ / x \ / x \
124 * | acc1 | | b0 | | d1 | | 0 |
125 * | d1 | | b1 | | d2 | | a1 |
126 * | d2 | | b2 | | 0 | | a2 |
127 * | x | = | x | * Xn1 + | x | + | x | x acc1
128 * ... ... ... ...
129 * \ x / \ x / \ x / \ x /
130 */
131
132 state = vfmaq(state, b1Coeffs, Xn1);
133 acc1 = vgetq_lane(state, 1);
134 state = vfmaq(state, a1Coeffs, acc1);
135
136 /* move d1, d2 up + clearing */
137 /* expect dual move or long move */
138 state = vsetq_lane(vgetq_lane(state, 2), state, 0);
139 state = vsetq_lane(vgetq_lane(state, 3), state, 1);
140 state = vsetq_lane((float16_t)0.0, state, 2);
141 /*
142 * Store the results in the destination buffer.
143 */
144 *pOut++ = acc0;
145 *pOut++ = acc1;
146 /*
147 * decrement the loop counter
148 */
149 sample--;
150 }
151
152 /* compiler does not come back when enabled */
153 /*
154 * tail handling
155 */
156 if (blockSize & 1)
157 {
158 Xn0 = *pIn++;
159 state = vfmaq_n_f16(state, b0Coeffs, Xn0);
160 acc0 = vgetq_lane(state, 0);
161
162 state = vfmaq_n_f16(state, a0Coeffs, acc0);
163 *pOut++ = acc0;
164 *pState++ = vgetq_lane(state, 1);
165 *pState++ = vgetq_lane(state, 2);
166 }
167 else
168 {
169 *pState++ = vgetq_lane(state, 0);
170 *pState++ = vgetq_lane(state, 1);
171 }
172 /*
173 * The current stage input is given as the output to the next stage
174 */
175 pIn = pDst;
176 /*
177 * Reset the output working pointer
178 */
179 pOut = pDst;
180 /*
181 * decrement the loop counter
182 */
183 stage--;
184 }
185 while (stage > 0U);
186 }
187 #else
188
arm_biquad_cascade_df2T_f16(const arm_biquad_cascade_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)189 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f16(
190 const arm_biquad_cascade_df2T_instance_f16 * S,
191 const float16_t * pSrc,
192 float16_t * pDst,
193 uint32_t blockSize)
194 {
195 const float16_t *pIn = pSrc; /* Source pointer */
196 float16_t *pOut = pDst; /* Destination pointer */
197 float16_t *pState = S->pState; /* State pointer */
198 const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
199 _Float16 acc1; /* Accumulator */
200 _Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
201 _Float16 Xn1; /* Temporary input */
202 _Float16 d1, d2; /* State variables */
203 uint32_t sample, stage = S->numStages; /* Loop counters */
204
205 do
206 {
207 /* Reading the coefficients */
208 b0 = pCoeffs[0];
209 b1 = pCoeffs[1];
210 b2 = pCoeffs[2];
211 a1 = pCoeffs[3];
212 a2 = pCoeffs[4];
213
214 /* Reading the state values */
215 d1 = pState[0];
216 d2 = pState[1];
217
218 pCoeffs += 5U;
219
220 #if defined (ARM_MATH_LOOPUNROLL)
221
222 /* Loop unrolling: Compute 16 outputs at a time */
223 sample = blockSize >> 4U;
224
225 while (sample > 0U) {
226
227 /* y[n] = b0 * x[n] + d1 */
228 /* d1 = b1 * x[n] + a1 * y[n] + d2 */
229 /* d2 = b2 * x[n] + a2 * y[n] */
230
231 /* 1 */
232 Xn1 = *pIn++;
233
234 acc1 = b0 * Xn1 + d1;
235
236 d1 = b1 * Xn1 + d2;
237 d1 += a1 * acc1;
238
239 d2 = b2 * Xn1;
240 d2 += a2 * acc1;
241
242 *pOut++ = acc1;
243
244 /* 2 */
245 Xn1 = *pIn++;
246
247 acc1 = b0 * Xn1 + d1;
248
249 d1 = b1 * Xn1 + d2;
250 d1 += a1 * acc1;
251
252 d2 = b2 * Xn1;
253 d2 += a2 * acc1;
254
255 *pOut++ = acc1;
256
257 /* 3 */
258 Xn1 = *pIn++;
259
260 acc1 = b0 * Xn1 + d1;
261
262 d1 = b1 * Xn1 + d2;
263 d1 += a1 * acc1;
264
265 d2 = b2 * Xn1;
266 d2 += a2 * acc1;
267
268 *pOut++ = acc1;
269
270 /* 4 */
271 Xn1 = *pIn++;
272
273 acc1 = b0 * Xn1 + d1;
274
275 d1 = b1 * Xn1 + d2;
276 d1 += a1 * acc1;
277
278 d2 = b2 * Xn1;
279 d2 += a2 * acc1;
280
281 *pOut++ = acc1;
282
283 /* 5 */
284 Xn1 = *pIn++;
285
286 acc1 = b0 * Xn1 + d1;
287
288 d1 = b1 * Xn1 + d2;
289 d1 += a1 * acc1;
290
291 d2 = b2 * Xn1;
292 d2 += a2 * acc1;
293
294 *pOut++ = acc1;
295
296 /* 6 */
297 Xn1 = *pIn++;
298
299 acc1 = b0 * Xn1 + d1;
300
301 d1 = b1 * Xn1 + d2;
302 d1 += a1 * acc1;
303
304 d2 = b2 * Xn1;
305 d2 += a2 * acc1;
306
307 *pOut++ = acc1;
308
309 /* 7 */
310 Xn1 = *pIn++;
311
312 acc1 = b0 * Xn1 + d1;
313
314 d1 = b1 * Xn1 + d2;
315 d1 += a1 * acc1;
316
317 d2 = b2 * Xn1;
318 d2 += a2 * acc1;
319
320 *pOut++ = acc1;
321
322 /* 8 */
323 Xn1 = *pIn++;
324
325 acc1 = b0 * Xn1 + d1;
326
327 d1 = b1 * Xn1 + d2;
328 d1 += a1 * acc1;
329
330 d2 = b2 * Xn1;
331 d2 += a2 * acc1;
332
333 *pOut++ = acc1;
334
335 /* 9 */
336 Xn1 = *pIn++;
337
338 acc1 = b0 * Xn1 + d1;
339
340 d1 = b1 * Xn1 + d2;
341 d1 += a1 * acc1;
342
343 d2 = b2 * Xn1;
344 d2 += a2 * acc1;
345
346 *pOut++ = acc1;
347
348 /* 10 */
349 Xn1 = *pIn++;
350
351 acc1 = b0 * Xn1 + d1;
352
353 d1 = b1 * Xn1 + d2;
354 d1 += a1 * acc1;
355
356 d2 = b2 * Xn1;
357 d2 += a2 * acc1;
358
359 *pOut++ = acc1;
360
361 /* 11 */
362 Xn1 = *pIn++;
363
364 acc1 = b0 * Xn1 + d1;
365
366 d1 = b1 * Xn1 + d2;
367 d1 += a1 * acc1;
368
369 d2 = b2 * Xn1;
370 d2 += a2 * acc1;
371
372 *pOut++ = acc1;
373
374 /* 12 */
375 Xn1 = *pIn++;
376
377 acc1 = b0 * Xn1 + d1;
378
379 d1 = b1 * Xn1 + d2;
380 d1 += a1 * acc1;
381
382 d2 = b2 * Xn1;
383 d2 += a2 * acc1;
384
385 *pOut++ = acc1;
386
387 /* 13 */
388 Xn1 = *pIn++;
389
390 acc1 = b0 * Xn1 + d1;
391
392 d1 = b1 * Xn1 + d2;
393 d1 += a1 * acc1;
394
395 d2 = b2 * Xn1;
396 d2 += a2 * acc1;
397
398 *pOut++ = acc1;
399
400 /* 14 */
401 Xn1 = *pIn++;
402
403 acc1 = b0 * Xn1 + d1;
404
405 d1 = b1 * Xn1 + d2;
406 d1 += a1 * acc1;
407
408 d2 = b2 * Xn1;
409 d2 += a2 * acc1;
410
411 *pOut++ = acc1;
412
413 /* 15 */
414 Xn1 = *pIn++;
415
416 acc1 = b0 * Xn1 + d1;
417
418 d1 = b1 * Xn1 + d2;
419 d1 += a1 * acc1;
420
421 d2 = b2 * Xn1;
422 d2 += a2 * acc1;
423
424 *pOut++ = acc1;
425
426 /* 16 */
427 Xn1 = *pIn++;
428
429 acc1 = b0 * Xn1 + d1;
430
431 d1 = b1 * Xn1 + d2;
432 d1 += a1 * acc1;
433
434 d2 = b2 * Xn1;
435 d2 += a2 * acc1;
436
437 *pOut++ = acc1;
438
439 /* decrement loop counter */
440 sample--;
441 }
442
443 /* Loop unrolling: Compute remaining outputs */
444 sample = blockSize & 0xFU;
445
446 #else
447
448 /* Initialize blkCnt with number of samples */
449 sample = blockSize;
450
451 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
452
453 while (sample > 0U) {
454 Xn1 = *pIn++;
455
456 acc1 = b0 * Xn1 + d1;
457
458 d1 = b1 * Xn1 + d2;
459 d1 += a1 * acc1;
460
461 d2 = b2 * Xn1;
462 d2 += a2 * acc1;
463
464 *pOut++ = acc1;
465
466 /* decrement loop counter */
467 sample--;
468 }
469
470 /* Store the updated state variables back into the state array */
471 pState[0] = d1;
472 pState[1] = d2;
473
474 pState += 2U;
475
476 /* The current stage output is given as the input to the next stage */
477 pIn = pDst;
478
479 /* Reset the output working pointer */
480 pOut = pDst;
481
482 /* decrement loop counter */
483 stage--;
484
485 } while (stage > 0U);
486
487 }
488 #endif /* #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
489 /**
490 @} end of BiquadCascadeDF2T group
491 */
492
493 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
494