1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df1_f16.c
4 * Description: Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33 @ingroup groupFilters
34 */
35
36
37 /**
38 @addtogroup BiquadCascadeDF1
39 @{
40 */
41
42 /**
43 @brief Processing function for the floating-point Biquad cascade filter.
44 @param[in] S points to an instance of the floating-point Biquad cascade structure
45 @param[in] pSrc points to the block of input data
46 @param[out] pDst points to the block of output data
47 @param[in] blockSize number of samples to process
48 */
49 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
50
51 #include "arm_helium_utils.h"
52
arm_biquad_cascade_df1_f16(const arm_biquad_casd_df1_inst_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)53 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f16(
54 const arm_biquad_casd_df1_inst_f16 * S,
55 const float16_t * pSrc,
56 float16_t * pDst,
57 uint32_t blockSize)
58 {
59 float16_t *pIn = (float16_t *)pSrc; /* source pointer */
60 float16_t *pOut = pDst; /* destination pointer */
61 float16_t *pState = S->pState; /* pState pointer */
62 const float16_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
63 float16_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
64 float16_t X0, X1, X2, X3; /* temporary input */
65 float16_t X4, X5, X6, X7 = 0; /* temporary input */
66 _Float16 lastX, lastY; /* X,Y history for tail handling */
67 f16x8_t coeffs;
68 f16x8_t accVec; /* accumultor vector */
69 uint32_t sample, stage = S->numStages; /* loop counters */
70
71 do
72 {
73 /*
74 * Reading the pState values
75 */
76 Xn1 = pState[0];
77 Xn2 = pState[1];
78 Yn1 = pState[2];
79 Yn2 = pState[3];
80
81 sample = blockSize >> 3U;
82
83 /*
84 * First part of the processing with loop unrolling. Compute 8 outputs at a time.
85 */
86 while (sample > 0U)
87 {
88 X0 = *pIn++;
89 X1 = *pIn++;
90 X2 = *pIn++;
91 X3 = *pIn++;
92 X4 = *pIn++;
93 X5 = *pIn++;
94 X6 = *pIn++;
95 X7 = *pIn++;
96
97 coeffs = vld1q(pCoeffs);
98 accVec = vmulq(coeffs, X7);
99
100 coeffs = vld1q(&pCoeffs[8]);
101 accVec = vfmaq(accVec, coeffs, X6);
102
103 coeffs = vld1q(&pCoeffs[16]);
104 accVec = vfmaq(accVec, coeffs, X5);
105
106 coeffs = vld1q(&pCoeffs[24]);
107 accVec = vfmaq(accVec, coeffs, X4);
108
109 coeffs = vld1q(&pCoeffs[32]);
110 accVec = vfmaq(accVec, coeffs, X3);
111
112 coeffs = vld1q(&pCoeffs[40]);
113 accVec = vfmaq(accVec, coeffs, X2);
114
115 coeffs = vld1q(&pCoeffs[48]);
116 accVec = vfmaq(accVec, coeffs, X1);
117
118 coeffs = vld1q(&pCoeffs[56]);
119 accVec = vfmaq(accVec, coeffs, X0);
120
121 coeffs = vld1q(&pCoeffs[64]);
122 accVec = vfmaq(accVec, coeffs, Xn1);
123
124 coeffs = vld1q(&pCoeffs[72]);
125 accVec = vfmaq(accVec, coeffs, Xn2);
126
127 coeffs = vld1q(&pCoeffs[80]);
128 accVec = vfmaq(accVec, coeffs, Yn1);
129
130 coeffs = vld1q(&pCoeffs[88]);
131 accVec = vfmaq(accVec, coeffs, Yn2);
132 /*
133 * Store the result in the accumulator in the destination buffer.
134 */
135 vst1q(pOut, accVec);
136 pOut += 8;
137
138 /*
139 * update recurrence
140 */
141 Xn1 = X7;
142 Xn2 = X6;
143 Yn1 = vgetq_lane(accVec, 7);
144 Yn2 = vgetq_lane(accVec, 6);
145 /*
146 * decrement the loop counter
147 */
148 sample--;
149 }
150
151 /*
152 * If the blockSize is not a multiple of 8,
153 * compute any remaining output samples here.
154 */
155 sample = blockSize & 0x7U;
156 if (sample)
157 {
158 /* save previous X, Y for modulo 1 length case */
159 lastX = X7;
160 lastY = Yn1;
161
162 X0 = *pIn++;
163 X1 = *pIn++;
164 X2 = *pIn++;
165 X3 = *pIn++;
166 X4 = *pIn++;
167 X5 = *pIn++;
168 X6 = *pIn++;
169 X7 = *pIn++;
170
171 coeffs = vld1q(pCoeffs);
172 accVec = vmulq(coeffs, X7);
173
174 coeffs = vld1q(&pCoeffs[8]);
175 accVec = vfmaq(accVec, coeffs, X6);
176
177 coeffs = vld1q(&pCoeffs[16]);
178 accVec = vfmaq(accVec, coeffs, X5);
179
180 coeffs = vld1q(&pCoeffs[24]);
181 accVec = vfmaq(accVec, coeffs, X4);
182
183 coeffs = vld1q(&pCoeffs[32]);
184 accVec = vfmaq(accVec, coeffs, X3);
185
186 coeffs = vld1q(&pCoeffs[40]);
187 accVec = vfmaq(accVec, coeffs, X2);
188
189 coeffs = vld1q(&pCoeffs[48]);
190 accVec = vfmaq(accVec, coeffs, X1);
191
192 coeffs = vld1q(&pCoeffs[56]);
193 accVec = vfmaq(accVec, coeffs, X0);
194
195 coeffs = vld1q(&pCoeffs[64]);
196 accVec = vfmaq(accVec, coeffs, Xn1);
197
198 coeffs = vld1q(&pCoeffs[72]);
199 accVec = vfmaq(accVec, coeffs, Xn2);
200
201 coeffs = vld1q(&pCoeffs[80]);
202 accVec = vfmaq(accVec, coeffs, Yn1);
203
204 coeffs = vld1q(&pCoeffs[88]);
205 accVec = vfmaq(accVec, coeffs, Yn2);
206
207 switch(sample)
208 {
209 case 1:
210 *pOut++ = vgetq_lane(accVec, 0);
211 Xn1 = X0;
212 Xn2 = lastX;
213 Yn1 = vgetq_lane(accVec, 0);
214 Yn2 = lastY;
215 break;
216 case 2:
217 *pOut++ = vgetq_lane(accVec, 0);
218 *pOut++ = vgetq_lane(accVec, 1);
219 Xn1 = X1;
220 Xn2 = X0;
221 Yn1 = vgetq_lane(accVec, 1);
222 Yn2 = vgetq_lane(accVec, 0);
223 break;
224 case 3:
225 *pOut++ = vgetq_lane(accVec, 0);
226 *pOut++ = vgetq_lane(accVec, 1);
227 *pOut++ = vgetq_lane(accVec, 2);
228 Xn1 = X2;
229 Xn2 = X1;
230 Yn1 = vgetq_lane(accVec, 2);
231 Yn2 = vgetq_lane(accVec, 1);
232 break;
233
234 case 4:
235 *pOut++ = vgetq_lane(accVec, 0);
236 *pOut++ = vgetq_lane(accVec, 1);
237 *pOut++ = vgetq_lane(accVec, 2);
238 *pOut++ = vgetq_lane(accVec, 3);
239 Xn1 = X3;
240 Xn2 = X2;
241 Yn1 = vgetq_lane(accVec, 3);
242 Yn2 = vgetq_lane(accVec, 2);
243 break;
244
245 case 5:
246 *pOut++ = vgetq_lane(accVec, 0);
247 *pOut++ = vgetq_lane(accVec, 1);
248 *pOut++ = vgetq_lane(accVec, 2);
249 *pOut++ = vgetq_lane(accVec, 3);
250 *pOut++ = vgetq_lane(accVec, 4);
251 Xn1 = X4;
252 Xn2 = X3;
253 Yn1 = vgetq_lane(accVec, 4);
254 Yn2 = vgetq_lane(accVec, 3);
255 break;
256
257 case 6:
258 *pOut++ = vgetq_lane(accVec, 0);
259 *pOut++ = vgetq_lane(accVec, 1);
260 *pOut++ = vgetq_lane(accVec, 2);
261 *pOut++ = vgetq_lane(accVec, 3);
262 *pOut++ = vgetq_lane(accVec, 4);
263 *pOut++ = vgetq_lane(accVec, 5);
264 Xn1 = X5;
265 Xn2 = X4;
266 Yn1 = vgetq_lane(accVec, 5);
267 Yn2 = vgetq_lane(accVec, 4);
268 break;
269
270 case 7:
271 *pOut++ = vgetq_lane(accVec, 0);
272 *pOut++ = vgetq_lane(accVec, 1);
273 *pOut++ = vgetq_lane(accVec, 2);
274 *pOut++ = vgetq_lane(accVec, 3);
275 *pOut++ = vgetq_lane(accVec, 4);
276 *pOut++ = vgetq_lane(accVec, 5);
277 *pOut++ = vgetq_lane(accVec, 6);
278 Xn1 = X6;
279 Xn2 = X5;
280 Yn1 = vgetq_lane(accVec, 6);
281 Yn2 = vgetq_lane(accVec, 5);
282 break;
283 }
284 }
285 /*
286 * Store the updated state variables back into the pState array
287 */
288 *pState++ = Xn1;
289 *pState++ = Xn2;
290 *pState++ = Yn1;
291 *pState++ = Yn2;
292
293 pCoeffs += sizeof(arm_biquad_mod_coef_f16) / sizeof(float16_t);
294 /*
295 * The first stage goes from the input buffer to the output buffer.
296 * Subsequent numStages occur in-place in the output buffer
297 */
298 pIn = pDst;
299 /*
300 * Reset the output pointer
301 */
302 pOut = pDst;
303 /*
304 * decrement the loop counter
305 */
306 stage--;
307 }
308 while (stage > 0U);
309 }
310
311 #else
arm_biquad_cascade_df1_f16(const arm_biquad_casd_df1_inst_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)312 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f16(
313 const arm_biquad_casd_df1_inst_f16 * S,
314 const float16_t * pSrc,
315 float16_t * pDst,
316 uint32_t blockSize)
317 {
318 const float16_t *pIn = pSrc; /* Source pointer */
319 float16_t *pOut = pDst; /* Destination pointer */
320 float16_t *pState = S->pState; /* pState pointer */
321 const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
322 _Float16 acc; /* Accumulator */
323 _Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
324 _Float16 Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
325 _Float16 Xn; /* Temporary input */
326 uint32_t sample, stage = S->numStages; /* Loop counters */
327
328 do
329 {
330 /* Reading the coefficients */
331 b0 = *pCoeffs++;
332 b1 = *pCoeffs++;
333 b2 = *pCoeffs++;
334 a1 = *pCoeffs++;
335 a2 = *pCoeffs++;
336
337 /* Reading the pState values */
338 Xn1 = pState[0];
339 Xn2 = pState[1];
340 Yn1 = pState[2];
341 Yn2 = pState[3];
342
343 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
344
345 /* Apply loop unrolling and compute 4 output values simultaneously. */
346 /* Variable acc hold output values that are being computed:
347 *
348 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
349 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
350 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
351 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
352 */
353
354 /* Loop unrolling: Compute 4 outputs at a time */
355 sample = blockSize >> 2U;
356
357 while (sample > 0U)
358 {
359 /* Read the first input */
360 Xn = *pIn++;
361
362 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
363 Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
364
365 /* Store output in destination buffer. */
366 *pOut++ = Yn2;
367
368 /* Every time after the output is computed state should be updated. */
369 /* The states should be updated as: */
370 /* Xn2 = Xn1 */
371 /* Xn1 = Xn */
372 /* Yn2 = Yn1 */
373 /* Yn1 = acc */
374
375 /* Read the second input */
376 Xn2 = *pIn++;
377
378 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
379 Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
380
381 /* Store output in destination buffer. */
382 *pOut++ = Yn1;
383
384 /* Every time after the output is computed state should be updated. */
385 /* The states should be updated as: */
386 /* Xn2 = Xn1 */
387 /* Xn1 = Xn */
388 /* Yn2 = Yn1 */
389 /* Yn1 = acc */
390
391 /* Read the third input */
392 Xn1 = *pIn++;
393
394 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
395 Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
396
397 /* Store output in destination buffer. */
398 *pOut++ = Yn2;
399
400 /* Every time after the output is computed state should be updated. */
401 /* The states should be updated as: */
402 /* Xn2 = Xn1 */
403 /* Xn1 = Xn */
404 /* Yn2 = Yn1 */
405 /* Yn1 = acc */
406
407 /* Read the forth input */
408 Xn = *pIn++;
409
410 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
411 Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
412
413 /* Store output in destination buffer. */
414 *pOut++ = Yn1;
415
416 /* Every time after the output is computed state should be updated. */
417 /* The states should be updated as: */
418 /* Xn2 = Xn1 */
419 /* Xn1 = Xn */
420 /* Yn2 = Yn1 */
421 /* Yn1 = acc */
422 Xn2 = Xn1;
423 Xn1 = Xn;
424
425 /* decrement loop counter */
426 sample--;
427 }
428
429 /* Loop unrolling: Compute remaining outputs */
430 sample = blockSize & 0x3U;
431
432 #else
433
434 /* Initialize blkCnt with number of samples */
435 sample = blockSize;
436
437 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
438
439 while (sample > 0U)
440 {
441 /* Read the input */
442 Xn = *pIn++;
443
444 /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
445 acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
446
447 /* Store output in destination buffer. */
448 *pOut++ = acc;
449
450 /* Every time after the output is computed state should be updated. */
451 /* The states should be updated as: */
452 /* Xn2 = Xn1 */
453 /* Xn1 = Xn */
454 /* Yn2 = Yn1 */
455 /* Yn1 = acc */
456 Xn2 = Xn1;
457 Xn1 = Xn;
458 Yn2 = Yn1;
459 Yn1 = acc;
460
461 /* decrement loop counter */
462 sample--;
463 }
464
465 /* Store the updated state variables back into the pState array */
466 *pState++ = Xn1;
467 *pState++ = Xn2;
468 *pState++ = Yn1;
469 *pState++ = Yn2;
470
471 /* The first stage goes from the input buffer to the output buffer. */
472 /* Subsequent numStages occur in-place in the output buffer */
473 pIn = pDst;
474
475 /* Reset output pointer */
476 pOut = pDst;
477
478 /* decrement loop counter */
479 stage--;
480
481 } while (stage > 0U);
482
483 }
484
485 /**
486 @} end of BiquadCascadeDF1 group
487 */
488 #endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
489
490 #endif /*#if defined(ARM_FLOAT16_SUPPORTED)*/
491