1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df2T_f16.c
4  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33   @ingroup groupFilters
34 */
35 
36 /**
37   @addtogroup BiquadCascadeDF2T
38   @{
39  */
40 
41 /**
42   @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
43   @param[in]     S         points to an instance of the filter data structure
44   @param[in]     pSrc      points to the block of input data
45   @param[out]    pDst      points to the block of output data
46   @param[in]     blockSize number of samples to process
47  */
48 
49 #if (defined(ARM_MATH_MVE_FLOAT16) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_biquad_cascade_df2T_f16(const arm_biquad_cascade_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)50 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f16(
51   const arm_biquad_cascade_df2T_instance_f16 * S,
52   const float16_t * pSrc,
53         float16_t * pDst,
54         uint32_t blockSize)
55 {
56     float16_t *pIn = (float16_t *)pSrc;                  /*  source pointer            */
57     float16_t Xn0, Xn1;
58     float16_t acc0, acc1;
59     float16_t *pOut = pDst;                 /*  destination pointer       */
60     float16_t *pState = S->pState;          /*  State pointer             */
61     uint32_t  sample, stage = S->numStages; /*  loop counters             */
62     float16_t const *pCurCoeffs =          /*  coefficient pointer       */
63                 (float16_t const *) S->pCoeffs;
64     f16x8_t b0Coeffs, a0Coeffs;           /*  Coefficients vector       */
65     f16x8_t b1Coeffs, a1Coeffs;           /*  Modified coef. vector     */
66     f16x8_t state;                        /*  State vector              */
67 
68     do
69     {
70         /*
71          * temporary carry variable for feeding the 128-bit vector shifter
72          */
73         uint32_t  tmp = 0;
74         /*
75          * Reading the coefficients
76          * b0Coeffs = {b0, b1, b2, x, x, x, x, x}
77          * a0Coeffs = { x, a1, a2, x, x, x, x, x}
78          */
79         b0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs += 2;
80         a0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs += 3;
81         /*
82          * Reading the state values
83          * state = {d1, d2, 0, 0, x, x, x, x}
84          */
85         state = *(f16x8_t *) pState;
86         state = vsetq_lane((float16_t)0.0, state, 2);
87         state = vsetq_lane((float16_t)0.0, state, 3);
88 
89         /* b1Coeffs = {0, b0, b1, b2, x, x, x, x} */
90         /* b1Coeffs = { x, x, a1, a2, x, x, x, x} */
91         b1Coeffs = (f16x8_t)vshlcq_s16((int16x8_t)b0Coeffs, &tmp, 16);
92         a1Coeffs = (f16x8_t)vshlcq_s16((int16x8_t)a0Coeffs, &tmp, 16);
93 
94         sample = blockSize / 2;
95 
96         /* unrolled 2 x */
97         while (sample > 0U)
98         {
99             /*
100              * Read 2 inputs
101              */
102             Xn0 = *pIn++;
103             Xn1 = *pIn++;
104 
105             /*
106              * 1st half:
107              * / acc1 \   / b0 \         / d1 \   / 0  \
108              * |  d1  |   | b1 |         | d2 |   | a1 |
109              * |  d2  |   | b2 |         | 0  |   | a2 |
110              * |  x   | = | x  | * Xn1 + | x  | + | x  | x acc1
111              *   ...       ...            ...      ...
112              * \  x   /   \ x  /         \ x  /   \ x  /
113              */
114 
115             state = vfmaq(state, b0Coeffs, Xn0);
116             acc0 = vgetq_lane(state, 0);
117             state = vfmaq(state, a0Coeffs, acc0);
118             state = vsetq_lane((float16_t)0.0, state, 3);
119 
120             /*
121              * 2nd half:
122              * same as 1st half, but all vector elements shifted down.
123              * /  x   \   / x  \         / x  \   / x  \
124              * | acc1 |   | b0 |         | d1 |   | 0  |
125              * |  d1  |   | b1 |         | d2 |   | a1 |
126              * |  d2  |   | b2 |         | 0  |   | a2 |
127              * |  x   | = | x  | * Xn1 + | x  | + | x  | x acc1
128              *   ...       ...            ...      ...
129              * \  x   /   \ x  /         \ x  /   \ x  /
130              */
131 
132             state = vfmaq(state, b1Coeffs, Xn1);
133             acc1 = vgetq_lane(state, 1);
134             state = vfmaq(state, a1Coeffs, acc1);
135 
136             /* move d1, d2 up + clearing */
137             /* expect dual move or long move */
138             state = vsetq_lane(vgetq_lane(state, 2), state, 0);
139             state = vsetq_lane(vgetq_lane(state, 3), state, 1);
140             state = vsetq_lane((float16_t)0.0, state, 2);
141             /*
142              * Store the results in the destination buffer.
143              */
144             *pOut++ = acc0;
145             *pOut++ = acc1;
146             /*
147              * decrement the loop counter
148              */
149             sample--;
150         }
151 
152         /* compiler does not come back when enabled */
153         /*
154          * tail handling
155          */
156         if (blockSize & 1)
157         {
158             Xn0 = *pIn++;
159             state = vfmaq_n_f16(state, b0Coeffs, Xn0);
160             acc0 = vgetq_lane(state, 0);
161 
162             state = vfmaq_n_f16(state, a0Coeffs, acc0);
163             *pOut++ = acc0;
164             *pState++ = vgetq_lane(state, 1);
165             *pState++ = vgetq_lane(state, 2);
166         }
167         else
168         {
169             *pState++ = vgetq_lane(state, 0);
170             *pState++ = vgetq_lane(state, 1);
171         }
172         /*
173          * The current stage input is given as the output to the next stage
174          */
175         pIn = pDst;
176         /*
177          * Reset the output working pointer
178          */
179         pOut = pDst;
180         /*
181          * decrement the loop counter
182          */
183         stage--;
184     }
185     while (stage > 0U);
186 }
187 #else
188 
arm_biquad_cascade_df2T_f16(const arm_biquad_cascade_df2T_instance_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)189 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f16(
190   const arm_biquad_cascade_df2T_instance_f16 * S,
191   const float16_t * pSrc,
192         float16_t * pDst,
193         uint32_t blockSize)
194 {
195   const float16_t *pIn = pSrc;                         /* Source pointer */
196         float16_t *pOut = pDst;                        /* Destination pointer */
197         float16_t *pState = S->pState;                 /* State pointer */
198   const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
199         _Float16 acc1;                                /* Accumulator */
200         _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
201         _Float16 Xn1;                                 /* Temporary input */
202         _Float16 d1, d2;                              /* State variables */
203         uint32_t sample, stage = S->numStages;         /* Loop counters */
204 
205   do
206   {
207      /* Reading the coefficients */
208      b0 = pCoeffs[0];
209      b1 = pCoeffs[1];
210      b2 = pCoeffs[2];
211      a1 = pCoeffs[3];
212      a2 = pCoeffs[4];
213 
214      /* Reading the state values */
215      d1 = pState[0];
216      d2 = pState[1];
217 
218      pCoeffs += 5U;
219 
220 #if defined (ARM_MATH_LOOPUNROLL)
221 
222      /* Loop unrolling: Compute 16 outputs at a time */
223      sample = blockSize >> 4U;
224 
225      while (sample > 0U) {
226 
227        /* y[n] = b0 * x[n] + d1 */
228        /* d1 = b1 * x[n] + a1 * y[n] + d2 */
229        /* d2 = b2 * x[n] + a2 * y[n] */
230 
231 /*  1 */
232        Xn1 = *pIn++;
233 
234        acc1 = b0 * Xn1 + d1;
235 
236        d1 = b1 * Xn1 + d2;
237        d1 += a1 * acc1;
238 
239        d2 = b2 * Xn1;
240        d2 += a2 * acc1;
241 
242        *pOut++ = acc1;
243 
244 /*  2 */
245          Xn1 = *pIn++;
246 
247         acc1 = b0 * Xn1 + d1;
248 
249         d1 = b1 * Xn1 + d2;
250         d1 += a1 * acc1;
251 
252         d2 = b2 * Xn1;
253         d2 += a2 * acc1;
254 
255         *pOut++ = acc1;
256 
257 /*  3 */
258          Xn1 = *pIn++;
259 
260         acc1 = b0 * Xn1 + d1;
261 
262         d1 = b1 * Xn1 + d2;
263         d1 += a1 * acc1;
264 
265         d2 = b2 * Xn1;
266         d2 += a2 * acc1;
267 
268         *pOut++ = acc1;
269 
270 /*  4 */
271          Xn1 = *pIn++;
272 
273         acc1 = b0 * Xn1 + d1;
274 
275         d1 = b1 * Xn1 + d2;
276         d1 += a1 * acc1;
277 
278         d2 = b2 * Xn1;
279         d2 += a2 * acc1;
280 
281         *pOut++ = acc1;
282 
283 /*  5 */
284          Xn1 = *pIn++;
285 
286         acc1 = b0 * Xn1 + d1;
287 
288         d1 = b1 * Xn1 + d2;
289         d1 += a1 * acc1;
290 
291         d2 = b2 * Xn1;
292         d2 += a2 * acc1;
293 
294         *pOut++ = acc1;
295 
296 /*  6 */
297          Xn1 = *pIn++;
298 
299         acc1 = b0 * Xn1 + d1;
300 
301         d1 = b1 * Xn1 + d2;
302         d1 += a1 * acc1;
303 
304         d2 = b2 * Xn1;
305         d2 += a2 * acc1;
306 
307         *pOut++ = acc1;
308 
309 /*  7 */
310          Xn1 = *pIn++;
311 
312         acc1 = b0 * Xn1 + d1;
313 
314         d1 = b1 * Xn1 + d2;
315         d1 += a1 * acc1;
316 
317         d2 = b2 * Xn1;
318         d2 += a2 * acc1;
319 
320         *pOut++ = acc1;
321 
322 /*  8 */
323          Xn1 = *pIn++;
324 
325         acc1 = b0 * Xn1 + d1;
326 
327         d1 = b1 * Xn1 + d2;
328         d1 += a1 * acc1;
329 
330         d2 = b2 * Xn1;
331         d2 += a2 * acc1;
332 
333         *pOut++ = acc1;
334 
335 /*  9 */
336          Xn1 = *pIn++;
337 
338         acc1 = b0 * Xn1 + d1;
339 
340         d1 = b1 * Xn1 + d2;
341         d1 += a1 * acc1;
342 
343         d2 = b2 * Xn1;
344         d2 += a2 * acc1;
345 
346         *pOut++ = acc1;
347 
348 /* 10 */
349          Xn1 = *pIn++;
350 
351         acc1 = b0 * Xn1 + d1;
352 
353         d1 = b1 * Xn1 + d2;
354         d1 += a1 * acc1;
355 
356         d2 = b2 * Xn1;
357         d2 += a2 * acc1;
358 
359         *pOut++ = acc1;
360 
361 /* 11 */
362          Xn1 = *pIn++;
363 
364         acc1 = b0 * Xn1 + d1;
365 
366         d1 = b1 * Xn1 + d2;
367         d1 += a1 * acc1;
368 
369         d2 = b2 * Xn1;
370         d2 += a2 * acc1;
371 
372         *pOut++ = acc1;
373 
374 /* 12 */
375          Xn1 = *pIn++;
376 
377         acc1 = b0 * Xn1 + d1;
378 
379         d1 = b1 * Xn1 + d2;
380         d1 += a1 * acc1;
381 
382         d2 = b2 * Xn1;
383         d2 += a2 * acc1;
384 
385         *pOut++ = acc1;
386 
387 /* 13 */
388          Xn1 = *pIn++;
389 
390         acc1 = b0 * Xn1 + d1;
391 
392         d1 = b1 * Xn1 + d2;
393         d1 += a1 * acc1;
394 
395         d2 = b2 * Xn1;
396         d2 += a2 * acc1;
397 
398         *pOut++ = acc1;
399 
400 /* 14 */
401          Xn1 = *pIn++;
402 
403         acc1 = b0 * Xn1 + d1;
404 
405         d1 = b1 * Xn1 + d2;
406         d1 += a1 * acc1;
407 
408         d2 = b2 * Xn1;
409         d2 += a2 * acc1;
410 
411         *pOut++ = acc1;
412 
413 /* 15 */
414          Xn1 = *pIn++;
415 
416         acc1 = b0 * Xn1 + d1;
417 
418         d1 = b1 * Xn1 + d2;
419         d1 += a1 * acc1;
420 
421         d2 = b2 * Xn1;
422         d2 += a2 * acc1;
423 
424         *pOut++ = acc1;
425 
426 /* 16 */
427          Xn1 = *pIn++;
428 
429         acc1 = b0 * Xn1 + d1;
430 
431         d1 = b1 * Xn1 + d2;
432         d1 += a1 * acc1;
433 
434         d2 = b2 * Xn1;
435         d2 += a2 * acc1;
436 
437         *pOut++ = acc1;
438 
439         /* decrement loop counter */
440         sample--;
441       }
442 
443       /* Loop unrolling: Compute remaining outputs */
444       sample = blockSize & 0xFU;
445 
446 #else
447 
448       /* Initialize blkCnt with number of samples */
449       sample = blockSize;
450 
451 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
452 
453       while (sample > 0U) {
454         Xn1 = *pIn++;
455 
456         acc1 = b0 * Xn1 + d1;
457 
458         d1 = b1 * Xn1 + d2;
459         d1 += a1 * acc1;
460 
461         d2 = b2 * Xn1;
462         d2 += a2 * acc1;
463 
464         *pOut++ = acc1;
465 
466         /* decrement loop counter */
467         sample--;
468       }
469 
470       /* Store the updated state variables back into the state array */
471       pState[0] = d1;
472       pState[1] = d2;
473 
474       pState += 2U;
475 
476       /* The current stage output is given as the input to the next stage */
477       pIn = pDst;
478 
479       /* Reset the output working pointer */
480       pOut = pDst;
481 
482       /* decrement loop counter */
483       stage--;
484 
485    } while (stage > 0U);
486 
487 }
488 #endif /* #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
489 /**
490   @} end of BiquadCascadeDF2T group
491  */
492 
493 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
494