1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df1_f16.c
4  * Description:  Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33   @ingroup groupFilters
34  */
35 
36 
37 /**
38   @addtogroup BiquadCascadeDF1
39   @{
40  */
41 
42 /**
43   @brief         Processing function for the floating-point Biquad cascade filter.
44   @param[in]     S         points to an instance of the floating-point Biquad cascade structure
45   @param[in]     pSrc      points to the block of input data
46   @param[out]    pDst      points to the block of output data
47   @param[in]     blockSize  number of samples to process
48  */
49 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
50 
51 #include "arm_helium_utils.h"
52 
arm_biquad_cascade_df1_f16(const arm_biquad_casd_df1_inst_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)53 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f16(
54   const arm_biquad_casd_df1_inst_f16 * S,
55   const float16_t * pSrc,
56         float16_t * pDst,
57         uint32_t blockSize)
58 {
59     float16_t *pIn = (float16_t *)pSrc;      /*  source pointer            */
60     float16_t *pOut = pDst;     /*  destination pointer       */
61     float16_t *pState = S->pState;  /*  pState pointer            */
62     const float16_t *pCoeffs = S->pCoeffs;    /*  coefficient pointer       */
63     float16_t Xn1, Xn2, Yn1, Yn2;   /*  Filter pState variables   */
64     float16_t X0, X1, X2, X3;   /*  temporary input           */
65     float16_t X4, X5, X6, X7 = 0;   /*  temporary input           */
66     _Float16 lastX, lastY;             /*  X,Y history for tail handling */
67     f16x8_t coeffs;
68     f16x8_t accVec;           /* accumultor vector */
69     uint32_t  sample, stage = S->numStages; /*  loop counters             */
70 
71     do
72     {
73         /*
74          * Reading the pState values
75          */
76         Xn1 = pState[0];
77         Xn2 = pState[1];
78         Yn1 = pState[2];
79         Yn2 = pState[3];
80 
81         sample = blockSize >> 3U;
82 
83         /*
84          * First part of the processing with loop unrolling.  Compute 8 outputs at a time.
85          */
86         while (sample > 0U)
87         {
88             X0 = *pIn++;
89             X1 = *pIn++;
90             X2 = *pIn++;
91             X3 = *pIn++;
92             X4 = *pIn++;
93             X5 = *pIn++;
94             X6 = *pIn++;
95             X7 = *pIn++;
96 
97             coeffs = vld1q(pCoeffs);
98             accVec = vmulq(coeffs, X7);
99 
100             coeffs = vld1q(&pCoeffs[8]);
101             accVec = vfmaq(accVec, coeffs, X6);
102 
103             coeffs = vld1q(&pCoeffs[16]);
104             accVec = vfmaq(accVec, coeffs, X5);
105 
106             coeffs = vld1q(&pCoeffs[24]);
107             accVec = vfmaq(accVec, coeffs, X4);
108 
109             coeffs = vld1q(&pCoeffs[32]);
110             accVec = vfmaq(accVec, coeffs, X3);
111 
112             coeffs = vld1q(&pCoeffs[40]);
113             accVec = vfmaq(accVec, coeffs, X2);
114 
115             coeffs = vld1q(&pCoeffs[48]);
116             accVec = vfmaq(accVec, coeffs, X1);
117 
118             coeffs = vld1q(&pCoeffs[56]);
119             accVec = vfmaq(accVec, coeffs, X0);
120 
121             coeffs = vld1q(&pCoeffs[64]);
122             accVec = vfmaq(accVec, coeffs, Xn1);
123 
124             coeffs = vld1q(&pCoeffs[72]);
125             accVec = vfmaq(accVec, coeffs, Xn2);
126 
127             coeffs = vld1q(&pCoeffs[80]);
128             accVec = vfmaq(accVec, coeffs, Yn1);
129 
130             coeffs = vld1q(&pCoeffs[88]);
131             accVec = vfmaq(accVec, coeffs, Yn2);
132             /*
133              * Store the result in the accumulator in the destination buffer.
134              */
135             vst1q(pOut, accVec);
136             pOut += 8;
137 
138             /*
139              * update recurrence
140              */
141             Xn1 = X7;
142             Xn2 = X6;
143             Yn1 = vgetq_lane(accVec, 7);
144             Yn2 = vgetq_lane(accVec, 6);
145             /*
146              * decrement the loop counter
147              */
148             sample--;
149         }
150 
151         /*
152          * If the blockSize is not a multiple of 8,
153          * compute any remaining output samples here.
154          */
155         sample = blockSize & 0x7U;
156         if (sample)
157         {
158             /* save previous X, Y for modulo 1 length case */
159             lastX = X7;
160             lastY = Yn1;
161 
162             X0 = *pIn++;
163             X1 = *pIn++;
164             X2 = *pIn++;
165             X3 = *pIn++;
166             X4 = *pIn++;
167             X5 = *pIn++;
168             X6 = *pIn++;
169             X7 = *pIn++;
170 
171             coeffs = vld1q(pCoeffs);
172             accVec = vmulq(coeffs, X7);
173 
174             coeffs = vld1q(&pCoeffs[8]);
175             accVec = vfmaq(accVec, coeffs, X6);
176 
177             coeffs = vld1q(&pCoeffs[16]);
178             accVec = vfmaq(accVec, coeffs, X5);
179 
180             coeffs = vld1q(&pCoeffs[24]);
181             accVec = vfmaq(accVec, coeffs, X4);
182 
183             coeffs = vld1q(&pCoeffs[32]);
184             accVec = vfmaq(accVec, coeffs, X3);
185 
186             coeffs = vld1q(&pCoeffs[40]);
187             accVec = vfmaq(accVec, coeffs, X2);
188 
189             coeffs = vld1q(&pCoeffs[48]);
190             accVec = vfmaq(accVec, coeffs, X1);
191 
192             coeffs = vld1q(&pCoeffs[56]);
193             accVec = vfmaq(accVec, coeffs, X0);
194 
195             coeffs = vld1q(&pCoeffs[64]);
196             accVec = vfmaq(accVec, coeffs, Xn1);
197 
198             coeffs = vld1q(&pCoeffs[72]);
199             accVec = vfmaq(accVec, coeffs, Xn2);
200 
201             coeffs = vld1q(&pCoeffs[80]);
202             accVec = vfmaq(accVec, coeffs, Yn1);
203 
204             coeffs = vld1q(&pCoeffs[88]);
205             accVec = vfmaq(accVec, coeffs, Yn2);
206 
207             switch(sample)
208             {
209                case 1:
210                  *pOut++ = vgetq_lane(accVec, 0);
211                   Xn1 = X0;
212                   Xn2 = lastX;
213                   Yn1 = vgetq_lane(accVec, 0);
214                   Yn2 = lastY;
215                break;
216                case 2:
217                  *pOut++ = vgetq_lane(accVec, 0);
218                  *pOut++ = vgetq_lane(accVec, 1);
219                  Xn1 = X1;
220                  Xn2 = X0;
221                  Yn1 = vgetq_lane(accVec, 1);
222                  Yn2 = vgetq_lane(accVec, 0);
223                break;
224                case 3:
225                 *pOut++ = vgetq_lane(accVec, 0);
226                 *pOut++ = vgetq_lane(accVec, 1);
227                 *pOut++ = vgetq_lane(accVec, 2);
228                 Xn1 = X2;
229                 Xn2 = X1;
230                 Yn1 = vgetq_lane(accVec, 2);
231                 Yn2 = vgetq_lane(accVec, 1);
232                break;
233 
234                case 4:
235                 *pOut++ = vgetq_lane(accVec, 0);
236                 *pOut++ = vgetq_lane(accVec, 1);
237                 *pOut++ = vgetq_lane(accVec, 2);
238                 *pOut++ = vgetq_lane(accVec, 3);
239                 Xn1 = X3;
240                 Xn2 = X2;
241                 Yn1 = vgetq_lane(accVec, 3);
242                 Yn2 = vgetq_lane(accVec, 2);
243                break;
244 
245                case 5:
246                 *pOut++ = vgetq_lane(accVec, 0);
247                 *pOut++ = vgetq_lane(accVec, 1);
248                 *pOut++ = vgetq_lane(accVec, 2);
249                 *pOut++ = vgetq_lane(accVec, 3);
250                 *pOut++ = vgetq_lane(accVec, 4);
251                 Xn1 = X4;
252                 Xn2 = X3;
253                 Yn1 = vgetq_lane(accVec, 4);
254                 Yn2 = vgetq_lane(accVec, 3);
255                break;
256 
257                case 6:
258                 *pOut++ = vgetq_lane(accVec, 0);
259                 *pOut++ = vgetq_lane(accVec, 1);
260                 *pOut++ = vgetq_lane(accVec, 2);
261                 *pOut++ = vgetq_lane(accVec, 3);
262                 *pOut++ = vgetq_lane(accVec, 4);
263                 *pOut++ = vgetq_lane(accVec, 5);
264                 Xn1 = X5;
265                 Xn2 = X4;
266                 Yn1 = vgetq_lane(accVec, 5);
267                 Yn2 = vgetq_lane(accVec, 4);
268                break;
269 
270                case 7:
271                 *pOut++ = vgetq_lane(accVec, 0);
272                 *pOut++ = vgetq_lane(accVec, 1);
273                 *pOut++ = vgetq_lane(accVec, 2);
274                 *pOut++ = vgetq_lane(accVec, 3);
275                 *pOut++ = vgetq_lane(accVec, 4);
276                 *pOut++ = vgetq_lane(accVec, 5);
277                 *pOut++ = vgetq_lane(accVec, 6);
278                 Xn1 = X6;
279                 Xn2 = X5;
280                 Yn1 = vgetq_lane(accVec, 6);
281                 Yn2 = vgetq_lane(accVec, 5);
282                break;
283             }
284         }
285         /*
286          * Store the updated state variables back into the pState array
287          */
288         *pState++ = Xn1;
289         *pState++ = Xn2;
290         *pState++ = Yn1;
291         *pState++ = Yn2;
292 
293         pCoeffs += sizeof(arm_biquad_mod_coef_f16) / sizeof(float16_t);
294         /*
295          * The first stage goes from the input buffer to the output buffer.
296          * Subsequent numStages  occur in-place in the output buffer
297          */
298         pIn = pDst;
299         /*
300          * Reset the output pointer
301          */
302         pOut = pDst;
303         /*
304          * decrement the loop counter
305          */
306         stage--;
307     }
308     while (stage > 0U);
309 }
310 
311 #else
arm_biquad_cascade_df1_f16(const arm_biquad_casd_df1_inst_f16 * S,const float16_t * pSrc,float16_t * pDst,uint32_t blockSize)312 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_f16(
313   const arm_biquad_casd_df1_inst_f16 * S,
314   const float16_t * pSrc,
315         float16_t * pDst,
316         uint32_t blockSize)
317 {
318   const float16_t *pIn = pSrc;                         /* Source pointer */
319         float16_t *pOut = pDst;                        /* Destination pointer */
320         float16_t *pState = S->pState;                 /* pState pointer */
321   const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
322         _Float16 acc;                                 /* Accumulator */
323         _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
324         _Float16 Xn1, Xn2, Yn1, Yn2;                  /* Filter pState variables */
325         _Float16 Xn;                                  /* Temporary input */
326         uint32_t sample, stage = S->numStages;         /* Loop counters */
327 
328   do
329   {
330     /* Reading the coefficients */
331     b0 = *pCoeffs++;
332     b1 = *pCoeffs++;
333     b2 = *pCoeffs++;
334     a1 = *pCoeffs++;
335     a2 = *pCoeffs++;
336 
337     /* Reading the pState values */
338     Xn1 = pState[0];
339     Xn2 = pState[1];
340     Yn1 = pState[2];
341     Yn2 = pState[3];
342 
343 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
344 
345     /* Apply loop unrolling and compute 4 output values simultaneously. */
346     /* Variable acc hold output values that are being computed:
347      *
348      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
349      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
350      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
351      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
352      */
353 
354     /* Loop unrolling: Compute 4 outputs at a time */
355     sample = blockSize >> 2U;
356 
357     while (sample > 0U)
358     {
359       /* Read the first input */
360       Xn = *pIn++;
361 
362       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
363       Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
364 
365       /* Store output in destination buffer. */
366       *pOut++ = Yn2;
367 
368       /* Every time after the output is computed state should be updated. */
369       /* The states should be updated as: */
370       /* Xn2 = Xn1 */
371       /* Xn1 = Xn  */
372       /* Yn2 = Yn1 */
373       /* Yn1 = acc */
374 
375       /* Read the second input */
376       Xn2 = *pIn++;
377 
378       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
379       Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
380 
381       /* Store output in destination buffer. */
382       *pOut++ = Yn1;
383 
384       /* Every time after the output is computed state should be updated. */
385       /* The states should be updated as: */
386       /* Xn2 = Xn1 */
387       /* Xn1 = Xn  */
388       /* Yn2 = Yn1 */
389       /* Yn1 = acc */
390 
391       /* Read the third input */
392       Xn1 = *pIn++;
393 
394       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
395       Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
396 
397       /* Store output in destination buffer. */
398       *pOut++ = Yn2;
399 
400       /* Every time after the output is computed state should be updated. */
401       /* The states should be updated as: */
402       /* Xn2 = Xn1 */
403       /* Xn1 = Xn  */
404       /* Yn2 = Yn1 */
405       /* Yn1 = acc */
406 
407       /* Read the forth input */
408       Xn = *pIn++;
409 
410       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
411       Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
412 
413       /* Store output in destination buffer. */
414       *pOut++ = Yn1;
415 
416       /* Every time after the output is computed state should be updated. */
417       /* The states should be updated as: */
418       /* Xn2 = Xn1 */
419       /* Xn1 = Xn  */
420       /* Yn2 = Yn1 */
421       /* Yn1 = acc */
422       Xn2 = Xn1;
423       Xn1 = Xn;
424 
425       /* decrement loop counter */
426       sample--;
427     }
428 
429     /* Loop unrolling: Compute remaining outputs */
430     sample = blockSize & 0x3U;
431 
432 #else
433 
434     /* Initialize blkCnt with number of samples */
435     sample = blockSize;
436 
437 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
438 
439     while (sample > 0U)
440     {
441       /* Read the input */
442       Xn = *pIn++;
443 
444       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
445       acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
446 
447       /* Store output in destination buffer. */
448       *pOut++ = acc;
449 
450       /* Every time after the output is computed state should be updated. */
451       /* The states should be updated as: */
452       /* Xn2 = Xn1 */
453       /* Xn1 = Xn  */
454       /* Yn2 = Yn1 */
455       /* Yn1 = acc */
456       Xn2 = Xn1;
457       Xn1 = Xn;
458       Yn2 = Yn1;
459       Yn1 = acc;
460 
461       /* decrement loop counter */
462       sample--;
463     }
464 
465     /* Store the updated state variables back into the pState array */
466     *pState++ = Xn1;
467     *pState++ = Xn2;
468     *pState++ = Yn1;
469     *pState++ = Yn2;
470 
471     /* The first stage goes from the input buffer to the output buffer. */
472     /* Subsequent numStages occur in-place in the output buffer */
473     pIn = pDst;
474 
475     /* Reset output pointer */
476     pOut = pDst;
477 
478     /* decrement loop counter */
479     stage--;
480 
481   } while (stage > 0U);
482 
483 }
484 
485 /**
486   @} end of BiquadCascadeDF1 group
487  */
488 #endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
489 
490 #endif /*#if defined(ARM_FLOAT16_SUPPORTED)*/
491