1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_biquad_cascade_df1_fast_q15.c
4  * Description:  Fast processing function for the Q15 Biquad cascade filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup BiquadCascadeDF1
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 Biquad cascade filter (fast variant).
42   @param[in]     S         points to an instance of the Q15 Biquad cascade structure
43   @param[in]     pSrc      points to the block of input data
44   @param[out]    pDst      points to the block of output data
45   @param[in]     blockSize number of samples to process per call
46 
47   @par           Scaling and Overflow Behavior
48                    This fast version uses a 32-bit accumulator with 2.30 format.
49                    The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50                    Thus, if the accumulator result overflows it wraps around and distorts the result.
51                    In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
52                    The 2.30 accumulator is then shifted by <code>postShift</code> bits and the result truncated to 1.15 format by discarding the low 16 bits.
53  @remark
54                    Refer to \ref arm_biquad_cascade_df1_q15() for a slower implementation of this function
55                    which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
56                    Use the function \ref arm_biquad_cascade_df1_init_q15() to initialize the filter structure.
57  */
58 
arm_biquad_cascade_df1_fast_q15(const arm_biquad_casd_df1_inst_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)59 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_fast_q15(
60   const arm_biquad_casd_df1_inst_q15 * S,
61   const q15_t * pSrc,
62         q15_t * pDst,
63         uint32_t blockSize)
64 {
65   const q15_t *pIn = pSrc;                             /* Source pointer */
66         q15_t *pOut = pDst;                            /* Destination pointer */
67         q15_t *pState = S->pState;                     /* State pointer */
68   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
69         q31_t acc;                                     /* Accumulator */
70         q31_t in;                                      /* Temporary variable to hold input value */
71         q31_t out;                                     /* Temporary variable to hold output value */
72         q31_t b0;                                      /* Temporary variable to hold bo value */
73         q31_t b1, a1;                                  /* Filter coefficients */
74         q31_t state_in, state_out;                     /* Filter state variables */
75         int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
76         uint32_t sample, stage = S->numStages;         /* Loop counters */
77 
78   do
79   {
80     /* Read the b0 and 0 coefficients using SIMD  */
81     b0 = read_q15x2_ia (&pCoeffs);
82 
83     /* Read the b1 and b2 coefficients using SIMD */
84     b1 = read_q15x2_ia (&pCoeffs);
85 
86     /* Read the a1 and a2 coefficients using SIMD */
87     a1 = read_q15x2_ia (&pCoeffs);
88 
89     /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
90     state_in = read_q15x2_ia (&pState);
91 
92     /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
93     state_out = read_q15x2_da (&pState);
94 
95 #if defined (ARM_MATH_LOOPUNROLL)
96 
97     /* Apply loop unrolling and compute 2 output values simultaneously. */
98     /* Variable acc hold output values that are being computed:
99      *
100      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
101      * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
102      */
103 
104     /* Loop unrolling: Compute 2 outputs at a time */
105     sample = blockSize >> 1U;
106 
107     while (sample > 0U)
108     {
109 
110       /* Read the input */
111       in = read_q15x2_ia (&pIn);
112 
113       /* out =  b0 * x[n] + 0 * 0 */
114       out = __SMUAD(b0, in);
115       /* acc =  b1 * x[n-1] + acc +=  b2 * x[n-2] + out */
116       acc = __SMLAD(b1, state_in, out);
117       /* acc +=  a1 * y[n-1] + acc +=  a2 * y[n-2] */
118       acc = __SMLAD(a1, state_out, acc);
119 
120       /* The result is converted from 3.29 to 1.31 and then saturation is applied */
121       out = __SSAT((acc >> shift), 16);
122 
123       /* Every time after the output is computed state should be updated. */
124       /* The states should be updated as:  */
125       /* Xn2 = Xn1 */
126       /* Xn1 = Xn  */
127       /* Yn2 = Yn1 */
128       /* Yn1 = acc */
129       /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
130       /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
131 
132 #ifndef  ARM_MATH_BIG_ENDIAN
133       state_in  = __PKHBT(in, state_in, 16);
134       state_out = __PKHBT(out, state_out, 16);
135 #else
136       state_in  = __PKHBT(state_in >> 16, (in >> 16), 16);
137       state_out = __PKHBT(state_out >> 16, (out), 16);
138 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
139 
140       /* out =  b0 * x[n] + 0 * 0 */
141       out = __SMUADX(b0, in);
142       /* acc0 =  b1 * x[n-1] , acc0 +=  b2 * x[n-2] + out */
143       acc = __SMLAD(b1, state_in, out);
144       /* acc +=  a1 * y[n-1] + acc +=  a2 * y[n-2] */
145       acc = __SMLAD(a1, state_out, acc);
146 
147       /* The result is converted from 3.29 to 1.31 and then saturation is applied */
148       out = __SSAT((acc >> shift), 16);
149 
150       /* Store the output in the destination buffer. */
151 #ifndef  ARM_MATH_BIG_ENDIAN
152       write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
153 #else
154       write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
155 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
156 
157       /* Every time after the output is computed state should be updated. */
158       /* The states should be updated as:  */
159       /* Xn2 = Xn1 */
160       /* Xn1 = Xn  */
161       /* Yn2 = Yn1 */
162       /* Yn1 = acc */
163       /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
164       /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
165 #ifndef  ARM_MATH_BIG_ENDIAN
166       state_in  = __PKHBT(in >> 16, state_in, 16);
167       state_out = __PKHBT(out, state_out, 16);
168 #else
169       state_in  = __PKHBT(state_in >> 16, in, 16);
170       state_out = __PKHBT(state_out >> 16, out, 16);
171 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
172 
173       /* Decrement loop counter */
174       sample--;
175     }
176 
177     /* Loop unrolling: Compute remaining outputs */
178     sample = (blockSize & 0x1U);
179 
180 #else
181 
182     /* Initialize blkCnt with number of samples */
183     sample = blockSize;
184 
185 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
186 
187     while (sample > 0U)
188     {
189       /* Read the input */
190       in = *pIn++;
191 
192       /* out =  b0 * x[n] + 0 * 0 */
193 #ifndef  ARM_MATH_BIG_ENDIAN
194       out = __SMUAD(b0, in);
195 #else
196       out = __SMUADX(b0, in);
197 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
198 
199       /* acc =  b1 * x[n-1], acc +=  b2 * x[n-2] + out */
200       acc = __SMLAD(b1, state_in, out);
201       /* acc +=  a1 * y[n-1] + acc +=  a2 * y[n-2] */
202       acc = __SMLAD(a1, state_out, acc);
203 
204       /* The result is converted from 3.29 to 1.31 and then saturation is applied */
205       out = __SSAT((acc >> shift), 16);
206 
207       /* Store the output in the destination buffer. */
208       *pOut++ = (q15_t) out;
209 
210       /* Every time after the output is computed state should be updated. */
211       /* The states should be updated as:  */
212       /* Xn2 = Xn1 */
213       /* Xn1 = Xn  */
214       /* Yn2 = Yn1 */
215       /* Yn1 = acc */
216       /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
217       /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
218 #ifndef  ARM_MATH_BIG_ENDIAN
219       state_in = __PKHBT(in, state_in, 16);
220       state_out = __PKHBT(out, state_out, 16);
221 #else
222       state_in = __PKHBT(state_in >> 16, in, 16);
223       state_out = __PKHBT(state_out >> 16, out, 16);
224 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
225 
226       /* decrement loop counter */
227       sample--;
228     }
229 
230     /* The first stage goes from the input buffer to the output buffer. */
231     /* Subsequent (numStages - 1) occur in-place in the output buffer */
232     pIn = pDst;
233 
234     /* Reset the output pointer */
235     pOut = pDst;
236 
237     /* Store the updated state variables back into the state array */
238     write_q15x2_ia(&pState, state_in);
239     write_q15x2_ia(&pState, state_out);
240 
241     /* Decrement loop counter */
242     stage--;
243 
244   } while (stage > 0U);
245 }
246 
247 /**
248   @} end of BiquadCascadeDF1 group
249  */
250