1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_biquad_cascade_df1_fast_q15.c
4 * Description: Fast processing function for the Q15 Biquad cascade filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup BiquadCascadeDF1
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q15 Biquad cascade filter (fast variant).
42 @param[in] S points to an instance of the Q15 Biquad cascade structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process per call
46
47 @par Scaling and Overflow Behavior
48 This fast version uses a 32-bit accumulator with 2.30 format.
49 The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50 Thus, if the accumulator result overflows it wraps around and distorts the result.
51 In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
52 The 2.30 accumulator is then shifted by <code>postShift</code> bits and the result truncated to 1.15 format by discarding the low 16 bits.
53 @remark
54 Refer to \ref arm_biquad_cascade_df1_q15() for a slower implementation of this function
55 which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
56 Use the function \ref arm_biquad_cascade_df1_init_q15() to initialize the filter structure.
57 */
58
arm_biquad_cascade_df1_fast_q15(const arm_biquad_casd_df1_inst_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)59 ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df1_fast_q15(
60 const arm_biquad_casd_df1_inst_q15 * S,
61 const q15_t * pSrc,
62 q15_t * pDst,
63 uint32_t blockSize)
64 {
65 const q15_t *pIn = pSrc; /* Source pointer */
66 q15_t *pOut = pDst; /* Destination pointer */
67 q15_t *pState = S->pState; /* State pointer */
68 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
69 q31_t acc; /* Accumulator */
70 q31_t in; /* Temporary variable to hold input value */
71 q31_t out; /* Temporary variable to hold output value */
72 q31_t b0; /* Temporary variable to hold bo value */
73 q31_t b1, a1; /* Filter coefficients */
74 q31_t state_in, state_out; /* Filter state variables */
75 int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
76 uint32_t sample, stage = S->numStages; /* Loop counters */
77
78 do
79 {
80 /* Read the b0 and 0 coefficients using SIMD */
81 b0 = read_q15x2_ia (&pCoeffs);
82
83 /* Read the b1 and b2 coefficients using SIMD */
84 b1 = read_q15x2_ia (&pCoeffs);
85
86 /* Read the a1 and a2 coefficients using SIMD */
87 a1 = read_q15x2_ia (&pCoeffs);
88
89 /* Read the input state values from the state buffer: x[n-1], x[n-2] */
90 state_in = read_q15x2_ia (&pState);
91
92 /* Read the output state values from the state buffer: y[n-1], y[n-2] */
93 state_out = read_q15x2_da (&pState);
94
95 #if defined (ARM_MATH_LOOPUNROLL)
96
97 /* Apply loop unrolling and compute 2 output values simultaneously. */
98 /* Variable acc hold output values that are being computed:
99 *
100 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
101 * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
102 */
103
104 /* Loop unrolling: Compute 2 outputs at a time */
105 sample = blockSize >> 1U;
106
107 while (sample > 0U)
108 {
109
110 /* Read the input */
111 in = read_q15x2_ia (&pIn);
112
113 /* out = b0 * x[n] + 0 * 0 */
114 out = __SMUAD(b0, in);
115 /* acc = b1 * x[n-1] + acc += b2 * x[n-2] + out */
116 acc = __SMLAD(b1, state_in, out);
117 /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */
118 acc = __SMLAD(a1, state_out, acc);
119
120 /* The result is converted from 3.29 to 1.31 and then saturation is applied */
121 out = __SSAT((acc >> shift), 16);
122
123 /* Every time after the output is computed state should be updated. */
124 /* The states should be updated as: */
125 /* Xn2 = Xn1 */
126 /* Xn1 = Xn */
127 /* Yn2 = Yn1 */
128 /* Yn1 = acc */
129 /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
130 /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
131
132 #ifndef ARM_MATH_BIG_ENDIAN
133 state_in = __PKHBT(in, state_in, 16);
134 state_out = __PKHBT(out, state_out, 16);
135 #else
136 state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
137 state_out = __PKHBT(state_out >> 16, (out), 16);
138 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
139
140 /* out = b0 * x[n] + 0 * 0 */
141 out = __SMUADX(b0, in);
142 /* acc0 = b1 * x[n-1] , acc0 += b2 * x[n-2] + out */
143 acc = __SMLAD(b1, state_in, out);
144 /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */
145 acc = __SMLAD(a1, state_out, acc);
146
147 /* The result is converted from 3.29 to 1.31 and then saturation is applied */
148 out = __SSAT((acc >> shift), 16);
149
150 /* Store the output in the destination buffer. */
151 #ifndef ARM_MATH_BIG_ENDIAN
152 write_q15x2_ia (&pOut, __PKHBT(state_out, out, 16));
153 #else
154 write_q15x2_ia (&pOut, __PKHBT(out, state_out >> 16, 16));
155 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
156
157 /* Every time after the output is computed state should be updated. */
158 /* The states should be updated as: */
159 /* Xn2 = Xn1 */
160 /* Xn1 = Xn */
161 /* Yn2 = Yn1 */
162 /* Yn1 = acc */
163 /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
164 /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
165 #ifndef ARM_MATH_BIG_ENDIAN
166 state_in = __PKHBT(in >> 16, state_in, 16);
167 state_out = __PKHBT(out, state_out, 16);
168 #else
169 state_in = __PKHBT(state_in >> 16, in, 16);
170 state_out = __PKHBT(state_out >> 16, out, 16);
171 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
172
173 /* Decrement loop counter */
174 sample--;
175 }
176
177 /* Loop unrolling: Compute remaining outputs */
178 sample = (blockSize & 0x1U);
179
180 #else
181
182 /* Initialize blkCnt with number of samples */
183 sample = blockSize;
184
185 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
186
187 while (sample > 0U)
188 {
189 /* Read the input */
190 in = *pIn++;
191
192 /* out = b0 * x[n] + 0 * 0 */
193 #ifndef ARM_MATH_BIG_ENDIAN
194 out = __SMUAD(b0, in);
195 #else
196 out = __SMUADX(b0, in);
197 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
198
199 /* acc = b1 * x[n-1], acc += b2 * x[n-2] + out */
200 acc = __SMLAD(b1, state_in, out);
201 /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */
202 acc = __SMLAD(a1, state_out, acc);
203
204 /* The result is converted from 3.29 to 1.31 and then saturation is applied */
205 out = __SSAT((acc >> shift), 16);
206
207 /* Store the output in the destination buffer. */
208 *pOut++ = (q15_t) out;
209
210 /* Every time after the output is computed state should be updated. */
211 /* The states should be updated as: */
212 /* Xn2 = Xn1 */
213 /* Xn1 = Xn */
214 /* Yn2 = Yn1 */
215 /* Yn1 = acc */
216 /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
217 /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
218 #ifndef ARM_MATH_BIG_ENDIAN
219 state_in = __PKHBT(in, state_in, 16);
220 state_out = __PKHBT(out, state_out, 16);
221 #else
222 state_in = __PKHBT(state_in >> 16, in, 16);
223 state_out = __PKHBT(state_out >> 16, out, 16);
224 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
225
226 /* decrement loop counter */
227 sample--;
228 }
229
230 /* The first stage goes from the input buffer to the output buffer. */
231 /* Subsequent (numStages - 1) occur in-place in the output buffer */
232 pIn = pDst;
233
234 /* Reset the output pointer */
235 pOut = pDst;
236
237 /* Store the updated state variables back into the state array */
238 write_q15x2_ia(&pState, state_in);
239 write_q15x2_ia(&pState, state_out);
240
241 /* Decrement loop counter */
242 stage--;
243
244 } while (stage > 0U);
245 }
246
247 /**
248 @} end of BiquadCascadeDF1 group
249 */
250