1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_fast_q31.c
4 * Description: Processing function for the Q31 Fast FIR filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q31 FIR filter (fast version).
42 @param[in] S points to an instance of the Q31 structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process
46
47 @par Scaling and Overflow Behavior
48 This function is optimized for speed at the expense of fixed-point precision and overflow protection.
49 The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
50 These intermediate results are added to a 2.30 accumulator.
51 Finally, the accumulator is saturated and converted to a 1.31 result.
52 The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
53 In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
54
55 @remark
56 Refer to \ref arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure.
57 Use function \ref arm_fir_init_q31() to initialize the filter structure.
58 */
59
arm_fir_fast_q31(const arm_fir_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)60 void arm_fir_fast_q31(
61 const arm_fir_instance_q31 * S,
62 const q31_t * pSrc,
63 q31_t * pDst,
64 uint32_t blockSize)
65 {
66 q31_t *pState = S->pState; /* State pointer */
67 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
68 q31_t *pStateCurnt; /* Points to the current sample of the state */
69 q31_t *px; /* Temporary pointer for state buffer */
70 const q31_t *pb; /* Temporary pointer for coefficient buffer */
71 q31_t acc0; /* Accumulators */
72 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
73 uint32_t i, tapCnt, blkCnt; /* Loop counters */
74
75 #if defined (ARM_MATH_LOOPUNROLL)
76 q31_t acc1, acc2, acc3; /* Accumulators */
77 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
78 #endif
79
80 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
81 /* pStateCurnt points to the location where the new input data should be written */
82 pStateCurnt = &(S->pState[(numTaps - 1U)]);
83
84 #if defined (ARM_MATH_LOOPUNROLL)
85
86 /* Loop unrolling: Compute 4 output values simultaneously.
87 * The variables acc0 ... acc3 hold output values that are being computed:
88 *
89 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
90 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
91 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
92 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
93 */
94 blkCnt = blockSize >> 2U;
95
96 while (blkCnt > 0U)
97 {
98 /* Copy 4 new input samples into the state buffer. */
99 *pStateCurnt++ = *pSrc++;
100 *pStateCurnt++ = *pSrc++;
101 *pStateCurnt++ = *pSrc++;
102 *pStateCurnt++ = *pSrc++;
103
104 /* Set all accumulators to zero */
105 acc0 = 0;
106 acc1 = 0;
107 acc2 = 0;
108 acc3 = 0;
109
110 /* Initialize state pointer */
111 px = pState;
112
113 /* Initialize coefficient pointer */
114 pb = pCoeffs;
115
116 /* Read the first 3 samples from the state buffer:
117 * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
118 x0 = *px++;
119 x1 = *px++;
120 x2 = *px++;
121
122 /* Loop unrolling. Process 4 taps at a time. */
123 tapCnt = numTaps >> 2U;
124
125 /* Loop over the number of taps. Unroll by a factor of 4.
126 Repeat until we've computed numTaps-4 coefficients. */
127 while (tapCnt > 0U)
128 {
129 /* Read the b[numTaps] coefficient */
130 c0 = *pb;
131
132 /* Read x[n-numTaps-3] sample */
133 x3 = *px;
134
135 /* acc0 += b[numTaps] * x[n-numTaps] */
136 multAcc_32x32_keep32_R(acc0, x0, c0);
137
138 /* acc1 += b[numTaps] * x[n-numTaps-1] */
139 multAcc_32x32_keep32_R(acc1, x1, c0);
140
141 /* acc2 += b[numTaps] * x[n-numTaps-2] */
142 multAcc_32x32_keep32_R(acc2, x2, c0);
143
144 /* acc3 += b[numTaps] * x[n-numTaps-3] */
145 multAcc_32x32_keep32_R(acc3, x3, c0);
146
147 /* Read the b[numTaps-1] coefficient */
148 c0 = *(pb + 1U);
149
150 /* Read x[n-numTaps-4] sample */
151 x0 = *(px + 1U);
152
153 /* Perform the multiply-accumulates */
154 multAcc_32x32_keep32_R(acc0, x1, c0);
155 multAcc_32x32_keep32_R(acc1, x2, c0);
156 multAcc_32x32_keep32_R(acc2, x3, c0);
157 multAcc_32x32_keep32_R(acc3, x0, c0);
158
159 /* Read the b[numTaps-2] coefficient */
160 c0 = *(pb + 2U);
161
162 /* Read x[n-numTaps-5] sample */
163 x1 = *(px + 2U);
164
165 /* Perform the multiply-accumulates */
166 multAcc_32x32_keep32_R(acc0, x2, c0);
167 multAcc_32x32_keep32_R(acc1, x3, c0);
168 multAcc_32x32_keep32_R(acc2, x0, c0);
169 multAcc_32x32_keep32_R(acc3, x1, c0);
170
171 /* Read the b[numTaps-3] coefficients */
172 c0 = *(pb + 3U);
173
174 /* Read x[n-numTaps-6] sample */
175 x2 = *(px + 3U);
176
177 /* Perform the multiply-accumulates */
178 multAcc_32x32_keep32_R(acc0, x3, c0);
179 multAcc_32x32_keep32_R(acc1, x0, c0);
180 multAcc_32x32_keep32_R(acc2, x1, c0);
181 multAcc_32x32_keep32_R(acc3, x2, c0);
182
183 /* update coefficient pointer */
184 pb += 4U;
185 px += 4U;
186
187 /* Decrement loop counter */
188 tapCnt--;
189 }
190
191 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
192 tapCnt = numTaps % 0x4U;
193
194 while (tapCnt > 0U)
195 {
196 /* Read coefficients */
197 c0 = *(pb++);
198
199 /* Fetch 1 state variable */
200 x3 = *(px++);
201
202 /* Perform the multiply-accumulates */
203 multAcc_32x32_keep32_R(acc0, x0, c0);
204 multAcc_32x32_keep32_R(acc1, x1, c0);
205 multAcc_32x32_keep32_R(acc2, x2, c0);
206 multAcc_32x32_keep32_R(acc3, x3, c0);
207
208 /* Reuse the present sample states for next sample */
209 x0 = x1;
210 x1 = x2;
211 x2 = x3;
212
213 /* Decrement loop counter */
214 tapCnt--;
215 }
216
217 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31
218 Then store the 4 outputs in the destination buffer. */
219 *pDst++ = (q31_t) (acc0 << 1);
220 *pDst++ = (q31_t) (acc1 << 1);
221 *pDst++ = (q31_t) (acc2 << 1);
222 *pDst++ = (q31_t) (acc3 << 1);
223
224 /* Advance the state pointer by 4 to process the next group of 4 samples */
225 pState = pState + 4U;
226
227 /* Decrement loop counter */
228 blkCnt--;
229 }
230
231 /* Loop unrolling: Compute remaining output samples */
232 blkCnt = blockSize % 0x4U;
233
234 #else
235
236 /* Initialize blkCnt with number of taps */
237 blkCnt = blockSize;
238
239 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
240
241 while (blkCnt > 0U)
242 {
243 /* Copy one sample at a time into state buffer */
244 *pStateCurnt++ = *pSrc++;
245
246 /* Set the accumulator to zero */
247 acc0 = 0;
248
249 /* Initialize state pointer */
250 px = pState;
251
252 /* Initialize Coefficient pointer */
253 pb = pCoeffs;
254
255 i = numTaps;
256
257 /* Perform the multiply-accumulates */
258 do
259 {
260 multAcc_32x32_keep32_R(acc0, (*px++), (*pb++));
261 i--;
262 } while (i > 0U);
263
264 /* The result is in 2.30 format. Convert to 1.31
265 Then store the output in the destination buffer. */
266 *pDst++ = (q31_t) (acc0 << 1);
267
268 /* Advance state pointer by 1 for the next sample */
269 pState = pState + 1U;
270
271 /* Decrement loop counter */
272 blkCnt--;
273 }
274
275 /* Processing is complete.
276 Now copy the last numTaps - 1 samples to the start of the state buffer.
277 This prepares the state buffer for the next function call. */
278
279 /* Points to the start of the state buffer */
280 pStateCurnt = S->pState;
281
282 #if defined (ARM_MATH_LOOPUNROLL)
283
284 /* Loop unrolling: Compute 4 taps at a time */
285 tapCnt = (numTaps - 1U) >> 2U;
286
287 /* Copy data */
288 while (tapCnt > 0U)
289 {
290 *pStateCurnt++ = *pState++;
291 *pStateCurnt++ = *pState++;
292 *pStateCurnt++ = *pState++;
293 *pStateCurnt++ = *pState++;
294
295 /* Decrement loop counter */
296 tapCnt--;
297 }
298
299 /* Calculate remaining number of copies */
300 tapCnt = (numTaps - 1U) % 0x4U;
301
302 #else
303
304 /* Initialize tapCnt with number of taps */
305 tapCnt = (numTaps - 1U);
306
307 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
308
309 /* Copy remaining data */
310 while (tapCnt > 0U)
311 {
312 *pStateCurnt++ = *pState++;
313
314 /* Decrement the loop counter */
315 tapCnt--;
316 }
317
318 }
319 /**
320 @} end of FIR group
321 */
322