1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_decimate_fast_q31.c
4 * Description: Fast Q31 FIR Decimator
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_decimate
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q31 FIR decimator (fast variant).
42 @param[in] S points to an instance of the Q31 FIR decimator structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process
46
47 @par Scaling and Overflow Behavior
48 This function is optimized for speed at the expense of fixed-point precision and overflow protection.
49 The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
50 These intermediate results are added to a 2.30 accumulator.
51 Finally, the accumulator is saturated and converted to a 1.31 result.
52 The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
53 In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
54
55 @remark
56 Refer to \ref arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
57 Both the slow and the fast versions use the same instance structure.
58 Use function \ref arm_fir_decimate_init_q31() to initialize the filter structure.
59 */
60
arm_fir_decimate_fast_q31(const arm_fir_decimate_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)61 void arm_fir_decimate_fast_q31(
62 const arm_fir_decimate_instance_q31 * S,
63 const q31_t * pSrc,
64 q31_t * pDst,
65 uint32_t blockSize)
66 {
67 q31_t *pState = S->pState; /* State pointer */
68 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
69 q31_t *pStateCur; /* Points to the current sample of the state */
70 q31_t *px0; /* Temporary pointer for state buffer */
71 const q31_t *pb; /* Temporary pointer for coefficient buffer */
72 q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
73 q63_t acc0; /* Accumulator */
74 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
75 uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
76
77 #if defined (ARM_MATH_LOOPUNROLL)
78 q31_t *px1, *px2, *px3;
79 q31_t x1, x2, x3;
80 q63_t acc1, acc2, acc3;
81 #endif
82
83 /* S->pState buffer contains previous frame (numTaps - 1) samples */
84 /* pStateCur points to the location where the new input data should be written */
85 pStateCur = S->pState + (numTaps - 1U);
86
87 #if defined (ARM_MATH_LOOPUNROLL)
88
89 /* Loop unrolling: Compute 4 samples at a time */
90 blkCnt = outBlockSize >> 2U;
91
92 /* Samples loop unrolled by 4 */
93 while (blkCnt > 0U)
94 {
95 /* Copy 4 * decimation factor number of new input samples into the state buffer */
96 i = S->M * 4;
97
98 do
99 {
100 *pStateCur++ = *pSrc++;
101
102 } while (--i);
103
104 /* Set accumulators to zero */
105 acc0 = 0;
106 acc1 = 0;
107 acc2 = 0;
108 acc3 = 0;
109
110 /* Initialize state pointer for all the samples */
111 px0 = pState;
112 px1 = pState + S->M;
113 px2 = pState + 2 * S->M;
114 px3 = pState + 3 * S->M;
115
116 /* Initialize coeff pointer */
117 pb = pCoeffs;
118
119 /* Loop unrolling: Compute 4 taps at a time */
120 tapCnt = numTaps >> 2U;
121
122 while (tapCnt > 0U)
123 {
124 /* Read the b[numTaps-1] coefficient */
125 c0 = *(pb++);
126
127 /* Read x[n-numTaps-1] sample for acc0 */
128 x0 = *(px0++);
129 /* Read x[n-numTaps-1] sample for acc1 */
130 x1 = *(px1++);
131 /* Read x[n-numTaps-1] sample for acc2 */
132 x2 = *(px2++);
133 /* Read x[n-numTaps-1] sample for acc3 */
134 x3 = *(px3++);
135
136 /* Perform the multiply-accumulate */
137 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
138 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
139 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
140 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
141
142 /* Read the b[numTaps-2] coefficient */
143 c0 = *(pb++);
144
145 /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
146 x0 = *(px0++);
147 x1 = *(px1++);
148 x2 = *(px2++);
149 x3 = *(px3++);
150
151 /* Perform the multiply-accumulate */
152 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
153 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
154 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
155 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
156
157 /* Read the b[numTaps-3] coefficient */
158 c0 = *(pb++);
159
160 /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
161 x0 = *(px0++);
162 x1 = *(px1++);
163 x2 = *(px2++);
164 x3 = *(px3++);
165
166 /* Perform the multiply-accumulate */
167 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
168 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
169 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
170 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
171
172 /* Read the b[numTaps-4] coefficient */
173 c0 = *(pb++);
174
175 /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
176 x0 = *(px0++);
177 x1 = *(px1++);
178 x2 = *(px2++);
179 x3 = *(px3++);
180
181 /* Perform the multiply-accumulate */
182 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
183 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
184 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
185 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
186
187 /* Decrement loop counter */
188 tapCnt--;
189 }
190
191 /* Loop unrolling: Compute remaining taps */
192 tapCnt = numTaps % 0x4U;
193
194 while (tapCnt > 0U)
195 {
196 /* Read coefficients */
197 c0 = *(pb++);
198
199 /* Fetch state variables for acc0, acc1, acc2, acc3 */
200 x0 = *(px0++);
201 x1 = *(px1++);
202 x2 = *(px2++);
203 x3 = *(px3++);
204
205 /* Perform the multiply-accumulate */
206 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
207 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
208 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
209 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
210
211 /* Decrement loop counter */
212 tapCnt--;
213 }
214
215 /* Advance the state pointer by the decimation factor
216 * to process the next group of decimation factor number samples */
217 pState = pState + S->M * 4;
218
219 /* The result is in the accumulator, store in the destination buffer. */
220 *pDst++ = (q31_t) (acc0 << 1);
221 *pDst++ = (q31_t) (acc1 << 1);
222 *pDst++ = (q31_t) (acc2 << 1);
223 *pDst++ = (q31_t) (acc3 << 1);
224
225 /* Decrement loop counter */
226 blkCnt--;
227 }
228
229 /* Loop unrolling: Compute remaining samples */
230 blkCnt = outBlockSize % 0x4U;
231
232 #else
233
234 /* Initialize blkCnt with number of samples */
235 blkCnt = outBlockSize;
236
237 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
238
239 while (blkCnt > 0U)
240 {
241 /* Copy decimation factor number of new input samples into the state buffer */
242 i = S->M;
243
244 do
245 {
246 *pStateCur++ = *pSrc++;
247
248 } while (--i);
249
250 /* Set accumulator to zero */
251 acc0 = 0;
252
253 /* Initialize state pointer */
254 px0 = pState;
255
256 /* Initialize coeff pointer */
257 pb = pCoeffs;
258
259 #if defined (ARM_MATH_LOOPUNROLL)
260
261 /* Loop unrolling: Compute 4 taps at a time */
262 tapCnt = numTaps >> 2U;
263
264 while (tapCnt > 0U)
265 {
266 /* Read the b[numTaps-1] coefficient */
267 c0 = *pb++;
268
269 /* Read x[n-numTaps-1] sample */
270 x0 = *px0++;
271
272 /* Perform the multiply-accumulate */
273 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
274
275 /* Read the b[numTaps-2] coefficient */
276 c0 = *pb++;
277
278 /* Read x[n-numTaps-2] sample */
279 x0 = *px0++;
280
281 /* Perform the multiply-accumulate */
282 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
283
284 /* Read the b[numTaps-3] coefficient */
285 c0 = *pb++;
286
287 /* Read x[n-numTaps-3] sample */
288 x0 = *px0++;
289
290 /* Perform the multiply-accumulate */
291 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
292
293 /* Read the b[numTaps-4] coefficient */
294 c0 = *pb++;
295
296 /* Read x[n-numTaps-4] sample */
297 x0 = *px0++;
298
299 /* Perform the multiply-accumulate */
300 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
301
302 /* Decrement loop counter */
303 tapCnt--;
304 }
305
306 /* Loop unrolling: Compute remaining taps */
307 tapCnt = numTaps % 0x4U;
308
309 #else
310
311 /* Initialize tapCnt with number of taps */
312 tapCnt = numTaps;
313
314 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
315
316 while (tapCnt > 0U)
317 {
318 /* Read coefficients */
319 c0 = *pb++;
320
321 /* Fetch 1 state variable */
322 x0 = *px0++;
323
324 /* Perform the multiply-accumulate */
325 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
326
327 /* Decrement loop counter */
328 tapCnt--;
329 }
330
331 /* Advance the state pointer by the decimation factor
332 * to process the next group of decimation factor number samples */
333 pState = pState + S->M;
334
335 /* The result is in the accumulator, store in the destination buffer. */
336 *pDst++ = (q31_t) (acc0 << 1);
337
338 /* Decrement loop counter */
339 blkCnt--;
340 }
341
342 /* Processing is complete.
343 Now copy the last numTaps - 1 samples to the satrt of the state buffer.
344 This prepares the state buffer for the next function call. */
345
346 /* Points to the start of the state buffer */
347 pStateCur = S->pState;
348
349 #if defined (ARM_MATH_LOOPUNROLL)
350
351 /* Loop unrolling: Compute 4 taps at a time */
352 tapCnt = (numTaps - 1U) >> 2U;
353
354 /* Copy data */
355 while (tapCnt > 0U)
356 {
357 *pStateCur++ = *pState++;
358 *pStateCur++ = *pState++;
359 *pStateCur++ = *pState++;
360 *pStateCur++ = *pState++;
361
362 /* Decrement loop counter */
363 tapCnt--;
364 }
365
366 /* Loop unrolling: Compute remaining taps */
367 tapCnt = (numTaps - 1U) % 0x04U;
368
369 #else
370
371 /* Initialize tapCnt with number of taps */
372 tapCnt = (numTaps - 1U);
373
374 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
375
376 /* Copy data */
377 while (tapCnt > 0U)
378 {
379 *pStateCur++ = *pState++;
380
381 /* Decrement loop counter */
382 tapCnt--;
383 }
384
385 }
386
387 /**
388 @} end of FIR_decimate group
389 */
390