1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_decimate_fast_q15.c
4 * Description: Fast Q15 FIR Decimator
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_decimate
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q15 FIR decimator (fast variant).
42 @param[in] S points to an instance of the Q15 FIR decimator structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of input samples to process per call
46
47 @par Scaling and Overflow Behavior
48 This fast version uses a 32-bit accumulator with 2.30 format.
49 The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50 Thus, if the accumulator result overflows it wraps around and distorts the result.
51 In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
52 The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
53 @remark
54 Refer to \ref arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
55 Both the slow and the fast versions use the same instance structure.
56 Use function \ref arm_fir_decimate_init_q15() to initialize the filter structure.
57 */
58
59 #if defined (ARM_MATH_DSP)
60
arm_fir_decimate_fast_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)61 ARM_DSP_ATTRIBUTE void arm_fir_decimate_fast_q15(
62 const arm_fir_decimate_instance_q15 * S,
63 const q15_t * pSrc,
64 q15_t * pDst,
65 uint32_t blockSize)
66 {
67 q15_t *pState = S->pState; /* State pointer */
68 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
69 q15_t *pStateCur; /* Points to the current sample of the state */
70 q15_t *px; /* Temporary pointer for state buffer */
71 const q15_t *pb; /* Temporary pointer for coefficient buffer */
72 q31_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
73 q31_t sum0; /* Accumulators */
74 q31_t acc0, acc1;
75 q15_t *px0, *px1;
76 uint32_t blkCntN3;
77 uint32_t numTaps = S->numTaps; /* Number of taps */
78 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
79
80 #if defined (ARM_MATH_LOOPUNROLL)
81 q31_t c1; /* Temporary variables to hold state and coefficient values */
82 #endif
83
84 /* S->pState buffer contains previous frame (numTaps - 1) samples */
85 /* pStateCur points to the location where the new input data should be written */
86 pStateCur = S->pState + (numTaps - 1U);
87
88 /* Total number of output samples to be computed */
89 blkCnt = outBlockSize / 2;
90 blkCntN3 = outBlockSize - (2 * blkCnt);
91
92 while (blkCnt > 0U)
93 {
94 /* Copy 2 * decimation factor number of new input samples into the state buffer */
95 i = S->M * 2;
96
97 do
98 {
99 *pStateCur++ = *pSrc++;
100
101 } while (--i);
102
103 /* Set accumulator to zero */
104 acc0 = 0;
105 acc1 = 0;
106
107 /* Initialize state pointer for all the samples */
108 px0 = pState;
109 px1 = pState + S->M;
110
111 /* Initialize coeff pointer */
112 pb = pCoeffs;
113
114 #if defined (ARM_MATH_LOOPUNROLL)
115
116 /* Loop unrolling: Compute 4 taps at a time */
117 tapCnt = numTaps >> 2U;
118
119 while (tapCnt > 0U)
120 {
121 /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
122 c0 = read_q15x2_ia ((q15_t **) &pb);
123
124 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
125 x0 = read_q15x2_ia (&px0);
126 x1 = read_q15x2_ia (&px1);
127
128 /* Perform the multiply-accumulate */
129 acc0 = __SMLAD(x0, c0, acc0);
130 acc1 = __SMLAD(x1, c0, acc1);
131
132 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
133 c0 = read_q15x2_ia ((q15_t **) &pb);
134
135 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
136 x0 = read_q15x2_ia (&px0);
137 x1 = read_q15x2_ia (&px1);
138
139 /* Perform the multiply-accumulate */
140 acc0 = __SMLAD(x0, c0, acc0);
141 acc1 = __SMLAD(x1, c0, acc1);
142
143 /* Decrement loop counter */
144 tapCnt--;
145 }
146
147 /* Loop unrolling: Compute remaining taps */
148 tapCnt = numTaps % 0x4U;
149
150 #else
151
152 /* Initialize tapCnt with number of taps */
153 tapCnt = numTaps;
154
155 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
156
157 while (tapCnt > 0U)
158 {
159 /* Read coefficients */
160 c0 = *pb++;
161
162 /* Fetch state variables for acc0, acc1 */
163 x0 = *px0++;
164 x1 = *px1++;
165
166 /* Perform the multiply-accumulate */
167 acc0 = __SMLAD(x0, c0, acc0);
168 acc1 = __SMLAD(x1, c0, acc1);
169
170 /* Decrement loop counter */
171 tapCnt--;
172 }
173
174 /* Advance the state pointer by the decimation factor
175 * to process the next group of decimation factor number samples */
176 pState = pState + S->M * 2;
177
178 /* Store filter output, smlad returns the values in 2.14 format */
179 /* so downsacle by 15 to get output in 1.15 */
180 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
181 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
182
183 /* Decrement loop counter */
184 blkCnt--;
185 }
186
187 while (blkCntN3 > 0U)
188 {
189 /* Copy decimation factor number of new input samples into the state buffer */
190 i = S->M;
191
192 do
193 {
194 *pStateCur++ = *pSrc++;
195
196 } while (--i);
197
198 /* Set accumulator to zero */
199 sum0 = 0;
200
201 /* Initialize state pointer */
202 px = pState;
203
204 /* Initialize coeff pointer */
205 pb = pCoeffs;
206
207 #if defined (ARM_MATH_LOOPUNROLL)
208
209 /* Loop unrolling: Compute 4 taps at a time */
210 tapCnt = numTaps >> 2U;
211
212 while (tapCnt > 0U)
213 {
214 /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
215 c0 = read_q15x2_ia ((q15_t **) &pb);
216
217 /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
218 x0 = read_q15x2_ia (&px);
219
220 /* Read the b[numTaps-3] and b[numTaps-4] coefficients */
221 c1 = read_q15x2_ia ((q15_t **) &pb);
222
223 /* Perform the multiply-accumulate */
224 sum0 = __SMLAD(x0, c0, sum0);
225
226 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
227 x0 = read_q15x2_ia (&px);
228
229 /* Perform the multiply-accumulate */
230 sum0 = __SMLAD(x0, c1, sum0);
231
232 /* Decrement loop counter */
233 tapCnt--;
234 }
235
236 /* Loop unrolling: Compute remaining taps */
237 tapCnt = numTaps % 0x4U;
238
239 #else
240
241 /* Initialize tapCnt with number of taps */
242 tapCnt = numTaps;
243
244 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
245
246 while (tapCnt > 0U)
247 {
248 /* Read coefficients */
249 c0 = *pb++;
250
251 /* Fetch 1 state variable */
252 x0 = *px++;
253
254 /* Perform the multiply-accumulate */
255 sum0 = __SMLAD(x0, c0, sum0);
256
257 /* Decrement loop counter */
258 tapCnt--;
259 }
260
261 /* Advance the state pointer by the decimation factor
262 * to process the next group of decimation factor number samples */
263 pState = pState + S->M;
264
265 /* Store filter output, smlad returns the values in 2.14 format */
266 /* so downsacle by 15 to get output in 1.15 */
267 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
268
269 /* Decrement loop counter */
270 blkCntN3--;
271 }
272
273 /* Processing is complete.
274 Now copy the last numTaps - 1 samples to the satrt of the state buffer.
275 This prepares the state buffer for the next function call. */
276
277 /* Points to the start of the state buffer */
278 pStateCur = S->pState;
279
280 i = (numTaps - 1U) >> 2U;
281
282 /* copy data */
283 while (i > 0U)
284 {
285 write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
286 write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
287
288 /* Decrement loop counter */
289 i--;
290 }
291
292 i = (numTaps - 1U) % 0x04U;
293
294 /* Copy data */
295 while (i > 0U)
296 {
297 *pStateCur++ = *pState++;
298
299 /* Decrement loop counter */
300 i--;
301 }
302
303 }
304
305 #else /* #if defined (ARM_MATH_DSP) */
306
arm_fir_decimate_fast_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)307 ARM_DSP_ATTRIBUTE void arm_fir_decimate_fast_q15(
308 const arm_fir_decimate_instance_q15 * S,
309 const q15_t * pSrc,
310 q15_t * pDst,
311 uint32_t blockSize)
312 {
313 q15_t *pState = S->pState; /* State pointer */
314 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
315 q15_t *pStateCur; /* Points to the current sample of the state */
316 q15_t *px; /* Temporary pointer for state buffer */
317 const q15_t *pb; /* Temporary pointer for coefficient buffer */
318 q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
319 q31_t sum0; /* Accumulators */
320 q31_t acc0, acc1;
321 q15_t *px0, *px1;
322 uint32_t blkCntN3;
323 uint32_t numTaps = S->numTaps; /* Number of taps */
324 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
325
326
327 /* S->pState buffer contains previous frame (numTaps - 1) samples */
328 /* pStateCur points to the location where the new input data should be written */
329 pStateCur = S->pState + (numTaps - 1U);
330
331 /* Total number of output samples to be computed */
332 blkCnt = outBlockSize / 2;
333 blkCntN3 = outBlockSize - (2 * blkCnt);
334
335 while (blkCnt > 0U)
336 {
337 /* Copy 2 * decimation factor number of new input samples into the state buffer */
338 i = S->M * 2;
339
340 do
341 {
342 *pStateCur++ = *pSrc++;
343
344 } while (--i);
345
346 /* Set accumulator to zero */
347 acc0 = 0;
348 acc1 = 0;
349
350 /* Initialize state pointer */
351 px0 = pState;
352 px1 = pState + S->M;
353
354 /* Initialize coeff pointer */
355 pb = pCoeffs;
356
357 #if defined (ARM_MATH_LOOPUNROLL)
358
359 /* Loop unrolling: Compute 4 taps at a time */
360 tapCnt = numTaps >> 2U;
361
362 while (tapCnt > 0U)
363 {
364 /* Read the Read b[numTaps-1] coefficients */
365 c0 = *pb++;
366
367 /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
368 x0 = *px0++;
369 x1 = *px1++;
370
371 /* Perform the multiply-accumulate */
372 acc0 += x0 * c0;
373 acc1 += x1 * c0;
374
375 /* Read the b[numTaps-2] coefficient */
376 c0 = *pb++;
377
378 /* Read x[n-numTaps-2] for sample 0 and sample 1 */
379 x0 = *px0++;
380 x1 = *px1++;
381
382 /* Perform the multiply-accumulate */
383 acc0 += x0 * c0;
384 acc1 += x1 * c0;
385
386 /* Read the b[numTaps-3] coefficients */
387 c0 = *pb++;
388
389 /* Read x[n-numTaps-3] for sample 0 and sample 1 */
390 x0 = *px0++;
391 x1 = *px1++;
392
393 /* Perform the multiply-accumulate */
394 acc0 += x0 * c0;
395 acc1 += x1 * c0;
396
397 /* Read the b[numTaps-4] coefficient */
398 c0 = *pb++;
399
400 /* Read x[n-numTaps-4] for sample 0 and sample 1 */
401 x0 = *px0++;
402 x1 = *px1++;
403
404 /* Perform the multiply-accumulate */
405 acc0 += x0 * c0;
406 acc1 += x1 * c0;
407
408 /* Decrement the loop counter */
409 tapCnt--;
410 }
411
412 /* Loop unrolling: Compute remaining taps */
413 tapCnt = numTaps % 0x4U;
414
415 #else
416
417 /* Initialize tapCnt with number of taps */
418 tapCnt = numTaps;
419
420 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
421
422 while (tapCnt > 0U)
423 {
424 /* Read coefficients */
425 c0 = *pb++;
426
427 /* Fetch 1 state variable */
428 x0 = *px0++;
429 x1 = *px1++;
430
431 /* Perform the multiply-accumulate */
432 acc0 += x0 * c0;
433 acc1 += x1 * c0;
434
435 /* Decrement the loop counter */
436 tapCnt--;
437 }
438
439 /* Advance the state pointer by the decimation factor
440 * to process the next group of decimation factor number samples */
441 pState = pState + S->M * 2;
442
443 /* Store filter output, smlad returns the values in 2.14 format */
444 /* so downsacle by 15 to get output in 1.15 */
445
446 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
447 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
448
449 /* Decrement loop counter */
450 blkCnt--;
451 }
452
453 while (blkCntN3 > 0U)
454 {
455 /* Copy decimation factor number of new input samples into the state buffer */
456 i = S->M;
457
458 do
459 {
460 *pStateCur++ = *pSrc++;
461
462 } while (--i);
463
464 /* Set accumulator to zero */
465 sum0 = 0;
466
467 /* Initialize state pointer */
468 px = pState;
469
470 /* Initialize coeff pointer */
471 pb = pCoeffs;
472
473 #if defined (ARM_MATH_LOOPUNROLL)
474
475 /* Loop unrolling: Compute 4 taps at a time */
476 tapCnt = numTaps >> 2U;
477
478 while (tapCnt > 0U)
479 {
480 /* Read the b[numTaps-1] coefficient */
481 c0 = *pb++;
482
483 /* Read x[n-numTaps-1] sample */
484 x0 = *px++;
485
486 /* Perform the multiply-accumulate */
487 sum0 += x0 * c0;
488
489 /* Read the b[numTaps-2] coefficient */
490 c0 = *pb++;
491
492 /* Read x[n-numTaps-2] sample */
493 x0 = *px++;
494
495 /* Perform the multiply-accumulate */
496 sum0 += x0 * c0;
497
498 /* Read the b[numTaps-3] coefficient */
499 c0 = *pb++;
500
501 /* Read x[n-numTaps-3] sample */
502 x0 = *px++;
503
504 /* Perform the multiply-accumulate */
505 sum0 += x0 * c0;
506
507 /* Read the b[numTaps-4] coefficient */
508 c0 = *pb++;
509
510 /* Read x[n-numTaps-4] sample */
511 x0 = *px++;
512
513 /* Perform the multiply-accumulate */
514 sum0 += x0 * c0;
515
516 /* Decrement loop counter */
517 tapCnt--;
518 }
519
520 /* Loop unrolling: Compute remaining taps */
521 tapCnt = numTaps % 0x4U;
522
523 #else
524
525 /* Initialize tapCnt with number of taps */
526 tapCnt = numTaps;
527
528 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
529
530 while (tapCnt > 0U)
531 {
532 /* Read coefficients */
533 c0 = *pb++;
534
535 /* Fetch 1 state variable */
536 x0 = *px++;
537
538 /* Perform the multiply-accumulate */
539 sum0 += x0 * c0;
540
541 /* Decrement the loop counter */
542 tapCnt--;
543 }
544
545 /* Advance the state pointer by the decimation factor
546 * to process the next group of decimation factor number samples */
547 pState = pState + S->M;
548
549 /* Store filter output, smlad returns the values in 2.14 format */
550 /* so downsacle by 15 to get output in 1.15 */
551 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
552
553 /* Decrement loop counter */
554 blkCntN3--;
555 }
556
557 /* Processing is complete.
558 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
559 ** This prepares the state buffer for the next function call. */
560
561 /* Points to the start of the state buffer */
562 pStateCur = S->pState;
563
564 i = (numTaps - 1U) >> 2U;
565
566 /* copy data */
567 while (i > 0U)
568 {
569 *pStateCur++ = *pState++;
570 *pStateCur++ = *pState++;
571 *pStateCur++ = *pState++;
572 *pStateCur++ = *pState++;
573
574 /* Decrement loop counter */
575 i--;
576 }
577
578 i = (numTaps - 1U) % 0x04U;
579
580 /* copy data */
581 while (i > 0U)
582 {
583 *pStateCur++ = *pState++;
584
585 /* Decrement loop counter */
586 i--;
587 }
588 }
589
590 #endif /* #if defined (ARM_MATH_DSP) */
591
592 /**
593 @} end of FIR_decimate group
594 */
595