1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_decimate_fast_q15.c
4  * Description:  Fast Q15 FIR Decimator
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_decimate
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 FIR decimator (fast variant).
42   @param[in]     S          points to an instance of the Q15 FIR decimator structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of input samples to process per call
46 
47   @par           Scaling and Overflow Behavior
48                    This fast version uses a 32-bit accumulator with 2.30 format.
49                    The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50                    Thus, if the accumulator result overflows it wraps around and distorts the result.
51                    In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
52                    The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
53   @remark
54                    Refer to \ref arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
55                    Both the slow and the fast versions use the same instance structure.
56                    Use function \ref arm_fir_decimate_init_q15() to initialize the filter structure.
57  */
58 
59 #if defined (ARM_MATH_DSP)
60 
arm_fir_decimate_fast_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)61 ARM_DSP_ATTRIBUTE void arm_fir_decimate_fast_q15(
62   const arm_fir_decimate_instance_q15 * S,
63   const q15_t * pSrc,
64         q15_t * pDst,
65         uint32_t blockSize)
66 {
67         q15_t *pState = S->pState;                     /* State pointer */
68   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
69         q15_t *pStateCur;                              /* Points to the current sample of the state */
70         q15_t *px;                                     /* Temporary pointer for state buffer */
71   const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
72         q31_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
73         q31_t sum0;                                    /* Accumulators */
74         q31_t acc0, acc1;
75         q15_t *px0, *px1;
76         uint32_t blkCntN3;
77         uint32_t numTaps = S->numTaps;                 /* Number of taps */
78         uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
79 
80 #if defined (ARM_MATH_LOOPUNROLL)
81         q31_t c1;                                      /* Temporary variables to hold state and coefficient values */
82 #endif
83 
84   /* S->pState buffer contains previous frame (numTaps - 1) samples */
85   /* pStateCur points to the location where the new input data should be written */
86   pStateCur = S->pState + (numTaps - 1U);
87 
88   /* Total number of output samples to be computed */
89   blkCnt = outBlockSize / 2;
90   blkCntN3 = outBlockSize - (2 * blkCnt);
91 
92   while (blkCnt > 0U)
93   {
94     /* Copy 2 * decimation factor number of new input samples into the state buffer */
95     i = S->M * 2;
96 
97     do
98     {
99       *pStateCur++ = *pSrc++;
100 
101     } while (--i);
102 
103     /* Set accumulator to zero */
104     acc0 = 0;
105     acc1 = 0;
106 
107     /* Initialize state pointer for all the samples */
108     px0 = pState;
109     px1 = pState + S->M;
110 
111     /* Initialize coeff pointer */
112     pb = pCoeffs;
113 
114 #if defined (ARM_MATH_LOOPUNROLL)
115 
116     /* Loop unrolling: Compute 4 taps at a time */
117     tapCnt = numTaps >> 2U;
118 
119     while (tapCnt > 0U)
120     {
121       /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
122       c0 = read_q15x2_ia ((q15_t **) &pb);
123 
124       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
125       x0 = read_q15x2_ia (&px0);
126       x1 = read_q15x2_ia (&px1);
127 
128       /* Perform the multiply-accumulate */
129       acc0 = __SMLAD(x0, c0, acc0);
130       acc1 = __SMLAD(x1, c0, acc1);
131 
132       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
133       c0 = read_q15x2_ia ((q15_t **) &pb);
134 
135       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
136       x0 = read_q15x2_ia (&px0);
137       x1 = read_q15x2_ia (&px1);
138 
139       /* Perform the multiply-accumulate */
140       acc0 = __SMLAD(x0, c0, acc0);
141       acc1 = __SMLAD(x1, c0, acc1);
142 
143       /* Decrement loop counter */
144       tapCnt--;
145     }
146 
147     /* Loop unrolling: Compute remaining taps */
148     tapCnt = numTaps % 0x4U;
149 
150 #else
151 
152     /* Initialize tapCnt with number of taps */
153     tapCnt = numTaps;
154 
155 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
156 
157     while (tapCnt > 0U)
158     {
159       /* Read coefficients */
160       c0 = *pb++;
161 
162       /* Fetch state variables for acc0, acc1 */
163       x0 = *px0++;
164       x1 = *px1++;
165 
166       /* Perform the multiply-accumulate */
167       acc0 = __SMLAD(x0, c0, acc0);
168       acc1 = __SMLAD(x1, c0, acc1);
169 
170       /* Decrement loop counter */
171       tapCnt--;
172     }
173 
174     /* Advance the state pointer by the decimation factor
175      * to process the next group of decimation factor number samples */
176     pState = pState + S->M * 2;
177 
178     /* Store filter output, smlad returns the values in 2.14 format */
179     /* so downsacle by 15 to get output in 1.15 */
180     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
181     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
182 
183     /* Decrement loop counter */
184     blkCnt--;
185   }
186 
187   while (blkCntN3 > 0U)
188   {
189     /* Copy decimation factor number of new input samples into the state buffer */
190     i = S->M;
191 
192     do
193     {
194       *pStateCur++ = *pSrc++;
195 
196     } while (--i);
197 
198     /* Set accumulator to zero */
199     sum0 = 0;
200 
201     /* Initialize state pointer */
202     px = pState;
203 
204     /* Initialize coeff pointer */
205     pb = pCoeffs;
206 
207 #if defined (ARM_MATH_LOOPUNROLL)
208 
209     /* Loop unrolling: Compute 4 taps at a time */
210     tapCnt = numTaps >> 2U;
211 
212     while (tapCnt > 0U)
213     {
214       /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
215       c0 = read_q15x2_ia ((q15_t **) &pb);
216 
217       /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
218       x0 = read_q15x2_ia (&px);
219 
220       /* Read the b[numTaps-3] and b[numTaps-4] coefficients */
221       c1 = read_q15x2_ia ((q15_t **) &pb);
222 
223       /* Perform the multiply-accumulate */
224       sum0 = __SMLAD(x0, c0, sum0);
225 
226       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
227       x0 = read_q15x2_ia (&px);
228 
229       /* Perform the multiply-accumulate */
230       sum0 = __SMLAD(x0, c1, sum0);
231 
232       /* Decrement loop counter */
233       tapCnt--;
234     }
235 
236     /* Loop unrolling: Compute remaining taps */
237     tapCnt = numTaps % 0x4U;
238 
239 #else
240 
241     /* Initialize tapCnt with number of taps */
242     tapCnt = numTaps;
243 
244 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
245 
246     while (tapCnt > 0U)
247     {
248       /* Read coefficients */
249       c0 = *pb++;
250 
251       /* Fetch 1 state variable */
252       x0 = *px++;
253 
254       /* Perform the multiply-accumulate */
255       sum0 = __SMLAD(x0, c0, sum0);
256 
257       /* Decrement loop counter */
258       tapCnt--;
259     }
260 
261     /* Advance the state pointer by the decimation factor
262      * to process the next group of decimation factor number samples */
263     pState = pState + S->M;
264 
265     /* Store filter output, smlad returns the values in 2.14 format */
266     /* so downsacle by 15 to get output in 1.15 */
267     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
268 
269     /* Decrement loop counter */
270     blkCntN3--;
271   }
272 
273   /* Processing is complete.
274      Now copy the last numTaps - 1 samples to the satrt of the state buffer.
275      This prepares the state buffer for the next function call. */
276 
277   /* Points to the start of the state buffer */
278   pStateCur = S->pState;
279 
280   i = (numTaps - 1U) >> 2U;
281 
282   /* copy data */
283   while (i > 0U)
284   {
285     write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
286     write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
287 
288     /* Decrement loop counter */
289     i--;
290   }
291 
292   i = (numTaps - 1U) % 0x04U;
293 
294   /* Copy data */
295   while (i > 0U)
296   {
297     *pStateCur++ = *pState++;
298 
299     /* Decrement loop counter */
300     i--;
301   }
302 
303 }
304 
305 #else /* #if defined (ARM_MATH_DSP) */
306 
arm_fir_decimate_fast_q15(const arm_fir_decimate_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)307 ARM_DSP_ATTRIBUTE void arm_fir_decimate_fast_q15(
308   const arm_fir_decimate_instance_q15 * S,
309   const q15_t * pSrc,
310         q15_t * pDst,
311         uint32_t blockSize)
312 {
313         q15_t *pState = S->pState;                     /* State pointer */
314   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
315         q15_t *pStateCur;                              /* Points to the current sample of the state */
316         q15_t *px;                                     /* Temporary pointer for state buffer */
317   const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
318         q15_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
319         q31_t sum0;                                    /* Accumulators */
320         q31_t acc0, acc1;
321         q15_t *px0, *px1;
322         uint32_t blkCntN3;
323         uint32_t numTaps = S->numTaps;                 /* Number of taps */
324         uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
325 
326 
327   /* S->pState buffer contains previous frame (numTaps - 1) samples */
328   /* pStateCur points to the location where the new input data should be written */
329   pStateCur = S->pState + (numTaps - 1U);
330 
331   /* Total number of output samples to be computed */
332   blkCnt = outBlockSize / 2;
333   blkCntN3 = outBlockSize - (2 * blkCnt);
334 
335   while (blkCnt > 0U)
336   {
337     /* Copy 2 * decimation factor number of new input samples into the state buffer */
338     i = S->M * 2;
339 
340     do
341     {
342       *pStateCur++ = *pSrc++;
343 
344     } while (--i);
345 
346     /* Set accumulator to zero */
347     acc0 = 0;
348     acc1 = 0;
349 
350     /* Initialize state pointer */
351     px0 = pState;
352     px1 = pState + S->M;
353 
354     /* Initialize coeff pointer */
355     pb = pCoeffs;
356 
357 #if defined (ARM_MATH_LOOPUNROLL)
358 
359     /* Loop unrolling: Compute 4 taps at a time */
360     tapCnt = numTaps >> 2U;
361 
362     while (tapCnt > 0U)
363     {
364       /* Read the Read b[numTaps-1] coefficients */
365       c0 = *pb++;
366 
367       /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
368       x0 = *px0++;
369       x1 = *px1++;
370 
371       /* Perform the multiply-accumulate */
372       acc0 += x0 * c0;
373       acc1 += x1 * c0;
374 
375       /* Read the b[numTaps-2] coefficient */
376       c0 = *pb++;
377 
378       /* Read x[n-numTaps-2] for sample 0 and sample 1 */
379       x0 = *px0++;
380       x1 = *px1++;
381 
382       /* Perform the multiply-accumulate */
383       acc0 += x0 * c0;
384       acc1 += x1 * c0;
385 
386       /* Read the b[numTaps-3] coefficients */
387       c0 = *pb++;
388 
389       /* Read x[n-numTaps-3] for sample 0 and sample 1 */
390       x0 = *px0++;
391       x1 = *px1++;
392 
393       /* Perform the multiply-accumulate */
394       acc0 += x0 * c0;
395       acc1 += x1 * c0;
396 
397       /* Read the b[numTaps-4] coefficient */
398       c0 = *pb++;
399 
400       /* Read x[n-numTaps-4] for sample 0 and sample 1 */
401       x0 = *px0++;
402       x1 = *px1++;
403 
404       /* Perform the multiply-accumulate */
405       acc0 += x0 * c0;
406       acc1 += x1 * c0;
407 
408       /* Decrement the loop counter */
409       tapCnt--;
410     }
411 
412     /* Loop unrolling: Compute remaining taps */
413     tapCnt = numTaps % 0x4U;
414 
415 #else
416 
417     /* Initialize tapCnt with number of taps */
418     tapCnt = numTaps;
419 
420 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
421 
422     while (tapCnt > 0U)
423     {
424       /* Read coefficients */
425       c0 = *pb++;
426 
427       /* Fetch 1 state variable */
428       x0 = *px0++;
429       x1 = *px1++;
430 
431       /* Perform the multiply-accumulate */
432       acc0 += x0 * c0;
433       acc1 += x1 * c0;
434 
435       /* Decrement the loop counter */
436       tapCnt--;
437     }
438 
439     /* Advance the state pointer by the decimation factor
440      * to process the next group of decimation factor number samples */
441     pState = pState + S->M * 2;
442 
443     /* Store filter output, smlad returns the values in 2.14 format */
444     /* so downsacle by 15 to get output in 1.15 */
445 
446     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
447     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
448 
449     /* Decrement loop counter */
450     blkCnt--;
451   }
452 
453   while (blkCntN3 > 0U)
454   {
455     /* Copy decimation factor number of new input samples into the state buffer */
456     i = S->M;
457 
458     do
459     {
460       *pStateCur++ = *pSrc++;
461 
462     } while (--i);
463 
464     /* Set accumulator to zero */
465     sum0 = 0;
466 
467     /* Initialize state pointer */
468     px = pState;
469 
470     /* Initialize coeff pointer */
471     pb = pCoeffs;
472 
473 #if defined (ARM_MATH_LOOPUNROLL)
474 
475     /* Loop unrolling: Compute 4 taps at a time */
476     tapCnt = numTaps >> 2U;
477 
478     while (tapCnt > 0U)
479     {
480       /* Read the b[numTaps-1] coefficient */
481       c0 = *pb++;
482 
483       /* Read x[n-numTaps-1] sample */
484       x0 = *px++;
485 
486       /* Perform the multiply-accumulate */
487       sum0 += x0 * c0;
488 
489       /* Read the b[numTaps-2] coefficient */
490       c0 = *pb++;
491 
492       /* Read x[n-numTaps-2] sample */
493       x0 = *px++;
494 
495       /* Perform the multiply-accumulate */
496       sum0 += x0 * c0;
497 
498       /* Read the b[numTaps-3] coefficient */
499       c0 = *pb++;
500 
501       /* Read x[n-numTaps-3] sample */
502       x0 = *px++;
503 
504       /* Perform the multiply-accumulate */
505       sum0 += x0 * c0;
506 
507       /* Read the b[numTaps-4] coefficient */
508       c0 = *pb++;
509 
510       /* Read x[n-numTaps-4] sample */
511       x0 = *px++;
512 
513       /* Perform the multiply-accumulate */
514       sum0 += x0 * c0;
515 
516       /* Decrement loop counter */
517       tapCnt--;
518     }
519 
520     /* Loop unrolling: Compute remaining taps */
521     tapCnt = numTaps % 0x4U;
522 
523 #else
524 
525     /* Initialize tapCnt with number of taps */
526     tapCnt = numTaps;
527 
528 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
529 
530     while (tapCnt > 0U)
531     {
532       /* Read coefficients */
533       c0 = *pb++;
534 
535       /* Fetch 1 state variable */
536       x0 = *px++;
537 
538       /* Perform the multiply-accumulate */
539       sum0 += x0 * c0;
540 
541       /* Decrement the loop counter */
542       tapCnt--;
543     }
544 
545     /* Advance the state pointer by the decimation factor
546      * to process the next group of decimation factor number samples */
547     pState = pState + S->M;
548 
549     /* Store filter output, smlad returns the values in 2.14 format */
550     /* so downsacle by 15 to get output in 1.15 */
551     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
552 
553     /* Decrement loop counter */
554     blkCntN3--;
555   }
556 
557   /* Processing is complete.
558    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
559    ** This prepares the state buffer for the next function call. */
560 
561   /* Points to the start of the state buffer */
562   pStateCur = S->pState;
563 
564   i = (numTaps - 1U) >> 2U;
565 
566   /* copy data */
567   while (i > 0U)
568   {
569     *pStateCur++ = *pState++;
570     *pStateCur++ = *pState++;
571     *pStateCur++ = *pState++;
572     *pStateCur++ = *pState++;
573 
574     /* Decrement loop counter */
575     i--;
576   }
577 
578   i = (numTaps - 1U) % 0x04U;
579 
580   /* copy data */
581   while (i > 0U)
582   {
583     *pStateCur++ = *pState++;
584 
585     /* Decrement loop counter */
586     i--;
587   }
588 }
589 
590 #endif /* #if defined (ARM_MATH_DSP) */
591 
592 /**
593   @} end of FIR_decimate group
594  */
595