1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_sparse_q31.c
4  * Description:  Q31 sparse FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_Sparse
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q31 sparse FIR filter.
42   @param[in]     S           points to an instance of the Q31 sparse FIR structure
43   @param[in]     pSrc        points to the block of input data
44   @param[out]    pDst        points to the block of output data
45   @param[in]     pScratchIn  points to a temporary buffer of size blockSize
46   @param[in]     blockSize   number of input samples to process
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using an internal 32-bit accumulator.
50                    The 1.31 x 1.31 multiplications are truncated to 2.30 format.
51                    This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
52                    If the accumulator result overflows, it wraps around rather than saturate.
53                    In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
54  */
55 
arm_fir_sparse_q31(arm_fir_sparse_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,q31_t * pScratchIn,uint32_t blockSize)56 void arm_fir_sparse_q31(
57         arm_fir_sparse_instance_q31 * S,
58   const q31_t * pSrc,
59         q31_t * pDst,
60         q31_t * pScratchIn,
61         uint32_t blockSize)
62 {
63         q31_t *pState = S->pState;                     /* State pointer */
64   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
65         q31_t *px;                                     /* Scratch buffer pointer */
66         q31_t *py = pState;                            /* Temporary pointers for state buffer */
67         q31_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */
68         q31_t *pOut;                                   /* Destination pointer */
69         int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
70         uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
71         uint16_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter  */
72         int32_t readIndex;                             /* Read index of the state buffer */
73         uint32_t tapCnt, blkCnt;                       /* loop counters */
74         q31_t coeff = *pCoeffs++;                      /* Read the first coefficient value */
75         q31_t in;
76         q63_t out;                                     /* Temporary output variable */
77 
78 
79   /* BlockSize of Input samples are copied into the state buffer */
80   /* StateIndex points to the starting position to write in the state buffer */
81   arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
82                         (int32_t *) pSrc, 1, blockSize);
83 
84   /* Read Index, from where the state buffer should be read, is calculated. */
85   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
86 
87   /* Wraparound of readIndex */
88   if (readIndex < 0)
89   {
90     readIndex += (int32_t) delaySize;
91   }
92 
93   /* Working pointer for state buffer is updated */
94   py = pState;
95 
96   /* blockSize samples are read from the state buffer */
97   arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
98                        (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
99 
100   /* Working pointer for the scratch buffer of state values */
101   px = pb;
102 
103   /* Working pointer for scratch buffer of output values */
104   pOut = pDst;
105 
106 
107 #if defined (ARM_MATH_LOOPUNROLL)
108 
109   /* Loop unrolling: Compute 4 outputs at a time. */
110   blkCnt = blockSize >> 2U;
111 
112   while (blkCnt > 0U)
113   {
114     /* Perform Multiplications and store in destination buffer */
115     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
116 
117     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
118 
119     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
120 
121     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
122 
123     /* Decrement loop counter */
124     blkCnt--;
125   }
126 
127   /* Loop unrolling: Compute remaining outputs */
128   blkCnt = blockSize % 0x4U;
129 
130 #else
131 
132   /* Initialize blkCnt with number of samples */
133   blkCnt = blockSize;
134 
135 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
136 
137   while (blkCnt > 0U)
138   {
139     /* Perform Multiplication and store in destination buffer */
140     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
141 
142     /* Decrement loop counter */
143     blkCnt--;
144   }
145 
146   /* Load the coefficient value and
147    * increment the coefficient buffer for the next set of state values */
148   coeff = *pCoeffs++;
149 
150   /* Read Index, from where the state buffer should be read, is calculated. */
151   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
152 
153   /* Wraparound of readIndex */
154   if (readIndex < 0)
155   {
156     readIndex += (int32_t) delaySize;
157   }
158 
159   /* Loop over the number of taps. */
160   tapCnt = (uint32_t) numTaps - 2U;
161 
162   while (tapCnt > 0U)
163   {
164     /* Working pointer for state buffer is updated */
165     py = pState;
166 
167     /* blockSize samples are read from the state buffer */
168     arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
169                          (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
170 
171     /* Working pointer for the scratch buffer of state values */
172     px = pb;
173 
174     /* Working pointer for scratch buffer of output values */
175     pOut = pDst;
176 
177 
178 #if defined (ARM_MATH_LOOPUNROLL)
179 
180     /* Loop unrolling: Compute 4 outputs at a time. */
181     blkCnt = blockSize >> 2U;
182 
183     while (blkCnt > 0U)
184     {
185       /* Perform Multiply-Accumulate */
186       out = *pOut;
187       out += ((q63_t) *px++ * coeff) >> 32;
188       *pOut++ = (q31_t) (out);
189 
190       out = *pOut;
191       out += ((q63_t) *px++ * coeff) >> 32;
192       *pOut++ = (q31_t) (out);
193 
194       out = *pOut;
195       out += ((q63_t) *px++ * coeff) >> 32;
196       *pOut++ = (q31_t) (out);
197 
198       out = *pOut;
199       out += ((q63_t) *px++ * coeff) >> 32;
200       *pOut++ = (q31_t) (out);
201 
202       /* Decrement loop counter */
203       blkCnt--;
204     }
205 
206     /* Loop unrolling: Compute remaining outputs */
207     blkCnt = blockSize % 0x4U;
208 
209 #else
210 
211     /* Initialize blkCnt with number of samples */
212     blkCnt = blockSize;
213 
214 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
215 
216     while (blkCnt > 0U)
217     {
218       /* Perform Multiply-Accumulate */
219       out = *pOut;
220       out += ((q63_t) *px++ * coeff) >> 32;
221       *pOut++ = (q31_t) (out);
222 
223       /* Decrement loop counter */
224       blkCnt--;
225     }
226 
227     /* Load the coefficient value and
228      * increment the coefficient buffer for the next set of state values */
229     coeff = *pCoeffs++;
230 
231     /* Read Index, from where the state buffer should be read, is calculated. */
232     readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
233 
234     /* Wraparound of readIndex */
235     if (readIndex < 0)
236     {
237       readIndex += (int32_t) delaySize;
238     }
239 
240     /* Decrement tap loop counter */
241     tapCnt--;
242   }
243 
244   /* Compute last tap without the final read of pTapDelay */
245 
246   /* Working pointer for state buffer is updated */
247   py = pState;
248 
249   /* blockSize samples are read from the state buffer */
250   arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
251                        (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
252 
253   /* Working pointer for the scratch buffer of state values */
254   px = pb;
255 
256   /* Working pointer for scratch buffer of output values */
257   pOut = pDst;
258 
259 
260 #if defined (ARM_MATH_LOOPUNROLL)
261 
262   /* Loop unrolling: Compute 4 outputs at a time. */
263   blkCnt = blockSize >> 2U;
264 
265   while (blkCnt > 0U)
266   {
267     /* Perform Multiply-Accumulate */
268     out = *pOut;
269     out += ((q63_t) * px++ * coeff) >> 32;
270     *pOut++ = (q31_t) (out);
271 
272     out = *pOut;
273     out += ((q63_t) * px++ * coeff) >> 32;
274     *pOut++ = (q31_t) (out);
275 
276     out = *pOut;
277     out += ((q63_t) * px++ * coeff) >> 32;
278     *pOut++ = (q31_t) (out);
279 
280     out = *pOut;
281     out += ((q63_t) * px++ * coeff) >> 32;
282     *pOut++ = (q31_t) (out);
283 
284     /* Decrement loop counter */
285     blkCnt--;
286   }
287 
288   /* Loop unrolling: Compute remaining outputs */
289   blkCnt = blockSize % 0x4U;
290 
291 #else
292 
293   /* Initialize blkCnt with number of samples */
294   blkCnt = blockSize;
295 
296 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
297 
298   while (blkCnt > 0U)
299   {
300     /* Perform Multiply-Accumulate */
301     out = *pOut;
302     out += ((q63_t) *px++ * coeff) >> 32;
303     *pOut++ = (q31_t) (out);
304 
305     /* Decrement loop counter */
306     blkCnt--;
307   }
308 
309   /* Working output pointer is updated */
310   pOut = pDst;
311 
312   /* Output is converted into 1.31 format. */
313 #if defined (ARM_MATH_LOOPUNROLL)
314 
315   /* Loop unrolling: Compute 4 outputs at a time. */
316   blkCnt = blockSize >> 2U;
317 
318   while (blkCnt > 0U)
319   {
320     in = *pOut << 1;
321     *pOut++ = in;
322     in = *pOut << 1;
323     *pOut++ = in;
324     in = *pOut << 1;
325     *pOut++ = in;
326     in = *pOut << 1;
327     *pOut++ = in;
328 
329     /* Decrement loop counter */
330     blkCnt--;
331   }
332 
333   /* Loop unrolling: Compute remaining outputs */
334   blkCnt = blockSize % 0x4U;
335 
336 #else
337 
338   /* Initialize blkCnt with number of samples */
339   blkCnt = blockSize;
340 
341 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
342 
343   while (blkCnt > 0U)
344   {
345     in = *pOut << 1;
346     *pOut++ = in;
347 
348     /* Decrement loop counter */
349     blkCnt--;
350   }
351 
352 }
353 
354 /**
355   @} end of FIR_Sparse group
356  */
357