1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_sparse_q31.c
4  * Description:  Q31 sparse FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_Sparse
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q31 sparse FIR filter.
42   @param[in]     S           points to an instance of the Q31 sparse FIR structure
43   @param[in]     pSrc        points to the block of input data
44   @param[out]    pDst        points to the block of output data
45   @param[in]     pScratchIn  points to a temporary buffer of size blockSize
46   @param[in]     blockSize   number of input samples to process
47   @return        none
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using an internal 32-bit accumulator.
51                    The 1.31 x 1.31 multiplications are truncated to 2.30 format.
52                    This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
53                    If the accumulator result overflows, it wraps around rather than saturate.
54                    In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
55  */
56 
arm_fir_sparse_q31(arm_fir_sparse_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,q31_t * pScratchIn,uint32_t blockSize)57 void arm_fir_sparse_q31(
58         arm_fir_sparse_instance_q31 * S,
59   const q31_t * pSrc,
60         q31_t * pDst,
61         q31_t * pScratchIn,
62         uint32_t blockSize)
63 {
64         q31_t *pState = S->pState;                     /* State pointer */
65   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
66         q31_t *px;                                     /* Scratch buffer pointer */
67         q31_t *py = pState;                            /* Temporary pointers for state buffer */
68         q31_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */
69         q31_t *pOut;                                   /* Destination pointer */
70         int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
71         uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
72         uint16_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter  */
73         int32_t readIndex;                             /* Read index of the state buffer */
74         uint32_t tapCnt, blkCnt;                       /* loop counters */
75         q31_t coeff = *pCoeffs++;                      /* Read the first coefficient value */
76         q31_t in;
77         q63_t out;                                     /* Temporary output variable */
78 
79 
80   /* BlockSize of Input samples are copied into the state buffer */
81   /* StateIndex points to the starting position to write in the state buffer */
82   arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
83                         (int32_t *) pSrc, 1, blockSize);
84 
85   /* Read Index, from where the state buffer should be read, is calculated. */
86   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
87 
88   /* Wraparound of readIndex */
89   if (readIndex < 0)
90   {
91     readIndex += (int32_t) delaySize;
92   }
93 
94   /* Working pointer for state buffer is updated */
95   py = pState;
96 
97   /* blockSize samples are read from the state buffer */
98   arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
99                        (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
100 
101   /* Working pointer for the scratch buffer of state values */
102   px = pb;
103 
104   /* Working pointer for scratch buffer of output values */
105   pOut = pDst;
106 
107 
108 #if defined (ARM_MATH_LOOPUNROLL)
109 
110   /* Loop unrolling: Compute 4 outputs at a time. */
111   blkCnt = blockSize >> 2U;
112 
113   while (blkCnt > 0U)
114   {
115     /* Perform Multiplications and store in destination buffer */
116     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
117 
118     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
119 
120     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
121 
122     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
123 
124     /* Decrement loop counter */
125     blkCnt--;
126   }
127 
128   /* Loop unrolling: Compute remaining outputs */
129   blkCnt = blockSize % 0x4U;
130 
131 #else
132 
133   /* Initialize blkCnt with number of samples */
134   blkCnt = blockSize;
135 
136 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
137 
138   while (blkCnt > 0U)
139   {
140     /* Perform Multiplication and store in destination buffer */
141     *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
142 
143     /* Decrement loop counter */
144     blkCnt--;
145   }
146 
147   /* Load the coefficient value and
148    * increment the coefficient buffer for the next set of state values */
149   coeff = *pCoeffs++;
150 
151   /* Read Index, from where the state buffer should be read, is calculated. */
152   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
153 
154   /* Wraparound of readIndex */
155   if (readIndex < 0)
156   {
157     readIndex += (int32_t) delaySize;
158   }
159 
160   /* Loop over the number of taps. */
161   tapCnt = (uint32_t) numTaps - 2U;
162 
163   while (tapCnt > 0U)
164   {
165     /* Working pointer for state buffer is updated */
166     py = pState;
167 
168     /* blockSize samples are read from the state buffer */
169     arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
170                          (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
171 
172     /* Working pointer for the scratch buffer of state values */
173     px = pb;
174 
175     /* Working pointer for scratch buffer of output values */
176     pOut = pDst;
177 
178 
179 #if defined (ARM_MATH_LOOPUNROLL)
180 
181     /* Loop unrolling: Compute 4 outputs at a time. */
182     blkCnt = blockSize >> 2U;
183 
184     while (blkCnt > 0U)
185     {
186       /* Perform Multiply-Accumulate */
187       out = *pOut;
188       out += ((q63_t) *px++ * coeff) >> 32;
189       *pOut++ = (q31_t) (out);
190 
191       out = *pOut;
192       out += ((q63_t) *px++ * coeff) >> 32;
193       *pOut++ = (q31_t) (out);
194 
195       out = *pOut;
196       out += ((q63_t) *px++ * coeff) >> 32;
197       *pOut++ = (q31_t) (out);
198 
199       out = *pOut;
200       out += ((q63_t) *px++ * coeff) >> 32;
201       *pOut++ = (q31_t) (out);
202 
203       /* Decrement loop counter */
204       blkCnt--;
205     }
206 
207     /* Loop unrolling: Compute remaining outputs */
208     blkCnt = blockSize % 0x4U;
209 
210 #else
211 
212     /* Initialize blkCnt with number of samples */
213     blkCnt = blockSize;
214 
215 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
216 
217     while (blkCnt > 0U)
218     {
219       /* Perform Multiply-Accumulate */
220       out = *pOut;
221       out += ((q63_t) *px++ * coeff) >> 32;
222       *pOut++ = (q31_t) (out);
223 
224       /* Decrement loop counter */
225       blkCnt--;
226     }
227 
228     /* Load the coefficient value and
229      * increment the coefficient buffer for the next set of state values */
230     coeff = *pCoeffs++;
231 
232     /* Read Index, from where the state buffer should be read, is calculated. */
233     readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
234 
235     /* Wraparound of readIndex */
236     if (readIndex < 0)
237     {
238       readIndex += (int32_t) delaySize;
239     }
240 
241     /* Decrement tap loop counter */
242     tapCnt--;
243   }
244 
245   /* Compute last tap without the final read of pTapDelay */
246 
247   /* Working pointer for state buffer is updated */
248   py = pState;
249 
250   /* blockSize samples are read from the state buffer */
251   arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
252                        (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
253 
254   /* Working pointer for the scratch buffer of state values */
255   px = pb;
256 
257   /* Working pointer for scratch buffer of output values */
258   pOut = pDst;
259 
260 
261 #if defined (ARM_MATH_LOOPUNROLL)
262 
263   /* Loop unrolling: Compute 4 outputs at a time. */
264   blkCnt = blockSize >> 2U;
265 
266   while (blkCnt > 0U)
267   {
268     /* Perform Multiply-Accumulate */
269     out = *pOut;
270     out += ((q63_t) * px++ * coeff) >> 32;
271     *pOut++ = (q31_t) (out);
272 
273     out = *pOut;
274     out += ((q63_t) * px++ * coeff) >> 32;
275     *pOut++ = (q31_t) (out);
276 
277     out = *pOut;
278     out += ((q63_t) * px++ * coeff) >> 32;
279     *pOut++ = (q31_t) (out);
280 
281     out = *pOut;
282     out += ((q63_t) * px++ * coeff) >> 32;
283     *pOut++ = (q31_t) (out);
284 
285     /* Decrement loop counter */
286     blkCnt--;
287   }
288 
289   /* Loop unrolling: Compute remaining outputs */
290   blkCnt = blockSize % 0x4U;
291 
292 #else
293 
294   /* Initialize blkCnt with number of samples */
295   blkCnt = blockSize;
296 
297 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
298 
299   while (blkCnt > 0U)
300   {
301     /* Perform Multiply-Accumulate */
302     out = *pOut;
303     out += ((q63_t) *px++ * coeff) >> 32;
304     *pOut++ = (q31_t) (out);
305 
306     /* Decrement loop counter */
307     blkCnt--;
308   }
309 
310   /* Working output pointer is updated */
311   pOut = pDst;
312 
313   /* Output is converted into 1.31 format. */
314 #if defined (ARM_MATH_LOOPUNROLL)
315 
316   /* Loop unrolling: Compute 4 outputs at a time. */
317   blkCnt = blockSize >> 2U;
318 
319   while (blkCnt > 0U)
320   {
321     in = *pOut << 1;
322     *pOut++ = in;
323     in = *pOut << 1;
324     *pOut++ = in;
325     in = *pOut << 1;
326     *pOut++ = in;
327     in = *pOut << 1;
328     *pOut++ = in;
329 
330     /* Decrement loop counter */
331     blkCnt--;
332   }
333 
334   /* Loop unrolling: Compute remaining outputs */
335   blkCnt = blockSize % 0x4U;
336 
337 #else
338 
339   /* Initialize blkCnt with number of samples */
340   blkCnt = blockSize;
341 
342 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
343 
344   while (blkCnt > 0U)
345   {
346     in = *pOut << 1;
347     *pOut++ = in;
348 
349     /* Decrement loop counter */
350     blkCnt--;
351   }
352 
353 }
354 
355 /**
356   @} end of FIR_Sparse group
357  */
358