1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_sparse_q15.c
4  * Description:  Q15 sparse FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_Sparse
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 sparse FIR filter.
42   @param[in]     S           points to an instance of the Q15 sparse FIR structure
43   @param[in]     pSrc        points to the block of input data
44   @param[out]    pDst        points to the block of output data
45   @param[in]     pScratchIn  points to a temporary buffer of size blockSize
46   @param[in]     pScratchOut points to a temporary buffer of size blockSize
47   @param[in]     blockSize   number of input samples to process per call
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using an internal 32-bit accumulator.
51                    The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
52                    Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
53                    If the accumulator result overflows it will wrap around rather than saturate.
54                    After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
55                    In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
56  */
57 
arm_fir_sparse_q15(arm_fir_sparse_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,q15_t * pScratchIn,q31_t * pScratchOut,uint32_t blockSize)58 void arm_fir_sparse_q15(
59         arm_fir_sparse_instance_q15 * S,
60   const q15_t * pSrc,
61         q15_t * pDst,
62         q15_t * pScratchIn,
63         q31_t * pScratchOut,
64         uint32_t blockSize)
65 {
66         q15_t *pState = S->pState;                     /* State pointer */
67   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
68         q15_t *px;                                     /* Temporary pointers for scratch buffer */
69         q15_t *py = pState;                            /* Temporary pointers for state buffer */
70         q15_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */
71         q15_t *pOut = pDst;                            /* Working pointer for output */
72         int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
73         uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
74         uint16_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter  */
75         int32_t readIndex;                             /* Read index of the state buffer */
76         uint32_t tapCnt, blkCnt;                       /* loop counters */
77         q31_t *pScr2 = pScratchOut;                    /* Working pointer for scratch buffer of output values */
78         q15_t coeff = *pCoeffs++;                      /* Read the first coefficient value */
79 
80 #if defined (ARM_MATH_LOOPUNROLL)
81         q31_t in1, in2;                                /* Temporary variables */
82 #endif
83 
84   /* BlockSize of Input samples are copied into the state buffer */
85   /* StateIndex points to the starting position to write in the state buffer */
86   arm_circularWrite_q15(py, (int32_t) delaySize, &S->stateIndex, 1,pSrc, 1, blockSize);
87 
88   /* Loop over the number of taps. */
89   tapCnt = numTaps;
90 
91   /* Read Index, from where the state buffer should be read, is calculated. */
92   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
93 
94   /* Wraparound of readIndex */
95   if (readIndex < 0)
96   {
97     readIndex += (int32_t) delaySize;
98   }
99 
100   /* Working pointer for state buffer is updated */
101   py = pState;
102 
103   /* blockSize samples are read from the state buffer */
104   arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
105                        pb, pb, (int32_t) blockSize, 1, blockSize);
106 
107   /* Working pointer for the scratch buffer of state values */
108   px = pb;
109 
110   /* Working pointer for scratch buffer of output values */
111   pScratchOut = pScr2;
112 
113 
114 #if defined (ARM_MATH_LOOPUNROLL)
115 
116   /* Loop unrolling: Compute 4 outputs at a time. */
117   blkCnt = blockSize >> 2U;
118 
119   while (blkCnt > 0U)
120   {
121     /* Perform multiplication and store in the scratch buffer */
122     *pScratchOut++ = ((q31_t) *px++ * coeff);
123     *pScratchOut++ = ((q31_t) *px++ * coeff);
124     *pScratchOut++ = ((q31_t) *px++ * coeff);
125     *pScratchOut++ = ((q31_t) *px++ * coeff);
126 
127     /* Decrement loop counter */
128     blkCnt--;
129   }
130 
131   /* Loop unrolling: Compute remaining outputs */
132   blkCnt = blockSize % 0x4U;
133 
134 #else
135 
136   /* Initialize blkCnt with number of samples */
137   blkCnt = blockSize;
138 
139 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
140 
141   while (blkCnt > 0U)
142   {
143     /* Perform Multiplication and store in the scratch buffer */
144     *pScratchOut++ = ((q31_t) *px++ * coeff);
145 
146     /* Decrement loop counter */
147     blkCnt--;
148   }
149 
150   /* Load the coefficient value and
151    * increment the coefficient buffer for the next set of state values */
152   coeff = *pCoeffs++;
153 
154   /* Read Index, from where the state buffer should be read, is calculated. */
155   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
156 
157   /* Wraparound of readIndex */
158   if (readIndex < 0)
159   {
160     readIndex += (int32_t) delaySize;
161   }
162 
163   /* Loop over the number of taps. */
164   tapCnt = (uint32_t) numTaps - 2U;
165 
166   while (tapCnt > 0U)
167   {
168     /* Working pointer for state buffer is updated */
169     py = pState;
170 
171     /* blockSize samples are read from the state buffer */
172     arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
173                          pb, pb, (int32_t) blockSize, 1, blockSize);
174 
175     /* Working pointer for the scratch buffer of state values */
176     px = pb;
177 
178     /* Working pointer for scratch buffer of output values */
179     pScratchOut = pScr2;
180 
181 
182 #if defined (ARM_MATH_LOOPUNROLL)
183 
184     /* Loop unrolling: Compute 4 outputs at a time. */
185     blkCnt = blockSize >> 2U;
186 
187     while (blkCnt > 0U)
188     {
189       /* Perform Multiply-Accumulate */
190       *pScratchOut++ += (q31_t) *px++ * coeff;
191       *pScratchOut++ += (q31_t) *px++ * coeff;
192       *pScratchOut++ += (q31_t) *px++ * coeff;
193       *pScratchOut++ += (q31_t) *px++ * coeff;
194 
195       /* Decrement loop counter */
196       blkCnt--;
197     }
198 
199     /* Loop unrolling: Compute remaining outputs */
200     blkCnt = blockSize % 0x4U;
201 
202 #else
203 
204     /* Initialize blkCnt with number of samples */
205     blkCnt = blockSize;
206 
207 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
208 
209     while (blkCnt > 0U)
210     {
211       /* Perform Multiply-Accumulate */
212       *pScratchOut++ += (q31_t) *px++ * coeff;
213 
214       /* Decrement loop counter */
215       blkCnt--;
216     }
217 
218     /* Load the coefficient value and
219      * increment the coefficient buffer for the next set of state values */
220     coeff = *pCoeffs++;
221 
222     /* Read Index, from where the state buffer should be read, is calculated. */
223     readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
224 
225     /* Wraparound of readIndex */
226     if (readIndex < 0)
227     {
228       readIndex += (int32_t) delaySize;
229     }
230 
231     /* Decrement loop counter */
232     tapCnt--;
233   }
234 
235   /* Compute last tap without the final read of pTapDelay */
236 
237   /* Working pointer for state buffer is updated */
238   py = pState;
239 
240   /* blockSize samples are read from the state buffer */
241   arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
242                        pb, pb, (int32_t) blockSize, 1, blockSize);
243 
244   /* Working pointer for the scratch buffer of state values */
245   px = pb;
246 
247   /* Working pointer for scratch buffer of output values */
248   pScratchOut = pScr2;
249 
250 
251 #if defined (ARM_MATH_LOOPUNROLL)
252 
253   /* Loop unrolling: Compute 4 outputs at a time. */
254   blkCnt = blockSize >> 2U;
255 
256   while (blkCnt > 0U)
257   {
258     /* Perform Multiply-Accumulate */
259     *pScratchOut++ += (q31_t) *px++ * coeff;
260     *pScratchOut++ += (q31_t) *px++ * coeff;
261     *pScratchOut++ += (q31_t) *px++ * coeff;
262     *pScratchOut++ += (q31_t) *px++ * coeff;
263 
264     /* Decrement loop counter */
265     blkCnt--;
266   }
267 
268   /* Loop unrolling: Compute remaining outputs */
269   blkCnt = blockSize % 0x4U;
270 
271 #else
272 
273   /* Initialize blkCnt with number of samples */
274   blkCnt = blockSize;
275 
276 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
277 
278   while (blkCnt > 0U)
279   {
280     /* Perform Multiply-Accumulate */
281     *pScratchOut++ += (q31_t) *px++ * coeff;
282 
283     /* Decrement loop counter */
284     blkCnt--;
285   }
286 
287   /* All the output values are in pScratchOut buffer.
288      Convert them into 1.15 format, saturate and store in the destination buffer. */
289 #if defined (ARM_MATH_LOOPUNROLL)
290 
291   /* Loop unrolling: Compute 4 outputs at a time. */
292   blkCnt = blockSize >> 2U;
293 
294   while (blkCnt > 0U)
295   {
296     in1 = *pScr2++;
297     in2 = *pScr2++;
298 
299 #ifndef ARM_MATH_BIG_ENDIAN
300     write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
301 #else
302     write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
303 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
304 
305     in1 = *pScr2++;
306     in2 = *pScr2++;
307 
308 #ifndef ARM_MATH_BIG_ENDIAN
309     write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
310 #else
311     write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
312 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
313 
314     /* Decrement loop counter */
315     blkCnt--;
316   }
317 
318   /* Loop unrolling: Compute remaining outputs */
319   blkCnt = blockSize % 0x4U;
320 
321 #else
322 
323   /* Initialize blkCnt with number of samples */
324   blkCnt = blockSize;
325 
326 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
327 
328   while (blkCnt > 0U)
329   {
330     *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
331 
332     /* Decrement loop counter */
333     blkCnt--;
334   }
335 
336 }
337 
338 /**
339   @} end of FIR_Sparse group
340  */
341