1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_sparse_q7.c
4  * Description:  Q7 sparse FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR_Sparse
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q7 sparse FIR filter.
42   @param[in]     S           points to an instance of the Q7 sparse FIR structure
43   @param[in]     pSrc        points to the block of input data
44   @param[out]    pDst        points to the block of output data
45   @param[in]     pScratchIn  points to a temporary buffer of size blockSize
46   @param[in]     pScratchOut points to a temporary buffer of size blockSize
47   @param[in]     blockSize   number of input samples to process
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using a 32-bit internal accumulator.
51                    Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
52                    The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
53                    There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
54                    The accumulator is then converted to 18.7 format by discarding the low 7 bits.
55                    Finally, the result is truncated to 1.7 format.
56  */
57 
arm_fir_sparse_q7(arm_fir_sparse_instance_q7 * S,const q7_t * pSrc,q7_t * pDst,q7_t * pScratchIn,q31_t * pScratchOut,uint32_t blockSize)58 void arm_fir_sparse_q7(
59         arm_fir_sparse_instance_q7 * S,
60   const q7_t * pSrc,
61         q7_t * pDst,
62         q7_t * pScratchIn,
63         q31_t * pScratchOut,
64         uint32_t blockSize)
65 {
66         q7_t *pState = S->pState;                      /* State pointer */
67   const q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer */
68         q7_t *px;                                      /* Scratch buffer pointer */
69         q7_t *py = pState;                             /* Temporary pointers for state buffer */
70         q7_t *pb = pScratchIn;                         /* Temporary pointers for scratch buffer */
71         q7_t *pOut = pDst;                             /* Destination pointer */
72         int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
73         uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
74         uint16_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter  */
75         int32_t readIndex;                             /* Read index of the state buffer */
76         uint32_t tapCnt, blkCnt;                       /* loop counters */
77         q31_t *pScr2 = pScratchOut;                    /* Working pointer for scratch buffer of output values */
78         q31_t in;
79         q7_t coeff = *pCoeffs++;                       /* Read the coefficient value */
80 
81 #if defined (ARM_MATH_LOOPUNROLL)
82         q7_t in1, in2, in3, in4;
83 #endif
84 
85   /* BlockSize of Input samples are copied into the state buffer */
86   /* StateIndex points to the starting position to write in the state buffer */
87   arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, blockSize);
88 
89   /* Loop over the number of taps. */
90   tapCnt = numTaps;
91 
92   /* Read Index, from where the state buffer should be read, is calculated. */
93   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
94 
95   /* Wraparound of readIndex */
96   if (readIndex < 0)
97   {
98     readIndex += (int32_t) delaySize;
99   }
100 
101   /* Working pointer for state buffer is updated */
102   py = pState;
103 
104   /* blockSize samples are read from the state buffer */
105   arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
106                    pb, pb, (int32_t) blockSize, 1, blockSize);
107 
108   /* Working pointer for the scratch buffer of state values */
109   px = pb;
110 
111   /* Working pointer for scratch buffer of output values */
112   pScratchOut = pScr2;
113 
114 
115 #if defined (ARM_MATH_LOOPUNROLL)
116 
117   /* Loop unrolling: Compute 4 outputs at a time. */
118   blkCnt = blockSize >> 2U;
119 
120   while (blkCnt > 0U)
121   {
122     /* Perform multiplication and store in the scratch buffer */
123     *pScratchOut++ = ((q31_t) *px++ * coeff);
124     *pScratchOut++ = ((q31_t) *px++ * coeff);
125     *pScratchOut++ = ((q31_t) *px++ * coeff);
126     *pScratchOut++ = ((q31_t) *px++ * coeff);
127 
128     /* Decrement loop counter */
129     blkCnt--;
130   }
131 
132   /* Loop unrolling: Compute remaining outputs */
133   blkCnt = blockSize % 0x4U;
134 
135 #else
136 
137   /* Initialize blkCnt with number of samples */
138   blkCnt = blockSize;
139 
140 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
141 
142   while (blkCnt > 0U)
143   {
144     /* Perform Multiplication and store in the scratch buffer */
145     *pScratchOut++ = ((q31_t) *px++ * coeff);
146 
147     /* Decrement loop counter */
148     blkCnt--;
149   }
150 
151   /* Load the coefficient value and
152    * increment the coefficient buffer for the next set of state values */
153   coeff = *pCoeffs++;
154 
155   /* Read Index, from where the state buffer should be read, is calculated. */
156   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
157 
158   /* Wraparound of readIndex */
159   if (readIndex < 0)
160   {
161     readIndex += (int32_t) delaySize;
162   }
163 
164   /* Loop over the number of taps. */
165   tapCnt = (uint32_t) numTaps - 2U;
166 
167   while (tapCnt > 0U)
168   {
169     /* Working pointer for state buffer is updated */
170     py = pState;
171 
172     /* blockSize samples are read from the state buffer */
173     arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
174                         pb, pb, (int32_t) blockSize, 1, blockSize);
175 
176     /* Working pointer for the scratch buffer of state values */
177     px = pb;
178 
179     /* Working pointer for scratch buffer of output values */
180     pScratchOut = pScr2;
181 
182 
183 #if defined (ARM_MATH_LOOPUNROLL)
184 
185     /* Loop unrolling: Compute 4 outputs at a time. */
186     blkCnt = blockSize >> 2U;
187 
188     while (blkCnt > 0U)
189     {
190       /* Perform Multiply-Accumulate */
191       in = *pScratchOut + ((q31_t) * px++ * coeff);
192       *pScratchOut++ = in;
193       in = *pScratchOut + ((q31_t) * px++ * coeff);
194       *pScratchOut++ = in;
195       in = *pScratchOut + ((q31_t) * px++ * coeff);
196       *pScratchOut++ = in;
197       in = *pScratchOut + ((q31_t) * px++ * coeff);
198       *pScratchOut++ = in;
199 
200       /* Decrement loop counter */
201       blkCnt--;
202     }
203 
204     /* Loop unrolling: Compute remaining outputs */
205     blkCnt = blockSize % 0x4U;
206 
207 #else
208 
209     /* Initialize blkCnt with number of samples */
210     blkCnt = blockSize;
211 
212 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
213 
214     while (blkCnt > 0U)
215     {
216       /* Perform Multiply-Accumulate */
217       in = *pScratchOut + ((q31_t) *px++ * coeff);
218       *pScratchOut++ = in;
219 
220       /* Decrement loop counter */
221       blkCnt--;
222     }
223 
224     /* Load the coefficient value and
225      * increment the coefficient buffer for the next set of state values */
226     coeff = *pCoeffs++;
227 
228     /* Read Index, from where the state buffer should be read, is calculated. */
229     readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
230 
231     /* Wraparound of readIndex */
232     if (readIndex < 0)
233     {
234       readIndex += (int32_t) delaySize;
235     }
236 
237     /* Decrement loop counter */
238     tapCnt--;
239   }
240 
241   /* Compute last tap without the final read of pTapDelay */
242 
243   /* Working pointer for state buffer is updated */
244   py = pState;
245 
246   /* blockSize samples are read from the state buffer */
247   arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
248                       pb, pb, (int32_t) blockSize, 1, blockSize);
249 
250   /* Working pointer for the scratch buffer of state values */
251   px = pb;
252 
253   /* Working pointer for scratch buffer of output values */
254   pScratchOut = pScr2;
255 
256 
257 #if defined (ARM_MATH_LOOPUNROLL)
258 
259   /* Loop unrolling: Compute 4 outputs at a time. */
260   blkCnt = blockSize >> 2U;
261 
262   while (blkCnt > 0U)
263   {
264     /* Perform Multiply-Accumulate */
265     in = *pScratchOut + ((q31_t) *px++ * coeff);
266     *pScratchOut++ = in;
267     in = *pScratchOut + ((q31_t) *px++ * coeff);
268     *pScratchOut++ = in;
269     in = *pScratchOut + ((q31_t) *px++ * coeff);
270     *pScratchOut++ = in;
271     in = *pScratchOut + ((q31_t) *px++ * coeff);
272     *pScratchOut++ = in;
273 
274     /* Decrement loop counter */
275     blkCnt--;
276   }
277 
278   /* Loop unrolling: Compute remaining outputs */
279   blkCnt = blockSize % 0x4U;
280 
281 #else
282 
283   /* Initialize blkCnt with number of samples */
284   blkCnt = blockSize;
285 
286 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
287 
288   while (blkCnt > 0U)
289   {
290     /* Perform Multiply-Accumulate */
291     in = *pScratchOut + ((q31_t) *px++ * coeff);
292     *pScratchOut++ = in;
293 
294     /* Decrement loop counter */
295     blkCnt--;
296   }
297 
298   /* All the output values are in pScratchOut buffer.
299      Convert them into 1.15 format, saturate and store in the destination buffer. */
300 #if defined (ARM_MATH_LOOPUNROLL)
301 
302   /* Loop unrolling: Compute 4 outputs at a time. */
303   blkCnt = blockSize >> 2U;
304 
305   while (blkCnt > 0U)
306   {
307     in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
308     in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
309     in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
310     in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
311 
312     write_q7x4_ia (&pOut, __PACKq7(in1, in2, in3, in4));
313 
314     /* Decrement loop counter */
315     blkCnt--;
316   }
317 
318   /* Loop unrolling: Compute remaining outputs */
319   blkCnt = blockSize % 0x4U;
320 
321 #else
322 
323   /* Initialize blkCnt with number of samples */
324   blkCnt = blockSize;
325 
326 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
327 
328   while (blkCnt > 0U)
329   {
330     *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
331 
332     /* Decrement loop counter */
333     blkCnt--;
334   }
335 
336 }
337 
338 /**
339   @} end of FIR_Sparse group
340  */
341