1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_sparse_q31.c
4 * Description: Q31 sparse FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_Sparse
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q31 sparse FIR filter.
42 @param[in] S points to an instance of the Q31 sparse FIR structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] pScratchIn points to a temporary buffer of size blockSize
46 @param[in] blockSize number of input samples to process
47 @return none
48
49 @par Scaling and Overflow Behavior
50 The function is implemented using an internal 32-bit accumulator.
51 The 1.31 x 1.31 multiplications are truncated to 2.30 format.
52 This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
53 If the accumulator result overflows, it wraps around rather than saturate.
54 In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
55 */
56
arm_fir_sparse_q31(arm_fir_sparse_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,q31_t * pScratchIn,uint32_t blockSize)57 void arm_fir_sparse_q31(
58 arm_fir_sparse_instance_q31 * S,
59 const q31_t * pSrc,
60 q31_t * pDst,
61 q31_t * pScratchIn,
62 uint32_t blockSize)
63 {
64 q31_t *pState = S->pState; /* State pointer */
65 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
66 q31_t *px; /* Scratch buffer pointer */
67 q31_t *py = pState; /* Temporary pointers for state buffer */
68 q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
69 q31_t *pOut; /* Destination pointer */
70 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
71 uint32_t delaySize = S->maxDelay + blockSize; /* state length */
72 uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
73 int32_t readIndex; /* Read index of the state buffer */
74 uint32_t tapCnt, blkCnt; /* loop counters */
75 q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
76 q31_t in;
77 q63_t out; /* Temporary output variable */
78
79
80 /* BlockSize of Input samples are copied into the state buffer */
81 /* StateIndex points to the starting position to write in the state buffer */
82 arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
83 (int32_t *) pSrc, 1, blockSize);
84
85 /* Read Index, from where the state buffer should be read, is calculated. */
86 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
87
88 /* Wraparound of readIndex */
89 if (readIndex < 0)
90 {
91 readIndex += (int32_t) delaySize;
92 }
93
94 /* Working pointer for state buffer is updated */
95 py = pState;
96
97 /* blockSize samples are read from the state buffer */
98 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
99 (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
100
101 /* Working pointer for the scratch buffer of state values */
102 px = pb;
103
104 /* Working pointer for scratch buffer of output values */
105 pOut = pDst;
106
107
108 #if defined (ARM_MATH_LOOPUNROLL)
109
110 /* Loop unrolling: Compute 4 outputs at a time. */
111 blkCnt = blockSize >> 2U;
112
113 while (blkCnt > 0U)
114 {
115 /* Perform Multiplications and store in destination buffer */
116 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
117
118 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
119
120 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
121
122 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
123
124 /* Decrement loop counter */
125 blkCnt--;
126 }
127
128 /* Loop unrolling: Compute remaining outputs */
129 blkCnt = blockSize % 0x4U;
130
131 #else
132
133 /* Initialize blkCnt with number of samples */
134 blkCnt = blockSize;
135
136 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
137
138 while (blkCnt > 0U)
139 {
140 /* Perform Multiplication and store in destination buffer */
141 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
142
143 /* Decrement loop counter */
144 blkCnt--;
145 }
146
147 /* Load the coefficient value and
148 * increment the coefficient buffer for the next set of state values */
149 coeff = *pCoeffs++;
150
151 /* Read Index, from where the state buffer should be read, is calculated. */
152 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
153
154 /* Wraparound of readIndex */
155 if (readIndex < 0)
156 {
157 readIndex += (int32_t) delaySize;
158 }
159
160 /* Loop over the number of taps. */
161 tapCnt = (uint32_t) numTaps - 2U;
162
163 while (tapCnt > 0U)
164 {
165 /* Working pointer for state buffer is updated */
166 py = pState;
167
168 /* blockSize samples are read from the state buffer */
169 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
170 (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
171
172 /* Working pointer for the scratch buffer of state values */
173 px = pb;
174
175 /* Working pointer for scratch buffer of output values */
176 pOut = pDst;
177
178
179 #if defined (ARM_MATH_LOOPUNROLL)
180
181 /* Loop unrolling: Compute 4 outputs at a time. */
182 blkCnt = blockSize >> 2U;
183
184 while (blkCnt > 0U)
185 {
186 /* Perform Multiply-Accumulate */
187 out = *pOut;
188 out += ((q63_t) *px++ * coeff) >> 32;
189 *pOut++ = (q31_t) (out);
190
191 out = *pOut;
192 out += ((q63_t) *px++ * coeff) >> 32;
193 *pOut++ = (q31_t) (out);
194
195 out = *pOut;
196 out += ((q63_t) *px++ * coeff) >> 32;
197 *pOut++ = (q31_t) (out);
198
199 out = *pOut;
200 out += ((q63_t) *px++ * coeff) >> 32;
201 *pOut++ = (q31_t) (out);
202
203 /* Decrement loop counter */
204 blkCnt--;
205 }
206
207 /* Loop unrolling: Compute remaining outputs */
208 blkCnt = blockSize % 0x4U;
209
210 #else
211
212 /* Initialize blkCnt with number of samples */
213 blkCnt = blockSize;
214
215 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
216
217 while (blkCnt > 0U)
218 {
219 /* Perform Multiply-Accumulate */
220 out = *pOut;
221 out += ((q63_t) *px++ * coeff) >> 32;
222 *pOut++ = (q31_t) (out);
223
224 /* Decrement loop counter */
225 blkCnt--;
226 }
227
228 /* Load the coefficient value and
229 * increment the coefficient buffer for the next set of state values */
230 coeff = *pCoeffs++;
231
232 /* Read Index, from where the state buffer should be read, is calculated. */
233 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
234
235 /* Wraparound of readIndex */
236 if (readIndex < 0)
237 {
238 readIndex += (int32_t) delaySize;
239 }
240
241 /* Decrement tap loop counter */
242 tapCnt--;
243 }
244
245 /* Compute last tap without the final read of pTapDelay */
246
247 /* Working pointer for state buffer is updated */
248 py = pState;
249
250 /* blockSize samples are read from the state buffer */
251 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
252 (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
253
254 /* Working pointer for the scratch buffer of state values */
255 px = pb;
256
257 /* Working pointer for scratch buffer of output values */
258 pOut = pDst;
259
260
261 #if defined (ARM_MATH_LOOPUNROLL)
262
263 /* Loop unrolling: Compute 4 outputs at a time. */
264 blkCnt = blockSize >> 2U;
265
266 while (blkCnt > 0U)
267 {
268 /* Perform Multiply-Accumulate */
269 out = *pOut;
270 out += ((q63_t) * px++ * coeff) >> 32;
271 *pOut++ = (q31_t) (out);
272
273 out = *pOut;
274 out += ((q63_t) * px++ * coeff) >> 32;
275 *pOut++ = (q31_t) (out);
276
277 out = *pOut;
278 out += ((q63_t) * px++ * coeff) >> 32;
279 *pOut++ = (q31_t) (out);
280
281 out = *pOut;
282 out += ((q63_t) * px++ * coeff) >> 32;
283 *pOut++ = (q31_t) (out);
284
285 /* Decrement loop counter */
286 blkCnt--;
287 }
288
289 /* Loop unrolling: Compute remaining outputs */
290 blkCnt = blockSize % 0x4U;
291
292 #else
293
294 /* Initialize blkCnt with number of samples */
295 blkCnt = blockSize;
296
297 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
298
299 while (blkCnt > 0U)
300 {
301 /* Perform Multiply-Accumulate */
302 out = *pOut;
303 out += ((q63_t) *px++ * coeff) >> 32;
304 *pOut++ = (q31_t) (out);
305
306 /* Decrement loop counter */
307 blkCnt--;
308 }
309
310 /* Working output pointer is updated */
311 pOut = pDst;
312
313 /* Output is converted into 1.31 format. */
314 #if defined (ARM_MATH_LOOPUNROLL)
315
316 /* Loop unrolling: Compute 4 outputs at a time. */
317 blkCnt = blockSize >> 2U;
318
319 while (blkCnt > 0U)
320 {
321 in = *pOut << 1;
322 *pOut++ = in;
323 in = *pOut << 1;
324 *pOut++ = in;
325 in = *pOut << 1;
326 *pOut++ = in;
327 in = *pOut << 1;
328 *pOut++ = in;
329
330 /* Decrement loop counter */
331 blkCnt--;
332 }
333
334 /* Loop unrolling: Compute remaining outputs */
335 blkCnt = blockSize % 0x4U;
336
337 #else
338
339 /* Initialize blkCnt with number of samples */
340 blkCnt = blockSize;
341
342 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
343
344 while (blkCnt > 0U)
345 {
346 in = *pOut << 1;
347 *pOut++ = in;
348
349 /* Decrement loop counter */
350 blkCnt--;
351 }
352
353 }
354
355 /**
356 @} end of FIR_Sparse group
357 */
358