1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_sparse_q31.c
4 * Description: Q31 sparse FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_Sparse
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q31 sparse FIR filter.
42 @param[in] S points to an instance of the Q31 sparse FIR structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] pScratchIn points to a temporary buffer of size blockSize
46 @param[in] blockSize number of input samples to process
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using an internal 32-bit accumulator.
50 The 1.31 x 1.31 multiplications are truncated to 2.30 format.
51 This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
52 If the accumulator result overflows, it wraps around rather than saturate.
53 In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
54 */
55
arm_fir_sparse_q31(arm_fir_sparse_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,q31_t * pScratchIn,uint32_t blockSize)56 void arm_fir_sparse_q31(
57 arm_fir_sparse_instance_q31 * S,
58 const q31_t * pSrc,
59 q31_t * pDst,
60 q31_t * pScratchIn,
61 uint32_t blockSize)
62 {
63 q31_t *pState = S->pState; /* State pointer */
64 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
65 q31_t *px; /* Scratch buffer pointer */
66 q31_t *py = pState; /* Temporary pointers for state buffer */
67 q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
68 q31_t *pOut; /* Destination pointer */
69 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
70 uint32_t delaySize = S->maxDelay + blockSize; /* state length */
71 uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
72 int32_t readIndex; /* Read index of the state buffer */
73 uint32_t tapCnt, blkCnt; /* loop counters */
74 q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
75 q31_t in;
76 q63_t out; /* Temporary output variable */
77
78
79 /* BlockSize of Input samples are copied into the state buffer */
80 /* StateIndex points to the starting position to write in the state buffer */
81 arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
82 (int32_t *) pSrc, 1, blockSize);
83
84 /* Read Index, from where the state buffer should be read, is calculated. */
85 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
86
87 /* Wraparound of readIndex */
88 if (readIndex < 0)
89 {
90 readIndex += (int32_t) delaySize;
91 }
92
93 /* Working pointer for state buffer is updated */
94 py = pState;
95
96 /* blockSize samples are read from the state buffer */
97 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
98 (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
99
100 /* Working pointer for the scratch buffer of state values */
101 px = pb;
102
103 /* Working pointer for scratch buffer of output values */
104 pOut = pDst;
105
106
107 #if defined (ARM_MATH_LOOPUNROLL)
108
109 /* Loop unrolling: Compute 4 outputs at a time. */
110 blkCnt = blockSize >> 2U;
111
112 while (blkCnt > 0U)
113 {
114 /* Perform Multiplications and store in destination buffer */
115 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
116
117 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
118
119 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
120
121 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
122
123 /* Decrement loop counter */
124 blkCnt--;
125 }
126
127 /* Loop unrolling: Compute remaining outputs */
128 blkCnt = blockSize % 0x4U;
129
130 #else
131
132 /* Initialize blkCnt with number of samples */
133 blkCnt = blockSize;
134
135 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
136
137 while (blkCnt > 0U)
138 {
139 /* Perform Multiplication and store in destination buffer */
140 *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
141
142 /* Decrement loop counter */
143 blkCnt--;
144 }
145
146 /* Load the coefficient value and
147 * increment the coefficient buffer for the next set of state values */
148 coeff = *pCoeffs++;
149
150 /* Read Index, from where the state buffer should be read, is calculated. */
151 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
152
153 /* Wraparound of readIndex */
154 if (readIndex < 0)
155 {
156 readIndex += (int32_t) delaySize;
157 }
158
159 /* Loop over the number of taps. */
160 tapCnt = (uint32_t) numTaps - 2U;
161
162 while (tapCnt > 0U)
163 {
164 /* Working pointer for state buffer is updated */
165 py = pState;
166
167 /* blockSize samples are read from the state buffer */
168 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
169 (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
170
171 /* Working pointer for the scratch buffer of state values */
172 px = pb;
173
174 /* Working pointer for scratch buffer of output values */
175 pOut = pDst;
176
177
178 #if defined (ARM_MATH_LOOPUNROLL)
179
180 /* Loop unrolling: Compute 4 outputs at a time. */
181 blkCnt = blockSize >> 2U;
182
183 while (blkCnt > 0U)
184 {
185 /* Perform Multiply-Accumulate */
186 out = *pOut;
187 out += ((q63_t) *px++ * coeff) >> 32;
188 *pOut++ = (q31_t) (out);
189
190 out = *pOut;
191 out += ((q63_t) *px++ * coeff) >> 32;
192 *pOut++ = (q31_t) (out);
193
194 out = *pOut;
195 out += ((q63_t) *px++ * coeff) >> 32;
196 *pOut++ = (q31_t) (out);
197
198 out = *pOut;
199 out += ((q63_t) *px++ * coeff) >> 32;
200 *pOut++ = (q31_t) (out);
201
202 /* Decrement loop counter */
203 blkCnt--;
204 }
205
206 /* Loop unrolling: Compute remaining outputs */
207 blkCnt = blockSize % 0x4U;
208
209 #else
210
211 /* Initialize blkCnt with number of samples */
212 blkCnt = blockSize;
213
214 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
215
216 while (blkCnt > 0U)
217 {
218 /* Perform Multiply-Accumulate */
219 out = *pOut;
220 out += ((q63_t) *px++ * coeff) >> 32;
221 *pOut++ = (q31_t) (out);
222
223 /* Decrement loop counter */
224 blkCnt--;
225 }
226
227 /* Load the coefficient value and
228 * increment the coefficient buffer for the next set of state values */
229 coeff = *pCoeffs++;
230
231 /* Read Index, from where the state buffer should be read, is calculated. */
232 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
233
234 /* Wraparound of readIndex */
235 if (readIndex < 0)
236 {
237 readIndex += (int32_t) delaySize;
238 }
239
240 /* Decrement tap loop counter */
241 tapCnt--;
242 }
243
244 /* Compute last tap without the final read of pTapDelay */
245
246 /* Working pointer for state buffer is updated */
247 py = pState;
248
249 /* blockSize samples are read from the state buffer */
250 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
251 (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
252
253 /* Working pointer for the scratch buffer of state values */
254 px = pb;
255
256 /* Working pointer for scratch buffer of output values */
257 pOut = pDst;
258
259
260 #if defined (ARM_MATH_LOOPUNROLL)
261
262 /* Loop unrolling: Compute 4 outputs at a time. */
263 blkCnt = blockSize >> 2U;
264
265 while (blkCnt > 0U)
266 {
267 /* Perform Multiply-Accumulate */
268 out = *pOut;
269 out += ((q63_t) * px++ * coeff) >> 32;
270 *pOut++ = (q31_t) (out);
271
272 out = *pOut;
273 out += ((q63_t) * px++ * coeff) >> 32;
274 *pOut++ = (q31_t) (out);
275
276 out = *pOut;
277 out += ((q63_t) * px++ * coeff) >> 32;
278 *pOut++ = (q31_t) (out);
279
280 out = *pOut;
281 out += ((q63_t) * px++ * coeff) >> 32;
282 *pOut++ = (q31_t) (out);
283
284 /* Decrement loop counter */
285 blkCnt--;
286 }
287
288 /* Loop unrolling: Compute remaining outputs */
289 blkCnt = blockSize % 0x4U;
290
291 #else
292
293 /* Initialize blkCnt with number of samples */
294 blkCnt = blockSize;
295
296 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
297
298 while (blkCnt > 0U)
299 {
300 /* Perform Multiply-Accumulate */
301 out = *pOut;
302 out += ((q63_t) *px++ * coeff) >> 32;
303 *pOut++ = (q31_t) (out);
304
305 /* Decrement loop counter */
306 blkCnt--;
307 }
308
309 /* Working output pointer is updated */
310 pOut = pDst;
311
312 /* Output is converted into 1.31 format. */
313 #if defined (ARM_MATH_LOOPUNROLL)
314
315 /* Loop unrolling: Compute 4 outputs at a time. */
316 blkCnt = blockSize >> 2U;
317
318 while (blkCnt > 0U)
319 {
320 in = *pOut << 1;
321 *pOut++ = in;
322 in = *pOut << 1;
323 *pOut++ = in;
324 in = *pOut << 1;
325 *pOut++ = in;
326 in = *pOut << 1;
327 *pOut++ = in;
328
329 /* Decrement loop counter */
330 blkCnt--;
331 }
332
333 /* Loop unrolling: Compute remaining outputs */
334 blkCnt = blockSize % 0x4U;
335
336 #else
337
338 /* Initialize blkCnt with number of samples */
339 blkCnt = blockSize;
340
341 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
342
343 while (blkCnt > 0U)
344 {
345 in = *pOut << 1;
346 *pOut++ = in;
347
348 /* Decrement loop counter */
349 blkCnt--;
350 }
351
352 }
353
354 /**
355 @} end of FIR_Sparse group
356 */
357