1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_sparse_q15.c
4 * Description: Q15 sparse FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_Sparse
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q15 sparse FIR filter.
42 @param[in] S points to an instance of the Q15 sparse FIR structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] pScratchIn points to a temporary buffer of size blockSize
46 @param[in] pScratchOut points to a temporary buffer of size blockSize
47 @param[in] blockSize number of input samples to process per call
48
49 @par Scaling and Overflow Behavior
50 The function is implemented using an internal 32-bit accumulator.
51 The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
52 Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
53 If the accumulator result overflows it will wrap around rather than saturate.
54 After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
55 In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
56 */
57
arm_fir_sparse_q15(arm_fir_sparse_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,q15_t * pScratchIn,q31_t * pScratchOut,uint32_t blockSize)58 void arm_fir_sparse_q15(
59 arm_fir_sparse_instance_q15 * S,
60 const q15_t * pSrc,
61 q15_t * pDst,
62 q15_t * pScratchIn,
63 q31_t * pScratchOut,
64 uint32_t blockSize)
65 {
66 q15_t *pState = S->pState; /* State pointer */
67 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
68 q15_t *px; /* Temporary pointers for scratch buffer */
69 q15_t *py = pState; /* Temporary pointers for state buffer */
70 q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
71 q15_t *pOut = pDst; /* Working pointer for output */
72 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
73 uint32_t delaySize = S->maxDelay + blockSize; /* state length */
74 uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
75 int32_t readIndex; /* Read index of the state buffer */
76 uint32_t tapCnt, blkCnt; /* loop counters */
77 q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
78 q15_t coeff = *pCoeffs++; /* Read the first coefficient value */
79
80 #if defined (ARM_MATH_LOOPUNROLL)
81 q31_t in1, in2; /* Temporary variables */
82 #endif
83
84 /* BlockSize of Input samples are copied into the state buffer */
85 /* StateIndex points to the starting position to write in the state buffer */
86 arm_circularWrite_q15(py, (int32_t) delaySize, &S->stateIndex, 1,pSrc, 1, blockSize);
87
88 /* Loop over the number of taps. */
89 tapCnt = numTaps;
90
91 /* Read Index, from where the state buffer should be read, is calculated. */
92 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
93
94 /* Wraparound of readIndex */
95 if (readIndex < 0)
96 {
97 readIndex += (int32_t) delaySize;
98 }
99
100 /* Working pointer for state buffer is updated */
101 py = pState;
102
103 /* blockSize samples are read from the state buffer */
104 arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
105 pb, pb, (int32_t) blockSize, 1, blockSize);
106
107 /* Working pointer for the scratch buffer of state values */
108 px = pb;
109
110 /* Working pointer for scratch buffer of output values */
111 pScratchOut = pScr2;
112
113
114 #if defined (ARM_MATH_LOOPUNROLL)
115
116 /* Loop unrolling: Compute 4 outputs at a time. */
117 blkCnt = blockSize >> 2U;
118
119 while (blkCnt > 0U)
120 {
121 /* Perform multiplication and store in the scratch buffer */
122 *pScratchOut++ = ((q31_t) *px++ * coeff);
123 *pScratchOut++ = ((q31_t) *px++ * coeff);
124 *pScratchOut++ = ((q31_t) *px++ * coeff);
125 *pScratchOut++ = ((q31_t) *px++ * coeff);
126
127 /* Decrement loop counter */
128 blkCnt--;
129 }
130
131 /* Loop unrolling: Compute remaining outputs */
132 blkCnt = blockSize % 0x4U;
133
134 #else
135
136 /* Initialize blkCnt with number of samples */
137 blkCnt = blockSize;
138
139 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
140
141 while (blkCnt > 0U)
142 {
143 /* Perform Multiplication and store in the scratch buffer */
144 *pScratchOut++ = ((q31_t) *px++ * coeff);
145
146 /* Decrement loop counter */
147 blkCnt--;
148 }
149
150 /* Load the coefficient value and
151 * increment the coefficient buffer for the next set of state values */
152 coeff = *pCoeffs++;
153
154 /* Read Index, from where the state buffer should be read, is calculated. */
155 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
156
157 /* Wraparound of readIndex */
158 if (readIndex < 0)
159 {
160 readIndex += (int32_t) delaySize;
161 }
162
163 /* Loop over the number of taps. */
164 tapCnt = (uint32_t) numTaps - 2U;
165
166 while (tapCnt > 0U)
167 {
168 /* Working pointer for state buffer is updated */
169 py = pState;
170
171 /* blockSize samples are read from the state buffer */
172 arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
173 pb, pb, (int32_t) blockSize, 1, blockSize);
174
175 /* Working pointer for the scratch buffer of state values */
176 px = pb;
177
178 /* Working pointer for scratch buffer of output values */
179 pScratchOut = pScr2;
180
181
182 #if defined (ARM_MATH_LOOPUNROLL)
183
184 /* Loop unrolling: Compute 4 outputs at a time. */
185 blkCnt = blockSize >> 2U;
186
187 while (blkCnt > 0U)
188 {
189 /* Perform Multiply-Accumulate */
190 *pScratchOut++ += (q31_t) *px++ * coeff;
191 *pScratchOut++ += (q31_t) *px++ * coeff;
192 *pScratchOut++ += (q31_t) *px++ * coeff;
193 *pScratchOut++ += (q31_t) *px++ * coeff;
194
195 /* Decrement loop counter */
196 blkCnt--;
197 }
198
199 /* Loop unrolling: Compute remaining outputs */
200 blkCnt = blockSize % 0x4U;
201
202 #else
203
204 /* Initialize blkCnt with number of samples */
205 blkCnt = blockSize;
206
207 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
208
209 while (blkCnt > 0U)
210 {
211 /* Perform Multiply-Accumulate */
212 *pScratchOut++ += (q31_t) *px++ * coeff;
213
214 /* Decrement loop counter */
215 blkCnt--;
216 }
217
218 /* Load the coefficient value and
219 * increment the coefficient buffer for the next set of state values */
220 coeff = *pCoeffs++;
221
222 /* Read Index, from where the state buffer should be read, is calculated. */
223 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
224
225 /* Wraparound of readIndex */
226 if (readIndex < 0)
227 {
228 readIndex += (int32_t) delaySize;
229 }
230
231 /* Decrement loop counter */
232 tapCnt--;
233 }
234
235 /* Compute last tap without the final read of pTapDelay */
236
237 /* Working pointer for state buffer is updated */
238 py = pState;
239
240 /* blockSize samples are read from the state buffer */
241 arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
242 pb, pb, (int32_t) blockSize, 1, blockSize);
243
244 /* Working pointer for the scratch buffer of state values */
245 px = pb;
246
247 /* Working pointer for scratch buffer of output values */
248 pScratchOut = pScr2;
249
250
251 #if defined (ARM_MATH_LOOPUNROLL)
252
253 /* Loop unrolling: Compute 4 outputs at a time. */
254 blkCnt = blockSize >> 2U;
255
256 while (blkCnt > 0U)
257 {
258 /* Perform Multiply-Accumulate */
259 *pScratchOut++ += (q31_t) *px++ * coeff;
260 *pScratchOut++ += (q31_t) *px++ * coeff;
261 *pScratchOut++ += (q31_t) *px++ * coeff;
262 *pScratchOut++ += (q31_t) *px++ * coeff;
263
264 /* Decrement loop counter */
265 blkCnt--;
266 }
267
268 /* Loop unrolling: Compute remaining outputs */
269 blkCnt = blockSize % 0x4U;
270
271 #else
272
273 /* Initialize blkCnt with number of samples */
274 blkCnt = blockSize;
275
276 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
277
278 while (blkCnt > 0U)
279 {
280 /* Perform Multiply-Accumulate */
281 *pScratchOut++ += (q31_t) *px++ * coeff;
282
283 /* Decrement loop counter */
284 blkCnt--;
285 }
286
287 /* All the output values are in pScratchOut buffer.
288 Convert them into 1.15 format, saturate and store in the destination buffer. */
289 #if defined (ARM_MATH_LOOPUNROLL)
290
291 /* Loop unrolling: Compute 4 outputs at a time. */
292 blkCnt = blockSize >> 2U;
293
294 while (blkCnt > 0U)
295 {
296 in1 = *pScr2++;
297 in2 = *pScr2++;
298
299 #ifndef ARM_MATH_BIG_ENDIAN
300 write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
301 #else
302 write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
303 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
304
305 in1 = *pScr2++;
306 in2 = *pScr2++;
307
308 #ifndef ARM_MATH_BIG_ENDIAN
309 write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
310 #else
311 write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
312 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
313
314 /* Decrement loop counter */
315 blkCnt--;
316 }
317
318 /* Loop unrolling: Compute remaining outputs */
319 blkCnt = blockSize % 0x4U;
320
321 #else
322
323 /* Initialize blkCnt with number of samples */
324 blkCnt = blockSize;
325
326 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
327
328 while (blkCnt > 0U)
329 {
330 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
331
332 /* Decrement loop counter */
333 blkCnt--;
334 }
335
336 }
337
338 /**
339 @} end of FIR_Sparse group
340 */
341