1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_sparse_q7.c
4 * Description: Q7 sparse FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR_Sparse
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q7 sparse FIR filter.
42 @param[in] S points to an instance of the Q7 sparse FIR structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] pScratchIn points to a temporary buffer of size blockSize
46 @param[in] pScratchOut points to a temporary buffer of size blockSize
47 @param[in] blockSize number of input samples to process
48
49 @par Scaling and Overflow Behavior
50 The function is implemented using a 32-bit internal accumulator.
51 Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
52 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
53 There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
54 The accumulator is then converted to 18.7 format by discarding the low 7 bits.
55 Finally, the result is truncated to 1.7 format.
56 */
57
arm_fir_sparse_q7(arm_fir_sparse_instance_q7 * S,const q7_t * pSrc,q7_t * pDst,q7_t * pScratchIn,q31_t * pScratchOut,uint32_t blockSize)58 void arm_fir_sparse_q7(
59 arm_fir_sparse_instance_q7 * S,
60 const q7_t * pSrc,
61 q7_t * pDst,
62 q7_t * pScratchIn,
63 q31_t * pScratchOut,
64 uint32_t blockSize)
65 {
66 q7_t *pState = S->pState; /* State pointer */
67 const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
68 q7_t *px; /* Scratch buffer pointer */
69 q7_t *py = pState; /* Temporary pointers for state buffer */
70 q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
71 q7_t *pOut = pDst; /* Destination pointer */
72 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
73 uint32_t delaySize = S->maxDelay + blockSize; /* state length */
74 uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
75 int32_t readIndex; /* Read index of the state buffer */
76 uint32_t tapCnt, blkCnt; /* loop counters */
77 q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
78 q31_t in;
79 q7_t coeff = *pCoeffs++; /* Read the coefficient value */
80
81 #if defined (ARM_MATH_LOOPUNROLL)
82 q7_t in1, in2, in3, in4;
83 #endif
84
85 /* BlockSize of Input samples are copied into the state buffer */
86 /* StateIndex points to the starting position to write in the state buffer */
87 arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, blockSize);
88
89 /* Loop over the number of taps. */
90 tapCnt = numTaps;
91
92 /* Read Index, from where the state buffer should be read, is calculated. */
93 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
94
95 /* Wraparound of readIndex */
96 if (readIndex < 0)
97 {
98 readIndex += (int32_t) delaySize;
99 }
100
101 /* Working pointer for state buffer is updated */
102 py = pState;
103
104 /* blockSize samples are read from the state buffer */
105 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
106 pb, pb, (int32_t) blockSize, 1, blockSize);
107
108 /* Working pointer for the scratch buffer of state values */
109 px = pb;
110
111 /* Working pointer for scratch buffer of output values */
112 pScratchOut = pScr2;
113
114
115 #if defined (ARM_MATH_LOOPUNROLL)
116
117 /* Loop unrolling: Compute 4 outputs at a time. */
118 blkCnt = blockSize >> 2U;
119
120 while (blkCnt > 0U)
121 {
122 /* Perform multiplication and store in the scratch buffer */
123 *pScratchOut++ = ((q31_t) *px++ * coeff);
124 *pScratchOut++ = ((q31_t) *px++ * coeff);
125 *pScratchOut++ = ((q31_t) *px++ * coeff);
126 *pScratchOut++ = ((q31_t) *px++ * coeff);
127
128 /* Decrement loop counter */
129 blkCnt--;
130 }
131
132 /* Loop unrolling: Compute remaining outputs */
133 blkCnt = blockSize % 0x4U;
134
135 #else
136
137 /* Initialize blkCnt with number of samples */
138 blkCnt = blockSize;
139
140 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
141
142 while (blkCnt > 0U)
143 {
144 /* Perform Multiplication and store in the scratch buffer */
145 *pScratchOut++ = ((q31_t) *px++ * coeff);
146
147 /* Decrement loop counter */
148 blkCnt--;
149 }
150
151 /* Load the coefficient value and
152 * increment the coefficient buffer for the next set of state values */
153 coeff = *pCoeffs++;
154
155 /* Read Index, from where the state buffer should be read, is calculated. */
156 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
157
158 /* Wraparound of readIndex */
159 if (readIndex < 0)
160 {
161 readIndex += (int32_t) delaySize;
162 }
163
164 /* Loop over the number of taps. */
165 tapCnt = (uint32_t) numTaps - 2U;
166
167 while (tapCnt > 0U)
168 {
169 /* Working pointer for state buffer is updated */
170 py = pState;
171
172 /* blockSize samples are read from the state buffer */
173 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
174 pb, pb, (int32_t) blockSize, 1, blockSize);
175
176 /* Working pointer for the scratch buffer of state values */
177 px = pb;
178
179 /* Working pointer for scratch buffer of output values */
180 pScratchOut = pScr2;
181
182
183 #if defined (ARM_MATH_LOOPUNROLL)
184
185 /* Loop unrolling: Compute 4 outputs at a time. */
186 blkCnt = blockSize >> 2U;
187
188 while (blkCnt > 0U)
189 {
190 /* Perform Multiply-Accumulate */
191 in = *pScratchOut + ((q31_t) * px++ * coeff);
192 *pScratchOut++ = in;
193 in = *pScratchOut + ((q31_t) * px++ * coeff);
194 *pScratchOut++ = in;
195 in = *pScratchOut + ((q31_t) * px++ * coeff);
196 *pScratchOut++ = in;
197 in = *pScratchOut + ((q31_t) * px++ * coeff);
198 *pScratchOut++ = in;
199
200 /* Decrement loop counter */
201 blkCnt--;
202 }
203
204 /* Loop unrolling: Compute remaining outputs */
205 blkCnt = blockSize % 0x4U;
206
207 #else
208
209 /* Initialize blkCnt with number of samples */
210 blkCnt = blockSize;
211
212 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
213
214 while (blkCnt > 0U)
215 {
216 /* Perform Multiply-Accumulate */
217 in = *pScratchOut + ((q31_t) *px++ * coeff);
218 *pScratchOut++ = in;
219
220 /* Decrement loop counter */
221 blkCnt--;
222 }
223
224 /* Load the coefficient value and
225 * increment the coefficient buffer for the next set of state values */
226 coeff = *pCoeffs++;
227
228 /* Read Index, from where the state buffer should be read, is calculated. */
229 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
230
231 /* Wraparound of readIndex */
232 if (readIndex < 0)
233 {
234 readIndex += (int32_t) delaySize;
235 }
236
237 /* Decrement loop counter */
238 tapCnt--;
239 }
240
241 /* Compute last tap without the final read of pTapDelay */
242
243 /* Working pointer for state buffer is updated */
244 py = pState;
245
246 /* blockSize samples are read from the state buffer */
247 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
248 pb, pb, (int32_t) blockSize, 1, blockSize);
249
250 /* Working pointer for the scratch buffer of state values */
251 px = pb;
252
253 /* Working pointer for scratch buffer of output values */
254 pScratchOut = pScr2;
255
256
257 #if defined (ARM_MATH_LOOPUNROLL)
258
259 /* Loop unrolling: Compute 4 outputs at a time. */
260 blkCnt = blockSize >> 2U;
261
262 while (blkCnt > 0U)
263 {
264 /* Perform Multiply-Accumulate */
265 in = *pScratchOut + ((q31_t) *px++ * coeff);
266 *pScratchOut++ = in;
267 in = *pScratchOut + ((q31_t) *px++ * coeff);
268 *pScratchOut++ = in;
269 in = *pScratchOut + ((q31_t) *px++ * coeff);
270 *pScratchOut++ = in;
271 in = *pScratchOut + ((q31_t) *px++ * coeff);
272 *pScratchOut++ = in;
273
274 /* Decrement loop counter */
275 blkCnt--;
276 }
277
278 /* Loop unrolling: Compute remaining outputs */
279 blkCnt = blockSize % 0x4U;
280
281 #else
282
283 /* Initialize blkCnt with number of samples */
284 blkCnt = blockSize;
285
286 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
287
288 while (blkCnt > 0U)
289 {
290 /* Perform Multiply-Accumulate */
291 in = *pScratchOut + ((q31_t) *px++ * coeff);
292 *pScratchOut++ = in;
293
294 /* Decrement loop counter */
295 blkCnt--;
296 }
297
298 /* All the output values are in pScratchOut buffer.
299 Convert them into 1.15 format, saturate and store in the destination buffer. */
300 #if defined (ARM_MATH_LOOPUNROLL)
301
302 /* Loop unrolling: Compute 4 outputs at a time. */
303 blkCnt = blockSize >> 2U;
304
305 while (blkCnt > 0U)
306 {
307 in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
308 in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
309 in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
310 in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
311
312 write_q7x4_ia (&pOut, __PACKq7(in1, in2, in3, in4));
313
314 /* Decrement loop counter */
315 blkCnt--;
316 }
317
318 /* Loop unrolling: Compute remaining outputs */
319 blkCnt = blockSize % 0x4U;
320
321 #else
322
323 /* Initialize blkCnt with number of samples */
324 blkCnt = blockSize;
325
326 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
327
328 while (blkCnt > 0U)
329 {
330 *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
331
332 /* Decrement loop counter */
333 blkCnt--;
334 }
335
336 }
337
338 /**
339 @} end of FIR_Sparse group
340 */
341