1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_f32.c
4 * Description: Floating-point FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup FIR Finite Impulse Response (FIR) Filters
37
38 This set of functions implements Finite Impulse Response (FIR) filters
39 for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided.
40 The functions operate on blocks of input and output data and each call to the function processes
41 <code>blockSize</code> samples through the filter. <code>pSrc</code> and
42 <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
43
44 @par Algorithm
45 The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
46 Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
47 <pre>
48 y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
49 </pre>
50 @par
51 \image html FIR.GIF "Finite Impulse Response filter"
52 @par
53 <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
54 Coefficients are stored in time reversed order.
55 @par
56 <pre>
57 {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
58 </pre>
59 @par
60 <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
61 Samples in the state buffer are stored in the following order.
62 @par
63 <pre>
64 {x[n-numTaps+1], x[n-numTaps+2], x[n-numTaps+3], ..., x[n-1], x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])}
65 </pre>
66 @par
67 Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
68 The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
69 to be avoided and yields a significant speed improvement.
70 The state variables are updated after each block of data is processed; the coefficients are untouched.
71
72 @par Instance Structure
73 The coefficients and state variables for a filter are stored together in an instance data structure.
74 A separate instance structure must be defined for each filter.
75 Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
76 There are separate instance structure declarations for each of the 4 supported data types.
77
78 @par Initialization Functions
79 There is also an associated initialization function for each data type.
80 The initialization function performs the following operations:
81 - Sets the values of the internal structure fields.
82 - Zeros out the values in the state buffer.
83 To do this manually without calling the init function, assign the following subfields of the instance structure:
84 numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
85 @par
86 Use of the initialization function is optional.
87 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
88 To place an instance structure into a const data section, the instance structure must be manually initialized.
89 Set the values in the state buffer to zeros before static initialization.
90 The code below statically initializes each of the 4 different data type filter instance structures
91 <pre>
92 arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
93 arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
94 arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
95 arm_fir_instance_q7 S = {numTaps, pState, pCoeffs};
96 </pre>
97 where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
98 <code>pCoeffs</code> is the address of the coefficient buffer.
99 @par Initialization of Helium version
100 For Helium version the array of coefficients must be padded with zero to contain
101 a full number of lanes.
102
103 The array length L must be a multiple of x. L = x * a :
104 - x is 4 for f32
105 - x is 4 for q31
106 - x is 4 for f16 (so managed like the f32 version and not like the q15 one)
107 - x is 8 for q15
108 - x is 16 for q7
109
110 The additional coefficients
111 (x * a - numTaps) must be set to 0.
112 numTaps is still set to its right value in the init function. It means that
113 the implementation may require to read more coefficients due to the vectorization and
114 to avoid having to manage too many different cases in the code.
115
116
117 @par Helium state buffer
118 The state buffer must contain some additional temporary data
119 used during the computation but which is not the state of the FIR.
120 The first A samples are temporary data.
121 The remaining samples are the state of the FIR filter.
122 @par
123 So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
124 - A is blockSize for f32
125 - A is 8*ceil(blockSize/8) for f16
126 - A is 8*ceil(blockSize/4) for q31
127 - A is 0 for other datatypes (q15 and q7)
128
129
130 @par Fixed-Point Behavior
131 Care must be taken when using the fixed-point versions of the FIR filter functions.
132 In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
133 Refer to the function specific documentation below for usage guidelines.
134
135 */
136
137 /**
138 @addtogroup FIR
139 @{
140 */
141
142 /**
143 @brief Processing function for floating-point FIR filter.
144 @param[in] S points to an instance of the floating-point FIR filter structure
145 @param[in] pSrc points to the block of input data
146 @param[out] pDst points to the block of output data
147 @param[in] blockSize number of samples to process
148 @return none
149 */
150
151 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
152
/* Number of coefficients consumed per pass by the unrolled Helium kernels. */
#define FIR_F32_MAX_COEF_BLK        8

/*
 * Computes four adjacent FIR outputs over nbTaps taps:
 *   vecAcc0[j] = sum over k in [0, nbTaps) of c[k] * pSamples[j + k]
 *
 * The macro writes to two float32x4_t variables named vecAcc0 and vecIn0
 * that must already be declared in the invoking scope; the result is left
 * in vecAcc0. It reads pSamples[0 .. nbTaps + 2].
 *
 * Wrapped in do { } while (0) so that an invocation followed by ';' is a
 * single statement, and the arguments are parenthesized so that arbitrary
 * expressions can be passed safely.
 */
#define FIR_F32_CORE(pSamples, c, nbTaps)                           \
    do {                                                            \
        vecAcc0 = vdupq_n_f32(0.0f);                                \
        for (int tapIdx = 0; tapIdx < (nbTaps); tapIdx++) {         \
            vecIn0 = vld1q(&(pSamples)[tapIdx]);                    \
            vecAcc0 = vfmaq(vecAcc0, vecIn0, (c)[tapIdx]);          \
        }                                                           \
    } while (0)
161
162
163 #define NB_TAPS 4
arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)164 __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
165 const float32_t * __restrict pSrc,
166 float32_t * __restrict pDst, uint32_t blockSize)
167 {
168 float32_t *pRefStatePtr = S->pState + blockSize;
169 float32_t *pState = pRefStatePtr; /* State pointer */
170 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
171 float32_t *pStateCur; /* Points to the current sample of the state */
172 const float32_t *pSamples; /* Temporary pointer to the sample buffer */
173 float32_t *pOutput; /* Temporary pointer to the output buffer */
174 const float32_t *pTempSrc; /* Temporary pointer to the source data */
175 float32_t *pTempDest; /* Temporary pointer to the destination buffer */
176 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
177 int32_t blkCnt;
178 float32x4_t vecIn0;
179 float32x4_t vecAcc0;
180 float32_t c[NB_TAPS];
181 const float32_t *pCoeffsCur = pCoeffs;
182
183 /*
184 * pState points to state array which contains previous frame (numTaps - 1) samples
185 * pStateCur points to the location where the new input data should be written
186 */
187 pStateCur = &(pState[(numTaps - 1u)]);
188 pTempSrc = pSrc;
189
190 pSamples = pState;
191 pOutput = pDst;
192
193 for (int i = 0; i < NB_TAPS; i++)
194 c[i] = *pCoeffsCur++;
195
196 blkCnt = blockSize >> 2;
197 while (blkCnt > 0) {
198 /*
199 * Save 4 input samples in the history buffer
200 */
201 vst1q(pStateCur, vld1q(pTempSrc));
202 pStateCur += 4;
203 pTempSrc += 4;
204
205 FIR_F32_CORE(pSamples, c, NB_TAPS);
206
207 vst1q(pOutput, vecAcc0);
208
209 pOutput += 4;
210 pSamples += 4;
211
212 blkCnt--;
213 }
214
215 blkCnt = blockSize & 3;
216 if (blkCnt)
217 {
218 mve_pred16_t p0 = vctp32q(blkCnt);
219
220 vst1q(pStateCur, vld1q(pTempSrc));
221 pStateCur += 4;
222 pTempSrc += 4;
223
224 FIR_F32_CORE(pSamples, c, NB_TAPS);
225
226 vstrwq_p_f32(pOutput, vecAcc0, p0);
227 }
228
229 /*
230 * Copy the samples back into the history buffer start
231 */
232 pTempSrc = &pState[blockSize];
233 pTempDest = pState;
234
235 blkCnt = numTaps - 1;
236 do {
237 mve_pred16_t p = vctp32q(blkCnt);
238
239 vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
240 pTempSrc += 4;
241 pTempDest += 4;
242 blkCnt -= 4;
243 }
244 while (blkCnt > 0);
245 }
246 #undef NB_TAPS
247
arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)248 __STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
249 const float32_t * __restrict pSrc,
250 float32_t * __restrict pDst, uint32_t blockSize)
251 {
252 float32_t *pRefStatePtr = S->pState + blockSize;
253 float32_t *pState = pRefStatePtr; /* State pointer */
254 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
255 const float32_t *pSamples; /* Temporary pointer to the sample buffer */
256 const float32_t *pTempSrc; /* Temporary pointer to the source data */
257 float32_t *pTempDest; /* Temporary pointer to the destination buffer */
258 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
259 int32_t blkCnt;
260 float32_t c0, c1, c2, c3;
261 float32_t c4, c5, c6, c7;
262
263
264 pTempSrc = pSrc;
265 pTempDest = &(pState[(numTaps - 1u)]);
266 int cnt = blockSize;
267 do {
268 mve_pred16_t p0 = vctp32q(cnt);
269 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
270 pTempDest += 4;
271 pTempSrc += 4;
272 cnt -= 4;
273 } while(cnt > 0);
274
275
276
277 pSamples = pState;
278 c0 = *pCoeffs++;
279 c1 = *pCoeffs++;
280 c2 = *pCoeffs++;
281 c3 = *pCoeffs++;
282 c4 = *pCoeffs++;
283 c5 = *pCoeffs++;
284 c6 = *pCoeffs++;
285 c7 = *pCoeffs++;
286
287 cnt = blockSize >> 2;
288 while(cnt > 0)
289 {
290 float32x4_t vecAcc0;
291 float32x4_t vecIn0;
292
293 vecIn0 = vld1q(pSamples);
294 vecAcc0 = vmulq(vecIn0, c0);
295 vecIn0 = vld1q(&pSamples[1]);
296 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
297 vecIn0 = vld1q(&pSamples[2]);
298 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
299 vecIn0 = vld1q(&pSamples[3]);
300 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
301 vecIn0 = vld1q(&pSamples[4]);
302 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
303 vecIn0 = vld1q(&pSamples[5]);
304 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
305 vecIn0 = vld1q(&pSamples[6]);
306 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
307 vecIn0 = vld1q(&pSamples[7]);
308 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
309 pSamples += 4;
310 vst1q(pDst, vecAcc0);
311 cnt--;
312 pDst += 4;
313 }
314
315 cnt = blockSize & 3;
316 if (cnt > 0)
317 {
318 float32x4_t vecAcc0;
319 float32x4_t vecIn0;
320
321 mve_pred16_t p0 = vctp32q(cnt);
322
323 vecIn0 = vld1q(pSamples);
324 vecAcc0 = vmulq(vecIn0, c0);
325 vecIn0 = vld1q(&pSamples[1]);
326 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
327 vecIn0 = vld1q(&pSamples[2]);
328 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
329 vecIn0 = vld1q(&pSamples[3]);
330 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
331 vecIn0 = vld1q(&pSamples[4]);
332 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
333 vecIn0 = vld1q(&pSamples[5]);
334 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
335 vecIn0 = vld1q(&pSamples[6]);
336 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
337 vecIn0 = vld1q(&pSamples[7]);
338 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
339 vstrwq_p_f32(pDst, vecAcc0,p0);
340 }
341
342
343 /*
344 * Copy the samples back into the history buffer start
345 */
346 pTempSrc = &pState[blockSize];
347 pTempDest = pState;
348 blkCnt = numTaps;
349 while (blkCnt > 0)
350 {
351 *pTempDest++ = *pTempSrc++;
352 blkCnt--;
353 }
354 }
355
356
357
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)358 void arm_fir_f32(
359 const arm_fir_instance_f32 * S,
360 const float32_t * pSrc,
361 float32_t * pDst,
362 uint32_t blockSize)
363 {
364 /*
365 S->pState is the arm_fir_partial_accu
366 S->pState + blockSize is the FIR state
367 */
368 float32_t *pRefStatePtr = S->pState + blockSize;
369 float32_t *pState = pRefStatePtr ; /* State pointer */
370 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
371 const float32_t *pSamples; /* Temporary pointer to the sample buffer */
372 float32_t *pOutput; /* Temporary pointer to the output buffer */
373 const float32_t *pTempSrc; /* Temporary pointer to the source data */
374 float32_t *pTempDest; /* Temporary pointer to the destination buffer */
375 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
376 uint32_t blkCnt;
377 float32_t c0, c1, c2, c3;
378 float32_t c4, c5, c6, c7;
379
380 /*
381 * [1 to 8 taps] specialized routines
382 */
383 if (numTaps <= 4)
384 {
385 arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
386 return;
387 }
388 else if (numTaps <= 8)
389 {
390 arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
391 return;
392 }
393
394 pTempSrc = pSrc;
395 pTempDest = &(pState[(numTaps - 1u)]);
396 int cnt = blockSize;
397 do {
398 mve_pred16_t p0 = vctp32q(cnt);
399 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
400 pTempDest += 4;
401 pTempSrc += 4;
402 cnt -= 4;
403 } while(cnt > 0);
404
405 float32_t *partial_accu_ptr = S->pState;
406
407 pSamples = pState;
408 c0 = *pCoeffs++;
409 c1 = *pCoeffs++;
410 c2 = *pCoeffs++;
411 c3 = *pCoeffs++;
412 c4 = *pCoeffs++;
413 c5 = *pCoeffs++;
414 c6 = *pCoeffs++;
415 c7 = *pCoeffs++;
416
417 cnt = blockSize >> 2;
418 while(cnt > 0) {
419 float32x4_t vecAcc0;
420 float32x4_t vecIn0;
421
422 vecIn0 = vld1q(pSamples);
423 vecAcc0 = vmulq(vecIn0, c0);
424 vecIn0 = vld1q(&pSamples[1]);
425 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
426 vecIn0 = vld1q(&pSamples[2]);
427 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
428 vecIn0 = vld1q(&pSamples[3]);
429 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
430 vecIn0 = vld1q(&pSamples[4]);
431 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
432 vecIn0 = vld1q(&pSamples[5]);
433 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
434 vecIn0 = vld1q(&pSamples[6]);
435 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
436 vecIn0 = vld1q(&pSamples[7]);
437 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
438 pSamples += 4;
439 vst1q(partial_accu_ptr, vecAcc0);
440 cnt--;
441 partial_accu_ptr += 4;
442 }
443
444 cnt = blockSize & 3;
445 if (cnt > 0)
446 {
447 float32x4_t vecAcc0;
448 float32x4_t vecIn0;
449
450 mve_pred16_t p0 = vctp32q(cnt);
451
452 vecIn0 = vld1q(pSamples);
453 vecAcc0 = vmulq(vecIn0, c0);
454 vecIn0 = vld1q(&pSamples[1]);
455 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
456 vecIn0 = vld1q(&pSamples[2]);
457 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
458 vecIn0 = vld1q(&pSamples[3]);
459 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
460 vecIn0 = vld1q(&pSamples[4]);
461 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
462 vecIn0 = vld1q(&pSamples[5]);
463 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
464 vecIn0 = vld1q(&pSamples[6]);
465 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
466 vecIn0 = vld1q(&pSamples[7]);
467 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
468 vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
469 }
470
471 int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
472 int sample_offset = FIR_F32_MAX_COEF_BLK;
473 while (localTaps > FIR_F32_MAX_COEF_BLK) {
474 c0 = *pCoeffs++;
475 c1 = *pCoeffs++;
476 c2 = *pCoeffs++;
477 c3 = *pCoeffs++;
478 c4 = *pCoeffs++;
479 c5 = *pCoeffs++;
480 c6 = *pCoeffs++;
481 c7 = *pCoeffs++;
482
483 partial_accu_ptr = S->pState;
484 pSamples = pState + sample_offset;
485 int cnt = blockSize >> 2;
486 while(cnt > 0) {
487 float32x4_t vecAcc0;
488 float32x4_t vecIn0;
489
490 vecIn0 = vld1q(pSamples);
491 vecAcc0 = vmulq(vecIn0, c0);
492 vecIn0 = vld1q(&pSamples[1]);
493 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
494 vecIn0 = vld1q(&pSamples[2]);
495 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
496 vecIn0 = vld1q(&pSamples[3]);
497 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
498 vecIn0 = vld1q(&pSamples[4]);
499 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
500 vecIn0 = vld1q(&pSamples[5]);
501 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
502 vecIn0 = vld1q(&pSamples[6]);
503 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
504 vecIn0 = vld1q(&pSamples[7]);
505 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
506 pSamples += 4;
507 vecAcc0 += vld1q_f32(partial_accu_ptr);
508 vst1q(partial_accu_ptr, vecAcc0);
509 cnt--;
510 partial_accu_ptr += 4;
511 }
512
513 cnt = blockSize & 3;
514 if (cnt > 0) {
515 float32x4_t vecAcc0;
516 float32x4_t vecIn0;
517
518 mve_pred16_t p0 = vctp32q(cnt);
519
520 vecIn0 = vld1q(pSamples);
521 vecAcc0 = vmulq(vecIn0, c0);
522 vecIn0 = vld1q(&pSamples[1]);
523 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
524 vecIn0 = vld1q(&pSamples[2]);
525 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
526 vecIn0 = vld1q(&pSamples[3]);
527 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
528 vecIn0 = vld1q(&pSamples[4]);
529 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
530 vecIn0 = vld1q(&pSamples[5]);
531 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
532 vecIn0 = vld1q(&pSamples[6]);
533 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
534 vecIn0 = vld1q(&pSamples[7]);
535 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
536 vecAcc0 += vld1q_f32(partial_accu_ptr);
537 vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
538 }
539
540 localTaps -= FIR_F32_MAX_COEF_BLK;
541 sample_offset += FIR_F32_MAX_COEF_BLK;
542 }
543
544 pSamples = pState + sample_offset;
545
546 if (localTaps > 4) {
547 c0 = *pCoeffs++;
548 c1 = *pCoeffs++;
549 c2 = *pCoeffs++;
550 c3 = *pCoeffs++;
551 c4 = *pCoeffs++;
552 c5 = *pCoeffs++;
553 c6 = *pCoeffs++;
554 c7 = *pCoeffs++;
555 pOutput = pDst;
556
557 partial_accu_ptr = S->pState;
558 cnt = blockSize >> 2;
559 while(cnt > 0) {
560 float32x4_t vecAcc0;
561 float32x4_t vecIn0;
562
563 vecIn0 = vld1q(pSamples);
564 vecAcc0 = vmulq(vecIn0, c0);
565 vecIn0 = vld1q(&pSamples[1]);
566 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
567 vecIn0 = vld1q(&pSamples[2]);
568 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
569 vecIn0 = vld1q(&pSamples[3]);
570 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
571 vecIn0 = vld1q(&pSamples[4]);
572 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
573 vecIn0 = vld1q(&pSamples[5]);
574 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
575 vecIn0 = vld1q(&pSamples[6]);
576 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
577 vecIn0 = vld1q(&pSamples[7]);
578 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
579 pSamples += 4;
580 float32x4_t pap = vld1q_f32(partial_accu_ptr);
581 vst1q(pOutput, vecAcc0+pap);
582 cnt--;
583 partial_accu_ptr += 4;
584 pOutput += 4;
585 }
586
587 cnt = blockSize & 3;
588 if (cnt > 0) {
589 float32x4_t vecAcc0;
590 float32x4_t vecIn0;
591
592 mve_pred16_t p0 = vctp32q(cnt);
593
594 vecIn0 = vld1q(pSamples);
595 vecAcc0 = vmulq(vecIn0, c0);
596 vecIn0 = vld1q(&pSamples[1]);
597 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
598 vecIn0 = vld1q(&pSamples[2]);
599 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
600 vecIn0 = vld1q(&pSamples[3]);
601 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
602 vecIn0 = vld1q(&pSamples[4]);
603 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
604 vecIn0 = vld1q(&pSamples[5]);
605 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
606 vecIn0 = vld1q(&pSamples[6]);
607 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
608 vecIn0 = vld1q(&pSamples[7]);
609 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
610 float32x4_t pap = vld1q_f32(partial_accu_ptr);
611 vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
612 pOutput += cnt;
613 }
614 }
615 else {
616 c0 = *pCoeffs++;
617 c1 = *pCoeffs++;
618 c2 = *pCoeffs++;
619 c3 = *pCoeffs++;
620 pOutput = pDst;
621
622 partial_accu_ptr = S->pState;
623 cnt = blockSize >> 2;
624 while(cnt > 0) {
625 float32x4_t vecAcc0;
626 float32x4_t vecIn0;
627
628 vecIn0 = vld1q(pSamples);
629 vecAcc0 = vmulq(vecIn0, c0);
630 vecIn0 = vld1q(&pSamples[1]);
631 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
632 vecIn0 = vld1q(&pSamples[2]);
633 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
634 vecIn0 = vld1q(&pSamples[3]);
635 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
636 pSamples += 4;
637 float32x4_t pap = vld1q_f32(partial_accu_ptr);
638 vst1q(pOutput, vecAcc0+pap);
639 cnt--;
640 partial_accu_ptr += 4;
641 pOutput += 4;
642 }
643
644 cnt = blockSize & 3;
645 if (cnt > 0) {
646 float32x4_t vecAcc0;
647 float32x4_t vecIn0;
648
649 mve_pred16_t p0 = vctp32q(cnt);
650
651 vecIn0 = vld1q(pSamples);
652 vecAcc0 = vmulq(vecIn0, c0);
653 vecIn0 = vld1q(&pSamples[1]);
654 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
655 vecIn0 = vld1q(&pSamples[2]);
656 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
657 vecIn0 = vld1q(&pSamples[3]);
658 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
659 float32x4_t pap = vld1q_f32(partial_accu_ptr);
660 vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
661 pOutput += cnt;
662 }
663 }
664
665 /*
666 * Copy the samples back into the history buffer start
667 */
668 pTempSrc = &pRefStatePtr[blockSize];
669 pTempDest = pRefStatePtr;
670
671 blkCnt = numTaps >> 2;
672 while (blkCnt > 0)
673 {
674 vst1q(pTempDest, vld1q(pTempSrc));
675 pTempSrc += 4;
676 pTempDest += 4;
677 blkCnt--;
678 }
679 blkCnt = numTaps & 3;
680 if (blkCnt > 0)
681 {
682 mve_pred16_t p0 = vctp32q(blkCnt);
683 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
684 }
685 }
686
687 #else
688 #if defined(ARM_MATH_NEON)
689
/*
 * NEON implementation of the floating-point FIR processing function.
 *
 * S         filter instance
 * pSrc      blockSize input samples
 * pDst      blockSize output samples
 * blockSize number of samples to process
 *
 * S->pState starts with the (numTaps - 1) samples kept from the previous
 * call; the new input is appended after them. Outputs are produced eight at
 * a time with two vector accumulators, then one at a time for the remaining
 * (blockSize % 8) samples. Each output is the dot product of the
 * coefficient array with numTaps consecutive state samples.
 */
void arm_fir_f32(
    const arm_fir_instance_f32 * S,
    const float32_t * pSrc,
    float32_t * pDst,
    uint32_t blockSize)
{
    float32_t       *pState = S->pState;            /* Oldest sample of the current output */
    const float32_t *pCoeffs = S->pCoeffs;          /* Coefficient pointer */
    float32_t       *pStateCurnt;                   /* Where new input samples are written */
    float32_t       *px;                            /* State read pointer */
    const float32_t *pb;                            /* Coefficient read pointer */
    uint32_t         numTaps = S->numTaps;          /* Number of filter coefficients */
    uint32_t         i, tapCnt, blkCnt;             /* Loop counters */

    float32x4_t accv0, accv1, samples0, samples1, x0, x1, x2, xa, xb, b;
    float32_t acc;

    /* New input goes right after the (numTaps - 1) retained samples */
    pStateCurnt = &(S->pState[(numTaps - 1U)]);

    /* Eight outputs per iteration */
    blkCnt = blockSize >> 3;

    while (blkCnt > 0U)
    {
        /* Copy 8 input samples into the state buffer */
        samples0 = vld1q_f32(pSrc);
        vst1q_f32(pStateCurnt, samples0);
        pStateCurnt += 4;
        pSrc += 4;

        samples1 = vld1q_f32(pSrc);
        vst1q_f32(pStateCurnt, samples1);
        pStateCurnt += 4;
        pSrc += 4;

        accv0 = vdupq_n_f32(0);
        accv1 = vdupq_n_f32(0);

        px = pState;
        pb = pCoeffs;

        /* x0 feeds outputs 0..3 and x1 outputs 4..7 for the current tap */
        x0 = vld1q_f32(px);
        x1 = vld1q_f32(px + 4);

        /* Four taps per iteration */
        i = numTaps >> 2;

        while (i > 0U)
        {
            x2 = vld1q_f32(px + 8);
            b = vld1q_f32(pb);

            accv0 = vmlaq_n_f32(accv0, x0, vgetq_lane_f32(b, 0));
            accv1 = vmlaq_n_f32(accv1, x1, vgetq_lane_f32(b, 0));

            /* vextq shifts the sample window by one element per tap */
            xa = vextq_f32(x0, x1, 1);
            xb = vextq_f32(x1, x2, 1);
            accv0 = vmlaq_n_f32(accv0, xa, vgetq_lane_f32(b, 1));
            accv1 = vmlaq_n_f32(accv1, xb, vgetq_lane_f32(b, 1));

            xa = vextq_f32(x0, x1, 2);
            xb = vextq_f32(x1, x2, 2);
            accv0 = vmlaq_n_f32(accv0, xa, vgetq_lane_f32(b, 2));
            accv1 = vmlaq_n_f32(accv1, xb, vgetq_lane_f32(b, 2));

            xa = vextq_f32(x0, x1, 3);
            xb = vextq_f32(x1, x2, 3);
            accv0 = vmlaq_n_f32(accv0, xa, vgetq_lane_f32(b, 3));
            accv1 = vmlaq_n_f32(accv1, xb, vgetq_lane_f32(b, 3));

            pb += 4;
            x0 = x1;
            x1 = x2;
            px += 4;
            i--;
        }

        /* Remaining 0 to 3 taps, accumulated in tap order */
        i = numTaps & 3;

        if (i >= 1U)
        {
            accv0 = vmlaq_n_f32(accv0, x0, pb[0]);
            accv1 = vmlaq_n_f32(accv1, x1, pb[0]);
        }

        if (i >= 2U)
        {
            /*
             * The third sample vector is only needed from the second
             * remaining tap onwards; it is not loaded otherwise, which
             * avoids reading state memory that is never used.
             */
            x2 = vld1q_f32(px + 8);

            xa = vextq_f32(x0, x1, 1);
            xb = vextq_f32(x1, x2, 1);
            accv0 = vmlaq_n_f32(accv0, xa, pb[1]);
            accv1 = vmlaq_n_f32(accv1, xb, pb[1]);
        }

        if (i == 3U)
        {
            xa = vextq_f32(x0, x1, 2);
            xb = vextq_f32(x1, x2, 2);
            accv0 = vmlaq_n_f32(accv0, xa, pb[2]);
            accv1 = vmlaq_n_f32(accv1, xb, pb[2]);
        }

        /* Store the eight results */
        vst1q_f32(pDst, accv0);
        pDst += 4;
        vst1q_f32(pDst, accv1);
        pDst += 4;

        /* The next eight outputs start eight samples further */
        pState = pState + 8;

        blkCnt--;
    }

    /* Remaining (blockSize % 8) outputs, one at a time */
    blkCnt = blockSize & 0x7;

    while (blkCnt > 0U)
    {
        /* Copy one input sample into the state buffer */
        *pStateCurnt++ = *pSrc++;

        acc = 0.0f;
        px = pState;
        pb = pCoeffs;

        i = numTaps;

        /* Dot product of the coefficients with numTaps state samples */
        do
        {
            acc += *px++ * *pb++;
            i--;
        } while (i > 0U);

        *pDst++ = acc;

        /* The next output starts one sample further */
        pState = pState + 1;

        blkCnt--;
    }

    /*
     * Processing is complete: copy the last (numTaps - 1) samples to the
     * start of the state buffer so that they are available to the next call.
     */
    pStateCurnt = S->pState;

    tapCnt = numTaps - 1U;

    while (tapCnt > 0U)
    {
        *pStateCurnt++ = *pState++;
        tapCnt--;
    }
}
908 #else
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)909 void arm_fir_f32(
910 const arm_fir_instance_f32 * S,
911 const float32_t * pSrc,
912 float32_t * pDst,
913 uint32_t blockSize)
914 {
915 float32_t *pState = S->pState; /* State pointer */
916 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
917 float32_t *pStateCurnt; /* Points to the current sample of the state */
918 float32_t *px; /* Temporary pointer for state buffer */
919 const float32_t *pb; /* Temporary pointer for coefficient buffer */
920 float32_t acc0; /* Accumulator */
921 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
922 uint32_t i, tapCnt, blkCnt; /* Loop counters */
923
924 #if defined (ARM_MATH_LOOPUNROLL)
925 float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
926 float32_t x0, x1, x2, x3, x4, x5, x6, x7; /* Temporary variables to hold state values */
927 float32_t c0; /* Temporary variable to hold coefficient value */
928 #endif
929
930 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
931 /* pStateCurnt points to the location where the new input data should be written */
932 pStateCurnt = &(S->pState[(numTaps - 1U)]);
933
934 #if defined (ARM_MATH_LOOPUNROLL)
935
936 /* Loop unrolling: Compute 8 output values simultaneously.
937 * The variables acc0 ... acc7 hold output values that are being computed:
938 *
939 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
940 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
941 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
942 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
943 */
944
945 blkCnt = blockSize >> 3U;
946
947 while (blkCnt > 0U)
948 {
949 /* Copy 4 new input samples into the state buffer. */
950 *pStateCurnt++ = *pSrc++;
951 *pStateCurnt++ = *pSrc++;
952 *pStateCurnt++ = *pSrc++;
953 *pStateCurnt++ = *pSrc++;
954
955 /* Set all accumulators to zero */
956 acc0 = 0.0f;
957 acc1 = 0.0f;
958 acc2 = 0.0f;
959 acc3 = 0.0f;
960 acc4 = 0.0f;
961 acc5 = 0.0f;
962 acc6 = 0.0f;
963 acc7 = 0.0f;
964
965 /* Initialize state pointer */
966 px = pState;
967
968 /* Initialize coefficient pointer */
969 pb = pCoeffs;
970
971 /* This is separated from the others to avoid
972 * a call to __aeabi_memmove which would be slower
973 */
974 *pStateCurnt++ = *pSrc++;
975 *pStateCurnt++ = *pSrc++;
976 *pStateCurnt++ = *pSrc++;
977 *pStateCurnt++ = *pSrc++;
978
979 /* Read the first 7 samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
980 x0 = *px++;
981 x1 = *px++;
982 x2 = *px++;
983 x3 = *px++;
984 x4 = *px++;
985 x5 = *px++;
986 x6 = *px++;
987
988 /* Loop unrolling: process 8 taps at a time. */
989 tapCnt = numTaps >> 3U;
990
991 while (tapCnt > 0U)
992 {
993 /* Read the b[numTaps-1] coefficient */
994 c0 = *(pb++);
995
996 /* Read x[n-numTaps-3] sample */
997 x7 = *(px++);
998
999 /* acc0 += b[numTaps-1] * x[n-numTaps] */
1000 acc0 += x0 * c0;
1001
1002 /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
1003 acc1 += x1 * c0;
1004
1005 /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
1006 acc2 += x2 * c0;
1007
1008 /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
1009 acc3 += x3 * c0;
1010
1011 /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
1012 acc4 += x4 * c0;
1013
1014 /* acc1 += b[numTaps-1] * x[n-numTaps-5] */
1015 acc5 += x5 * c0;
1016
1017 /* acc2 += b[numTaps-1] * x[n-numTaps-6] */
1018 acc6 += x6 * c0;
1019
1020 /* acc3 += b[numTaps-1] * x[n-numTaps-7] */
1021 acc7 += x7 * c0;
1022
1023 /* Read the b[numTaps-2] coefficient */
1024 c0 = *(pb++);
1025
1026 /* Read x[n-numTaps-4] sample */
1027 x0 = *(px++);
1028
1029 /* Perform the multiply-accumulate */
1030 acc0 += x1 * c0;
1031 acc1 += x2 * c0;
1032 acc2 += x3 * c0;
1033 acc3 += x4 * c0;
1034 acc4 += x5 * c0;
1035 acc5 += x6 * c0;
1036 acc6 += x7 * c0;
1037 acc7 += x0 * c0;
1038
1039 /* Read the b[numTaps-3] coefficient */
1040 c0 = *(pb++);
1041
1042 /* Read x[n-numTaps-5] sample */
1043 x1 = *(px++);
1044
1045 /* Perform the multiply-accumulates */
1046 acc0 += x2 * c0;
1047 acc1 += x3 * c0;
1048 acc2 += x4 * c0;
1049 acc3 += x5 * c0;
1050 acc4 += x6 * c0;
1051 acc5 += x7 * c0;
1052 acc6 += x0 * c0;
1053 acc7 += x1 * c0;
1054
1055 /* Read the b[numTaps-4] coefficient */
1056 c0 = *(pb++);
1057
1058 /* Read x[n-numTaps-6] sample */
1059 x2 = *(px++);
1060
1061 /* Perform the multiply-accumulates */
1062 acc0 += x3 * c0;
1063 acc1 += x4 * c0;
1064 acc2 += x5 * c0;
1065 acc3 += x6 * c0;
1066 acc4 += x7 * c0;
1067 acc5 += x0 * c0;
1068 acc6 += x1 * c0;
1069 acc7 += x2 * c0;
1070
1071 /* Read the b[numTaps-5] coefficient */
1072 c0 = *(pb++);
1073
1074 /* Read x[n-numTaps-11] sample */
1075 x3 = *(px++);
1076 /* Perform the multiply-accumulates */
1077 acc0 += x4 * c0;
1078 acc1 += x5 * c0;
1079 acc2 += x6 * c0;
1080 acc3 += x7 * c0;
1081 acc4 += x0 * c0;
1082 acc5 += x1 * c0;
1083 acc6 += x2 * c0;
1084 acc7 += x3 * c0;
1085
1086 /* Read the b[numTaps-6] coefficient */
1087 c0 = *(pb++);
1088
1089 /* Read x[n-numTaps-12] sample */
1090 x4 = *(px++);
1091
1092 /* Perform the multiply-accumulates */
1093 acc0 += x5 * c0;
1094 acc1 += x6 * c0;
1095 acc2 += x7 * c0;
1096 acc3 += x0 * c0;
1097 acc4 += x1 * c0;
1098 acc5 += x2 * c0;
1099 acc6 += x3 * c0;
1100 acc7 += x4 * c0;
1101
1102 /* Read the b[numTaps-7] coefficient */
1103 c0 = *(pb++);
1104
1105 /* Read x[n-numTaps-13] sample */
1106 x5 = *(px++);
1107
1108 /* Perform the multiply-accumulates */
1109 acc0 += x6 * c0;
1110 acc1 += x7 * c0;
1111 acc2 += x0 * c0;
1112 acc3 += x1 * c0;
1113 acc4 += x2 * c0;
1114 acc5 += x3 * c0;
1115 acc6 += x4 * c0;
1116 acc7 += x5 * c0;
1117
1118 /* Read the b[numTaps-8] coefficient */
1119 c0 = *(pb++);
1120
1121 /* Read x[n-numTaps-14] sample */
1122 x6 = *(px++);
1123
1124 /* Perform the multiply-accumulates */
1125 acc0 += x7 * c0;
1126 acc1 += x0 * c0;
1127 acc2 += x1 * c0;
1128 acc3 += x2 * c0;
1129 acc4 += x3 * c0;
1130 acc5 += x4 * c0;
1131 acc6 += x5 * c0;
1132 acc7 += x6 * c0;
1133
1134 /* Decrement loop counter */
1135 tapCnt--;
1136 }
1137
1138 /* Loop unrolling: Compute remaining taps */
1139 tapCnt = numTaps % 0x8U;
1140
1141 while (tapCnt > 0U)
1142 {
1143 /* Read coefficients */
1144 c0 = *(pb++);
1145
1146 /* Fetch 1 state variable */
1147 x7 = *(px++);
1148
1149 /* Perform the multiply-accumulates */
1150 acc0 += x0 * c0;
1151 acc1 += x1 * c0;
1152 acc2 += x2 * c0;
1153 acc3 += x3 * c0;
1154 acc4 += x4 * c0;
1155 acc5 += x5 * c0;
1156 acc6 += x6 * c0;
1157 acc7 += x7 * c0;
1158
1159 /* Reuse the present sample states for next sample */
1160 x0 = x1;
1161 x1 = x2;
1162 x2 = x3;
1163 x3 = x4;
1164 x4 = x5;
1165 x5 = x6;
1166 x6 = x7;
1167
1168 /* Decrement loop counter */
1169 tapCnt--;
1170 }
1171
1172 /* Advance the state pointer by 8 to process the next group of 8 samples */
1173 pState = pState + 8;
1174
1175 /* The results in the 8 accumulators, store in the destination buffer. */
1176 *pDst++ = acc0;
1177 *pDst++ = acc1;
1178 *pDst++ = acc2;
1179 *pDst++ = acc3;
1180 *pDst++ = acc4;
1181 *pDst++ = acc5;
1182 *pDst++ = acc6;
1183 *pDst++ = acc7;
1184
1185
1186 /* Decrement loop counter */
1187 blkCnt--;
1188 }
1189
1190 /* Loop unrolling: Compute remaining output samples */
1191 blkCnt = blockSize % 0x8U;
1192
1193 #else
1194
1195 /* Initialize blkCnt with number of samples */
1196 blkCnt = blockSize;
1197
1198 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1199
1200 while (blkCnt > 0U)
1201 {
1202 /* Copy one sample at a time into state buffer */
1203 *pStateCurnt++ = *pSrc++;
1204
1205 /* Set the accumulator to zero */
1206 acc0 = 0.0f;
1207
1208 /* Initialize state pointer */
1209 px = pState;
1210
1211 /* Initialize Coefficient pointer */
1212 pb = pCoeffs;
1213
1214 i = numTaps;
1215
1216 /* Perform the multiply-accumulates */
1217 while (i > 0U)
1218 {
1219 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
1220 acc0 += *px++ * *pb++;
1221
1222 i--;
1223 }
1224
1225 /* Store result in destination buffer. */
1226 *pDst++ = acc0;
1227
1228 /* Advance state pointer by 1 for the next sample */
1229 pState = pState + 1U;
1230
1231 /* Decrement loop counter */
1232 blkCnt--;
1233 }
1234
1235 /* Processing is complete.
1236 Now copy the last numTaps - 1 samples to the start of the state buffer.
1237 This prepares the state buffer for the next function call. */
1238
1239 /* Points to the start of the state buffer */
1240 pStateCurnt = S->pState;
1241
1242 #if defined (ARM_MATH_LOOPUNROLL)
1243
1244 /* Loop unrolling: Copy 4 samples at a time */
1245 tapCnt = (numTaps - 1U) >> 2U;
1246
1247 /* Copy data */
1248 while (tapCnt > 0U)
1249 {
1250 *pStateCurnt++ = *pState++;
1251 *pStateCurnt++ = *pState++;
1252 *pStateCurnt++ = *pState++;
1253 *pStateCurnt++ = *pState++;
1254
1255 /* Decrement loop counter */
1256 tapCnt--;
1257 }
1258
1259 /* Calculate remaining number of copies */
1260 tapCnt = (numTaps - 1U) % 0x4U;
1261
1262 #else
1263
1264 /* Initialize tapCnt with number of samples to copy (numTaps - 1) */
1265 tapCnt = (numTaps - 1U);
1266
1267 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1268
1269 /* Copy remaining data */
1270 while (tapCnt > 0U)
1271 {
1272 *pStateCurnt++ = *pState++;
1273
1274 /* Decrement loop counter */
1275 tapCnt--;
1276 }
1277
1278 }
1279
1280 #endif /* #if defined(ARM_MATH_NEON) */
1281 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1282
1283 /**
1284 * @} end of FIR group
1285 */
1286