1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_f32.c
4 * Description: Floating-point FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup FIR Finite Impulse Response (FIR) Filters
37
38 This set of functions implements Finite Impulse Response (FIR) filters
39 for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided.
40 The functions operate on blocks of input and output data and each call to the function processes
41 <code>blockSize</code> samples through the filter. <code>pSrc</code> and
42 <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
43
44 @par Algorithm
45 The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
46 Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
47 <pre>
48 y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
49 </pre>
50 @par
51 \image html FIR.GIF "Finite Impulse Response filter"
52 @par
53 <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
54 Coefficients are stored in time reversed order.
55 @par
56 <pre>
57 {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
58 </pre>
59 @par
60 <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
61 Samples in the state buffer are stored in the following order.
62 @par
63 <pre>
64 {x[n-numTaps+1], x[n-numTaps+2], x[n-numTaps+3], x[n-numTaps+4]....x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])}
65 </pre>
66
67 @par
68 Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
69 The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
70 to be avoided and yields a significant speed improvement.
71 The state variables are updated after each block of data is processed; the coefficients are untouched.
72
73 @par Instance Structure
74 The coefficients and state variables for a filter are stored together in an instance data structure.
75 A separate instance structure must be defined for each filter.
76 Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
77 There are separate instance structure declarations for each of the 4 supported data types.
78
79 @par Initialization Functions
80 There is also an associated initialization function for each data type.
81 The initialization function performs the following operations:
82 - Sets the values of the internal structure fields.
83 - Zeros out the values in the state buffer.
84 To do this manually without calling the init function, assign the following fields of the instance structure:
85 numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
86
87 @par
88 Use of the initialization function is optional.
89 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
90 To place an instance structure into a const data section, the instance structure must be manually initialized.
91 Set the values in the state buffer to zeros before static initialization.
92 The code below statically initializes each of the 4 different data type filter instance structures
93 <pre>
94 arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
95 arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
96 arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
97 arm_fir_instance_q7 S = {numTaps, pState, pCoeffs};
98 </pre>
99 where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
100 <code>pCoeffs</code> is the address of the coefficient buffer.
101
102 @par Initialization of Helium version
103 For Helium version the array of coefficients must be padded with zero to contain
104 a full number of lanes.
105
106 The array length L must be a multiple of x. L = x * a :
107 - x is 4 for f32
108 - x is 4 for q31
109 - x is 4 for f16 (so managed like the f32 version and not like the q15 one)
110 - x is 8 for q15
111 - x is 16 for q7
112
113 The additional coefficients
114 (x * a - numTaps) must be set to 0.
115 numTaps must still be set to the real number of taps in the init function. This means that
116 the implementation may read more than numTaps coefficients, because of the vectorization and
117 to avoid having to handle too many different cases in the code.
118
119 @par Helium state buffer
120 The state buffer must contain some additional temporary data
121 used during the computation but which is not the state of the FIR.
122 The first A samples are temporary data.
123 The remaining samples are the state of the FIR filter.
124
125 @par
126 So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
127 - A is blockSize for f32
128 - A is 8*ceil(blockSize/8) for f16
129 - A is 8*ceil(blockSize/4) for q31
130 - A is 0 for other datatypes (q15 and q7)
131
132
133 @par Fixed-Point Behavior
134 Care must be taken when using the fixed-point versions of the FIR filter functions.
135 In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
136 Refer to the function specific documentation below for usage guidelines.
137
138 */
139
140 /**
141 @addtogroup FIR
142 @{
143 */
144
145 /**
146 @brief Processing function for floating-point FIR filter.
147 @param[in] S points to an instance of the floating-point FIR filter structure
148 @param[in] pSrc points to the block of input data
149 @param[out] pDst points to the block of output data
150 @param[in] blockSize number of samples to process
151 */
152
153 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
154
/* Number of coefficients handled by one accumulation pass of arm_fir_f32(). */
#define FIR_F32_MAX_COEF_BLK        8

/*
 * Compute four consecutive FIR outputs for a filter of nbTaps taps:
 * lane j of vecAcc0 receives the sum over k of pSamples[j + k] * c[k].
 *
 * The caller must declare two float32x4_t variables named vecIn0 and
 * vecAcc0; the result is left in vecAcc0.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement (safe after an unbraced if/else), every argument is
 * parenthesized, and the loop index has a name that cannot capture an
 * identifier used in the arguments.
 */
#define FIR_F32_CORE(pSamples, c, nbTaps)                                        \
    do {                                                                         \
        vecAcc0 = vdupq_n_f32(0.0f);                                             \
        for (int firCoreTapIdx = 0; firCoreTapIdx < (nbTaps); firCoreTapIdx++) { \
            vecIn0 = vld1q(&(pSamples)[firCoreTapIdx]);                          \
            vecAcc0 = vfmaq(vecAcc0, vecIn0, (c)[firCoreTapIdx]);                \
        }                                                                        \
    } while (0)
163
164
165 #define NB_TAPS 4
arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)166 __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
167 const float32_t * __restrict pSrc,
168 float32_t * __restrict pDst, uint32_t blockSize)
169 {
170 float32_t *pRefStatePtr = S->pState + blockSize;
171 float32_t *pState = pRefStatePtr; /* State pointer */
172 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
173 float32_t *pStateCur; /* Points to the current sample of the state */
174 const float32_t *pSamples; /* Temporary pointer to the sample buffer */
175 float32_t *pOutput; /* Temporary pointer to the output buffer */
176 const float32_t *pTempSrc; /* Temporary pointer to the source data */
177 float32_t *pTempDest; /* Temporary pointer to the destination buffer */
178 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
179 int32_t blkCnt;
180 float32x4_t vecIn0;
181 float32x4_t vecAcc0;
182 float32_t c[NB_TAPS];
183 const float32_t *pCoeffsCur = pCoeffs;
184
185 /*
186 * pState points to state array which contains previous frame (numTaps - 1) samples
187 * pStateCur points to the location where the new input data should be written
188 */
189 pStateCur = &(pState[(numTaps - 1u)]);
190 pTempSrc = pSrc;
191
192 pSamples = pState;
193 pOutput = pDst;
194
195 for (int i = 0; i < NB_TAPS; i++)
196 c[i] = *pCoeffsCur++;
197
198 blkCnt = blockSize >> 2;
199 while (blkCnt > 0) {
200 /*
201 * Save 4 input samples in the history buffer
202 */
203 vst1q(pStateCur, vld1q(pTempSrc));
204 pStateCur += 4;
205 pTempSrc += 4;
206
207 FIR_F32_CORE(pSamples, c, NB_TAPS);
208
209 vst1q(pOutput, vecAcc0);
210
211 pOutput += 4;
212 pSamples += 4;
213
214 blkCnt--;
215 }
216
217 blkCnt = blockSize & 3;
218 if (blkCnt)
219 {
220 mve_pred16_t p0 = vctp32q(blkCnt);
221
222 vst1q(pStateCur, vld1q(pTempSrc));
223 pStateCur += 4;
224 pTempSrc += 4;
225
226 FIR_F32_CORE(pSamples, c, NB_TAPS);
227
228 vstrwq_p_f32(pOutput, vecAcc0, p0);
229 }
230
231 /*
232 * Copy the samples back into the history buffer start
233 */
234 pTempSrc = &pState[blockSize];
235 pTempDest = pState;
236
237 blkCnt = numTaps - 1;
238 do {
239 mve_pred16_t p = vctp32q(blkCnt);
240
241 vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
242 pTempSrc += 4;
243 pTempDest += 4;
244 blkCnt -= 4;
245 }
246 while (blkCnt > 0);
247 }
248 #undef NB_TAPS
249
arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,const float32_t * __restrict pSrc,float32_t * __restrict pDst,uint32_t blockSize)250 __STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
251 const float32_t * __restrict pSrc,
252 float32_t * __restrict pDst, uint32_t blockSize)
253 {
254 float32_t *pRefStatePtr = S->pState + blockSize;
255 float32_t *pState = pRefStatePtr; /* State pointer */
256 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
257 const float32_t *pSamples; /* Temporary pointer to the sample buffer */
258 const float32_t *pTempSrc; /* Temporary pointer to the source data */
259 float32_t *pTempDest; /* Temporary pointer to the destination buffer */
260 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
261 int32_t blkCnt;
262 float32_t c0, c1, c2, c3;
263 float32_t c4, c5, c6, c7;
264
265
266 pTempSrc = pSrc;
267 pTempDest = &(pState[(numTaps - 1u)]);
268 int cnt = blockSize;
269 do {
270 mve_pred16_t p0 = vctp32q(cnt);
271 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
272 pTempDest += 4;
273 pTempSrc += 4;
274 cnt -= 4;
275 } while(cnt > 0);
276
277
278
279 pSamples = pState;
280 c0 = *pCoeffs++;
281 c1 = *pCoeffs++;
282 c2 = *pCoeffs++;
283 c3 = *pCoeffs++;
284 c4 = *pCoeffs++;
285 c5 = *pCoeffs++;
286 c6 = *pCoeffs++;
287 c7 = *pCoeffs++;
288
289 cnt = blockSize >> 2;
290 while(cnt > 0)
291 {
292 float32x4_t vecAcc0;
293 float32x4_t vecIn0;
294
295 vecIn0 = vld1q(pSamples);
296 vecAcc0 = vmulq(vecIn0, c0);
297 vecIn0 = vld1q(&pSamples[1]);
298 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
299 vecIn0 = vld1q(&pSamples[2]);
300 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
301 vecIn0 = vld1q(&pSamples[3]);
302 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
303 vecIn0 = vld1q(&pSamples[4]);
304 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
305 vecIn0 = vld1q(&pSamples[5]);
306 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
307 vecIn0 = vld1q(&pSamples[6]);
308 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
309 vecIn0 = vld1q(&pSamples[7]);
310 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
311 pSamples += 4;
312 vst1q(pDst, vecAcc0);
313 cnt--;
314 pDst += 4;
315 }
316
317 cnt = blockSize & 3;
318 if (cnt > 0)
319 {
320 float32x4_t vecAcc0;
321 float32x4_t vecIn0;
322
323 mve_pred16_t p0 = vctp32q(cnt);
324
325 vecIn0 = vld1q(pSamples);
326 vecAcc0 = vmulq(vecIn0, c0);
327 vecIn0 = vld1q(&pSamples[1]);
328 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
329 vecIn0 = vld1q(&pSamples[2]);
330 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
331 vecIn0 = vld1q(&pSamples[3]);
332 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
333 vecIn0 = vld1q(&pSamples[4]);
334 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
335 vecIn0 = vld1q(&pSamples[5]);
336 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
337 vecIn0 = vld1q(&pSamples[6]);
338 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
339 vecIn0 = vld1q(&pSamples[7]);
340 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
341 vstrwq_p_f32(pDst, vecAcc0,p0);
342 }
343
344
345 /*
346 * Copy the samples back into the history buffer start
347 */
348 pTempSrc = &pState[blockSize];
349 pTempDest = pState;
350 blkCnt = numTaps;
351 while (blkCnt > 0)
352 {
353 *pTempDest++ = *pTempSrc++;
354 blkCnt--;
355 }
356 }
357
358
359
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)360 ARM_DSP_ATTRIBUTE void arm_fir_f32(
361 const arm_fir_instance_f32 * S,
362 const float32_t * pSrc,
363 float32_t * pDst,
364 uint32_t blockSize)
365 {
366 /*
367 S->pState is the arm_fir_partial_accu
368 S->pState + blockSize is the FIR state
369 */
370 float32_t *pRefStatePtr = S->pState + blockSize;
371 float32_t *pState = pRefStatePtr ; /* State pointer */
372 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
373 const float32_t *pSamples; /* Temporary pointer to the sample buffer */
374 float32_t *pOutput; /* Temporary pointer to the output buffer */
375 const float32_t *pTempSrc; /* Temporary pointer to the source data */
376 float32_t *pTempDest; /* Temporary pointer to the destination buffer */
377 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
378 uint32_t blkCnt;
379 float32_t c0, c1, c2, c3;
380 float32_t c4, c5, c6, c7;
381
382 /*
383 * [1 to 8 taps] specialized routines
384 */
385 if (numTaps <= 4)
386 {
387 arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
388 return;
389 }
390 else if (numTaps <= 8)
391 {
392 arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
393 return;
394 }
395
396 pTempSrc = pSrc;
397 pTempDest = &(pState[(numTaps - 1u)]);
398 int cnt = blockSize;
399 do {
400 mve_pred16_t p0 = vctp32q(cnt);
401 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
402 pTempDest += 4;
403 pTempSrc += 4;
404 cnt -= 4;
405 } while(cnt > 0);
406
407 float32_t *partial_accu_ptr = S->pState;
408
409 pSamples = pState;
410 c0 = *pCoeffs++;
411 c1 = *pCoeffs++;
412 c2 = *pCoeffs++;
413 c3 = *pCoeffs++;
414 c4 = *pCoeffs++;
415 c5 = *pCoeffs++;
416 c6 = *pCoeffs++;
417 c7 = *pCoeffs++;
418
419 cnt = blockSize >> 2;
420 while(cnt > 0) {
421 float32x4_t vecAcc0;
422 float32x4_t vecIn0;
423
424 vecIn0 = vld1q(pSamples);
425 vecAcc0 = vmulq(vecIn0, c0);
426 vecIn0 = vld1q(&pSamples[1]);
427 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
428 vecIn0 = vld1q(&pSamples[2]);
429 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
430 vecIn0 = vld1q(&pSamples[3]);
431 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
432 vecIn0 = vld1q(&pSamples[4]);
433 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
434 vecIn0 = vld1q(&pSamples[5]);
435 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
436 vecIn0 = vld1q(&pSamples[6]);
437 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
438 vecIn0 = vld1q(&pSamples[7]);
439 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
440 pSamples += 4;
441 vst1q(partial_accu_ptr, vecAcc0);
442 cnt--;
443 partial_accu_ptr += 4;
444 }
445
446 cnt = blockSize & 3;
447 if (cnt > 0)
448 {
449 float32x4_t vecAcc0;
450 float32x4_t vecIn0;
451
452 mve_pred16_t p0 = vctp32q(cnt);
453
454 vecIn0 = vld1q(pSamples);
455 vecAcc0 = vmulq(vecIn0, c0);
456 vecIn0 = vld1q(&pSamples[1]);
457 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
458 vecIn0 = vld1q(&pSamples[2]);
459 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
460 vecIn0 = vld1q(&pSamples[3]);
461 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
462 vecIn0 = vld1q(&pSamples[4]);
463 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
464 vecIn0 = vld1q(&pSamples[5]);
465 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
466 vecIn0 = vld1q(&pSamples[6]);
467 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
468 vecIn0 = vld1q(&pSamples[7]);
469 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
470 vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
471 }
472
473 int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
474 int sample_offset = FIR_F32_MAX_COEF_BLK;
475 while (localTaps > FIR_F32_MAX_COEF_BLK) {
476 c0 = *pCoeffs++;
477 c1 = *pCoeffs++;
478 c2 = *pCoeffs++;
479 c3 = *pCoeffs++;
480 c4 = *pCoeffs++;
481 c5 = *pCoeffs++;
482 c6 = *pCoeffs++;
483 c7 = *pCoeffs++;
484
485 partial_accu_ptr = S->pState;
486 pSamples = pState + sample_offset;
487 int cnt = blockSize >> 2;
488 while(cnt > 0) {
489 float32x4_t vecAcc0;
490 float32x4_t vecIn0;
491
492 vecIn0 = vld1q(pSamples);
493 vecAcc0 = vmulq(vecIn0, c0);
494 vecIn0 = vld1q(&pSamples[1]);
495 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
496 vecIn0 = vld1q(&pSamples[2]);
497 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
498 vecIn0 = vld1q(&pSamples[3]);
499 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
500 vecIn0 = vld1q(&pSamples[4]);
501 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
502 vecIn0 = vld1q(&pSamples[5]);
503 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
504 vecIn0 = vld1q(&pSamples[6]);
505 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
506 vecIn0 = vld1q(&pSamples[7]);
507 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
508 pSamples += 4;
509 vecAcc0 += vld1q_f32(partial_accu_ptr);
510 vst1q(partial_accu_ptr, vecAcc0);
511 cnt--;
512 partial_accu_ptr += 4;
513 }
514
515 cnt = blockSize & 3;
516 if (cnt > 0) {
517 float32x4_t vecAcc0;
518 float32x4_t vecIn0;
519
520 mve_pred16_t p0 = vctp32q(cnt);
521
522 vecIn0 = vld1q(pSamples);
523 vecAcc0 = vmulq(vecIn0, c0);
524 vecIn0 = vld1q(&pSamples[1]);
525 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
526 vecIn0 = vld1q(&pSamples[2]);
527 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
528 vecIn0 = vld1q(&pSamples[3]);
529 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
530 vecIn0 = vld1q(&pSamples[4]);
531 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
532 vecIn0 = vld1q(&pSamples[5]);
533 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
534 vecIn0 = vld1q(&pSamples[6]);
535 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
536 vecIn0 = vld1q(&pSamples[7]);
537 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
538 vecAcc0 += vld1q_f32(partial_accu_ptr);
539 vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
540 }
541
542 localTaps -= FIR_F32_MAX_COEF_BLK;
543 sample_offset += FIR_F32_MAX_COEF_BLK;
544 }
545
546 pSamples = pState + sample_offset;
547
548 if (localTaps > 4) {
549 c0 = *pCoeffs++;
550 c1 = *pCoeffs++;
551 c2 = *pCoeffs++;
552 c3 = *pCoeffs++;
553 c4 = *pCoeffs++;
554 c5 = *pCoeffs++;
555 c6 = *pCoeffs++;
556 c7 = *pCoeffs++;
557 pOutput = pDst;
558
559 partial_accu_ptr = S->pState;
560 cnt = blockSize >> 2;
561 while(cnt > 0) {
562 float32x4_t vecAcc0;
563 float32x4_t vecIn0;
564
565 vecIn0 = vld1q(pSamples);
566 vecAcc0 = vmulq(vecIn0, c0);
567 vecIn0 = vld1q(&pSamples[1]);
568 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
569 vecIn0 = vld1q(&pSamples[2]);
570 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
571 vecIn0 = vld1q(&pSamples[3]);
572 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
573 vecIn0 = vld1q(&pSamples[4]);
574 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
575 vecIn0 = vld1q(&pSamples[5]);
576 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
577 vecIn0 = vld1q(&pSamples[6]);
578 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
579 vecIn0 = vld1q(&pSamples[7]);
580 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
581 pSamples += 4;
582 float32x4_t pap = vld1q_f32(partial_accu_ptr);
583 vst1q(pOutput, vecAcc0+pap);
584 cnt--;
585 partial_accu_ptr += 4;
586 pOutput += 4;
587 }
588
589 cnt = blockSize & 3;
590 if (cnt > 0) {
591 float32x4_t vecAcc0;
592 float32x4_t vecIn0;
593
594 mve_pred16_t p0 = vctp32q(cnt);
595
596 vecIn0 = vld1q(pSamples);
597 vecAcc0 = vmulq(vecIn0, c0);
598 vecIn0 = vld1q(&pSamples[1]);
599 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
600 vecIn0 = vld1q(&pSamples[2]);
601 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
602 vecIn0 = vld1q(&pSamples[3]);
603 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
604 vecIn0 = vld1q(&pSamples[4]);
605 vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
606 vecIn0 = vld1q(&pSamples[5]);
607 vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
608 vecIn0 = vld1q(&pSamples[6]);
609 vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
610 vecIn0 = vld1q(&pSamples[7]);
611 vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
612 float32x4_t pap = vld1q_f32(partial_accu_ptr);
613 vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
614 pOutput += cnt;
615 }
616 }
617 else {
618 c0 = *pCoeffs++;
619 c1 = *pCoeffs++;
620 c2 = *pCoeffs++;
621 c3 = *pCoeffs++;
622 pOutput = pDst;
623
624 partial_accu_ptr = S->pState;
625 cnt = blockSize >> 2;
626 while(cnt > 0) {
627 float32x4_t vecAcc0;
628 float32x4_t vecIn0;
629
630 vecIn0 = vld1q(pSamples);
631 vecAcc0 = vmulq(vecIn0, c0);
632 vecIn0 = vld1q(&pSamples[1]);
633 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
634 vecIn0 = vld1q(&pSamples[2]);
635 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
636 vecIn0 = vld1q(&pSamples[3]);
637 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
638 pSamples += 4;
639 float32x4_t pap = vld1q_f32(partial_accu_ptr);
640 vst1q(pOutput, vecAcc0+pap);
641 cnt--;
642 partial_accu_ptr += 4;
643 pOutput += 4;
644 }
645
646 cnt = blockSize & 3;
647 if (cnt > 0) {
648 float32x4_t vecAcc0;
649 float32x4_t vecIn0;
650
651 mve_pred16_t p0 = vctp32q(cnt);
652
653 vecIn0 = vld1q(pSamples);
654 vecAcc0 = vmulq(vecIn0, c0);
655 vecIn0 = vld1q(&pSamples[1]);
656 vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
657 vecIn0 = vld1q(&pSamples[2]);
658 vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
659 vecIn0 = vld1q(&pSamples[3]);
660 vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
661 float32x4_t pap = vld1q_f32(partial_accu_ptr);
662 vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
663 pOutput += cnt;
664 }
665 }
666
667 /*
668 * Copy the samples back into the history buffer start
669 */
670 pTempSrc = &pRefStatePtr[blockSize];
671 pTempDest = pRefStatePtr;
672
673 blkCnt = numTaps >> 2;
674 while (blkCnt > 0)
675 {
676 vst1q(pTempDest, vld1q(pTempSrc));
677 pTempSrc += 4;
678 pTempDest += 4;
679 blkCnt--;
680 }
681 blkCnt = numTaps & 3;
682 if (blkCnt > 0)
683 {
684 mve_pred16_t p0 = vctp32q(blkCnt);
685 vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
686 }
687 }
688
689 #else
690 #if defined(ARM_MATH_NEON)
691
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)692 ARM_DSP_ATTRIBUTE void arm_fir_f32(
693 const arm_fir_instance_f32 * S,
694 const float32_t * pSrc,
695 float32_t * pDst,
696 uint32_t blockSize)
697 {
698 float32_t *pState = S->pState; /* State pointer */
699 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
700 float32_t *pStateCurnt; /* Points to the current sample of the state */
701 float32_t *px; /* Temporary pointers for state buffer */
702 const float32_t *pb; /* Temporary pointers for coefficient buffer */
703 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
704 uint32_t i, tapCnt, blkCnt; /* Loop counters */
705
706 float32x4_t accv0,accv1,samples0,samples1,x0,x1,x2,xa,xb,b;
707 float32_t acc;
708
709 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
710 /* pStateCurnt points to the location where the new input data should be written */
711 pStateCurnt = &(S->pState[(numTaps - 1U)]);
712
713 /* Loop unrolling */
714 blkCnt = blockSize >> 3;
715
716 while (blkCnt > 0U)
717 {
718 /* Copy 8 samples at a time into state buffers */
719 samples0 = vld1q_f32(pSrc);
720 vst1q_f32(pStateCurnt,samples0);
721
722 pStateCurnt += 4;
723 pSrc += 4 ;
724
725 samples1 = vld1q_f32(pSrc);
726 vst1q_f32(pStateCurnt,samples1);
727
728 pStateCurnt += 4;
729 pSrc += 4 ;
730
731 /* Set the accumulators to zero */
732 accv0 = vdupq_n_f32(0);
733 accv1 = vdupq_n_f32(0);
734
735 /* Initialize state pointer */
736 px = pState;
737
738 /* Initialize coefficient pointer */
739 pb = pCoeffs;
740
741 /* Loop unroling */
742 i = numTaps >> 2;
743
744 /* Perform the multiply-accumulates */
745 x0 = vld1q_f32(px);
746 x1 = vld1q_f32(px + 4);
747
748 while(i > 0)
749 {
750 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
751 x2 = vld1q_f32(px + 8);
752 b = vld1q_f32(pb);
753 xa = x0;
754 xb = x1;
755 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0));
756 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0));
757
758 xa = vextq_f32(x0,x1,1);
759 xb = vextq_f32(x1,x2,1);
760
761 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1));
762 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1));
763
764 xa = vextq_f32(x0,x1,2);
765 xb = vextq_f32(x1,x2,2);
766
767 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2));
768 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2));
769
770 xa = vextq_f32(x0,x1,3);
771 xb = vextq_f32(x1,x2,3);
772
773 accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3));
774 accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3));
775
776 pb += 4;
777 x0 = x1;
778 x1 = x2;
779 px += 4;
780 i--;
781
782 }
783
784 /* Tail */
785 i = numTaps & 3;
786 x2 = vld1q_f32(px + 8);
787
788 /* Perform the multiply-accumulates */
789 switch(i)
790 {
791 case 3:
792 {
793 accv0 = vmlaq_n_f32(accv0,x0,*pb);
794 accv1 = vmlaq_n_f32(accv1,x1,*pb);
795
796 pb++;
797
798 xa = vextq_f32(x0,x1,1);
799 xb = vextq_f32(x1,x2,1);
800
801 accv0 = vmlaq_n_f32(accv0,xa,*pb);
802 accv1 = vmlaq_n_f32(accv1,xb,*pb);
803
804 pb++;
805
806 xa = vextq_f32(x0,x1,2);
807 xb = vextq_f32(x1,x2,2);
808
809 accv0 = vmlaq_n_f32(accv0,xa,*pb);
810 accv1 = vmlaq_n_f32(accv1,xb,*pb);
811
812 }
813 break;
814 case 2:
815 {
816 accv0 = vmlaq_n_f32(accv0,x0,*pb);
817 accv1 = vmlaq_n_f32(accv1,x1,*pb);
818
819 pb++;
820
821 xa = vextq_f32(x0,x1,1);
822 xb = vextq_f32(x1,x2,1);
823
824 accv0 = vmlaq_n_f32(accv0,xa,*pb);
825 accv1 = vmlaq_n_f32(accv1,xb,*pb);
826
827 }
828 break;
829 case 1:
830 {
831
832 accv0 = vmlaq_n_f32(accv0,x0,*pb);
833 accv1 = vmlaq_n_f32(accv1,x1,*pb);
834
835 }
836 break;
837 default:
838 break;
839 }
840
841 /* The result is stored in the destination buffer. */
842 vst1q_f32(pDst,accv0);
843 pDst += 4;
844 vst1q_f32(pDst,accv1);
845 pDst += 4;
846
847 /* Advance state pointer by 8 for the next 8 samples */
848 pState = pState + 8;
849
850 blkCnt--;
851 }
852
853 /* Tail */
854 blkCnt = blockSize & 0x7;
855
856 while (blkCnt > 0U)
857 {
858 /* Copy one sample at a time into state buffer */
859 *pStateCurnt++ = *pSrc++;
860
861 /* Set the accumulator to zero */
862 acc = 0.0f;
863
864 /* Initialize state pointer */
865 px = pState;
866
867 /* Initialize Coefficient pointer */
868 pb = pCoeffs;
869
870 i = numTaps;
871
872 /* Perform the multiply-accumulates */
873 do
874 {
875 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
876 acc += *px++ * *pb++;
877 i--;
878
879 } while (i > 0U);
880
881 /* The result is stored in the destination buffer. */
882 *pDst++ = acc;
883
884 /* Advance state pointer by 1 for the next sample */
885 pState = pState + 1;
886
887 blkCnt--;
888 }
889
890 /* Processing is complete.
891 ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
892 ** This prepares the state buffer for the next function call. */
893
894 /* Points to the start of the state buffer */
895 pStateCurnt = S->pState;
896
897 /* Copy numTaps number of values */
898 tapCnt = numTaps - 1U;
899
900 /* Copy data */
901 while (tapCnt > 0U)
902 {
903 *pStateCurnt++ = *pState++;
904
905 /* Decrement the loop counter */
906 tapCnt--;
907 }
908
909 }
910 #else
arm_fir_f32(const arm_fir_instance_f32 * S,const float32_t * pSrc,float32_t * pDst,uint32_t blockSize)911 ARM_DSP_ATTRIBUTE void arm_fir_f32(
912 const arm_fir_instance_f32 * S,
913 const float32_t * pSrc,
914 float32_t * pDst,
915 uint32_t blockSize)
916 {
917 float32_t *pState = S->pState; /* State pointer */
918 const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
919 float32_t *pStateCurnt; /* Points to the current sample of the state */
920 float32_t *px; /* Temporary pointer for state buffer */
921 const float32_t *pb; /* Temporary pointer for coefficient buffer */
922 float32_t acc0; /* Accumulator */
923 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
924 uint32_t i, tapCnt, blkCnt; /* Loop counters */
925
926 #if defined (ARM_MATH_LOOPUNROLL)
927 float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
928 float32_t x0, x1, x2, x3, x4, x5, x6, x7; /* Temporary variables to hold state values */
929 float32_t c0; /* Temporary variable to hold coefficient value */
930 #endif
931
932 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
933 /* pStateCurnt points to the location where the new input data should be written */
934 pStateCurnt = &(S->pState[(numTaps - 1U)]);
935
936 #if defined (ARM_MATH_LOOPUNROLL)
937
938 /* Loop unrolling: Compute 8 output values simultaneously.
939 * The variables acc0 ... acc7 hold output values that are being computed:
940 *
941 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
942 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
943 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
944 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
945 */
946
947 blkCnt = blockSize >> 3U;
948
949 while (blkCnt > 0U)
950 {
951 /* Copy 4 new input samples into the state buffer. */
952 *pStateCurnt++ = *pSrc++;
953 *pStateCurnt++ = *pSrc++;
954 *pStateCurnt++ = *pSrc++;
955 *pStateCurnt++ = *pSrc++;
956
957 /* Set all accumulators to zero */
958 acc0 = 0.0f;
959 acc1 = 0.0f;
960 acc2 = 0.0f;
961 acc3 = 0.0f;
962 acc4 = 0.0f;
963 acc5 = 0.0f;
964 acc6 = 0.0f;
965 acc7 = 0.0f;
966
967 /* Initialize state pointer */
968 px = pState;
969
970 /* Initialize coefficient pointer */
971 pb = pCoeffs;
972
973 /* This is separated from the others to avoid
974 * a call to __aeabi_memmove which would be slower
975 */
976 *pStateCurnt++ = *pSrc++;
977 *pStateCurnt++ = *pSrc++;
978 *pStateCurnt++ = *pSrc++;
979 *pStateCurnt++ = *pSrc++;
980
981 /* Read the first 7 samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
982 x0 = *px++;
983 x1 = *px++;
984 x2 = *px++;
985 x3 = *px++;
986 x4 = *px++;
987 x5 = *px++;
988 x6 = *px++;
989
990 /* Loop unrolling: process 8 taps at a time. */
991 tapCnt = numTaps >> 3U;
992
993 while (tapCnt > 0U)
994 {
995 /* Read the b[numTaps-1] coefficient */
996 c0 = *(pb++);
997
998 /* Read x[n-numTaps-3] sample */
999 x7 = *(px++);
1000
1001 /* acc0 += b[numTaps-1] * x[n-numTaps] */
1002 acc0 += x0 * c0;
1003
1004 /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
1005 acc1 += x1 * c0;
1006
1007 /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
1008 acc2 += x2 * c0;
1009
1010 /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
1011 acc3 += x3 * c0;
1012
1013 /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
1014 acc4 += x4 * c0;
1015
1016 /* acc1 += b[numTaps-1] * x[n-numTaps-5] */
1017 acc5 += x5 * c0;
1018
1019 /* acc2 += b[numTaps-1] * x[n-numTaps-6] */
1020 acc6 += x6 * c0;
1021
1022 /* acc3 += b[numTaps-1] * x[n-numTaps-7] */
1023 acc7 += x7 * c0;
1024
1025 /* Read the b[numTaps-2] coefficient */
1026 c0 = *(pb++);
1027
1028 /* Read x[n-numTaps-4] sample */
1029 x0 = *(px++);
1030
1031 /* Perform the multiply-accumulate */
1032 acc0 += x1 * c0;
1033 acc1 += x2 * c0;
1034 acc2 += x3 * c0;
1035 acc3 += x4 * c0;
1036 acc4 += x5 * c0;
1037 acc5 += x6 * c0;
1038 acc6 += x7 * c0;
1039 acc7 += x0 * c0;
1040
1041 /* Read the b[numTaps-3] coefficient */
1042 c0 = *(pb++);
1043
1044 /* Read x[n-numTaps-5] sample */
1045 x1 = *(px++);
1046
1047 /* Perform the multiply-accumulates */
1048 acc0 += x2 * c0;
1049 acc1 += x3 * c0;
1050 acc2 += x4 * c0;
1051 acc3 += x5 * c0;
1052 acc4 += x6 * c0;
1053 acc5 += x7 * c0;
1054 acc6 += x0 * c0;
1055 acc7 += x1 * c0;
1056
1057 /* Read the b[numTaps-4] coefficient */
1058 c0 = *(pb++);
1059
1060 /* Read x[n-numTaps-10] sample */
1061 x2 = *(px++);
1062
1063 /* Perform the multiply-accumulates */
1064 acc0 += x3 * c0;
1065 acc1 += x4 * c0;
1066 acc2 += x5 * c0;
1067 acc3 += x6 * c0;
1068 acc4 += x7 * c0;
1069 acc5 += x0 * c0;
1070 acc6 += x1 * c0;
1071 acc7 += x2 * c0;
1072
1073 /* Read the b[numTaps-5] coefficient */
1074 c0 = *(pb++);
1075
1076 /* Read x[n-numTaps-11] sample */
1077 x3 = *(px++);
1078 /* Perform the multiply-accumulates */
1079 acc0 += x4 * c0;
1080 acc1 += x5 * c0;
1081 acc2 += x6 * c0;
1082 acc3 += x7 * c0;
1083 acc4 += x0 * c0;
1084 acc5 += x1 * c0;
1085 acc6 += x2 * c0;
1086 acc7 += x3 * c0;
1087
1088 /* Read the b[numTaps-6] coefficient */
1089 c0 = *(pb++);
1090
1091 /* Read x[n-numTaps-12] sample */
1092 x4 = *(px++);
1093
1094 /* Perform the multiply-accumulates */
1095 acc0 += x5 * c0;
1096 acc1 += x6 * c0;
1097 acc2 += x7 * c0;
1098 acc3 += x0 * c0;
1099 acc4 += x1 * c0;
1100 acc5 += x2 * c0;
1101 acc6 += x3 * c0;
1102 acc7 += x4 * c0;
1103
1104 /* Read the b[numTaps-7] coefficient */
1105 c0 = *(pb++);
1106
1107 /* Read x[n-numTaps-13] sample */
1108 x5 = *(px++);
1109
1110 /* Perform the multiply-accumulates */
1111 acc0 += x6 * c0;
1112 acc1 += x7 * c0;
1113 acc2 += x0 * c0;
1114 acc3 += x1 * c0;
1115 acc4 += x2 * c0;
1116 acc5 += x3 * c0;
1117 acc6 += x4 * c0;
1118 acc7 += x5 * c0;
1119
1120 /* Read the b[numTaps-8] coefficient */
1121 c0 = *(pb++);
1122
1123 /* Read x[n-numTaps-14] sample */
1124 x6 = *(px++);
1125
1126 /* Perform the multiply-accumulates */
1127 acc0 += x7 * c0;
1128 acc1 += x0 * c0;
1129 acc2 += x1 * c0;
1130 acc3 += x2 * c0;
1131 acc4 += x3 * c0;
1132 acc5 += x4 * c0;
1133 acc6 += x5 * c0;
1134 acc7 += x6 * c0;
1135
1136 /* Decrement loop counter */
1137 tapCnt--;
1138 }
1139
1140 /* Loop unrolling: Compute remaining taps */
1141 tapCnt = numTaps % 0x8U;
1142
1143 while (tapCnt > 0U)
1144 {
1145 /* Read coefficients */
1146 c0 = *(pb++);
1147
1148 /* Fetch 1 state variable */
1149 x7 = *(px++);
1150
1151 /* Perform the multiply-accumulates */
1152 acc0 += x0 * c0;
1153 acc1 += x1 * c0;
1154 acc2 += x2 * c0;
1155 acc3 += x3 * c0;
1156 acc4 += x4 * c0;
1157 acc5 += x5 * c0;
1158 acc6 += x6 * c0;
1159 acc7 += x7 * c0;
1160
1161 /* Reuse the present sample states for next sample */
1162 x0 = x1;
1163 x1 = x2;
1164 x2 = x3;
1165 x3 = x4;
1166 x4 = x5;
1167 x5 = x6;
1168 x6 = x7;
1169
1170 /* Decrement loop counter */
1171 tapCnt--;
1172 }
1173
1174 /* Advance the state pointer by 8 to process the next group of 8 samples */
1175 pState = pState + 8;
1176
1177 /* The results in the 8 accumulators, store in the destination buffer. */
1178 *pDst++ = acc0;
1179 *pDst++ = acc1;
1180 *pDst++ = acc2;
1181 *pDst++ = acc3;
1182 *pDst++ = acc4;
1183 *pDst++ = acc5;
1184 *pDst++ = acc6;
1185 *pDst++ = acc7;
1186
1187
1188 /* Decrement loop counter */
1189 blkCnt--;
1190 }
1191
1192 /* Loop unrolling: Compute remaining output samples */
1193 blkCnt = blockSize % 0x8U;
1194
1195 #else
1196
1197 /* Initialize blkCnt with number of samples */
1198 blkCnt = blockSize;
1199
1200 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1201
1202 while (blkCnt > 0U)
1203 {
1204 /* Copy one sample at a time into state buffer */
1205 *pStateCurnt++ = *pSrc++;
1206
1207 /* Set the accumulator to zero */
1208 acc0 = 0.0f;
1209
1210 /* Initialize state pointer */
1211 px = pState;
1212
1213 /* Initialize Coefficient pointer */
1214 pb = pCoeffs;
1215
1216 i = numTaps;
1217
1218 /* Perform the multiply-accumulates */
1219 while (i > 0U)
1220 {
1221 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
1222 acc0 += *px++ * *pb++;
1223
1224 i--;
1225 }
1226
1227 /* Store result in destination buffer. */
1228 *pDst++ = acc0;
1229
1230 /* Advance state pointer by 1 for the next sample */
1231 pState = pState + 1U;
1232
1233 /* Decrement loop counter */
1234 blkCnt--;
1235 }
1236
1237 /* Processing is complete.
1238 Now copy the last numTaps - 1 samples to the start of the state buffer.
1239 This prepares the state buffer for the next function call. */
1240
1241 /* Points to the start of the state buffer */
1242 pStateCurnt = S->pState;
1243
1244 #if defined (ARM_MATH_LOOPUNROLL)
1245
1246 /* Loop unrolling: Copy 4 state samples at a time */
1247 tapCnt = (numTaps - 1U) >> 2U;
1248
1249 /* Copy data */
1250 while (tapCnt > 0U)
1251 {
1252 *pStateCurnt++ = *pState++;
1253 *pStateCurnt++ = *pState++;
1254 *pStateCurnt++ = *pState++;
1255 *pStateCurnt++ = *pState++;
1256
1257 /* Decrement loop counter */
1258 tapCnt--;
1259 }
1260
1261 /* Calculate remaining number of copies */
1262 tapCnt = (numTaps - 1U) % 0x4U;
1263
1264 #else
1265
1266 /* Initialize tapCnt with number of state samples to copy (numTaps - 1) */
1267 tapCnt = (numTaps - 1U);
1268
1269 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1270
1271 /* Copy remaining data */
1272 while (tapCnt > 0U)
1273 {
1274 *pStateCurnt++ = *pState++;
1275
1276 /* Decrement loop counter */
1277 tapCnt--;
1278 }
1279
1280 }
1281
1282 #endif /* #if defined(ARM_MATH_NEON) */
1283 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1284
1285 /**
1286 * @} end of FIR group
1287 */
1288