1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_q7.c
4 * Description: Q7 FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR
37 @{
38 */
39
40 /**
41 @brief Processing function for Q7 FIR filter.
42 @param[in] S points to an instance of the Q7 FIR filter structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process
46
47 @par Scaling and Overflow Behavior
48 The function is implemented using a 32-bit internal accumulator.
49 Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
50 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
51 There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52 The accumulator is converted to 18.7 format by discarding the low 7 bits.
                   Finally, the result is saturated to the 1.7 format range (8-bit signed saturation).
54 */
55
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
57
/*
 * Compute nbAcc consecutive Q7 FIR outputs and append them at pOutput.
 *
 *   pOutput    output pointer lvalue; it is advanced by nbAcc elements
 *   nbAcc      number of outputs to produce
 *   nbVecTaps  number of 16-lane coefficient vectors in vecCoeffs
 *   pSample    start of the sample window used for the first output;
 *              output j uses the window starting at pSample + j
 *   vecCoeffs  array of nbVecTaps coefficient vectors
 *
 * A q7x16_t variable named vecIn0 must be in scope at the expansion site; it
 * is used as scratch. Each output is the 32-bit dot product of the window
 * with the coefficients, shifted right by 7 and saturated to 8 bits.
 *
 * A single scalar accumulator is used per output, so nbAcc is not limited by
 * the size of any local array. The body is wrapped in do { } while (0) so the
 * macro behaves as one statement.
 */
#define FIR_Q7_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)            \
    do {                                                                      \
        for (int j = 0; j < (nbAcc); j++) {                                   \
            const q7_t *pSmp = &(pSample)[j];                                 \
            q31_t       acc = 0;                                              \
                                                                              \
            for (int i = 0; i < (nbVecTaps); i++) {                           \
                vecIn0 = vld1q(pSmp + 16 * i);                                \
                acc = vmladavaq(acc, vecIn0, (vecCoeffs)[i]);                 \
            }                                                                 \
            *(pOutput)++ = (q7_t) __SSAT((acc >> 7U), 8);                     \
        }                                                                     \
    } while (0)
70
/*
 * Function body shared by the fixed-size Q7 FIR kernels.
 *
 * The expansion site must provide S, pSrc, pDst and blockSize, and must have
 * NBVECTAPS defined as the number of 16-lane coefficient vectors to use.
 *
 * Steps:
 *   1. Load 16 * NBVECTAPS coefficients from S->pCoeffs into vector variables
 *      (this many entries are read regardless of S->numTaps; presumably the
 *      coefficient array is zero-padded to that length - confirm with the
 *      filter initialisation documentation).
 *   2. For every group of four inputs: append them to the history after the
 *      (numTaps - 1) retained samples, then emit four outputs.
 *   3. Handle the remaining (blockSize & 3) inputs one sample at a time.
 *   4. Move the last (numTaps - 1) samples back to the start of the state
 *      buffer using predicated 16-lane copies.
 */
#define FIR_Q7_MAIN_CORE()                                                     \
{                                                                              \
    q7_t *const       stateBase = S->pState;   /* start of history buffer */   \
    const q7_t *const coefBase = S->pCoeffs;   /* start of coefficients */     \
    const uint32_t    tapCount = S->numTaps;   /* filter length */             \
    q7_t             *histWr = &stateBase[tapCount - 1u]; /* next free slot */ \
    const q7_t       *inRd = pSrc;             /* next input to consume */     \
    const q7_t       *window = stateBase;      /* window of next output */     \
    q7_t             *outWr = pDst;            /* next output slot */          \
    q7x16_t           vecIn0;                  /* scratch for FIR_Q7_CORE */   \
    q7x16_t           coefVec[NBVECTAPS];                                      \
                                                                               \
    for (int k = 0; k < NBVECTAPS; k++) {                                      \
        coefVec[k] = vldrbq_s8(coefBase + 16 * k);                             \
    }                                                                          \
                                                                               \
    /* groups of four samples */                                               \
    for (int32_t quads = blockSize >> 2; quads > 0; quads--) {                 \
        /* widening load + narrowing store moves exactly four bytes */         \
        vstrbq_s32(histWr, vldrbq_s32(inRd));                                  \
        histWr += 4;                                                           \
        inRd += 4;                                                             \
                                                                               \
        FIR_Q7_CORE(outWr, 4, NBVECTAPS, window, coefVec);                     \
        window += 4;                                                           \
    }                                                                          \
                                                                               \
    /* zero to three trailing samples */                                       \
    int32_t leftover = blockSize & 3;                                          \
                                                                               \
    for (int k = 0; k < leftover; k++) {                                       \
        *histWr++ = *inRd++;                                                   \
    }                                                                          \
                                                                               \
    FIR_Q7_CORE(outWr, leftover, NBVECTAPS, window, coefVec);                  \
                                                                               \
    /* keep the newest (numTaps - 1) samples for the next call */              \
    const q7_t *keepRd = &stateBase[blockSize];                                \
    q7_t       *keepWr = stateBase;                                            \
    int32_t     remaining = tapCount - 1;                                      \
                                                                               \
    do {                                                                       \
        mve_pred16_t lanes = vctp8q(remaining);                                \
                                                                               \
        vstrbq_p_s8(keepWr, vldrbq_z_s8(keepRd, lanes), lanes);                \
        keepRd += 16;                                                          \
        keepWr += 16;                                                          \
        remaining -= 16;                                                       \
    } while (remaining > 0);                                                   \
}
141
142
/*
 * Q7 FIR kernel selected by arm_fir_q7 for filters of 49 to 64 taps.
 * Expands FIR_Q7_MAIN_CORE with four 16-lane coefficient vectors.
 */
static void arm_fir_q7_49_64_mve(
    const arm_fir_instance_q7 * S,
    const q7_t * __restrict pSrc,
          q7_t * __restrict pDst,
          uint32_t blockSize)
{
    /* 64 coefficient slots, 16 per vector */
#define NBVECTAPS (64 / 16)
    FIR_Q7_MAIN_CORE();
#undef NBVECTAPS
}
153
154
/*
 * Q7 FIR kernel selected by arm_fir_q7 for filters of 33 to 48 taps.
 * Expands FIR_Q7_MAIN_CORE with three 16-lane coefficient vectors.
 */
static void arm_fir_q7_33_48_mve(
    const arm_fir_instance_q7 * S,
    const q7_t * __restrict pSrc,
          q7_t * __restrict pDst,
          uint32_t blockSize)
{
    /* 48 coefficient slots, 16 per vector */
#define NBVECTAPS (48 / 16)
    FIR_Q7_MAIN_CORE();
#undef NBVECTAPS
}
165
/*
 * Q7 FIR kernel selected by arm_fir_q7 for filters of 17 to 32 taps.
 * Expands FIR_Q7_MAIN_CORE with two 16-lane coefficient vectors.
 */
static void arm_fir_q7_17_32_mve(
    const arm_fir_instance_q7 * S,
    const q7_t * __restrict pSrc,
          q7_t * __restrict pDst,
          uint32_t blockSize)
{
    /* 32 coefficient slots, 16 per vector */
#define NBVECTAPS (32 / 16)
    FIR_Q7_MAIN_CORE();
#undef NBVECTAPS
}
176
177
/*
 * Q7 FIR kernel selected by arm_fir_q7 for filters of 1 to 16 taps.
 * Expands FIR_Q7_MAIN_CORE with a single 16-lane coefficient vector.
 */
static void arm_fir_q7_1_16_mve(
    const arm_fir_instance_q7 * S,
    const q7_t * __restrict pSrc,
          q7_t * __restrict pDst,
          uint32_t blockSize)
{
    /* 16 coefficient slots, 16 per vector */
#define NBVECTAPS (16 / 16)
    FIR_Q7_MAIN_CORE();
#undef NBVECTAPS
}
188
arm_fir_q7(const arm_fir_instance_q7 * S,const q7_t * pSrc,q7_t * pDst,uint32_t blockSize)189 ARM_DSP_ATTRIBUTE void arm_fir_q7(
190 const arm_fir_instance_q7 * S,
191 const q7_t * pSrc,
192 q7_t * pDst,
193 uint32_t blockSize)
194 {
195 q7_t *pState = S->pState; /* State pointer */
196 const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
197 q7_t *pStateCur; /* Points to the current sample of the state */
198 const q7_t *pSamples; /* Temporary pointer to the sample buffer */
199 q7_t *pOutput; /* Temporary pointer to the output buffer */
200 const q7_t *pTempSrc; /* Temporary pointer to the source data */
201 q7_t *pTempDest; /* Temporary pointer to the destination buffer */
202 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
203 uint32_t blkCnt;
204 q7x16_t vecIn0;
205 uint32_t tapsBlkCnt = (numTaps + 15) / 16;
206 q31_t acc0, acc1, acc2, acc3;
207 q7x16_t vecCoeffs;
208
209 if (numTaps <= 16)
210 {
211 /*
212 * [1 to 16 taps] specialized routine
213 */
214 arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
215 return;
216 }
217 else if (numTaps <= 32)
218 {
219 /*
220 * [17 to 32 taps] specialized routine
221 */
222 arm_fir_q7_17_32_mve(S, pSrc, pDst, blockSize);
223 return;
224 }
225 else if (numTaps <= 48)
226 {
227 /*
228 * [33 to 48 taps] specialized routine
229 */
230 arm_fir_q7_33_48_mve(S, pSrc, pDst, blockSize);
231 return;
232 }
233 else if (numTaps <= 64)
234 {
235 /*
236 * [49 to 64 taps] specialized routine
237 */
238 arm_fir_q7_49_64_mve(S, pSrc, pDst, blockSize);
239 return;
240 }
241
242 /*
243 * pState points to state array which contains previous frame (numTaps - 1) samples
244 * pStateCur points to the location where the new input data should be written
245 */
246 pStateCur = &(pState[(numTaps - 1u)]);
247 pSamples = pState;
248 pTempSrc = pSrc;
249 pOutput = pDst;
250 blkCnt = blockSize >> 2;
251
252 /*
253 * outer samples loop
254 */
255 while (blkCnt > 0U)
256 {
257 const q7_t *pCoeffsTmp = pCoeffs;
258 const q7_t *pSamplesTmp = pSamples;
259
260 acc0 = 0;
261 acc1 = 0;
262 acc2 = 0;
263 acc3 = 0;
264 /*
265 * Save 16 input samples in the history buffer
266 */
267 vst1q(pStateCur, vld1q(pTempSrc));
268 pStateCur += 16;
269 pTempSrc += 16;
270
271 /*
272 * inner coefficients loop
273 */
274 int i = tapsBlkCnt;
275 while (i > 0)
276 {
277 /*
278 * load 16 coefs
279 */
280 vecCoeffs = *(q7x16_t *) pCoeffsTmp;
281
282 vecIn0 = vld1q(pSamplesTmp);
283 acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
284
285 vecIn0 = vld1q(&pSamplesTmp[1]);
286 acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
287
288 vecIn0 = vld1q(&pSamplesTmp[2]);
289 acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
290
291 vecIn0 = vld1q(&pSamplesTmp[3]);
292 acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
293
294 pSamplesTmp += 16;
295 pCoeffsTmp += 16;
296 /*
297 * Decrement the taps block loop counter
298 */
299 i--;
300 }
301 /*
302 * Store the 1.7 format filter output in destination buffer
303 */
304 *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
305 *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
306 *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
307 *pOutput++ = (q7_t) __SSAT((acc3 >> 7U), 8);
308
309 pSamples += 4;
310 /*
311 * Decrement the sample block loop counter
312 */
313 blkCnt--;
314 }
315
316 uint32_t residual = blockSize & 3;
317 switch (residual)
318 {
319 case 3:
320 {
321 const q7_t *pCoeffsTmp = pCoeffs;
322 const q7_t *pSamplesTmp = pSamples;
323
324 acc0 = 0;
325 acc1 = 0;
326 acc2 = 0;
327 /*
328 * Save 16 input samples in the history buffer
329 */
330 vst1q(pStateCur, vld1q(pTempSrc));
331 pStateCur += 16;
332 pTempSrc += 16;
333
334 int i = tapsBlkCnt;
335 while (i > 0)
336 {
337 vecCoeffs = *(q7x16_t *) pCoeffsTmp;
338
339 vecIn0 = vld1q(pSamplesTmp);
340 acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
341
342 vecIn0 = vld1q(&pSamplesTmp[4]);
343 acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
344
345 vecIn0 = vld1q(&pSamplesTmp[8]);
346 acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
347
348 pSamplesTmp += 16;
349 pCoeffsTmp += 16;
350 i--;
351 }
352
353 *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
354 *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
355 *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
356 }
357 break;
358
359 case 2:
360 {
361 const q7_t *pCoeffsTmp = pCoeffs;
362 const q7_t *pSamplesTmp = pSamples;
363
364 acc0 = 0;
365 acc1 = 0;
366 /*
367 * Save 16 input samples in the history buffer
368 */
369 vst1q(pStateCur, vld1q(pTempSrc));
370 pStateCur += 16;
371 pTempSrc += 16;
372
373 int i = tapsBlkCnt;
374 while (i > 0)
375 {
376 vecCoeffs = *(q7x16_t *) pCoeffsTmp;
377
378 vecIn0 = vld1q(pSamplesTmp);
379 acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
380
381 vecIn0 = vld1q(&pSamplesTmp[4]);
382 acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
383
384 pSamplesTmp += 16;
385 pCoeffsTmp += 16;
386 i--;
387 }
388
389 *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
390 *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
391 }
392 break;
393
394 case 1:
395 {
396 const q7_t *pCoeffsTmp = pCoeffs;
397 const q7_t *pSamplesTmp = pSamples;
398
399 acc0 = 0;
400 /*
401 * Save 16 input samples in the history buffer
402 */
403 vst1q(pStateCur, vld1q(pTempSrc));
404 pStateCur += 16;
405 pTempSrc += 16;
406
407 int i = tapsBlkCnt;
408 while (i > 0)
409 {
410 vecCoeffs = *(q7x16_t *) pCoeffsTmp;
411
412 vecIn0 = vld1q(pSamplesTmp);
413 acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
414
415 pSamplesTmp += 16;
416 pCoeffsTmp += 16;
417 i--;
418 }
419 *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
420 }
421 break;
422 }
423
424 /*
425 * Copy the samples back into the history buffer start
426 */
427 pTempSrc = &pState[blockSize];
428 pTempDest = pState;
429
430 blkCnt = numTaps >> 4;
431 while (blkCnt > 0U)
432 {
433 vst1q(pTempDest, vld1q(pTempSrc));
434 pTempSrc += 16;
435 pTempDest += 16;
436 blkCnt--;
437 }
438 blkCnt = numTaps & 0xF;
439 if (blkCnt > 0U)
440 {
441 mve_pred16_t p0 = vctp8q(blkCnt);
442 vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
443 }
444 }
445 #else
arm_fir_q7(const arm_fir_instance_q7 * S,const q7_t * pSrc,q7_t * pDst,uint32_t blockSize)446 ARM_DSP_ATTRIBUTE void arm_fir_q7(
447 const arm_fir_instance_q7 * S,
448 const q7_t * pSrc,
449 q7_t * pDst,
450 uint32_t blockSize)
451 {
452 q7_t *pState = S->pState; /* State pointer */
453 const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
454 q7_t *pStateCurnt; /* Points to the current sample of the state */
455 q7_t *px; /* Temporary pointer for state buffer */
456 const q7_t *pb; /* Temporary pointer for coefficient buffer */
457 q31_t acc0; /* Accumulators */
458 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
459 uint32_t i, tapCnt, blkCnt; /* Loop counters */
460
461 #if defined (ARM_MATH_LOOPUNROLL)
462 q31_t acc1, acc2, acc3; /* Accumulators */
463 q7_t x0, x1, x2, x3, c0; /* Temporary variables to hold state */
464 #endif
465
466 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
467 /* pStateCurnt points to the location where the new input data should be written */
468 pStateCurnt = &(S->pState[(numTaps - 1U)]);
469
470 #if defined (ARM_MATH_LOOPUNROLL)
471
472 /* Loop unrolling: Compute 4 output values simultaneously.
473 * The variables acc0 ... acc3 hold output values that are being computed:
474 *
475 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
476 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
477 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
478 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
479 */
480 blkCnt = blockSize >> 2U;
481
482 while (blkCnt > 0U)
483 {
484 /* Copy 4 new input samples into the state buffer. */
485 *pStateCurnt++ = *pSrc++;
486 *pStateCurnt++ = *pSrc++;
487 *pStateCurnt++ = *pSrc++;
488 *pStateCurnt++ = *pSrc++;
489
490 /* Set all accumulators to zero */
491 acc0 = 0;
492 acc1 = 0;
493 acc2 = 0;
494 acc3 = 0;
495
496 /* Initialize state pointer */
497 px = pState;
498
499 /* Initialize coefficient pointer */
500 pb = pCoeffs;
501
502 /* Read the first 3 samples from the state buffer:
503 * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
504 x0 = *px++;
505 x1 = *px++;
506 x2 = *px++;
507
508 /* Loop unrolling. Process 4 taps at a time. */
509 tapCnt = numTaps >> 2U;
510
511 /* Loop over the number of taps. Unroll by a factor of 4.
512 Repeat until we've computed numTaps-4 coefficients. */
513 while (tapCnt > 0U)
514 {
515 /* Read the b[numTaps] coefficient */
516 c0 = *pb;
517
518 /* Read x[n-numTaps-3] sample */
519 x3 = *px;
520
521 /* acc0 += b[numTaps] * x[n-numTaps] */
522 acc0 += ((q15_t) x0 * c0);
523
524 /* acc1 += b[numTaps] * x[n-numTaps-1] */
525 acc1 += ((q15_t) x1 * c0);
526
527 /* acc2 += b[numTaps] * x[n-numTaps-2] */
528 acc2 += ((q15_t) x2 * c0);
529
530 /* acc3 += b[numTaps] * x[n-numTaps-3] */
531 acc3 += ((q15_t) x3 * c0);
532
533 /* Read the b[numTaps-1] coefficient */
534 c0 = *(pb + 1U);
535
536 /* Read x[n-numTaps-4] sample */
537 x0 = *(px + 1U);
538
539 /* Perform the multiply-accumulates */
540 acc0 += ((q15_t) x1 * c0);
541 acc1 += ((q15_t) x2 * c0);
542 acc2 += ((q15_t) x3 * c0);
543 acc3 += ((q15_t) x0 * c0);
544
545 /* Read the b[numTaps-2] coefficient */
546 c0 = *(pb + 2U);
547
548 /* Read x[n-numTaps-5] sample */
549 x1 = *(px + 2U);
550
551 /* Perform the multiply-accumulates */
552 acc0 += ((q15_t) x2 * c0);
553 acc1 += ((q15_t) x3 * c0);
554 acc2 += ((q15_t) x0 * c0);
555 acc3 += ((q15_t) x1 * c0);
556
557 /* Read the b[numTaps-3] coefficients */
558 c0 = *(pb + 3U);
559
560 /* Read x[n-numTaps-6] sample */
561 x2 = *(px + 3U);
562
563 /* Perform the multiply-accumulates */
564 acc0 += ((q15_t) x3 * c0);
565 acc1 += ((q15_t) x0 * c0);
566 acc2 += ((q15_t) x1 * c0);
567 acc3 += ((q15_t) x2 * c0);
568
569 /* update coefficient pointer */
570 pb += 4U;
571 px += 4U;
572
573 /* Decrement loop counter */
574 tapCnt--;
575 }
576
577 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
578 tapCnt = numTaps % 0x4U;
579
580 while (tapCnt > 0U)
581 {
582 /* Read coefficients */
583 c0 = *(pb++);
584
585 /* Fetch 1 state variable */
586 x3 = *(px++);
587
588 /* Perform the multiply-accumulates */
589 acc0 += ((q15_t) x0 * c0);
590 acc1 += ((q15_t) x1 * c0);
591 acc2 += ((q15_t) x2 * c0);
592 acc3 += ((q15_t) x3 * c0);
593
594 /* Reuse the present sample states for next sample */
595 x0 = x1;
596 x1 = x2;
597 x2 = x3;
598
599 /* Decrement loop counter */
600 tapCnt--;
601 }
602
603 /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31
604 Then store the 4 outputs in the destination buffer. */
605 acc0 = __SSAT((acc0 >> 7U), 8);
606 *pDst++ = acc0;
607 acc1 = __SSAT((acc1 >> 7U), 8);
608 *pDst++ = acc1;
609 acc2 = __SSAT((acc2 >> 7U), 8);
610 *pDst++ = acc2;
611 acc3 = __SSAT((acc3 >> 7U), 8);
612 *pDst++ = acc3;
613
614 /* Advance the state pointer by 4 to process the next group of 4 samples */
615 pState = pState + 4U;
616
617 /* Decrement loop counter */
618 blkCnt--;
619 }
620
621 /* Loop unrolling: Compute remaining output samples */
622 blkCnt = blockSize % 0x4U;
623
624 #else
625
626 /* Initialize blkCnt with number of taps */
627 blkCnt = blockSize;
628
629 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
630
631 while (blkCnt > 0U)
632 {
633 /* Copy one sample at a time into state buffer */
634 *pStateCurnt++ = *pSrc++;
635
636 /* Set the accumulator to zero */
637 acc0 = 0;
638
639 /* Initialize state pointer */
640 px = pState;
641
642 /* Initialize Coefficient pointer */
643 pb = pCoeffs;
644
645 i = numTaps;
646
647 /* Perform the multiply-accumulates */
648 while (i > 0U)
649 {
650 acc0 += (q15_t) * (px++) * (*(pb++));
651 i--;
652 }
653
654 /* The result is in 2.14 format. Convert to 1.7
655 Then store the output in the destination buffer. */
656 *pDst++ = __SSAT((acc0 >> 7U), 8);
657
658 /* Advance state pointer by 1 for the next sample */
659 pState = pState + 1U;
660
661 /* Decrement loop counter */
662 blkCnt--;
663 }
664
665 /* Processing is complete.
666 Now copy the last numTaps - 1 samples to the start of the state buffer.
667 This prepares the state buffer for the next function call. */
668
669 /* Points to the start of the state buffer */
670 pStateCurnt = S->pState;
671
672 #if defined (ARM_MATH_LOOPUNROLL)
673
674 /* Loop unrolling: Compute 4 taps at a time */
675 tapCnt = (numTaps - 1U) >> 2U;
676
677 /* Copy data */
678 while (tapCnt > 0U)
679 {
680 *pStateCurnt++ = *pState++;
681 *pStateCurnt++ = *pState++;
682 *pStateCurnt++ = *pState++;
683 *pStateCurnt++ = *pState++;
684
685 /* Decrement loop counter */
686 tapCnt--;
687 }
688
689 /* Calculate remaining number of copies */
690 tapCnt = (numTaps - 1U) % 0x4U;
691
692 #else
693
694 /* Initialize tapCnt with number of taps */
695 tapCnt = (numTaps - 1U);
696
697 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
698
699 /* Copy remaining data */
700 while (tapCnt > 0U)
701 {
702 *pStateCurnt++ = *pState++;
703
704 /* Decrement the loop counter */
705 tapCnt--;
706 }
707
708 }
709 #endif /* defined(ARM_MATH_MVEI) */
710
711 /**
712 @} end of FIR group
713 */
714