1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_q15.c
4 * Description: Q15 FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup FIR
37 @{
38 */
39
40 /**
41 @brief Processing function for the Q15 FIR filter.
42 @param[in] S points to an instance of the Q15 FIR filter structure
43 @param[in] pSrc points to the block of input data
44 @param[out] pDst points to the block of output data
45 @param[in] blockSize number of samples to process
46
47 @par Scaling and Overflow Behavior
48 The function is implemented using a 64-bit internal accumulator.
49 Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
50 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
51 There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
52 After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
53 Lastly, the accumulator is saturated to yield a result in 1.15 format.
54
55 @remark
56 Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function.
57 */
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59
/*
 * Reduce a 64-bit accumulator to the 16-bit result stored in the output buffer.
 * sqrshrl_sat48() is called with a count of -(32 - shift); bits 32 and above of
 * its result are kept and masked to 32 bits. Every use in this file passes
 * shift == 15.
 * NOTE(review): this relies on a negative count making the saturating shift go
 * left - confirm against the MVE intrinsic reference.
 * Both arguments are parenthesised so that expression arguments expand safely.
 */
#define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48((acc), -(32 - (shift))) >> 32) & 0xffffffff)
61
62
/*
 * Compute nbAcc consecutive filter outputs and append them to pOutput.
 *
 *   pOutput    output cursor; advanced by one element per output written
 *   nbAcc      number of outputs to produce
 *   nbVecTaps  number of 8-lane coefficient vectors in vecCoeffs
 *   pSample    first state sample of output 0; output j starts at pSample[j]
 *   vecCoeffs  array of nbVecTaps coefficient vectors
 *
 * A variable named vecIn0 (q15x8_t) must be in scope at the expansion site;
 * it is used as the scratch vector for the sample loads.
 * Each output uses one scalar 64-bit accumulator, so nbAcc is not limited by
 * the size of any local array.
 */
#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)            \
    for (int j = 0; j < (nbAcc); j++) {                                        \
        const q15_t *pSmp = &(pSample)[j];                                     \
        q63_t        acc = 0;                                                  \
                                                                               \
        for (int i = 0; i < (nbVecTaps); i++) {                                \
            vecIn0 = vld1q(pSmp + 8 * i);                                      \
            acc = vmlaldavaq(acc, vecIn0, (vecCoeffs)[i]);                     \
        }                                                                      \
        *(pOutput)++ = (q15_t) MVE_ASRL_SAT16(acc, 15);                        \
    }
75
/*
 * Function body shared by the fixed-length MVE kernels.
 *
 * The expansion site must provide S, pSrc, pDst and blockSize (the enclosing
 * function's parameters) and define NBVECTAPS as the number of 8-lane
 * coefficient vectors. NBVECTAPS * 8 coefficients are loaded from S->pCoeffs
 * up front, whatever S->numTaps is.
 */
#define FIR_Q15_MAIN_CORE()                                                                \
{                                                                                          \
    q15_t          *stateBase = S->pState;   /* start of the state buffer */               \
    const q15_t    *coefBase = S->pCoeffs;   /* start of the coefficient array */          \
    uint32_t        tapCount = S->numTaps;   /* number of filter coefficients */           \
    q15_t          *stateWr;                 /* where the next new sample is stored */     \
    const q15_t    *window;                  /* first state sample of the next output */   \
    const q15_t    *inPtr;                   /* read cursor: input first, state later */   \
    q15_t          *outPtr;                  /* write cursor into the output block */      \
    q15_t          *histDst;                 /* write cursor for the history copy */       \
    int32_t         loopCnt;                                                               \
    int32_t         tailCnt;                                                               \
    q15x8_t         vecIn0;                  /* scratch vector used by FIR_Q15_CORE */     \
    q15x8_t         coefVecs[NBVECTAPS];                                                   \
                                                                                           \
    /* Load every coefficient vector once */                                               \
    for (int v = 0; v < NBVECTAPS; v++) {                                                  \
        coefVecs[v] = vldrhq_s16(coefBase + 8 * v);                                        \
    }                                                                                      \
                                                                                           \
    /*                                                                                     \
     * The state buffer starts with (tapCount - 1) samples of the previous frame;          \
     * new input is appended right after them.                                             \
     */                                                                                    \
    stateWr = &(stateBase[(tapCount - 1u)]);                                               \
    inPtr = pSrc;                                                                          \
    window = stateBase;                                                                    \
    outPtr = pDst;                                                                         \
                                                                                           \
    /* Groups of four samples */                                                           \
    for (loopCnt = blockSize >> 2; loopCnt > 0; loopCnt--) {                               \
        /* Append 4 input samples to the history buffer */                                 \
        vstrhq_s32(stateWr, vldrhq_s32(inPtr));                                            \
        stateWr += 4;                                                                      \
        inPtr += 4;                                                                        \
                                                                                           \
        FIR_Q15_CORE(outPtr, 4, NBVECTAPS, window, coefVecs);                              \
        window += 4;                                                                       \
    }                                                                                      \
                                                                                           \
    /* Up to three trailing samples, appended one at a time */                             \
    tailCnt = blockSize & 3;                                                               \
    for (int k = 0; k < tailCnt; k++) {                                                    \
        *stateWr++ = *inPtr++;                                                             \
    }                                                                                      \
                                                                                           \
    FIR_Q15_CORE(outPtr, tailCnt, NBVECTAPS, window, coefVecs);                            \
                                                                                           \
    /* Move the last (tapCount - 1) samples back to the start of the state buffer */       \
    inPtr = &stateBase[blockSize];                                                         \
    histDst = stateBase;                                                                   \
                                                                                           \
    for (loopCnt = (tapCount - 1) >> 3; loopCnt > 0; loopCnt--) {                          \
        vstrhq_s16(histDst, vldrhq_s16(inPtr));                                            \
        inPtr += 8;                                                                        \
        histDst += 8;                                                                      \
    }                                                                                      \
                                                                                           \
    /* Fewer than 8 samples left: predicated load and store touch only those lanes */      \
    loopCnt = (tapCount - 1) & 7;                                                          \
    if (loopCnt > 0) {                                                                     \
        mve_pred16_t tailPred = vctp16q(loopCnt);                                          \
        vstrhq_p_s16(histDst, vldrhq_z_s16(inPtr, tailPred), tailPred);                    \
    }                                                                                      \
}
151
/*
 * MVE kernel selected by arm_fir_q15() when the tap count rounds up to 32.
 * FIR_Q15_MAIN_CORE is expanded with four 8-lane coefficient vectors, so 32
 * entries of S->pCoeffs are read.
 */
static void arm_fir_q15_25_32_mve(
    const arm_fir_instance_q15 * S,
    const q15_t * __restrict pSrc,
    q15_t * __restrict pDst,
    uint32_t blockSize)
{
#define NBTAPS 32
#define NBVECTAPS (NBTAPS / 8)
    FIR_Q15_MAIN_CORE();
#undef NBTAPS
#undef NBVECTAPS
}
162
/*
 * MVE kernel selected by arm_fir_q15() when the tap count rounds up to 24.
 * FIR_Q15_MAIN_CORE is expanded with three 8-lane coefficient vectors, so 24
 * entries of S->pCoeffs are read.
 */
static void arm_fir_q15_17_24_mve(
    const arm_fir_instance_q15 * S,
    const q15_t * __restrict pSrc,
    q15_t * __restrict pDst,
    uint32_t blockSize)
{
#define NBTAPS 24
#define NBVECTAPS (NBTAPS / 8)
    FIR_Q15_MAIN_CORE();
#undef NBTAPS
#undef NBVECTAPS
}
173
174
/*
 * MVE kernel selected by arm_fir_q15() when the tap count rounds up to 16.
 * FIR_Q15_MAIN_CORE is expanded with two 8-lane coefficient vectors, so 16
 * entries of S->pCoeffs are read.
 */
static void arm_fir_q15_9_16_mve(
    const arm_fir_instance_q15 * S,
    const q15_t * __restrict pSrc,
    q15_t * __restrict pDst,
    uint32_t blockSize)
{
#define NBTAPS 16
#define NBVECTAPS (NBTAPS / 8)
    FIR_Q15_MAIN_CORE();
#undef NBTAPS
#undef NBVECTAPS
}
185
/*
 * MVE kernel selected by arm_fir_q15() when the tap count rounds up to 8.
 * FIR_Q15_MAIN_CORE is expanded with a single 8-lane coefficient vector, so 8
 * entries of S->pCoeffs are read.
 */
static void arm_fir_q15_1_8_mve(
    const arm_fir_instance_q15 * S,
    const q15_t * __restrict pSrc,
    q15_t * __restrict pDst,
    uint32_t blockSize)
{
#define NBTAPS 8
#define NBVECTAPS (NBTAPS / 8)
    FIR_Q15_MAIN_CORE();
#undef NBTAPS
#undef NBVECTAPS
}
196
197
arm_fir_q15(const arm_fir_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)198 ARM_DSP_ATTRIBUTE void arm_fir_q15(
199 const arm_fir_instance_q15 * S,
200 const q15_t * pSrc,
201 q15_t * pDst,
202 uint32_t blockSize)
203 {
204 q15_t *pState = S->pState; /* State pointer */
205 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
206 q15_t *pStateCur; /* Points to the current sample of the state */
207 const q15_t *pSamples; /* Temporary pointer to the sample buffer */
208 q15_t *pOutput; /* Temporary pointer to the output buffer */
209 const q15_t *pTempSrc; /* Temporary pointer to the source data */
210 q15_t *pTempDest; /* Temporary pointer to the destination buffer */
211 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
212 uint32_t blkCnt;
213 q15x8_t vecIn0;
214 uint32_t tapsBlkCnt = (numTaps + 7) / 8;
215 q63_t acc0, acc1, acc2, acc3;
216
217
218 int32_t nbTaps = (numTaps + 7) >> 3;
219
220 switch(nbTaps) {
221
222 case 1:
223 arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
224 return;
225 case 2:
226 arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
227 return;
228 case 3:
229 arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
230 return;
231 case 4:
232 arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
233 return;
234 }
235 /*
236 * pState points to state array which contains previous frame (numTaps - 1) samples
237 * pStateCur points to the location where the new input data should be written
238 */
239 pStateCur = &(pState[(numTaps - 1u)]);
240 pTempSrc = pSrc;
241 pSamples = pState;
242 pOutput = pDst;
243 blkCnt = blockSize >> 2;
244
245 while (blkCnt > 0U)
246 {
247 const q15_t *pCoeffsTmp = pCoeffs;
248 const q15_t *pSamplesTmp = pSamples;
249
250 acc0 = 0LL;
251 acc1 = 0LL;
252 acc2 = 0LL;
253 acc3 = 0LL;
254
255 /*
256 * Save 8 input samples in the history buffer
257 */
258 vst1q(pStateCur, vld1q(pTempSrc));
259 pStateCur += 8;
260 pTempSrc += 8;
261
262 int i = tapsBlkCnt;
263 while (i > 0)
264 {
265 /*
266 * load 8 coefs
267 */
268 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
269
270 vecIn0 = vld1q(pSamplesTmp);
271 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
272
273 vecIn0 = vld1q(&pSamplesTmp[1]);
274 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
275
276 vecIn0 = vld1q(&pSamplesTmp[2]);
277 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
278
279 vecIn0 = vld1q(&pSamplesTmp[3]);
280 acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
281
282 pSamplesTmp += 8;
283 pCoeffsTmp += 8;
284 /*
285 * Decrement the taps block loop counter
286 */
287 i--;
288 }
289
290 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
291 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
292 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
293 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
294
295 pSamples += 4;
296 /*
297 * Decrement the sample block loop counter
298 */
299 blkCnt--;
300 }
301
302 uint32_t residual = blockSize & 3;
303 switch (residual)
304 {
305 case 3:
306 {
307 const q15_t *pCoeffsTmp = pCoeffs;
308 const q15_t *pSamplesTmp = pSamples;
309
310 acc0 = 0LL;
311 acc1 = 0LL;
312 acc2 = 0LL;
313
314 /*
315 * Save 8 input samples in the history buffer
316 */
317 *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
318 pStateCur += 8;
319 pTempSrc += 8;
320
321 int i = tapsBlkCnt;
322 while (i > 0)
323 {
324 /*
325 * load 8 coefs
326 */
327 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
328
329 vecIn0 = vld1q(pSamplesTmp);
330 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
331
332 vecIn0 = vld1q(&pSamplesTmp[2]);
333 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
334
335 vecIn0 = vld1q(&pSamplesTmp[4]);
336 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
337
338 pSamplesTmp += 8;
339 pCoeffsTmp += 8;
340 /*
341 * Decrement the taps block loop counter
342 */
343 i--;
344 }
345
346 acc0 = asrl(acc0, 15);
347 acc1 = asrl(acc1, 15);
348 acc2 = asrl(acc2, 15);
349
350 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
351 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
352 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
353 }
354 break;
355
356 case 2:
357 {
358 const q15_t *pCoeffsTmp = pCoeffs;
359 const q15_t *pSamplesTmp = pSamples;
360
361 acc0 = 0LL;
362 acc1 = 0LL;
363 /*
364 * Save 8 input samples in the history buffer
365 */
366 vst1q(pStateCur, vld1q(pTempSrc));
367 pStateCur += 8;
368 pTempSrc += 8;
369
370 int i = tapsBlkCnt;
371 while (i > 0)
372 {
373 /*
374 * load 8 coefs
375 */
376 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
377
378 vecIn0 = vld1q(pSamplesTmp);
379 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
380
381 vecIn0 = vld1q(&pSamplesTmp[2]);
382 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
383
384 pSamplesTmp += 8;
385 pCoeffsTmp += 8;
386 /*
387 * Decrement the taps block loop counter
388 */
389 i--;
390 }
391
392 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
393 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
394 }
395 break;
396
397 case 1:
398 {
399 const q15_t *pCoeffsTmp = pCoeffs;
400 const q15_t *pSamplesTmp = pSamples;
401
402 acc0 = 0LL;
403
404 /*
405 * Save 8 input samples in the history buffer
406 */
407 vst1q(pStateCur, vld1q(pTempSrc));
408 pStateCur += 8;
409 pTempSrc += 8;
410
411 int i = tapsBlkCnt;
412 while (i > 0)
413 {
414 /*
415 * load 8 coefs
416 */
417 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
418
419 vecIn0 = vld1q(pSamplesTmp);
420 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
421
422 pSamplesTmp += 8;
423 pCoeffsTmp += 8;
424 /*
425 * Decrement the taps block loop counter
426 */
427 i--;
428 }
429
430 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
431 }
432 break;
433 }
434
435 /*
436 * Copy the samples back into the history buffer start
437 */
438 pTempSrc = &pState[blockSize];
439 pTempDest = pState;
440
441 blkCnt = numTaps >> 3;
442 while (blkCnt > 0U)
443 {
444 vst1q(pTempDest, vld1q(pTempSrc));
445 pTempSrc += 8;
446 pTempDest += 8;
447 blkCnt--;
448 }
449 blkCnt = numTaps & 7;
450 if (blkCnt > 0U)
451 {
452 mve_pred16_t p0 = vctp16q(blkCnt);
453 vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
454 }
455 }
456
457 #else
arm_fir_q15(const arm_fir_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)458 ARM_DSP_ATTRIBUTE void arm_fir_q15(
459 const arm_fir_instance_q15 * S,
460 const q15_t * pSrc,
461 q15_t * pDst,
462 uint32_t blockSize)
463 {
464 q15_t *pState = S->pState; /* State pointer */
465 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
466 q15_t *pStateCurnt; /* Points to the current sample of the state */
467 q15_t *px; /* Temporary pointer for state buffer */
468 const q15_t *pb; /* Temporary pointer for coefficient buffer */
469 q63_t acc0; /* Accumulators */
470 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
471 uint32_t tapCnt, blkCnt; /* Loop counters */
472
473 #if defined (ARM_MATH_LOOPUNROLL)
474 q63_t acc1, acc2, acc3; /* Accumulators */
475 q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
476 #endif
477
478 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
479 /* pStateCurnt points to the location where the new input data should be written */
480 pStateCurnt = &(S->pState[(numTaps - 1U)]);
481
482 #if defined (ARM_MATH_LOOPUNROLL)
483
484 /* Loop unrolling: Compute 4 output values simultaneously.
485 * The variables acc0 ... acc3 hold output values that are being computed:
486 *
487 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
488 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
489 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
490 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
491 */
492 blkCnt = blockSize >> 2U;
493
494 while (blkCnt > 0U)
495 {
496 /* Copy 4 new input samples into the state buffer. */
497 *pStateCurnt++ = *pSrc++;
498 *pStateCurnt++ = *pSrc++;
499 *pStateCurnt++ = *pSrc++;
500 *pStateCurnt++ = *pSrc++;
501
502 /* Set all accumulators to zero */
503 acc0 = 0;
504 acc1 = 0;
505 acc2 = 0;
506 acc3 = 0;
507
508 /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
509 px = pState;
510
511 /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
512 pb = pCoeffs;
513
514 /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
515 x0 = read_q15x2_ia (&px);
516
517 /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
518 x2 = read_q15x2_ia (&px);
519
520 /* Loop over the number of taps. Unroll by a factor of 4.
521 Repeat until we've computed numTaps-(numTaps%4) coefficients. */
522 tapCnt = numTaps >> 2U;
523
524 while (tapCnt > 0U)
525 {
526 /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
527 c0 = read_q15x2_ia (&pb);
528
529 /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
530 acc0 = __SMLALD(x0, c0, acc0);
531
532 /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
533 acc2 = __SMLALD(x2, c0, acc2);
534
535 /* pack x[n-N-1] and x[n-N-2] */
536 #ifndef ARM_MATH_BIG_ENDIAN
537 x1 = __PKHBT(x2, x0, 0);
538 #else
539 x1 = __PKHBT(x0, x2, 0);
540 #endif
541
542 /* Read state x[n-N-4], x[n-N-5] */
543 x0 = read_q15x2_ia (&px);
544
545 /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
546 acc1 = __SMLALDX(x1, c0, acc1);
547
548 /* pack x[n-N-3] and x[n-N-4] */
549 #ifndef ARM_MATH_BIG_ENDIAN
550 x1 = __PKHBT(x0, x2, 0);
551 #else
552 x1 = __PKHBT(x2, x0, 0);
553 #endif
554
555 /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
556 acc3 = __SMLALDX(x1, c0, acc3);
557
558 /* Read coefficients b[N-2], b[N-3] */
559 c0 = read_q15x2_ia (&pb);
560
561 /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
562 acc0 = __SMLALD(x2, c0, acc0);
563
564 /* Read state x[n-N-6], x[n-N-7] with offset */
565 x2 = read_q15x2_ia (&px);
566
567 /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
568 acc2 = __SMLALD(x0, c0, acc2);
569
570 /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
571 acc1 = __SMLALDX(x1, c0, acc1);
572
573 /* pack x[n-N-5] and x[n-N-6] */
574 #ifndef ARM_MATH_BIG_ENDIAN
575 x1 = __PKHBT(x2, x0, 0);
576 #else
577 x1 = __PKHBT(x0, x2, 0);
578 #endif
579
580 /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
581 acc3 = __SMLALDX(x1, c0, acc3);
582
583 /* Decrement tap count */
584 tapCnt--;
585 }
586
587 /* If the filter length is not a multiple of 4, compute the remaining filter taps.
588 This is always be 2 taps since the filter length is even. */
589 if ((numTaps & 0x3U) != 0U)
590 {
591 /* Read last two coefficients */
592 c0 = read_q15x2_ia (&pb);
593
594 /* Perform the multiply-accumulates */
595 acc0 = __SMLALD(x0, c0, acc0);
596 acc2 = __SMLALD(x2, c0, acc2);
597
598 /* pack state variables */
599 #ifndef ARM_MATH_BIG_ENDIAN
600 x1 = __PKHBT(x2, x0, 0);
601 #else
602 x1 = __PKHBT(x0, x2, 0);
603 #endif
604
605 /* Read last state variables */
606 x0 = read_q15x2 (px);
607
608 /* Perform the multiply-accumulates */
609 acc1 = __SMLALDX(x1, c0, acc1);
610
611 /* pack state variables */
612 #ifndef ARM_MATH_BIG_ENDIAN
613 x1 = __PKHBT(x0, x2, 0);
614 #else
615 x1 = __PKHBT(x2, x0, 0);
616 #endif
617
618 /* Perform the multiply-accumulates */
619 acc3 = __SMLALDX(x1, c0, acc3);
620 }
621
622 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
623 Then store the 4 outputs in the destination buffer. */
624 #ifndef ARM_MATH_BIG_ENDIAN
625 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
626 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
627 #else
628 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
629 write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
630 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
631
632 /* Advance the state pointer by 4 to process the next group of 4 samples */
633 pState = pState + 4U;
634
635 /* Decrement loop counter */
636 blkCnt--;
637 }
638
639 /* Loop unrolling: Compute remaining output samples */
640 blkCnt = blockSize % 0x4U;
641
642 #else
643
644 /* Initialize blkCnt with number of taps */
645 blkCnt = blockSize;
646
647 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
648
649 while (blkCnt > 0U)
650 {
651 /* Copy two samples into state buffer */
652 *pStateCurnt++ = *pSrc++;
653
654 /* Set the accumulator to zero */
655 acc0 = 0;
656
657 /* Use SIMD to hold states and coefficients */
658 px = pState;
659 pb = pCoeffs;
660
661 tapCnt = numTaps >> 1U;
662
663 while (tapCnt > 0U)
664 {
665 acc0 += (q31_t) *px++ * *pb++;
666 acc0 += (q31_t) *px++ * *pb++;
667
668 tapCnt--;
669 }
670
671
672 /* The result is in 2.30 format. Convert to 1.15 with saturation.
673 Then store the output in the destination buffer. */
674 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
675
676 /* Advance state pointer by 1 for the next sample */
677 pState = pState + 1U;
678
679 /* Decrement loop counter */
680 blkCnt--;
681 }
682
683 /* Processing is complete.
684 Now copy the last numTaps - 1 samples to the start of the state buffer.
685 This prepares the state buffer for the next function call. */
686
687 /* Points to the start of the state buffer */
688 pStateCurnt = S->pState;
689
690 #if defined (ARM_MATH_LOOPUNROLL)
691
692 /* Loop unrolling: Compute 4 taps at a time */
693 tapCnt = (numTaps - 1U) >> 2U;
694
695 /* Copy data */
696 while (tapCnt > 0U)
697 {
698 *pStateCurnt++ = *pState++;
699 *pStateCurnt++ = *pState++;
700 *pStateCurnt++ = *pState++;
701 *pStateCurnt++ = *pState++;
702
703 /* Decrement loop counter */
704 tapCnt--;
705 }
706
707 /* Calculate remaining number of copies */
708 tapCnt = (numTaps - 1U) % 0x4U;
709
710 #else
711
712 /* Initialize tapCnt with number of taps */
713 tapCnt = (numTaps - 1U);
714
715 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
716
717 /* Copy remaining data */
718 while (tapCnt > 0U)
719 {
720 *pStateCurnt++ = *pState++;
721
722 /* Decrement loop counter */
723 tapCnt--;
724 }
725
726 }
727 #endif /* defined(ARM_MATH_MVEI) */
728
729 /**
730 @} end of FIR group
731 */
732