1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_fir_q31.c
4 * Description: Q31 FIR filter processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31
32 /**
33 @ingroup groupFilters
34 */
35
36 /**
37 @addtogroup FIR
38 @{
39 */
40
41 /**
42 @brief Processing function for Q31 FIR filter.
43 @param[in] S points to an instance of the Q31 FIR filter structure
44 @param[in] pSrc points to the block of input data
45 @param[out] pDst points to the block of output data
46 @param[in] blockSize number of samples to process
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using an internal 64-bit accumulator.
50 The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
                   Thus, if the accumulator result overflows, it wraps around rather than being clipped.
52 In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
                   After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and truncated to 1.31 format to yield the final result (the implementation casts the shifted value to q31_t; no saturation is applied).
54
55 @remark
56 Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter.
57 */
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59
60 #include "arm_helium_utils.h"
61
62
/*
 * Compute nbAcc consecutive FIR outputs.
 *
 * Output j is the sum over i in [0, nbVecTaps) of the 4-lane product of the
 * samples at (pSample + j + 4 * i) with vecCoeffs[i], accumulated with
 * vrmlaldavhaq.  The accumulator is shifted right by 23 with asrl and cast
 * to q31_t before being stored.
 *
 * nbAcc     : number of outputs to produce
 * nbVecTaps : number of 4-lane coefficient vectors
 * pSample   : first sample of the window of output 0
 * vecCoeffs : array of at least nbVecTaps q31x4_t coefficient vectors
 *
 * The macro writes through, and advances, the pOutput pointer of the
 * enclosing scope.
 */
#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs)                 \
        for (int j = 0; j < (nbAcc); j++) {                                \
            const q31_t    *pSmp = &(pSample)[j];                          \
            q63_t           acc = 0;                                       \
                                                                           \
            for (int i = 0; i < (nbVecTaps); i++) {                        \
                q31x4_t     vecIn0 = vld1q(pSmp + 4 * i);                  \
                acc = vrmlaldavhaq(acc, vecIn0, (vecCoeffs)[i]);           \
            }                                                              \
            *pOutput++ = (q31_t)asrl(acc, 23);                             \
        }
76
77
/*
 * Compute nbAcc partial FIR sums and store them unshifted.
 *
 * Partial sum j accumulates, with vrmlaldavhaq, the 4-lane products of the
 * samples at (pSample + j + 4 * i) with vecCoeffs[i] for i in
 * [0, nbVecTaps).  The raw 64-bit accumulator is then written through
 * arm_fir_partial_accu_ptr, which is advanced by one element per output.
 *
 * The macro relies on two names from the enclosing scope: an array `acc`
 * with at least nbAcc q63_t elements, and the q63_t pointer
 * `arm_fir_partial_accu_ptr`.
 */
#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)     \
        for (int j = 0; j < (nbAcc); j++) {                                \
            const q31_t    *pSmp = &(pSample)[j];                          \
            q31x4_t         vecIn0;                                        \
                                                                           \
            acc[j] = 0;                                                    \
            for (int i = 0; i < (nbVecTaps); i++) {                        \
                vecIn0 = vld1q(pSmp + 4 * i);                              \
                acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);     \
            }                                                              \
            *arm_fir_partial_accu_ptr++ = acc[j];                          \
        }
90
91
/*
 * Finish nbAcc FIR outputs starting from previously stored partial sums.
 *
 * Accumulator j is reloaded through arm_fir_partial_accu_ptr (advanced by
 * one element per output), then the 4-lane products of the samples at
 * (pSample + j + 4 * i) with vecCoeffs[i], i in [0, nbVecTaps), are added
 * with vrmlaldavhaq.  The result is shifted right by 23 with asrl, cast to
 * q31_t and stored through pOutput.
 *
 * The macro relies on three names from the enclosing scope: an array `acc`
 * with at least nbAcc q63_t elements, the q63_t pointer
 * `arm_fir_partial_accu_ptr`, and the q31_t output pointer `pOutput`.
 */
#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)      \
        for (int j = 0; j < (nbAcc); j++) {                                \
            const q31_t    *pSmp = &(pSample)[j];                          \
            q31x4_t         vecIn0;                                        \
                                                                           \
            acc[j] = *arm_fir_partial_accu_ptr++;                          \
                                                                           \
            for (int i = 0; i < (nbVecTaps); i++) {                        \
                vecIn0 = vld1q(pSmp + 4 * i);                              \
                acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);     \
            }                                                              \
            *pOutput++ = (q31_t)asrl(acc[j], 23);                          \
        }
105
106
/*
 * Shared body of the fixed-length MVE kernels.
 *
 * Expects S, pSrc, pDst and blockSize (the parameters of the enclosing
 * function) to be in scope, and NBVECTAPS to be defined as the number of
 * 4-lane coefficient vectors to use.  NBVECTAPS * 4 coefficients are loaded
 * from S->pCoeffs regardless of S->numTaps.
 *
 * The history window starts 2 * ARM_ROUND_UP(blockSize, 4) words after
 * S->pState.  Processing steps:
 *   1. groups of 4 inputs are appended to the window and 4 outputs are
 *      produced per group with FIR_Q31_CORE;
 *   2. the remaining (blockSize & 3) inputs are appended one by one and
 *      their outputs are produced the same way;
 *   3. the last (numTaps - 1) samples are moved to the start of the window.
 */
#define FIR_Q31_MAIN_CORE()                                                           \
{                                                                                     \
    /* Start of the sample history inside the state buffer */                         \
    q31_t          *pState = S->pState + 2*ARM_ROUND_UP(blockSize, 4);                \
    const q31_t    *pCoeffs = S->pCoeffs;                                             \
    uint32_t        numTaps = S->numTaps;                                             \
    /* Next free slot of the history: right after the (numTaps - 1) old samples */    \
    q31_t          *pStateCur = &(pState[(numTaps - 1u)]);                            \
    const q31_t    *pTempSrc = pSrc;                                                  \
    /* Window start of the next output; read by FIR_Q31_CORE */                       \
    const q31_t    *pSamples = pState;                                                \
    /* Output cursor; advanced by FIR_Q31_CORE */                                     \
    q31_t          *pOutput = pDst;                                                   \
    q31_t          *pTempDest;                                                        \
    int32_t         blkCnt;                                                           \
    int32_t         residual = blockSize & 3;                                         \
    q31x4_t         vecCoeffs[NBVECTAPS];                                             \
                                                                                      \
    /* Keep all coefficient vectors in local storage */                               \
    for (int k = 0; k < NBVECTAPS; k++) {                                             \
        vecCoeffs[k] = vld1q(pCoeffs + 4 * k);                                        \
    }                                                                                 \
                                                                                      \
    /* Full groups of 4 samples */                                                    \
    for (blkCnt = blockSize >> 2; blkCnt > 0; blkCnt--) {                             \
        /* Append 4 new inputs to the history */                                      \
        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));                                  \
        pStateCur += 4;                                                               \
        pTempSrc += 4;                                                                \
                                                                                      \
        FIR_Q31_CORE(4, NBVECTAPS, pSamples, vecCoeffs);                              \
                                                                                      \
        pSamples += 4;                                                                \
    }                                                                                 \
                                                                                      \
    /* Tail: 1 to 3 leftover samples, appended one at a time */                       \
    if (residual != 0) {                                                              \
        for (int k = 0; k < residual; k++) {                                          \
            *pStateCur++ = *pTempSrc++;                                               \
        }                                                                             \
                                                                                      \
        FIR_Q31_CORE(residual, NBVECTAPS, pSamples, vecCoeffs);                       \
    }                                                                                 \
                                                                                      \
    /* Move the last (numTaps - 1) samples to the start of the history */             \
    pTempSrc = &pState[blockSize];                                                    \
    pTempDest = pState;                                                               \
                                                                                      \
    for (blkCnt = (numTaps - 1) >> 2; blkCnt > 0; blkCnt--) {                         \
        vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc));                                  \
        pTempSrc += 4;                                                                \
        pTempDest += 4;                                                               \
    }                                                                                 \
                                                                                      \
    /* Fewer than 4 samples left: predicated copy of the active lanes only */         \
    blkCnt = (numTaps - 1) & 3;                                                       \
    if (blkCnt > 0) {                                                                 \
        mve_pred16_t    p0 = vctp32q(blkCnt);                                         \
        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0);                      \
    }                                                                                 \
}
207
/*
 * MVE kernel used by arm_fir_q31() when numTaps <= 4.
 *
 * A single 4-lane coefficient vector is loaded from S->pCoeffs, so four
 * coefficients are read regardless of numTaps.
 *
 * S         : filter instance (pState, pCoeffs and numTaps are read)
 * pSrc      : blockSize input samples
 * pDst      : blockSize output samples
 * blockSize : number of samples to process
 *
 * The sample history starts 2 * ARM_ROUND_UP(blockSize, 4) words after
 * S->pState.  Only blockSize samples are read from pSrc and appended to the
 * history: the 1 to 3 samples of a partial last group are copied one by
 * one instead of with a full vector load/store.
 */
static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    q31_t          *pState = S->pState + 2*ARM_ROUND_UP(blockSize, 4); /* Start of the sample history */
    const q31_t    *pCoeffs = S->pCoeffs;       /* Coefficient pointer */
    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
    q31_t          *pStateCur;                  /* Where the next input sample is stored */
    const q31_t    *pSamples;                   /* Window start of the next output */
    q31_t          *pOutput;                    /* Output cursor */
    const q31_t    *pTempSrc;                   /* Source cursor */
    q31_t          *pTempDest;                  /* Destination cursor of the final history move */
    uint32_t        blkCnt;
    uint32_t        residual = blockSize & 3;
    q31x4_t         vecIn0;
    q31x4_t         vecCoeffs;
    q63_t           acc0, acc1, acc2, acc3;

    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pTempSrc = pSrc;
    pSamples = pState;
    pOutput = pDst;

    /*
     * load 4 coefs
     */
    vecCoeffs = vld1q(pCoeffs);

    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * Save 4 input samples in the history buffer
         */
        vst1q(pStateCur, vld1q(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;

        vecIn0 = vld1q(pSamples);
        acc0 = vrmlaldavhq(vecIn0, vecCoeffs);

        vecIn0 = vld1q(&pSamples[1]);
        acc1 = vrmlaldavhq(vecIn0, vecCoeffs);

        vecIn0 = vld1q(&pSamples[2]);
        acc2 = vrmlaldavhq(vecIn0, vecCoeffs);

        vecIn0 = vld1q(&pSamples[3]);
        acc3 = vrmlaldavhq(vecIn0, vecCoeffs);

        *pOutput++ = (q31_t) asrl(acc0, 23);
        *pOutput++ = (q31_t) asrl(acc1, 23);
        *pOutput++ = (q31_t) asrl(acc2, 23);
        *pOutput++ = (q31_t) asrl(acc3, 23);

        pSamples += 4;
        /*
         * Decrement the sample block loop counter
         */
        blkCnt--;
    }

    /*
     * Tail: 1 to 3 samples are left.  Copy exactly that many so that
     * nothing is read past the end of pSrc or written past the history.
     */
    for (uint32_t i = 0; i < residual; i++)
    {
        *pStateCur++ = *pTempSrc++;
    }

    for (uint32_t j = 0; j < residual; j++)
    {
        vecIn0 = vld1q(&pSamples[j]);
        acc0 = vrmlaldavhq(vecIn0, vecCoeffs);

        *pOutput++ = (q31_t) asrl(acc0, 23);
    }

    /*
     * Copy the samples back into the history buffer start
     */
    pTempSrc = &pState[blockSize];
    pTempDest = pState;

    blkCnt = (numTaps - 1) >> 2;
    while (blkCnt > 0U)
    {
        vst1q(pTempDest, vld1q(pTempSrc));
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt--;
    }
    blkCnt = (numTaps - 1) & 3;
    if (blkCnt > 0U)
    {
        /* Predicate the load as well as the store: only blkCnt lanes are valid */
        mve_pred16_t p0 = vctp32q(blkCnt);
        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0);
    }
}
375
376
377
/*
 * MVE kernel selected by arm_fir_q31() for filters of 5 to 8 taps.
 * Expands FIR_Q31_MAIN_CORE with 2 coefficient vectors, i.e. 8 coefficients
 * are loaded from S->pCoeffs.
 */
static void arm_fir_q31_5_8_mve(
    const arm_fir_instance_q31 * S,
    const q31_t * __restrict     pSrc,
    q31_t * __restrict           pDst,
    uint32_t                     blockSize)
{
    /* NBVECTAPS is read by FIR_Q31_MAIN_CORE; both macros are local to this function */
#define NBTAPS      8
#define NBVECTAPS   (NBTAPS / 4)
    FIR_Q31_MAIN_CORE();
#undef NBVECTAPS
#undef NBTAPS
}
388
389
/*
 * MVE kernel selected by arm_fir_q31() for filters of 9 to 12 taps.
 * Expands FIR_Q31_MAIN_CORE with 3 coefficient vectors, i.e. 12
 * coefficients are loaded from S->pCoeffs.
 */
static void arm_fir_q31_9_12_mve(
    const arm_fir_instance_q31 * S,
    const q31_t * __restrict     pSrc,
    q31_t * __restrict           pDst,
    uint32_t                     blockSize)
{
    /* NBVECTAPS is read by FIR_Q31_MAIN_CORE; both macros are local to this function */
#define NBTAPS      12
#define NBVECTAPS   (NBTAPS / 4)
    FIR_Q31_MAIN_CORE();
#undef NBVECTAPS
#undef NBTAPS
}
400
401
/*
 * MVE kernel selected by arm_fir_q31() for filters of 13 to 16 taps.
 * Expands FIR_Q31_MAIN_CORE with 4 coefficient vectors, i.e. 16
 * coefficients are loaded from S->pCoeffs.
 */
static void arm_fir_q31_13_16_mve(
    const arm_fir_instance_q31 * S,
    const q31_t * __restrict     pSrc,
    q31_t * __restrict           pDst,
    uint32_t                     blockSize)
{
    /* NBVECTAPS is read by FIR_Q31_MAIN_CORE; both macros are local to this function */
#define NBTAPS      16
#define NBVECTAPS   (NBTAPS / 4)
    FIR_Q31_MAIN_CORE();
#undef NBVECTAPS
#undef NBTAPS
}
412
413
/*
 * MVE kernel selected by arm_fir_q31() for filters of 17 to 20 taps.
 * Expands FIR_Q31_MAIN_CORE with 5 coefficient vectors, i.e. 20
 * coefficients are loaded from S->pCoeffs.
 */
static void arm_fir_q31_17_20_mve(
    const arm_fir_instance_q31 * S,
    const q31_t * __restrict     pSrc,
    q31_t * __restrict           pDst,
    uint32_t                     blockSize)
{
    /* NBVECTAPS is read by FIR_Q31_MAIN_CORE; both macros are local to this function */
#define NBTAPS      20
#define NBVECTAPS   (NBTAPS / 4)
    FIR_Q31_MAIN_CORE();
#undef NBVECTAPS
#undef NBTAPS
}
424
425
/*
 * MVE kernel selected by arm_fir_q31() for filters of 21 to 24 taps.
 * Expands FIR_Q31_MAIN_CORE with 6 coefficient vectors, i.e. 24
 * coefficients are loaded from S->pCoeffs.
 */
static void arm_fir_q31_21_24_mve(
    const arm_fir_instance_q31 * S,
    const q31_t * __restrict     pSrc,
    q31_t * __restrict           pDst,
    uint32_t                     blockSize)
{
    /* NBVECTAPS is read by FIR_Q31_MAIN_CORE; both macros are local to this function */
#define NBTAPS      24
#define NBVECTAPS   (NBTAPS / 4)
    FIR_Q31_MAIN_CORE();
#undef NBVECTAPS
#undef NBTAPS
}
436
437
/*
 * MVE kernel selected by arm_fir_q31() for filters of 25 to 28 taps.
 * Expands FIR_Q31_MAIN_CORE with 7 coefficient vectors, i.e. 28
 * coefficients are loaded from S->pCoeffs.
 */
static void arm_fir_q31_25_28_mve(
    const arm_fir_instance_q31 * S,
    const q31_t * __restrict     pSrc,
    q31_t * __restrict           pDst,
    uint32_t                     blockSize)
{
    /* NBVECTAPS is read by FIR_Q31_MAIN_CORE; both macros are local to this function */
#define NBTAPS      28
#define NBVECTAPS   (NBTAPS / 4)
    FIR_Q31_MAIN_CORE();
#undef NBVECTAPS
#undef NBTAPS
}
448
/*
 * MVE kernel selected by arm_fir_q31() for filters of 29 to 32 taps when
 * blockSize >= 32.
 *
 * 32 coefficients are loaded from S->pCoeffs regardless of numTaps.  The
 * work is split in two passes over the full groups of 4 samples:
 *   pass 1 appends the inputs to the history and accumulates the first
 *          28 taps of every output, parking the 64-bit partial sums at the
 *          start of S->pState;
 *   pass 2 reloads each partial sum, adds the last 4 taps and stores the
 *          shifted result.
 * The history itself starts 2 * ARM_ROUND_UP(blockSize, 4) words after
 * S->pState, which is the room used by the parked partial sums.
 *
 * The (blockSize & 3) samples that do not fill a group of 4 are appended
 * to the history after pass 1 and their outputs are computed directly
 * after pass 2, so every one of the blockSize outputs is produced.
 *
 * S         : filter instance (pState, pCoeffs and numTaps are read)
 * pSrc      : blockSize input samples
 * pDst      : blockSize output samples
 * blockSize : number of samples to process
 */
static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst,
    uint32_t blockSize)
{
    q31_t          *pState = S->pState + 2*ARM_ROUND_UP(blockSize, 4); /* Start of the sample history */
    const q31_t    *pCoeffs = S->pCoeffs;       /* Coefficient pointer */
    q31_t          *pStateCur;                  /* Where the next input sample is stored */
    const q31_t    *pSamples;                   /* Window start of the next output */
    const q31_t    *pTailSamples;               /* Window start of the first tail output */
    q31_t          *pOutput;                    /* Output cursor */
    const q31_t    *pTempSrc;                   /* Source cursor */
    q31_t          *pTempDest;                  /* Destination cursor of the final history move */
    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
    uint32_t        residual = blockSize & 3;   /* Samples left after the groups of 4 */
    int32_t         blkCnt;
    q63_t           acc0, acc1, acc2, acc3;

#define MAX_VECT_BATCH 7

/* acc = sum of the 7 vector products of the samples at pSmp with the first 28 coefficients */
#define FIR_Q31_ACC_FIRST_28(acc, pSmp)                                         \
    do {                                                                        \
        (acc) = vrmlaldavhq(vld1q((pSmp)), vecCoeffs0);                         \
        (acc) = vrmlaldavhaq((acc), vld1q((pSmp) + 4 * 1), vecCoeffs1);         \
        (acc) = vrmlaldavhaq((acc), vld1q((pSmp) + 4 * 2), vecCoeffs2);         \
        (acc) = vrmlaldavhaq((acc), vld1q((pSmp) + 4 * 3), vecCoeffs3);         \
        (acc) = vrmlaldavhaq((acc), vld1q((pSmp) + 4 * 4), vecCoeffs4);         \
        (acc) = vrmlaldavhaq((acc), vld1q((pSmp) + 4 * 5), vecCoeffs5);         \
        (acc) = vrmlaldavhaq((acc), vld1q((pSmp) + 4 * 6), vecCoeffs6);         \
    } while (0)

    /*
     * pre-load 28 1st coefs
     */
    q31x4_t         vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
    q31x4_t         vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
    q31x4_t         vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
    q31x4_t         vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
    q31x4_t         vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
    q31x4_t         vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
    q31x4_t         vecCoeffs6 = vld1q(pCoeffs + 4 * 6);

    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pTempSrc = pSrc;
    pSamples = pState;

    /* Partial sums are parked at the very start of the state buffer */
    q63_t          *arm_fir_partial_accu_ptr = (q63_t*)S->pState;

    /* Pass 1: first 28 taps of every output of the full groups */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0) {
        /*
         * Save 4 input samples in the history buffer
         */
        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;

        FIR_Q31_ACC_FIRST_28(acc0, &pSamples[0]);
        *arm_fir_partial_accu_ptr++ = acc0;

        FIR_Q31_ACC_FIRST_28(acc1, &pSamples[1]);
        *arm_fir_partial_accu_ptr++ = acc1;

        FIR_Q31_ACC_FIRST_28(acc2, &pSamples[2]);
        *arm_fir_partial_accu_ptr++ = acc2;

        FIR_Q31_ACC_FIRST_28(acc3, &pSamples[3]);
        *arm_fir_partial_accu_ptr++ = acc3;

        pSamples += 4;
        /*
         * Decrement the sample block loop counter
         */
        blkCnt--;
    }

    /*
     * Append the samples that do not fill a group of 4, so that the tail
     * outputs and the final history move see them.
     */
    pTailSamples = pSamples;
    for (uint32_t i = 0; i < residual; i++) {
        *pStateCur++ = *pTempSrc++;
    }

    /* Pass 2: add the remaining taps */

    /* load last 4 coef */
    vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
    arm_fir_partial_accu_ptr = (q63_t*)S->pState;
    pOutput = pDst;
    pSamples = pState + (MAX_VECT_BATCH * 4);

    blkCnt = blockSize >> 2;
    while (blkCnt > 0) {
        q31x4_t         vecIn0;

        /* reload intermediate MAC */
        acc0 = *arm_fir_partial_accu_ptr++;
        acc1 = *arm_fir_partial_accu_ptr++;
        acc2 = *arm_fir_partial_accu_ptr++;
        acc3 = *arm_fir_partial_accu_ptr++;

        vecIn0 = vld1q(&pSamples[0]);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0);

        vecIn0 = vld1q(&pSamples[1]);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0);

        vecIn0 = vld1q(&pSamples[2]);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0);

        vecIn0 = vld1q(&pSamples[3]);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0);

        *pOutput++ = (q31_t) asrl(acc0, 23);
        *pOutput++ = (q31_t) asrl(acc1, 23);
        *pOutput++ = (q31_t) asrl(acc2, 23);
        *pOutput++ = (q31_t) asrl(acc3, 23);

        pSamples += 4;
        /*
         * Decrement the sample block loop counter
         */
        blkCnt--;
    }

    /*
     * Tail outputs: all 8 coefficient vectors in one go.  The coefficients
     * are reloaded from memory here because vecCoeffs0 now holds the last
     * 4 coefficients.
     */
    for (uint32_t j = 0; j < residual; j++) {
        const q31_t    *pSmp = &pTailSamples[j];
        q63_t           acc = 0;

        for (uint32_t k = 0; k <= MAX_VECT_BATCH; k++) {
            acc = vrmlaldavhaq(acc, vld1q(pSmp + 4 * k), vld1q(pCoeffs + 4 * k));
        }
        *pOutput++ = (q31_t) asrl(acc, 23);
    }

    /*
     * Copy the samples back into the history buffer start
     */
    pTempSrc = &pState[blockSize];
    pTempDest = pState;

    blkCnt = numTaps - 1;
    do {
        /* vctp32q enables min(blkCnt, 4) lanes, so the last chunk is partial */
        mve_pred16_t    p = vctp32q(blkCnt);

        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt -= 4;
    }
    while (blkCnt > 0);

#undef FIR_Q31_ACC_FIRST_28
#undef MAX_VECT_BATCH
}
645
646
647
arm_fir_q31(const arm_fir_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)648 ARM_DSP_ATTRIBUTE void arm_fir_q31(
649 const arm_fir_instance_q31 * S,
650 const q31_t * pSrc,
651 q31_t * pDst,
652 uint32_t blockSize)
653 {
654 q31_t *pRefStatePtr = S->pState + 2*ARM_ROUND_UP(blockSize, 4);
655 q31_t *pState = pRefStatePtr; /* State pointer */
656 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
657 q31_t *pStateCur; /* Points to the current sample of the state */
658 const q31_t *pSamples; /* Temporary pointer to the sample buffer */
659 q31_t *pOutput; /* Temporary pointer to the output buffer */
660 const q31_t *pTempSrc; /* Temporary pointer to the source data */
661 q31_t *pTempDest; /* Temporary pointer to the destination buffer */
662 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
663 uint32_t blkCnt;
664 q31x4_t vecIn0;
665 uint32_t tapsBlkCnt = (numTaps + 3) / 4;
666 q63_t acc0, acc1, acc2, acc3;
667 q31x4_t vecCoeffs;
668
669
670 /*
671 * [1 to 32 taps] specialized routines
672 */
673 if (numTaps <= 4)
674 {
675 arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
676 return;
677 }
678 else if (numTaps <= 8)
679 {
680 arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
681 return;
682 }
683 else if (numTaps <= 12)
684 {
685 arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
686 return;
687 }
688 else if (numTaps <= 16)
689 {
690 arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
691 return;
692 }
693 else if (numTaps <= 20)
694 {
695 arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
696 return;
697 }
698 else if (numTaps <= 24)
699 {
700 arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
701 return;
702 }
703 else if (numTaps <= 28)
704 {
705 arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
706 return;
707 }
708 else if ((numTaps <= 32) && (blockSize >= 32))
709 {
710 arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
711 return;
712 }
713
714 /*
715 * pState points to state array which contains previous frame (numTaps - 1) samples
716 * pStateCur points to the location where the new input data should be written
717 */
718 pStateCur = &(pState[(numTaps - 1u)]);
719 pSamples = pState;
720 pTempSrc = pSrc;
721 pOutput = pDst;
722 blkCnt = blockSize >> 2;
723 while (blkCnt > 0)
724 {
725 const q31_t *pCoeffsTmp = pCoeffs;
726 const q31_t *pSamplesTmp = pSamples;
727
728 acc0 = 0LL;
729 acc1 = 0LL;
730 acc2 = 0LL;
731 acc3 = 0LL;
732
733 /*
734 * Save 4 input samples in the history buffer
735 */
736 vst1q(pStateCur, vld1q(pTempSrc));
737 pStateCur += 4;
738 pTempSrc += 4;
739
740 int i = tapsBlkCnt;
741 while (i > 0)
742 {
743 /*
744 * load 4 coefs
745 */
746 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
747
748 vecIn0 = vld1q(pSamplesTmp);
749 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
750
751 vecIn0 = vld1q(&pSamplesTmp[1]);
752 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
753
754 vecIn0 = vld1q(&pSamplesTmp[2]);
755 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
756
757 vecIn0 = vld1q(&pSamplesTmp[3]);
758 acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
759
760 pSamplesTmp += 4;
761 pCoeffsTmp += 4;
762 /*
763 * Decrement the taps block loop counter
764 */
765 i--;
766 }
767
768 /* .54-> .31 conversion and store accumulators */
769 acc0 = asrl(acc0, 23);
770 acc1 = asrl(acc1, 23);
771 acc2 = asrl(acc2, 23);
772 acc3 = asrl(acc3, 23);
773
774 *pOutput++ = (q31_t) acc0;
775 *pOutput++ = (q31_t) acc1;
776 *pOutput++ = (q31_t) acc2;
777 *pOutput++ = (q31_t) acc3;
778
779 pSamples += 4;
780
781 /*
782 * Decrement the sample block loop counter
783 */
784 blkCnt--;
785 }
786
787 int32_t residual = blockSize & 3;
788 switch (residual)
789 {
790 case 3:
791 {
792 const q31_t *pCoeffsTmp = pCoeffs;
793 const q31_t *pSamplesTmp = pSamples;
794
795 acc0 = 0LL;
796 acc1 = 0LL;
797 acc2 = 0LL;
798
799 /*
800 * Save 4 input samples in the history buffer
801 */
802 *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
803 pStateCur += 4;
804 pTempSrc += 4;
805
806 int i = tapsBlkCnt;
807 while (i > 0)
808 {
809 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
810
811 vecIn0 = vld1q(pSamplesTmp);
812 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
813
814 vecIn0 = vld1q(&pSamplesTmp[1]);
815 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
816
817 vecIn0 = vld1q(&pSamplesTmp[2]);
818 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
819
820 pSamplesTmp += 4;
821 pCoeffsTmp += 4;
822 i--;
823 }
824
825 acc0 = asrl(acc0, 23);
826 acc1 = asrl(acc1, 23);
827 acc2 = asrl(acc2, 23);
828
829 *pOutput++ = (q31_t) acc0;
830 *pOutput++ = (q31_t) acc1;
831 *pOutput++ = (q31_t) acc2;
832 }
833 break;
834
835 case 2:
836 {
837 const q31_t *pCoeffsTmp = pCoeffs;
838 const q31_t *pSamplesTmp = pSamples;
839
840 acc0 = 0LL;
841 acc1 = 0LL;
842
843 /*
844 * Save 4 input samples in the history buffer
845 */
846 vst1q(pStateCur, vld1q(pTempSrc));
847 pStateCur += 4;
848 pTempSrc += 4;
849
850 int i = tapsBlkCnt;
851 while (i > 0)
852 {
853 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
854
855 vecIn0 = vld1q(pSamplesTmp);
856 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
857
858 vecIn0 = vld1q(&pSamplesTmp[1]);
859 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
860
861 pSamplesTmp += 4;
862 pCoeffsTmp += 4;
863 i--;
864 }
865
866 acc0 = asrl(acc0, 23);
867 acc1 = asrl(acc1, 23);
868
869 *pOutput++ = (q31_t) acc0;
870 *pOutput++ = (q31_t) acc1;
871 }
872 break;
873
874 case 1:
875 {
876 const q31_t *pCoeffsTmp = pCoeffs;
877 const q31_t *pSamplesTmp = pSamples;
878
879 acc0 = 0LL;
880
881 /*
882 * Save 4 input samples in the history buffer
883 */
884 vst1q(pStateCur, vld1q(pTempSrc));
885 pStateCur += 4;
886 pTempSrc += 4;
887
888 int i = tapsBlkCnt;
889 while (i > 0)
890 {
891 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
892
893 vecIn0 = vld1q(pSamplesTmp);
894 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
895
896 pSamplesTmp += 4;
897 pCoeffsTmp += 4;
898 i--;
899 }
900
901 acc0 = asrl(acc0, 23);
902
903 *pOutput++ = (q31_t) acc0;
904 }
905 break;
906 }
907
908 /*
909 * Copy the samples back into the history buffer start
910 */
911 pTempSrc = &pState[blockSize];
912 pTempDest = pState;
913
914 blkCnt = (numTaps - 1U) >> 2;
915 while (blkCnt > 0)
916 {
917 vst1q(pTempDest, vld1q(pTempSrc));
918 pTempSrc += 4;
919 pTempDest += 4;
920 blkCnt--;
921 }
922 blkCnt = (numTaps - 1U) & 3;
923 if (blkCnt > 0)
924 {
925 mve_pred16_t p0 = vctp32q(blkCnt);
926 vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
927 }
928 }
929
930 #else
arm_fir_q31(const arm_fir_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)931 ARM_DSP_ATTRIBUTE void arm_fir_q31(
932 const arm_fir_instance_q31 * S,
933 const q31_t * pSrc,
934 q31_t * pDst,
935 uint32_t blockSize)
936 {
937 q31_t *pState = S->pState; /* State pointer */
938 const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
939 q31_t *pStateCurnt; /* Points to the current sample of the state */
940 q31_t *px; /* Temporary pointer for state buffer */
941 const q31_t *pb; /* Temporary pointer for coefficient buffer */
942 q63_t acc0; /* Accumulator */
943 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
944 uint32_t i, tapCnt, blkCnt; /* Loop counters */
945
946 #if defined (ARM_MATH_LOOPUNROLL)
947 q63_t acc1, acc2; /* Accumulators */
948 q31_t x0, x1, x2; /* Temporary variables to hold state values */
949 q31_t c0; /* Temporary variable to hold coefficient value */
950 #endif
951
952 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
953 /* pStateCurnt points to the location where the new input data should be written */
954 pStateCurnt = &(S->pState[(numTaps - 1U)]);
955
956 #if defined (ARM_MATH_LOOPUNROLL)
957
958 /* Loop unrolling: Compute 4 output values simultaneously.
959 * The variables acc0 ... acc3 hold output values that are being computed:
960 *
961 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
962 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
963 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
964 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
965 */
966
967 blkCnt = blockSize / 3;
968
969 while (blkCnt > 0U)
970 {
971 /* Copy 3 new input samples into the state buffer. */
972 *pStateCurnt++ = *pSrc++;
973 *pStateCurnt++ = *pSrc++;
974 *pStateCurnt++ = *pSrc++;
975
976 /* Set all accumulators to zero */
977 acc0 = 0;
978 acc1 = 0;
979 acc2 = 0;
980
981 /* Initialize state pointer */
982 px = pState;
983
984 /* Initialize coefficient pointer */
985 pb = pCoeffs;
986
987 /* Read the first 2 samples from the state buffer: x[n-numTaps], x[n-numTaps-1] */
988 x0 = *px++;
989 x1 = *px++;
990
991 /* Loop unrolling: process 3 taps at a time. */
992 tapCnt = numTaps / 3;
993
994 while (tapCnt > 0U)
995 {
996 /* Read the b[numTaps] coefficient */
997 c0 = *pb;
998
999 /* Read x[n-numTaps-2] sample */
1000 x2 = *(px++);
1001
1002 /* Perform the multiply-accumulates */
1003 acc0 += ((q63_t) x0 * c0);
1004 acc1 += ((q63_t) x1 * c0);
1005 acc2 += ((q63_t) x2 * c0);
1006
1007 /* Read the coefficient and state */
1008 c0 = *(pb + 1U);
1009 x0 = *(px++);
1010
1011 /* Perform the multiply-accumulates */
1012 acc0 += ((q63_t) x1 * c0);
1013 acc1 += ((q63_t) x2 * c0);
1014 acc2 += ((q63_t) x0 * c0);
1015
1016 /* Read the coefficient and state */
1017 c0 = *(pb + 2U);
1018 x1 = *(px++);
1019
1020 /* update coefficient pointer */
1021 pb += 3U;
1022
1023 /* Perform the multiply-accumulates */
1024 acc0 += ((q63_t) x2 * c0);
1025 acc1 += ((q63_t) x0 * c0);
1026 acc2 += ((q63_t) x1 * c0);
1027
1028 /* Decrement loop counter */
1029 tapCnt--;
1030 }
1031
1032 /* Loop unrolling: Compute remaining outputs */
1033 tapCnt = numTaps % 0x3U;
1034
1035 while (tapCnt > 0U)
1036 {
1037 /* Read coefficients */
1038 c0 = *(pb++);
1039
1040 /* Fetch 1 state variable */
1041 x2 = *(px++);
1042
1043 /* Perform the multiply-accumulates */
1044 acc0 += ((q63_t) x0 * c0);
1045 acc1 += ((q63_t) x1 * c0);
1046 acc2 += ((q63_t) x2 * c0);
1047
1048 /* Reuse the present sample states for next sample */
1049 x0 = x1;
1050 x1 = x2;
1051
1052 /* Decrement loop counter */
1053 tapCnt--;
1054 }
1055
1056 /* Advance the state pointer by 3 to process the next group of 3 samples */
1057 pState = pState + 3;
1058
1059 /* The result is in 2.30 format. Convert to 1.31 and store in destination buffer. */
1060 *pDst++ = (q31_t) (acc0 >> 31U);
1061 *pDst++ = (q31_t) (acc1 >> 31U);
1062 *pDst++ = (q31_t) (acc2 >> 31U);
1063
1064 /* Decrement loop counter */
1065 blkCnt--;
1066 }
1067
1068 /* Loop unrolling: Compute remaining output samples */
1069 blkCnt = blockSize % 0x3U;
1070
1071 #else
1072
1073 /* Initialize blkCnt with number of taps */
1074 blkCnt = blockSize;
1075
1076 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1077
1078 while (blkCnt > 0U)
1079 {
1080 /* Copy one sample at a time into state buffer */
1081 *pStateCurnt++ = *pSrc++;
1082
1083 /* Set the accumulator to zero */
1084 acc0 = 0;
1085
1086 /* Initialize state pointer */
1087 px = pState;
1088
1089 /* Initialize Coefficient pointer */
1090 pb = pCoeffs;
1091
1092 i = numTaps;
1093
1094 /* Perform the multiply-accumulates */
1095 do
1096 {
1097 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
1098 acc0 += (q63_t) *px++ * *pb++;
1099
1100 i--;
1101 } while (i > 0U);
1102
1103 /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
1104 *pDst++ = (q31_t) (acc0 >> 31U);
1105
1106 /* Advance state pointer by 1 for the next sample */
1107 pState = pState + 1U;
1108
1109 /* Decrement loop counter */
1110 blkCnt--;
1111 }
1112
1113 /* Processing is complete.
1114 Now copy the last numTaps - 1 samples to the start of the state buffer.
1115 This prepares the state buffer for the next function call. */
1116
1117 /* Points to the start of the state buffer */
1118 pStateCurnt = S->pState;
1119
1120 #if defined (ARM_MATH_LOOPUNROLL)
1121
1122 /* Loop unrolling: Compute 4 taps at a time */
1123 tapCnt = (numTaps - 1U) >> 2U;
1124
1125 /* Copy data */
1126 while (tapCnt > 0U)
1127 {
1128 *pStateCurnt++ = *pState++;
1129 *pStateCurnt++ = *pState++;
1130 *pStateCurnt++ = *pState++;
1131 *pStateCurnt++ = *pState++;
1132
1133 /* Decrement loop counter */
1134 tapCnt--;
1135 }
1136
1137 /* Calculate remaining number of copies */
1138 tapCnt = (numTaps - 1U) % 0x4U;
1139
1140 #else
1141
1142 /* Initialize tapCnt with number of taps */
1143 tapCnt = (numTaps - 1U);
1144
1145 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1146
1147 /* Copy remaining data */
1148 while (tapCnt > 0U)
1149 {
1150 *pStateCurnt++ = *pState++;
1151
1152 /* Decrement loop counter */
1153 tapCnt--;
1154 }
1155
1156 }
1157 #endif /* defined(ARM_MATH_MVEI) */
1158
1159 /**
1160 @} end of FIR group
1161 */
1162