1 extern "C" {
2 extern void filter_test();
3 }
4
5 #include "allocator.h"
6
7 #include <dsppp/arch.hpp>
8 #include <dsppp/fixed_point.hpp>
9 #include <dsppp/matrix.hpp>
10 #include <dsppp/unroll.hpp>
11
12 #include <iostream>
13
14 #include <cmsis_tests.h>
15
16
17 #if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF)
18
19 #define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
20
21
22 #define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \
23 for (int j = 0; j < nbAcc; j++) { \
24 const q15_t *pSmp = &pSample[j]; \
25 q63_t acc[4]; \
26 \
27 acc[j] = 0; \
28 for (int i = 0; i < nbVecTaps; i++) { \
29 vecIn0 = vld1q(pSmp + 8 * i); \
30 acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]); \
31 } \
32 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15); \
33 }
34
35 #define FIR_Q15_MAIN_CORE() \
36 { \
37 q15_t *pState = S->pState; /* State pointer */ \
38 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
39 q15_t *pStateCur; /* Points to the current sample of the state */ \
40 const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \
41 q15_t *pOutput; /* Temporary pointer to the output buffer */ \
42 const q15_t *pTempSrc; /* Temporary pointer to the source data */ \
43 q15_t *pTempDest; /* Temporary pointer to the destination buffer */\
44 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
45 int32_t blkCnt; \
46 q15x8_t vecIn0; \
47 \
48 /* \
49 * load coefs \
50 */ \
51 q15x8_t vecCoeffs[NBVECTAPS]; \
52 \
53 for (int i = 0; i < NBVECTAPS; i++) \
54 vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \
55 \
56 /* \
57 * pState points to state array which contains previous frame (numTaps - 1) samples \
58 * pStateCur points to the location where the new input data should be written \
59 */ \
60 pStateCur = &(pState[(numTaps - 1u)]); \
61 pTempSrc = pSrc; \
62 pSamples = pState; \
63 pOutput = pDst; \
64 \
65 blkCnt = blockSize >> 2; \
66 while (blkCnt > 0) { \
67 /* \
68 * Save 4 input samples in the history buffer \
69 */ \
70 vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \
71 pStateCur += 4; \
72 pTempSrc += 4; \
73 \
74 FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs); \
75 pSamples += 4; \
76 \
77 blkCnt--; \
78 } \
79 \
80 /* tail */ \
81 int32_t residual = blockSize & 3; \
82 \
83 for (int i = 0; i < residual; i++) \
84 *pStateCur++ = *pTempSrc++; \
85 \
86 FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs); \
87 \
88 /* \
89 * Copy the samples back into the history buffer start \
90 */ \
91 pTempSrc = &pState[blockSize]; \
92 pTempDest = pState; \
93 \
94 /* current compiler limitation */ \
95 blkCnt = (numTaps - 1) >> 3; \
96 while (blkCnt > 0) \
97 { \
98 vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \
99 pTempSrc += 8; \
100 pTempDest += 8; \
101 blkCnt--; \
102 } \
103 blkCnt = (numTaps - 1) & 7; \
104 if (blkCnt > 0) \
105 { \
106 mve_pred16_t p = vctp16q(blkCnt); \
107 vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \
108 } \
109 }
110
arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)111 static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,
112 const q15_t * __restrict pSrc,
113 q15_t * __restrict pDst, uint32_t blockSize)
114 {
115 #define NBTAPS 32
116 #define NBVECTAPS (NBTAPS / 8)
117 FIR_Q15_MAIN_CORE();
118 #undef NBVECTAPS
119 #undef NBTAPS
120 }
121
arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)122 static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,
123 const q15_t * __restrict pSrc,
124 q15_t * __restrict pDst, uint32_t blockSize)
125 {
126 #define NBTAPS 24
127 #define NBVECTAPS (NBTAPS / 8)
128 FIR_Q15_MAIN_CORE();
129 #undef NBVECTAPS
130 #undef NBTAPS
131 }
132
133
arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)134 static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,
135 const q15_t * __restrict pSrc,
136 q15_t * __restrict pDst, uint32_t blockSize)
137 {
138 #define NBTAPS 16
139 #define NBVECTAPS (NBTAPS / 8)
140 FIR_Q15_MAIN_CORE();
141 #undef NBVECTAPS
142 #undef NBTAPS
143 }
144
arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)145 static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,
146 const q15_t * __restrict pSrc,
147 q15_t * __restrict pDst, uint32_t blockSize)
148 {
149 #define NBTAPS 8
150 #define NBVECTAPS (NBTAPS / 8)
151 FIR_Q15_MAIN_CORE();
152 #undef NBVECTAPS
153 #undef NBTAPS
154 }
155
156
debug_arm_fir_q15(const arm_fir_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)157 void debug_arm_fir_q15(
158 const arm_fir_instance_q15 * S,
159 const q15_t * pSrc,
160 q15_t * pDst,
161 uint32_t blockSize)
162 {
163 q15_t *pState = S->pState; /* State pointer */
164 const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
165 q15_t *pStateCur; /* Points to the current sample of the state */
166 const q15_t *pSamples; /* Temporary pointer to the sample buffer */
167 q15_t *pOutput; /* Temporary pointer to the output buffer */
168 const q15_t *pTempSrc; /* Temporary pointer to the source data */
169 q15_t *pTempDest; /* Temporary pointer to the destination buffer */
170 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
171 uint32_t blkCnt;
172 q15x8_t vecIn0;
173 uint32_t tapsBlkCnt = (numTaps + 7) / 8;
174 q63_t acc0, acc1, acc2, acc3;
175
176
177 int32_t nbTaps = (numTaps + 7) >> 3;
178
179 switch(nbTaps) {
180
181 case 1:
182 arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
183 return;
184 case 2:
185 arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
186 return;
187 case 3:
188 arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
189 return;
190 case 4:
191 arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
192 return;
193 }
194 /*
195 * pState points to state array which contains previous frame (numTaps - 1) samples
196 * pStateCur points to the location where the new input data should be written
197 */
198 pStateCur = &(pState[(numTaps - 1u)]);
199 pTempSrc = pSrc;
200 pSamples = pState;
201 pOutput = pDst;
202 blkCnt = blockSize >> 2;
203
204 while (blkCnt > 0U)
205 {
206 const q15_t *pCoeffsTmp = pCoeffs;
207 const q15_t *pSamplesTmp = pSamples;
208
209 acc0 = 0LL;
210 acc1 = 0LL;
211 acc2 = 0LL;
212 acc3 = 0LL;
213
214 /*
215 * Save 8 input samples in the history buffer
216 */
217 vst1q(pStateCur, vld1q(pTempSrc));
218 pStateCur += 8;
219 pTempSrc += 8;
220
221 //INIT_SYSTICK;
222 //START_CYCLE_MEASUREMENT;
223 int i = tapsBlkCnt;
224 //startSectionNB(3);
225 while (i > 0)
226 {
227 /*
228 * load 8 coefs
229 */
230 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
231
232 vecIn0 = vld1q(pSamplesTmp);
233 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
234
235 vecIn0 = vld1q(&pSamplesTmp[1]);
236 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
237
238 vecIn0 = vld1q(&pSamplesTmp[2]);
239 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
240
241 vecIn0 = vld1q(&pSamplesTmp[3]);
242 acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
243
244 pSamplesTmp += 8;
245 pCoeffsTmp += 8;
246 /*
247 * Decrement the taps block loop counter
248 */
249 i--;
250 }
251 //stopSectionNB(3);
252 //STOP_CYCLE_MEASUREMENT;
253
254
255 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
256 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
257 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
258 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
259
260 pSamples += 4;
261 /*
262 * Decrement the sample block loop counter
263 */
264 blkCnt--;
265 }
266
267 uint32_t residual = blockSize & 3;
268 switch (residual)
269 {
270 case 3:
271 {
272 const q15_t *pCoeffsTmp = pCoeffs;
273 const q15_t *pSamplesTmp = pSamples;
274
275 acc0 = 0LL;
276 acc1 = 0LL;
277 acc2 = 0LL;
278
279 /*
280 * Save 8 input samples in the history buffer
281 */
282 *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
283 pStateCur += 8;
284 pTempSrc += 8;
285
286 int i = tapsBlkCnt;
287 while (i > 0)
288 {
289 /*
290 * load 8 coefs
291 */
292 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
293
294 vecIn0 = vld1q(pSamplesTmp);
295 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
296
297 vecIn0 = vld1q(&pSamplesTmp[2]);
298 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
299
300 vecIn0 = vld1q(&pSamplesTmp[4]);
301 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
302
303 pSamplesTmp += 8;
304 pCoeffsTmp += 8;
305 /*
306 * Decrement the taps block loop counter
307 */
308 i--;
309 }
310
311 acc0 = asrl(acc0, 15);
312 acc1 = asrl(acc1, 15);
313 acc2 = asrl(acc2, 15);
314
315 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
316 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
317 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
318 }
319 break;
320
321 case 2:
322 {
323 const q15_t *pCoeffsTmp = pCoeffs;
324 const q15_t *pSamplesTmp = pSamples;
325
326 acc0 = 0LL;
327 acc1 = 0LL;
328 /*
329 * Save 8 input samples in the history buffer
330 */
331 vst1q(pStateCur, vld1q(pTempSrc));
332 pStateCur += 8;
333 pTempSrc += 8;
334
335 int i = tapsBlkCnt;
336 while (i > 0)
337 {
338 /*
339 * load 8 coefs
340 */
341 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
342
343 vecIn0 = vld1q(pSamplesTmp);
344 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
345
346 vecIn0 = vld1q(&pSamplesTmp[2]);
347 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
348
349 pSamplesTmp += 8;
350 pCoeffsTmp += 8;
351 /*
352 * Decrement the taps block loop counter
353 */
354 i--;
355 }
356
357 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
358 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
359 }
360 break;
361
362 case 1:
363 {
364 const q15_t *pCoeffsTmp = pCoeffs;
365 const q15_t *pSamplesTmp = pSamples;
366
367 acc0 = 0LL;
368
369 /*
370 * Save 8 input samples in the history buffer
371 */
372 vst1q(pStateCur, vld1q(pTempSrc));
373 pStateCur += 8;
374 pTempSrc += 8;
375
376 int i = tapsBlkCnt;
377 while (i > 0)
378 {
379 /*
380 * load 8 coefs
381 */
382 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
383
384 vecIn0 = vld1q(pSamplesTmp);
385 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
386
387 pSamplesTmp += 8;
388 pCoeffsTmp += 8;
389 /*
390 * Decrement the taps block loop counter
391 */
392 i--;
393 }
394
395 *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
396 }
397 break;
398
399 }
400
401 /*
402 * Copy the samples back into the history buffer start
403 */
404 pTempSrc = &pState[blockSize];
405 pTempDest = pState;
406
407 blkCnt = numTaps >> 3;
408 while (blkCnt > 0U)
409 {
410 vst1q(pTempDest, vld1q(pTempSrc));
411 pTempSrc += 8;
412 pTempDest += 8;
413 blkCnt--;
414 }
415 blkCnt = numTaps & 7;
416 if (blkCnt > 0U)
417 {
418 mve_pred16_t p0 = vctp16q(blkCnt);
419 vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
420 }
421 }
422 #endif
423
424 template<typename T>
425 struct FirType;
426
427 template<>
428 struct FirType<float32_t>
429 {
430 typedef arm_fir_instance_f32 type;
init_stateFirType431 static void init_state(type * S,
432 uint16_t numTaps,
433 const float32_t * pCoeffs,
434 float32_t * pState,
435 uint32_t blockSize)
436 {
437 arm_fir_init_f32(S,numTaps,pCoeffs,pState,blockSize);
438 };
439
init_coefFirType440 static void init_coef(float32_t *coefs,uint16_t numTaps)
441 {
442 for(int i=0;i<numTaps;i++)
443 {
444 coefs[i] = 1.0f / numTaps;
445 }
446 };
447 };
448
449 template<>
450 struct FirType<Q15>
451 {
452 typedef arm_fir_instance_q15 type;
init_stateFirType453 static void init_state(type * S,
454 uint16_t numTaps,
455 const Q15 * pCoeffs,
456 Q15 * pState,
457 uint32_t blockSize)
458 {
459 arm_fir_init_q15(S,numTaps,
460 reinterpret_cast<const q15_t*>(pCoeffs),
461 reinterpret_cast<q15_t*>(pState),blockSize);
462 };
463
init_coefFirType464 static void init_coef(Q15 *coefs,uint16_t numTaps)
465 {
466 for(int i=0;i<numTaps;i++)
467 {
468 coefs[i] = Q15::f(1.0f / numTaps);
469 }
470 };
471 };
472
473
474
475 template<typename T,int BLOCK,int TAPS>
476 struct FIR {
477
FIRFIR478 FIR(const PVector<T,TAPS> &coefs):coef_(coefs),state_(T{})
479 {};
480
481
filterFIR482 PVector<T,BLOCK> filter(const PVector<T,BLOCK> &signal)
483 {
484 constexpr int UNROLL_FACTOR = 4;
485 PVector<T,BLOCK> res(T{});
486 using acc_type = typename number_traits<T>::accumulator;
487 std::array<acc_type,UNROLL_FACTOR> accu;
488 index_t i=0;
489
490 #if defined(ARM_COMPUTE_DISABLE_UNROLL)
491 #pragma clang loop unroll(disable)
492 #endif
493 for(;i<=BLOCK-UNROLL_FACTOR;i+=UNROLL_FACTOR)
494 {
495
496 state_.sub(TAPS-1+i,TAPS-1+i+UNROLL_FACTOR) = copy(signal.sub(i,i+UNROLL_FACTOR));
497
498 //INIT_SYSTICK;
499 //START_CYCLE_MEASUREMENT;
500 //startSectionNB(2);
501 results(accu) =
502 dot(unroll<UNROLL_FACTOR>(
503 [i,this](index_t k){return state_.sub(i+k,i+k+TAPS);}),
504 replicate<UNROLL_FACTOR>(coef_)
505 );
506 //stopSectionNB(2);
507 //STOP_CYCLE_MEASUREMENT;
508
509 for(index_t k=0;k<UNROLL_FACTOR;k++)
510 {
511 res[i+k] = inner::from_accumulator(accu[k]);
512 }
513 }
514
515 #if defined(ARM_COMPUTE_DISABLE_UNROLL)
516 #pragma clang loop unroll(disable)
517 #endif
518 for(;i<BLOCK;i++)
519 {
520 state_[TAPS-1+i] = signal[i];
521 acc_type acc = dot(state_.sub(i,i+TAPS),coef_);
522 res[i] = inner::from_accumulator(acc);
523 }
524
525 state_.sub(0,TAPS) = copy(state_.sub(BLOCK,TAPS+BLOCK));
526 return(res);
527 }
528
purecFIR529 void purec(const T *signal, T *dst)
530 {
531 const T* coef=coef_.const_ptr();
532 T *state = state_.ptr();
533
534 #if defined(ARM_COMPUTE_DISABLE_UNROLL)
535 #pragma clang loop unroll(disable)
536 #endif
537 for(index_t i=0;i<BLOCK;i++)
538 {
539 T acc=T{};
540 state[TAPS-1+i] = signal[i];
541 for(index_t k=0;k<TAPS;k++)
542 {
543 acc += state[i+k]*coef[k];
544 }
545 dst[i] = acc;
546 }
547 for(index_t i=0;i<TAPS-1;i++)
548 {
549 state[i] = state[BLOCK+i];
550 }
551 }
552
553 const PVector<T,TAPS> coef_;
554 PVector<T,TAPS+BLOCK-1> state_;
555 };
556
557 template<typename T,int BLOCK,int TAPS>
test()558 static void test()
559 {
560 constexpr int NB = BLOCK;
561 std::cout << "----\r\n(" << BLOCK << "," << TAPS << ")\r\n";
562
563 typename FirType<T>::type S;
564 PVector<T,NB> signal;
565 PVector<T,TAPS> coefs;
566
567 FirType<T>::init_coef(coefs.ptr(),TAPS);
568
569 init_array(signal,NB);
570 FIR<T,BLOCK,TAPS> fir(coefs);
571
572
573 INIT_SYSTICK;
574 START_CYCLE_MEASUREMENT;
575 startSectionNB(1);
576 PVector<T,BLOCK> res = fir.filter(signal);
577 //PVector<T,BLOCK> res;
578 //fir.purec(signal.const_ptr(),res.ptr());
579 stopSectionNB(1);
580 STOP_CYCLE_MEASUREMENT;
581
582
583 T* state;
584 T* coefsb;
585 state=(T*)malloc(sizeof(T)*(TAPS+BLOCK+BLOCK));
586 coefsb=(T*)malloc(sizeof(T)*(TAPS+32));
587 memset(coefsb,0,sizeof(T)*(TAPS+32));
588 for(int i =0;i<TAPS;i++)
589 {
590 coefsb[i] = coefs[i];
591 }
592 FirType<T>::init_state(&S,TAPS,coefsb,state,BLOCK);
593 PVector<T,NB> ref;
594 //std::cout << "---\r\n";
595
596
597 INIT_SYSTICK;
598 START_CYCLE_MEASUREMENT;
599 arm_fir_q15(&S,
600 reinterpret_cast<const q15_t*>(signal.const_ptr()),
601 reinterpret_cast<q15_t*>(ref.ptr()),BLOCK);
602 STOP_CYCLE_MEASUREMENT;
603
604
605 if (!validate(res.const_ptr(),ref.const_ptr(),BLOCK))
606 {
607 printf("fir failed \r\n");
608 }
609
610 free(state);
611 free(coefsb);
612
613
614 }
615
616
617
618
619 template<typename T>
all_filter_test()620 void all_filter_test()
621 {
622
623 title<T>("FIR test");
624
625
626 test<T,NBVEC_4,8>();
627 test<T,NBVEC_4,16>();
628 test<T,NBVEC_4,24>();
629 test<T,NBVEC_4,32>();
630 test<T,NBVEC_4,64>();
631
632 test<T,NBVEC_32,8>();
633 test<T,NBVEC_32,16>();
634 test<T,NBVEC_32,24>();
635 test<T,NBVEC_32,32>();
636 test<T,NBVEC_32,64>();
637
638 test<T,NBVEC_256,8>();
639 test<T,NBVEC_256,16>();
640 test<T,NBVEC_256,24>();
641 test<T,NBVEC_256,32>();
642 test<T,NBVEC_256,64>();
643
644
645 test<T,NBVEC_512,8>();
646 test<T,NBVEC_512,16>();
647 test<T,NBVEC_512,24>();
648 test<T,NBVEC_512,32>();
649 test<T,NBVEC_512,64>();
650
651
652 }
653
filter_test()654 void filter_test()
655 {
656 //all_filter_test<Q15>();
657 }