1 extern "C" {
2     extern void filter_test();
3 }
4 
5 #include "allocator.h"
6 
7 #include <dsppp/arch.hpp>
8 #include <dsppp/fixed_point.hpp>
9 #include <dsppp/matrix.hpp>
10 #include <dsppp/unroll.hpp>
11 
12 #include <iostream>
13 
14 #include <cmsis_tests.h>
15 
16 
17 #if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF)
18 
19 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
20 
21 
22 #define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)        \
23         for (int j = 0; j < nbAcc; j++) {                                  \
24             const q15_t    *pSmp = &pSample[j];                            \
25             q63_t           acc[4];                                        \
26                                                                            \
27             acc[j] = 0;                                                    \
28             for (int i = 0; i < nbVecTaps; i++) {                          \
29                 vecIn0 = vld1q(pSmp + 8 * i);                  \
30                 acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]);         \
31             }                                                              \
32             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15);               \
33         }
34 
35 #define FIR_Q15_MAIN_CORE()                                                                  \
36 {                                                                                            \
37     q15_t          *pState = S->pState;     /* State pointer */                              \
38     const q15_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */                        \
39     q15_t          *pStateCur;              /* Points to the current sample of the state */  \
40     const q15_t    *pSamples;               /* Temporary pointer to the sample buffer */     \
41     q15_t          *pOutput;                /* Temporary pointer to the output buffer */     \
42     const q15_t    *pTempSrc;               /* Temporary pointer to the source data */       \
43     q15_t          *pTempDest;              /* Temporary pointer to the destination buffer */\
44     uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */\
45     int32_t         blkCnt;                                                                  \
46     q15x8_t         vecIn0;                                                                  \
47                                                                                              \
48     /*                                                                                       \
49      * load coefs                                                                            \
50      */                                                                                      \
51     q15x8_t         vecCoeffs[NBVECTAPS];                                                    \
52                                                                                              \
53     for (int i = 0; i < NBVECTAPS; i++)                                                      \
54         vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i);                                          \
55                                                                                              \
56     /*                                                                                       \
57      * pState points to state array which contains previous frame (numTaps - 1) samples      \
58      * pStateCur points to the location where the new input data should be written           \
59      */                                                                                      \
60     pStateCur = &(pState[(numTaps - 1u)]);                                                   \
61     pTempSrc = pSrc;                                                                         \
62     pSamples = pState;                                                                       \
63     pOutput = pDst;                                                                          \
64                                                                                              \
65     blkCnt = blockSize >> 2;                                                                 \
66     while (blkCnt > 0) {                                                                     \
67         /*                                                                                   \
68          * Save 4 input samples in the history buffer                                        \
69          */                                                                                  \
70         vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc));                                         \
71         pStateCur += 4;                                                                      \
72         pTempSrc += 4;                                                                       \
73                                                                                              \
74         FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs);                            \
75         pSamples += 4;                                                                       \
76                                                                                              \
77         blkCnt--;                                                                            \
78     }                                                                                        \
79                                                                                              \
80     /* tail */                                                                               \
81     int32_t        residual = blockSize & 3;                                                \
82                                                                                              \
83     for (int i = 0; i < residual; i++)                                                       \
84         *pStateCur++ = *pTempSrc++;                                                          \
85                                                                                              \
86     FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs);                         \
87                                                                                              \
88     /*                                                                                       \
89      * Copy the samples back into the history buffer start                                   \
90      */                                                                                      \
91     pTempSrc = &pState[blockSize];                                                           \
92     pTempDest = pState;                                                                      \
93                                                                                              \
94     /* current compiler limitation */                                                        \
95     blkCnt = (numTaps - 1) >> 3;                                                             \
96     while (blkCnt > 0)                                                                       \
97     {                                                                                        \
98         vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc));                                         \
99         pTempSrc += 8;                                                                       \
100         pTempDest += 8;                                                                      \
101         blkCnt--;                                                                            \
102     }                                                                                        \
103     blkCnt = (numTaps - 1) & 7;                                                              \
104     if (blkCnt > 0)                                                                          \
105     {                                                                                        \
106         mve_pred16_t p = vctp16q(blkCnt);                                                    \
107         vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p);                               \
108     }                                                                                        \
109 }
110 
arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)111 static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,
112   const q15_t * __restrict pSrc,
113   q15_t * __restrict pDst, uint32_t blockSize)
114 {
115     #define NBTAPS 32
116     #define NBVECTAPS (NBTAPS / 8)
117     FIR_Q15_MAIN_CORE();
118     #undef NBVECTAPS
119     #undef NBTAPS
120 }
121 
arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)122 static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,
123   const q15_t * __restrict pSrc,
124   q15_t * __restrict pDst, uint32_t blockSize)
125 {
126     #define NBTAPS 24
127     #define NBVECTAPS (NBTAPS / 8)
128     FIR_Q15_MAIN_CORE();
129     #undef NBVECTAPS
130     #undef NBTAPS
131 }
132 
133 
arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)134 static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,
135   const q15_t * __restrict pSrc,
136   q15_t * __restrict pDst, uint32_t blockSize)
137 {
138     #define NBTAPS 16
139     #define NBVECTAPS (NBTAPS / 8)
140     FIR_Q15_MAIN_CORE();
141     #undef NBVECTAPS
142     #undef NBTAPS
143 }
144 
arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,const q15_t * __restrict pSrc,q15_t * __restrict pDst,uint32_t blockSize)145 static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,
146   const q15_t * __restrict pSrc,
147   q15_t * __restrict pDst, uint32_t blockSize)
148 {
149     #define NBTAPS 8
150     #define NBVECTAPS (NBTAPS / 8)
151     FIR_Q15_MAIN_CORE();
152     #undef NBVECTAPS
153     #undef NBTAPS
154 }
155 
156 
debug_arm_fir_q15(const arm_fir_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)157 void debug_arm_fir_q15(
158   const arm_fir_instance_q15 * S,
159   const q15_t * pSrc,
160         q15_t * pDst,
161         uint32_t blockSize)
162 {
163     q15_t    *pState = S->pState;   /* State pointer */
164     const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
165     q15_t    *pStateCur;        /* Points to the current sample of the state */
166     const q15_t    *pSamples;         /* Temporary pointer to the sample buffer */
167     q15_t    *pOutput;          /* Temporary pointer to the output buffer */
168     const q15_t    *pTempSrc;         /* Temporary pointer to the source data */
169     q15_t    *pTempDest;        /* Temporary pointer to the destination buffer */
170     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
171     uint32_t  blkCnt;
172     q15x8_t vecIn0;
173     uint32_t  tapsBlkCnt = (numTaps + 7) / 8;
174     q63_t     acc0, acc1, acc2, acc3;
175 
176 
177 int32_t nbTaps = (numTaps + 7) >> 3;
178 
179 switch(nbTaps) {
180 
181     case 1:
182         arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
183         return;
184     case 2:
185         arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
186         return;
187     case 3:
188         arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
189         return;
190     case 4:
191         arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
192         return;
193     }
194     /*
195      * pState points to state array which contains previous frame (numTaps - 1) samples
196      * pStateCur points to the location where the new input data should be written
197      */
198     pStateCur   = &(pState[(numTaps - 1u)]);
199     pTempSrc    = pSrc;
200     pSamples    = pState;
201     pOutput     = pDst;
202     blkCnt      = blockSize >> 2;
203 
204     while (blkCnt > 0U)
205     {
206         const q15_t    *pCoeffsTmp = pCoeffs;
207         const q15_t    *pSamplesTmp = pSamples;
208 
209         acc0 = 0LL;
210         acc1 = 0LL;
211         acc2 = 0LL;
212         acc3 = 0LL;
213 
214         /*
215          * Save 8 input samples in the history buffer
216          */
217         vst1q(pStateCur, vld1q(pTempSrc));
218         pStateCur += 8;
219         pTempSrc += 8;
220 
221         //INIT_SYSTICK;
222         //START_CYCLE_MEASUREMENT;
223         int       i = tapsBlkCnt;
224         //startSectionNB(3);
225         while (i > 0)
226         {
227             /*
228              * load 8 coefs
229              */
230             q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
231 
232             vecIn0 = vld1q(pSamplesTmp);
233             acc0 =  vmlaldavaq(acc0, vecIn0, vecCoeffs);
234 
235             vecIn0 = vld1q(&pSamplesTmp[1]);
236             acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
237 
238             vecIn0 = vld1q(&pSamplesTmp[2]);
239             acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
240 
241             vecIn0 = vld1q(&pSamplesTmp[3]);
242             acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
243 
244             pSamplesTmp += 8;
245             pCoeffsTmp += 8;
246             /*
247              * Decrement the taps block loop counter
248              */
249             i--;
250         }
251         //stopSectionNB(3);
252         //STOP_CYCLE_MEASUREMENT;
253 
254 
255         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
256         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
257         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
258         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
259 
260         pSamples += 4;
261         /*
262          * Decrement the sample block loop counter
263          */
264         blkCnt--;
265     }
266 
267     uint32_t  residual = blockSize & 3;
268     switch (residual)
269     {
270     case 3:
271         {
272             const q15_t    *pCoeffsTmp = pCoeffs;
273             const q15_t    *pSamplesTmp = pSamples;
274 
275             acc0 = 0LL;
276             acc1 = 0LL;
277             acc2 = 0LL;
278 
279             /*
280              * Save 8 input samples in the history buffer
281              */
282             *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
283             pStateCur += 8;
284             pTempSrc += 8;
285 
286             int       i = tapsBlkCnt;
287             while (i > 0)
288             {
289                 /*
290                  * load 8 coefs
291                  */
292                 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
293 
294                 vecIn0 = vld1q(pSamplesTmp);
295                 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
296 
297                 vecIn0 = vld1q(&pSamplesTmp[2]);
298                 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
299 
300                 vecIn0 = vld1q(&pSamplesTmp[4]);
301                 acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
302 
303                 pSamplesTmp += 8;
304                 pCoeffsTmp += 8;
305                 /*
306                  * Decrement the taps block loop counter
307                  */
308                 i--;
309             }
310 
311             acc0 = asrl(acc0, 15);
312             acc1 = asrl(acc1, 15);
313             acc2 = asrl(acc2, 15);
314 
315             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
316             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
317             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
318         }
319         break;
320 
321     case 2:
322         {
323             const q15_t    *pCoeffsTmp = pCoeffs;
324             const q15_t    *pSamplesTmp = pSamples;
325 
326             acc0 = 0LL;
327             acc1 = 0LL;
328             /*
329              * Save 8 input samples in the history buffer
330              */
331             vst1q(pStateCur, vld1q(pTempSrc));
332             pStateCur += 8;
333             pTempSrc += 8;
334 
335             int       i = tapsBlkCnt;
336             while (i > 0)
337             {
338                 /*
339                  * load 8 coefs
340                  */
341                 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
342 
343                 vecIn0 = vld1q(pSamplesTmp);
344                 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
345 
346                 vecIn0 = vld1q(&pSamplesTmp[2]);
347                 acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
348 
349                 pSamplesTmp += 8;
350                 pCoeffsTmp += 8;
351                 /*
352                  * Decrement the taps block loop counter
353                  */
354                 i--;
355             }
356 
357             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
358             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
359         }
360         break;
361 
362     case 1:
363         {
364             const q15_t    *pCoeffsTmp = pCoeffs;
365             const q15_t    *pSamplesTmp = pSamples;
366 
367             acc0 = 0LL;
368 
369             /*
370              * Save 8 input samples in the history buffer
371              */
372             vst1q(pStateCur, vld1q(pTempSrc));
373             pStateCur += 8;
374             pTempSrc += 8;
375 
376             int       i = tapsBlkCnt;
377             while (i > 0)
378             {
379                 /*
380                  * load 8 coefs
381                  */
382                 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
383 
384                 vecIn0 = vld1q(pSamplesTmp);
385                 acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
386 
387                 pSamplesTmp += 8;
388                 pCoeffsTmp += 8;
389                 /*
390                  * Decrement the taps block loop counter
391                  */
392                 i--;
393             }
394 
395             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
396         }
397         break;
398 
399     }
400 
401     /*
402      * Copy the samples back into the history buffer start
403      */
404     pTempSrc = &pState[blockSize];
405     pTempDest = pState;
406 
407     blkCnt = numTaps >> 3;
408     while (blkCnt > 0U)
409     {
410         vst1q(pTempDest, vld1q(pTempSrc));
411         pTempSrc += 8;
412         pTempDest += 8;
413         blkCnt--;
414     }
415     blkCnt = numTaps & 7;
416     if (blkCnt > 0U)
417     {
418         mve_pred16_t p0 = vctp16q(blkCnt);
419         vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
420     }
421 }
422 #endif
423 
424 template<typename T>
425 struct FirType;
426 
427 template<>
428 struct FirType<float32_t>
429 {
430    typedef arm_fir_instance_f32 type;
init_stateFirType431    static void init_state(type * S,
432                           uint16_t numTaps,
433                           const float32_t * pCoeffs,
434                                 float32_t * pState,
435                                 uint32_t blockSize)
436    {
437        arm_fir_init_f32(S,numTaps,pCoeffs,pState,blockSize);
438    };
439 
init_coefFirType440    static void init_coef(float32_t *coefs,uint16_t numTaps)
441    {
442      for(int i=0;i<numTaps;i++)
443      {
444         coefs[i] = 1.0f / numTaps;
445      }
446    };
447 };
448 
449 template<>
450 struct FirType<Q15>
451 {
452    typedef arm_fir_instance_q15 type;
init_stateFirType453    static void init_state(type * S,
454                           uint16_t numTaps,
455                           const Q15 * pCoeffs,
456                                 Q15 * pState,
457                                 uint32_t blockSize)
458    {
459        arm_fir_init_q15(S,numTaps,
460          reinterpret_cast<const q15_t*>(pCoeffs),
461          reinterpret_cast<q15_t*>(pState),blockSize);
462    };
463 
init_coefFirType464    static void init_coef(Q15 *coefs,uint16_t numTaps)
465    {
466      for(int i=0;i<numTaps;i++)
467      {
468         coefs[i] = Q15::f(1.0f / numTaps);
469      }
470    };
471 };
472 
473 
474 
475 template<typename T,int BLOCK,int TAPS>
476 struct FIR {
477 
FIRFIR478    FIR(const PVector<T,TAPS> &coefs):coef_(coefs),state_(T{})
479    {};
480 
481 
filterFIR482    PVector<T,BLOCK> filter(const PVector<T,BLOCK> &signal)
483    {
484       constexpr int UNROLL_FACTOR = 4;
485       PVector<T,BLOCK> res(T{});
486       using acc_type = typename number_traits<T>::accumulator;
487       std::array<acc_type,UNROLL_FACTOR> accu;
488       index_t i=0;
489 
490 #if defined(ARM_COMPUTE_DISABLE_UNROLL)
491       #pragma clang loop unroll(disable)
492 #endif
493       for(;i<=BLOCK-UNROLL_FACTOR;i+=UNROLL_FACTOR)
494       {
495 
496          state_.sub(TAPS-1+i,TAPS-1+i+UNROLL_FACTOR) = copy(signal.sub(i,i+UNROLL_FACTOR));
497 
498          //INIT_SYSTICK;
499          //START_CYCLE_MEASUREMENT;
500          //startSectionNB(2);
501          results(accu) =
502            dot(unroll<UNROLL_FACTOR>(
503                   [i,this](index_t k){return state_.sub(i+k,i+k+TAPS);}),
504                replicate<UNROLL_FACTOR>(coef_)
505               );
506          //stopSectionNB(2);
507          //STOP_CYCLE_MEASUREMENT;
508 
509          for(index_t k=0;k<UNROLL_FACTOR;k++)
510          {
511             res[i+k] = inner::from_accumulator(accu[k]);
512          }
513       }
514 
515 #if defined(ARM_COMPUTE_DISABLE_UNROLL)
516       #pragma clang loop unroll(disable)
517 #endif
518       for(;i<BLOCK;i++)
519       {
520          state_[TAPS-1+i] = signal[i];
521          acc_type acc = dot(state_.sub(i,i+TAPS),coef_);
522          res[i] = inner::from_accumulator(acc);
523       }
524 
525       state_.sub(0,TAPS) = copy(state_.sub(BLOCK,TAPS+BLOCK));
526       return(res);
527    }
528 
purecFIR529    void purec(const T *signal, T *dst)
530    {
531       const T* coef=coef_.const_ptr();
532       T *state = state_.ptr();
533 
534 #if defined(ARM_COMPUTE_DISABLE_UNROLL)
535       #pragma clang loop unroll(disable)
536 #endif
537       for(index_t i=0;i<BLOCK;i++)
538       {
539          T acc=T{};
540          state[TAPS-1+i] = signal[i];
541          for(index_t k=0;k<TAPS;k++)
542          {
543             acc += state[i+k]*coef[k];
544          }
545          dst[i] = acc;
546       }
547       for(index_t i=0;i<TAPS-1;i++)
548       {
549          state[i] = state[BLOCK+i];
550       }
551    }
552 
553    const PVector<T,TAPS> coef_;
554    PVector<T,TAPS+BLOCK-1> state_;
555 };
556 
557 template<typename T,int BLOCK,int TAPS>
test()558 static void test()
559 {
560    constexpr int NB = BLOCK;
561    std::cout << "----\r\n(" << BLOCK << "," << TAPS << ")\r\n";
562 
563    typename FirType<T>::type S;
564    PVector<T,NB> signal;
565    PVector<T,TAPS> coefs;
566 
567    FirType<T>::init_coef(coefs.ptr(),TAPS);
568 
569    init_array(signal,NB);
570    FIR<T,BLOCK,TAPS> fir(coefs);
571 
572 
573    INIT_SYSTICK;
574    START_CYCLE_MEASUREMENT;
575    startSectionNB(1);
576    PVector<T,BLOCK> res = fir.filter(signal);
577    //PVector<T,BLOCK> res;
578    //fir.purec(signal.const_ptr(),res.ptr());
579    stopSectionNB(1);
580    STOP_CYCLE_MEASUREMENT;
581 
582 
583    T* state;
584    T* coefsb;
585    state=(T*)malloc(sizeof(T)*(TAPS+BLOCK+BLOCK));
586    coefsb=(T*)malloc(sizeof(T)*(TAPS+32));
587    memset(coefsb,0,sizeof(T)*(TAPS+32));
588    for(int i =0;i<TAPS;i++)
589    {
590       coefsb[i] = coefs[i];
591    }
592    FirType<T>::init_state(&S,TAPS,coefsb,state,BLOCK);
593    PVector<T,NB> ref;
594    //std::cout << "---\r\n";
595 
596 
597    INIT_SYSTICK;
598    START_CYCLE_MEASUREMENT;
599    arm_fir_q15(&S,
600                     reinterpret_cast<const q15_t*>(signal.const_ptr()),
601                     reinterpret_cast<q15_t*>(ref.ptr()),BLOCK);
602    STOP_CYCLE_MEASUREMENT;
603 
604 
605    if (!validate(res.const_ptr(),ref.const_ptr(),BLOCK))
606    {
607       printf("fir failed \r\n");
608    }
609 
610    free(state);
611    free(coefsb);
612 
613 
614 }
615 
616 
617 
618 
619 template<typename T>
all_filter_test()620 void all_filter_test()
621 {
622 
623     title<T>("FIR test");
624 
625 
626     test<T,NBVEC_4,8>();
627     test<T,NBVEC_4,16>();
628     test<T,NBVEC_4,24>();
629     test<T,NBVEC_4,32>();
630     test<T,NBVEC_4,64>();
631 
632     test<T,NBVEC_32,8>();
633     test<T,NBVEC_32,16>();
634     test<T,NBVEC_32,24>();
635     test<T,NBVEC_32,32>();
636     test<T,NBVEC_32,64>();
637 
638     test<T,NBVEC_256,8>();
639     test<T,NBVEC_256,16>();
640     test<T,NBVEC_256,24>();
641     test<T,NBVEC_256,32>();
642     test<T,NBVEC_256,64>();
643 
644 
645     test<T,NBVEC_512,8>();
646     test<T,NBVEC_512,16>();
647     test<T,NBVEC_512,24>();
648     test<T,NBVEC_512,32>();
649     test<T,NBVEC_512,64>();
650 
651 
652 }
653 
filter_test()654 void filter_test()
655 {
656     //all_filter_test<Q15>();
657 }