/*
 * FIR filter test.
 *
 * test() runs the templated FIR struct on a signal, runs arm_fir_q15 on the
 * same signal, and prints a failure message when validate() rejects the
 * comparison of the two results.
 *
 * NOTE(review): this text looks whitespace-collapsed and appears to have lost
 * whatever was written between angle brackets (#include targets, template
 * parameter lists, parts of some loops). It is kept exactly as found; restore
 * it from the original file before trying to build it.
 *
 * Contents, in order:
 *   - MVE_ASRL_SAT16, FIR_Q15_CORE, FIR_Q15_MAIN_CORE: helper macros, inside
 *     the #if on ARM_MATH_MVEI / ARM_MATH_MVEF.
 *   - arm_fir_q15_25_32_mve, arm_fir_q15_17_24_mve, arm_fir_q15_9_16_mve,
 *     arm_fir_q15_1_8_mve: static kernels, each expanding FIR_Q15_MAIN_CORE
 *     with its own NBTAPS / NBVECTAPS.
 *   - debug_arm_fir_q15: picks one of those kernels from the tap count, or
 *     falls through to a generic loop.
 *   - FirType: trait structs exposing an instance typedef plus init_state and
 *     init_coef.
 *   - FIR: struct holding coef_ and state_, with a filter() method.
 *   - test, all_filter_test, filter_test: the harness. The call to
 *     all_filter_test inside filter_test is commented out.
 */
extern "C" { extern void filter_test(); } #include "allocator.h" #include #include #include #include #include #include #if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) /* Applies sqrshrl_sat48 to acc with a shift argument of -(32-shift), then keeps the bits above bit 31 of the result, masked to 32 bits. */ #define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff) /* For each of nbAcc outputs: accumulates nbVecTaps vector products with vmlaldavaq, reading samples 8 at a time, and stores the MVE_ASRL_SAT16 result through pOutput. Uses a vecIn0 variable that the expansion site must declare. */ #define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \ for (int j = 0; j < nbAcc; j++) { \ const q15_t *pSmp = &pSample[j]; \ q63_t acc[4]; \ \ acc[j] = 0; \ for (int i = 0; i < nbVecTaps; i++) { \ vecIn0 = vld1q(pSmp + 8 * i); \ acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]); \ } \ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15); \ } /* Function body shared by the fixed-size kernels; reads S, pSrc, pDst, blockSize and NBVECTAPS from the place where it is expanded. */ #define FIR_Q15_MAIN_CORE() \ { \ q15_t *pState = S->pState; /* State pointer */ \ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \ q15_t *pStateCur; /* Points to the current sample of the state */ \ const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \ q15_t *pOutput; /* Temporary pointer to the output buffer */ \ const q15_t *pTempSrc; /* Temporary pointer to the source data */ \ q15_t *pTempDest; /* Temporary pointer to the destination buffer */\ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\ int32_t blkCnt; \ q15x8_t vecIn0; \ \ /* \ * load coefs \ */ \ q15x8_t vecCoeffs[NBVECTAPS]; \ \ for (int i = 0; i < NBVECTAPS; i++) \ vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \ \ /* \ * pState points to state array which contains previous frame (numTaps - 1) samples \ * pStateCur points to the location where the new input data should be written \ */ \ pStateCur = &(pState[(numTaps - 1u)]); \ pTempSrc = pSrc; \ pSamples = pState; \ pOutput = pDst; \ \ blkCnt = blockSize >> 2; \ while (blkCnt > 0) { \ /* \ * Save 4 input samples in the history buffer \ */ \ vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \ pStateCur += 4; \ pTempSrc += 4; \ \ FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs); \ pSamples += 4; \ \ blkCnt--; \ } \ \ /* tail */ \ int32_t residual = blockSize & 3; \ \ for (int i = 0; i 
< residual; i++) \ *pStateCur++ = *pTempSrc++; \ \ FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs); \ \ /* \ * Copy the samples back into the history buffer start \ */ \ pTempSrc = &pState[blockSize]; \ pTempDest = pState; \ \ /* current compiler limitation */ \ blkCnt = (numTaps - 1) >> 3; \ while (blkCnt > 0) \ { \ vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \ pTempSrc += 8; \ pTempDest += 8; \ blkCnt--; \ } \ blkCnt = (numTaps - 1) & 7; \ if (blkCnt > 0) \ { \ mve_pred16_t p = vctp16q(blkCnt); \ vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \ } \ } /* Four kernels that expand FIR_Q15_MAIN_CORE with NBTAPS set to 32, 24, 16 and 8, giving NBVECTAPS of 4, 3, 2 and 1. */ static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc, q15_t * __restrict pDst, uint32_t blockSize) { #define NBTAPS 32 #define NBVECTAPS (NBTAPS / 8) FIR_Q15_MAIN_CORE(); #undef NBVECTAPS #undef NBTAPS } static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc, q15_t * __restrict pDst, uint32_t blockSize) { #define NBTAPS 24 #define NBVECTAPS (NBTAPS / 8) FIR_Q15_MAIN_CORE(); #undef NBVECTAPS #undef NBTAPS } static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc, q15_t * __restrict pDst, uint32_t blockSize) { #define NBTAPS 16 #define NBVECTAPS (NBTAPS / 8) FIR_Q15_MAIN_CORE(); #undef NBVECTAPS #undef NBTAPS } static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc, q15_t * __restrict pDst, uint32_t blockSize) { #define NBTAPS 8 #define NBVECTAPS (NBTAPS / 8) FIR_Q15_MAIN_CORE(); #undef NBVECTAPS #undef NBTAPS } /* Calls one of the fixed-size kernels and returns when (numTaps + 7) >> 3 is 1, 2, 3 or 4; for any other value it runs the generic loop that follows the switch. */ void debug_arm_fir_q15( const arm_fir_instance_q15 * S, const q15_t * pSrc, q15_t * pDst, uint32_t blockSize) { q15_t *pState = S->pState; /* State pointer */ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ q15_t *pStateCur; /* Points to the current sample of the state */ const q15_t *pSamples; /* Temporary pointer to the sample buffer */ q15_t *pOutput; /* Temporary pointer to the output buffer */ const q15_t 
*pTempSrc; /* Temporary pointer to the source data */ q15_t *pTempDest; /* Temporary pointer to the destination buffer */ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ uint32_t blkCnt; q15x8_t vecIn0; uint32_t tapsBlkCnt = (numTaps + 7) / 8; q63_t acc0, acc1, acc2, acc3; int32_t nbTaps = (numTaps + 7) >> 3; switch(nbTaps) { case 1: arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize); return; case 2: arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize); return; case 3: arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize); return; case 4: arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize); return; } /* * pState points to state array which contains previous frame (numTaps - 1) samples * pStateCur points to the location where the new input data should be written */ pStateCur = &(pState[(numTaps - 1u)]); pTempSrc = pSrc; pSamples = pState; pOutput = pDst; blkCnt = blockSize >> 2; while (blkCnt > 0U) { const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; acc0 = 0LL; acc1 = 0LL; acc2 = 0LL; acc3 = 0LL; /* * Save 8 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 8; pTempSrc += 8; /* NOTE(review): each pass of this loop writes 4 outputs and advances pSamples by 4, yet moves pStateCur and pTempSrc by 8 -- confirm this is intended. */ //INIT_SYSTICK; //START_CYCLE_MEASUREMENT; int i = tapsBlkCnt; //startSectionNB(3); while (i > 0) { /* * load 8 coefs */ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; vecIn0 = vld1q(pSamplesTmp); acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); vecIn0 = vld1q(&pSamplesTmp[1]); acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); vecIn0 = vld1q(&pSamplesTmp[2]); acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); vecIn0 = vld1q(&pSamplesTmp[3]); acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs); pSamplesTmp += 8; pCoeffsTmp += 8; /* * Decrement the taps block loop counter */ i--; } //stopSectionNB(3); //STOP_CYCLE_MEASUREMENT; *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15); pSamples += 4; /* * Decrement the sample 
block loop counter */ blkCnt--; } uint32_t residual = blockSize & 3; switch (residual) { case 3: { const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; acc0 = 0LL; acc1 = 0LL; acc2 = 0LL; /* * Save 8 input samples in the history buffer */ *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc; pStateCur += 8; pTempSrc += 8; int i = tapsBlkCnt; while (i > 0) { /* * load 8 coefs */ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; vecIn0 = vld1q(pSamplesTmp); acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); vecIn0 = vld1q(&pSamplesTmp[2]); acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); vecIn0 = vld1q(&pSamplesTmp[4]); acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); pSamplesTmp += 8; pCoeffsTmp += 8; /* * Decrement the taps block loop counter */ i--; } /* NOTE(review): only case 3 applies asrl(acc, 15) before MVE_ASRL_SAT16(acc, 15); cases 2 and 1 and the main loop do not. The residual cases also load samples at offsets 0, 2, 4 where the main loop uses 0, 1, 2, 3 -- confirm both. */ acc0 = asrl(acc0, 15); acc1 = asrl(acc1, 15); acc2 = asrl(acc2, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); } break; case 2: { const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; acc0 = 0LL; acc1 = 0LL; /* * Save 8 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 8; pTempSrc += 8; int i = tapsBlkCnt; while (i > 0) { /* * load 8 coefs */ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; vecIn0 = vld1q(pSamplesTmp); acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); vecIn0 = vld1q(&pSamplesTmp[2]); acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); pSamplesTmp += 8; pCoeffsTmp += 8; /* * Decrement the taps block loop counter */ i--; } *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); } break; case 1: { const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; acc0 = 0LL; /* * Save 8 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 8; pTempSrc += 8; int i = tapsBlkCnt; while (i > 0) { /* * load 8 coefs */ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; vecIn0 = vld1q(pSamplesTmp); acc0 = 
vmlaldavaq(acc0, vecIn0, vecCoeffs); pSamplesTmp += 8; pCoeffsTmp += 8; /* * Decrement the taps block loop counter */ i--; } *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); } break; } /* * Copy the samples back into the history buffer start */ pTempSrc = &pState[blockSize]; pTempDest = pState; /* NOTE(review): this copies numTaps samples, while FIR_Q15_MAIN_CORE copies numTaps - 1 -- confirm which is intended. */ blkCnt = numTaps >> 3; while (blkCnt > 0U) { vst1q(pTempDest, vld1q(pTempSrc)); pTempSrc += 8; pTempDest += 8; blkCnt--; } blkCnt = numTaps & 7; if (blkCnt > 0U) { mve_pred16_t p0 = vctp16q(blkCnt); vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0); } } #endif /* Trait structs: each specialization exposes an instance typedef named type, an init_state that forwards to the matching arm_fir_init_* routine, and an init_coef that fills a coefficient array. */ template struct FirType; template<> struct FirType { typedef arm_fir_instance_f32 type; static void init_state(type * S, uint16_t numTaps, const float32_t * pCoeffs, float32_t * pState, uint32_t blockSize) { arm_fir_init_f32(S,numTaps,pCoeffs,pState,blockSize); }; static void init_coef(float32_t *coefs,uint16_t numTaps) { for(int i=0;i struct FirType { typedef arm_fir_instance_q15 type; static void init_state(type * S, uint16_t numTaps, const Q15 * pCoeffs, Q15 * pState, uint32_t blockSize) { arm_fir_init_q15(S,numTaps, reinterpret_cast(pCoeffs), reinterpret_cast(pState),blockSize); }; static void init_coef(Q15 *coefs,uint16_t numTaps) { for(int i=0;i struct FIR { FIR(const PVector &coefs):coef_(coefs),state_(T{}) {}; PVector filter(const PVector &signal) { constexpr int UNROLL_FACTOR = 4; PVector res(T{}); using acc_type = typename number_traits::accumulator; std::array accu; index_t i=0; #if defined(ARM_COMPUTE_DISABLE_UNROLL) #pragma clang loop unroll(disable) #endif for(;i<=BLOCK-UNROLL_FACTOR;i+=UNROLL_FACTOR) { state_.sub(TAPS-1+i,TAPS-1+i+UNROLL_FACTOR) = copy(signal.sub(i,i+UNROLL_FACTOR)); //INIT_SYSTICK; //START_CYCLE_MEASUREMENT; //startSectionNB(2); results(accu) = dot(unroll( [i,this](index_t k){return state_.sub(i+k,i+k+TAPS);}), replicate(coef_) ); //stopSectionNB(2); //STOP_CYCLE_MEASUREMENT; for(index_t k=0;k coef_; PVector state_; }; template static void test() { constexpr int NB = BLOCK; std::cout << 
"----\r\n(" << BLOCK << "," << TAPS << ")\r\n"; typename FirType::type S; PVector signal; PVector coefs; FirType::init_coef(coefs.ptr(),TAPS); init_array(signal,NB); FIR fir(coefs); INIT_SYSTICK; START_CYCLE_MEASUREMENT; startSectionNB(1); PVector res = fir.filter(signal); //PVector res; //fir.purec(signal.const_ptr(),res.ptr()); stopSectionNB(1); STOP_CYCLE_MEASUREMENT; T* state; T* coefsb; state=(T*)malloc(sizeof(T)*(TAPS+BLOCK+BLOCK)); coefsb=(T*)malloc(sizeof(T)*(TAPS+32)); memset(coefsb,0,sizeof(T)*(TAPS+32)); for(int i =0;i::init_state(&S,TAPS,coefsb,state,BLOCK); PVector ref; //std::cout << "---\r\n"; INIT_SYSTICK; START_CYCLE_MEASUREMENT; arm_fir_q15(&S, reinterpret_cast(signal.const_ptr()), reinterpret_cast(ref.ptr()),BLOCK); STOP_CYCLE_MEASUREMENT; if (!validate(res.const_ptr(),ref.const_ptr(),BLOCK)) { printf("fir failed \r\n"); } free(state); free(coefsb); } template void all_filter_test() { title("FIR test"); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); test(); } void filter_test() { //all_filter_test(); }