1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_q31.c
4  * Description:  Q31 FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 
32 /**
33   @ingroup groupFilters
34  */
35 
36 /**
37   @addtogroup FIR
38   @{
39  */
40 
41 /**
42   @brief         Processing function for Q31 FIR filter.
43   @param[in]     S          points to an instance of the Q31 FIR filter structure
44   @param[in]     pSrc       points to the block of input data
45   @param[out]    pDst       points to the block of output data
46   @param[in]     blockSize  number of samples to process
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using an internal 64-bit accumulator.
50                    The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
51                    Thus, if the accumulator result overflows, it wraps around rather than clipping.
52                    In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
53                    After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
54 
55  @remark
56                    Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter.
57  */
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59 
60 #include "arm_helium_utils.h"
61 
62 
/*
 * FIR_Q31_CORE - compute nbAcc consecutive FIR output samples.
 *
 *   nbAcc      number of output samples to produce
 *   nbVecTaps  number of 4-lane coefficient vectors accumulated per output
 *   pSample    pointer to the history sample at which output 0 starts;
 *              output j starts at pSample[j]
 *   vecCoeffs  indexable set of at least nbVecTaps q31x4_t coefficient vectors
 *
 * For every output the macro accumulates nbVecTaps vector products with
 * vrmlaldavhaq, shifts the 64-bit sum right by 23 with asrl and writes the
 * result through `pOutput`, which must exist in the expansion scope and is
 * advanced by one element per output.
 *
 * Each output reads 4 * nbVecTaps samples starting at pSample[j].
 */
#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs)                 \
        for (int j = 0; j < (nbAcc); j++) {                                \
            const q31_t    *pSmp = &(pSample)[j];                          \
            q31x4_t         vecIn0;                                        \
            q63_t           acc = 0;                                       \
                                                                           \
            for (int i = 0; i < (nbVecTaps); i++) {                        \
                vecIn0 = vld1q(pSmp + 4 * i);                              \
                acc = vrmlaldavhaq(acc, vecIn0, (vecCoeffs)[i]);           \
            }                                                              \
            *pOutput++ = (q31_t)asrl(acc, 23);                             \
        }
76 
77 
/*
 * FIR_Q31_CORE_STR_PARTIAL - accumulate nbVecTaps coefficient vectors for
 * nbAcc consecutive outputs and store the raw 64-bit partial sums instead of
 * final outputs.
 *
 *   nbAcc      number of partial sums to produce
 *   nbVecTaps  number of 4-lane coefficient vectors accumulated per sum
 *   pSample    pointer to the history sample at which sum 0 starts;
 *              sum j starts at pSample[j]
 *   vecCoeffs  indexable set of at least nbVecTaps q31x4_t coefficient vectors
 *
 * Names taken from the expansion scope:
 *   acc                       indexable q63_t storage with at least nbAcc slots
 *   arm_fir_partial_accu_ptr  q63_t write cursor, advanced once per sum
 */
#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)     \
        for (int j = 0; j < (nbAcc); j++) {                                \
            const q31_t    *pSmp = &(pSample)[j];                          \
            q31x4_t         vecIn0;                                        \
                                                                           \
            acc[j] = 0;                                                    \
            for (int i = 0; i < (nbVecTaps); i++) {                        \
                vecIn0 = vld1q(pSmp + 4 * i);                              \
                acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);     \
            }                                                              \
            *arm_fir_partial_accu_ptr++ = acc[j];                          \
        }
90 
91 
/*
 * FIR_Q31_CORE_LD_PARTIAL - resume nbAcc previously stored 64-bit partial
 * sums, accumulate nbVecTaps further coefficient vectors onto each of them
 * and emit the final outputs.
 *
 *   nbAcc      number of outputs to produce
 *   nbVecTaps  number of 4-lane coefficient vectors accumulated per output
 *   pSample    pointer to the history sample at which output 0 starts;
 *              output j starts at pSample[j]
 *   vecCoeffs  indexable set of at least nbVecTaps q31x4_t coefficient vectors
 *
 * Names taken from the expansion scope:
 *   acc                       indexable q63_t storage with at least nbAcc slots
 *   arm_fir_partial_accu_ptr  q63_t read cursor, advanced once per output
 *   pOutput                   q31_t write cursor, advanced once per output
 *
 * The final sum is shifted right by 23 with asrl before being stored.
 */
#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)      \
        for (int j = 0; j < (nbAcc); j++) {                                \
            const q31_t    *pSmp = &(pSample)[j];                          \
            q31x4_t         vecIn0;                                        \
                                                                           \
            acc[j] = *arm_fir_partial_accu_ptr++;                          \
                                                                           \
            for (int i = 0; i < (nbVecTaps); i++) {                        \
                vecIn0 = vld1q(pSmp + 4 * i);                              \
                acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);     \
            }                                                              \
            *pOutput++ = (q31_t)asrl(acc[j], 23);                          \
        }
105 
106 
/*
 * FIR_Q31_MAIN_CORE - complete function body shared by the fixed-size MVE
 * kernels.
 *
 * Names taken from the expansion scope:
 *   S, pSrc, pDst, blockSize   the kernel's parameters
 *   NBVECTAPS                  number of 4-lane coefficient vectors to use
 *
 * Steps:
 *   1. load NBVECTAPS coefficient vectors from S->pCoeffs
 *   2. for each group of 4 input samples: append them to the history and
 *      produce 4 outputs with FIR_Q31_CORE
 *   3. append the remaining (blockSize & 3) samples one by one and produce
 *      the matching number of outputs
 *   4. move the last (numTaps - 1) history samples back to the start of the
 *      history buffer
 *
 * `pSamples` and `pOutput` keep these exact names because FIR_Q31_CORE
 * refers to them.
 */
#define FIR_Q31_MAIN_CORE()                                                              \
{                                                                                        \
    /* The history area starts 2*ARM_ROUND_UP(blockSize, 4) words into S->pState */      \
    q31_t * const   pState   = S->pState + 2*ARM_ROUND_UP(blockSize, 4);                 \
    const uint32_t  numTaps  = S->numTaps;                                               \
    const q31_t    *pIn      = pSrc;                    /* next input sample to read */  \
    q31_t          *pHistWr  = &pState[numTaps - 1u];   /* next history slot to fill */  \
    const q31_t    *pSamples = pState;                  /* oldest sample of output 0 */  \
    q31_t          *pOutput  = pDst;                    /* next output slot */           \
    q31x4_t         vecCoeffs[NBVECTAPS];                                                \
                                                                                         \
    for (int k = 0; k < NBVECTAPS; k++)                                                  \
        vecCoeffs[k] = vld1q(S->pCoeffs + 4 * k);                                        \
                                                                                         \
    /* Full groups of four samples */                                                    \
    for (int32_t blk = blockSize >> 2; blk > 0; blk--) {                                 \
        vstrwq_s32(pHistWr, vldrwq_s32(pIn));                                            \
        pHistWr += 4;                                                                    \
        pIn += 4;                                                                        \
                                                                                         \
        FIR_Q31_CORE(4, NBVECTAPS, pSamples, vecCoeffs);                                 \
                                                                                         \
        pSamples += 4;                                                                   \
    }                                                                                    \
                                                                                         \
    /* Left-over samples: copied individually so nothing past pSrc is read */            \
    const int32_t   nbTail = blockSize & 3;                                              \
    for (int32_t n = 0; n < nbTail; n++)                                                 \
        *pHistWr++ = *pIn++;                                                             \
                                                                                         \
    /* Constant output counts are passed so each case keeps a fixed loop bound */        \
    if (nbTail == 3) {                                                                   \
        FIR_Q31_CORE(3, NBVECTAPS, pSamples, vecCoeffs);                                 \
    } else if (nbTail == 2) {                                                            \
        FIR_Q31_CORE(2, NBVECTAPS, pSamples, vecCoeffs);                                 \
    } else if (nbTail == 1) {                                                            \
        FIR_Q31_CORE(1, NBVECTAPS, pSamples, vecCoeffs);                                 \
    }                                                                                    \
                                                                                         \
    /* Move the newest (numTaps - 1) samples to the front of the history */              \
    const q31_t    *pFrom = &pState[blockSize];                                          \
    q31_t          *pTo   = pState;                                                      \
                                                                                         \
    for (int32_t vec = (numTaps - 1) >> 2; vec > 0; vec--) {                             \
        vstrwq_s32(pTo, vldrwq_s32(pFrom));                                              \
        pFrom += 4;                                                                      \
        pTo += 4;                                                                        \
    }                                                                                    \
                                                                                         \
    const int32_t   nbLast = (numTaps - 1) & 3;                                          \
    if (nbLast > 0) {                                                                    \
        /* Predicated access: only nbLast lanes are loaded and stored */                 \
        mve_pred16_t p0 = vctp32q(nbLast);                                               \
        vstrwq_p_s32(pTo, vldrwq_z_s32(pFrom, p0), p0);                                  \
    }                                                                                    \
}
207 
/**
  @brief         MVE kernel used by arm_fir_q31() when numTaps <= 4.
  @param[in]     S          points to an instance of the Q31 FIR filter structure
  @param[in]     pSrc       points to the block of input data
  @param[out]    pDst       points to the block of output data
  @param[in]     blockSize  number of samples to process

  @par           Notes
                   One full 4-lane vector is loaded from S->pCoeffs whatever the
                   value of numTaps, and every output reads 4 consecutive history
                   samples.
                   The history area starts 2*ARM_ROUND_UP(blockSize, 4) words into
                   S->pState.
                   The (blockSize & 3) trailing samples are copied one by one, so
                   no element beyond pSrc[blockSize - 1] is read and no history
                   slot beyond the new samples is written.
 */
static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    q31_t          *pState = S->pState + 2*ARM_ROUND_UP(blockSize, 4); /* History area */
    const q31_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */
    q31_t          *pStateCur;              /* Next history slot to fill */
    const q31_t    *pSamples;               /* Oldest history sample of the next output */
    q31_t          *pOutput;                /* Next output slot */
    const q31_t    *pTempSrc;               /* Read cursor (input block, then history) */
    q31_t          *pTempDest;              /* Write cursor for the history compaction */
    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients */
    uint32_t        blkCnt;
    uint32_t        residual;
    q31x4_t         vecIn0;
    q63_t           acc0, acc1, acc2, acc3;

    /*
     * pState holds the previous frame's (numTaps - 1) samples;
     * new input is appended right after them.
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pTempSrc = pSrc;
    pSamples = pState;
    pOutput = pDst;

    /*
     * Load the 4 coefficients (vld1q keeps the const qualifier intact)
     */
    const q31x4_t   vecCoeffs = vld1q(pCoeffs);

    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * Save 4 input samples in the history buffer
         */
        vst1q(pStateCur, vld1q(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;

        vecIn0 = vld1q(pSamples);
        acc0 = vrmlaldavhq(vecIn0, vecCoeffs);

        vecIn0 = vld1q(&pSamples[1]);
        acc1 = vrmlaldavhq(vecIn0, vecCoeffs);

        vecIn0 = vld1q(&pSamples[2]);
        acc2 = vrmlaldavhq(vecIn0, vecCoeffs);

        vecIn0 = vld1q(&pSamples[3]);
        acc3 = vrmlaldavhq(vecIn0, vecCoeffs);

        acc0 = asrl(acc0, 23);
        acc1 = asrl(acc1, 23);
        acc2 = asrl(acc2, 23);
        acc3 = asrl(acc3, 23);

        *pOutput++ = (q31_t) acc0;
        *pOutput++ = (q31_t) acc1;
        *pOutput++ = (q31_t) acc2;
        *pOutput++ = (q31_t) acc3;

        pSamples += 4;
        blkCnt--;
    }

    /*
     * Tail: 1 to 3 samples remain. Only these samples are copied; a full
     * vector load here would read past the end of the input block.
     */
    residual = blockSize & 3U;
    if (residual > 0U)
    {
        for (uint32_t i = 0U; i < residual; i++)
        {
            *pStateCur++ = *pTempSrc++;
        }

        for (uint32_t j = 0U; j < residual; j++)
        {
            vecIn0 = vld1q(&pSamples[j]);
            acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
            acc0 = asrl(acc0, 23);
            *pOutput++ = (q31_t) acc0;
        }
    }

    /*
     * Copy the samples back into the history buffer start
     */
    pTempSrc = &pState[blockSize];
    pTempDest = pState;

    blkCnt = (numTaps - 1) >> 2;
    while (blkCnt > 0U)
    {
        vst1q(pTempDest, vld1q(pTempSrc));
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt--;
    }
    blkCnt = (numTaps - 1) & 3;
    if (blkCnt > 0U)
    {
        /* Load and store are both predicated so only blkCnt lanes are touched */
        mve_pred16_t p0 = vctp32q(blkCnt);
        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0);
    }
}
375 
376 
377 
/*
 * MVE kernel selected by arm_fir_q31() for 5 to 8 taps.
 * Expands FIR_Q31_MAIN_CORE with two 4-lane coefficient vectors (8 taps).
 */
static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    /* 8 taps / 4 lanes per vector */
    #define NBVECTAPS 2
    FIR_Q31_MAIN_CORE();
    #undef NBVECTAPS
}
388 
389 
/*
 * MVE kernel selected by arm_fir_q31() for 9 to 12 taps.
 * Expands FIR_Q31_MAIN_CORE with three 4-lane coefficient vectors (12 taps).
 */
static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    /* 12 taps / 4 lanes per vector */
    #define NBVECTAPS 3
    FIR_Q31_MAIN_CORE();
    #undef NBVECTAPS
}
400 
401 
/*
 * MVE kernel selected by arm_fir_q31() for 13 to 16 taps.
 * Expands FIR_Q31_MAIN_CORE with four 4-lane coefficient vectors (16 taps).
 */
static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    /* 16 taps / 4 lanes per vector */
    #define NBVECTAPS 4
    FIR_Q31_MAIN_CORE();
    #undef NBVECTAPS
}
412 
413 
/*
 * MVE kernel selected by arm_fir_q31() for 17 to 20 taps.
 * Expands FIR_Q31_MAIN_CORE with five 4-lane coefficient vectors (20 taps).
 */
static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    /* 20 taps / 4 lanes per vector */
    #define NBVECTAPS 5
    FIR_Q31_MAIN_CORE();
    #undef NBVECTAPS
}
424 
425 
/*
 * MVE kernel selected by arm_fir_q31() for 21 to 24 taps.
 * Expands FIR_Q31_MAIN_CORE with six 4-lane coefficient vectors (24 taps).
 */
static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    /* 24 taps / 4 lanes per vector */
    #define NBVECTAPS 6
    FIR_Q31_MAIN_CORE();
    #undef NBVECTAPS
}
436 
437 
/*
 * MVE kernel selected by arm_fir_q31() for 25 to 28 taps.
 * Expands FIR_Q31_MAIN_CORE with seven 4-lane coefficient vectors (28 taps).
 */
static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst, uint32_t blockSize)
{
    /* 28 taps / 4 lanes per vector */
    #define NBVECTAPS 7
    FIR_Q31_MAIN_CORE();
    #undef NBVECTAPS
}
448 
/**
  @brief         MVE kernel used by arm_fir_q31() for 29 to 32 taps.
  @param[in]     S          points to an instance of the Q31 FIR filter structure
  @param[in]     pSrc       points to the block of input data
  @param[out]    pDst       points to the block of output data
  @param[in]     blockSize  number of samples to process

  @par           Algorithm
                   The filter is evaluated in two passes over the block:
                   - pass 1 appends the input to the history and accumulates the
                     first 28 taps of every output; the 64-bit partial sums are
                     parked at the start of S->pState
                   - pass 2 reloads each partial sum, adds the last 4 taps and
                     writes the output.

                   Both passes handle groups of 4 samples. The remaining
                   (blockSize & 3) samples are then processed one output at a
                   time with all 8 coefficient vectors, so blockSize need not be
                   a multiple of 4.

                   Eight full coefficient vectors (32 values) are loaded from
                   S->pCoeffs whatever the value of numTaps.
 */
static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S,
    const q31_t * __restrict pSrc,
    q31_t * __restrict pDst,
                               uint32_t blockSize)
{
    q31_t          *pState = S->pState + 2*ARM_ROUND_UP(blockSize, 4); /* History area */
    const q31_t    *pCoeffs = S->pCoeffs;       /* Coefficient pointer */
    q31_t          *pStateCur;  /* Next history slot to fill */
    const q31_t    *pSamples;   /* Oldest history sample of the next output */
    q31_t          *pOutput;    /* Next output slot */
    const q31_t    *pTempSrc;   /* Read cursor (input block, then history) */
    q31_t          *pTempDest;  /* Write cursor for the history compaction */
    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients */
    uint32_t        residual = blockSize & 3U;  /* Samples left after the groups of 4 */
    int32_t         blkCnt;
    q63_t           acc0, acc1, acc2, acc3;
    q31x4_t         vecIn0;

#define MAX_VECT_BATCH 7

/*
 * Accumulate the first 28 taps (coefficient vectors 0..6) of the output whose
 * oldest history sample is at pSmp, leaving the 64-bit sum in acc.
 */
#define FIR_Q31_MAC_28TAPS(acc, pSmp)                               \
    do {                                                            \
        vecIn0 = vld1q((pSmp));                                     \
        (acc) = vrmlaldavhq(vecIn0, vecCoeffs0);                    \
        vecIn0 = vld1q((pSmp) + 4 * 1);                             \
        (acc) = vrmlaldavhaq((acc), vecIn0, vecCoeffs1);            \
        vecIn0 = vld1q((pSmp) + 4 * 2);                             \
        (acc) = vrmlaldavhaq((acc), vecIn0, vecCoeffs2);            \
        vecIn0 = vld1q((pSmp) + 4 * 3);                             \
        (acc) = vrmlaldavhaq((acc), vecIn0, vecCoeffs3);            \
        vecIn0 = vld1q((pSmp) + 4 * 4);                             \
        (acc) = vrmlaldavhaq((acc), vecIn0, vecCoeffs4);            \
        vecIn0 = vld1q((pSmp) + 4 * 5);                             \
        (acc) = vrmlaldavhaq((acc), vecIn0, vecCoeffs5);            \
        vecIn0 = vld1q((pSmp) + 4 * 6);                             \
        (acc) = vrmlaldavhaq((acc), vecIn0, vecCoeffs6);            \
    } while (0)

    /*
     * pre-load the first 28 coefs
     */
    const q31x4_t   vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
    const q31x4_t   vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
    const q31x4_t   vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
    const q31x4_t   vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
    const q31x4_t   vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
    const q31x4_t   vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
    const q31x4_t   vecCoeffs6 = vld1q(pCoeffs + 4 * 6);

    /*
     * pState holds the previous frame's (numTaps - 1) samples;
     * new input is appended right after them.
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pTempSrc = pSrc;
    pSamples = pState;

    /* Partial sums are parked in the words of S->pState that precede the history */
    q63_t          *arm_fir_partial_accu_ptr = (q63_t*)S->pState;

    /*
     * Pass 1: store the input and accumulate taps 0..27
     */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0) {
        /*
         * Save 4 input samples in the history buffer
         */
        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;

        FIR_Q31_MAC_28TAPS(acc0, &pSamples[0]);
        *arm_fir_partial_accu_ptr++ = acc0;

        FIR_Q31_MAC_28TAPS(acc1, &pSamples[1]);
        *arm_fir_partial_accu_ptr++ = acc1;

        FIR_Q31_MAC_28TAPS(acc2, &pSamples[2]);
        *arm_fir_partial_accu_ptr++ = acc2;

        FIR_Q31_MAC_28TAPS(acc3, &pSamples[3]);
        *arm_fir_partial_accu_ptr++ = acc3;

        pSamples += 4;
        blkCnt--;
    }

    /*
     * Pass 2: add the remaining taps 28..31 and emit the outputs
     */
    const q31x4_t   vecCoeffs7 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);

    arm_fir_partial_accu_ptr = (q63_t*)S->pState;
    pOutput = pDst;
    pSamples = pState + (MAX_VECT_BATCH * 4);

    blkCnt = blockSize >> 2;
    while (blkCnt > 0) {
        /* reload intermediate MAC */
        acc0 = *arm_fir_partial_accu_ptr++;
        acc1 = *arm_fir_partial_accu_ptr++;
        acc2 = *arm_fir_partial_accu_ptr++;
        acc3 = *arm_fir_partial_accu_ptr++;

        vecIn0 = vld1q(&pSamples[0]);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs7);

        vecIn0 = vld1q(&pSamples[1]);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs7);

        vecIn0 = vld1q(&pSamples[2]);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs7);

        vecIn0 = vld1q(&pSamples[3]);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs7);

        *pOutput++ = (q31_t) asrl(acc0, 23);
        *pOutput++ = (q31_t) asrl(acc1, 23);
        *pOutput++ = (q31_t) asrl(acc2, 23);
        *pOutput++ = (q31_t) asrl(acc3, 23);

        pSamples += 4;
        blkCnt--;
    }

    /*
     * Tail: the two passes above only cover groups of 4 samples. The last
     * (blockSize & 3) samples are appended to the history individually, so
     * nothing past the input block is read, and each of their outputs is
     * computed in one go with all 32 taps.
     */
    if (residual > 0U) {
        /* Oldest history sample of the first tail output */
        const q31_t    *pTail = pState + (blockSize - residual);

        for (uint32_t i = 0U; i < residual; i++) {
            *pStateCur++ = *pTempSrc++;
        }

        for (uint32_t j = 0U; j < residual; j++) {
            FIR_Q31_MAC_28TAPS(acc0, &pTail[j]);

            vecIn0 = vld1q(&pTail[j] + 4 * MAX_VECT_BATCH);
            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs7);

            *pOutput++ = (q31_t) asrl(acc0, 23);
        }
    }

    /*
     * Copy the samples back into the history buffer start
     */
    pTempSrc = &pState[blockSize];
    pTempDest = pState;

    blkCnt = numTaps - 1;
    do {
        /* vctp32q enables min(blkCnt, 4) lanes, so the last vector is partial */
        mve_pred16_t    p = vctp32q(blkCnt);

        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt -= 4;
    }
    while (blkCnt > 0);

#undef FIR_Q31_MAC_28TAPS
}
645 
646 
647 
arm_fir_q31(const arm_fir_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)648 ARM_DSP_ATTRIBUTE void arm_fir_q31(
649   const arm_fir_instance_q31 * S,
650   const q31_t * pSrc,
651         q31_t * pDst,
652         uint32_t blockSize)
653 {
654     q31_t *pRefStatePtr = S->pState + 2*ARM_ROUND_UP(blockSize, 4);
655     q31_t      *pState = pRefStatePtr; /* State pointer */
656     const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
657     q31_t    *pStateCur;        /* Points to the current sample of the state */
658     const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
659     q31_t    *pOutput;          /* Temporary pointer to the output buffer */
660     const q31_t    *pTempSrc;         /* Temporary pointer to the source data */
661     q31_t    *pTempDest;        /* Temporary pointer to the destination buffer */
662     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
663     uint32_t  blkCnt;
664     q31x4_t vecIn0;
665     uint32_t  tapsBlkCnt = (numTaps + 3) / 4;
666     q63_t     acc0, acc1, acc2, acc3;
667     q31x4_t vecCoeffs;
668 
669 
670     /*
671      * [1 to 32 taps] specialized routines
672      */
673     if (numTaps <= 4)
674     {
675         arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
676         return;
677     }
678     else if (numTaps <= 8)
679     {
680         arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
681         return;
682     }
683     else if (numTaps <= 12)
684     {
685         arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
686         return;
687     }
688     else if (numTaps <= 16)
689     {
690         arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
691         return;
692     }
693     else if (numTaps <= 20)
694     {
695         arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
696         return;
697     }
698     else if (numTaps <= 24)
699     {
700         arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
701         return;
702     }
703     else if (numTaps <= 28)
704     {
705         arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
706         return;
707     }
708     else if ((numTaps <= 32)  && (blockSize >= 32))
709     {
710         arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
711         return;
712     }
713 
714     /*
715      * pState points to state array which contains previous frame (numTaps - 1) samples
716      * pStateCur points to the location where the new input data should be written
717      */
718     pStateCur   = &(pState[(numTaps - 1u)]);
719     pSamples    = pState;
720     pTempSrc    = pSrc;
721     pOutput     = pDst;
722     blkCnt      = blockSize >> 2;
723     while (blkCnt > 0)
724     {
725         const q31_t    *pCoeffsTmp = pCoeffs;
726         const q31_t    *pSamplesTmp = pSamples;
727 
728         acc0 = 0LL;
729         acc1 = 0LL;
730         acc2 = 0LL;
731         acc3 = 0LL;
732 
733         /*
734          * Save 4 input samples in the history buffer
735          */
736         vst1q(pStateCur, vld1q(pTempSrc));
737         pStateCur += 4;
738         pTempSrc += 4;
739 
740         int       i = tapsBlkCnt;
741         while (i > 0)
742         {
743             /*
744              * load 4 coefs
745              */
746             vecCoeffs = *(q31x4_t *) pCoeffsTmp;
747 
748             vecIn0 = vld1q(pSamplesTmp);
749             acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
750 
751             vecIn0 = vld1q(&pSamplesTmp[1]);
752             acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
753 
754             vecIn0 = vld1q(&pSamplesTmp[2]);
755             acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
756 
757             vecIn0 = vld1q(&pSamplesTmp[3]);
758             acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
759 
760             pSamplesTmp += 4;
761             pCoeffsTmp += 4;
762             /*
763              * Decrement the taps block loop counter
764              */
765             i--;
766         }
767 
768         /* .54-> .31 conversion and store accumulators */
769         acc0 = asrl(acc0, 23);
770         acc1 = asrl(acc1, 23);
771         acc2 = asrl(acc2, 23);
772         acc3 = asrl(acc3, 23);
773 
774         *pOutput++ = (q31_t) acc0;
775         *pOutput++ = (q31_t) acc1;
776         *pOutput++ = (q31_t) acc2;
777         *pOutput++ = (q31_t) acc3;
778 
779         pSamples += 4;
780 
781         /*
782          * Decrement the sample block loop counter
783          */
784         blkCnt--;
785     }
786 
787     int32_t  residual = blockSize & 3;
788     switch (residual)
789     {
790     case 3:
791         {
792             const q31_t    *pCoeffsTmp = pCoeffs;
793             const q31_t    *pSamplesTmp = pSamples;
794 
795             acc0 = 0LL;
796             acc1 = 0LL;
797             acc2 = 0LL;
798 
799             /*
800              * Save 4 input samples in the history buffer
801              */
802             *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
803             pStateCur += 4;
804             pTempSrc += 4;
805 
806             int       i = tapsBlkCnt;
807             while (i > 0)
808             {
809                 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
810 
811                 vecIn0 = vld1q(pSamplesTmp);
812                 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
813 
814                 vecIn0 = vld1q(&pSamplesTmp[1]);
815                 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
816 
817                 vecIn0 = vld1q(&pSamplesTmp[2]);
818                 acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
819 
820                 pSamplesTmp += 4;
821                 pCoeffsTmp += 4;
822                 i--;
823             }
824 
825             acc0 = asrl(acc0, 23);
826             acc1 = asrl(acc1, 23);
827             acc2 = asrl(acc2, 23);
828 
829             *pOutput++ = (q31_t) acc0;
830             *pOutput++ = (q31_t) acc1;
831             *pOutput++ = (q31_t) acc2;
832         }
833         break;
834 
835     case 2:
836         {
837             const q31_t    *pCoeffsTmp = pCoeffs;
838             const q31_t    *pSamplesTmp = pSamples;
839 
840             acc0 = 0LL;
841             acc1 = 0LL;
842 
843             /*
844              * Save 4 input samples in the history buffer
845              */
846             vst1q(pStateCur, vld1q(pTempSrc));
847             pStateCur += 4;
848             pTempSrc += 4;
849 
850             int       i = tapsBlkCnt;
851             while (i > 0)
852             {
853                 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
854 
855                 vecIn0 = vld1q(pSamplesTmp);
856                 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
857 
858                 vecIn0 = vld1q(&pSamplesTmp[1]);
859                 acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
860 
861                 pSamplesTmp += 4;
862                 pCoeffsTmp += 4;
863                 i--;
864             }
865 
866             acc0 = asrl(acc0, 23);
867             acc1 = asrl(acc1, 23);
868 
869             *pOutput++ = (q31_t) acc0;
870             *pOutput++ = (q31_t) acc1;
871         }
872         break;
873 
874     case 1:
875         {
876             const q31_t    *pCoeffsTmp = pCoeffs;
877             const q31_t    *pSamplesTmp = pSamples;
878 
879             acc0 = 0LL;
880 
881             /*
882              * Save 4 input samples in the history buffer
883              */
884             vst1q(pStateCur, vld1q(pTempSrc));
885             pStateCur += 4;
886             pTempSrc += 4;
887 
888             int       i = tapsBlkCnt;
889             while (i > 0)
890             {
891                 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
892 
893                 vecIn0 = vld1q(pSamplesTmp);
894                 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
895 
896                 pSamplesTmp += 4;
897                 pCoeffsTmp += 4;
898                 i--;
899             }
900 
901             acc0 = asrl(acc0, 23);
902 
903             *pOutput++ = (q31_t) acc0;
904         }
905         break;
906     }
907 
908     /*
909      * Copy the samples back into the history buffer start
910      */
911     pTempSrc = &pState[blockSize];
912     pTempDest = pState;
913 
914     blkCnt = (numTaps - 1U) >> 2;
915     while (blkCnt > 0)
916     {
917         vst1q(pTempDest, vld1q(pTempSrc));
918         pTempSrc += 4;
919         pTempDest += 4;
920         blkCnt--;
921     }
922     blkCnt = (numTaps - 1U) & 3;
923     if (blkCnt > 0)
924     {
925         mve_pred16_t p0 = vctp32q(blkCnt);
926         vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
927     }
928 }
929 
930 #else
arm_fir_q31(const arm_fir_instance_q31 * S,const q31_t * pSrc,q31_t * pDst,uint32_t blockSize)931 ARM_DSP_ATTRIBUTE void arm_fir_q31(
932   const arm_fir_instance_q31 * S,
933   const q31_t * pSrc,
934         q31_t * pDst,
935         uint32_t blockSize)
936 {
937         q31_t *pState = S->pState;                     /* State pointer */
938   const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
939         q31_t *pStateCurnt;                            /* Points to the current sample of the state */
940         q31_t *px;                                     /* Temporary pointer for state buffer */
941   const q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
942         q63_t acc0;                                    /* Accumulator */
943         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
944         uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
945 
946 #if defined (ARM_MATH_LOOPUNROLL)
947         q63_t acc1, acc2;                              /* Accumulators */
948         q31_t x0, x1, x2;                              /* Temporary variables to hold state values */
949         q31_t c0;                                      /* Temporary variable to hold coefficient value */
950 #endif
951 
952   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
953   /* pStateCurnt points to the location where the new input data should be written */
954   pStateCurnt = &(S->pState[(numTaps - 1U)]);
955 
956 #if defined (ARM_MATH_LOOPUNROLL)
957 
958   /* Loop unrolling: Compute 4 output values simultaneously.
959    * The variables acc0 ... acc3 hold output values that are being computed:
960    *
961    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
962    *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
963    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
964    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
965    */
966 
967   blkCnt = blockSize / 3;
968 
969   while (blkCnt > 0U)
970   {
971     /* Copy 3 new input samples into the state buffer. */
972     *pStateCurnt++ = *pSrc++;
973     *pStateCurnt++ = *pSrc++;
974     *pStateCurnt++ = *pSrc++;
975 
976     /* Set all accumulators to zero */
977     acc0 = 0;
978     acc1 = 0;
979     acc2 = 0;
980 
981     /* Initialize state pointer */
982     px = pState;
983 
984     /* Initialize coefficient pointer */
985     pb = pCoeffs;
986 
987     /* Read the first 2 samples from the state buffer: x[n-numTaps], x[n-numTaps-1] */
988     x0 = *px++;
989     x1 = *px++;
990 
991     /* Loop unrolling: process 3 taps at a time. */
992     tapCnt = numTaps / 3;
993 
994     while (tapCnt > 0U)
995     {
996       /* Read the b[numTaps] coefficient */
997       c0 = *pb;
998 
999       /* Read x[n-numTaps-2] sample */
1000       x2 = *(px++);
1001 
1002       /* Perform the multiply-accumulates */
1003       acc0 += ((q63_t) x0 * c0);
1004       acc1 += ((q63_t) x1 * c0);
1005       acc2 += ((q63_t) x2 * c0);
1006 
1007       /* Read the coefficient and state */
1008       c0 = *(pb + 1U);
1009       x0 = *(px++);
1010 
1011       /* Perform the multiply-accumulates */
1012       acc0 += ((q63_t) x1 * c0);
1013       acc1 += ((q63_t) x2 * c0);
1014       acc2 += ((q63_t) x0 * c0);
1015 
1016       /* Read the coefficient and state */
1017       c0 = *(pb + 2U);
1018       x1 = *(px++);
1019 
1020       /* update coefficient pointer */
1021       pb += 3U;
1022 
1023       /* Perform the multiply-accumulates */
1024       acc0 += ((q63_t) x2 * c0);
1025       acc1 += ((q63_t) x0 * c0);
1026       acc2 += ((q63_t) x1 * c0);
1027 
1028       /* Decrement loop counter */
1029       tapCnt--;
1030     }
1031 
1032     /* Loop unrolling: Compute remaining outputs */
1033     tapCnt = numTaps % 0x3U;
1034 
1035     while (tapCnt > 0U)
1036     {
1037       /* Read coefficients */
1038       c0 = *(pb++);
1039 
1040       /* Fetch 1 state variable */
1041       x2 = *(px++);
1042 
1043       /* Perform the multiply-accumulates */
1044       acc0 += ((q63_t) x0 * c0);
1045       acc1 += ((q63_t) x1 * c0);
1046       acc2 += ((q63_t) x2 * c0);
1047 
1048       /* Reuse the present sample states for next sample */
1049       x0 = x1;
1050       x1 = x2;
1051 
1052       /* Decrement loop counter */
1053       tapCnt--;
1054     }
1055 
1056     /* Advance the state pointer by 3 to process the next group of 3 samples */
1057     pState = pState + 3;
1058 
1059     /* The result is in 2.30 format. Convert to 1.31 and store in destination buffer. */
1060     *pDst++ = (q31_t) (acc0 >> 31U);
1061     *pDst++ = (q31_t) (acc1 >> 31U);
1062     *pDst++ = (q31_t) (acc2 >> 31U);
1063 
1064     /* Decrement loop counter */
1065     blkCnt--;
1066   }
1067 
1068   /* Loop unrolling: Compute remaining output samples */
1069   blkCnt = blockSize % 0x3U;
1070 
1071 #else
1072 
1073   /* Initialize blkCnt with number of taps */
1074   blkCnt = blockSize;
1075 
1076 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1077 
1078   while (blkCnt > 0U)
1079   {
1080     /* Copy one sample at a time into state buffer */
1081     *pStateCurnt++ = *pSrc++;
1082 
1083     /* Set the accumulator to zero */
1084     acc0 = 0;
1085 
1086     /* Initialize state pointer */
1087     px = pState;
1088 
1089     /* Initialize Coefficient pointer */
1090     pb = pCoeffs;
1091 
1092     i = numTaps;
1093 
1094     /* Perform the multiply-accumulates */
1095     do
1096     {
1097       /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
1098       acc0 += (q63_t) *px++ * *pb++;
1099 
1100       i--;
1101     } while (i > 0U);
1102 
1103     /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
1104     *pDst++ = (q31_t) (acc0 >> 31U);
1105 
1106     /* Advance state pointer by 1 for the next sample */
1107     pState = pState + 1U;
1108 
1109     /* Decrement loop counter */
1110     blkCnt--;
1111   }
1112 
1113   /* Processing is complete.
1114      Now copy the last numTaps - 1 samples to the start of the state buffer.
1115      This prepares the state buffer for the next function call. */
1116 
1117   /* Points to the start of the state buffer */
1118   pStateCurnt = S->pState;
1119 
1120 #if defined (ARM_MATH_LOOPUNROLL)
1121 
1122   /* Loop unrolling: Compute 4 taps at a time */
1123   tapCnt = (numTaps - 1U) >> 2U;
1124 
1125   /* Copy data */
1126   while (tapCnt > 0U)
1127   {
1128     *pStateCurnt++ = *pState++;
1129     *pStateCurnt++ = *pState++;
1130     *pStateCurnt++ = *pState++;
1131     *pStateCurnt++ = *pState++;
1132 
1133     /* Decrement loop counter */
1134     tapCnt--;
1135   }
1136 
1137   /* Calculate remaining number of copies */
1138   tapCnt = (numTaps - 1U) % 0x4U;
1139 
1140 #else
1141 
1142   /* Initialize tapCnt with number of taps */
1143   tapCnt = (numTaps - 1U);
1144 
1145 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
1146 
1147   /* Copy remaining data */
1148   while (tapCnt > 0U)
1149   {
1150     *pStateCurnt++ = *pState++;
1151 
1152     /* Decrement loop counter */
1153     tapCnt--;
1154   }
1155 
1156 }
1157 #endif /* defined(ARM_MATH_MVEI) */
1158 
1159 /**
1160   @} end of FIR group
1161  */
1162