1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_q31.c
4  * Description:  Convolution of Q31 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q31 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using an internal 64-bit accumulator.
50                    The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
51                    There is no saturation on intermediate additions.
52                    Thus, if the accumulator overflows it wraps around and distorts the result.
53                    The input signals should be scaled down to avoid intermediate overflows.
                   Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to base 2) to avoid overflows,
                   as at most min(srcALen, srcBLen) products are accumulated internally for each output sample.
                   The 2.62 accumulator is right shifted by 31 bits to yield the final 1.31 result; note that the scalar implementation casts the shifted value to q31_t without an explicit saturation step.
57 
58   @remark
59                    Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function.
60  */
61 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
62 #include "arm_helium_utils.h"
63 #include "arm_vec_filtering.h"
64 
arm_conv_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)65 ARM_DSP_ATTRIBUTE void arm_conv_q31(
66   const q31_t * pSrcA,
67         uint32_t srcALen,
68   const q31_t * pSrcB,
69         uint32_t srcBLen,
70         q31_t * pDst)
71 {
72     const q31_t    *pIn1 = pSrcA;     /* inputA pointer               */
73     const q31_t    *pIn2 = pSrcB;     /* inputB pointer               */
74     /*
75      * Loop to perform MAC operations according to correlation equation
76      */
77     const q31_t    *pX;
78     const q31_t    *pY;
79     const q31_t    *pA;
80     const q31_t    *pB;
81     int32_t   i = 0U, j = 0;    /* loop counters */
82     int32_t   block1, block2, block3;
83     uint32_t  vddupStartIdx = 3;
84     uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
85 
86 
87     if (srcALen < srcBLen)
88     {
89         /*
90          * Initialization to inputB pointer
91          */
92         pIn1 = pSrcB;
93         /*
94          * Initialization to the end of inputA pointer
95          */
96         pIn2 = pSrcA;
97         /*
98          * Swapping the lengths
99          */
100         j = srcALen;
101         srcALen = srcBLen;
102         srcBLen = j;
103     }
104 
105     block1 = srcBLen - 1;
106     block2 = srcALen - srcBLen + 1;
107     block3 = srcBLen - 1;
108 
109     pA = pIn1;
110     pB = pIn2 - 3;
111 
112     for (i = 0; i <= block1 - 2; i += 2)
113     {
114         uint32_t  count = i + 1;
115         int64_t   acc0 = 0LL;
116         int64_t   acc1 = 0LL;
117 
118         pX = pA;
119         pY = pB;
120         MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count);
121 
122         *pDst++ = (q31_t) acc0;
123         *pDst++ = (q31_t) acc1;
124         pB += 2;
125     }
126     for (; i < block1; i++)
127     {
128         uint32_t  count = i + 1;
129         int64_t   acc = 0LL;
130 
131         pX = pA;
132         pY = pB;
133         MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
134 
135         *pDst++ = (q31_t) acc;
136         pB++;
137     }
138 
139     for (i = 0; i <= block2 - 4; i += 4)
140     {
141         uint32_t  count = srcBLen;
142         int64_t   acc0 = 0LL;
143         int64_t   acc1 = 0LL;
144         int64_t   acc2 = 0LL;
145         int64_t   acc3 = 0LL;
146 
147         pX = pA;
148         pY = pB;
149         /*
150          * compute 4 accumulators per loop
151          * size is fixed for all accumulators
152          * X pointer is incrementing for successive accumulators
153          */
154         MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count);
155         *pDst++ = (q31_t) acc0;
156         *pDst++ = (q31_t) acc1;
157         *pDst++ = (q31_t) acc2;
158         *pDst++ = (q31_t) acc3;
159 
160         pA += 4;
161     }
162 
163     for (; i <= block2 - 2; i += 2)
164     {
165         uint32_t  count = srcBLen;
166         int64_t   acc0 = 0LL;
167         int64_t   acc1 = 0LL;
168 
169         pX = pA;
170         pY = pB;
171         /*
172          * compute 2 accumulators per loop
173          * size is fixed for all accumulators
174          * X pointer is incrementing for successive accumulators
175          */
176         MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count);
177         *pDst++ = (q31_t) acc0;
178         *pDst++ = (q31_t) acc1;
179 
180         pA += 2;
181     }
182     if (block2 & 1)
183     {
184         uint32_t  count = srcBLen;
185         int64_t   acc = 0LL;
186 
187         pX = pA;
188         pY = pB;
189 
190         MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
191         *pDst++ = (q31_t) acc;
192         pA++;
193     }
194 
195     for (i = block3; i >= 2; i -= 2)
196     {
197         uint32_t  count = i;
198         int64_t   acc0 = 0LL;
199         int64_t   acc1 = 0LL;
200 
201         pX = pA;
202         pY = pB;
203 
204         MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count);
205         *pDst++ = (q31_t) acc0;
206         *pDst++ = (q31_t) acc1;
207         pA += 2;
208     }
209 
210     for (; i >= 1; i--)
211     {
212         uint32_t  count = i;
213         int64_t   acc = 0LL;
214 
215         pX = pA;
216         pY = pB;
217 
218         MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
219         *pDst++ = (q31_t) acc;
220         pA++;
221     }
222 }
223 
224 #else
arm_conv_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)225 ARM_DSP_ATTRIBUTE void arm_conv_q31(
226   const q31_t * pSrcA,
227         uint32_t srcALen,
228   const q31_t * pSrcB,
229         uint32_t srcBLen,
230         q31_t * pDst)
231 {
232 
233 #if (1)
234 //#if !defined(ARM_MATH_CM0_FAMILY)
235 
236   const q31_t *pIn1;                                   /* InputA pointer */
237   const q31_t *pIn2;                                   /* InputB pointer */
238         q31_t *pOut = pDst;                            /* Output pointer */
239   const q31_t *px;                                     /* Intermediate inputA pointer */
240   const q31_t *py;                                     /* Intermediate inputB pointer */
241   const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
242         q63_t sum;                                     /* Accumulators */
243         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
244         uint32_t j, k, count, blkCnt;                  /* Loop counters */
245 
246 #if defined (ARM_MATH_LOOPUNROLL)
247         q63_t acc0, acc1, acc2;                        /* Accumulators */
248         q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
249 #endif
250 
251   /* The algorithm implementation is based on the lengths of the inputs. */
252   /* srcB is always made to slide across srcA. */
253   /* So srcBLen is always considered as shorter or equal to srcALen */
254   if (srcALen >= srcBLen)
255   {
256     /* Initialization of inputA pointer */
257     pIn1 = pSrcA;
258 
259     /* Initialization of inputB pointer */
260     pIn2 = pSrcB;
261   }
262   else
263   {
264     /* Initialization of inputA pointer */
265     pIn1 = pSrcB;
266 
267     /* Initialization of inputB pointer */
268     pIn2 = pSrcA;
269 
270     /* srcBLen is always considered as shorter or equal to srcALen */
271     j = srcBLen;
272     srcBLen = srcALen;
273     srcALen = j;
274   }
275 
276   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
277   /* The function is internally
278    * divided into three stages according to the number of multiplications that has to be
279    * taken place between inputA samples and inputB samples. In the first stage of the
280    * algorithm, the multiplications increase by one for every iteration.
281    * In the second stage of the algorithm, srcBLen number of multiplications are done.
282    * In the third stage of the algorithm, the multiplications decrease by one
283    * for every iteration. */
284 
285   /* The algorithm is implemented in three stages.
286      The loop counters of each stage is initiated here. */
287   blockSize1 = srcBLen - 1U;
288   blockSize2 = srcALen - (srcBLen - 1U);
289   blockSize3 = blockSize1;
290 
291   /* --------------------------
292    * Initializations of stage1
293    * -------------------------*/
294 
295   /* sum = x[0] * y[0]
296    * sum = x[0] * y[1] + x[1] * y[0]
297    * ....
298    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
299    */
300 
301   /* In this stage the MAC operations are increased by 1 for every iteration.
302      The count variable holds the number of MAC operations performed */
303   count = 1U;
304 
305   /* Working pointer of inputA */
306   px = pIn1;
307 
308   /* Working pointer of inputB */
309   py = pIn2;
310 
311 
312   /* ------------------------
313    * Stage1 process
314    * ----------------------*/
315 
316   /* The first stage starts here */
317   while (blockSize1 > 0U)
318   {
319     /* Accumulator is made zero for every iteration */
320     sum = 0;
321 
322 #if defined (ARM_MATH_LOOPUNROLL)
323 
324     /* Loop unrolling: Compute 4 outputs at a time */
325     k = count >> 2U;
326 
327     while (k > 0U)
328     {
329       /* x[0] * y[srcBLen - 1] */
330       sum += (q63_t) *px++ * (*py--);
331 
332       /* x[1] * y[srcBLen - 2] */
333       sum += (q63_t) *px++ * (*py--);
334 
335       /* x[2] * y[srcBLen - 3] */
336       sum += (q63_t) *px++ * (*py--);
337 
338       /* x[3] * y[srcBLen - 4] */
339       sum += (q63_t) *px++ * (*py--);
340 
341       /* Decrement loop counter */
342       k--;
343     }
344 
345     /* Loop unrolling: Compute remaining outputs */
346     k = count % 0x4U;
347 
348 #else
349 
350     /* Initialize k with number of samples */
351     k = count;
352 
353 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
354 
355     while (k > 0U)
356     {
357       /* Perform the multiply-accumulate */
358       sum += (q63_t) *px++ * *py--;
359 
360       /* Decrement loop counter */
361       k--;
362     }
363 
364     /* Store the result in the accumulator in the destination buffer. */
365     *pOut++ = (q31_t) (sum >> 31);
366 
367     /* Update the inputA and inputB pointers for next MAC calculation */
368     py = pIn2 + count;
369     px = pIn1;
370 
371     /* Increment MAC count */
372     count++;
373 
374     /* Decrement loop counter */
375     blockSize1--;
376   }
377 
378   /* --------------------------
379    * Initializations of stage2
380    * ------------------------*/
381 
382   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
383    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
384    * ....
385    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
386    */
387 
388   /* Working pointer of inputA */
389   px = pIn1;
390 
391   /* Working pointer of inputB */
392   pSrc2 = pIn2 + (srcBLen - 1U);
393   py = pSrc2;
394 
395   /* count is index by which the pointer pIn1 to be incremented */
396   count = 0U;
397 
398   /* -------------------
399    * Stage2 process
400    * ------------------*/
401 
402   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
403    * So, to loop unroll over blockSize2,
404    * srcBLen should be greater than or equal to 4 */
405   if (srcBLen >= 4U)
406   {
407 #if defined (ARM_MATH_LOOPUNROLL)
408 
409     /* Loop unroll by 3 */
410     blkCnt = blockSize2 / 3;
411 
412     while (blkCnt > 0U)
413     {
414       /* Set all accumulators to zero */
415       acc0 = 0;
416       acc1 = 0;
417       acc2 = 0;
418 
419       /* read x[0], x[1], x[2] samples */
420       x0 = *px++;
421       x1 = *px++;
422 
423       /* Apply loop unrolling and compute 3 MACs simultaneously. */
424       k = srcBLen / 3;
425 
426       /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.
427        ** a second loop below computes MACs for the remaining 1 to 2 samples. */
428       do
429       {
430         /* Read y[srcBLen - 1] sample */
431         c0 = *(py);
432         /* Read x[3] sample */
433         x2 = *(px);
434 
435         /* Perform the multiply-accumulate */
436         /* acc0 +=  x[0] * y[srcBLen - 1] */
437         acc0 += ((q63_t) x0 * c0);
438         /* acc1 +=  x[1] * y[srcBLen - 1] */
439         acc1 += ((q63_t) x1 * c0);
440         /* acc2 +=  x[2] * y[srcBLen - 1] */
441         acc2 += ((q63_t) x2 * c0);
442 
443         /* Read y[srcBLen - 2] sample */
444         c0 = *(py - 1U);
445         /* Read x[4] sample */
446         x0 = *(px + 1U);
447 
448         /* Perform the multiply-accumulate */
449         /* acc0 +=  x[1] * y[srcBLen - 2] */
450         acc0 += ((q63_t) x1 * c0);
451         /* acc1 +=  x[2] * y[srcBLen - 2] */
452         acc1 += ((q63_t) x2 * c0);
453         /* acc2 +=  x[3] * y[srcBLen - 2] */
454         acc2 += ((q63_t) x0 * c0);
455 
456         /* Read y[srcBLen - 3] sample */
457         c0 = *(py - 2U);
458         /* Read x[5] sample */
459         x1 = *(px + 2U);
460 
461         /* Perform the multiply-accumulate */
462         /* acc0 +=  x[2] * y[srcBLen - 3] */
463         acc0 += ((q63_t) x2 * c0);
464         /* acc1 +=  x[3] * y[srcBLen - 2] */
465         acc1 += ((q63_t) x0 * c0);
466         /* acc2 +=  x[4] * y[srcBLen - 2] */
467         acc2 += ((q63_t) x1 * c0);
468 
469         /* update scratch pointers */
470         px += 3U;
471         py -= 3U;
472 
473       } while (--k);
474 
475       /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
476        ** No loop unrolling is used. */
477       k = srcBLen - (3 * (srcBLen / 3));
478 
479       while (k > 0U)
480       {
481         /* Read y[srcBLen - 5] sample */
482         c0 = *py--;
483         /* Read x[7] sample */
484         x2 = *px++;
485 
486         /* Perform the multiply-accumulates */
487         /* acc0 +=  x[4] * y[srcBLen - 5] */
488         acc0 += ((q63_t) x0 * c0);
489         /* acc1 +=  x[5] * y[srcBLen - 5] */
490         acc1 += ((q63_t) x1 * c0);
491         /* acc2 +=  x[6] * y[srcBLen - 5] */
492         acc2 += ((q63_t) x2 * c0);
493 
494         /* Reuse the present samples for the next MAC */
495         x0 = x1;
496         x1 = x2;
497 
498         /* Decrement loop counter */
499         k--;
500       }
501 
502       /* Store the result in the accumulator in the destination buffer. */
503       *pOut++ = (q31_t) (acc0 >> 31);
504       *pOut++ = (q31_t) (acc1 >> 31);
505       *pOut++ = (q31_t) (acc2 >> 31);
506 
507       /* Increment the pointer pIn1 index, count by 3 */
508       count += 3U;
509 
510       /* Update the inputA and inputB pointers for next MAC calculation */
511       px = pIn1 + count;
512       py = pSrc2;
513 
514       /* Decrement loop counter */
515       blkCnt--;
516     }
517 
518     /* Loop unrolling: Compute remaining outputs */
519     blkCnt = blockSize2 - 3 * (blockSize2 / 3);
520 
521 #else
522 
523     /* Initialize blkCnt with number of samples */
524     blkCnt = blockSize2;
525 
526 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
527 
528     while (blkCnt > 0U)
529     {
530       /* Accumulator is made zero for every iteration */
531       sum = 0;
532 
533 #if defined (ARM_MATH_LOOPUNROLL)
534 
535     /* Loop unrolling: Compute 4 outputs at a time */
536       k = srcBLen >> 2U;
537 
538       while (k > 0U)
539       {
540         /* Perform the multiply-accumulates */
541         sum += (q63_t) *px++ * *py--;
542         sum += (q63_t) *px++ * *py--;
543         sum += (q63_t) *px++ * *py--;
544         sum += (q63_t) *px++ * *py--;
545 
546         /* Decrement loop counter */
547         k--;
548       }
549 
550       /* Loop unrolling: Compute remaining outputs */
551       k = srcBLen % 0x4U;
552 
553 #else
554 
555       /* Initialize blkCnt with number of samples */
556       k = srcBLen;
557 
558 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
559 
560       while (k > 0U)
561       {
562         /* Perform the multiply-accumulate */
563         sum += (q63_t) *px++ * *py--;
564 
565         /* Decrement the loop counter */
566         k--;
567       }
568 
569       /* Store the result in the accumulator in the destination buffer. */
570       *pOut++ = (q31_t) (sum >> 31);
571 
572       /* Increment MAC count */
573       count++;
574 
575       /* Update the inputA and inputB pointers for next MAC calculation */
576       px = pIn1 + count;
577       py = pSrc2;
578 
579       /* Decrement loop counter */
580       blkCnt--;
581     }
582   }
583   else
584   {
585     /* If the srcBLen is not a multiple of 4,
586      * the blockSize2 loop cannot be unrolled by 4 */
587     blkCnt = blockSize2;
588 
589     while (blkCnt > 0U)
590     {
591       /* Accumulator is made zero for every iteration */
592       sum = 0;
593 
594       /* srcBLen number of MACS should be performed */
595       k = srcBLen;
596 
597       while (k > 0U)
598       {
599         /* Perform the multiply-accumulate */
600         sum += (q63_t) *px++ * *py--;
601 
602         /* Decrement the loop counter */
603         k--;
604       }
605 
606       /* Store the result in the accumulator in the destination buffer. */
607       *pOut++ = (q31_t) (sum >> 31);
608 
609       /* Increment MAC count */
610       count++;
611 
612       /* Update the inputA and inputB pointers for next MAC calculation */
613       px = pIn1 + count;
614       py = pSrc2;
615 
616       /* Decrement loop counter */
617       blkCnt--;
618     }
619   }
620 
621 
622   /* --------------------------
623    * Initializations of stage3
624    * -------------------------*/
625 
626   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
627    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
628    * ....
629    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
630    * sum +=  x[srcALen-1] * y[srcBLen-1]
631    */
632 
633   /* In this stage the MAC operations are decreased by 1 for every iteration.
634      The blockSize3 variable holds the number of MAC operations performed */
635 
636   /* Working pointer of inputA */
637   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
638   px = pSrc1;
639 
640   /* Working pointer of inputB */
641   pSrc2 = pIn2 + (srcBLen - 1U);
642   py = pSrc2;
643 
644   /* -------------------
645    * Stage3 process
646    * ------------------*/
647 
648   while (blockSize3 > 0U)
649   {
650     /* Accumulator is made zero for every iteration */
651     sum = 0;
652 
653 #if defined (ARM_MATH_LOOPUNROLL)
654 
655     /* Loop unrolling: Compute 4 outputs at a time */
656     k = blockSize3 >> 2U;
657 
658     while (k > 0U)
659     {
660       /* Perform the multiply-accumulate */
661       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
662       sum += (q63_t) *px++ * *py--;
663 
664       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
665       sum += (q63_t) *px++ * *py--;
666 
667       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
668       sum += (q63_t) *px++ * *py--;
669 
670       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
671       sum += (q63_t) *px++ * *py--;
672 
673       /* Decrement loop counter */
674       k--;
675     }
676 
677     /* Loop unrolling: Compute remaining outputs */
678     k = blockSize3 % 0x4U;
679 
680 #else
681 
682     /* Initialize blkCnt with number of samples */
683     k = blockSize3;
684 
685 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
686 
687     while (k > 0U)
688     {
689       /* Perform the multiply-accumulate */
690       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
691       sum += (q63_t) *px++ * *py--;
692 
693       /* Decrement loop counter */
694       k--;
695     }
696 
697     /* Store the result in the accumulator in the destination buffer. */
698     *pOut++ = (q31_t) (sum >> 31);
699 
700     /* Update the inputA and inputB pointers for next MAC calculation */
701     px = ++pSrc1;
702     py = pSrc2;
703 
704     /* Decrement loop counter */
705     blockSize3--;
706   }
707 
708 #else
709 /* alternate version for CM0_FAMILY */
710 
711   const q31_t *pIn1 = pSrcA;                           /* InputA pointer */
712   const q31_t *pIn2 = pSrcB;                           /* InputB pointer */
713         q63_t sum;                                     /* Accumulators */
714         uint32_t i, j;                                 /* Loop counters */
715 
716   /* Loop to calculate convolution for output length number of times */
717   for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
718   {
719     /* Initialize sum with zero to carry out MAC operations */
720     sum = 0;
721 
722     /* Loop to perform MAC operations according to convolution equation */
723     for (j = 0U; j <= i; j++)
724     {
725       /* Check the array limitations */
726       if (((i - j) < srcBLen) && (j < srcALen))
727       {
728         /* z[i] += x[i-j] * y[j] */
729         sum += ((q63_t) pIn1[j] * pIn2[i - j]);
730       }
731     }
732 
733     /* Store the output in the destination buffer */
734     pDst[i] = (q31_t) (sum >> 31U);
735   }
736 
737 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
738 
739 }
740 #endif /* defined(ARM_MATH_MVEI) */
741 
742 /**
743   @} end of Conv group
744  */
745