1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_q31.c
4 * Description: Convolution of Q31 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q31 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using an internal 64-bit accumulator.
50 The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
51 There is no saturation on intermediate additions.
52 Thus, if the accumulator overflows it wraps around and distorts the result.
53 The input signals should be scaled down to avoid intermediate overflows.
54 Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
55 as maximum of min(srcALen, srcBLen) number of additions are carried internally.
56 The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
57
58 @remark
59 Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function.
60 */
61 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
62 #include "arm_helium_utils.h"
63 #include "arm_vec_filtering.h"
64
arm_conv_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)65 ARM_DSP_ATTRIBUTE void arm_conv_q31(
66 const q31_t * pSrcA,
67 uint32_t srcALen,
68 const q31_t * pSrcB,
69 uint32_t srcBLen,
70 q31_t * pDst)
71 {
72 const q31_t *pIn1 = pSrcA; /* inputA pointer */
73 const q31_t *pIn2 = pSrcB; /* inputB pointer */
74 /*
75 * Loop to perform MAC operations according to correlation equation
76 */
77 const q31_t *pX;
78 const q31_t *pY;
79 const q31_t *pA;
80 const q31_t *pB;
81 int32_t i = 0U, j = 0; /* loop counters */
82 int32_t block1, block2, block3;
83 uint32_t vddupStartIdx = 3;
84 uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
85
86
87 if (srcALen < srcBLen)
88 {
89 /*
90 * Initialization to inputB pointer
91 */
92 pIn1 = pSrcB;
93 /*
94 * Initialization to the end of inputA pointer
95 */
96 pIn2 = pSrcA;
97 /*
98 * Swapping the lengths
99 */
100 j = srcALen;
101 srcALen = srcBLen;
102 srcBLen = j;
103 }
104
105 block1 = srcBLen - 1;
106 block2 = srcALen - srcBLen + 1;
107 block3 = srcBLen - 1;
108
109 pA = pIn1;
110 pB = pIn2 - 3;
111
112 for (i = 0; i <= block1 - 2; i += 2)
113 {
114 uint32_t count = i + 1;
115 int64_t acc0 = 0LL;
116 int64_t acc1 = 0LL;
117
118 pX = pA;
119 pY = pB;
120 MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count);
121
122 *pDst++ = (q31_t) acc0;
123 *pDst++ = (q31_t) acc1;
124 pB += 2;
125 }
126 for (; i < block1; i++)
127 {
128 uint32_t count = i + 1;
129 int64_t acc = 0LL;
130
131 pX = pA;
132 pY = pB;
133 MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
134
135 *pDst++ = (q31_t) acc;
136 pB++;
137 }
138
139 for (i = 0; i <= block2 - 4; i += 4)
140 {
141 uint32_t count = srcBLen;
142 int64_t acc0 = 0LL;
143 int64_t acc1 = 0LL;
144 int64_t acc2 = 0LL;
145 int64_t acc3 = 0LL;
146
147 pX = pA;
148 pY = pB;
149 /*
150 * compute 4 accumulators per loop
151 * size is fixed for all accumulators
152 * X pointer is incrementing for successive accumulators
153 */
154 MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count);
155 *pDst++ = (q31_t) acc0;
156 *pDst++ = (q31_t) acc1;
157 *pDst++ = (q31_t) acc2;
158 *pDst++ = (q31_t) acc3;
159
160 pA += 4;
161 }
162
163 for (; i <= block2 - 2; i += 2)
164 {
165 uint32_t count = srcBLen;
166 int64_t acc0 = 0LL;
167 int64_t acc1 = 0LL;
168
169 pX = pA;
170 pY = pB;
171 /*
172 * compute 2 accumulators per loop
173 * size is fixed for all accumulators
174 * X pointer is incrementing for successive accumulators
175 */
176 MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count);
177 *pDst++ = (q31_t) acc0;
178 *pDst++ = (q31_t) acc1;
179
180 pA += 2;
181 }
182 if (block2 & 1)
183 {
184 uint32_t count = srcBLen;
185 int64_t acc = 0LL;
186
187 pX = pA;
188 pY = pB;
189
190 MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
191 *pDst++ = (q31_t) acc;
192 pA++;
193 }
194
195 for (i = block3; i >= 2; i -= 2)
196 {
197 uint32_t count = i;
198 int64_t acc0 = 0LL;
199 int64_t acc1 = 0LL;
200
201 pX = pA;
202 pY = pB;
203
204 MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count);
205 *pDst++ = (q31_t) acc0;
206 *pDst++ = (q31_t) acc1;
207 pA += 2;
208 }
209
210 for (; i >= 1; i--)
211 {
212 uint32_t count = i;
213 int64_t acc = 0LL;
214
215 pX = pA;
216 pY = pB;
217
218 MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
219 *pDst++ = (q31_t) acc;
220 pA++;
221 }
222 }
223
224 #else
arm_conv_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)225 ARM_DSP_ATTRIBUTE void arm_conv_q31(
226 const q31_t * pSrcA,
227 uint32_t srcALen,
228 const q31_t * pSrcB,
229 uint32_t srcBLen,
230 q31_t * pDst)
231 {
232
233 #if (1)
234 //#if !defined(ARM_MATH_CM0_FAMILY)
235
236 const q31_t *pIn1; /* InputA pointer */
237 const q31_t *pIn2; /* InputB pointer */
238 q31_t *pOut = pDst; /* Output pointer */
239 const q31_t *px; /* Intermediate inputA pointer */
240 const q31_t *py; /* Intermediate inputB pointer */
241 const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
242 q63_t sum; /* Accumulators */
243 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
244 uint32_t j, k, count, blkCnt; /* Loop counters */
245
246 #if defined (ARM_MATH_LOOPUNROLL)
247 q63_t acc0, acc1, acc2; /* Accumulators */
248 q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
249 #endif
250
251 /* The algorithm implementation is based on the lengths of the inputs. */
252 /* srcB is always made to slide across srcA. */
253 /* So srcBLen is always considered as shorter or equal to srcALen */
254 if (srcALen >= srcBLen)
255 {
256 /* Initialization of inputA pointer */
257 pIn1 = pSrcA;
258
259 /* Initialization of inputB pointer */
260 pIn2 = pSrcB;
261 }
262 else
263 {
264 /* Initialization of inputA pointer */
265 pIn1 = pSrcB;
266
267 /* Initialization of inputB pointer */
268 pIn2 = pSrcA;
269
270 /* srcBLen is always considered as shorter or equal to srcALen */
271 j = srcBLen;
272 srcBLen = srcALen;
273 srcALen = j;
274 }
275
276 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
277 /* The function is internally
278 * divided into three stages according to the number of multiplications that has to be
279 * taken place between inputA samples and inputB samples. In the first stage of the
280 * algorithm, the multiplications increase by one for every iteration.
281 * In the second stage of the algorithm, srcBLen number of multiplications are done.
282 * In the third stage of the algorithm, the multiplications decrease by one
283 * for every iteration. */
284
285 /* The algorithm is implemented in three stages.
286 The loop counters of each stage is initiated here. */
287 blockSize1 = srcBLen - 1U;
288 blockSize2 = srcALen - (srcBLen - 1U);
289 blockSize3 = blockSize1;
290
291 /* --------------------------
292 * Initializations of stage1
293 * -------------------------*/
294
295 /* sum = x[0] * y[0]
296 * sum = x[0] * y[1] + x[1] * y[0]
297 * ....
298 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
299 */
300
301 /* In this stage the MAC operations are increased by 1 for every iteration.
302 The count variable holds the number of MAC operations performed */
303 count = 1U;
304
305 /* Working pointer of inputA */
306 px = pIn1;
307
308 /* Working pointer of inputB */
309 py = pIn2;
310
311
312 /* ------------------------
313 * Stage1 process
314 * ----------------------*/
315
316 /* The first stage starts here */
317 while (blockSize1 > 0U)
318 {
319 /* Accumulator is made zero for every iteration */
320 sum = 0;
321
322 #if defined (ARM_MATH_LOOPUNROLL)
323
324 /* Loop unrolling: Compute 4 outputs at a time */
325 k = count >> 2U;
326
327 while (k > 0U)
328 {
329 /* x[0] * y[srcBLen - 1] */
330 sum += (q63_t) *px++ * (*py--);
331
332 /* x[1] * y[srcBLen - 2] */
333 sum += (q63_t) *px++ * (*py--);
334
335 /* x[2] * y[srcBLen - 3] */
336 sum += (q63_t) *px++ * (*py--);
337
338 /* x[3] * y[srcBLen - 4] */
339 sum += (q63_t) *px++ * (*py--);
340
341 /* Decrement loop counter */
342 k--;
343 }
344
345 /* Loop unrolling: Compute remaining outputs */
346 k = count % 0x4U;
347
348 #else
349
350 /* Initialize k with number of samples */
351 k = count;
352
353 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
354
355 while (k > 0U)
356 {
357 /* Perform the multiply-accumulate */
358 sum += (q63_t) *px++ * *py--;
359
360 /* Decrement loop counter */
361 k--;
362 }
363
364 /* Store the result in the accumulator in the destination buffer. */
365 *pOut++ = (q31_t) (sum >> 31);
366
367 /* Update the inputA and inputB pointers for next MAC calculation */
368 py = pIn2 + count;
369 px = pIn1;
370
371 /* Increment MAC count */
372 count++;
373
374 /* Decrement loop counter */
375 blockSize1--;
376 }
377
378 /* --------------------------
379 * Initializations of stage2
380 * ------------------------*/
381
382 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
383 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
384 * ....
385 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
386 */
387
388 /* Working pointer of inputA */
389 px = pIn1;
390
391 /* Working pointer of inputB */
392 pSrc2 = pIn2 + (srcBLen - 1U);
393 py = pSrc2;
394
395 /* count is index by which the pointer pIn1 to be incremented */
396 count = 0U;
397
398 /* -------------------
399 * Stage2 process
400 * ------------------*/
401
402 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
403 * So, to loop unroll over blockSize2,
404 * srcBLen should be greater than or equal to 4 */
405 if (srcBLen >= 4U)
406 {
407 #if defined (ARM_MATH_LOOPUNROLL)
408
409 /* Loop unroll by 3 */
410 blkCnt = blockSize2 / 3;
411
412 while (blkCnt > 0U)
413 {
414 /* Set all accumulators to zero */
415 acc0 = 0;
416 acc1 = 0;
417 acc2 = 0;
418
419 /* read x[0], x[1], x[2] samples */
420 x0 = *px++;
421 x1 = *px++;
422
423 /* Apply loop unrolling and compute 3 MACs simultaneously. */
424 k = srcBLen / 3;
425
426 /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
427 ** a second loop below computes MACs for the remaining 1 to 2 samples. */
428 do
429 {
430 /* Read y[srcBLen - 1] sample */
431 c0 = *(py);
432 /* Read x[3] sample */
433 x2 = *(px);
434
435 /* Perform the multiply-accumulate */
436 /* acc0 += x[0] * y[srcBLen - 1] */
437 acc0 += ((q63_t) x0 * c0);
438 /* acc1 += x[1] * y[srcBLen - 1] */
439 acc1 += ((q63_t) x1 * c0);
440 /* acc2 += x[2] * y[srcBLen - 1] */
441 acc2 += ((q63_t) x2 * c0);
442
443 /* Read y[srcBLen - 2] sample */
444 c0 = *(py - 1U);
445 /* Read x[4] sample */
446 x0 = *(px + 1U);
447
448 /* Perform the multiply-accumulate */
449 /* acc0 += x[1] * y[srcBLen - 2] */
450 acc0 += ((q63_t) x1 * c0);
451 /* acc1 += x[2] * y[srcBLen - 2] */
452 acc1 += ((q63_t) x2 * c0);
453 /* acc2 += x[3] * y[srcBLen - 2] */
454 acc2 += ((q63_t) x0 * c0);
455
456 /* Read y[srcBLen - 3] sample */
457 c0 = *(py - 2U);
458 /* Read x[5] sample */
459 x1 = *(px + 2U);
460
461 /* Perform the multiply-accumulate */
462 /* acc0 += x[2] * y[srcBLen - 3] */
463 acc0 += ((q63_t) x2 * c0);
464 /* acc1 += x[3] * y[srcBLen - 2] */
465 acc1 += ((q63_t) x0 * c0);
466 /* acc2 += x[4] * y[srcBLen - 2] */
467 acc2 += ((q63_t) x1 * c0);
468
469 /* update scratch pointers */
470 px += 3U;
471 py -= 3U;
472
473 } while (--k);
474
475 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
476 ** No loop unrolling is used. */
477 k = srcBLen - (3 * (srcBLen / 3));
478
479 while (k > 0U)
480 {
481 /* Read y[srcBLen - 5] sample */
482 c0 = *py--;
483 /* Read x[7] sample */
484 x2 = *px++;
485
486 /* Perform the multiply-accumulates */
487 /* acc0 += x[4] * y[srcBLen - 5] */
488 acc0 += ((q63_t) x0 * c0);
489 /* acc1 += x[5] * y[srcBLen - 5] */
490 acc1 += ((q63_t) x1 * c0);
491 /* acc2 += x[6] * y[srcBLen - 5] */
492 acc2 += ((q63_t) x2 * c0);
493
494 /* Reuse the present samples for the next MAC */
495 x0 = x1;
496 x1 = x2;
497
498 /* Decrement loop counter */
499 k--;
500 }
501
502 /* Store the result in the accumulator in the destination buffer. */
503 *pOut++ = (q31_t) (acc0 >> 31);
504 *pOut++ = (q31_t) (acc1 >> 31);
505 *pOut++ = (q31_t) (acc2 >> 31);
506
507 /* Increment the pointer pIn1 index, count by 3 */
508 count += 3U;
509
510 /* Update the inputA and inputB pointers for next MAC calculation */
511 px = pIn1 + count;
512 py = pSrc2;
513
514 /* Decrement loop counter */
515 blkCnt--;
516 }
517
518 /* Loop unrolling: Compute remaining outputs */
519 blkCnt = blockSize2 - 3 * (blockSize2 / 3);
520
521 #else
522
523 /* Initialize blkCnt with number of samples */
524 blkCnt = blockSize2;
525
526 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
527
528 while (blkCnt > 0U)
529 {
530 /* Accumulator is made zero for every iteration */
531 sum = 0;
532
533 #if defined (ARM_MATH_LOOPUNROLL)
534
535 /* Loop unrolling: Compute 4 outputs at a time */
536 k = srcBLen >> 2U;
537
538 while (k > 0U)
539 {
540 /* Perform the multiply-accumulates */
541 sum += (q63_t) *px++ * *py--;
542 sum += (q63_t) *px++ * *py--;
543 sum += (q63_t) *px++ * *py--;
544 sum += (q63_t) *px++ * *py--;
545
546 /* Decrement loop counter */
547 k--;
548 }
549
550 /* Loop unrolling: Compute remaining outputs */
551 k = srcBLen % 0x4U;
552
553 #else
554
555 /* Initialize blkCnt with number of samples */
556 k = srcBLen;
557
558 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
559
560 while (k > 0U)
561 {
562 /* Perform the multiply-accumulate */
563 sum += (q63_t) *px++ * *py--;
564
565 /* Decrement the loop counter */
566 k--;
567 }
568
569 /* Store the result in the accumulator in the destination buffer. */
570 *pOut++ = (q31_t) (sum >> 31);
571
572 /* Increment MAC count */
573 count++;
574
575 /* Update the inputA and inputB pointers for next MAC calculation */
576 px = pIn1 + count;
577 py = pSrc2;
578
579 /* Decrement loop counter */
580 blkCnt--;
581 }
582 }
583 else
584 {
585 /* If the srcBLen is not a multiple of 4,
586 * the blockSize2 loop cannot be unrolled by 4 */
587 blkCnt = blockSize2;
588
589 while (blkCnt > 0U)
590 {
591 /* Accumulator is made zero for every iteration */
592 sum = 0;
593
594 /* srcBLen number of MACS should be performed */
595 k = srcBLen;
596
597 while (k > 0U)
598 {
599 /* Perform the multiply-accumulate */
600 sum += (q63_t) *px++ * *py--;
601
602 /* Decrement the loop counter */
603 k--;
604 }
605
606 /* Store the result in the accumulator in the destination buffer. */
607 *pOut++ = (q31_t) (sum >> 31);
608
609 /* Increment MAC count */
610 count++;
611
612 /* Update the inputA and inputB pointers for next MAC calculation */
613 px = pIn1 + count;
614 py = pSrc2;
615
616 /* Decrement loop counter */
617 blkCnt--;
618 }
619 }
620
621
622 /* --------------------------
623 * Initializations of stage3
624 * -------------------------*/
625
626 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
627 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
628 * ....
629 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
630 * sum += x[srcALen-1] * y[srcBLen-1]
631 */
632
633 /* In this stage the MAC operations are decreased by 1 for every iteration.
634 The blockSize3 variable holds the number of MAC operations performed */
635
636 /* Working pointer of inputA */
637 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
638 px = pSrc1;
639
640 /* Working pointer of inputB */
641 pSrc2 = pIn2 + (srcBLen - 1U);
642 py = pSrc2;
643
644 /* -------------------
645 * Stage3 process
646 * ------------------*/
647
648 while (blockSize3 > 0U)
649 {
650 /* Accumulator is made zero for every iteration */
651 sum = 0;
652
653 #if defined (ARM_MATH_LOOPUNROLL)
654
655 /* Loop unrolling: Compute 4 outputs at a time */
656 k = blockSize3 >> 2U;
657
658 while (k > 0U)
659 {
660 /* Perform the multiply-accumulate */
661 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
662 sum += (q63_t) *px++ * *py--;
663
664 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
665 sum += (q63_t) *px++ * *py--;
666
667 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
668 sum += (q63_t) *px++ * *py--;
669
670 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
671 sum += (q63_t) *px++ * *py--;
672
673 /* Decrement loop counter */
674 k--;
675 }
676
677 /* Loop unrolling: Compute remaining outputs */
678 k = blockSize3 % 0x4U;
679
680 #else
681
682 /* Initialize blkCnt with number of samples */
683 k = blockSize3;
684
685 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
686
687 while (k > 0U)
688 {
689 /* Perform the multiply-accumulate */
690 /* sum += x[srcALen-1] * y[srcBLen-1] */
691 sum += (q63_t) *px++ * *py--;
692
693 /* Decrement loop counter */
694 k--;
695 }
696
697 /* Store the result in the accumulator in the destination buffer. */
698 *pOut++ = (q31_t) (sum >> 31);
699
700 /* Update the inputA and inputB pointers for next MAC calculation */
701 px = ++pSrc1;
702 py = pSrc2;
703
704 /* Decrement loop counter */
705 blockSize3--;
706 }
707
708 #else
709 /* alternate version for CM0_FAMILY */
710
711 const q31_t *pIn1 = pSrcA; /* InputA pointer */
712 const q31_t *pIn2 = pSrcB; /* InputB pointer */
713 q63_t sum; /* Accumulators */
714 uint32_t i, j; /* Loop counters */
715
716 /* Loop to calculate convolution for output length number of times */
717 for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
718 {
719 /* Initialize sum with zero to carry out MAC operations */
720 sum = 0;
721
722 /* Loop to perform MAC operations according to convolution equation */
723 for (j = 0U; j <= i; j++)
724 {
725 /* Check the array limitations */
726 if (((i - j) < srcBLen) && (j < srcALen))
727 {
728 /* z[i] += x[i-j] * y[j] */
729 sum += ((q63_t) pIn1[j] * pIn2[i - j]);
730 }
731 }
732
733 /* Store the output in the destination buffer */
734 pDst[i] = (q31_t) (sum >> 31U);
735 }
736
737 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
738
739 }
740 #endif /* defined(ARM_MATH_MVEI) */
741
742 /**
743 @} end of Conv group
744 */
745