1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_fast_q15.c
4  * Description:  Fast Q15 Convolution
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q15 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1
47 
48   @par           Scaling and Overflow Behavior
49                    This fast version uses a 32-bit accumulator with 2.30 format.
50                    The accumulator maintains full precision of the intermediate multiplication results
51                    but provides only a single guard bit. There is no saturation on intermediate additions.
52                    Thus, if the accumulator overflows it wraps around and distorts the result.
53                    The input signals should be scaled down to avoid intermediate overflows.
54                    Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
55                    as maximum of min(srcALen, srcBLen) number of additions are carried internally.
56                    The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
57 
58   @remark
59                    Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
60  */
61 
arm_conv_fast_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)62 ARM_DSP_ATTRIBUTE void arm_conv_fast_q15(
63   const q15_t * pSrcA,
64         uint32_t srcALen,
65   const q15_t * pSrcB,
66         uint32_t srcBLen,
67         q15_t * pDst)
68 {
69   const q15_t *pIn1;                                   /* InputA pointer */
70   const q15_t *pIn2;                                   /* InputB pointer */
71         q15_t *pOut = pDst;                            /* Output pointer */
72         q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
73   const q15_t *px;                                     /* Intermediate inputA pointer */
74   const q15_t *py;                                     /* Intermediate inputB pointer */
75   const q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
76         q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
77         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
78         uint32_t j, k, count, blkCnt;                  /* Loop counters */
79 
80   /* The algorithm implementation is based on the lengths of the inputs. */
81   /* srcB is always made to slide across srcA. */
82   /* So srcBLen is always considered as shorter or equal to srcALen */
83   if (srcALen >= srcBLen)
84   {
85     /* Initialization of inputA pointer */
86     pIn1 = pSrcA;
87 
88     /* Initialization of inputB pointer */
89     pIn2 = pSrcB;
90   }
91   else
92   {
93     /* Initialization of inputA pointer */
94     pIn1 = pSrcB;
95 
96     /* Initialization of inputB pointer */
97     pIn2 = pSrcA;
98 
99     /* srcBLen is always considered as shorter or equal to srcALen */
100     j = srcBLen;
101     srcBLen = srcALen;
102     srcALen = j;
103   }
104 
105   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
106   /* The function is internally
107    * divided into three stages according to the number of multiplications that has to be
108    * taken place between inputA samples and inputB samples. In the first stage of the
109    * algorithm, the multiplications increase by one for every iteration.
110    * In the second stage of the algorithm, srcBLen number of multiplications are done.
111    * In the third stage of the algorithm, the multiplications decrease by one
112    * for every iteration. */
113 
114   /* The algorithm is implemented in three stages.
115      The loop counters of each stage is initiated here. */
116   blockSize1 = srcBLen - 1U;
117   blockSize2 = srcALen - (srcBLen - 1U);
118   blockSize3 = blockSize1;
119 
120   /* --------------------------
121    * Initializations of stage1
122    * -------------------------*/
123 
124   /* sum = x[0] * y[0]
125    * sum = x[0] * y[1] + x[1] * y[0]
126    * ....
127    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
128    */
129 
130   /* In this stage the MAC operations are increased by 1 for every iteration.
131      The count variable holds the number of MAC operations performed */
132   count = 1U;
133 
134   /* Working pointer of inputA */
135   px = pIn1;
136 
137   /* Working pointer of inputB */
138   py = pIn2;
139 
140 
141   /* ------------------------
142    * Stage1 process
143    * ----------------------*/
144 
145   /* For loop unrolling by 4, this stage is divided into two. */
146   /* First part of this stage computes the MAC operations less than 4 */
147   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
148 
149   /* The first part of the stage starts here */
150   while ((count < 4U) && (blockSize1 > 0U))
151   {
152     /* Accumulator is made zero for every iteration */
153     sum = 0;
154 
155     /* Loop over number of MAC operations between
156      * inputA samples and inputB samples */
157     k = count;
158 
159     while (k > 0U)
160     {
161       /* Perform the multiply-accumulates */
162       sum = __SMLAD(*px++, *py--, sum);
163 
164       /* Decrement the loop counter */
165       k--;
166     }
167 
168     /* Store the result in the accumulator in the destination buffer. */
169     *pOut++ = (q15_t) (sum >> 15);
170 
171     /* Update the inputA and inputB pointers for next MAC calculation */
172     py = pIn2 + count;
173     px = pIn1;
174 
175     /* Increment MAC count */
176     count++;
177 
178     /* Decrement loop counter */
179     blockSize1--;
180   }
181 
182   /* The second part of the stage starts here */
183   /* The internal loop, over count, is unrolled by 4 */
184   /* To, read the last two inputB samples using SIMD:
185    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
186   py = py - 1;
187 
188   while (blockSize1 > 0U)
189   {
190     /* Accumulator is made zero for every iteration */
191     sum = 0;
192 
193     /* Apply loop unrolling and compute 4 MACs simultaneously. */
194     k = count >> 2U;
195 
196     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
197      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
198     while (k > 0U)
199     {
200       /* Perform the multiply-accumulates */
201       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
202       sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
203       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
204       sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
205 
206       /* Decrement loop counter */
207       k--;
208     }
209 
210     /* For the next MAC operations, the pointer py is used without SIMD
211      * So, py is incremented by 1 */
212     py = py + 1U;
213 
214     /* If the count is not a multiple of 4, compute any remaining MACs here.
215      ** No loop unrolling is used. */
216     k = count % 0x4U;
217 
218     while (k > 0U)
219     {
220       /* Perform the multiply-accumulates */
221       sum = __SMLAD(*px++, *py--, sum);
222 
223       /* Decrement the loop counter */
224       k--;
225     }
226 
227     /* Store the result in the accumulator in the destination buffer. */
228     *pOut++ = (q15_t) (sum >> 15);
229 
230     /* Update the inputA and inputB pointers for next MAC calculation */
231     py = pIn2 + (count - 1U);
232     px = pIn1;
233 
234     /* Increment MAC count */
235     count++;
236 
237     /* Decrement loop counter */
238     blockSize1--;
239   }
240 
241   /* --------------------------
242    * Initializations of stage2
243    * ------------------------*/
244 
245   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
246    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
247    * ....
248    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
249    */
250 
251   /* Working pointer of inputA */
252   px = pIn1;
253 
254   /* Working pointer of inputB */
255   pSrc2 = pIn2 + (srcBLen - 1U);
256   py = pSrc2;
257 
258   /* count is the index by which the pointer pIn1 to be incremented */
259   count = 0U;
260 
261   /* --------------------
262    * Stage2 process
263    * -------------------*/
264 
265   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
266    * So, to loop unroll over blockSize2,
267    * srcBLen should be greater than or equal to 4 */
268   if (srcBLen >= 4U)
269   {
270     /* Loop unroll over blockSize2, by 4 */
271     blkCnt = blockSize2 >> 2U;
272 
273     while (blkCnt > 0U)
274     {
275       py = py - 1U;
276 
277       /* Set all accumulators to zero */
278       acc0 = 0;
279       acc1 = 0;
280       acc2 = 0;
281       acc3 = 0;
282 
283       /* read x[0], x[1] samples */
284       x0 = read_q15x2 ((q15_t *) px);
285       /* read x[1], x[2] samples */
286       x1 = read_q15x2 ((q15_t *) px + 1);
287 	  px += 2U;
288 
289       /* Apply loop unrolling and compute 4 MACs simultaneously. */
290       k = srcBLen >> 2U;
291 
292       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
293        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
294       do
295       {
296         /* Read the last two inputB samples using SIMD:
297          * y[srcBLen - 1] and y[srcBLen - 2] */
298         c0 = read_q15x2_da ((q15_t **) &py);
299 
300         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
301         acc0 = __SMLADX(x0, c0, acc0);
302 
303         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
304         acc1 = __SMLADX(x1, c0, acc1);
305 
306         /* Read x[2], x[3] */
307         x2 = read_q15x2 ((q15_t *) px);
308 
309         /* Read x[3], x[4] */
310         x3 = read_q15x2 ((q15_t *) px + 1);
311 
312         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
313         acc2 = __SMLADX(x2, c0, acc2);
314 
315         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
316         acc3 = __SMLADX(x3, c0, acc3);
317 
318         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
319         c0 = read_q15x2_da ((q15_t **) &py);
320 
321         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
322         acc0 = __SMLADX(x2, c0, acc0);
323 
324         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
325         acc1 = __SMLADX(x3, c0, acc1);
326 
327         /* Read x[4], x[5] */
328         x0 = read_q15x2 ((q15_t *) px + 2);
329 
330         /* Read x[5], x[6] */
331         x1 = read_q15x2 ((q15_t *) px + 3);
332 		px += 4U;
333 
334         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
335         acc2 = __SMLADX(x0, c0, acc2);
336 
337         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
338         acc3 = __SMLADX(x1, c0, acc3);
339 
340       } while (--k);
341 
342       /* For the next MAC operations, SIMD is not used
343        * So, the 16 bit pointer if inputB, py is updated */
344 
345       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
346        ** No loop unrolling is used. */
347       k = srcBLen % 0x4U;
348 
349       if (k == 1U)
350       {
351         /* Read y[srcBLen - 5] */
352         c0 = *(py+1);
353 
354 #ifdef  ARM_MATH_BIG_ENDIAN
355         c0 = c0 << 16U;
356 #else
357         c0 = c0 & 0x0000FFFF;
358 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
359 
360         /* Read x[7] */
361         x3 = read_q15x2 ((q15_t *) px);
362 		px++;
363 
364         /* Perform the multiply-accumulates */
365         acc0 = __SMLAD(x0, c0, acc0);
366         acc1 = __SMLAD(x1, c0, acc1);
367         acc2 = __SMLADX(x1, c0, acc2);
368         acc3 = __SMLADX(x3, c0, acc3);
369       }
370 
371       if (k == 2U)
372       {
373         /* Read y[srcBLen - 5], y[srcBLen - 6] */
374         c0 = read_q15x2 ((q15_t *) py);
375 
376         /* Read x[7], x[8] */
377         x3 = read_q15x2 ((q15_t *) px);
378 
379         /* Read x[9] */
380         x2 = read_q15x2 ((q15_t *) px + 1);
381 		px += 2U;
382 
383         /* Perform the multiply-accumulates */
384         acc0 = __SMLADX(x0, c0, acc0);
385         acc1 = __SMLADX(x1, c0, acc1);
386         acc2 = __SMLADX(x3, c0, acc2);
387         acc3 = __SMLADX(x2, c0, acc3);
388       }
389 
390       if (k == 3U)
391       {
392         /* Read y[srcBLen - 5], y[srcBLen - 6] */
393         c0 = read_q15x2 ((q15_t *) py);
394 
395         /* Read x[7], x[8] */
396         x3 = read_q15x2 ((q15_t *) px);
397 
398         /* Read x[9] */
399         x2 = read_q15x2 ((q15_t *) px + 1);
400 
401         /* Perform the multiply-accumulates */
402         acc0 = __SMLADX(x0, c0, acc0);
403         acc1 = __SMLADX(x1, c0, acc1);
404         acc2 = __SMLADX(x3, c0, acc2);
405         acc3 = __SMLADX(x2, c0, acc3);
406 
407         /* Read y[srcBLen - 7] */
408 		c0 = *(py-1);
409 #ifdef  ARM_MATH_BIG_ENDIAN
410         c0 = c0 << 16U;
411 #else
412         c0 = c0 & 0x0000FFFF;
413 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
414 
415         /* Read x[10] */
416         x3 =  read_q15x2 ((q15_t *) px + 2);
417 		px += 3U;
418 
419         /* Perform the multiply-accumulates */
420         acc0 = __SMLADX(x1, c0, acc0);
421         acc1 = __SMLAD(x2, c0, acc1);
422         acc2 = __SMLADX(x2, c0, acc2);
423         acc3 = __SMLADX(x3, c0, acc3);
424       }
425 
426       /* Store the result in the accumulator in the destination buffer. */
427 #ifndef ARM_MATH_BIG_ENDIAN
428       write_q15x2_ia (&pOut, __PKHBT((acc0 >> 15), (acc1 >> 15), 16));
429       write_q15x2_ia (&pOut, __PKHBT((acc2 >> 15), (acc3 >> 15), 16));
430 #else
431       write_q15x2_ia (&pOut, __PKHBT((acc1 >> 15), (acc0 >> 15), 16));
432       write_q15x2_ia (&pOut, __PKHBT((acc3 >> 15), (acc2 >> 15), 16));
433 #endif /*#ifndef  ARM_MATH_BIG_ENDIAN*/
434 
435       /* Increment the pointer pIn1 index, count by 4 */
436       count += 4U;
437 
438       /* Update the inputA and inputB pointers for next MAC calculation */
439       px = pIn1 + count;
440       py = pSrc2;
441 
442       /* Decrement loop counter */
443       blkCnt--;
444     }
445 
446     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
447      ** No loop unrolling is used. */
448     blkCnt = blockSize2 % 0x4U;
449 
450     while (blkCnt > 0U)
451     {
452       /* Accumulator is made zero for every iteration */
453       sum = 0;
454 
455       /* Apply loop unrolling and compute 4 MACs simultaneously. */
456       k = srcBLen >> 2U;
457 
458       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
459        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
460       while (k > 0U)
461       {
462         /* Perform the multiply-accumulates */
463         sum += ((q31_t) *px++ * *py--);
464         sum += ((q31_t) *px++ * *py--);
465         sum += ((q31_t) *px++ * *py--);
466         sum += ((q31_t) *px++ * *py--);
467 
468         /* Decrement loop counter */
469         k--;
470       }
471 
472       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
473        ** No loop unrolling is used. */
474       k = srcBLen % 0x4U;
475 
476       while (k > 0U)
477       {
478         /* Perform the multiply-accumulates */
479         sum += ((q31_t) *px++ * *py--);
480 
481         /* Decrement loop counter */
482         k--;
483       }
484 
485       /* Store the result in the accumulator in the destination buffer. */
486       *pOut++ = (q15_t) (sum >> 15);
487 
488       /* Increment the pointer pIn1 index, count by 1 */
489       count++;
490 
491       /* Update the inputA and inputB pointers for next MAC calculation */
492       px = pIn1 + count;
493       py = pSrc2;
494 
495       /* Decrement loop counter */
496       blkCnt--;
497     }
498   }
499   else
500   {
501     /* If the srcBLen is not a multiple of 4,
502      * the blockSize2 loop cannot be unrolled by 4 */
503     blkCnt = blockSize2;
504 
505     while (blkCnt > 0U)
506     {
507       /* Accumulator is made zero for every iteration */
508       sum = 0;
509 
510       /* srcBLen number of MACS should be performed */
511       k = srcBLen;
512 
513       while (k > 0U)
514       {
515         /* Perform the multiply-accumulate */
516         sum += ((q31_t) *px++ * *py--);
517 
518         /* Decrement loop counter */
519         k--;
520       }
521 
522       /* Store the result in the accumulator in the destination buffer. */
523       *pOut++ = (q15_t) (sum >> 15);
524 
525       /* Increment MAC count */
526       count++;
527 
528       /* Update the inputA and inputB pointers for next MAC calculation */
529       px = pIn1 + count;
530       py = pSrc2;
531 
532       /* Decrement loop counter */
533       blkCnt--;
534     }
535   }
536 
537   /* --------------------------
538    * Initializations of stage3
539    * -------------------------*/
540 
541   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
542    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
543    * ....
544    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
545    * sum +=  x[srcALen-1] * y[srcBLen-1]
546    */
547 
548   /* In this stage the MAC operations are decreased by 1 for every iteration.
549      The blockSize3 variable holds the number of MAC operations performed */
550 
551   /* Working pointer of inputA */
552   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
553   px = pSrc1;
554 
555   /* Working pointer of inputB */
556   pSrc2 = pIn2 + (srcBLen - 1U);
557   pIn2 = pSrc2 - 1U;
558   py = pIn2;
559 
560   /* -------------------
561    * Stage3 process
562    * ------------------*/
563 
564   /* For loop unrolling by 4, this stage is divided into two. */
565   /* First part of this stage computes the MAC operations greater than 4 */
566   /* Second part of this stage computes the MAC operations less than or equal to 4 */
567 
568   /* The first part of the stage starts here */
569   j = blockSize3 >> 2U;
570 
571   while ((j > 0U) && (blockSize3 > 0U))
572   {
573     /* Accumulator is made zero for every iteration */
574     sum = 0;
575 
576     /* Apply loop unrolling and compute 4 MACs simultaneously. */
577     k = blockSize3 >> 2U;
578 
579     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
580      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
581     while (k > 0U)
582     {
583       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
584        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
585       sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
586       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
587        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
588       sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
589 
590       /* Decrement loop counter */
591       k--;
592     }
593 
594     /* For the next MAC operations, the pointer py is used without SIMD
595      * So, py is incremented by 1 */
596     py = py + 1U;
597 
598     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
599      ** No loop unrolling is used. */
600     k = blockSize3 % 0x4U;
601 
602     while (k > 0U)
603     {
604       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
605       sum = __SMLAD(*px++, *py--, sum);
606 
607       /* Decrement loop counter */
608       k--;
609     }
610 
611     /* Store the result in the accumulator in the destination buffer. */
612     *pOut++ = (q15_t) (sum >> 15);
613 
614     /* Update the inputA and inputB pointers for next MAC calculation */
615     px = ++pSrc1;
616     py = pIn2;
617 
618     /* Decrement  loop counter */
619     blockSize3--;
620 
621     j--;
622   }
623 
624   /* The second part of the stage starts here */
625   /* SIMD is not used for the next MAC operations,
626    * so pointer py is updated to read only one sample at a time */
627   py = py + 1U;
628 
629   while (blockSize3 > 0U)
630   {
631     /* Accumulator is made zero for every iteration */
632     sum = 0;
633 
634     /* Apply loop unrolling and compute 4 MACs simultaneously. */
635     k = blockSize3;
636 
637     while (k > 0U)
638     {
639       /* Perform the multiply-accumulates */
640       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
641       sum = __SMLAD(*px++, *py--, sum);
642 
643       /* Decrement loop counter */
644       k--;
645     }
646 
647     /* Store the result in the accumulator in the destination buffer. */
648     *pOut++ = (q15_t) (sum >> 15);
649 
650     /* Update the inputA and inputB pointers for next MAC calculation */
651     px = ++pSrc1;
652     py = pSrc2;
653 
654     /* Decrement the loop counter */
655     blockSize3--;
656   }
657 
658 }
659 
660 /**
661   @} end of Conv group
662  */
663