1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_q15.c
4  * Description:  Convolution of Q15 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q15 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using a 64-bit internal accumulator.
50                    Both inputs are in 1.15 format and multiplications yield a 2.30 result.
51                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
52                    This approach provides 33 guard bits and there is no risk of overflow.
53                    The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
54 
55   @remark
56                    Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
57   @remark
58                    Refer to \ref arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
59  */
60 
61 
62 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
63 #include "arm_helium_utils.h"
64 #include "arm_vec_filtering.h"
65 
66 
arm_conv_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)67 ARM_DSP_ATTRIBUTE void arm_conv_q15(
68   const q15_t * pSrcA,
69         uint32_t srcALen,
70   const q15_t * pSrcB,
71         uint32_t srcBLen,
72         q15_t * pDst)
73 {
74     const q15_t    *pIn1 = pSrcA;     /* inputA pointer               */
75     const q15_t    *pIn2 = pSrcB;     /* inputB pointer               */
76     /*
77      * Loop to perform MAC operations according to correlation equation
78      */
79     const q15_t    *pX;
80     const q15_t    *pY;
81     const q15_t    *pA;
82     const q15_t    *pB;
83     int32_t   i = 0U, j = 0;    /* loop counters */
84     int32_t   block1, block2, block3;
85 
86 
87 
88     uint16x8_t decrIdxVec = vddupq_u16(7, 1);
89 
90 
91     if (srcALen < srcBLen)
92     {
93         /*
94          * Initialization to inputB pointer
95          */
96         pIn1 = pSrcB;
97         /*
98          * Initialization to the end of inputA pointer
99          */
100         pIn2 = pSrcA;
101         /*
102          * Swapping the lengths
103          */
104         j = srcALen;
105         srcALen = srcBLen;
106         srcBLen = j;
107     }
108 
109     block1 = srcBLen - 1;
110     block2 = srcALen - srcBLen + 1;
111     block3 = srcBLen - 1;
112 
113 
114     pA = pIn1;
115     pB = pIn2 - 7;
116 
117     for (i = 0; i <= block1 - 2; i += 2)
118     {
119         uint32_t  count = i + 1;
120         int64_t   acc0 = 0LL;
121         int64_t   acc1 = 0LL;
122 
123         pX = pA;
124         pY = pB;
125 
126         MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count);
127         *pDst++ = (q15_t) acc0;
128         *pDst++ = (q15_t) acc1;
129         pB += 2;
130     }
131     for (; i < block1; i++)
132     {
133         uint32_t  count = i + 1;
134         int64_t   acc = 0LL;
135 
136         pX = pA;
137         pY = pB;
138 
139         MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
140         *pDst++ = (q15_t) acc;
141         pB++;
142     }
143 
144     for (i = 0; i <= block2 - 4; i += 4)
145     {
146         uint32_t  count = srcBLen;
147         int64_t   acc0 = 0LL;
148         int64_t   acc1 = 0LL;
149         int64_t   acc2 = 0LL;
150         int64_t   acc3 = 0LL;
151 
152         pX = pA;
153         pY = pB;
154         /*
155          * compute 4 accumulators per loop
156          * size is fixed for all accumulators
157          * X pointer is incrementing for successive accumulators
158          */
159         MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count);
160         *pDst++ = (q15_t) acc0;
161         *pDst++ = (q15_t) acc1;
162         *pDst++ = (q15_t) acc2;
163         *pDst++ = (q15_t) acc3;
164 
165         pA += 4;
166     }
167     for (; i <= block2 - 2; i += 2)
168     {
169         uint32_t  count = srcBLen;
170         int64_t   acc0 = 0LL;
171         int64_t   acc1 = 0LL;
172 
173         pX = pA;
174         pY = pB;
175         /*
176          * compute 2 accumulators per loop
177          * size is fixed for all accumulators
178          * X pointer is incrementing for successive accumulators
179          */
180         MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count);
181         *pDst++ = (q15_t) acc0;
182         *pDst++ = (q15_t) acc1;
183 
184         pA += 2;
185     }
186     if (block2 & 1)
187     {
188         uint32_t  count = srcBLen;
189         int64_t   acc = 0LL;
190 
191         pX = pA;
192         pY = pB;
193 
194         MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
195         *pDst++ = (q15_t) acc;
196         pA++;
197     }
198 
199     for (i = block3; i >= 2; i -= 2)
200     {
201         uint32_t  count = i;
202         int64_t   acc0 = 0LL;
203         int64_t   acc1 = 0LL;
204 
205         pX = pA;
206         pY = pB;
207 
208         MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count);
209         *pDst++ = (q15_t) acc0;
210         *pDst++ = (q15_t) acc1;
211         pA += 2;
212     }
213     for (; i > 0; i--)
214     {
215         uint32_t  count = i;
216         int64_t   acc = 0LL;
217 
218         pX = pA;
219         pY = pB;
220 
221         MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
222         *pDst++ = (q15_t) acc;
223         pA++;
224     }
225 
226 
227 }
228 #else
arm_conv_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)229 ARM_DSP_ATTRIBUTE void arm_conv_q15(
230   const q15_t * pSrcA,
231         uint32_t srcALen,
232   const q15_t * pSrcB,
233         uint32_t srcBLen,
234         q15_t * pDst)
235 {
236 
237 #if defined (ARM_MATH_DSP)
238 
239   const q15_t *pIn1;                                   /* InputA pointer */
240   const q15_t *pIn2;                                   /* InputB pointer */
241         q15_t *pOut = pDst;                            /* Output pointer */
242         q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
243   const q15_t *px;                                     /* Intermediate inputA pointer */
244   const q15_t *py;                                     /* Intermediate inputB pointer */
245   const q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
246         q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables to hold state and coefficient values */
247         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
248         uint32_t j, k, count, blkCnt;                  /* Loop counters */
249 
250 
251 
252   /* The algorithm implementation is based on the lengths of the inputs. */
253   /* srcB is always made to slide across srcA. */
254   /* So srcBLen is always considered as shorter or equal to srcALen */
255   if (srcALen >= srcBLen)
256   {
257     /* Initialization of inputA pointer */
258     pIn1 = pSrcA;
259 
260     /* Initialization of inputB pointer */
261     pIn2 = pSrcB;
262   }
263   else
264   {
265     /* Initialization of inputA pointer */
266     pIn1 = pSrcB;
267 
268     /* Initialization of inputB pointer */
269     pIn2 = pSrcA;
270 
271     /* srcBLen is always considered as shorter or equal to srcALen */
272     j = srcBLen;
273     srcBLen = srcALen;
274     srcALen = j;
275   }
276 
277   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
278   /* The function is internally
279    * divided into three stages according to the number of multiplications that has to be
280    * taken place between inputA samples and inputB samples. In the first stage of the
281    * algorithm, the multiplications increase by one for every iteration.
282    * In the second stage of the algorithm, srcBLen number of multiplications are done.
283    * In the third stage of the algorithm, the multiplications decrease by one
284    * for every iteration. */
285 
286   /* The algorithm is implemented in three stages.
287      The loop counters of each stage is initiated here. */
288   blockSize1 = srcBLen - 1U;
289   blockSize2 = srcALen - (srcBLen - 1U);
290 
291 
292 
293 
294   /* --------------------------
295    * Initializations of stage1
296    * -------------------------*/
297 
298   /* sum = x[0] * y[0]
299    * sum = x[0] * y[1] + x[1] * y[0]
300    * ....
301    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
302    */
303 
304   /* In this stage the MAC operations are increased by 1 for every iteration.
305      The count variable holds the number of MAC operations performed */
306   count = 1U;
307 
308   /* Working pointer of inputA */
309   px = pIn1;
310 
311   /* Working pointer of inputB */
312   py = pIn2;
313 
314   /* ------------------------
315    * Stage1 process
316    * ----------------------*/
317 
318   /* For loop unrolling by 4, this stage is divided into two. */
319   /* First part of this stage computes the MAC operations less than 4 */
320   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
321 
322   /* The first part of the stage starts here */
323   while ((count < 4U) && (blockSize1 > 0U))
324   {
325     /* Accumulator is made zero for every iteration */
326     sum = 0;
327 
328     /* Loop over number of MAC operations between
329      * inputA samples and inputB samples */
330     k = count;
331 
332     while (k > 0U)
333     {
334       /* Perform the multiply-accumulates */
335       sum = __SMLALD(*px++, *py--, sum);
336 
337       /* Decrement loop counter */
338       k--;
339     }
340 
341     /* Store the result in the accumulator in the destination buffer. */
342     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
343 
344     /* Update the inputA and inputB pointers for next MAC calculation */
345     py = pIn2 + count;
346     px = pIn1;
347 
348     /* Increment MAC count */
349     count++;
350 
351     /* Decrement loop counter */
352     blockSize1--;
353   }
354 
355   /* The second part of the stage starts here */
356   /* The internal loop, over count, is unrolled by 4 */
357   /* To, read the last two inputB samples using SIMD:
358    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
359   py = py - 1;
360 
361   while (blockSize1 > 0U)
362   {
363     /* Accumulator is made zero for every iteration */
364     sum = 0;
365 
366     /* Apply loop unrolling and compute 4 MACs simultaneously. */
367     k = count >> 2U;
368 
369     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
370      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
371     while (k > 0U)
372     {
373       /* Perform the multiply-accumulate */
374       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
375       sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
376       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
377       sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
378 
379       /* Decrement loop counter */
380       k--;
381     }
382 
383     /* For the next MAC operations, the pointer py is used without SIMD
384      * So, py is incremented by 1 */
385     py = py + 1U;
386 
387     /* If the count is not a multiple of 4, compute any remaining MACs here.
388      ** No loop unrolling is used. */
389     k = count % 0x4U;
390 
391     while (k > 0U)
392     {
393       /* Perform the multiply-accumulate */
394       sum = __SMLALD(*px++, *py--, sum);
395 
396       /* Decrement loop counter */
397       k--;
398     }
399 
400     /* Store the result in the accumulator in the destination buffer. */
401     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
402 
403     /* Update the inputA and inputB pointers for next MAC calculation */
404     py = pIn2 + (count - 1U);
405     px = pIn1;
406 
407     /* Increment MAC count */
408     count++;
409 
410     /* Decrement loop counter */
411     blockSize1--;
412   }
413 
414   /* --------------------------
415    * Initializations of stage2
416    * ------------------------*/
417 
418   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
419    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
420    * ....
421    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
422    */
423 
424   /* Working pointer of inputA */
425   px = pIn1;
426 
427   /* Working pointer of inputB */
428   pSrc2 = pIn2 + (srcBLen - 1U);
429   py = pSrc2;
430 
431   /* count is the index by which the pointer pIn1 to be incremented */
432   count = 0U;
433 
434   /* -------------------
435    * Stage2 process
436    * ------------------*/
437 
438   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
439    * So, to loop unroll over blockSize2,
440    * srcBLen should be greater than or equal to 4 */
441   if (srcBLen >= 4U)
442   {
443     /* Loop unrolling: Compute 4 outputs at a time */
444     blkCnt = blockSize2 >> 2U;
445 
446     while (blkCnt > 0U)
447     {
448       py = py - 1U;
449 
450       /* Set all accumulators to zero */
451       acc0 = 0;
452       acc1 = 0;
453       acc2 = 0;
454       acc3 = 0;
455 
456       /* read x[0], x[1] samples */
457       x0 = read_q15x2 ((q15_t *) px);
458 
459       /* read x[1], x[2] samples */
460       x1 = read_q15x2 ((q15_t *) px + 1);
461       px += 2U;
462 
463       /* Apply loop unrolling and compute 4 MACs simultaneously. */
464       k = srcBLen >> 2U;
465 
466       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
467        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
468       do
469       {
470         /* Read the last two inputB samples using SIMD:
471          * y[srcBLen - 1] and y[srcBLen - 2] */
472         c0 = read_q15x2_da ((q15_t **) &py);
473 
474         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
475         acc0 = __SMLALDX(x0, c0, acc0);
476 
477         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
478         acc1 = __SMLALDX(x1, c0, acc1);
479 
480         /* Read x[2], x[3] */
481         x2 = read_q15x2 ((q15_t *) px);
482 
483         /* Read x[3], x[4] */
484         x3 = read_q15x2 ((q15_t *) px + 1);
485 
486         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
487         acc2 = __SMLALDX(x2, c0, acc2);
488 
489         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
490         acc3 = __SMLALDX(x3, c0, acc3);
491 
492         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
493         c0 = read_q15x2_da ((q15_t **) &py);
494 
495         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
496         acc0 = __SMLALDX(x2, c0, acc0);
497 
498         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
499         acc1 = __SMLALDX(x3, c0, acc1);
500 
501         /* Read x[4], x[5] */
502         x0 = read_q15x2 ((q15_t *) px + 2);
503 
504         /* Read x[5], x[6] */
505         x1 = read_q15x2 ((q15_t *) px + 3);
506 
507         px += 4U;
508 
509         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
510         acc2 = __SMLALDX(x0, c0, acc2);
511 
512         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
513         acc3 = __SMLALDX(x1, c0, acc3);
514 
515       } while (--k);
516 
517       /* For the next MAC operations, SIMD is not used
518        * So, the 16 bit pointer if inputB, py is updated */
519 
520       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
521        ** No loop unrolling is used. */
522       k = srcBLen % 0x4U;
523 
524       if (k == 1U)
525       {
526         /* Read y[srcBLen - 5] */
527         c0 = *(py + 1);
528 #ifdef  ARM_MATH_BIG_ENDIAN
529         c0 = c0 << 16U;
530 #else
531         c0 = c0 & 0x0000FFFF;
532 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
533 
534         /* Read x[7] */
535         x3 = read_q15x2 ((q15_t *) px);
536         px++;
537 
538         /* Perform the multiply-accumulate */
539         acc0 = __SMLALD(x0, c0, acc0);
540         acc1 = __SMLALD(x1, c0, acc1);
541         acc2 = __SMLALDX(x1, c0, acc2);
542         acc3 = __SMLALDX(x3, c0, acc3);
543       }
544 
545       if (k == 2U)
546       {
547         /* Read y[srcBLen - 5], y[srcBLen - 6] */
548         c0 = read_q15x2 ((q15_t *) py);
549 
550         /* Read x[7], x[8] */
551         x3 = read_q15x2 ((q15_t *) px);
552 
553         /* Read x[9] */
554         x2 = read_q15x2 ((q15_t *) px + 1);
555         px += 2U;
556 
557         /* Perform the multiply-accumulate */
558         acc0 = __SMLALDX(x0, c0, acc0);
559         acc1 = __SMLALDX(x1, c0, acc1);
560         acc2 = __SMLALDX(x3, c0, acc2);
561         acc3 = __SMLALDX(x2, c0, acc3);
562       }
563 
564       if (k == 3U)
565       {
566         /* Read y[srcBLen - 5], y[srcBLen - 6] */
567         c0 = read_q15x2 ((q15_t *) py);
568 
569         /* Read x[7], x[8] */
570         x3 = read_q15x2 ((q15_t *) px);
571 
572         /* Read x[9] */
573         x2 = read_q15x2 ((q15_t *) px + 1);
574 
575         /* Perform the multiply-accumulate */
576         acc0 = __SMLALDX(x0, c0, acc0);
577         acc1 = __SMLALDX(x1, c0, acc1);
578         acc2 = __SMLALDX(x3, c0, acc2);
579         acc3 = __SMLALDX(x2, c0, acc3);
580 
581         c0 = *(py-1);
582 #ifdef  ARM_MATH_BIG_ENDIAN
583         c0 = c0 << 16U;
584 #else
585         c0 = c0 & 0x0000FFFF;
586 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
587 
588         /* Read x[10] */
589         x3 =  read_q15x2 ((q15_t *) px + 2);
590         px += 3U;
591 
592         /* Perform the multiply-accumulates */
593         acc0 = __SMLALDX(x1, c0, acc0);
594         acc1 = __SMLALD(x2, c0, acc1);
595         acc2 = __SMLALDX(x2, c0, acc2);
596         acc3 = __SMLALDX(x3, c0, acc3);
597       }
598 
599       /* Store the result in the accumulator in the destination buffer. */
600       {
601         int32_t sat0 = __SSAT((acc0 >> 15), 16);
602         int32_t sat1 = __SSAT((acc1 >> 15), 16);
603         int32_t sat2 = __SSAT((acc2 >> 15), 16);
604         int32_t sat3 = __SSAT((acc3 >> 15), 16);
605 #ifndef  ARM_MATH_BIG_ENDIAN
606         write_q15x2_ia (&pOut, __PKHBT(sat0, sat1, 16));
607         write_q15x2_ia (&pOut, __PKHBT(sat2, sat3, 16));
608 #else
609         write_q15x2_ia (&pOut, __PKHBT(sat1, sat0, 16));
610         write_q15x2_ia (&pOut, __PKHBT(sat3, sat2, 16));
611 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
612       }
613       /* Increment the pointer pIn1 index, count by 4 */
614       count += 4U;
615 
616       /* Update the inputA and inputB pointers for next MAC calculation */
617       px = pIn1 + count;
618       py = pSrc2;
619 
620       /* Decrement loop counter */
621       blkCnt--;
622     }
623 
624     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
625      ** No loop unrolling is used. */
626     blkCnt = blockSize2 % 0x4U;
627 
628     while (blkCnt > 0U)
629     {
630       /* Accumulator is made zero for every iteration */
631       sum = 0;
632 
633       /* Apply loop unrolling and compute 4 MACs simultaneously. */
634       k = srcBLen >> 2U;
635 
636       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
637        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
638       while (k > 0U)
639       {
640         /* Perform the multiply-accumulates */
641         sum += (q63_t) ((q31_t) *px++ * *py--);
642         sum += (q63_t) ((q31_t) *px++ * *py--);
643         sum += (q63_t) ((q31_t) *px++ * *py--);
644         sum += (q63_t) ((q31_t) *px++ * *py--);
645 
646         /* Decrement loop counter */
647         k--;
648       }
649 
650       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
651        ** No loop unrolling is used. */
652       k = srcBLen % 0x4U;
653 
654       while (k > 0U)
655       {
656         /* Perform the multiply-accumulates */
657         sum += (q63_t) ((q31_t) *px++ * *py--);
658 
659         /* Decrement the loop counter */
660         k--;
661       }
662 
663       /* Store the result in the accumulator in the destination buffer. */
664       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
665 
666       /* Increment the pointer pIn1 index, count by 1 */
667       count++;
668 
669       /* Update the inputA and inputB pointers for next MAC calculation */
670       px = pIn1 + count;
671       py = pSrc2;
672 
673       /* Decrement the loop counter */
674       blkCnt--;
675     }
676   }
677   else
678   {
679     /* If the srcBLen is not a multiple of 4,
680      * the blockSize2 loop cannot be unrolled by 4 */
681     blkCnt = blockSize2;
682 
683     while (blkCnt > 0U)
684     {
685       /* Accumulator is made zero for every iteration */
686       sum = 0;
687 
688       /* srcBLen number of MACS should be performed */
689       k = srcBLen;
690 
691       while (k > 0U)
692       {
693         /* Perform the multiply-accumulate */
694         sum += (q63_t) ((q31_t) *px++ * *py--);
695 
696         /* Decrement the loop counter */
697         k--;
698       }
699 
700       /* Store the result in the accumulator in the destination buffer. */
701       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
702 
703       /* Increment the MAC count */
704       count++;
705 
706       /* Update the inputA and inputB pointers for next MAC calculation */
707       px = pIn1 + count;
708       py = pSrc2;
709 
710       /* Decrement the loop counter */
711       blkCnt--;
712     }
713   }
714 
715 
716   /* --------------------------
717    * Initializations of stage3
718    * -------------------------*/
719 
720   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
721    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
722    * ....
723    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
724    * sum +=  x[srcALen-1] * y[srcBLen-1]
725    */
726 
727   /* In this stage the MAC operations are decreased by 1 for every iteration.
728      The blockSize3 variable holds the number of MAC operations performed */
729   blockSize3 = srcBLen - 1U;
730 
731   /* Working pointer of inputA */
732   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
733   px = pSrc1;
734 
735   /* Working pointer of inputB */
736   pSrc2 = pIn2 + (srcBLen - 1U);
737   pIn2 = pSrc2 - 1U;
738   py = pIn2;
739 
740   /* -------------------
741    * Stage3 process
742    * ------------------*/
743 
744   /* For loop unrolling by 4, this stage is divided into two. */
745   /* First part of this stage computes the MAC operations greater than 4 */
746   /* Second part of this stage computes the MAC operations less than or equal to 4 */
747 
748   /* The first part of the stage starts here */
749   j = blockSize3 >> 2U;
750 
751   while ((j > 0U) && (blockSize3 > 0U))
752   {
753     /* Accumulator is made zero for every iteration */
754     sum = 0;
755 
756     /* Apply loop unrolling and compute 4 MACs simultaneously. */
757     k = blockSize3 >> 2U;
758 
759     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
760      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
761     while (k > 0U)
762     {
763       /* Perform the multiply-accumulate */
764       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
765        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
766       sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
767       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
768        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
769       sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
770 
771       /* Decrement loop counter */
772       k--;
773     }
774 
775     /* For the next MAC operations, the pointer py is used without SIMD
776      * So, py is incremented by 1 */
777     py = py + 1U;
778 
779     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
780      ** No loop unrolling is used. */
781     k = blockSize3 % 0x4U;
782 
783     while (k > 0U)
784     {
785       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
786       sum = __SMLALD(*px++, *py--, sum);
787 
788       /* Decrement loop counter */
789       k--;
790     }
791 
792     /* Store the result in the accumulator in the destination buffer. */
793     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
794 
795     /* Update the inputA and inputB pointers for next MAC calculation */
796     px = ++pSrc1;
797     py = pIn2;
798 
799     /* Decrement loop counter */
800     blockSize3--;
801 
802     j--;
803   }
804 
805   /* The second part of the stage starts here */
806   /* SIMD is not used for the next MAC operations,
807    * so pointer py is updated to read only one sample at a time */
808   py = py + 1U;
809 
810   while (blockSize3 > 0U)
811   {
812     /* Accumulator is made zero for every iteration */
813     sum = 0;
814 
815     /* Apply loop unrolling and compute 4 MACs simultaneously. */
816     k = blockSize3;
817 
818     while (k > 0U)
819     {
820       /* Perform the multiply-accumulates */
821       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
822       sum = __SMLALD(*px++, *py--, sum);
823 
824       /* Decrement loop counter */
825       k--;
826     }
827 
828     /* Store the result in the accumulator in the destination buffer. */
829     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
830 
831     /* Update the inputA and inputB pointers for next MAC calculation */
832     px = ++pSrc1;
833     py = pSrc2;
834 
835     /* Decrement loop counter */
836     blockSize3--;
837   }
838 
839 #else /* #if defined (ARM_MATH_DSP) */
840 
841   const q15_t *pIn1 = pSrcA;                           /* InputA pointer */
842   const q15_t *pIn2 = pSrcB;                           /* InputB pointer */
843         q63_t sum;                                     /* Accumulator */
844         uint32_t i, j;                                 /* Loop counters */
845 
846   /* Loop to calculate convolution for output length number of values */
847   for (i = 0; i < (srcALen + srcBLen - 1); i++)
848   {
849     /* Initialize sum with zero to carry on MAC operations */
850     sum = 0;
851 
852     /* Loop to perform MAC operations according to convolution equation */
853     for (j = 0U; j <= i; j++)
854     {
855       /* Check the array limitations */
856       if (((i - j) < srcBLen) && (j < srcALen))
857       {
858         /* z[i] += x[i-j] * y[j] */
859         sum += ((q31_t) pIn1[j] * pIn2[i - j]);
860       }
861     }
862 
863     /* Store the output in the destination buffer */
864     pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
865   }
866 
867 #endif /* #if defined (ARM_MATH_DSP) */
868 
869 }
870 #endif /* defined(ARM_MATH_MVEI) */
871 
872 /**
873   @} end of Conv group
874  */
875