1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_q15.c
4  * Description:  Correlation of Q15 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q15 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using a 64-bit internal accumulator.
50                    Both inputs are in 1.15 format and multiplications yield a 2.30 result.
51                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
52                    This approach provides 33 guard bits and there is no risk of overflow.
53                    The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
54 
55   @remark
56                    Refer to \ref arm_correlate_fast_q15() for a faster but less precise version of this function.
57   @remark
58                    Refer to \ref arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
59  */
60 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
61 #include "arm_helium_utils.h"
62 #include "arm_vec_filtering.h"
63 
arm_correlate_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)64 ARM_DSP_ATTRIBUTE void arm_correlate_q15(
65   const q15_t * pSrcA,
66         uint32_t srcALen,
67   const q15_t * pSrcB,
68         uint32_t srcBLen,
69         q15_t * pDst)
70 {
71     const q15_t    *pIn1 = pSrcA;     /* inputA pointer               */
72     const q15_t    *pIn2 = pSrcB + (srcBLen - 1U);    /* inputB pointer               */
73     /*
74      * Loop to perform MAC operations according to correlation equation
75      */
76     const q15_t    *pX;
77     const q15_t    *pY;
78     const q15_t    *pA;
79     const q15_t    *pB;
80     int32_t   i = 0U, j = 0;    /* loop counters */
81     int32_t   inv = 2U;         /* Reverse order flag */
82     uint32_t  tot = 0U;         /* Length */
83     int32_t   block1, block2, block3;
84     int32_t   incr;
85 
86     tot = ((srcALen + srcBLen) - 2U);
87     if (srcALen > srcBLen)
88     {
89         /*
90          * Calculating the number of zeros to be padded to the output
91          */
92         j = srcALen - srcBLen;
93         /*
94          * Initialize the pointer after zero padding
95          */
96         pDst += j;
97     }
98     else if (srcALen < srcBLen)
99     {
100         /*
101          * Initialization to inputB pointer
102          */
103         pIn1 = pSrcB;
104         /*
105          * Initialization to the end of inputA pointer
106          */
107         pIn2 = pSrcA + (srcALen - 1U);
108         /*
109          * Initialisation of the pointer after zero padding
110          */
111         pDst = pDst + tot;
112         /*
113          * Swapping the lengths
114          */
115         j = srcALen;
116         srcALen = srcBLen;
117         srcBLen = j;
118         /*
119          * Setting the reverse flag
120          */
121         inv = -2;
122     }
123 
124     block1 = srcBLen - 1;
125     block2 = srcALen - srcBLen + 1;
126     block3 = srcBLen - 1;
127 
128     pA = pIn1;
129     pB = pIn2;
130     incr = inv / 2;
131 
132     for (i = 0U; i <= block1 - 2; i += 2)
133     {
134         uint32_t  count = i + 1;
135         int64_t   acc0 = 0LL;
136         int64_t   acc1 = 0LL;
137 
138         /*
139          * compute 2 accumulators per loop
140          * size is incrementing for second accumulator
141          * Y pointer is decrementing for second accumulator
142          */
143         pX = pA;
144         pY = pB;
145         MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count);
146 
147         *pDst = (q15_t) acc0;
148         pDst += incr;
149         *pDst = (q15_t) acc1;
150         pDst += incr;
151         pB -= 2;
152     }
153     for (; i < block1; i++)
154     {
155         uint32_t  count = i + 1;
156         int64_t   acc = 0LL;
157 
158         pX = pA;
159         pY = pB;
160         MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count);
161 
162         *pDst = (q15_t) acc;
163         pDst += incr;
164         pB--;
165     }
166 
167     for (i = 0U; i <= block2 - 4; i += 4)
168     {
169         int64_t   acc0 = 0LL;
170         int64_t   acc1 = 0LL;
171         int64_t   acc2 = 0LL;
172         int64_t   acc3 = 0LL;
173 
174         pX = pA;
175         pY = pB;
176         /*
177          * compute 4 accumulators per loop
178          * size is fixed for all accumulators
179          * X pointer is incrementing for successive accumulators
180          */
181         MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
182 
183         *pDst = (q15_t) acc0;
184         pDst += incr;
185         *pDst = (q15_t) acc1;
186         pDst += incr;
187         *pDst = (q15_t) acc2;
188         pDst += incr;
189         *pDst = (q15_t) acc3;
190         pDst += incr;
191         pA += 4;
192     }
193 
194     for (; i <= block2 - 2; i += 2)
195     {
196         int64_t   acc0 = 0LL;
197         int64_t   acc1 = 0LL;
198 
199         pX = pA;
200         pY = pB;
201         /*
202          * compute 2 accumulators per loop
203          * size is fixed for all accumulators
204          * X pointer is incrementing for second accumulator
205          */
206         MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, srcBLen);
207 
208         *pDst = (q15_t) acc0;
209         pDst += incr;
210         *pDst = (q15_t) acc1;
211         pDst += incr;
212         pA += 2;
213     }
214 
215     if (block2 & 1)
216     {
217         int64_t   acc = 0LL;
218 
219         pX = pA;
220         pY = pB;
221         MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, srcBLen);
222 
223         *pDst = (q15_t) acc;
224         pDst += incr;
225         pA++;
226     }
227 
228     for (i = block3 - 1; i > 0; i -= 2)
229     {
230 
231         uint32_t  count = (i + 1);
232         int64_t   acc0 = 0LL;
233         int64_t   acc1 = 0LL;
234 
235         pX = pA;
236         pY = pB;
237         /*
238          * compute 2 accumulators per loop
239          * size is decrementing for second accumulator
240          * X pointer is incrementing for second accumulator
241          */
242         MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count);
243 
244         *pDst = (q15_t) acc0;
245         pDst += incr;
246         *pDst = (q15_t) acc1;
247         pDst += incr;
248         pA += 2;
249 
250     }
251     for (; i >= 0; i--)
252     {
253         uint32_t  count = (i + 1);
254         int64_t   acc = 0LL;
255 
256         pX = pA;
257         pY = pB;
258         MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count);
259 
260         *pDst = (q15_t) acc;
261         pDst += incr;
262         pA++;
263     }
264 }
265 
266 #else
arm_correlate_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)267 ARM_DSP_ATTRIBUTE void arm_correlate_q15(
268   const q15_t * pSrcA,
269         uint32_t srcALen,
270   const q15_t * pSrcB,
271         uint32_t srcBLen,
272         q15_t * pDst)
273 {
274 
275 #if defined (ARM_MATH_DSP)
276 
277   const q15_t *pIn1;                                   /* InputA pointer */
278   const q15_t *pIn2;                                   /* InputB pointer */
279         q15_t *pOut = pDst;                            /* Output pointer */
280         q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
281   const q15_t *px;                                     /* Intermediate inputA pointer */
282   const q15_t *py;                                     /* Intermediate inputB pointer */
283   const q15_t *pSrc1;                                  /* Intermediate pointers */
284         q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables for holding input and coefficient values */
285         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
286         uint32_t j, k, count, blkCnt;                  /* Loop counters */
287         uint32_t outBlockSize;
288         int32_t inc = 1;                               /* Destination address modifier */
289 
290   /* The algorithm implementation is based on the lengths of the inputs. */
291   /* srcB is always made to slide across srcA. */
292   /* So srcBLen is always considered as shorter or equal to srcALen */
293   /* But CORR(x, y) is reverse of CORR(y, x) */
294   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
295   /* and the destination pointer modifier, inc is set to -1 */
296   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
297   /* But to improve the performance,
298    * we include zeroes in the output instead of zero padding either of the the inputs*/
299   /* If srcALen > srcBLen,
300    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
301   /* If srcALen < srcBLen,
302    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
303   if (srcALen >= srcBLen)
304   {
305     /* Initialization of inputA pointer */
306     pIn1 = pSrcA;
307 
308     /* Initialization of inputB pointer */
309     pIn2 = pSrcB;
310 
311     /* Number of output samples is calculated */
312     outBlockSize = (srcALen * 2U) - 1U;
313 
314     /* When srcALen > srcBLen, zero padding is done to srcB
315      * to make their lengths equal.
316      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
317      * number of output samples are made zero */
318     j = outBlockSize - (srcALen + (srcBLen - 1U));
319 
320     /* Updating the pointer position to non zero value */
321     pOut += j;
322   }
323   else
324   {
325     /* Initialization of inputA pointer */
326     pIn1 = pSrcB;
327 
328     /* Initialization of inputB pointer */
329     pIn2 = pSrcA;
330 
331     /* srcBLen is always considered as shorter or equal to srcALen */
332     j = srcBLen;
333     srcBLen = srcALen;
334     srcALen = j;
335 
336     /* CORR(x, y) = Reverse order(CORR(y, x)) */
337     /* Hence set the destination pointer to point to the last output sample */
338     pOut = pDst + ((srcALen + srcBLen) - 2U);
339 
340     /* Destination address modifier is set to -1 */
341     inc = -1;
342   }
343 
344   /* The function is internally
345    * divided into three stages according to the number of multiplications that has to be
346    * taken place between inputA samples and inputB samples. In the first stage of the
347    * algorithm, the multiplications increase by one for every iteration.
348    * In the second stage of the algorithm, srcBLen number of multiplications are done.
349    * In the third stage of the algorithm, the multiplications decrease by one
350    * for every iteration. */
351 
352   /* The algorithm is implemented in three stages.
353      The loop counters of each stage is initiated here. */
354   blockSize1 = srcBLen - 1U;
355   blockSize2 = srcALen - (srcBLen - 1U);
356   blockSize3 = blockSize1;
357 
358   /* --------------------------
359    * Initializations of stage1
360    * -------------------------*/
361 
362   /* sum = x[0] * y[srcBlen - 1]
363    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
364    * ....
365    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
366    */
367 
368   /* In this stage the MAC operations are increased by 1 for every iteration.
369      The count variable holds the number of MAC operations performed */
370   count = 1U;
371 
372   /* Working pointer of inputA */
373   px = pIn1;
374 
375   /* Working pointer of inputB */
376   pSrc1 = pIn2 + (srcBLen - 1U);
377   py = pSrc1;
378 
379   /* ------------------------
380    * Stage1 process
381    * ----------------------*/
382 
383   /* The first loop starts here */
384   while (blockSize1 > 0U)
385   {
386     /* Accumulator is made zero for every iteration */
387     sum = 0;
388 
389     /* Apply loop unrolling and compute 4 MACs simultaneously. */
390     k = count >> 2U;
391 
392     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
393      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
394     while (k > 0U)
395     {
396       /* Perform the multiply-accumulate */
397       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
398       sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
399       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
400       sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
401 
402       /* Decrement loop counter */
403       k--;
404     }
405 
406     /* If the count is not a multiple of 4, compute any remaining MACs here.
407      ** No loop unrolling is used. */
408     k = count % 0x4U;
409 
410     while (k > 0U)
411     {
412       /* Perform the multiply-accumulate */
413       /* x[0] * y[srcBLen - 1] */
414       sum = __SMLALD(*px++, *py++, sum);
415 
416       /* Decrement loop counter */
417       k--;
418     }
419 
420     /* Store the result in the accumulator in the destination buffer. */
421     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
422     /* Destination pointer is updated according to the address modifier, inc */
423     pOut += inc;
424 
425     /* Update the inputA and inputB pointers for next MAC calculation */
426     py = pSrc1 - count;
427     px = pIn1;
428 
429     /* Increment MAC count */
430     count++;
431 
432     /* Decrement loop counter */
433     blockSize1--;
434   }
435 
436   /* --------------------------
437    * Initializations of stage2
438    * ------------------------*/
439 
440   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
441    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
442    * ....
443    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
444    */
445 
446   /* Working pointer of inputA */
447   px = pIn1;
448 
449   /* Working pointer of inputB */
450   py = pIn2;
451 
452   /* count is the index by which the pointer pIn1 to be incremented */
453   count = 0U;
454 
455   /* -------------------
456    * Stage2 process
457    * ------------------*/
458 
459   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
460    * So, to loop unroll over blockSize2,
461    * srcBLen should be greater than or equal to 4 */
462   if (srcBLen >= 4U)
463   {
464     /* Loop unrolling: Compute 4 outputs at a time */
465     blkCnt = blockSize2 >> 2U;
466 
467     while (blkCnt > 0U)
468     {
469       /* Set all accumulators to zero */
470       acc0 = 0;
471       acc1 = 0;
472       acc2 = 0;
473       acc3 = 0;
474 
475       /* read x[0], x[1] samples */
476       x0 = read_q15x2 ((q15_t *) px);
477 
478       /* read x[1], x[2] samples */
479       x1 = read_q15x2 ((q15_t *) px + 1);
480       px += 2U;
481 
482       /* Apply loop unrolling and compute 4 MACs simultaneously. */
483       k = srcBLen >> 2U;
484 
485       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
486        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
487       do
488       {
489         /* Read the first two inputB samples using SIMD:
490          * y[0] and y[1] */
491         c0 = read_q15x2_ia ((q15_t **) &py);
492 
493         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
494         acc0 = __SMLALD(x0, c0, acc0);
495 
496         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
497         acc1 = __SMLALD(x1, c0, acc1);
498 
499         /* Read x[2], x[3] */
500         x2 = read_q15x2 ((q15_t *) px);
501 
502         /* Read x[3], x[4] */
503         x3 = read_q15x2 ((q15_t *) px + 1);
504 
505         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
506         acc2 = __SMLALD(x2, c0, acc2);
507 
508         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
509         acc3 = __SMLALD(x3, c0, acc3);
510 
511         /* Read y[2] and y[3] */
512         c0 = read_q15x2_ia ((q15_t **) &py);
513 
514         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
515         acc0 = __SMLALD(x2, c0, acc0);
516 
517         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
518         acc1 = __SMLALD(x3, c0, acc1);
519 
520         /* Read x[4], x[5] */
521         x0 = read_q15x2 ((q15_t *) px + 2);
522 
523         /* Read x[5], x[6] */
524         x1 = read_q15x2 ((q15_t *) px + 3);
525         px += 4U;
526 
527         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
528         acc2 = __SMLALD(x0, c0, acc2);
529 
530         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
531         acc3 = __SMLALD(x1, c0, acc3);
532 
533       } while (--k);
534 
535       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
536        ** No loop unrolling is used. */
537       k = srcBLen % 0x4U;
538 
539       if (k == 1U)
540       {
541         /* Read y[4] */
542         c0 = *py;
543 #ifdef  ARM_MATH_BIG_ENDIAN
544         c0 = c0 << 16U;
545 #else
546         c0 = c0 & 0x0000FFFF;
547 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
548 
549         /* Read x[7] */
550         x3 = read_q15x2 ((q15_t *) px);
551         px++;
552 
553         /* Perform the multiply-accumulate */
554         acc0 = __SMLALD (x0, c0, acc0);
555         acc1 = __SMLALD (x1, c0, acc1);
556         acc2 = __SMLALDX(x1, c0, acc2);
557         acc3 = __SMLALDX(x3, c0, acc3);
558       }
559 
560       if (k == 2U)
561       {
562         /* Read y[4], y[5] */
563         c0 = read_q15x2 ((q15_t *) py);
564 
565         /* Read x[7], x[8] */
566         x3 = read_q15x2 ((q15_t *) px);
567 
568         /* Read x[9] */
569         x2 = read_q15x2 ((q15_t *) px + 1);
570         px += 2U;
571 
572         /* Perform the multiply-accumulate */
573         acc0 = __SMLALD(x0, c0, acc0);
574         acc1 = __SMLALD(x1, c0, acc1);
575         acc2 = __SMLALD(x3, c0, acc2);
576         acc3 = __SMLALD(x2, c0, acc3);
577       }
578 
579       if (k == 3U)
580       {
581         /* Read y[4], y[5] */
582         c0 = read_q15x2_ia ((q15_t **) &py);
583 
584         /* Read x[7], x[8] */
585         x3 = read_q15x2 ((q15_t *) px);
586 
587         /* Read x[9] */
588         x2 = read_q15x2 ((q15_t *) px + 1);
589 
590         /* Perform the multiply-accumulate */
591         acc0 = __SMLALD(x0, c0, acc0);
592         acc1 = __SMLALD(x1, c0, acc1);
593         acc2 = __SMLALD(x3, c0, acc2);
594         acc3 = __SMLALD(x2, c0, acc3);
595 
596         c0 = (*py);
597 
598         /* Read y[6] */
599 #ifdef  ARM_MATH_BIG_ENDIAN
600         c0 = c0 << 16U;
601 #else
602         c0 = c0 & 0x0000FFFF;
603 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
604 
605         /* Read x[10] */
606         x3 = read_q15x2 ((q15_t *) px + 2);
607         px += 3U;
608 
609         /* Perform the multiply-accumulates */
610         acc0 = __SMLALDX(x1, c0, acc0);
611         acc1 = __SMLALD (x2, c0, acc1);
612         acc2 = __SMLALDX(x2, c0, acc2);
613         acc3 = __SMLALDX(x3, c0, acc3);
614       }
615 
616       /* Store the result in the accumulator in the destination buffer. */
617       *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
618       /* Destination pointer is updated according to the address modifier, inc */
619       pOut += inc;
620 
621       *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
622       pOut += inc;
623 
624       *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
625       pOut += inc;
626 
627       *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
628       pOut += inc;
629 
630       /* Increment the count by 4 as 4 output values are computed */
631       count += 4U;
632 
633       /* Update the inputA and inputB pointers for next MAC calculation */
634       px = pIn1 + count;
635       py = pIn2;
636 
637       /* Decrement loop counter */
638       blkCnt--;
639     }
640 
641     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
642      ** No loop unrolling is used. */
643     blkCnt = blockSize2 % 0x4U;
644 
645     while (blkCnt > 0U)
646     {
647       /* Accumulator is made zero for every iteration */
648       sum = 0;
649 
650       /* Apply loop unrolling and compute 4 MACs simultaneously. */
651       k = srcBLen >> 2U;
652 
653       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
654        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
655       while (k > 0U)
656       {
657         /* Perform the multiply-accumulates */
658         sum += ((q63_t) *px++ * *py++);
659         sum += ((q63_t) *px++ * *py++);
660         sum += ((q63_t) *px++ * *py++);
661         sum += ((q63_t) *px++ * *py++);
662 
663         /* Decrement loop counter */
664         k--;
665       }
666 
667       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
668        ** No loop unrolling is used. */
669       k = srcBLen % 0x4U;
670 
671       while (k > 0U)
672       {
673         /* Perform the multiply-accumulates */
674         sum += ((q63_t) *px++ * *py++);
675 
676         /* Decrement the loop counter */
677         k--;
678       }
679 
680       /* Store the result in the accumulator in the destination buffer. */
681       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
682       /* Destination pointer is updated according to the address modifier, inc */
683       pOut += inc;
684 
685       /* Increment count by 1, as one output value is computed */
686       count++;
687 
688       /* Update the inputA and inputB pointers for next MAC calculation */
689       px = pIn1 + count;
690       py = pIn2;
691 
692       /* Decrement the loop counter */
693       blkCnt--;
694     }
695   }
696   else
697   {
698     /* If the srcBLen is not a multiple of 4,
699      * the blockSize2 loop cannot be unrolled by 4 */
700     blkCnt = blockSize2;
701 
702     while (blkCnt > 0U)
703     {
704       /* Accumulator is made zero for every iteration */
705       sum = 0;
706 
707       /* srcBLen number of MACS should be performed */
708       k = srcBLen;
709 
710       while (k > 0U)
711       {
712         /* Perform the multiply-accumulate */
713         sum += ((q63_t) *px++ * *py++);
714 
715         /* Decrement the loop counter */
716         k--;
717       }
718 
719       /* Store the result in the accumulator in the destination buffer. */
720       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
721       /* Destination pointer is updated according to the address modifier, inc */
722       pOut += inc;
723 
724       /* Increment the MAC count */
725       count++;
726 
727       /* Update the inputA and inputB pointers for next MAC calculation */
728       px = pIn1 + count;
729       py = pIn2;
730 
731       /* Decrement the loop counter */
732       blkCnt--;
733     }
734   }
735 
736 
737   /* --------------------------
738    * Initializations of stage3
739    * -------------------------*/
740 
741   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
742    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
743    * ....
744    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
745    * sum +=  x[srcALen-1] * y[0]
746    */
747 
748   /* In this stage the MAC operations are decreased by 1 for every iteration.
749      The count variable holds the number of MAC operations performed */
750   count = srcBLen - 1U;
751 
752   /* Working pointer of inputA */
753   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
754   px = pSrc1;
755 
756   /* Working pointer of inputB */
757   py = pIn2;
758 
759   /* -------------------
760    * Stage3 process
761    * ------------------*/
762 
763   while (blockSize3 > 0U)
764   {
765     /* Accumulator is made zero for every iteration */
766     sum = 0;
767 
768     /* Apply loop unrolling and compute 4 MACs simultaneously. */
769     k = count >> 2U;
770 
771     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
772      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
773     while (k > 0U)
774     {
775       /* Perform the multiply-accumulate */
776       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
777       sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
778       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
779       sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
780 
781       /* Decrement loop counter */
782       k--;
783     }
784 
785     /* If the count is not a multiple of 4, compute any remaining MACs here.
786      ** No loop unrolling is used. */
787     k = count % 0x4U;
788 
789     while (k > 0U)
790     {
791       /* Perform the multiply-accumulate */
792       sum = __SMLALD(*px++, *py++, sum);
793 
794       /* Decrement loop counter */
795       k--;
796     }
797 
798     /* Store the result in the accumulator in the destination buffer. */
799     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
800     /* Destination pointer is updated according to the address modifier, inc */
801     pOut += inc;
802 
803     /* Update the inputA and inputB pointers for next MAC calculation */
804     px = ++pSrc1;
805     py = pIn2;
806 
807     /* Decrement MAC count */
808     count--;
809 
810     /* Decrement loop counter */
811     blockSize3--;
812   }
813 
814 #else /* #if defined (ARM_MATH_DSP) */
815 
816   const q15_t *pIn1 = pSrcA;                           /* InputA pointer */
817   const q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* InputB pointer */
818         q63_t sum;                                     /* Accumulators */
819         uint32_t i = 0U, j;                            /* Loop counters */
820         uint32_t inv = 0U;                             /* Reverse order flag */
821         uint32_t tot = 0U;                             /* Length */
822 
823   /* The algorithm implementation is based on the lengths of the inputs. */
824   /* srcB is always made to slide across srcA. */
825   /* So srcBLen is always considered as shorter or equal to srcALen */
826   /* But CORR(x, y) is reverse of CORR(y, x) */
827   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
828   /* and a varaible, inv is set to 1 */
829   /* If lengths are not equal then zero pad has to be done to  make the two
830    * inputs of same length. But to improve the performance, we include zeroes
831    * in the output instead of zero padding either of the the inputs*/
832   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
833    * starting of the output buffer */
834   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
835    * ending of the output buffer */
836   /* Once the zero padding is done the remaining of the output is calcualted
837    * using convolution but with the shorter signal time shifted. */
838 
839   /* Calculate the length of the remaining sequence */
840   tot = ((srcALen + srcBLen) - 2U);
841 
842   if (srcALen > srcBLen)
843   {
844     /* Calculating the number of zeros to be padded to the output */
845     j = srcALen - srcBLen;
846 
847     /* Initialise the pointer after zero padding */
848     pDst += j;
849   }
850 
851   else if (srcALen < srcBLen)
852   {
853     /* Initialization to inputB pointer */
854     pIn1 = pSrcB;
855 
856     /* Initialization to the end of inputA pointer */
857     pIn2 = pSrcA + (srcALen - 1U);
858 
859     /* Initialisation of the pointer after zero padding */
860     pDst = pDst + tot;
861 
862     /* Swapping the lengths */
863     j = srcALen;
864     srcALen = srcBLen;
865     srcBLen = j;
866 
867     /* Setting the reverse flag */
868     inv = 1;
869   }
870 
871   /* Loop to calculate convolution for output length number of values */
872   for (i = 0U; i <= tot; i++)
873   {
874     /* Initialize sum with zero to carry on MAC operations */
875     sum = 0;
876 
877     /* Loop to perform MAC operations according to convolution equation */
878     for (j = 0U; j <= i; j++)
879     {
880       /* Check the array limitations */
881       if (((i - j) < srcBLen) && (j < srcALen))
882       {
883         /* z[i] += x[i-j] * y[j] */
884         sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
885       }
886     }
887 
888     /* Store the output in the destination buffer */
889     if (inv == 1)
890       *pDst-- = (q15_t) __SSAT((sum >> 15U), 16U);
891     else
892       *pDst++ = (q15_t) __SSAT((sum >> 15U), 16U);
893   }
894 
895 #endif /* #if defined (ARM_MATH_DSP) */
896 
897 }
898 #endif /* defined(ARM_MATH_MVEI) */
899 
900 /**
901   @} end of Corr group
902  */
903