1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_q31.c
4  * Description:  Correlation of Q31 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q31 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using an internal 64-bit accumulator.
50                    The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
51                    There is no saturation on intermediate additions.
52                    Thus, if the accumulator overflows it wraps around and distorts the result.
53                    The input signals should be scaled down to avoid intermediate overflows.
54                    Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
55                    maximum of min(srcALen, srcBLen) number of additions is carried internally.
56                    The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
57 
58   @remark
59                    Refer to \ref arm_correlate_fast_q31() for a faster but less precise implementation of this function.
60  */
61 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
62 #include "arm_helium_utils.h"
63 #include "arm_vec_filtering.h"
arm_correlate_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)64 ARM_DSP_ATTRIBUTE void arm_correlate_q31(
65   const q31_t * pSrcA,
66         uint32_t srcALen,
67   const q31_t * pSrcB,
68         uint32_t srcBLen,
69         q31_t * pDst)
70 {
71     const q31_t    *pIn1 = pSrcA;     /* inputA pointer               */
72     const q31_t    *pIn2 = pSrcB + (srcBLen - 1U);    /* inputB pointer               */
73     /*
74      * Loop to perform MAC operations according to correlation equation
75      */
76     const q31_t    *pX;
77     const q31_t    *pY;
78     const q31_t    *pA;
79     const q31_t    *pB;
80     int32_t   i = 0U, j = 0;    /* loop counters */
81     int32_t   inv = 4;          /* Reverse order flag */
82     uint32_t  tot = 0U;         /* Length */
83     int32_t   block1, block2, block3;
84     int32_t   incr;
85 
86     tot = ((srcALen + srcBLen) - 2U);
87     if (srcALen > srcBLen)
88     {
89         /*
90          * Calculating the number of zeros to be padded to the output
91          */
92         j = srcALen - srcBLen;
93         /*
94          * Initialize the pointer after zero padding
95          */
96         pDst += j;
97     }
98     else if (srcALen < srcBLen)
99     {
100         /*
101          * Initialization to inputB pointer
102          */
103         pIn1 = pSrcB;
104         /*
105          * Initialization to the end of inputA pointer
106          */
107         pIn2 = pSrcA + (srcALen - 1U);
108         /*
109          * Initialization of the pointer after zero padding
110          */
111         pDst = pDst + tot;
112         /*
113          * Swapping the lengths
114          */
115         j = srcALen;
116         srcALen = srcBLen;
117         srcBLen = j;
118         /*
119          * Setting the reverse flag
120          */
121         inv = -4;
122 
123     }
124 
125     block1 = srcBLen - 1;
126     block2 = srcALen - srcBLen + 1;
127     block3 = srcBLen - 1;
128     pA = pIn1;
129     pB = pIn2;
130     incr = inv / 4;
131 
132     for (i = 0U; i <= block1 - 2; i += 2)
133     {
134         uint32_t  count = i + 1;
135         int64_t   acc0 = 0LL;
136         int64_t   acc1 = 0LL;
137 
138         /* compute 2 accumulators per loop */
139         /* size is incrementing for second accumulator */
140         /* Y pointer is decrementing for second accumulator */
141         pX = pA;
142         pY = pB;
143         MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count);
144 
145         *pDst = (q31_t) acc0;
146         pDst += incr;
147         *pDst = (q31_t) acc1;
148         pDst += incr;
149         pB -= 2;
150     }
151     for (; i < block1; i++)
152     {
153         uint32_t  count = i + 1;
154         int64_t   acc = 0LL;
155 
156         pX = pA;
157         pY = pB;
158         MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count);
159 
160         *pDst = (q31_t) acc;
161         pDst += incr;
162         pB--;
163     }
164 
165     for (i = 0U; i <= block2 - 4; i += 4)
166     {
167         int64_t   acc0 = 0LL;
168         int64_t   acc1 = 0LL;
169         int64_t   acc2 = 0LL;
170         int64_t   acc3 = 0LL;
171 
172         pX = pA;
173         pY = pB;
174         /* compute 4 accumulators per loop */
175         /* size is fixed for all accumulators */
176         /* X pointer is incrementing for successive accumulators */
177         MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
178 
179         *pDst = (q31_t) acc0;
180         pDst += incr;
181         *pDst = (q31_t) acc1;
182         pDst += incr;
183         *pDst = (q31_t) acc2;
184         pDst += incr;
185         *pDst = (q31_t) acc3;
186         pDst += incr;
187         pA += 4;
188     }
189 
190     for (; i <= block2 - 2; i += 2)
191     {
192         int64_t   acc0 = 0LL;
193         int64_t   acc1 = 0LL;
194 
195         pX = pA;
196         pY = pB;
197         /* compute 2 accumulators per loop */
198         /* size is fixed for all accumulators */
199         /* X pointer is incrementing for second accumulator */
200         MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, srcBLen);
201 
202         *pDst = (q31_t) acc0;
203         pDst += incr;
204         *pDst = (q31_t) acc1;
205         pDst += incr;
206         pA += 2;
207     }
208 
209     if (block2 & 1)
210     {
211         int64_t   acc = 0LL;
212 
213         pX = pA;
214         pY = pB;
215         MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, srcBLen);
216 
217         *pDst = (q31_t) acc;
218         pDst += incr;
219         pA++;
220     }
221 
222     for (i = block3 - 1; i > 0; i -= 2)
223     {
224 
225         uint32_t  count = (i + 1);
226         int64_t   acc0 = 0LL;
227         int64_t   acc1 = 0LL;
228 
229         pX = pA;
230         pY = pB;
231         /* compute 2 accumulators per loop */
232         /* size is decrementing for second accumulator */
233         /* X pointer is incrementing for second accumulator */
234         MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count);
235 
236         *pDst = (q31_t) acc0;
237         pDst += incr;
238         *pDst = (q31_t) acc1;
239         pDst += incr;
240         pA += 2;
241 
242     }
243     for (; i >= 0; i--)
244     {
245         uint32_t  count = (i + 1);
246         int64_t   acc = 0LL;
247 
248         pX = pA;
249         pY = pB;
250         MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count);
251 
252         *pDst = (q31_t) acc;
253         pDst += incr;
254         pA++;
255     }
256 }
257 #else
arm_correlate_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)258 ARM_DSP_ATTRIBUTE void arm_correlate_q31(
259   const q31_t * pSrcA,
260         uint32_t srcALen,
261   const q31_t * pSrcB,
262         uint32_t srcBLen,
263         q31_t * pDst)
264 {
265 
266 #if (1)
267 //#if !defined(ARM_MATH_CM0_FAMILY)
268 
269   const q31_t *pIn1;                                   /* InputA pointer */
270   const q31_t *pIn2;                                   /* InputB pointer */
271         q31_t *pOut = pDst;                            /* Output pointer */
272   const q31_t *px;                                     /* Intermediate inputA pointer */
273   const q31_t *py;                                     /* Intermediate inputB pointer */
274   const q31_t *pSrc1;                                  /* Intermediate pointers */
275         q63_t sum;                                     /* Accumulators */
276         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
277         uint32_t j, k, count, blkCnt;                  /* Loop counters */
278         uint32_t outBlockSize;
279         int32_t inc = 1;                               /* Destination address modifier */
280 
281 #if defined (ARM_MATH_LOOPUNROLL)
282         q63_t acc0, acc1, acc2;                        /* Accumulators */
283         q31_t x0, x1, x2, c0;                          /* Temporary variables for holding input and coefficient values */
284 #endif
285 
286   /* The algorithm implementation is based on the lengths of the inputs. */
287   /* srcB is always made to slide across srcA. */
288   /* So srcBLen is always considered as shorter or equal to srcALen */
289   /* But CORR(x, y) is reverse of CORR(y, x) */
290   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
291   /* and the destination pointer modifier, inc is set to -1 */
292   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
293   /* But to improve the performance,
294    * we include zeroes in the output instead of zero padding either of the the inputs*/
295   /* If srcALen > srcBLen,
296    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
297   /* If srcALen < srcBLen,
298    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
299   if (srcALen >= srcBLen)
300   {
301     /* Initialization of inputA pointer */
302     pIn1 = pSrcA;
303 
304     /* Initialization of inputB pointer */
305     pIn2 = pSrcB;
306 
307     /* Number of output samples is calculated */
308     outBlockSize = (2U * srcALen) - 1U;
309 
310     /* When srcALen > srcBLen, zero padding is done to srcB
311      * to make their lengths equal.
312      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
313      * number of output samples are made zero */
314     j = outBlockSize - (srcALen + (srcBLen - 1U));
315 
316     /* Updating the pointer position to non zero value */
317     pOut += j;
318   }
319   else
320   {
321     /* Initialization of inputA pointer */
322     pIn1 = pSrcB;
323 
324     /* Initialization of inputB pointer */
325     pIn2 = pSrcA;
326 
327     /* srcBLen is always considered as shorter or equal to srcALen */
328     j = srcBLen;
329     srcBLen = srcALen;
330     srcALen = j;
331 
332     /* CORR(x, y) = Reverse order(CORR(y, x)) */
333     /* Hence set the destination pointer to point to the last output sample */
334     pOut = pDst + ((srcALen + srcBLen) - 2U);
335 
336     /* Destination address modifier is set to -1 */
337     inc = -1;
338   }
339 
340   /* The function is internally
341    * divided into three stages according to the number of multiplications that has to be
342    * taken place between inputA samples and inputB samples. In the first stage of the
343    * algorithm, the multiplications increase by one for every iteration.
344    * In the second stage of the algorithm, srcBLen number of multiplications are done.
345    * In the third stage of the algorithm, the multiplications decrease by one
346    * for every iteration. */
347 
348   /* The algorithm is implemented in three stages.
349      The loop counters of each stage is initiated here. */
350   blockSize1 = srcBLen - 1U;
351   blockSize2 = srcALen - (srcBLen - 1U);
352   blockSize3 = blockSize1;
353 
354   /* --------------------------
355    * Initializations of stage1
356    * -------------------------*/
357 
358   /* sum = x[0] * y[srcBlen - 1]
359    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
360    * ....
361    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
362    */
363 
364   /* In this stage the MAC operations are increased by 1 for every iteration.
365      The count variable holds the number of MAC operations performed */
366   count = 1U;
367 
368   /* Working pointer of inputA */
369   px = pIn1;
370 
371   /* Working pointer of inputB */
372   pSrc1 = pIn2 + (srcBLen - 1U);
373   py = pSrc1;
374 
375 
376   /* ------------------------
377    * Stage1 process
378    * ----------------------*/
379 
380   /* The first stage starts here */
381   while (blockSize1 > 0U)
382   {
383     /* Accumulator is made zero for every iteration */
384     sum = 0;
385 
386 #if defined (ARM_MATH_LOOPUNROLL)
387 
388     /* Loop unrolling: Compute 4 outputs at a time */
389     k = count >> 2U;
390 
391     while (k > 0U)
392     {
393       /* x[0] * y[srcBLen - 4] */
394       sum += (q63_t) *px++ * (*py++);
395 
396       /* x[1] * y[srcBLen - 3] */
397       sum += (q63_t) *px++ * (*py++);
398 
399       /* x[2] * y[srcBLen - 2] */
400       sum += (q63_t) *px++ * (*py++);
401 
402       /* x[3] * y[srcBLen - 1] */
403       sum += (q63_t) *px++ * (*py++);
404 
405       /* Decrement loop counter */
406       k--;
407     }
408 
409     /* Loop unrolling: Compute remaining outputs */
410     k = count % 0x4U;
411 
412 #else
413 
414     /* Initialize k with number of samples */
415     k = count;
416 
417 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
418 
419     while (k > 0U)
420     {
421       /* Perform the multiply-accumulate */
422       /* x[0] * y[srcBLen - 1] */
423       sum += (q63_t) *px++ * (*py++);
424 
425       /* Decrement loop counter */
426       k--;
427     }
428 
429     /* Store the result in the accumulator in the destination buffer. */
430     *pOut = (q31_t) (sum >> 31);
431     /* Destination pointer is updated according to the address modifier, inc */
432     pOut += inc;
433 
434     /* Update the inputA and inputB pointers for next MAC calculation */
435     py = pSrc1 - count;
436     px = pIn1;
437 
438     /* Increment MAC count */
439     count++;
440 
441     /* Decrement loop counter */
442     blockSize1--;
443   }
444 
445   /* --------------------------
446    * Initializations of stage2
447    * ------------------------*/
448 
449   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
450    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
451    * ....
452    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
453    */
454 
455   /* Working pointer of inputA */
456   px = pIn1;
457 
458   /* Working pointer of inputB */
459   py = pIn2;
460 
461   /* count is index by which the pointer pIn1 to be incremented */
462   count = 0U;
463 
464   /* -------------------
465    * Stage2 process
466    * ------------------*/
467 
468   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
469    * So, to loop unroll over blockSize2,
470    * srcBLen should be greater than or equal to 4 */
471   if (srcBLen >= 4U)
472   {
473 #if defined (ARM_MATH_LOOPUNROLL)
474 
475     /* Loop unroll by 3 */
476     blkCnt = blockSize2 / 3;
477 
478     while (blkCnt > 0U)
479     {
480       /* Set all accumulators to zero */
481       acc0 = 0;
482       acc1 = 0;
483       acc2 = 0;
484 
485       /* read x[0], x[1] samples */
486       x0 = *px++;
487       x1 = *px++;
488 
489       /* Apply loop unrolling and compute 3 MACs simultaneously. */
490       k = srcBLen / 3;
491 
492       /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.
493        ** a second loop below computes MACs for the remaining 1 to 2 samples. */
494       do
495       {
496         /* Read y[0] sample */
497         c0 = *(py);
498         /* Read x[2] sample */
499         x2 = *(px);
500 
501         /* Perform the multiply-accumulate */
502         /* acc0 +=  x[0] * y[0] */
503         acc0 += ((q63_t) x0 * c0);
504         /* acc1 +=  x[1] * y[0] */
505         acc1 += ((q63_t) x1 * c0);
506         /* acc2 +=  x[2] * y[0] */
507         acc2 += ((q63_t) x2 * c0);
508 
509         /* Read y[1] sample */
510         c0 = *(py + 1U);
511         /* Read x[3] sample */
512         x0 = *(px + 1U);
513 
514         /* Perform the multiply-accumulate */
515         /* acc0 +=  x[1] * y[1] */
516         acc0 += ((q63_t) x1 * c0);
517         /* acc1 +=  x[2] * y[1] */
518         acc1 += ((q63_t) x2 * c0);
519         /* acc2 +=  x[3] * y[1] */
520         acc2 += ((q63_t) x0 * c0);
521 
522         /* Read y[2] sample */
523         c0 = *(py + 2U);
524         /* Read x[4] sample */
525         x1 = *(px + 2U);
526 
527         /* Perform the multiply-accumulate */
528         /* acc0 +=  x[2] * y[2] */
529         acc0 += ((q63_t) x2 * c0);
530         /* acc1 +=  x[3] * y[2] */
531         acc1 += ((q63_t) x0 * c0);
532         /* acc2 +=  x[4] * y[2] */
533         acc2 += ((q63_t) x1 * c0);
534 
535         /* update scratch pointers */
536         px += 3U;
537         py += 3U;
538 
539       } while (--k);
540 
541       /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
542        ** No loop unrolling is used. */
543       k = srcBLen - (3 * (srcBLen / 3));
544 
545       while (k > 0U)
546       {
547         /* Read y[4] sample */
548         c0 = *(py++);
549 
550         /* Read x[7] sample */
551         x2 = *(px++);
552 
553         /* Perform the multiply-accumulates */
554         /* acc0 +=  x[4] * y[4] */
555         acc0 += ((q63_t) x0 * c0);
556         /* acc1 +=  x[5] * y[4] */
557         acc1 += ((q63_t) x1 * c0);
558         /* acc2 +=  x[6] * y[4] */
559         acc2 += ((q63_t) x2 * c0);
560 
561         /* Reuse the present samples for the next MAC */
562         x0 = x1;
563         x1 = x2;
564 
565         /* Decrement loop counter */
566         k--;
567       }
568 
569       /* Store the result in the accumulator in the destination buffer. */
570       *pOut = (q31_t) (acc0 >> 31);
571       /* Destination pointer is updated according to the address modifier, inc */
572       pOut += inc;
573 
574       *pOut = (q31_t) (acc1 >> 31);
575       pOut += inc;
576 
577       *pOut = (q31_t) (acc2 >> 31);
578       pOut += inc;
579 
580       /* Increment the pointer pIn1 index, count by 3 */
581       count += 3U;
582 
583       /* Update the inputA and inputB pointers for next MAC calculation */
584       px = pIn1 + count;
585       py = pIn2;
586 
587       /* Decrement loop counter */
588       blkCnt--;
589     }
590 
591     /* Loop unrolling: Compute remaining outputs */
592     blkCnt = blockSize2 - 3 * (blockSize2 / 3);
593 
594 #else
595 
596     /* Initialize blkCnt with number of samples */
597     blkCnt = blockSize2;
598 
599 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
600 
601     while (blkCnt > 0U)
602     {
603       /* Accumulator is made zero for every iteration */
604       sum = 0;
605 
606 #if defined (ARM_MATH_LOOPUNROLL)
607 
608     /* Loop unrolling: Compute 4 outputs at a time */
609       k = srcBLen >> 2U;
610 
611       while (k > 0U)
612       {
613         /* Perform the multiply-accumulates */
614         sum += (q63_t) *px++ * *py++;
615         sum += (q63_t) *px++ * *py++;
616         sum += (q63_t) *px++ * *py++;
617         sum += (q63_t) *px++ * *py++;
618 
619         /* Decrement loop counter */
620         k--;
621       }
622 
623       /* Loop unrolling: Compute remaining outputs */
624       k = srcBLen % 0x4U;
625 
626 #else
627 
628       /* Initialize blkCnt with number of samples */
629       k = srcBLen;
630 
631 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
632 
633       while (k > 0U)
634       {
635         /* Perform the multiply-accumulate */
636         sum += (q63_t) *px++ * *py++;
637 
638         /* Decrement the loop counter */
639         k--;
640       }
641 
642       /* Store the result in the accumulator in the destination buffer. */
643       *pOut = (q31_t) (sum >> 31);
644       /* Destination pointer is updated according to the address modifier, inc */
645       pOut += inc;
646 
647       /* Increment MAC count */
648       count++;
649 
650       /* Update the inputA and inputB pointers for next MAC calculation */
651       px = pIn1 + count;
652       py = pIn2;
653 
654       /* Decrement loop counter */
655       blkCnt--;
656     }
657   }
658   else
659   {
660     /* If the srcBLen is not a multiple of 4,
661      * the blockSize2 loop cannot be unrolled by 4 */
662     blkCnt = blockSize2;
663 
664     while (blkCnt > 0U)
665     {
666       /* Accumulator is made zero for every iteration */
667       sum = 0;
668 
669       /* srcBLen number of MACS should be performed */
670       k = srcBLen;
671 
672       while (k > 0U)
673       {
674         /* Perform the multiply-accumulate */
675         sum += (q63_t) *px++ * *py++;
676 
677         /* Decrement the loop counter */
678         k--;
679       }
680 
681       /* Store the result in the accumulator in the destination buffer. */
682       *pOut = (q31_t) (sum >> 31);
683       /* Destination pointer is updated according to the address modifier, inc */
684       pOut += inc;
685 
686       /* Increment MAC count */
687       count++;
688 
689       /* Update the inputA and inputB pointers for next MAC calculation */
690       px = pIn1 + count;
691       py = pIn2;
692 
693       /* Decrement loop counter */
694       blkCnt--;
695     }
696   }
697 
698 
699   /* --------------------------
700    * Initializations of stage3
701    * -------------------------*/
702 
703   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
704    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
705    * ....
706    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
707    * sum +=  x[srcALen-1] * y[0]
708    */
709 
710   /* In this stage the MAC operations are decreased by 1 for every iteration.
711      The count variable holds the number of MAC operations performed */
712   count = srcBLen - 1U;
713 
714   /* Working pointer of inputA */
715   pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
716   px = pSrc1;
717 
718   /* Working pointer of inputB */
719   py = pIn2;
720 
721   /* -------------------
722    * Stage3 process
723    * ------------------*/
724 
725   while (blockSize3 > 0U)
726   {
727     /* Accumulator is made zero for every iteration */
728     sum = 0;
729 
730 #if defined (ARM_MATH_LOOPUNROLL)
731 
732     /* Loop unrolling: Compute 4 outputs at a time */
733     k = count >> 2U;
734 
735     while (k > 0U)
736     {
737       /* Perform the multiply-accumulate */
738       /* sum += x[srcALen - srcBLen + 4] * y[3] */
739       sum += (q63_t) *px++ * *py++;
740 
741       /* sum += x[srcALen - srcBLen + 3] * y[2] */
742       sum += (q63_t) *px++ * *py++;
743 
744       /* sum += x[srcALen - srcBLen + 2] * y[1] */
745       sum += (q63_t) *px++ * *py++;
746 
747       /* sum += x[srcALen - srcBLen + 1] * y[0] */
748       sum += (q63_t) *px++ * *py++;
749 
750       /* Decrement loop counter */
751       k--;
752     }
753 
754     /* Loop unrolling: Compute remaining outputs */
755     k = count % 0x4U;
756 
757 #else
758 
759     /* Initialize blkCnt with number of samples */
760     k = count;
761 
762 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
763 
764     while (k > 0U)
765     {
766       /* Perform the multiply-accumulate */
767       sum += (q63_t) *px++ * *py++;
768 
769       /* Decrement loop counter */
770       k--;
771     }
772 
773     /* Store the result in the accumulator in the destination buffer. */
774     *pOut = (q31_t) (sum >> 31);
775     /* Destination pointer is updated according to the address modifier, inc */
776     pOut += inc;
777 
778     /* Update the inputA and inputB pointers for next MAC calculation */
779     px = ++pSrc1;
780     py = pIn2;
781 
782     /* Decrement MAC count */
783     count--;
784 
785     /* Decrement loop counter */
786     blockSize3--;
787   }
788 
789 #else
790 /* alternate version for CM0_FAMILY */
791 
792   const q31_t *pIn1 = pSrcA;                           /* InputA pointer */
793   const q31_t *pIn2 = pSrcB + (srcBLen - 1U);          /* InputB pointer */
794         q63_t sum;                                     /* Accumulators */
795         uint32_t i = 0U, j;                            /* Loop counters */
796         uint32_t inv = 0U;                             /* Reverse order flag */
797         uint32_t tot = 0U;                             /* Length */
798 
799   /* The algorithm implementation is based on the lengths of the inputs. */
800   /* srcB is always made to slide across srcA. */
801   /* So srcBLen is always considered as shorter or equal to srcALen */
802   /* But CORR(x, y) is reverse of CORR(y, x) */
803   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
804   /* and a varaible, inv is set to 1 */
805   /* If lengths are not equal then zero pad has to be done to  make the two
806    * inputs of same length. But to improve the performance, we include zeroes
807    * in the output instead of zero padding either of the the inputs*/
808   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
809    * starting of the output buffer */
810   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
811    * ending of the output buffer */
812   /* Once the zero padding is done the remaining of the output is calcualted
813    * using correlation but with the shorter signal time shifted. */
814 
815   /* Calculate the length of the remaining sequence */
816   tot = ((srcALen + srcBLen) - 2U);
817 
818   if (srcALen > srcBLen)
819   {
820     /* Calculating the number of zeros to be padded to the output */
821     j = srcALen - srcBLen;
822 
823     /* Initialise the pointer after zero padding */
824     pDst += j;
825   }
826 
827   else if (srcALen < srcBLen)
828   {
829     /* Initialization to inputB pointer */
830     pIn1 = pSrcB;
831 
832     /* Initialization to the end of inputA pointer */
833     pIn2 = pSrcA + (srcALen - 1U);
834 
835     /* Initialisation of the pointer after zero padding */
836     pDst = pDst + tot;
837 
838     /* Swapping the lengths */
839     j = srcALen;
840     srcALen = srcBLen;
841     srcBLen = j;
842 
843     /* Setting the reverse flag */
844     inv = 1;
845   }
846 
847   /* Loop to calculate correlation for output length number of times */
848   for (i = 0U; i <= tot; i++)
849   {
850     /* Initialize sum with zero to carry out MAC operations */
851     sum = 0;
852 
853     /* Loop to perform MAC operations according to correlation equation */
854     for (j = 0U; j <= i; j++)
855     {
856       /* Check the array limitations */
857       if (((i - j) < srcBLen) && (j < srcALen))
858       {
859         /* z[i] += x[i-j] * y[j] */
860         sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
861       }
862     }
863 
864     /* Store the output in the destination buffer */
865     if (inv == 1)
866       *pDst-- = (q31_t) (sum >> 31U);
867     else
868       *pDst++ = (q31_t) (sum >> 31U);
869   }
870 
871 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
872 
873 }
874 #endif /* defined(ARM_MATH_MVEI) */
875 
876 /**
877   @} end of Corr group
878  */
879