1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_f32.c
4  * Description:  Correlation of floating-point sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup Corr Correlation
37 
38   Correlation is a mathematical operation that is similar to convolution.
39   As with convolution, correlation uses two signals to produce a third signal.
40   The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
41   Correlation is commonly used to measure the similarity between two signals.
42   It has applications in pattern recognition, cryptanalysis, and searching.
43   The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
44   Fast versions of the Q15 and Q31 functions are also provided.
45 
46   @par           Algorithm
47                    Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
48                    The convolution of the two signals is denoted by
49                    \f[
50                    c[n] = a[n] * b[n]
51                    \f]
52 
53                    In correlation, one of the signals is flipped in time
54 
55                    \f[
56                    c[n] = a[n] * b[-n]
57                    \f]
58   @par
59                    and this is mathematically defined as
60                    \f[
61                    c[n] = \sum_{k=0}^{srcALen} a[k] b[k-n]
62                    \f]
63   @par
64                    The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
65                    The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
66                    The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
67 
68   @note
69                    The <code>pDst</code> should be initialized to all zeros before being used.
70 
71   @par           Fixed-Point Behavior
72                    Correlation requires summing up a large number of intermediate products.
73                    As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
74                    Refer to the function specific documentation below for further details of the particular algorithm used.
75 
76   @par           Fast Versions
77                    Fast versions are supported for Q31 and Q15.  Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
78                    the input signals should be scaled down to avoid intermediate overflows.
79 
80   @par           Opt Versions
81                    Opt versions are supported for Q15 and Q7.  Design uses internal scratch buffer for getting good optimisation.
82                    These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions of correlate
83 
84   @par           Long versions:
85                    For convolution of long vectors, those functions are
86                    no more adapted and will be very slow.
87                    An implementation based upon FFTs should be used.
88  */
89 
90 /**
91   @addtogroup Corr
92   @{
93  */
94 
95 /**
96   @brief         Correlation of floating-point sequences.
97   @param[in]     pSrcA      points to the first input sequence
98   @param[in]     srcALen    length of the first input sequence
99   @param[in]     pSrcB      points to the second input sequence
100   @param[in]     srcBLen    length of the second input sequence
101   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
102  */
103 
104 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
105 
106 #include "arm_helium_utils.h"
107 #include "arm_vec_filtering.h"
108 
109 
arm_correlate_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst)110 ARM_DSP_ATTRIBUTE void arm_correlate_f32(
111   const float32_t * pSrcA,
112         uint32_t srcALen,
113   const float32_t * pSrcB,
114         uint32_t srcBLen,
115         float32_t * pDst)
116 {
117     const float32_t *pIn1 = pSrcA;    /* inputA pointer               */
118     const float32_t *pIn2 = pSrcB + (srcBLen - 1U);   /* inputB pointer               */
119     const float32_t *pX, *pY;
120     const float32_t *pA, *pB;
121     int32_t   i = 0U, j = 0;    /* loop counters */
122     int32_t   inv = 4U;         /* Reverse order flag */
123     uint32_t  tot = 0U;         /* Length */
124     int32_t   block1, block2, block3;
125     int32_t   incr;
126 
127     tot = ((srcALen + srcBLen) - 2U);
128     if (srcALen > srcBLen)
129     {
130         /*
131          * Calculating the number of zeros to be padded to the output
132          */
133         j = srcALen - srcBLen;
134         /*
135          * Initialize the pointer after zero padding
136          */
137         pDst += j;
138     }
139     else if (srcALen < srcBLen)
140     {
141         /*
142          * Initialization to inputB pointer
143          */
144         pIn1 = pSrcB;
145         /*
146          * Initialization to the end of inputA pointer
147          */
148         pIn2 = pSrcA + (srcALen - 1U);
149         /*
150          * Initialisation of the pointer after zero padding
151          */
152         pDst = pDst + tot;
153         /*
154          * Swapping the lengths
155          */
156 
157         j = srcALen;
158         srcALen = srcBLen;
159         srcBLen = j;
160         /*
161          * Setting the reverse flag
162          */
163         inv = -4;
164     }
165 
166     block1 = srcBLen - 1;
167     block2 = srcALen - srcBLen + 1;
168     block3 = srcBLen - 1;
169 
170     pA = pIn1;
171     pB = pIn2;
172     incr = inv / 4;
173 
174     for (i = 0U; i <= block1 - 2; i += 2)
175     {
176         uint32_t  count = i + 1;
177         float32_t acc0;
178         float32_t acc1;
179 
180         /*
181          * compute 2 accumulators per loop
182          * size is incrementing for second accumulator
183          * Y pointer is decrementing for second accumulator
184          */
185         pX = pA;
186         pY = pB;
187         MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count);
188 
189         *pDst = acc0;
190         pDst += incr;
191         *pDst = acc1;
192         pDst += incr;
193         pB -= 2;
194     }
195     for (; i < block1; i++)
196     {
197         uint32_t  count = i + 1;
198         float32_t acc;
199 
200         pX = pA;
201         pY = pB;
202         MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count);
203 
204         *pDst = acc;
205         pDst += incr;
206         pB--;
207     }
208 
209     for (i = 0U; i <= block2 - 4; i += 4)
210     {
211         float32_t acc0;
212         float32_t acc1;
213         float32_t acc2;
214         float32_t acc3;
215 
216         pX = pA;
217         pY = pB;
218         /*
219          * compute 4 accumulators per loop
220          * size is fixed for all accumulators
221          * X pointer is incrementing for successive accumulators
222          */
223         MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F32(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
224 
225         *pDst = acc0;
226         pDst += incr;
227         *pDst = acc1;
228         pDst += incr;
229         *pDst = acc2;
230         pDst += incr;
231         *pDst = acc3;
232         pDst += incr;
233         pA += 4;
234     }
235 
236     for (; i <= block2 - 2; i += 2)
237     {
238         float32_t acc0;
239         float32_t acc1;
240 
241         pX = pA;
242         pY = pB;
243         /*
244          * compute 2 accumulators per loop
245          * size is fixed for all accumulators
246          * X pointer is incrementing for second accumulator
247          */
248         MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, srcBLen);
249 
250         *pDst = acc0;
251         pDst += incr;
252         *pDst = acc1;
253         pDst += incr;
254         pA += 2;
255     }
256 
257     if (block2 & 1)
258     {
259         float32_t acc;
260 
261         pX = pA;
262         pY = pB;
263         MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, srcBLen);
264 
265         *pDst = acc;
266         pDst += incr;
267         pA++;
268     }
269 
270     for (i = block3 - 1; i > 0; i -= 2)
271     {
272 
273         uint32_t  count = (i + 1);
274         float32_t acc0;
275         float32_t acc1;
276 
277         pX = pA;
278         pY = pB;
279         /*
280          * compute 2 accumulators per loop
281          * size is decrementing for second accumulator
282          * X pointer is incrementing for second accumulator
283          */
284         MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count);
285 
286         *pDst = acc0;
287         pDst += incr;
288         *pDst = acc1;
289         pDst += incr;
290         pA += 2;
291 
292     }
293     for (; i >= 0; i--)
294     {
295         uint32_t  count = (i + 1);
296         float32_t acc;
297 
298         pX = pA;
299         pY = pB;
300         MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count);
301 
302         *pDst = acc;
303         pDst += incr;
304         pA++;
305     }
306 }
307 
308 #else
arm_correlate_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst)309 ARM_DSP_ATTRIBUTE void arm_correlate_f32(
310   const float32_t * pSrcA,
311         uint32_t srcALen,
312   const float32_t * pSrcB,
313         uint32_t srcBLen,
314         float32_t * pDst)
315 {
316 
317 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE)
318 
319   const float32_t *pIn1;                               /* InputA pointer */
320   const float32_t *pIn2;                               /* InputB pointer */
321         float32_t *pOut = pDst;                        /* Output pointer */
322   const float32_t *px;                                 /* Intermediate inputA pointer */
323   const float32_t *py;                                 /* Intermediate inputB pointer */
324   const float32_t *pSrc1;
325         float32_t sum;
326         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
327         uint32_t j, k, count, blkCnt;                  /* Loop counters */
328         uint32_t outBlockSize;                         /* Loop counter */
329         int32_t inc = 1;                               /* Destination address modifier */
330 
331 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
332     float32_t acc0, acc1, acc2, acc3,c0;                    /* Accumulators */
333 #if !defined(ARM_MATH_NEON)
334     float32_t x0, x1, x2, x3;                        /* temporary variables for holding input and coefficient values */
335 #endif
336 #endif
337 
338   /* The algorithm implementation is based on the lengths of the inputs. */
339   /* srcB is always made to slide across srcA. */
340   /* So srcBLen is always considered as shorter or equal to srcALen */
341   /* But CORR(x, y) is reverse of CORR(y, x) */
342   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
343   /* and the destination pointer modifier, inc is set to -1 */
344   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
345   /* But to improve the performance,
346    * we assume zeroes in the output instead of zero padding either of the the inputs*/
347   /* If srcALen > srcBLen,
348    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
349   /* If srcALen < srcBLen,
350    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
351   if (srcALen >= srcBLen)
352   {
353     /* Initialization of inputA pointer */
354     pIn1 = pSrcA;
355 
356     /* Initialization of inputB pointer */
357     pIn2 = pSrcB;
358 
359     /* Number of output samples is calculated */
360     outBlockSize = (2U * srcALen) - 1U;
361 
362     /* When srcALen > srcBLen, zero padding has to be done to srcB
363      * to make their lengths equal.
364      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
365      * number of output samples are made zero */
366     j = outBlockSize - (srcALen + (srcBLen - 1U));
367 
368     /* Updating the pointer position to non zero value */
369     pOut += j;
370   }
371   else
372   {
373     /* Initialization of inputA pointer */
374     pIn1 = pSrcB;
375 
376     /* Initialization of inputB pointer */
377     pIn2 = pSrcA;
378 
379     /* srcBLen is always considered as shorter or equal to srcALen */
380     j = srcBLen;
381     srcBLen = srcALen;
382     srcALen = j;
383 
384     /* CORR(x, y) = Reverse order(CORR(y, x)) */
385     /* Hence set the destination pointer to point to the last output sample */
386     pOut = pDst + ((srcALen + srcBLen) - 2U);
387 
388     /* Destination address modifier is set to -1 */
389     inc = -1;
390   }
391 
392   /* The function is internally
393    * divided into three stages according to the number of multiplications that has to be
394    * taken place between inputA samples and inputB samples. In the first stage of the
395    * algorithm, the multiplications increase by one for every iteration.
396    * In the second stage of the algorithm, srcBLen number of multiplications are done.
397    * In the third stage of the algorithm, the multiplications decrease by one
398    * for every iteration. */
399 
400   /* The algorithm is implemented in three stages.
401      The loop counters of each stage is initiated here. */
402   blockSize1 = srcBLen - 1U;
403   blockSize2 = srcALen - (srcBLen - 1U);
404   blockSize3 = blockSize1;
405 
406   /* --------------------------
407    * Initializations of stage1
408    * -------------------------*/
409 
410   /* sum = x[0] * y[srcBlen - 1]
411    * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
412    * ....
413    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
414    */
415 
416   /* In this stage the MAC operations are increased by 1 for every iteration.
417      The count variable holds the number of MAC operations performed */
418   count = 1U;
419 
420   /* Working pointer of inputA */
421   px = pIn1;
422 
423   /* Working pointer of inputB */
424   pSrc1 = pIn2 + (srcBLen - 1U);
425   py = pSrc1;
426 
427   /* ------------------------
428    * Stage1 process
429    * ----------------------*/
430 
431   /* The first stage starts here */
432   while (blockSize1 > 0U)
433   {
434     /* Accumulator is made zero for every iteration */
435     sum = 0.0f;
436 
437 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
438 
439     /* Loop unrolling: Compute 4 outputs at a time */
440     k = count >> 2U;
441 
442 #if defined(ARM_MATH_NEON)
443     float32x4_t x,y;
444     float32x4_t res = vdupq_n_f32(0) ;
445     float32x2_t accum = vdup_n_f32(0);
446 
447     while (k > 0U)
448     {
449       x = vld1q_f32(px);
450       y = vld1q_f32(py);
451 
452       res = vmlaq_f32(res,x, y);
453 
454       px += 4;
455       py += 4;
456 
457       /* Decrement the loop counter */
458       k--;
459     }
460 
461     accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
462     sum += accum[0] + accum[1];
463 
464     k = count & 0x3;
465 #else
466     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
467      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
468     while (k > 0U)
469     {
470       /* x[0] * y[srcBLen - 4] */
471       sum += *px++ * *py++;
472 
473       /* x[1] * y[srcBLen - 3] */
474       sum += *px++ * *py++;
475 
476       /* x[2] * y[srcBLen - 2] */
477       sum += *px++ * *py++;
478 
479       /* x[3] * y[srcBLen - 1] */
480       sum += *px++ * *py++;
481 
482       /* Decrement loop counter */
483       k--;
484     }
485 
486     /* Loop unrolling: Compute remaining outputs */
487     k = count % 0x4U;
488 
489 #endif /* #if defined(ARM_MATH_NEON) */
490 #else
491 
492     /* Initialize k with number of samples */
493     k = count;
494 
495 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
496 
497     while (k > 0U)
498     {
499       /* Perform the multiply-accumulate */
500       /* x[0] * y[srcBLen - 1] */
501       sum += *px++ * *py++;
502 
503       /* Decrement loop counter */
504       k--;
505     }
506 
507     /* Store the result in the accumulator in the destination buffer. */
508     *pOut = sum;
509     /* Destination pointer is updated according to the address modifier, inc */
510     pOut += inc;
511 
512     /* Update the inputA and inputB pointers for next MAC calculation */
513     py = pSrc1 - count;
514     px = pIn1;
515 
516     /* Increment MAC count */
517     count++;
518 
519     /* Decrement loop counter */
520     blockSize1--;
521   }
522 
523   /* --------------------------
524    * Initializations of stage2
525    * ------------------------*/
526 
527   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
528    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen]   * y[srcBLen-1]
529    * ....
530    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
531    */
532 
533   /* Working pointer of inputA */
534   px = pIn1;
535 
536   /* Working pointer of inputB */
537   py = pIn2;
538 
539   /* count is index by which the pointer pIn1 to be incremented */
540   count = 0U;
541 
542   /* -------------------
543    * Stage2 process
544    * ------------------*/
545 
546   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
547    * So, to loop unroll over blockSize2,
548    * srcBLen should be greater than or equal to 4 */
549   if (srcBLen >= 4U)
550   {
551 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
552 
553     /* Loop unrolling: Compute 4 outputs at a time */
554     blkCnt = blockSize2 >> 2U;
555 
556 #if defined(ARM_MATH_NEON)
557       float32x4_t c;
558       float32x4_t x1v;
559       float32x4_t x2v;
560       float32x4_t x;
561       float32x4_t res = vdupq_n_f32(0) ;
562 #endif /* #if defined(ARM_MATH_NEON) */
563 
564     while (blkCnt > 0U)
565     {
566       /* Set all accumulators to zero */
567       acc0 = 0.0f;
568       acc1 = 0.0f;
569       acc2 = 0.0f;
570       acc3 = 0.0f;
571 
572 #if defined(ARM_MATH_NEON)
573       /* Compute 4 MACs simultaneously. */
574       k = srcBLen >> 2U;
575 
576       res = vdupq_n_f32(0) ;
577 
578       x1v = vld1q_f32(px);
579       px += 4;
580       do
581       {
582         x2v = vld1q_f32(px);
583         c = vld1q_f32(py);
584 
585         py += 4;
586 
587         x = x1v;
588         res = vmlaq_n_f32(res,x,c[0]);
589 
590         x = vextq_f32(x1v,x2v,1);
591 
592         res = vmlaq_n_f32(res,x,c[1]);
593 
594         x = vextq_f32(x1v,x2v,2);
595 
596 	res = vmlaq_n_f32(res,x,c[2]);
597 
598         x = vextq_f32(x1v,x2v,3);
599 
600 	res = vmlaq_n_f32(res,x,c[3]);
601 
602         x1v = x2v;
603         px+=4;
604       } while (--k);
605 
606       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
607        ** No loop unrolling is used. */
608       k = srcBLen & 0x3;
609 
610       while (k > 0U)
611       {
612         /* Read y[srcBLen - 5] sample */
613         c0 = *(py++);
614 
615         res = vmlaq_n_f32(res,x1v,c0);
616 
617         /* Reuse the present samples for the next MAC */
618         x1v[0] = x1v[1];
619         x1v[1] = x1v[2];
620         x1v[2] = x1v[3];
621 
622         x1v[3] = *(px++);
623 
624         /* Decrement the loop counter */
625         k--;
626       }
627 
628       px-=1;
629 
630       acc0 = res[0];
631       acc1 = res[1];
632       acc2 = res[2];
633       acc3 = res[3];
634 #else
635       /* read x[0], x[1], x[2] samples */
636       x0 = *px++;
637       x1 = *px++;
638       x2 = *px++;
639 
640       /* Apply loop unrolling and compute 4 MACs simultaneously. */
641       k = srcBLen >> 2U;
642 
643       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
644        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
645       do
646       {
647         /* Read y[0] sample */
648         c0 = *(py++);
649         /* Read x[3] sample */
650         x3 = *(px++);
651 
652         /* Perform the multiply-accumulate */
653         /* acc0 +=  x[0] * y[0] */
654         acc0 += x0 * c0;
655         /* acc1 +=  x[1] * y[0] */
656         acc1 += x1 * c0;
657         /* acc2 +=  x[2] * y[0] */
658         acc2 += x2 * c0;
659         /* acc3 +=  x[3] * y[0] */
660         acc3 += x3 * c0;
661 
662         /* Read y[1] sample */
663         c0 = *(py++);
664         /* Read x[4] sample */
665         x0 = *(px++);
666 
667         /* Perform the multiply-accumulate */
668         /* acc0 +=  x[1] * y[1] */
669         acc0 += x1 * c0;
670         /* acc1 +=  x[2] * y[1] */
671         acc1 += x2 * c0;
672         /* acc2 +=  x[3] * y[1] */
673         acc2 += x3 * c0;
674         /* acc3 +=  x[4] * y[1] */
675         acc3 += x0 * c0;
676 
677         /* Read y[2] sample */
678         c0 = *(py++);
679         /* Read x[5] sample */
680         x1 = *(px++);
681 
682         /* Perform the multiply-accumulate */
683         /* acc0 +=  x[2] * y[2] */
684         acc0 += x2 * c0;
685         /* acc1 +=  x[3] * y[2] */
686         acc1 += x3 * c0;
687         /* acc2 +=  x[4] * y[2] */
688         acc2 += x0 * c0;
689         /* acc3 +=  x[5] * y[2] */
690         acc3 += x1 * c0;
691 
692         /* Read y[3] sample */
693         c0 = *(py++);
694         /* Read x[6] sample */
695         x2 = *(px++);
696 
697         /* Perform the multiply-accumulate */
698         /* acc0 +=  x[3] * y[3] */
699         acc0 += x3 * c0;
700         /* acc1 +=  x[4] * y[3] */
701         acc1 += x0 * c0;
702         /* acc2 +=  x[5] * y[3] */
703         acc2 += x1 * c0;
704         /* acc3 +=  x[6] * y[3] */
705         acc3 += x2 * c0;
706 
707       } while (--k);
708 
709       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
710        ** No loop unrolling is used. */
711       k = srcBLen % 0x4U;
712 
713       while (k > 0U)
714       {
715         /* Read y[4] sample */
716         c0 = *(py++);
717         /* Read x[7] sample */
718         x3 = *(px++);
719 
720         /* Perform the multiply-accumulate */
721         /* acc0 +=  x[4] * y[4] */
722         acc0 += x0 * c0;
723         /* acc1 +=  x[5] * y[4] */
724         acc1 += x1 * c0;
725         /* acc2 +=  x[6] * y[4] */
726         acc2 += x2 * c0;
727         /* acc3 +=  x[7] * y[4] */
728         acc3 += x3 * c0;
729 
730         /* Reuse the present samples for the next MAC */
731         x0 = x1;
732         x1 = x2;
733         x2 = x3;
734 
735         /* Decrement the loop counter */
736         k--;
737       }
738 
739 #endif /* #if defined(ARM_MATH_NEON) */
740 
741       /* Store the result in the accumulator in the destination buffer. */
742       *pOut = acc0;
743       /* Destination pointer is updated according to the address modifier, inc */
744       pOut += inc;
745 
746       *pOut = acc1;
747       pOut += inc;
748 
749       *pOut = acc2;
750       pOut += inc;
751 
752       *pOut = acc3;
753       pOut += inc;
754 
755       /* Increment the pointer pIn1 index, count by 4 */
756       count += 4U;
757 
758       /* Update the inputA and inputB pointers for next MAC calculation */
759       px = pIn1 + count;
760       py = pIn2;
761 
762       /* Decrement loop counter */
763       blkCnt--;
764     }
765 
766     /* Loop unrolling: Compute remaining outputs */
767     blkCnt = blockSize2 % 0x4U;
768 
769 #else
770 
771     /* Initialize blkCnt with number of samples */
772     blkCnt = blockSize2;
773 
774 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
775 
776     while (blkCnt > 0U)
777     {
778       /* Accumulator is made zero for every iteration */
779       sum = 0.0f;
780 
781 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
782 
783     /* Loop unrolling: Compute 4 outputs at a time */
784       k = srcBLen >> 2U;
785 
786 #if defined(ARM_MATH_NEON)
787     float32x4_t x,y;
788     float32x4_t res = vdupq_n_f32(0) ;
789     float32x2_t accum = vdup_n_f32(0);
790 
791     while (k > 0U)
792     {
793       x = vld1q_f32(px);
794       y = vld1q_f32(py);
795 
796       res = vmlaq_f32(res,x, y);
797 
798       px += 4;
799       py += 4;
800       /* Decrement the loop counter */
801       k--;
802     }
803 
804     accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
805     sum += accum[0] + accum[1];
806 #else
807       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
808        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
809       while (k > 0U)
810       {
811         /* Perform the multiply-accumulate */
812         sum += *px++ * *py++;
813         sum += *px++ * *py++;
814         sum += *px++ * *py++;
815         sum += *px++ * *py++;
816 
817         /* Decrement loop counter */
818         k--;
819       }
820 #endif /* #if defined(ARM_MATH_NEON) */
821       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
822        ** No loop unrolling is used. */
823       k = srcBLen % 0x4U;
824 #else
825 
826       /* Initialize blkCnt with number of samples */
827       k = srcBLen;
828 
829 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
830 
831       while (k > 0U)
832       {
833         /* Perform the multiply-accumulate */
834         sum += *px++ * *py++;
835 
836         /* Decrement the loop counter */
837         k--;
838       }
839 
840       /* Store the result in the accumulator in the destination buffer. */
841       *pOut = sum;
842 
843       /* Destination pointer is updated according to the address modifier, inc */
844       pOut += inc;
845 
846       /* Increment the pointer pIn1 index, count by 1 */
847       count++;
848 
849       /* Update the inputA and inputB pointers for next MAC calculation */
850       px = pIn1 + count;
851       py = pIn2;
852 
853       /* Decrement the loop counter */
854       blkCnt--;
855     }
856   }
857   else
858   {
859     /* If the srcBLen is not a multiple of 4,
860      * the blockSize2 loop cannot be unrolled by 4 */
861     blkCnt = blockSize2;
862 
863     while (blkCnt > 0U)
864     {
865       /* Accumulator is made zero for every iteration */
866       sum = 0.0f;
867 
868       /* Loop over srcBLen */
869       k = srcBLen;
870 
871       while (k > 0U)
872       {
873         /* Perform the multiply-accumulate */
874         sum += *px++ * *py++;
875 
876         /* Decrement the loop counter */
877         k--;
878       }
879 
880       /* Store the result in the accumulator in the destination buffer. */
881       *pOut = sum;
882       /* Destination pointer is updated according to the address modifier, inc */
883       pOut += inc;
884 
885       /* Increment the pointer pIn1 index, count by 1 */
886       count++;
887 
888       /* Update the inputA and inputB pointers for next MAC calculation */
889       px = pIn1 + count;
890       py = pIn2;
891 
892       /* Decrement the loop counter */
893       blkCnt--;
894     }
895   }
896 
897 
898   /* --------------------------
899    * Initializations of stage3
900    * -------------------------*/
901 
902   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
903    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
904    * ....
905    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
906    * sum +=  x[srcALen-1] * y[0]
907    */
908 
909   /* In this stage the MAC operations are decreased by 1 for every iteration.
910      The count variable holds the number of MAC operations performed */
911   count = srcBLen - 1U;
912 
913   /* Working pointer of inputA */
914   pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
915   px = pSrc1;
916 
917   /* Working pointer of inputB */
918   py = pIn2;
919 
920   /* -------------------
921    * Stage3 process
922    * ------------------*/
923 
924   while (blockSize3 > 0U)
925   {
926     /* Accumulator is made zero for every iteration */
927     sum = 0.0f;
928 
929 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
930 
931     /* Loop unrolling: Compute 4 outputs at a time */
932     k = count >> 2U;
933 
934 #if defined(ARM_MATH_NEON)
935     float32x4_t x,y;
936     float32x4_t res = vdupq_n_f32(0) ;
937     float32x2_t accum = vdup_n_f32(0);
938 
939     while (k > 0U)
940     {
941       x = vld1q_f32(px);
942       y = vld1q_f32(py);
943 
944       res = vmlaq_f32(res,x, y);
945 
946       px += 4;
947       py += 4;
948 
949       /* Decrement the loop counter */
950       k--;
951     }
952 
953     accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
954     sum += accum[0] + accum[1];
955 #else
956     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
957      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
958     while (k > 0U)
959     {
960       /* Perform the multiply-accumulate */
961       /* sum += x[srcALen - srcBLen + 4] * y[3] */
962       sum += *px++ * *py++;
963 
964       /* sum += x[srcALen - srcBLen + 3] * y[2] */
965       sum += *px++ * *py++;
966 
967       /* sum += x[srcALen - srcBLen + 2] * y[1] */
968       sum += *px++ * *py++;
969 
970       /* sum += x[srcALen - srcBLen + 1] * y[0] */
971       sum += *px++ * *py++;
972 
973       /* Decrement loop counter */
974       k--;
975     }
976 
977 #endif /* #if defined (ARM_MATH_NEON) */
978     /* Loop unrolling: Compute remaining outputs */
979     k = count % 0x4U;
980 
981 #else
982 
983     /* Initialize blkCnt with number of samples */
984     k = count;
985 
986 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
987 
988     while (k > 0U)
989     {
990       /* Perform the multiply-accumulate */
991       sum += *px++ * *py++;
992 
993       /* Decrement loop counter */
994       k--;
995     }
996 
997     /* Store the result in the accumulator in the destination buffer. */
998     *pOut = sum;
999     /* Destination pointer is updated according to the address modifier, inc */
1000     pOut += inc;
1001 
1002     /* Update the inputA and inputB pointers for next MAC calculation */
1003     px = ++pSrc1;
1004     py = pIn2;
1005 
1006     /* Decrement MAC count */
1007     count--;
1008 
1009     /* Decrement the loop counter */
1010     blockSize3--;
1011   }
1012 
1013 #else
1014 /* alternate version for CM0_FAMILY */
1015 
1016   const float32_t *pIn1 = pSrcA;                       /* inputA pointer */
1017   const float32_t *pIn2 = pSrcB + (srcBLen - 1U);      /* inputB pointer */
1018         float32_t sum;                                 /* Accumulator */
1019         uint32_t i = 0U, j;                            /* Loop counters */
1020         uint32_t inv = 0U;                             /* Reverse order flag */
1021         uint32_t tot = 0U;                             /* Length */
1022 
1023   /* The algorithm implementation is based on the lengths of the inputs. */
1024   /* srcB is always made to slide across srcA. */
1025   /* So srcBLen is always considered as shorter or equal to srcALen */
1026   /* But CORR(x, y) is reverse of CORR(y, x) */
1027   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
1028   /* and a varaible, inv is set to 1 */
1029   /* If lengths are not equal then zero pad has to be done to  make the two
1030    * inputs of same length. But to improve the performance, we assume zeroes
1031    * in the output instead of zero padding either of the the inputs*/
1032   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
1033    * starting of the output buffer */
1034   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
1035    * ending of the output buffer */
1036   /* Once the zero padding is done the remaining of the output is calcualted
1037    * using convolution but with the shorter signal time shifted. */
1038 
1039   /* Calculate the length of the remaining sequence */
1040   tot = ((srcALen + srcBLen) - 2U);
1041 
1042   if (srcALen > srcBLen)
1043   {
1044     /* Calculating the number of zeros to be padded to the output */
1045     j = srcALen - srcBLen;
1046 
1047     /* Initialise the pointer after zero padding */
1048     pDst += j;
1049   }
1050 
1051   else if (srcALen < srcBLen)
1052   {
1053     /* Initialization to inputB pointer */
1054     pIn1 = pSrcB;
1055 
1056     /* Initialization to the end of inputA pointer */
1057     pIn2 = pSrcA + (srcALen - 1U);
1058 
1059     /* Initialisation of the pointer after zero padding */
1060     pDst = pDst + tot;
1061 
1062     /* Swapping the lengths */
1063     j = srcALen;
1064     srcALen = srcBLen;
1065     srcBLen = j;
1066 
1067     /* Setting the reverse flag */
1068     inv = 1;
1069 
1070   }
1071 
1072   /* Loop to calculate convolution for output length number of times */
1073   for (i = 0U; i <= tot; i++)
1074   {
1075     /* Initialize sum with zero to carry out MAC operations */
1076     sum = 0.0f;
1077 
1078     /* Loop to perform MAC operations according to convolution equation */
1079     for (j = 0U; j <= i; j++)
1080     {
1081       /* Check the array limitations */
1082       if ((((i - j) < srcBLen) && (j < srcALen)))
1083       {
1084         /* z[i] += x[i-j] * y[j] */
1085         sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
1086       }
1087     }
1088 
1089     /* Store the output in the destination buffer */
1090     if (inv == 1)
1091       *pDst-- = sum;
1092     else
1093       *pDst++ = sum;
1094   }
1095 
1096 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
1097 
1098 }
1099 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1100 
1101 /**
1102   @} end of Corr group
1103  */
1104