1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_q7.c
4  * Description:  Correlation of Q7 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q7 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using a 32-bit internal accumulator.
50                    Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
51                    The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
52                    This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
53                    The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
54 
55  @remark
56                    Refer to \ref arm_correlate_opt_q7() for a faster implementation of this function.
57  */
58 
59 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
60 #pragma message "Scalar version of arm_correlate_q7 built. Helium version has build issues with gcc."
61 #endif
62 
63 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(ARM_DSP_BUILT_WITH_GCC)
64 #include "arm_helium_utils.h"
65 
66 #include "arm_vec_filtering.h"
arm_correlate_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst)67 ARM_DSP_ATTRIBUTE void arm_correlate_q7(
68   const q7_t * pSrcA,
69         uint32_t srcALen,
70   const q7_t * pSrcB,
71         uint32_t srcBLen,
72         q7_t * pDst)
73 {
74     const q7_t     *pIn1 = pSrcA;                     /* inputA pointer               */
75     const q7_t     *pIn2 = pSrcB + (srcBLen - 1U);    /* inputB pointer               */
76     const q7_t     *pX, *pY;
77     const q7_t     *pA, *pB;
78     int32_t   i = 0U, j = 0;                    /* loop counters */
79     int32_t   inv = 1U;                         /* Reverse order flag */
80     uint32_t  tot = 0U;                         /* Length */
81     int32_t   block1, block2, block3;
82     int32_t   incr;
83 
84     tot = ((srcALen + srcBLen) - 2U);
85     if (srcALen > srcBLen)
86     {
87         /*
88          * Calculating the number of zeros to be padded to the output
89          */
90         j = srcALen - srcBLen;
91         /*
92          * Initialize the pointer after zero padding
93          */
94         pDst += j;
95     }
96     else if (srcALen < srcBLen)
97     {
98         /*
99          * Initialization to inputB pointer
100          */
101         pIn1 = pSrcB;
102         /*
103          * Initialization to the end of inputA pointer
104          */
105         pIn2 = pSrcA + (srcALen - 1U);
106         /*
107          * Initialisation of the pointer after zero padding
108          */
109         pDst = pDst + tot;
110         /*
111          * Swapping the lengths
112          */
113         j = srcALen;
114         srcALen = srcBLen;
115         srcBLen = j;
116         /*
117          * Setting the reverse flag
118          */
119         inv = -1;
120     }
121 
122     block1 = srcBLen - 1;
123     block2 = srcALen - srcBLen + 1;
124     block3 = srcBLen - 1;
125 
126     pA = pIn1;
127     pB = pIn2;
128     incr = inv;
129 
130     for (i = 0U; i <= block1 - 2; i += 2)
131     {
132         uint32_t  count = i + 1;
133         int32_t   acc0 = 0;
134         int32_t   acc1 = 0;
135 
136         /*
137          * compute 2 accumulators per loop
138          * size is incrementing for second accumulator
139          * Y pointer is decrementing for second accumulator
140          */
141         pX = pA;
142         pY = pB;
143         MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count);
144 
145         *pDst = (q7_t) acc0;
146         pDst += incr;
147         *pDst = (q7_t) acc1;
148         pDst += incr;
149         pB -= 2;
150     }
151     for (; i < block1; i++)
152     {
153         uint32_t  count = i + 1;
154         int32_t   acc = 0;
155 
156         pX = pA;
157         pY = pB;
158         MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count);
159 
160         *pDst = (q7_t) acc;
161         pDst += incr;
162         pB--;
163     }
164 
165     for (i = 0U; i <= block2 - 4; i += 4)
166     {
167         int32_t   acc0 = 0;
168         int32_t   acc1 = 0;
169         int32_t   acc2 = 0;
170         int32_t   acc3 = 0;
171 
172         pX = pA;
173         pY = pB;
174         /*
175          * compute 4 accumulators per loop
176          * size is fixed for all accumulators
177          * X pointer is incrementing for successive accumulators
178          */
179         MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
180 
181         *pDst = (q7_t) acc0;
182         pDst += incr;
183         *pDst = (q7_t) acc1;
184         pDst += incr;
185         *pDst = (q7_t) acc2;
186         pDst += incr;
187         *pDst = (q7_t) acc3;
188         pDst += incr;
189 
190         pA += 4;
191     }
192 
193     for (; i <= block2 - 2; i += 2)
194     {
195         int32_t   acc0 = 0LL;
196         int32_t   acc1 = 0LL;
197 
198         pX = pA;
199         pY = pB;
200         /*
201          * compute 2 accumulators per loop
202          * size is fixed for all accumulators
203          * X pointer is incrementing for second accumulator
204          */
205         MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, srcBLen);
206 
207         *pDst = (q7_t) acc0;
208         pDst += incr;
209         *pDst = (q7_t) acc1;
210         pDst += incr;
211         pA += 2;
212     }
213 
214     if (block2 & 1)
215     {
216         int32_t   acc = 0LL;
217 
218         pX = pA;
219         pY = pB;
220         MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, srcBLen);
221 
222         *pDst = (q7_t) acc;
223         pDst += incr;
224         pA++;
225     }
226 
227     for (i = block3 - 1; i > 0; i -= 2)
228     {
229         uint32_t  count = (i + 1);
230         int32_t   acc0 = 0LL;
231         int32_t   acc1 = 0LL;
232 
233         pX = pA;
234         pY = pB;
235         /*
236          * compute 2 accumulators per loop
237          * size is decrementing for second accumulator
238          * X pointer is incrementing for second accumulator
239          */
240         MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count);
241 
242         *pDst = (q7_t) acc0;
243         pDst += incr;
244         *pDst = (q7_t) acc1;
245         pDst += incr;
246         pA += 2;
247 
248     }
249     for (; i >= 0; i--)
250     {
251         uint32_t  count = (i + 1);
252         int64_t   acc = 0LL;
253 
254         pX = pA;
255         pY = pB;
256         MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count);
257 
258         *pDst = (q7_t) acc;
259         pDst += incr;
260         pA++;
261     }
262 }
263 
264 #else
arm_correlate_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst)265 ARM_DSP_ATTRIBUTE void arm_correlate_q7(
266   const q7_t * pSrcA,
267         uint32_t srcALen,
268   const q7_t * pSrcB,
269         uint32_t srcBLen,
270         q7_t * pDst)
271 {
272 
273 #if (1)
274 //#if !defined(ARM_MATH_CM0_FAMILY)
275 
276   const q7_t *pIn1;                                    /* InputA pointer */
277   const q7_t *pIn2;                                    /* InputB pointer */
278         q7_t *pOut = pDst;                             /* Output pointer */
279   const q7_t *px;                                      /* Intermediate inputA pointer */
280   const q7_t *py;                                      /* Intermediate inputB pointer */
281   const q7_t *pSrc1;                                   /* Intermediate pointers */
282         q31_t sum;                                     /* Accumulators */
283         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
284         uint32_t j, k, count, blkCnt;                  /* Loop counters */
285         uint32_t outBlockSize;
286         int32_t inc = 1;
287 
288 #if defined (ARM_MATH_LOOPUNROLL)
289         q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
290         q31_t input1, input2;                          /* Temporary input variables */
291         q15_t in1, in2;                                /* Temporary input variables */
292         q7_t x0, x1, x2, x3, c0, c1;                   /* Temporary variables for holding input and coefficient values */
293 #endif
294 
295   /* The algorithm implementation is based on the lengths of the inputs. */
296   /* srcB is always made to slide across srcA. */
297   /* So srcBLen is always considered as shorter or equal to srcALen */
298   /* But CORR(x, y) is reverse of CORR(y, x) */
299   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
300   /* and the destination pointer modifier, inc is set to -1 */
301   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
302   /* But to improve the performance,
303    * we include zeroes in the output instead of zero padding either of the the inputs*/
304   /* If srcALen > srcBLen,
305    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
306   /* If srcALen < srcBLen,
307    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
308   if (srcALen >= srcBLen)
309   {
310     /* Initialization of inputA pointer */
311     pIn1 = pSrcA;
312 
313     /* Initialization of inputB pointer */
314     pIn2 = pSrcB;
315 
316     /* Number of output samples is calculated */
317     outBlockSize = (2U * srcALen) - 1U;
318 
319     /* When srcALen > srcBLen, zero padding is done to srcB
320      * to make their lengths equal.
321      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
322      * number of output samples are made zero */
323     j = outBlockSize - (srcALen + (srcBLen - 1U));
324 
325     /* Updating the pointer position to non zero value */
326     pOut += j;
327   }
328   else
329   {
330     /* Initialization of inputA pointer */
331     pIn1 = pSrcB;
332 
333     /* Initialization of inputB pointer */
334     pIn2 = pSrcA;
335 
336     /* srcBLen is always considered as shorter or equal to srcALen */
337     j = srcBLen;
338     srcBLen = srcALen;
339     srcALen = j;
340 
341     /* CORR(x, y) = Reverse order(CORR(y, x)) */
342     /* Hence set the destination pointer to point to the last output sample */
343     pOut = pDst + ((srcALen + srcBLen) - 2U);
344 
345     /* Destination address modifier is set to -1 */
346     inc = -1;
347   }
348 
349   /* The function is internally
350    * divided into three stages according to the number of multiplications that has to be
351    * taken place between inputA samples and inputB samples. In the first stage of the
352    * algorithm, the multiplications increase by one for every iteration.
353    * In the second stage of the algorithm, srcBLen number of multiplications are done.
354    * In the third stage of the algorithm, the multiplications decrease by one
355    * for every iteration. */
356 
357   /* The algorithm is implemented in three stages.
358      The loop counters of each stage is initiated here. */
359   blockSize1 = srcBLen - 1U;
360   blockSize2 = srcALen - (srcBLen - 1U);
361   blockSize3 = blockSize1;
362 
363   /* --------------------------
364    * Initializations of stage1
365    * -------------------------*/
366 
367   /* sum = x[0] * y[srcBlen - 1]
368    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
369    * ....
370    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
371    */
372 
373   /* In this stage the MAC operations are increased by 1 for every iteration.
374      The count variable holds the number of MAC operations performed */
375   count = 1U;
376 
377   /* Working pointer of inputA */
378   px = pIn1;
379 
380   /* Working pointer of inputB */
381   pSrc1 = pIn2 + (srcBLen - 1U);
382   py = pSrc1;
383 
384   /* ------------------------
385    * Stage1 process
386    * ----------------------*/
387 
388   /* The first stage starts here */
389   while (blockSize1 > 0U)
390   {
391     /* Accumulator is made zero for every iteration */
392     sum = 0;
393 
394 #if defined (ARM_MATH_LOOPUNROLL)
395 
396     /* Loop unrolling: Compute 4 outputs at a time */
397     k = count >> 2U;
398 
399     while (k > 0U)
400     {
401       /* x[0] , x[1] */
402       in1 = (q15_t) *px++;
403       in2 = (q15_t) *px++;
404       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
405 
406       /* y[srcBLen - 4] , y[srcBLen - 3] */
407       in1 = (q15_t) *py++;
408       in2 = (q15_t) *py++;
409       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
410 
411       /* x[0] * y[srcBLen - 4] */
412       /* x[1] * y[srcBLen - 3] */
413       sum = __SMLAD(input1, input2, sum);
414 
415       /* x[2] , x[3] */
416       in1 = (q15_t) *px++;
417       in2 = (q15_t) *px++;
418       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
419 
420       /* y[srcBLen - 2] , y[srcBLen - 1] */
421       in1 = (q15_t) *py++;
422       in2 = (q15_t) *py++;
423       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
424 
425       /* x[2] * y[srcBLen - 2] */
426       /* x[3] * y[srcBLen - 1] */
427       sum = __SMLAD(input1, input2, sum);
428 
429       /* Decrement loop counter */
430       k--;
431     }
432 
433     /* Loop unrolling: Compute remaining outputs */
434     k = count % 0x4U;
435 
436 #else
437 
438     /* Initialize k with number of samples */
439     k = count;
440 
441 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
442 
443     while (k > 0U)
444     {
445       /* Perform the multiply-accumulate */
446       /* x[0] * y[srcBLen - 1] */
447       sum += (q31_t) ((q15_t) *px++ * *py++);
448 
449       /* Decrement loop counter */
450       k--;
451     }
452 
453     /* Store the result in the accumulator in the destination buffer. */
454     *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
455     /* Destination pointer is updated according to the address modifier, inc */
456     pOut += inc;
457 
458     /* Update the inputA and inputB pointers for next MAC calculation */
459     py = pSrc1 - count;
460     px = pIn1;
461 
462     /* Increment MAC count */
463     count++;
464 
465     /* Decrement loop counter */
466     blockSize1--;
467   }
468 
469   /* --------------------------
470    * Initializations of stage2
471    * ------------------------*/
472 
473   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
474    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
475    * ....
476    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
477    */
478 
479   /* Working pointer of inputA */
480   px = pIn1;
481 
482   /* Working pointer of inputB */
483   py = pIn2;
484 
485   /* count is index by which the pointer pIn1 to be incremented */
486   count = 0U;
487 
488   /* -------------------
489    * Stage2 process
490    * ------------------*/
491 
492   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
493    * So, to loop unroll over blockSize2,
494    * srcBLen should be greater than or equal to 4 */
495   if (srcBLen >= 4U)
496   {
497 #if defined (ARM_MATH_LOOPUNROLL)
498 
499     /* Loop unrolling: Compute 4 outputs at a time */
500     blkCnt = blockSize2 >> 2U;
501 
502     while (blkCnt > 0U)
503     {
504       /* Set all accumulators to zero */
505       acc0 = 0;
506       acc1 = 0;
507       acc2 = 0;
508       acc3 = 0;
509 
510       /* read x[0], x[1], x[2] samples */
511       x0 = *px++;
512       x1 = *px++;
513       x2 = *px++;
514 
515       /* Apply loop unrolling and compute 4 MACs simultaneously. */
516       k = srcBLen >> 2U;
517 
518       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
519        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
520       do
521       {
522         /* Read y[0] sample */
523         c0 = *py++;
524         /* Read y[1] sample */
525         c1 = *py++;
526 
527         /* Read x[3] sample */
528         x3 = *px++;
529 
530         /* x[0] and x[1] are packed */
531         in1 = (q15_t) x0;
532         in2 = (q15_t) x1;
533 
534         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
535 
536         /* y[0] and y[1] are packed */
537         in1 = (q15_t) c0;
538         in2 = (q15_t) c1;
539 
540         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
541 
542         /* acc0 += x[0] * y[0] + x[1] * y[1]  */
543         acc0 = __SMLAD(input1, input2, acc0);
544 
545         /* x[1] and x[2] are packed */
546         in1 = (q15_t) x1;
547         in2 = (q15_t) x2;
548 
549         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
550 
551         /* acc1 += x[1] * y[0] + x[2] * y[1] */
552         acc1 = __SMLAD(input1, input2, acc1);
553 
554         /* x[2] and x[3] are packed */
555         in1 = (q15_t) x2;
556         in2 = (q15_t) x3;
557 
558         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
559 
560         /* acc2 += x[2] * y[0] + x[3] * y[1]  */
561         acc2 = __SMLAD(input1, input2, acc2);
562 
563         /* Read x[4] sample */
564         x0 = *px++;
565 
566         /* x[3] and x[4] are packed */
567         in1 = (q15_t) x3;
568         in2 = (q15_t) x0;
569 
570         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
571 
572         /* acc3 += x[3] * y[0] + x[4] * y[1]  */
573         acc3 = __SMLAD(input1, input2, acc3);
574 
575         /* Read y[2] sample */
576         c0 = *py++;
577         /* Read y[3] sample */
578         c1 = *py++;
579 
580         /* Read x[5] sample */
581         x1 = *px++;
582 
583         /* x[2] and x[3] are packed */
584         in1 = (q15_t) x2;
585         in2 = (q15_t) x3;
586 
587         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
588 
589         /* y[2] and y[3] are packed */
590         in1 = (q15_t) c0;
591         in2 = (q15_t) c1;
592 
593         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
594 
595         /* acc0 += x[2] * y[2] + x[3] * y[3]  */
596         acc0 = __SMLAD(input1, input2, acc0);
597 
598         /* x[3] and x[4] are packed */
599         in1 = (q15_t) x3;
600         in2 = (q15_t) x0;
601 
602         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
603 
604         /* acc1 += x[3] * y[2] + x[4] * y[3]  */
605         acc1 = __SMLAD(input1, input2, acc1);
606 
607         /* x[4] and x[5] are packed */
608         in1 = (q15_t) x0;
609         in2 = (q15_t) x1;
610 
611         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
612 
613         /* acc2 += x[4] * y[2] + x[5] * y[3]  */
614         acc2 = __SMLAD(input1, input2, acc2);
615 
616         /* Read x[6] sample */
617         x2 = *px++;
618 
619         /* x[5] and x[6] are packed */
620         in1 = (q15_t) x1;
621         in2 = (q15_t) x2;
622 
623         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
624 
625         /* acc3 += x[5] * y[2] + x[6] * y[3]  */
626         acc3 = __SMLAD(input1, input2, acc3);
627 
628       } while (--k);
629 
630       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
631        ** No loop unrolling is used. */
632       k = srcBLen % 0x4U;
633 
634       while (k > 0U)
635       {
636         /* Read y[4] sample */
637         c0 = *py++;
638         /* Read x[7] sample */
639         x3 = *px++;
640 
641         /* Perform the multiply-accumulates */
642         /* acc0 +=  x[4] * y[4] */
643         acc0 += ((q15_t) x0 * c0);
644         /* acc1 +=  x[5] * y[4] */
645         acc1 += ((q15_t) x1 * c0);
646         /* acc2 +=  x[6] * y[4] */
647         acc2 += ((q15_t) x2 * c0);
648         /* acc3 +=  x[7] * y[4] */
649         acc3 += ((q15_t) x3 * c0);
650 
651         /* Reuse the present samples for the next MAC */
652         x0 = x1;
653         x1 = x2;
654         x2 = x3;
655 
656         /* Decrement loop counter */
657         k--;
658       }
659 
660       /* Store the result in the accumulator in the destination buffer. */
661       *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
662       /* Destination pointer is updated according to the address modifier, inc */
663       pOut += inc;
664 
665       *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
666       pOut += inc;
667 
668       *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
669       pOut += inc;
670 
671       *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
672       pOut += inc;
673 
674 	  count += 4U;
675       /* Update the inputA and inputB pointers for next MAC calculation */
676       px = pIn1 + count;
677       py = pIn2;
678 
679       /* Decrement loop counter */
680       blkCnt--;
681     }
682 
683     /* Loop unrolling: Compute remaining outputs */
684     blkCnt = blockSize2 % 0x4U;
685 
686 #else
687 
688     /* Initialize blkCnt with number of samples */
689     blkCnt = blockSize2;
690 
691 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
692 
693     while (blkCnt > 0U)
694     {
695       /* Accumulator is made zero for every iteration */
696       sum = 0;
697 
698 #if defined (ARM_MATH_LOOPUNROLL)
699 
700     /* Loop unrolling: Compute 4 outputs at a time */
701       k = srcBLen >> 2U;
702 
703       while (k > 0U)
704       {
705 
706         /* Reading two inputs of SrcA buffer and packing */
707         in1 = (q15_t) *px++;
708         in2 = (q15_t) *px++;
709         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
710 
711         /* Reading two inputs of SrcB buffer and packing */
712         in1 = (q15_t) *py++;
713         in2 = (q15_t) *py++;
714         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
715 
716         /* Perform the multiply-accumulate */
717         sum = __SMLAD(input1, input2, sum);
718 
719         /* Reading two inputs of SrcA buffer and packing */
720         in1 = (q15_t) *px++;
721         in2 = (q15_t) *px++;
722         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
723 
724         /* Reading two inputs of SrcB buffer and packing */
725         in1 = (q15_t) *py++;
726         in2 = (q15_t) *py++;
727         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
728 
729         /* Perform the multiply-accumulate */
730         sum = __SMLAD(input1, input2, sum);
731 
732         /* Decrement loop counter */
733         k--;
734       }
735 
736       /* Loop unrolling: Compute remaining outputs */
737       k = srcBLen % 0x4U;
738 
739 #else
740 
741       /* Initialize blkCnt with number of samples */
742       k = srcBLen;
743 
744 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
745 
746       while (k > 0U)
747       {
748         /* Perform the multiply-accumulate */
749         sum += ((q15_t) *px++ * *py++);
750 
751         /* Decrement the loop counter */
752         k--;
753       }
754 
755       /* Store the result in the accumulator in the destination buffer. */
756       *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
757       /* Destination pointer is updated according to the address modifier, inc */
758       pOut += inc;
759 
760       /* Increment the pointer pIn1 index, count by 1 */
761       count++;
762 
763       /* Update the inputA and inputB pointers for next MAC calculation */
764       px = pIn1 + count;
765       py = pIn2;
766 
767       /* Decrement the loop counter */
768       blkCnt--;
769     }
770   }
771   else
772   {
773     /* If the srcBLen is not a multiple of 4,
774      * the blockSize2 loop cannot be unrolled by 4 */
775     blkCnt = blockSize2;
776 
777     while (blkCnt > 0U)
778     {
779       /* Accumulator is made zero for every iteration */
780       sum = 0;
781 
782       /* srcBLen number of MACS should be performed */
783       k = srcBLen;
784 
785       while (k > 0U)
786       {
787         /* Perform the multiply-accumulate */
788         sum += ((q15_t) *px++ * *py++);
789 
790         /* Decrement the loop counter */
791         k--;
792       }
793 
794       /* Store the result in the accumulator in the destination buffer. */
795       *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
796       /* Destination pointer is updated according to the address modifier, inc */
797       pOut += inc;
798 
799       /* Increment the MAC count */
800       count++;
801 
802       /* Update the inputA and inputB pointers for next MAC calculation */
803       px = pIn1 + count;
804       py = pIn2;
805 
806       /* Decrement loop counter */
807       blkCnt--;
808     }
809   }
810 
811 
812   /* --------------------------
813    * Initializations of stage3
814    * -------------------------*/
815 
816   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
817    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
818    * ....
819    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
820    * sum +=  x[srcALen-1] * y[0]
821    */
822 
823   /* In this stage the MAC operations are decreased by 1 for every iteration.
824      The count variable holds the number of MAC operations performed */
825   count = srcBLen - 1U;
826 
827   /* Working pointer of inputA */
828   pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
829   px = pSrc1;
830 
831   /* Working pointer of inputB */
832   py = pIn2;
833 
834   /* -------------------
835    * Stage3 process
836    * ------------------*/
837 
838   while (blockSize3 > 0U)
839   {
840     /* Accumulator is made zero for every iteration */
841     sum = 0;
842 
843 #if defined (ARM_MATH_LOOPUNROLL)
844 
845     /* Loop unrolling: Compute 4 outputs at a time */
846     k = count >> 2U;
847 
848     while (k > 0U)
849     {
850       /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2]  */
851       in1 = (q15_t) *px++;
852       in2 = (q15_t) *px++;
853       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
854 
855       /* y[0] , y[1] */
856       in1 = (q15_t) *py++;
857       in2 = (q15_t) *py++;
858       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
859 
860       /* sum += x[srcALen - srcBLen + 1] * y[0] */
861       /* sum += x[srcALen - srcBLen + 2] * y[1] */
862       sum = __SMLAD(input1, input2, sum);
863 
864       /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
865       in1 = (q15_t) *px++;
866       in2 = (q15_t) *px++;
867       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
868 
869       /* y[2] , y[3] */
870       in1 = (q15_t) *py++;
871       in2 = (q15_t) *py++;
872       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
873 
874       /* sum += x[srcALen - srcBLen + 3] * y[2] */
875       /* sum += x[srcALen - srcBLen + 4] * y[3] */
876       sum = __SMLAD(input1, input2, sum);
877 
878       /* Decrement loop counter */
879       k--;
880     }
881 
882     /* Loop unrolling: Compute remaining outputs */
883     k = count % 0x4U;
884 
885 #else
886 
887     /* Initialize blkCnt with number of samples */
888     k = count;
889 
890 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
891 
892     while (k > 0U)
893     {
894       /* Perform the multiply-accumulate */
895       sum += ((q15_t) *px++ * *py++);
896 
897       /* Decrement loop counter */
898       k--;
899     }
900 
901     /* Store the result in the accumulator in the destination buffer. */
902     *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
903     /* Destination pointer is updated according to the address modifier, inc */
904     pOut += inc;
905 
906     /* Update the inputA and inputB pointers for next MAC calculation */
907     px = ++pSrc1;
908     py = pIn2;
909 
910     /* Decrement MAC count */
911     count--;
912 
913     /* Decrement loop counter */
914     blockSize3--;
915   }
916 
917 #else
918 /* alternate version for CM0_FAMILY */
919 
920   const q7_t *pIn1 = pSrcA;                            /* InputA pointer */
921   const q7_t *pIn2 = pSrcB + (srcBLen - 1U);           /* InputB pointer */
922         q31_t sum;                                     /* Accumulator */
923         uint32_t i = 0U, j;                            /* Loop counters */
924         int32_t inc = 1;                               /* Destination address modifier */
925         uint32_t tot = 0U;                             /* Length */
926 
927   /* The algorithm implementation is based on the lengths of the inputs. */
928   /* srcB is always made to slide across srcA. */
929   /* So srcBLen is always considered as shorter or equal to srcALen */
930   /* But CORR(x, y) is reverse of CORR(y, x) */
931   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
932   /* and a varaible, inc is set to -1 */
933   /* If lengths are not equal then zero pad has to be done to  make the two
934    * inputs of same length. But to improve the performance, we include zeroes
935    * in the output instead of zero padding either of the the inputs*/
936   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
937    * starting of the output buffer */
938   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
939    * ending of the output buffer */
940   /* Once the zero padding is done the remaining of the output is calcualted
941    * using convolution but with the shorter signal time shifted. */
942 
943   /* Calculate the length of the remaining sequence */
944   tot = ((srcALen + srcBLen) - 2U);
945 
946   if (srcALen > srcBLen)
947   {
948     /* Calculating the number of zeros to be padded to the output */
949     j = srcALen - srcBLen;
950 
951     /* Initialise the pointer after zero padding */
952     pDst += j;
953   }
954 
955   else if (srcALen < srcBLen)
956   {
957     /* Initialization to inputB pointer */
958     pIn1 = pSrcB;
959 
960     /* Initialization to the end of inputA pointer */
961     pIn2 = pSrcA + (srcALen - 1U);
962 
963     /* Initialisation of the pointer after zero padding */
964     pDst = pDst + tot;
965 
966     /* Swapping the lengths */
967     j = srcALen;
968     srcALen = srcBLen;
969     srcBLen = j;
970 
971     /* Filling destination in reverse order */
972     inc = -1;
973   }
974 
975   /* Loop to calculate convolution for output length number of times */
976   for (i = 0U; i <= tot; i++)
977   {
978     /* Initialize sum with zero to carry out MAC operations */
979     sum = 0;
980 
981     /* Loop to perform MAC operations according to convolution equation */
982     for (j = 0U; j <= i; j++)
983     {
984       /* Check the array limitations */
985       if (((i - j) < srcBLen) && (j < srcALen))
986       {
987         /* z[i] += x[i-j] * y[j] */
988         sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
989       }
990     }
991 
992     /* Store the output in the destination buffer */
993     *pDst = (q7_t) __SSAT((sum >> 7U), 8U);
994     pDst += inc;
995   }
996 
997 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
998 
999 }
1000 #endif /* defined(ARM_MATH_MVEI) */
1001 
1002 /**
1003   @} end of Corr group
1004  */
1005