1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_f32.c
4  * Description:  Convolution of floating-point sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup Conv Convolution
37 
38   Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
39   Convolution is similar to correlation and is frequently used in filtering and data analysis.
40   The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
41   The library also provides fast versions of the Q15 and Q31 functions.
42 
43  @par            Algorithm
44                    Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and
45                    <code>srcBLen</code> samples respectively. Then the convolution
46                    \f[
47                       c[n] = a[n] * b[n]
48                    \f]
49   @par
50                    is defined as
51                    \f[
52                    c[n] = \sum_{k=0}^{srcALen-1} a[k] b[n-k]
53                    \f]
54   @par
55                    Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
56                    <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
57                    <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
58                    The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
59   @par
60                    Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
61                    the signal <code>b[n]</code> slides over <code>a[n]</code>.
62                    For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
63   @par
64                    Note that convolution is a commutative operation:
65                    \f[
66                       a[n] * b[n] = b[n] * a[n].
67                    \f]
68   @par
69                    This means that switching the A and B arguments to the convolution functions has no effect.
70 
71   @par           Fixed-Point Behavior
72                    Convolution requires summing up a large number of intermediate products.
73                    As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
74                    Refer to the function specific documentation below for further details of the particular algorithm used.
75 
76   @par           Fast Versions
77                    Fast versions are provided for Q31 and Q15. They take fewer cycles than the standard Q31 and Q15 convolution functions, but they require
78                    the input signals to be scaled down to avoid intermediate overflows.
79 
80   @par           Opt Versions
81                    Opt versions are provided for Q15 and Q7. They use an internal scratch buffer to achieve better optimisation.
82                    These versions take fewer cycles but consume more memory (scratch memory) than the standard Q15 and Q7 versions.
83 
84   @par           Long versions:
85                    For convolution of long vectors, these functions are
86                    no longer well suited and will be very slow.
87                    An implementation based upon FFTs should be used instead.
88 
89  */
90 
91 /**
92   @addtogroup Conv
93   @{
94  */
95 
96 /**
97   @brief         Convolution of floating-point sequences.
98   @param[in]     pSrcA      points to the first input sequence
99   @param[in]     srcALen    length of the first input sequence
100   @param[in]     pSrcB      points to the second input sequence
101   @param[in]     srcBLen    length of the second input sequence
102   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
103  */
104 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
105 
106 #include "arm_helium_utils.h"
107 #include "arm_vec_filtering.h"
108 
109 
arm_conv_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst)110 ARM_DSP_ATTRIBUTE void arm_conv_f32(
111   const float32_t * pSrcA,
112         uint32_t srcALen,
113   const float32_t * pSrcB,
114         uint32_t srcBLen,
115         float32_t * pDst)
116 {
117     const float32_t *pIn1 = pSrcA;    /* inputA pointer               */
118     const float32_t *pIn2 = pSrcB;    /* inputB pointer               */
119     /*
120      * Loop to perform MAC operations according to correlation equation
121      */
122     const float32_t *pX;
123     const float32_t *pY;
124     const float32_t *pA;
125     const float32_t *pB;
126     int32_t   i = 0U, j = 0;    /* loop counters */
127     int32_t   block1, block2, block3;
128     uint32_t  vddupStartIdx = 3;
129     uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
130 
131     if (srcALen < srcBLen)
132     {
133         /*
134          * Initialization to inputB pointer
135          */
136         pIn1 = pSrcB;
137         /*
138          * Initialization to the end of inputA pointer
139          */
140         pIn2 = pSrcA;
141         /*
142          * Swapping the lengths
143          */
144         j = srcALen;
145         srcALen = srcBLen;
146         srcBLen = j;
147     }
148 
149     block1 = srcBLen - 1;
150     block2 = srcALen - srcBLen + 1;
151     block3 = srcBLen - 1;
152 
153     pA = pIn1;
154     pB = pIn2 - 3;
155 
156     for (i = 0; i <= block1 - 2; i += 2)
157     {
158         uint32_t  count = i + 1;
159         float32_t acc0;
160         float32_t acc1;
161 
162         pX = pA;
163         pY = pB;
164         /*
165          * compute 2 accumulators per loop
166          * size is incrementing for successive accumulators
167          * Y pointer is incrementing for successive accumulators
168          */
169         MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count);
170 
171         *pDst++ = acc0;
172         *pDst++ = acc1;
173         pB += 2;
174     }
175 
176     for (; i < block1; i++)
177     {
178         uint32_t  count = i + 1;
179         float32_t acc;
180 
181         pX = pA;
182         pY = pB;
183         MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
184 
185         *pDst++ = acc;
186         pB++;
187     }
188 
189     for (i = 0; i <= block2 - 2; i += 2)
190     {
191         uint32_t  count = srcBLen;
192         float32_t acc0 = 0;
193         float32_t acc1 = 0;
194 
195         pX = pA;
196         pY = pB;
197         /*
198          * compute 2 accumulators per loop
199          * size is fixed for all accumulators
200          * X pointer is incrementing for successive accumulators
201          */
202         MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count);
203         *pDst++ = acc0;
204         *pDst++ = acc1;
205         pA += 2;
206     }
207     if (block2 & 1)
208     {
209         uint32_t  count = srcBLen;
210         float32_t acc = 0;
211 
212         pX = pA;
213         pY = pB;
214         MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
215 
216         *pDst++ = acc;
217         pA++;
218     }
219 
220     for (i = block3; i >= 2; i -= 2)
221     {
222         int32_t   count = i;
223         float32_t acc0;
224         float32_t acc1;
225 
226         pX = pA;
227         pY = pB;
228         /*
229          * compute 2 accumulators per loop
230          * size is decrementing for successive accumulators
231          * X pointer is incrementing for successive accumulators
232          */
233         MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count);
234 
235         *pDst++ = acc0;
236         *pDst++ = acc1;
237         pA += 2;
238     }
239     for (; i >= 1; i--)
240     {
241         int32_t   count = i;
242         float32_t acc;
243 
244         pX = pA;
245         pY = pB;
246         MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
247 
248         *pDst++ = acc;
249         pA++;
250     }
251 }
252 #else
arm_conv_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst)253 ARM_DSP_ATTRIBUTE void arm_conv_f32(
254   const float32_t * pSrcA,
255         uint32_t srcALen,
256   const float32_t * pSrcB,
257         uint32_t srcBLen,
258         float32_t * pDst)
259 {
260 
261 #if defined(ARM_MATH_DSP)
262 
263   const float32_t *pIn1;                               /* InputA pointer */
264   const float32_t *pIn2;                               /* InputB pointer */
265         float32_t *pOut = pDst;                        /* Output pointer */
266   const float32_t *px;                                 /* Intermediate inputA pointer */
267   const float32_t *py;                                 /* Intermediate inputB pointer */
268   const float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
269         float32_t sum;                                 /* Accumulators */
270         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
271         uint32_t j, k, count, blkCnt;                  /* Loop counters */
272 
273 
274 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
275         float32_t acc0, acc1, acc2, acc3, c0;              /* Accumulators */
276 #if !defined(ARM_MATH_NEON)
277         float32_t x0, x1, x2, x3;                  /* Temporary variables to hold state and coefficient values */
278 #endif
279 #endif
280 
281   /* The algorithm implementation is based on the lengths of the inputs. */
282   /* srcB is always made to slide across srcA. */
283   /* So srcBLen is always considered as shorter or equal to srcALen */
284   if (srcALen >= srcBLen)
285   {
286     /* Initialization of inputA pointer */
287     pIn1 = pSrcA;
288 
289     /* Initialization of inputB pointer */
290     pIn2 = pSrcB;
291   }
292   else
293   {
294     /* Initialization of inputA pointer */
295     pIn1 = pSrcB;
296 
297     /* Initialization of inputB pointer */
298     pIn2 = pSrcA;
299 
300     /* srcBLen is always considered as shorter or equal to srcALen */
301     j = srcBLen;
302     srcBLen = srcALen;
303     srcALen = j;
304   }
305 
306   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
307   /* The function is internally
308    * divided into three stages according to the number of multiplications that has to be
309    * taken place between inputA samples and inputB samples. In the first stage of the
310    * algorithm, the multiplications increase by one for every iteration.
311    * In the second stage of the algorithm, srcBLen number of multiplications are done.
312    * In the third stage of the algorithm, the multiplications decrease by one
313    * for every iteration. */
314 
315   /* The algorithm is implemented in three stages.
316      The loop counters of each stage is initiated here. */
317   blockSize1 = srcBLen - 1U;
318   blockSize2 = srcALen - (srcBLen - 1U);
319   blockSize3 = blockSize1;
320 
321   /* --------------------------
322    * Initializations of stage1
323    * -------------------------*/
324 
325   /* sum = x[0] * y[0]
326    * sum = x[0] * y[1] + x[1] * y[0]
327    * ....
328    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
329    */
330 
331   /* In this stage the MAC operations are increased by 1 for every iteration.
332      The count variable holds the number of MAC operations performed */
333   count = 1U;
334 
335   /* Working pointer of inputA */
336   px = pIn1;
337 
338   /* Working pointer of inputB */
339   py = pIn2;
340 
341 
342   /* ------------------------
343    * Stage1 process
344    * ----------------------*/
345 #if defined(ARM_MATH_NEON)
346     float32x4_t vec1;
347     float32x4_t vec2;
348     float32x4_t res = vdupq_n_f32(0) ;
349     float32x2_t accum = vdup_n_f32(0);
350 #endif /* #if defined(ARM_MATH_NEON) */
351 
352   /* The first stage starts here */
353   while (blockSize1 > 0U)
354   {
355     /* Accumulator is made zero for every iteration */
356     sum = 0.0f;
357 
358 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
359     /* Loop unrolling: Compute 4 outputs at a time */
360     k = count >> 2U;
361 
362 #if defined(ARM_MATH_NEON)
363     res = vdupq_n_f32(0) ;
364     accum = vdup_n_f32(0);
365 
366     /* Compute 4 MACs simultaneously. */
367     k = count >> 2U;
368 
369     /* First part of the processing.  Compute 4 MACs at a time.
370      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
371 
372     while (k > 0U)
373     {
374       vec1 = vld1q_f32(px);
375       vec2 = vld1q_f32(py-3);
376       vec2 = vrev64q_f32(vec2);
377       vec2 = vcombine_f32(vget_high_f32(vec2), vget_low_f32(vec2));
378 
379       res = vmlaq_f32(res,vec1, vec2);
380 
381       /* Increment pointers */
382       px += 4;
383       py -= 4;
384 
385       /* Decrement the loop counter */
386       k--;
387     }
388 
389     accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
390     sum += accum[0] + accum[1];
391 
392     /* If the count is not a multiple of 4, compute any remaining MACs here.
393      ** No loop unrolling is used. */
394     k = count & 3;
395 #else
396     while (k > 0U)
397     {
398       /* x[0] * y[srcBLen - 1] */
399       sum += *px++ * *py--;
400 
401       /* x[1] * y[srcBLen - 2] */
402       sum += *px++ * *py--;
403 
404       /* x[2] * y[srcBLen - 3] */
405       sum += *px++ * *py--;
406 
407       /* x[3] * y[srcBLen - 4] */
408       sum += *px++ * *py--;
409 
410       /* Decrement loop counter */
411       k--;
412     }
413 
414     /* Loop unrolling: Compute remaining outputs */
415     k = count % 0x4U;
416 
417 #endif /* #if defined(ARM_MATH_NEON) */
418 
419 #else /* defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
420     /* Initialize k with number of samples */
421     k = count;
422 
423 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
424 
425     while (k > 0U)
426     {
427       /* Perform the multiply-accumulate */
428       sum += *px++ * *py--;
429 
430       /* Decrement loop counter */
431       k--;
432     }
433 
434     /* Store the result in the accumulator in the destination buffer. */
435     *pOut++ = sum;
436 
437     /* Update the inputA and inputB pointers for next MAC calculation */
438     py = pIn2 + count;
439     px = pIn1;
440 
441     /* Increment MAC count */
442     count++;
443 
444     /* Decrement loop counter */
445     blockSize1--;
446   }
447 
448   /* --------------------------
449    * Initializations of stage2
450    * ------------------------*/
451 
452   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
453    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
454    * ....
455    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
456    */
457 
458   /* Working pointer of inputA */
459   px = pIn1;
460 
461   /* Working pointer of inputB */
462   pSrc2 = pIn2 + (srcBLen - 1U);
463   py = pSrc2;
464 
465   /* count is index by which the pointer pIn1 to be incremented */
466   count = 0U;
467 
468   /* -------------------
469    * Stage2 process
470    * ------------------*/
471 
472   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
473    * So, to loop unroll over blockSize2,
474    * srcBLen should be greater than or equal to 4 */
475   if (srcBLen >= 4U)
476   {
477 
478 #if defined(ARM_MATH_NEON)
479       float32x4_t c;
480       float32x4_t x1v;
481       float32x4_t x2v;
482       float32x4_t x;
483       float32x4_t res = vdupq_n_f32(0) ;
484 #endif /* #if defined(ARM_MATH_NEON) */
485 
486 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
487 
488     /* Loop unrolling: Compute 4 outputs at a time */
489     blkCnt = blockSize2 >> 2U;
490 
491     while (blkCnt > 0U)
492     {
493       /* Set all accumulators to zero */
494       acc0 = 0.0f;
495       acc1 = 0.0f;
496       acc2 = 0.0f;
497       acc3 = 0.0f;
498 
499        /* Apply loop unrolling and compute 4 MACs simultaneously. */
500       k = srcBLen >> 2U;
501 
502 #if defined(ARM_MATH_NEON)
503       res = vdupq_n_f32(0) ;
504 
505       x1v = vld1q_f32(px);
506       x2v = vld1q_f32(px+4);
507 
508       do
509       {
510         c = vld1q_f32(py-3);
511 
512         px += 4;
513         x = x1v;
514         res = vmlaq_n_f32(res,x,c[3]);
515 
516 	x = vextq_f32(x1v,x2v,1);
517 
518         res = vmlaq_n_f32(res,x,c[2]);
519 
520         x = vextq_f32(x1v,x2v,2);
521 
522 	res = vmlaq_n_f32(res,x,c[1]);
523 
524 	x = vextq_f32(x1v,x2v,3);
525 
526 	res = vmlaq_n_f32(res,x,c[0]);
527 
528         py -= 4;
529 
530         x1v = x2v ;
531         x2v = vld1q_f32(px+4);
532 
533       } while (--k);
534 
535 
536       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
537        ** No loop unrolling is used. */
538       k = srcBLen & 0x3;
539 
540       x1v = vld1q_f32(px);
541       px += 4;
542 
543       while (k > 0U)
544       {
545         /* Read y[srcBLen - 5] sample */
546         c0 = *(py--);
547 
548         res = vmlaq_n_f32(res,x1v,c0);
549 
550         /* Reuse the present samples for the next MAC */
551         x1v[0] = x1v[1];
552         x1v[1] = x1v[2];
553         x1v[2] = x1v[3];
554 
555         x1v[3] = *(px++);
556 
557         /* Decrement the loop counter */
558         k--;
559       }
560 
561       acc0 = res[0];
562       acc1 = res[1];
563       acc2 = res[2];
564       acc3 = res[3];
565 
566 #else
567       /* read x[0], x[1], x[2] samples */
568       x0 = *px++;
569       x1 = *px++;
570       x2 = *px++;
571 
572       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
573        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
574       do
575       {
576         /* Read y[srcBLen - 1] sample */
577         c0 = *py--;
578         /* Read x[3] sample */
579         x3 = *(px);
580 
581         /* Perform the multiply-accumulate */
582         /* acc0 +=  x[0] * y[srcBLen - 1] */
583         acc0 += x0 * c0;
584         /* acc1 +=  x[1] * y[srcBLen - 1] */
585         acc1 += x1 * c0;
586         /* acc2 +=  x[2] * y[srcBLen - 1] */
587         acc2 += x2 * c0;
588         /* acc3 +=  x[3] * y[srcBLen - 1] */
589         acc3 += x3 * c0;
590 
591         /* Read y[srcBLen - 2] sample */
592         c0 = *py--;
593         /* Read x[4] sample */
594         x0 = *(px + 1U);
595 
596         /* Perform the multiply-accumulate */
597         /* acc0 +=  x[1] * y[srcBLen - 2] */
598         acc0 += x1 * c0;
599         /* acc1 +=  x[2] * y[srcBLen - 2] */
600         acc1 += x2 * c0;
601         /* acc2 +=  x[3] * y[srcBLen - 2] */
602         acc2 += x3 * c0;
603         /* acc3 +=  x[4] * y[srcBLen - 2] */
604         acc3 += x0 * c0;
605 
606         /* Read y[srcBLen - 3] sample */
607         c0 = *py--;
608         /* Read x[5] sample */
609         x1 = *(px + 2U);
610 
611         /* Perform the multiply-accumulate */
612         /* acc0 +=  x[2] * y[srcBLen - 3] */
613         acc0 += x2 * c0;
614         /* acc1 +=  x[3] * y[srcBLen - 2] */
615         acc1 += x3 * c0;
616         /* acc2 +=  x[4] * y[srcBLen - 2] */
617         acc2 += x0 * c0;
618         /* acc3 +=  x[5] * y[srcBLen - 2] */
619         acc3 += x1 * c0;
620 
621         /* Read y[srcBLen - 4] sample */
622         c0 = *py--;
623         /* Read x[6] sample */
624         x2 = *(px + 3U);
625         px += 4U;
626 
627         /* Perform the multiply-accumulate */
628         /* acc0 +=  x[3] * y[srcBLen - 4] */
629         acc0 += x3 * c0;
630         /* acc1 +=  x[4] * y[srcBLen - 4] */
631         acc1 += x0 * c0;
632         /* acc2 +=  x[5] * y[srcBLen - 4] */
633         acc2 += x1 * c0;
634         /* acc3 +=  x[6] * y[srcBLen - 4] */
635         acc3 += x2 * c0;
636 
637       } while (--k);
638 
639       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
640        ** No loop unrolling is used. */
641       k = srcBLen % 0x4U;
642 
643       while (k > 0U)
644       {
645         /* Read y[srcBLen - 5] sample */
646         c0 = *py--;
647         /* Read x[7] sample */
648         x3 = *px++;
649 
650         /* Perform the multiply-accumulate */
651         /* acc0 +=  x[4] * y[srcBLen - 5] */
652         acc0 += x0 * c0;
653         /* acc1 +=  x[5] * y[srcBLen - 5] */
654         acc1 += x1 * c0;
655         /* acc2 +=  x[6] * y[srcBLen - 5] */
656         acc2 += x2 * c0;
657         /* acc3 +=  x[7] * y[srcBLen - 5] */
658         acc3 += x3 * c0;
659 
660         /* Reuse the present samples for the next MAC */
661         x0 = x1;
662         x1 = x2;
663         x2 = x3;
664 
665         /* Decrement the loop counter */
666         k--;
667       }
668 #endif /* #if defined(ARM_MATH_NEON) */
669 
670       /* Store the result in the accumulator in the destination buffer. */
671       *pOut++ = acc0;
672       *pOut++ = acc1;
673       *pOut++ = acc2;
674       *pOut++ = acc3;
675 
676       /* Increment the pointer pIn1 index, count by 4 */
677       count += 4U;
678 
679       /* Update the inputA and inputB pointers for next MAC calculation */
680       px = pIn1 + count;
681       py = pSrc2;
682 
683       /* Decrement the loop counter */
684       blkCnt--;
685     }
686 
687     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
688      ** No loop unrolling is used. */
689     blkCnt = blockSize2 % 0x4U;
690 
691 #else
692 
693     /* Initialize blkCnt with number of samples */
694     blkCnt = blockSize2;
695 
696 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined (ARM_MATH_NEON)*/
697 
698     while (blkCnt > 0U)
699     {
700       /* Accumulator is made zero for every iteration */
701       sum = 0.0f;
702 
703 #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)
704       /* Loop unrolling: Compute 4 outputs at a time */
705       k = srcBLen >> 2U;
706 
707 #if defined (ARM_MATH_NEON)
708       float32x4_t res = vdupq_n_f32(0) ;
709       float32x4_t x = vdupq_n_f32(0) ;
710       float32x4_t y = vdupq_n_f32(0) ;
711       float32x2_t accum = vdup_n_f32(0) ;
712 
713       /* First part of the processing.  Compute 4 MACs at a time.
714        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
715       while (k > 0U)
716       {
717         x = vld1q_f32(px);
718         y = vld1q_f32(py-3);
719 
720         y = vrev64q_f32(y);
721         y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
722 
723         res = vmlaq_f32(res,x,y);
724 
725         px += 4 ;
726         py -= 4 ;
727 
728         /* Decrement the loop counter */
729         k--;
730       }
731 
732       accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
733       sum += accum[0] + accum[1];
734 
735       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
736        ** No loop unrolling is used. */
737       k = srcBLen & 0x3U;
738 
739 #else
740       while (k > 0U)
741       {
742         /* Perform the multiply-accumulate */
743         sum += *px++ * *py--;
744         sum += *px++ * *py--;
745         sum += *px++ * *py--;
746         sum += *px++ * *py--;
747 
748         /* Decrement loop counter */
749         k--;
750       }
751 
752       /* Loop unrolling: Compute remaining outputs */
753       k = srcBLen % 0x4U;
754 
755 #endif /* if defined (ARM_MATH_NEON) */
756 #else
757       /* Initialize blkCnt with number of samples */
758       k = srcBLen;
759 
760 #endif /* #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL) */
761 
762       while (k > 0U)
763       {
764         /* Perform the multiply-accumulate */
765         sum += *px++ * *py--;
766 
767         /* Decrement the loop counter */
768         k--;
769       }
770 
771       /* Store the result in the accumulator in the destination buffer. */
772       *pOut++ = sum;
773 
774       /* Increment the MAC count */
775       count++;
776 
777       /* Update the inputA and inputB pointers for next MAC calculation */
778       px = pIn1 + count;
779       py = pSrc2;
780 
781       /* Decrement the loop counter */
782       blkCnt--;
783     }
784   }
785   else
786   {
787     /* If the srcBLen is not a multiple of 4,
788      * the blockSize2 loop cannot be unrolled by 4 */
789     blkCnt = blockSize2;
790 
791     while (blkCnt > 0U)
792     {
793       /* Accumulator is made zero for every iteration */
794       sum = 0.0f;
795 
796       /* srcBLen number of MACS should be performed */
797       k = srcBLen;
798 
799       while (k > 0U)
800       {
801         /* Perform the multiply-accumulate */
802         sum += *px++ * *py--;
803 
804         /* Decrement the loop counter */
805         k--;
806       }
807 
808       /* Store the result in the accumulator in the destination buffer. */
809       *pOut++ = sum;
810 
811       /* Increment the MAC count */
812       count++;
813 
814       /* Update the inputA and inputB pointers for next MAC calculation */
815       px = pIn1 + count;
816       py = pSrc2;
817 
818       /* Decrement the loop counter */
819       blkCnt--;
820     }
821   }
822 
823 
824   /* --------------------------
825    * Initializations of stage3
826    * -------------------------*/
827 
828   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
829    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
830    * ....
831    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
832    * sum +=  x[srcALen-1] * y[srcBLen-1]
833    */
834 
835   /* In this stage the MAC operations are decreased by 1 for every iteration.
836      The blockSize3 variable holds the number of MAC operations performed */
837 
838   /* Working pointer of inputA */
839   pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
840   px = pSrc1;
841 
842   /* Working pointer of inputB */
843   pSrc2 = pIn2 + (srcBLen - 1U);
844   py = pSrc2;
845 
846   /* -------------------
847    * Stage3 process
848    * ------------------*/
849   while (blockSize3 > 0U)
850   {
851     /* Accumulator is made zero for every iteration */
852     sum = 0.0f;
853 
854 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
855     /* Loop unrolling: Compute 4 outputs at a time */
856     k = blockSize3 >> 2U;
857 
858 #if defined(ARM_MATH_NEON)
859     float32x4_t res = vdupq_n_f32(0) ;
860     float32x4_t x = vdupq_n_f32(0) ;
861     float32x4_t y = vdupq_n_f32(0) ;
862     float32x2_t accum = vdup_n_f32(0) ;
863 
864     while (k > 0U)
865     {
866       x = vld1q_f32(px);
867       y = vld1q_f32(py-3);
868 
869       y = vrev64q_f32(y);
870       y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
871 
872       res = vmlaq_f32(res,x,y);
873 
874       px += 4 ;
875       py -= 4 ;
876 
877       /* Decrement the loop counter */
878       k--;
879     }
880 
881     accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
882     sum += accum[0] + accum[1];
883 
884 #else
885     while (k > 0U)
886     {
887       /* Perform the multiply-accumulate */
888       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
889       sum += *px++ * *py--;
890 
891       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
892       sum += *px++ * *py--;
893 
894       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
895       sum += *px++ * *py--;
896 
897       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
898       sum += *px++ * *py--;
899 
900       /* Decrement loop counter */
901       k--;
902     }
903 #endif /* #if defined (ARM_MATH_NEON) */
904 
905     /* Loop unrolling: Compute remaining outputs */
906     k = blockSize3 % 0x4U;
907 #else
908 
909     /* Initialize blkCnt with number of samples */
910     k = blockSize3;
911 
912 #endif /* #if defined (ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)*/
913 
914     while (k > 0U)
915     {
916       /* Perform the multiply-accumulate */
917       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
918       sum += *px++ * *py--;
919 
920       /* Decrement loop counter */
921       k--;
922     }
923 
924     /* Store the result in the accumulator in the destination buffer. */
925     *pOut++ = sum;
926 
927     /* Update the inputA and inputB pointers for next MAC calculation */
928     px = ++pSrc1;
929     py = pSrc2;
930 
931     /* Decrement the loop counter */
932     blockSize3--;
933   }
934 
935 #else
936 /* alternate version for CM0_FAMILY */
937 
938   const float32_t *pIn1 = pSrcA;                       /* InputA pointer */
939   const float32_t *pIn2 = pSrcB;                       /* InputB pointer */
940         float32_t sum;                                 /* Accumulator */
941         uint32_t i, j;                                 /* Loop counters */
942 
943   /* Loop to calculate convolution for output length number of times */
944   for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
945   {
946     /* Initialize sum with zero to carry out MAC operations */
947     sum = 0.0f;
948 
949     /* Loop to perform MAC operations according to convolution equation */
950     for (j = 0U; j <= i; j++)
951     {
952       /* Check the array limitations */
953       if (((i - j) < srcBLen) && (j < srcALen))
954       {
955         /* z[i] += x[i-j] * y[j] */
956         sum += ( pIn1[j] * pIn2[i - j]);
957       }
958     }
959 
960     /* Store the output in the destination buffer */
961     pDst[i] = sum;
962   }
963 
964 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
965 
966 }
967 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
968 
969 /**
970   @} end of Conv group
971  */
972