1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_f32.c
4 * Description: Convolution of floating-point sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup Conv Convolution
37
38 Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
39 Convolution is similar to correlation and is frequently used in filtering and data analysis.
40 The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
41 The library also provides fast versions of the Q15 and Q31 functions.
42
43 @par Algorithm
44 Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and
45 <code>srcBLen</code> samples respectively. Then the convolution
46 \f[
47 c[n] = a[n] * b[n]
48 \f]
49 @par
50 is defined as
51 \f[
c[n] = \sum_{k=0}^{srcALen-1} a[k] b[n-k]
53 \f]
54 @par
55 Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
56 <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
57 <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
58 The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
59 @par
60 Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
61 the signal <code>b[n]</code> slides over <code>a[n]</code>.
62 For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
63 @par
64 Note that convolution is a commutative operation:
65 \f[
66 a[n] * b[n] = b[n] * a[n].
67 \f]
68 @par
69 This means that switching the A and B arguments to the convolution functions has no effect.
70
71 @par Fixed-Point Behavior
72 Convolution requires summing up a large number of intermediate products.
73 As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
74 Refer to the function specific documentation below for further details of the particular algorithm used.
75
76 @par Fast Versions
Fast versions are supported for Q31 and Q15. They take fewer cycles than the regular Q31 and Q15 convolution functions, but
they require the input signals to be scaled down to avoid intermediate overflows.
79
80 @par Opt Versions
Opt versions are supported for Q15 and Q7. The design uses an internal scratch buffer to achieve better optimisation.
These versions are optimised for cycles and consume more memory (scratch memory) than the regular Q15 and Q7 versions.
83
84 @par Long versions:
For convolution of long vectors, these functions are
no longer suitable and will be very slow.
An implementation based upon FFTs should be used instead.
88
89 */
90
91 /**
92 @addtogroup Conv
93 @{
94 */
95
96 /**
97 @brief Convolution of floating-point sequences.
98 @param[in] pSrcA points to the first input sequence
99 @param[in] srcALen length of the first input sequence
100 @param[in] pSrcB points to the second input sequence
101 @param[in] srcBLen length of the second input sequence
102 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
103 */
104 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
105
106 #include "arm_helium_utils.h"
107 #include "arm_vec_filtering.h"
108
109
arm_conv_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst)110 ARM_DSP_ATTRIBUTE void arm_conv_f32(
111 const float32_t * pSrcA,
112 uint32_t srcALen,
113 const float32_t * pSrcB,
114 uint32_t srcBLen,
115 float32_t * pDst)
116 {
117 const float32_t *pIn1 = pSrcA; /* inputA pointer */
118 const float32_t *pIn2 = pSrcB; /* inputB pointer */
119 /*
120 * Loop to perform MAC operations according to correlation equation
121 */
122 const float32_t *pX;
123 const float32_t *pY;
124 const float32_t *pA;
125 const float32_t *pB;
126 int32_t i = 0U, j = 0; /* loop counters */
127 int32_t block1, block2, block3;
128 uint32_t vddupStartIdx = 3;
129 uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
130
131 if (srcALen < srcBLen)
132 {
133 /*
134 * Initialization to inputB pointer
135 */
136 pIn1 = pSrcB;
137 /*
138 * Initialization to the end of inputA pointer
139 */
140 pIn2 = pSrcA;
141 /*
142 * Swapping the lengths
143 */
144 j = srcALen;
145 srcALen = srcBLen;
146 srcBLen = j;
147 }
148
149 block1 = srcBLen - 1;
150 block2 = srcALen - srcBLen + 1;
151 block3 = srcBLen - 1;
152
153 pA = pIn1;
154 pB = pIn2 - 3;
155
156 for (i = 0; i <= block1 - 2; i += 2)
157 {
158 uint32_t count = i + 1;
159 float32_t acc0;
160 float32_t acc1;
161
162 pX = pA;
163 pY = pB;
164 /*
165 * compute 2 accumulators per loop
166 * size is incrementing for successive accumulators
167 * Y pointer is incrementing for successive accumulators
168 */
169 MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count);
170
171 *pDst++ = acc0;
172 *pDst++ = acc1;
173 pB += 2;
174 }
175
176 for (; i < block1; i++)
177 {
178 uint32_t count = i + 1;
179 float32_t acc;
180
181 pX = pA;
182 pY = pB;
183 MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
184
185 *pDst++ = acc;
186 pB++;
187 }
188
189 for (i = 0; i <= block2 - 2; i += 2)
190 {
191 uint32_t count = srcBLen;
192 float32_t acc0 = 0;
193 float32_t acc1 = 0;
194
195 pX = pA;
196 pY = pB;
197 /*
198 * compute 2 accumulators per loop
199 * size is fixed for all accumulators
200 * X pointer is incrementing for successive accumulators
201 */
202 MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count);
203 *pDst++ = acc0;
204 *pDst++ = acc1;
205 pA += 2;
206 }
207 if (block2 & 1)
208 {
209 uint32_t count = srcBLen;
210 float32_t acc = 0;
211
212 pX = pA;
213 pY = pB;
214 MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
215
216 *pDst++ = acc;
217 pA++;
218 }
219
220 for (i = block3; i >= 2; i -= 2)
221 {
222 int32_t count = i;
223 float32_t acc0;
224 float32_t acc1;
225
226 pX = pA;
227 pY = pB;
228 /*
229 * compute 2 accumulators per loop
230 * size is decrementing for successive accumulators
231 * X pointer is incrementing for successive accumulators
232 */
233 MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count);
234
235 *pDst++ = acc0;
236 *pDst++ = acc1;
237 pA += 2;
238 }
239 for (; i >= 1; i--)
240 {
241 int32_t count = i;
242 float32_t acc;
243
244 pX = pA;
245 pY = pB;
246 MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
247
248 *pDst++ = acc;
249 pA++;
250 }
251 }
252 #else
arm_conv_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst)253 ARM_DSP_ATTRIBUTE void arm_conv_f32(
254 const float32_t * pSrcA,
255 uint32_t srcALen,
256 const float32_t * pSrcB,
257 uint32_t srcBLen,
258 float32_t * pDst)
259 {
260
261 #if defined(ARM_MATH_DSP)
262
263 const float32_t *pIn1; /* InputA pointer */
264 const float32_t *pIn2; /* InputB pointer */
265 float32_t *pOut = pDst; /* Output pointer */
266 const float32_t *px; /* Intermediate inputA pointer */
267 const float32_t *py; /* Intermediate inputB pointer */
268 const float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
269 float32_t sum; /* Accumulators */
270 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
271 uint32_t j, k, count, blkCnt; /* Loop counters */
272
273
274 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
275 float32_t acc0, acc1, acc2, acc3, c0; /* Accumulators */
276 #if !defined(ARM_MATH_NEON)
277 float32_t x0, x1, x2, x3; /* Temporary variables to hold state and coefficient values */
278 #endif
279 #endif
280
281 /* The algorithm implementation is based on the lengths of the inputs. */
282 /* srcB is always made to slide across srcA. */
283 /* So srcBLen is always considered as shorter or equal to srcALen */
284 if (srcALen >= srcBLen)
285 {
286 /* Initialization of inputA pointer */
287 pIn1 = pSrcA;
288
289 /* Initialization of inputB pointer */
290 pIn2 = pSrcB;
291 }
292 else
293 {
294 /* Initialization of inputA pointer */
295 pIn1 = pSrcB;
296
297 /* Initialization of inputB pointer */
298 pIn2 = pSrcA;
299
300 /* srcBLen is always considered as shorter or equal to srcALen */
301 j = srcBLen;
302 srcBLen = srcALen;
303 srcALen = j;
304 }
305
306 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
307 /* The function is internally
308 * divided into three stages according to the number of multiplications that has to be
309 * taken place between inputA samples and inputB samples. In the first stage of the
310 * algorithm, the multiplications increase by one for every iteration.
311 * In the second stage of the algorithm, srcBLen number of multiplications are done.
312 * In the third stage of the algorithm, the multiplications decrease by one
313 * for every iteration. */
314
315 /* The algorithm is implemented in three stages.
316 The loop counters of each stage is initiated here. */
317 blockSize1 = srcBLen - 1U;
318 blockSize2 = srcALen - (srcBLen - 1U);
319 blockSize3 = blockSize1;
320
321 /* --------------------------
322 * Initializations of stage1
323 * -------------------------*/
324
325 /* sum = x[0] * y[0]
326 * sum = x[0] * y[1] + x[1] * y[0]
327 * ....
328 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
329 */
330
331 /* In this stage the MAC operations are increased by 1 for every iteration.
332 The count variable holds the number of MAC operations performed */
333 count = 1U;
334
335 /* Working pointer of inputA */
336 px = pIn1;
337
338 /* Working pointer of inputB */
339 py = pIn2;
340
341
342 /* ------------------------
343 * Stage1 process
344 * ----------------------*/
345 #if defined(ARM_MATH_NEON)
346 float32x4_t vec1;
347 float32x4_t vec2;
348 float32x4_t res = vdupq_n_f32(0) ;
349 float32x2_t accum = vdup_n_f32(0);
350 #endif /* #if defined(ARM_MATH_NEON) */
351
352 /* The first stage starts here */
353 while (blockSize1 > 0U)
354 {
355 /* Accumulator is made zero for every iteration */
356 sum = 0.0f;
357
358 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
359 /* Loop unrolling: Compute 4 outputs at a time */
360 k = count >> 2U;
361
362 #if defined(ARM_MATH_NEON)
363 res = vdupq_n_f32(0) ;
364 accum = vdup_n_f32(0);
365
366 /* Compute 4 MACs simultaneously. */
367 k = count >> 2U;
368
369 /* First part of the processing. Compute 4 MACs at a time.
370 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
371
372 while (k > 0U)
373 {
374 vec1 = vld1q_f32(px);
375 vec2 = vld1q_f32(py-3);
376 vec2 = vrev64q_f32(vec2);
377 vec2 = vcombine_f32(vget_high_f32(vec2), vget_low_f32(vec2));
378
379 res = vmlaq_f32(res,vec1, vec2);
380
381 /* Increment pointers */
382 px += 4;
383 py -= 4;
384
385 /* Decrement the loop counter */
386 k--;
387 }
388
389 accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
390 sum += accum[0] + accum[1];
391
392 /* If the count is not a multiple of 4, compute any remaining MACs here.
393 ** No loop unrolling is used. */
394 k = count & 3;
395 #else
396 while (k > 0U)
397 {
398 /* x[0] * y[srcBLen - 1] */
399 sum += *px++ * *py--;
400
401 /* x[1] * y[srcBLen - 2] */
402 sum += *px++ * *py--;
403
404 /* x[2] * y[srcBLen - 3] */
405 sum += *px++ * *py--;
406
407 /* x[3] * y[srcBLen - 4] */
408 sum += *px++ * *py--;
409
410 /* Decrement loop counter */
411 k--;
412 }
413
414 /* Loop unrolling: Compute remaining outputs */
415 k = count % 0x4U;
416
417 #endif /* #if defined(ARM_MATH_NEON) */
418
419 #else /* defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
420 /* Initialize k with number of samples */
421 k = count;
422
423 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
424
425 while (k > 0U)
426 {
427 /* Perform the multiply-accumulate */
428 sum += *px++ * *py--;
429
430 /* Decrement loop counter */
431 k--;
432 }
433
434 /* Store the result in the accumulator in the destination buffer. */
435 *pOut++ = sum;
436
437 /* Update the inputA and inputB pointers for next MAC calculation */
438 py = pIn2 + count;
439 px = pIn1;
440
441 /* Increment MAC count */
442 count++;
443
444 /* Decrement loop counter */
445 blockSize1--;
446 }
447
448 /* --------------------------
449 * Initializations of stage2
450 * ------------------------*/
451
452 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
453 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
454 * ....
455 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
456 */
457
458 /* Working pointer of inputA */
459 px = pIn1;
460
461 /* Working pointer of inputB */
462 pSrc2 = pIn2 + (srcBLen - 1U);
463 py = pSrc2;
464
465 /* count is index by which the pointer pIn1 to be incremented */
466 count = 0U;
467
468 /* -------------------
469 * Stage2 process
470 * ------------------*/
471
472 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
473 * So, to loop unroll over blockSize2,
474 * srcBLen should be greater than or equal to 4 */
475 if (srcBLen >= 4U)
476 {
477
478 #if defined(ARM_MATH_NEON)
479 float32x4_t c;
480 float32x4_t x1v;
481 float32x4_t x2v;
482 float32x4_t x;
483 float32x4_t res = vdupq_n_f32(0) ;
484 #endif /* #if defined(ARM_MATH_NEON) */
485
486 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
487
488 /* Loop unrolling: Compute 4 outputs at a time */
489 blkCnt = blockSize2 >> 2U;
490
491 while (blkCnt > 0U)
492 {
493 /* Set all accumulators to zero */
494 acc0 = 0.0f;
495 acc1 = 0.0f;
496 acc2 = 0.0f;
497 acc3 = 0.0f;
498
499 /* Apply loop unrolling and compute 4 MACs simultaneously. */
500 k = srcBLen >> 2U;
501
502 #if defined(ARM_MATH_NEON)
503 res = vdupq_n_f32(0) ;
504
505 x1v = vld1q_f32(px);
506 x2v = vld1q_f32(px+4);
507
508 do
509 {
510 c = vld1q_f32(py-3);
511
512 px += 4;
513 x = x1v;
514 res = vmlaq_n_f32(res,x,c[3]);
515
516 x = vextq_f32(x1v,x2v,1);
517
518 res = vmlaq_n_f32(res,x,c[2]);
519
520 x = vextq_f32(x1v,x2v,2);
521
522 res = vmlaq_n_f32(res,x,c[1]);
523
524 x = vextq_f32(x1v,x2v,3);
525
526 res = vmlaq_n_f32(res,x,c[0]);
527
528 py -= 4;
529
530 x1v = x2v ;
531 x2v = vld1q_f32(px+4);
532
533 } while (--k);
534
535
536 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
537 ** No loop unrolling is used. */
538 k = srcBLen & 0x3;
539
540 x1v = vld1q_f32(px);
541 px += 4;
542
543 while (k > 0U)
544 {
545 /* Read y[srcBLen - 5] sample */
546 c0 = *(py--);
547
548 res = vmlaq_n_f32(res,x1v,c0);
549
550 /* Reuse the present samples for the next MAC */
551 x1v[0] = x1v[1];
552 x1v[1] = x1v[2];
553 x1v[2] = x1v[3];
554
555 x1v[3] = *(px++);
556
557 /* Decrement the loop counter */
558 k--;
559 }
560
561 acc0 = res[0];
562 acc1 = res[1];
563 acc2 = res[2];
564 acc3 = res[3];
565
566 #else
567 /* read x[0], x[1], x[2] samples */
568 x0 = *px++;
569 x1 = *px++;
570 x2 = *px++;
571
572 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
573 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
574 do
575 {
576 /* Read y[srcBLen - 1] sample */
577 c0 = *py--;
578 /* Read x[3] sample */
579 x3 = *(px);
580
581 /* Perform the multiply-accumulate */
582 /* acc0 += x[0] * y[srcBLen - 1] */
583 acc0 += x0 * c0;
584 /* acc1 += x[1] * y[srcBLen - 1] */
585 acc1 += x1 * c0;
586 /* acc2 += x[2] * y[srcBLen - 1] */
587 acc2 += x2 * c0;
588 /* acc3 += x[3] * y[srcBLen - 1] */
589 acc3 += x3 * c0;
590
591 /* Read y[srcBLen - 2] sample */
592 c0 = *py--;
593 /* Read x[4] sample */
594 x0 = *(px + 1U);
595
596 /* Perform the multiply-accumulate */
597 /* acc0 += x[1] * y[srcBLen - 2] */
598 acc0 += x1 * c0;
599 /* acc1 += x[2] * y[srcBLen - 2] */
600 acc1 += x2 * c0;
601 /* acc2 += x[3] * y[srcBLen - 2] */
602 acc2 += x3 * c0;
603 /* acc3 += x[4] * y[srcBLen - 2] */
604 acc3 += x0 * c0;
605
606 /* Read y[srcBLen - 3] sample */
607 c0 = *py--;
608 /* Read x[5] sample */
609 x1 = *(px + 2U);
610
611 /* Perform the multiply-accumulate */
612 /* acc0 += x[2] * y[srcBLen - 3] */
613 acc0 += x2 * c0;
614 /* acc1 += x[3] * y[srcBLen - 2] */
615 acc1 += x3 * c0;
616 /* acc2 += x[4] * y[srcBLen - 2] */
617 acc2 += x0 * c0;
618 /* acc3 += x[5] * y[srcBLen - 2] */
619 acc3 += x1 * c0;
620
621 /* Read y[srcBLen - 4] sample */
622 c0 = *py--;
623 /* Read x[6] sample */
624 x2 = *(px + 3U);
625 px += 4U;
626
627 /* Perform the multiply-accumulate */
628 /* acc0 += x[3] * y[srcBLen - 4] */
629 acc0 += x3 * c0;
630 /* acc1 += x[4] * y[srcBLen - 4] */
631 acc1 += x0 * c0;
632 /* acc2 += x[5] * y[srcBLen - 4] */
633 acc2 += x1 * c0;
634 /* acc3 += x[6] * y[srcBLen - 4] */
635 acc3 += x2 * c0;
636
637 } while (--k);
638
639 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
640 ** No loop unrolling is used. */
641 k = srcBLen % 0x4U;
642
643 while (k > 0U)
644 {
645 /* Read y[srcBLen - 5] sample */
646 c0 = *py--;
647 /* Read x[7] sample */
648 x3 = *px++;
649
650 /* Perform the multiply-accumulate */
651 /* acc0 += x[4] * y[srcBLen - 5] */
652 acc0 += x0 * c0;
653 /* acc1 += x[5] * y[srcBLen - 5] */
654 acc1 += x1 * c0;
655 /* acc2 += x[6] * y[srcBLen - 5] */
656 acc2 += x2 * c0;
657 /* acc3 += x[7] * y[srcBLen - 5] */
658 acc3 += x3 * c0;
659
660 /* Reuse the present samples for the next MAC */
661 x0 = x1;
662 x1 = x2;
663 x2 = x3;
664
665 /* Decrement the loop counter */
666 k--;
667 }
668 #endif /* #if defined(ARM_MATH_NEON) */
669
670 /* Store the result in the accumulator in the destination buffer. */
671 *pOut++ = acc0;
672 *pOut++ = acc1;
673 *pOut++ = acc2;
674 *pOut++ = acc3;
675
676 /* Increment the pointer pIn1 index, count by 4 */
677 count += 4U;
678
679 /* Update the inputA and inputB pointers for next MAC calculation */
680 px = pIn1 + count;
681 py = pSrc2;
682
683 /* Decrement the loop counter */
684 blkCnt--;
685 }
686
687 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
688 ** No loop unrolling is used. */
689 blkCnt = blockSize2 % 0x4U;
690
691 #else
692
693 /* Initialize blkCnt with number of samples */
694 blkCnt = blockSize2;
695
696 #endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined (ARM_MATH_NEON)*/
697
698 while (blkCnt > 0U)
699 {
700 /* Accumulator is made zero for every iteration */
701 sum = 0.0f;
702
703 #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)
704 /* Loop unrolling: Compute 4 outputs at a time */
705 k = srcBLen >> 2U;
706
707 #if defined (ARM_MATH_NEON)
708 float32x4_t res = vdupq_n_f32(0) ;
709 float32x4_t x = vdupq_n_f32(0) ;
710 float32x4_t y = vdupq_n_f32(0) ;
711 float32x2_t accum = vdup_n_f32(0) ;
712
713 /* First part of the processing. Compute 4 MACs at a time.
714 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
715 while (k > 0U)
716 {
717 x = vld1q_f32(px);
718 y = vld1q_f32(py-3);
719
720 y = vrev64q_f32(y);
721 y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
722
723 res = vmlaq_f32(res,x,y);
724
725 px += 4 ;
726 py -= 4 ;
727
728 /* Decrement the loop counter */
729 k--;
730 }
731
732 accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
733 sum += accum[0] + accum[1];
734
735 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
736 ** No loop unrolling is used. */
737 k = srcBLen & 0x3U;
738
739 #else
740 while (k > 0U)
741 {
742 /* Perform the multiply-accumulate */
743 sum += *px++ * *py--;
744 sum += *px++ * *py--;
745 sum += *px++ * *py--;
746 sum += *px++ * *py--;
747
748 /* Decrement loop counter */
749 k--;
750 }
751
752 /* Loop unrolling: Compute remaining outputs */
753 k = srcBLen % 0x4U;
754
755 #endif /* if defined (ARM_MATH_NEON) */
756 #else
757 /* Initialize blkCnt with number of samples */
758 k = srcBLen;
759
760 #endif /* #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL) */
761
762 while (k > 0U)
763 {
764 /* Perform the multiply-accumulate */
765 sum += *px++ * *py--;
766
767 /* Decrement the loop counter */
768 k--;
769 }
770
771 /* Store the result in the accumulator in the destination buffer. */
772 *pOut++ = sum;
773
774 /* Increment the MAC count */
775 count++;
776
777 /* Update the inputA and inputB pointers for next MAC calculation */
778 px = pIn1 + count;
779 py = pSrc2;
780
781 /* Decrement the loop counter */
782 blkCnt--;
783 }
784 }
785 else
786 {
787 /* If the srcBLen is not a multiple of 4,
788 * the blockSize2 loop cannot be unrolled by 4 */
789 blkCnt = blockSize2;
790
791 while (blkCnt > 0U)
792 {
793 /* Accumulator is made zero for every iteration */
794 sum = 0.0f;
795
796 /* srcBLen number of MACS should be performed */
797 k = srcBLen;
798
799 while (k > 0U)
800 {
801 /* Perform the multiply-accumulate */
802 sum += *px++ * *py--;
803
804 /* Decrement the loop counter */
805 k--;
806 }
807
808 /* Store the result in the accumulator in the destination buffer. */
809 *pOut++ = sum;
810
811 /* Increment the MAC count */
812 count++;
813
814 /* Update the inputA and inputB pointers for next MAC calculation */
815 px = pIn1 + count;
816 py = pSrc2;
817
818 /* Decrement the loop counter */
819 blkCnt--;
820 }
821 }
822
823
824 /* --------------------------
825 * Initializations of stage3
826 * -------------------------*/
827
828 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
829 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
830 * ....
831 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
832 * sum += x[srcALen-1] * y[srcBLen-1]
833 */
834
835 /* In this stage the MAC operations are decreased by 1 for every iteration.
836 The blockSize3 variable holds the number of MAC operations performed */
837
838 /* Working pointer of inputA */
839 pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
840 px = pSrc1;
841
842 /* Working pointer of inputB */
843 pSrc2 = pIn2 + (srcBLen - 1U);
844 py = pSrc2;
845
846 /* -------------------
847 * Stage3 process
848 * ------------------*/
849 while (blockSize3 > 0U)
850 {
851 /* Accumulator is made zero for every iteration */
852 sum = 0.0f;
853
854 #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
855 /* Loop unrolling: Compute 4 outputs at a time */
856 k = blockSize3 >> 2U;
857
858 #if defined(ARM_MATH_NEON)
859 float32x4_t res = vdupq_n_f32(0) ;
860 float32x4_t x = vdupq_n_f32(0) ;
861 float32x4_t y = vdupq_n_f32(0) ;
862 float32x2_t accum = vdup_n_f32(0) ;
863
864 while (k > 0U)
865 {
866 x = vld1q_f32(px);
867 y = vld1q_f32(py-3);
868
869 y = vrev64q_f32(y);
870 y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
871
872 res = vmlaq_f32(res,x,y);
873
874 px += 4 ;
875 py -= 4 ;
876
877 /* Decrement the loop counter */
878 k--;
879 }
880
881 accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
882 sum += accum[0] + accum[1];
883
884 #else
885 while (k > 0U)
886 {
887 /* Perform the multiply-accumulate */
888 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
889 sum += *px++ * *py--;
890
891 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
892 sum += *px++ * *py--;
893
894 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
895 sum += *px++ * *py--;
896
897 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
898 sum += *px++ * *py--;
899
900 /* Decrement loop counter */
901 k--;
902 }
903 #endif /* #if defined (ARM_MATH_NEON) */
904
905 /* Loop unrolling: Compute remaining outputs */
906 k = blockSize3 % 0x4U;
907 #else
908
909 /* Initialize blkCnt with number of samples */
910 k = blockSize3;
911
912 #endif /* #if defined (ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)*/
913
914 while (k > 0U)
915 {
916 /* Perform the multiply-accumulate */
917 /* sum += x[srcALen-1] * y[srcBLen-1] */
918 sum += *px++ * *py--;
919
920 /* Decrement loop counter */
921 k--;
922 }
923
924 /* Store the result in the accumulator in the destination buffer. */
925 *pOut++ = sum;
926
927 /* Update the inputA and inputB pointers for next MAC calculation */
928 px = ++pSrc1;
929 py = pSrc2;
930
931 /* Decrement the loop counter */
932 blockSize3--;
933 }
934
935 #else
936 /* alternate version for CM0_FAMILY */
937
938 const float32_t *pIn1 = pSrcA; /* InputA pointer */
939 const float32_t *pIn2 = pSrcB; /* InputB pointer */
940 float32_t sum; /* Accumulator */
941 uint32_t i, j; /* Loop counters */
942
943 /* Loop to calculate convolution for output length number of times */
944 for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
945 {
946 /* Initialize sum with zero to carry out MAC operations */
947 sum = 0.0f;
948
949 /* Loop to perform MAC operations according to convolution equation */
950 for (j = 0U; j <= i; j++)
951 {
952 /* Check the array limitations */
953 if (((i - j) < srcBLen) && (j < srcALen))
954 {
955 /* z[i] += x[i-j] * y[j] */
956 sum += ( pIn1[j] * pIn2[i - j]);
957 }
958 }
959
960 /* Store the output in the destination buffer */
961 pDst[i] = sum;
962 }
963
964 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
965
966 }
967 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
968
969 /**
970 @} end of Conv group
971 */
972