/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_correlate_f32.c
 * Description:  Correlation of floating-point sequences
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/filtering_functions.h"

/**
  @ingroup groupFilters
 */

/**
  @defgroup Corr Correlation

  Correlation is a mathematical operation that is similar to convolution.
  As with convolution, correlation uses two signals to produce a third signal.
  The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
  Correlation is commonly used to measure the similarity between two signals.
  It has applications in pattern recognition, cryptanalysis, and searching.
  The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
  Fast versions of the Q15 and Q31 functions are also provided.
  @par           Algorithm
                   Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
                   The convolution of the two signals is denoted by
                   \f[
                   c[n] = a[n] * b[n]
                   \f]

                   In correlation, one of the signals is flipped in time
                   \f[
                   c[n] = a[n] * b[-n]
                   \f]
  @par
                   and this is mathematically defined as
                   \f[
                   c[n] = \sum_{k=0}^{srcALen-1} a[k] b[k-n]
                   \f]
  @par
                   <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
                   The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
                   The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.

  @note
                   <code>pDst</code> should be initialized to all zeros before being used.
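
  @par           Example
                   A minimal calling sequence is sketched below; the buffer lengths are illustrative only.
                   The output buffer holds <code>2 * max(srcALen, srcBLen) - 1</code> samples and is cleared
                   before the call, as required by the note above (memset() comes from <string.h>).
                   <pre>
                       #define SRCA_LEN   32
                       #define SRCB_LEN    8
                       #define DST_LEN    (2 * SRCA_LEN - 1)   // 2 * max(srcALen, srcBLen) - 1

                       float32_t srcA[SRCA_LEN];
                       float32_t srcB[SRCB_LEN];
                       float32_t dst[DST_LEN];

                       memset(dst, 0, sizeof(dst));            // clear pDst before use
                       arm_correlate_f32(srcA, SRCA_LEN, srcB, SRCB_LEN, dst);
                   </pre>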

  @par           Fixed-Point Behavior
                   Correlation requires summing up a large number of intermediate products.
                   As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
                   Refer to the function-specific documentation below for further details of the particular algorithm used.

  @par           Fast Versions
                   Fast versions are supported for Q31 and Q15. They execute in fewer cycles than the standard Q31 and Q15
                   correlation functions, but the design requires the input signals to be scaled down to avoid intermediate overflows.

  @par           Opt Versions
                   Opt versions are supported for Q15 and Q7. These versions use an internal scratch buffer to reduce the cycle count.
                   They are faster but consume more memory (scratch memory) than the standard Q15 and Q7 correlation functions.

  @par           Long versions
                   For correlation of long vectors, these functions are no longer suitable and will be very slow.
                   An implementation based upon FFTs should be used instead.
 */

/**
  @addtogroup Corr
  @{
 */

/**
  @brief         Correlation of floating-point sequences.
  @param[in]     pSrcA      points to the first input sequence
  @param[in]     srcALen    length of the first input sequence
  @param[in]     pSrcB      points to the second input sequence
  @param[in]     srcBLen    length of the second input sequence
  @param[out]    pDst       points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
 */

#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"
#include "arm_vec_filtering.h"


ARM_DSP_ATTRIBUTE void arm_correlate_f32(
  const float32_t * pSrcA,
        uint32_t srcALen,
  const float32_t * pSrcB,
        uint32_t srcBLen,
        float32_t * pDst)
{
    const float32_t *pIn1 = pSrcA;                  /* inputA pointer */
    const float32_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
    const float32_t *pX, *pY;
    const float32_t *pA, *pB;
    int32_t   i = 0U, j = 0;                        /* loop counters */
    int32_t   inv = 4U;                             /* Reverse order flag: +4 forward, -4 when the output is written in reverse */
    uint32_t  tot = 0U;                             /* Length */
    int32_t   block1, block2, block3;
    int32_t   incr;

    tot = ((srcALen + srcBLen) - 2U);
    if (srcALen > srcBLen)
    {
        /*
         * Calculating the number of zeros to be padded to the output
         */
        j = srcALen - srcBLen;
        /*
         * Initialize the pointer after zero padding
         */
        pDst += j;
    }
    else if (srcALen < srcBLen)
    {
        /*
         * Initialization to inputB pointer
         */
        pIn1 = pSrcB;
        /*
         * Initialization to the end of inputA pointer
         */
        pIn2 = pSrcA + (srcALen - 1U);
        /*
         * Initialisation of the pointer after zero padding
         */
        pDst = pDst + tot;
        /*
         * Swapping the lengths
         */

        j = srcALen;
        srcALen = srcBLen;
        srcBLen = j;
        /*
         * Setting the reverse flag
         */
        inv = -4;
    }

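    /*
     * The output is produced in three blocks, mirroring the three stages of the
     * scalar implementation: block1 covers the first srcBLen - 1 lags where the
     * overlap between the sequences grows, block2 the srcALen - srcBLen + 1 lags
     * with full overlap of srcBLen samples, and block3 the last srcBLen - 1 lags
     * where the overlap shrinks again.
     */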
    block1 = srcBLen - 1;
    block2 = srcALen - srcBLen + 1;
    block3 = srcBLen - 1;

    pA = pIn1;
    pB = pIn2;
    incr = inv / 4;

    for (i = 0U; i <= block1 - 2; i += 2)
    {
        uint32_t  count = i + 1;
        float32_t acc0;
        float32_t acc1;

        /*
         * compute 2 accumulators per loop
         * size is incrementing for second accumulator
         * Y pointer is decrementing for second accumulator
         */
        pX = pA;
        pY = pB;
        MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        pB -= 2;
    }
    for (; i < block1; i++)
    {
        uint32_t  count = i + 1;
        float32_t acc;

        pX = pA;
        pY = pB;
        MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count);

        *pDst = acc;
        pDst += incr;
        pB--;
    }

    for (i = 0U; i <= block2 - 4; i += 4)
    {
        float32_t acc0;
        float32_t acc1;
        float32_t acc2;
        float32_t acc3;

        pX = pA;
        pY = pB;
        /*
         * compute 4 accumulators per loop
         * size is fixed for all accumulators
         * X pointer is incrementing for successive accumulators
         */
        MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F32(acc0, acc1, acc2, acc3, pX, pY, srcBLen);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        *pDst = acc2;
        pDst += incr;
        *pDst = acc3;
        pDst += incr;
        pA += 4;
    }

    for (; i <= block2 - 2; i += 2)
    {
        float32_t acc0;
        float32_t acc1;

        pX = pA;
        pY = pB;
        /*
         * compute 2 accumulators per loop
         * size is fixed for all accumulators
         * X pointer is incrementing for second accumulator
         */
        MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, srcBLen);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        pA += 2;
    }

    if (block2 & 1)
    {
        float32_t acc;

        pX = pA;
        pY = pB;
        MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, srcBLen);

        *pDst = acc;
        pDst += incr;
        pA++;
    }

    for (i = block3 - 1; i > 0; i -= 2)
    {
        uint32_t  count = (i + 1);
        float32_t acc0;
        float32_t acc1;

        pX = pA;
        pY = pB;
        /*
         * compute 2 accumulators per loop
         * size is decrementing for second accumulator
         * X pointer is incrementing for second accumulator
         */
        MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        pA += 2;
    }
    for (; i >= 0; i--)
    {
        uint32_t  count = (i + 1);
        float32_t acc;

        pX = pA;
        pY = pB;
        MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count);

        *pDst = acc;
        pDst += incr;
        pA++;
    }
}

#else
ARM_DSP_ATTRIBUTE void arm_correlate_f32(
  const float32_t * pSrcA,
        uint32_t srcALen,
  const float32_t * pSrcB,
        uint32_t srcBLen,
        float32_t * pDst)
{

#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE)

  const float32_t *pIn1;                               /* InputA pointer */
  const float32_t *pIn2;                               /* InputB pointer */
        float32_t *pOut = pDst;                        /* Output pointer */
  const float32_t *px;                                 /* Intermediate inputA pointer */
  const float32_t *py;                                 /* Intermediate inputB pointer */
  const float32_t *pSrc1;
        float32_t sum;
        uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
        uint32_t j, k, count, blkCnt;                  /* Loop counters */
        uint32_t outBlockSize;                         /* Number of output samples */
        int32_t inc = 1;                               /* Destination address modifier */

#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
        float32_t acc0, acc1, acc2, acc3, c0;          /* Accumulators */
#if !defined(ARM_MATH_NEON)
        float32_t x0, x1, x2, x3;                      /* Temporary variables for holding input and coefficient values */
#endif
#endif

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter or equal to srcALen */
  /* But CORR(x, y) is reverse of CORR(y, x) */
  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  /* and the destination pointer modifier, inc is set to -1 */
  /* If srcALen > srcBLen, zero padding has to be done to srcB to make the two inputs of same length */
  /* But to improve the performance,
   * we assume zeroes in the output instead of zero padding either of the inputs */
  /* If srcALen > srcBLen,
   * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
  /* If srcALen < srcBLen,
   * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
  if (srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;

    /* Number of output samples is calculated */
    outBlockSize = (2U * srcALen) - 1U;

    /* When srcALen > srcBLen, zero padding has to be done to srcB
     * to make their lengths equal.
     * Instead, (outBlockSize - (srcALen + srcBLen - 1))
     * number of output samples are made zero */
    j = outBlockSize - (srcALen + (srcBLen - 1U));

    /* Updating the pointer position to non zero value */
    pOut += j;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* srcBLen is always considered as shorter or equal to srcALen */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = Reverse order(CORR(y, x)) */
    /* Hence set the destination pointer to point to the last output sample */
    pOut = pDst + ((srcALen + srcBLen) - 2U);

    /* Destination address modifier is set to -1 */
    inc = -1;
  }

  /* The function is internally
   * divided into three stages according to the number of multiplications that have to be
   * performed between inputA samples and inputB samples. In the first stage of the
   * algorithm, the multiplications increase by one for every iteration.
   * In the second stage of the algorithm, srcBLen number of multiplications are done.
   * In the third stage of the algorithm, the multiplications decrease by one
   * for every iteration. */

  /* The algorithm is implemented in three stages.
     The loop counters of each stage are initialized here. */
  blockSize1 = srcBLen - 1U;
  blockSize2 = srcALen - (srcBLen - 1U);
  blockSize3 = blockSize1;
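
  /* Illustrative sizing, assuming srcALen = 8 and srcBLen = 4:
   * blockSize1 = 3 outputs with growing overlap, blockSize2 = 5 outputs with full
   * overlap of srcBLen samples, and blockSize3 = 3 outputs with shrinking overlap,
   * i.e. 3 + 5 + 3 = srcALen + srcBLen - 1 = 11 non-zero samples inside the
   * 2 * srcALen - 1 = 15 sample output buffer. */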

  /* --------------------------
   * Initializations of stage1
   * -------------------------*/

  /* sum = x[0] * y[srcBlen - 1]
   * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
   * ....
   * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
   */

  /* In this stage the MAC operations are increased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
  count = 1U;

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  pSrc1 = pIn2 + (srcBLen - 1U);
  py = pSrc1;

  /* ------------------------
   * Stage1 process
   * ----------------------*/

  /* The first stage starts here */
  while (blockSize1 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)

    /* Loop unrolling: Compute 4 outputs at a time */
    k = count >> 2U;

#if defined(ARM_MATH_NEON)
    float32x4_t x, y;
    float32x4_t res = vdupq_n_f32(0);
    float32x2_t accum = vdup_n_f32(0);

    while (k > 0U)
    {
      x = vld1q_f32(px);
      y = vld1q_f32(py);

      res = vmlaq_f32(res, x, y);

      px += 4;
      py += 4;

      /* Decrement the loop counter */
      k--;
    }

    accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
    sum += accum[0] + accum[1];

    k = count & 0x3;
#else
    /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
    ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* x[0] * y[srcBLen - 4] */
      sum += *px++ * *py++;

      /* x[1] * y[srcBLen - 3] */
      sum += *px++ * *py++;

      /* x[2] * y[srcBLen - 2] */
      sum += *px++ * *py++;

      /* x[3] * y[srcBLen - 1] */
      sum += *px++ * *py++;

      /* Decrement loop counter */
      k--;
    }

    /* Loop unrolling: Compute remaining outputs */
    k = count % 0x4U;

#endif /* #if defined(ARM_MATH_NEON) */
#else

    /* Initialize k with number of samples */
    k = count;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */

    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      /* x[0] * y[srcBLen - 1] */
      sum += *px++ * *py++;

      /* Decrement loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut = sum;
    /* Destination pointer is updated according to the address modifier, inc */
    pOut += inc;

    /* Update the inputA and inputB pointers for next MAC calculation */
    py = pSrc1 - count;
    px = pIn1;

    /* Increment MAC count */
    count++;

    /* Decrement loop counter */
    blockSize1--;
  }

  /* --------------------------
   * Initializations of stage2
   * ------------------------*/

  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
   * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
   * ....
   * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
   */

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  py = pIn2;

  /* count is the index by which the pointer pIn1 is to be incremented */
  count = 0U;

  /* -------------------
   * Stage2 process
   * ------------------*/

  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
   * So, to loop unroll over blockSize2,
   * srcBLen should be greater than or equal to 4 */
  if (srcBLen >= 4U)
  {
#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = blockSize2 >> 2U;

#if defined(ARM_MATH_NEON)
    float32x4_t c;
    float32x4_t x1v;
    float32x4_t x2v;
    float32x4_t x;
    float32x4_t res = vdupq_n_f32(0);
#endif /* #if defined(ARM_MATH_NEON) */

    while (blkCnt > 0U)
    {
      /* Set all accumulators to zero */
      acc0 = 0.0f;
      acc1 = 0.0f;
      acc2 = 0.0f;
      acc3 = 0.0f;

#if defined(ARM_MATH_NEON)
      /* Compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      res = vdupq_n_f32(0);

      x1v = vld1q_f32(px);
      px += 4;
      do
      {
        x2v = vld1q_f32(px);
        c = vld1q_f32(py);

        py += 4;

        x = x1v;
        res = vmlaq_n_f32(res, x, c[0]);

        x = vextq_f32(x1v, x2v, 1);

        res = vmlaq_n_f32(res, x, c[1]);

        x = vextq_f32(x1v, x2v, 2);

        res = vmlaq_n_f32(res, x, c[2]);

        x = vextq_f32(x1v, x2v, 3);

        res = vmlaq_n_f32(res, x, c[3]);

        x1v = x2v;
        px += 4;
      } while (--k);

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
      ** No loop unrolling is used. */
      k = srcBLen & 0x3;

      while (k > 0U)
      {
        /* Read y[srcBLen - 5] sample */
        c0 = *(py++);

        res = vmlaq_n_f32(res, x1v, c0);

        /* Reuse the present samples for the next MAC */
        x1v[0] = x1v[1];
        x1v[1] = x1v[2];
        x1v[2] = x1v[3];

        x1v[3] = *(px++);

        /* Decrement the loop counter */
        k--;
      }

      px -= 1;

      acc0 = res[0];
      acc1 = res[1];
      acc2 = res[2];
      acc3 = res[3];
#else
      /* read x[0], x[1], x[2] samples */
      x0 = *px++;
      x1 = *px++;
      x2 = *px++;

      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      do
      {
        /* Read y[0] sample */
        c0 = *(py++);
        /* Read x[3] sample */
        x3 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 += x[0] * y[0] */
        acc0 += x0 * c0;
        /* acc1 += x[1] * y[0] */
        acc1 += x1 * c0;
        /* acc2 += x[2] * y[0] */
        acc2 += x2 * c0;
        /* acc3 += x[3] * y[0] */
        acc3 += x3 * c0;

        /* Read y[1] sample */
        c0 = *(py++);
        /* Read x[4] sample */
        x0 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 += x[1] * y[1] */
        acc0 += x1 * c0;
        /* acc1 += x[2] * y[1] */
        acc1 += x2 * c0;
        /* acc2 += x[3] * y[1] */
        acc2 += x3 * c0;
        /* acc3 += x[4] * y[1] */
        acc3 += x0 * c0;

        /* Read y[2] sample */
        c0 = *(py++);
        /* Read x[5] sample */
        x1 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 += x[2] * y[2] */
        acc0 += x2 * c0;
        /* acc1 += x[3] * y[2] */
        acc1 += x3 * c0;
        /* acc2 += x[4] * y[2] */
        acc2 += x0 * c0;
        /* acc3 += x[5] * y[2] */
        acc3 += x1 * c0;

        /* Read y[3] sample */
        c0 = *(py++);
        /* Read x[6] sample */
        x2 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 += x[3] * y[3] */
        acc0 += x3 * c0;
        /* acc1 += x[4] * y[3] */
        acc1 += x0 * c0;
        /* acc2 += x[5] * y[3] */
        acc2 += x1 * c0;
        /* acc3 += x[6] * y[3] */
        acc3 += x2 * c0;

      } while (--k);

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
      ** No loop unrolling is used. */
      k = srcBLen % 0x4U;

      while (k > 0U)
      {
        /* Read y[4] sample */
        c0 = *(py++);
        /* Read x[7] sample */
        x3 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 += x[4] * y[4] */
        acc0 += x0 * c0;
        /* acc1 += x[5] * y[4] */
        acc1 += x1 * c0;
        /* acc2 += x[6] * y[4] */
        acc2 += x2 * c0;
        /* acc3 += x[7] * y[4] */
        acc3 += x3 * c0;

        /* Reuse the present samples for the next MAC */
        x0 = x1;
        x1 = x2;
        x2 = x3;

        /* Decrement the loop counter */
        k--;
      }

#endif /* #if defined(ARM_MATH_NEON) */

      /* Store the result in the accumulator in the destination buffer. */
      *pOut = acc0;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      *pOut = acc1;
      pOut += inc;

      *pOut = acc2;
      pOut += inc;

      *pOut = acc3;
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 4 */
      count += 4U;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement loop counter */
      blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs */
    blkCnt = blockSize2 % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize2;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)

      /* Loop unrolling: Compute 4 outputs at a time */
      k = srcBLen >> 2U;

#if defined(ARM_MATH_NEON)
      float32x4_t x, y;
      float32x4_t res = vdupq_n_f32(0);
      float32x2_t accum = vdup_n_f32(0);

      while (k > 0U)
      {
        x = vld1q_f32(px);
        y = vld1q_f32(py);

        res = vmlaq_f32(res, x, y);

        px += 4;
        py += 4;
        /* Decrement the loop counter */
        k--;
      }

      accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
      sum += accum[0] + accum[1];
#else
      /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;

        /* Decrement loop counter */
        k--;
      }
#endif /* #if defined(ARM_MATH_NEON) */
      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
      ** No loop unrolling is used. */
      k = srcBLen % 0x4U;
#else

      /* Initialize k with number of samples */
      k = srcBLen;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut = sum;

      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 1 */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* If srcBLen is smaller than 4,
     * the blockSize2 loop cannot be unrolled by 4 */
    blkCnt = blockSize2;

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* Loop over srcBLen */
      k = srcBLen;

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut = sum;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 1 */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }


  /* --------------------------
   * Initializations of stage3
   * -------------------------*/

  /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
   * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
   * ....
   * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
   * sum += x[srcALen-1] * y[0]
   */

  /* In this stage the MAC operations are decreased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
  count = srcBLen - 1U;

  /* Working pointer of inputA */
  pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
  px = pSrc1;

  /* Working pointer of inputB */
  py = pIn2;

  /* -------------------
   * Stage3 process
   * ------------------*/

  while (blockSize3 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)

    /* Loop unrolling: Compute 4 outputs at a time */
    k = count >> 2U;

#if defined(ARM_MATH_NEON)
    float32x4_t x, y;
    float32x4_t res = vdupq_n_f32(0);
    float32x2_t accum = vdup_n_f32(0);

    while (k > 0U)
    {
      x = vld1q_f32(px);
      y = vld1q_f32(py);

      res = vmlaq_f32(res, x, y);

      px += 4;
      py += 4;

      /* Decrement the loop counter */
      k--;
    }

    accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
    sum += accum[0] + accum[1];
#else
    /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
    ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      /* sum += x[srcALen - srcBLen + 4] * y[3] */
      sum += *px++ * *py++;

      /* sum += x[srcALen - srcBLen + 3] * y[2] */
      sum += *px++ * *py++;

      /* sum += x[srcALen - srcBLen + 2] * y[1] */
      sum += *px++ * *py++;

      /* sum += x[srcALen - srcBLen + 1] * y[0] */
      sum += *px++ * *py++;

      /* Decrement loop counter */
      k--;
    }

#endif /* #if defined (ARM_MATH_NEON) */
    /* Loop unrolling: Compute remaining outputs */
    k = count % 0x4U;

#else

    /* Initialize k with number of samples */
    k = count;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */

    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      sum += *px++ * *py++;

      /* Decrement loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut = sum;
    /* Destination pointer is updated according to the address modifier, inc */
    pOut += inc;

    /* Update the inputA and inputB pointers for next MAC calculation */
    px = ++pSrc1;
    py = pIn2;

    /* Decrement MAC count */
    count--;

    /* Decrement the loop counter */
    blockSize3--;
  }

#else
/* alternate version for CM0_FAMILY */

  const float32_t *pIn1 = pSrcA;                   /* inputA pointer */
  const float32_t *pIn2 = pSrcB + (srcBLen - 1U);  /* inputB pointer */
        float32_t sum;                             /* Accumulator */
        uint32_t i = 0U, j;                        /* Loop counters */
        uint32_t inv = 0U;                         /* Reverse order flag */
        uint32_t tot = 0U;                         /* Length */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter or equal to srcALen */
  /* But CORR(x, y) is reverse of CORR(y, x) */
  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  /* and a variable, inv is set to 1 */
  /* If lengths are not equal then zero pad has to be done to make the two
   * inputs of same length. But to improve the performance, we assume zeroes
   * in the output instead of zero padding either of the inputs */
  /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included at the
   * start of the output buffer */
  /* If srcALen < srcBLen, (srcBLen - srcALen) zeroes have to be included at the
   * end of the output buffer */
  /* Once the zero padding is done the remaining of the output is calculated
   * using convolution but with the shorter signal time shifted. */

  /* Calculate the length of the remaining sequence */
  tot = ((srcALen + srcBLen) - 2U);

  if (srcALen > srcBLen)
  {
    /* Calculating the number of zeros to be padded to the output */
    j = srcALen - srcBLen;

    /* Initialise the pointer after zero padding */
    pDst += j;
  }

  else if (srcALen < srcBLen)
  {
    /* Initialization to inputB pointer */
    pIn1 = pSrcB;

    /* Initialization to the end of inputA pointer */
    pIn2 = pSrcA + (srcALen - 1U);

    /* Initialisation of the pointer after zero padding */
    pDst = pDst + tot;

    /* Swapping the lengths */
    j = srcALen;
    srcALen = srcBLen;
    srcBLen = j;

    /* Setting the reverse flag */
    inv = 1;

  }

  /* Loop to calculate convolution for output length number of times */
  for (i = 0U; i <= tot; i++)
  {
    /* Initialize sum with zero to carry out MAC operations */
    sum = 0.0f;

    /* Loop to perform MAC operations according to convolution equation */
    for (j = 0U; j <= i; j++)
    {
      /* Check the array limitations */
      if ((((i - j) < srcBLen) && (j < srcALen)))
      {
        /* z[i] += x[i-j] * y[j] */
        sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
      }
    }

    /* Store the output in the destination buffer */
    if (inv == 1)
      *pDst-- = sum;
    else
      *pDst++ = sum;
  }

#endif /* defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE) */

}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of Corr group
 */