1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_correlate_q15.c
4 * Description: Correlation of Q15 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Corr
37 @{
38 */
39
40 /**
41 @brief Correlation of Q15 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using a 64-bit internal accumulator.
50 Both inputs are in 1.15 format and multiplications yield a 2.30 result.
51 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
52 This approach provides 33 guard bits and there is no risk of overflow.
53 The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
54
55 @remark
56 Refer to \ref arm_correlate_fast_q15() for a faster but less precise version of this function.
57 @remark
58 Refer to \ref arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
59 */
60 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
61 #include "arm_helium_utils.h"
62 #include "arm_vec_filtering.h"
63
arm_correlate_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)64 ARM_DSP_ATTRIBUTE void arm_correlate_q15(
65 const q15_t * pSrcA,
66 uint32_t srcALen,
67 const q15_t * pSrcB,
68 uint32_t srcBLen,
69 q15_t * pDst)
70 {
71 const q15_t *pIn1 = pSrcA; /* inputA pointer */
72 const q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
73 /*
74 * Loop to perform MAC operations according to correlation equation
75 */
76 const q15_t *pX;
77 const q15_t *pY;
78 const q15_t *pA;
79 const q15_t *pB;
80 int32_t i = 0U, j = 0; /* loop counters */
81 int32_t inv = 2U; /* Reverse order flag */
82 uint32_t tot = 0U; /* Length */
83 int32_t block1, block2, block3;
84 int32_t incr;
85
86 tot = ((srcALen + srcBLen) - 2U);
87 if (srcALen > srcBLen)
88 {
89 /*
90 * Calculating the number of zeros to be padded to the output
91 */
92 j = srcALen - srcBLen;
93 /*
94 * Initialize the pointer after zero padding
95 */
96 pDst += j;
97 }
98 else if (srcALen < srcBLen)
99 {
100 /*
101 * Initialization to inputB pointer
102 */
103 pIn1 = pSrcB;
104 /*
105 * Initialization to the end of inputA pointer
106 */
107 pIn2 = pSrcA + (srcALen - 1U);
108 /*
109 * Initialisation of the pointer after zero padding
110 */
111 pDst = pDst + tot;
112 /*
113 * Swapping the lengths
114 */
115 j = srcALen;
116 srcALen = srcBLen;
117 srcBLen = j;
118 /*
119 * Setting the reverse flag
120 */
121 inv = -2;
122 }
123
124 block1 = srcBLen - 1;
125 block2 = srcALen - srcBLen + 1;
126 block3 = srcBLen - 1;
127
128 pA = pIn1;
129 pB = pIn2;
130 incr = inv / 2;
131
132 for (i = 0U; i <= block1 - 2; i += 2)
133 {
134 uint32_t count = i + 1;
135 int64_t acc0 = 0LL;
136 int64_t acc1 = 0LL;
137
138 /*
139 * compute 2 accumulators per loop
140 * size is incrementing for second accumulator
141 * Y pointer is decrementing for second accumulator
142 */
143 pX = pA;
144 pY = pB;
145 MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count);
146
147 *pDst = (q15_t) acc0;
148 pDst += incr;
149 *pDst = (q15_t) acc1;
150 pDst += incr;
151 pB -= 2;
152 }
153 for (; i < block1; i++)
154 {
155 uint32_t count = i + 1;
156 int64_t acc = 0LL;
157
158 pX = pA;
159 pY = pB;
160 MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count);
161
162 *pDst = (q15_t) acc;
163 pDst += incr;
164 pB--;
165 }
166
167 for (i = 0U; i <= block2 - 4; i += 4)
168 {
169 int64_t acc0 = 0LL;
170 int64_t acc1 = 0LL;
171 int64_t acc2 = 0LL;
172 int64_t acc3 = 0LL;
173
174 pX = pA;
175 pY = pB;
176 /*
177 * compute 4 accumulators per loop
178 * size is fixed for all accumulators
179 * X pointer is incrementing for successive accumulators
180 */
181 MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
182
183 *pDst = (q15_t) acc0;
184 pDst += incr;
185 *pDst = (q15_t) acc1;
186 pDst += incr;
187 *pDst = (q15_t) acc2;
188 pDst += incr;
189 *pDst = (q15_t) acc3;
190 pDst += incr;
191 pA += 4;
192 }
193
194 for (; i <= block2 - 2; i += 2)
195 {
196 int64_t acc0 = 0LL;
197 int64_t acc1 = 0LL;
198
199 pX = pA;
200 pY = pB;
201 /*
202 * compute 2 accumulators per loop
203 * size is fixed for all accumulators
204 * X pointer is incrementing for second accumulator
205 */
206 MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, srcBLen);
207
208 *pDst = (q15_t) acc0;
209 pDst += incr;
210 *pDst = (q15_t) acc1;
211 pDst += incr;
212 pA += 2;
213 }
214
215 if (block2 & 1)
216 {
217 int64_t acc = 0LL;
218
219 pX = pA;
220 pY = pB;
221 MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, srcBLen);
222
223 *pDst = (q15_t) acc;
224 pDst += incr;
225 pA++;
226 }
227
228 for (i = block3 - 1; i > 0; i -= 2)
229 {
230
231 uint32_t count = (i + 1);
232 int64_t acc0 = 0LL;
233 int64_t acc1 = 0LL;
234
235 pX = pA;
236 pY = pB;
237 /*
238 * compute 2 accumulators per loop
239 * size is decrementing for second accumulator
240 * X pointer is incrementing for second accumulator
241 */
242 MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count);
243
244 *pDst = (q15_t) acc0;
245 pDst += incr;
246 *pDst = (q15_t) acc1;
247 pDst += incr;
248 pA += 2;
249
250 }
251 for (; i >= 0; i--)
252 {
253 uint32_t count = (i + 1);
254 int64_t acc = 0LL;
255
256 pX = pA;
257 pY = pB;
258 MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count);
259
260 *pDst = (q15_t) acc;
261 pDst += incr;
262 pA++;
263 }
264 }
265
266 #else
arm_correlate_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)267 ARM_DSP_ATTRIBUTE void arm_correlate_q15(
268 const q15_t * pSrcA,
269 uint32_t srcALen,
270 const q15_t * pSrcB,
271 uint32_t srcBLen,
272 q15_t * pDst)
273 {
274
275 #if defined (ARM_MATH_DSP)
276
277 const q15_t *pIn1; /* InputA pointer */
278 const q15_t *pIn2; /* InputB pointer */
279 q15_t *pOut = pDst; /* Output pointer */
280 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
281 const q15_t *px; /* Intermediate inputA pointer */
282 const q15_t *py; /* Intermediate inputB pointer */
283 const q15_t *pSrc1; /* Intermediate pointers */
284 q31_t x0, x1, x2, x3, c0; /* Temporary input variables for holding input and coefficient values */
285 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
286 uint32_t j, k, count, blkCnt; /* Loop counters */
287 uint32_t outBlockSize;
288 int32_t inc = 1; /* Destination address modifier */
289
290 /* The algorithm implementation is based on the lengths of the inputs. */
291 /* srcB is always made to slide across srcA. */
292 /* So srcBLen is always considered as shorter or equal to srcALen */
293 /* But CORR(x, y) is reverse of CORR(y, x) */
294 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
295 /* and the destination pointer modifier, inc is set to -1 */
296 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
297 /* But to improve the performance,
298 * we include zeroes in the output instead of zero padding either of the the inputs*/
299 /* If srcALen > srcBLen,
300 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
301 /* If srcALen < srcBLen,
302 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
303 if (srcALen >= srcBLen)
304 {
305 /* Initialization of inputA pointer */
306 pIn1 = pSrcA;
307
308 /* Initialization of inputB pointer */
309 pIn2 = pSrcB;
310
311 /* Number of output samples is calculated */
312 outBlockSize = (srcALen * 2U) - 1U;
313
314 /* When srcALen > srcBLen, zero padding is done to srcB
315 * to make their lengths equal.
316 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
317 * number of output samples are made zero */
318 j = outBlockSize - (srcALen + (srcBLen - 1U));
319
320 /* Updating the pointer position to non zero value */
321 pOut += j;
322 }
323 else
324 {
325 /* Initialization of inputA pointer */
326 pIn1 = pSrcB;
327
328 /* Initialization of inputB pointer */
329 pIn2 = pSrcA;
330
331 /* srcBLen is always considered as shorter or equal to srcALen */
332 j = srcBLen;
333 srcBLen = srcALen;
334 srcALen = j;
335
336 /* CORR(x, y) = Reverse order(CORR(y, x)) */
337 /* Hence set the destination pointer to point to the last output sample */
338 pOut = pDst + ((srcALen + srcBLen) - 2U);
339
340 /* Destination address modifier is set to -1 */
341 inc = -1;
342 }
343
344 /* The function is internally
345 * divided into three stages according to the number of multiplications that has to be
346 * taken place between inputA samples and inputB samples. In the first stage of the
347 * algorithm, the multiplications increase by one for every iteration.
348 * In the second stage of the algorithm, srcBLen number of multiplications are done.
349 * In the third stage of the algorithm, the multiplications decrease by one
350 * for every iteration. */
351
352 /* The algorithm is implemented in three stages.
353 The loop counters of each stage is initiated here. */
354 blockSize1 = srcBLen - 1U;
355 blockSize2 = srcALen - (srcBLen - 1U);
356 blockSize3 = blockSize1;
357
358 /* --------------------------
359 * Initializations of stage1
360 * -------------------------*/
361
362 /* sum = x[0] * y[srcBlen - 1]
363 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
364 * ....
365 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
366 */
367
368 /* In this stage the MAC operations are increased by 1 for every iteration.
369 The count variable holds the number of MAC operations performed */
370 count = 1U;
371
372 /* Working pointer of inputA */
373 px = pIn1;
374
375 /* Working pointer of inputB */
376 pSrc1 = pIn2 + (srcBLen - 1U);
377 py = pSrc1;
378
379 /* ------------------------
380 * Stage1 process
381 * ----------------------*/
382
383 /* The first loop starts here */
384 while (blockSize1 > 0U)
385 {
386 /* Accumulator is made zero for every iteration */
387 sum = 0;
388
389 /* Apply loop unrolling and compute 4 MACs simultaneously. */
390 k = count >> 2U;
391
392 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
393 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
394 while (k > 0U)
395 {
396 /* Perform the multiply-accumulate */
397 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
398 sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
399 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
400 sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
401
402 /* Decrement loop counter */
403 k--;
404 }
405
406 /* If the count is not a multiple of 4, compute any remaining MACs here.
407 ** No loop unrolling is used. */
408 k = count % 0x4U;
409
410 while (k > 0U)
411 {
412 /* Perform the multiply-accumulate */
413 /* x[0] * y[srcBLen - 1] */
414 sum = __SMLALD(*px++, *py++, sum);
415
416 /* Decrement loop counter */
417 k--;
418 }
419
420 /* Store the result in the accumulator in the destination buffer. */
421 *pOut = (q15_t) (__SSAT((sum >> 15), 16));
422 /* Destination pointer is updated according to the address modifier, inc */
423 pOut += inc;
424
425 /* Update the inputA and inputB pointers for next MAC calculation */
426 py = pSrc1 - count;
427 px = pIn1;
428
429 /* Increment MAC count */
430 count++;
431
432 /* Decrement loop counter */
433 blockSize1--;
434 }
435
436 /* --------------------------
437 * Initializations of stage2
438 * ------------------------*/
439
440 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
441 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
442 * ....
443 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
444 */
445
446 /* Working pointer of inputA */
447 px = pIn1;
448
449 /* Working pointer of inputB */
450 py = pIn2;
451
452 /* count is the index by which the pointer pIn1 to be incremented */
453 count = 0U;
454
455 /* -------------------
456 * Stage2 process
457 * ------------------*/
458
459 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
460 * So, to loop unroll over blockSize2,
461 * srcBLen should be greater than or equal to 4 */
462 if (srcBLen >= 4U)
463 {
464 /* Loop unrolling: Compute 4 outputs at a time */
465 blkCnt = blockSize2 >> 2U;
466
467 while (blkCnt > 0U)
468 {
469 /* Set all accumulators to zero */
470 acc0 = 0;
471 acc1 = 0;
472 acc2 = 0;
473 acc3 = 0;
474
475 /* read x[0], x[1] samples */
476 x0 = read_q15x2 ((q15_t *) px);
477
478 /* read x[1], x[2] samples */
479 x1 = read_q15x2 ((q15_t *) px + 1);
480 px += 2U;
481
482 /* Apply loop unrolling and compute 4 MACs simultaneously. */
483 k = srcBLen >> 2U;
484
485 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
486 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
487 do
488 {
489 /* Read the first two inputB samples using SIMD:
490 * y[0] and y[1] */
491 c0 = read_q15x2_ia ((q15_t **) &py);
492
493 /* acc0 += x[0] * y[0] + x[1] * y[1] */
494 acc0 = __SMLALD(x0, c0, acc0);
495
496 /* acc1 += x[1] * y[0] + x[2] * y[1] */
497 acc1 = __SMLALD(x1, c0, acc1);
498
499 /* Read x[2], x[3] */
500 x2 = read_q15x2 ((q15_t *) px);
501
502 /* Read x[3], x[4] */
503 x3 = read_q15x2 ((q15_t *) px + 1);
504
505 /* acc2 += x[2] * y[0] + x[3] * y[1] */
506 acc2 = __SMLALD(x2, c0, acc2);
507
508 /* acc3 += x[3] * y[0] + x[4] * y[1] */
509 acc3 = __SMLALD(x3, c0, acc3);
510
511 /* Read y[2] and y[3] */
512 c0 = read_q15x2_ia ((q15_t **) &py);
513
514 /* acc0 += x[2] * y[2] + x[3] * y[3] */
515 acc0 = __SMLALD(x2, c0, acc0);
516
517 /* acc1 += x[3] * y[2] + x[4] * y[3] */
518 acc1 = __SMLALD(x3, c0, acc1);
519
520 /* Read x[4], x[5] */
521 x0 = read_q15x2 ((q15_t *) px + 2);
522
523 /* Read x[5], x[6] */
524 x1 = read_q15x2 ((q15_t *) px + 3);
525 px += 4U;
526
527 /* acc2 += x[4] * y[2] + x[5] * y[3] */
528 acc2 = __SMLALD(x0, c0, acc2);
529
530 /* acc3 += x[5] * y[2] + x[6] * y[3] */
531 acc3 = __SMLALD(x1, c0, acc3);
532
533 } while (--k);
534
535 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
536 ** No loop unrolling is used. */
537 k = srcBLen % 0x4U;
538
539 if (k == 1U)
540 {
541 /* Read y[4] */
542 c0 = *py;
543 #ifdef ARM_MATH_BIG_ENDIAN
544 c0 = c0 << 16U;
545 #else
546 c0 = c0 & 0x0000FFFF;
547 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
548
549 /* Read x[7] */
550 x3 = read_q15x2 ((q15_t *) px);
551 px++;
552
553 /* Perform the multiply-accumulate */
554 acc0 = __SMLALD (x0, c0, acc0);
555 acc1 = __SMLALD (x1, c0, acc1);
556 acc2 = __SMLALDX(x1, c0, acc2);
557 acc3 = __SMLALDX(x3, c0, acc3);
558 }
559
560 if (k == 2U)
561 {
562 /* Read y[4], y[5] */
563 c0 = read_q15x2 ((q15_t *) py);
564
565 /* Read x[7], x[8] */
566 x3 = read_q15x2 ((q15_t *) px);
567
568 /* Read x[9] */
569 x2 = read_q15x2 ((q15_t *) px + 1);
570 px += 2U;
571
572 /* Perform the multiply-accumulate */
573 acc0 = __SMLALD(x0, c0, acc0);
574 acc1 = __SMLALD(x1, c0, acc1);
575 acc2 = __SMLALD(x3, c0, acc2);
576 acc3 = __SMLALD(x2, c0, acc3);
577 }
578
579 if (k == 3U)
580 {
581 /* Read y[4], y[5] */
582 c0 = read_q15x2_ia ((q15_t **) &py);
583
584 /* Read x[7], x[8] */
585 x3 = read_q15x2 ((q15_t *) px);
586
587 /* Read x[9] */
588 x2 = read_q15x2 ((q15_t *) px + 1);
589
590 /* Perform the multiply-accumulate */
591 acc0 = __SMLALD(x0, c0, acc0);
592 acc1 = __SMLALD(x1, c0, acc1);
593 acc2 = __SMLALD(x3, c0, acc2);
594 acc3 = __SMLALD(x2, c0, acc3);
595
596 c0 = (*py);
597
598 /* Read y[6] */
599 #ifdef ARM_MATH_BIG_ENDIAN
600 c0 = c0 << 16U;
601 #else
602 c0 = c0 & 0x0000FFFF;
603 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
604
605 /* Read x[10] */
606 x3 = read_q15x2 ((q15_t *) px + 2);
607 px += 3U;
608
609 /* Perform the multiply-accumulates */
610 acc0 = __SMLALDX(x1, c0, acc0);
611 acc1 = __SMLALD (x2, c0, acc1);
612 acc2 = __SMLALDX(x2, c0, acc2);
613 acc3 = __SMLALDX(x3, c0, acc3);
614 }
615
616 /* Store the result in the accumulator in the destination buffer. */
617 *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
618 /* Destination pointer is updated according to the address modifier, inc */
619 pOut += inc;
620
621 *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
622 pOut += inc;
623
624 *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
625 pOut += inc;
626
627 *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
628 pOut += inc;
629
630 /* Increment the count by 4 as 4 output values are computed */
631 count += 4U;
632
633 /* Update the inputA and inputB pointers for next MAC calculation */
634 px = pIn1 + count;
635 py = pIn2;
636
637 /* Decrement loop counter */
638 blkCnt--;
639 }
640
641 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
642 ** No loop unrolling is used. */
643 blkCnt = blockSize2 % 0x4U;
644
645 while (blkCnt > 0U)
646 {
647 /* Accumulator is made zero for every iteration */
648 sum = 0;
649
650 /* Apply loop unrolling and compute 4 MACs simultaneously. */
651 k = srcBLen >> 2U;
652
653 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
654 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
655 while (k > 0U)
656 {
657 /* Perform the multiply-accumulates */
658 sum += ((q63_t) *px++ * *py++);
659 sum += ((q63_t) *px++ * *py++);
660 sum += ((q63_t) *px++ * *py++);
661 sum += ((q63_t) *px++ * *py++);
662
663 /* Decrement loop counter */
664 k--;
665 }
666
667 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
668 ** No loop unrolling is used. */
669 k = srcBLen % 0x4U;
670
671 while (k > 0U)
672 {
673 /* Perform the multiply-accumulates */
674 sum += ((q63_t) *px++ * *py++);
675
676 /* Decrement the loop counter */
677 k--;
678 }
679
680 /* Store the result in the accumulator in the destination buffer. */
681 *pOut = (q15_t) (__SSAT(sum >> 15, 16));
682 /* Destination pointer is updated according to the address modifier, inc */
683 pOut += inc;
684
685 /* Increment count by 1, as one output value is computed */
686 count++;
687
688 /* Update the inputA and inputB pointers for next MAC calculation */
689 px = pIn1 + count;
690 py = pIn2;
691
692 /* Decrement the loop counter */
693 blkCnt--;
694 }
695 }
696 else
697 {
698 /* If the srcBLen is not a multiple of 4,
699 * the blockSize2 loop cannot be unrolled by 4 */
700 blkCnt = blockSize2;
701
702 while (blkCnt > 0U)
703 {
704 /* Accumulator is made zero for every iteration */
705 sum = 0;
706
707 /* srcBLen number of MACS should be performed */
708 k = srcBLen;
709
710 while (k > 0U)
711 {
712 /* Perform the multiply-accumulate */
713 sum += ((q63_t) *px++ * *py++);
714
715 /* Decrement the loop counter */
716 k--;
717 }
718
719 /* Store the result in the accumulator in the destination buffer. */
720 *pOut = (q15_t) (__SSAT(sum >> 15, 16));
721 /* Destination pointer is updated according to the address modifier, inc */
722 pOut += inc;
723
724 /* Increment the MAC count */
725 count++;
726
727 /* Update the inputA and inputB pointers for next MAC calculation */
728 px = pIn1 + count;
729 py = pIn2;
730
731 /* Decrement the loop counter */
732 blkCnt--;
733 }
734 }
735
736
737 /* --------------------------
738 * Initializations of stage3
739 * -------------------------*/
740
741 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
742 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
743 * ....
744 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
745 * sum += x[srcALen-1] * y[0]
746 */
747
748 /* In this stage the MAC operations are decreased by 1 for every iteration.
749 The count variable holds the number of MAC operations performed */
750 count = srcBLen - 1U;
751
752 /* Working pointer of inputA */
753 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
754 px = pSrc1;
755
756 /* Working pointer of inputB */
757 py = pIn2;
758
759 /* -------------------
760 * Stage3 process
761 * ------------------*/
762
763 while (blockSize3 > 0U)
764 {
765 /* Accumulator is made zero for every iteration */
766 sum = 0;
767
768 /* Apply loop unrolling and compute 4 MACs simultaneously. */
769 k = count >> 2U;
770
771 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
772 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
773 while (k > 0U)
774 {
775 /* Perform the multiply-accumulate */
776 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
777 sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
778 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
779 sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
780
781 /* Decrement loop counter */
782 k--;
783 }
784
785 /* If the count is not a multiple of 4, compute any remaining MACs here.
786 ** No loop unrolling is used. */
787 k = count % 0x4U;
788
789 while (k > 0U)
790 {
791 /* Perform the multiply-accumulate */
792 sum = __SMLALD(*px++, *py++, sum);
793
794 /* Decrement loop counter */
795 k--;
796 }
797
798 /* Store the result in the accumulator in the destination buffer. */
799 *pOut = (q15_t) (__SSAT((sum >> 15), 16));
800 /* Destination pointer is updated according to the address modifier, inc */
801 pOut += inc;
802
803 /* Update the inputA and inputB pointers for next MAC calculation */
804 px = ++pSrc1;
805 py = pIn2;
806
807 /* Decrement MAC count */
808 count--;
809
810 /* Decrement loop counter */
811 blockSize3--;
812 }
813
814 #else /* #if defined (ARM_MATH_DSP) */
815
816 const q15_t *pIn1 = pSrcA; /* InputA pointer */
817 const q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
818 q63_t sum; /* Accumulators */
819 uint32_t i = 0U, j; /* Loop counters */
820 uint32_t inv = 0U; /* Reverse order flag */
821 uint32_t tot = 0U; /* Length */
822
823 /* The algorithm implementation is based on the lengths of the inputs. */
824 /* srcB is always made to slide across srcA. */
825 /* So srcBLen is always considered as shorter or equal to srcALen */
826 /* But CORR(x, y) is reverse of CORR(y, x) */
827 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
828 /* and a varaible, inv is set to 1 */
829 /* If lengths are not equal then zero pad has to be done to make the two
830 * inputs of same length. But to improve the performance, we include zeroes
831 * in the output instead of zero padding either of the the inputs*/
832 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
833 * starting of the output buffer */
834 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
835 * ending of the output buffer */
836 /* Once the zero padding is done the remaining of the output is calcualted
837 * using convolution but with the shorter signal time shifted. */
838
839 /* Calculate the length of the remaining sequence */
840 tot = ((srcALen + srcBLen) - 2U);
841
842 if (srcALen > srcBLen)
843 {
844 /* Calculating the number of zeros to be padded to the output */
845 j = srcALen - srcBLen;
846
847 /* Initialise the pointer after zero padding */
848 pDst += j;
849 }
850
851 else if (srcALen < srcBLen)
852 {
853 /* Initialization to inputB pointer */
854 pIn1 = pSrcB;
855
856 /* Initialization to the end of inputA pointer */
857 pIn2 = pSrcA + (srcALen - 1U);
858
859 /* Initialisation of the pointer after zero padding */
860 pDst = pDst + tot;
861
862 /* Swapping the lengths */
863 j = srcALen;
864 srcALen = srcBLen;
865 srcBLen = j;
866
867 /* Setting the reverse flag */
868 inv = 1;
869 }
870
871 /* Loop to calculate convolution for output length number of values */
872 for (i = 0U; i <= tot; i++)
873 {
874 /* Initialize sum with zero to carry on MAC operations */
875 sum = 0;
876
877 /* Loop to perform MAC operations according to convolution equation */
878 for (j = 0U; j <= i; j++)
879 {
880 /* Check the array limitations */
881 if (((i - j) < srcBLen) && (j < srcALen))
882 {
883 /* z[i] += x[i-j] * y[j] */
884 sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
885 }
886 }
887
888 /* Store the output in the destination buffer */
889 if (inv == 1)
890 *pDst-- = (q15_t) __SSAT((sum >> 15U), 16U);
891 else
892 *pDst++ = (q15_t) __SSAT((sum >> 15U), 16U);
893 }
894
895 #endif /* #if defined (ARM_MATH_DSP) */
896
897 }
898 #endif /* defined(ARM_MATH_MVEI) */
899
900 /**
901 @} end of Corr group
902 */
903