1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_correlate_q7.c
4 * Description: Correlation of Q7 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Corr
37 @{
38 */
39
40 /**
41 @brief Correlation of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using a 32-bit internal accumulator.
50 Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
51 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
52 This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
53 The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
54
55 @remark
56 Refer to \ref arm_correlate_opt_q7() for a faster implementation of this function.
57 */
58
59 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
60 #pragma message "Scalar version of arm_correlate_q7 built. Helium version has build issues with gcc."
61 #endif
62
63 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(ARM_DSP_BUILT_WITH_GCC)
64 #include "arm_helium_utils.h"
65
66 #include "arm_vec_filtering.h"
arm_correlate_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst)67 ARM_DSP_ATTRIBUTE void arm_correlate_q7(
68 const q7_t * pSrcA,
69 uint32_t srcALen,
70 const q7_t * pSrcB,
71 uint32_t srcBLen,
72 q7_t * pDst)
73 {
74 const q7_t *pIn1 = pSrcA; /* inputA pointer */
75 const q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
76 const q7_t *pX, *pY;
77 const q7_t *pA, *pB;
78 int32_t i = 0U, j = 0; /* loop counters */
79 int32_t inv = 1U; /* Reverse order flag */
80 uint32_t tot = 0U; /* Length */
81 int32_t block1, block2, block3;
82 int32_t incr;
83
84 tot = ((srcALen + srcBLen) - 2U);
85 if (srcALen > srcBLen)
86 {
87 /*
88 * Calculating the number of zeros to be padded to the output
89 */
90 j = srcALen - srcBLen;
91 /*
92 * Initialize the pointer after zero padding
93 */
94 pDst += j;
95 }
96 else if (srcALen < srcBLen)
97 {
98 /*
99 * Initialization to inputB pointer
100 */
101 pIn1 = pSrcB;
102 /*
103 * Initialization to the end of inputA pointer
104 */
105 pIn2 = pSrcA + (srcALen - 1U);
106 /*
107 * Initialisation of the pointer after zero padding
108 */
109 pDst = pDst + tot;
110 /*
111 * Swapping the lengths
112 */
113 j = srcALen;
114 srcALen = srcBLen;
115 srcBLen = j;
116 /*
117 * Setting the reverse flag
118 */
119 inv = -1;
120 }
121
122 block1 = srcBLen - 1;
123 block2 = srcALen - srcBLen + 1;
124 block3 = srcBLen - 1;
125
126 pA = pIn1;
127 pB = pIn2;
128 incr = inv;
129
130 for (i = 0U; i <= block1 - 2; i += 2)
131 {
132 uint32_t count = i + 1;
133 int32_t acc0 = 0;
134 int32_t acc1 = 0;
135
136 /*
137 * compute 2 accumulators per loop
138 * size is incrementing for second accumulator
139 * Y pointer is decrementing for second accumulator
140 */
141 pX = pA;
142 pY = pB;
143 MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count);
144
145 *pDst = (q7_t) acc0;
146 pDst += incr;
147 *pDst = (q7_t) acc1;
148 pDst += incr;
149 pB -= 2;
150 }
151 for (; i < block1; i++)
152 {
153 uint32_t count = i + 1;
154 int32_t acc = 0;
155
156 pX = pA;
157 pY = pB;
158 MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count);
159
160 *pDst = (q7_t) acc;
161 pDst += incr;
162 pB--;
163 }
164
165 for (i = 0U; i <= block2 - 4; i += 4)
166 {
167 int32_t acc0 = 0;
168 int32_t acc1 = 0;
169 int32_t acc2 = 0;
170 int32_t acc3 = 0;
171
172 pX = pA;
173 pY = pB;
174 /*
175 * compute 4 accumulators per loop
176 * size is fixed for all accumulators
177 * X pointer is incrementing for successive accumulators
178 */
179 MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
180
181 *pDst = (q7_t) acc0;
182 pDst += incr;
183 *pDst = (q7_t) acc1;
184 pDst += incr;
185 *pDst = (q7_t) acc2;
186 pDst += incr;
187 *pDst = (q7_t) acc3;
188 pDst += incr;
189
190 pA += 4;
191 }
192
193 for (; i <= block2 - 2; i += 2)
194 {
195 int32_t acc0 = 0LL;
196 int32_t acc1 = 0LL;
197
198 pX = pA;
199 pY = pB;
200 /*
201 * compute 2 accumulators per loop
202 * size is fixed for all accumulators
203 * X pointer is incrementing for second accumulator
204 */
205 MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, srcBLen);
206
207 *pDst = (q7_t) acc0;
208 pDst += incr;
209 *pDst = (q7_t) acc1;
210 pDst += incr;
211 pA += 2;
212 }
213
214 if (block2 & 1)
215 {
216 int32_t acc = 0LL;
217
218 pX = pA;
219 pY = pB;
220 MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, srcBLen);
221
222 *pDst = (q7_t) acc;
223 pDst += incr;
224 pA++;
225 }
226
227 for (i = block3 - 1; i > 0; i -= 2)
228 {
229 uint32_t count = (i + 1);
230 int32_t acc0 = 0LL;
231 int32_t acc1 = 0LL;
232
233 pX = pA;
234 pY = pB;
235 /*
236 * compute 2 accumulators per loop
237 * size is decrementing for second accumulator
238 * X pointer is incrementing for second accumulator
239 */
240 MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count);
241
242 *pDst = (q7_t) acc0;
243 pDst += incr;
244 *pDst = (q7_t) acc1;
245 pDst += incr;
246 pA += 2;
247
248 }
249 for (; i >= 0; i--)
250 {
251 uint32_t count = (i + 1);
252 int64_t acc = 0LL;
253
254 pX = pA;
255 pY = pB;
256 MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count);
257
258 *pDst = (q7_t) acc;
259 pDst += incr;
260 pA++;
261 }
262 }
263
264 #else
arm_correlate_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst)265 ARM_DSP_ATTRIBUTE void arm_correlate_q7(
266 const q7_t * pSrcA,
267 uint32_t srcALen,
268 const q7_t * pSrcB,
269 uint32_t srcBLen,
270 q7_t * pDst)
271 {
272
273 #if (1)
274 //#if !defined(ARM_MATH_CM0_FAMILY)
275
276 const q7_t *pIn1; /* InputA pointer */
277 const q7_t *pIn2; /* InputB pointer */
278 q7_t *pOut = pDst; /* Output pointer */
279 const q7_t *px; /* Intermediate inputA pointer */
280 const q7_t *py; /* Intermediate inputB pointer */
281 const q7_t *pSrc1; /* Intermediate pointers */
282 q31_t sum; /* Accumulators */
283 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
284 uint32_t j, k, count, blkCnt; /* Loop counters */
285 uint32_t outBlockSize;
286 int32_t inc = 1;
287
288 #if defined (ARM_MATH_LOOPUNROLL)
289 q31_t acc0, acc1, acc2, acc3; /* Accumulators */
290 q31_t input1, input2; /* Temporary input variables */
291 q15_t in1, in2; /* Temporary input variables */
292 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables for holding input and coefficient values */
293 #endif
294
295 /* The algorithm implementation is based on the lengths of the inputs. */
296 /* srcB is always made to slide across srcA. */
297 /* So srcBLen is always considered as shorter or equal to srcALen */
298 /* But CORR(x, y) is reverse of CORR(y, x) */
299 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
300 /* and the destination pointer modifier, inc is set to -1 */
301 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
302 /* But to improve the performance,
303 * we include zeroes in the output instead of zero padding either of the the inputs*/
304 /* If srcALen > srcBLen,
305 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
306 /* If srcALen < srcBLen,
307 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
308 if (srcALen >= srcBLen)
309 {
310 /* Initialization of inputA pointer */
311 pIn1 = pSrcA;
312
313 /* Initialization of inputB pointer */
314 pIn2 = pSrcB;
315
316 /* Number of output samples is calculated */
317 outBlockSize = (2U * srcALen) - 1U;
318
319 /* When srcALen > srcBLen, zero padding is done to srcB
320 * to make their lengths equal.
321 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
322 * number of output samples are made zero */
323 j = outBlockSize - (srcALen + (srcBLen - 1U));
324
325 /* Updating the pointer position to non zero value */
326 pOut += j;
327 }
328 else
329 {
330 /* Initialization of inputA pointer */
331 pIn1 = pSrcB;
332
333 /* Initialization of inputB pointer */
334 pIn2 = pSrcA;
335
336 /* srcBLen is always considered as shorter or equal to srcALen */
337 j = srcBLen;
338 srcBLen = srcALen;
339 srcALen = j;
340
341 /* CORR(x, y) = Reverse order(CORR(y, x)) */
342 /* Hence set the destination pointer to point to the last output sample */
343 pOut = pDst + ((srcALen + srcBLen) - 2U);
344
345 /* Destination address modifier is set to -1 */
346 inc = -1;
347 }
348
349 /* The function is internally
350 * divided into three stages according to the number of multiplications that has to be
351 * taken place between inputA samples and inputB samples. In the first stage of the
352 * algorithm, the multiplications increase by one for every iteration.
353 * In the second stage of the algorithm, srcBLen number of multiplications are done.
354 * In the third stage of the algorithm, the multiplications decrease by one
355 * for every iteration. */
356
357 /* The algorithm is implemented in three stages.
358 The loop counters of each stage is initiated here. */
359 blockSize1 = srcBLen - 1U;
360 blockSize2 = srcALen - (srcBLen - 1U);
361 blockSize3 = blockSize1;
362
363 /* --------------------------
364 * Initializations of stage1
365 * -------------------------*/
366
367 /* sum = x[0] * y[srcBlen - 1]
368 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
369 * ....
370 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
371 */
372
373 /* In this stage the MAC operations are increased by 1 for every iteration.
374 The count variable holds the number of MAC operations performed */
375 count = 1U;
376
377 /* Working pointer of inputA */
378 px = pIn1;
379
380 /* Working pointer of inputB */
381 pSrc1 = pIn2 + (srcBLen - 1U);
382 py = pSrc1;
383
384 /* ------------------------
385 * Stage1 process
386 * ----------------------*/
387
388 /* The first stage starts here */
389 while (blockSize1 > 0U)
390 {
391 /* Accumulator is made zero for every iteration */
392 sum = 0;
393
394 #if defined (ARM_MATH_LOOPUNROLL)
395
396 /* Loop unrolling: Compute 4 outputs at a time */
397 k = count >> 2U;
398
399 while (k > 0U)
400 {
401 /* x[0] , x[1] */
402 in1 = (q15_t) *px++;
403 in2 = (q15_t) *px++;
404 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
405
406 /* y[srcBLen - 4] , y[srcBLen - 3] */
407 in1 = (q15_t) *py++;
408 in2 = (q15_t) *py++;
409 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
410
411 /* x[0] * y[srcBLen - 4] */
412 /* x[1] * y[srcBLen - 3] */
413 sum = __SMLAD(input1, input2, sum);
414
415 /* x[2] , x[3] */
416 in1 = (q15_t) *px++;
417 in2 = (q15_t) *px++;
418 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
419
420 /* y[srcBLen - 2] , y[srcBLen - 1] */
421 in1 = (q15_t) *py++;
422 in2 = (q15_t) *py++;
423 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
424
425 /* x[2] * y[srcBLen - 2] */
426 /* x[3] * y[srcBLen - 1] */
427 sum = __SMLAD(input1, input2, sum);
428
429 /* Decrement loop counter */
430 k--;
431 }
432
433 /* Loop unrolling: Compute remaining outputs */
434 k = count % 0x4U;
435
436 #else
437
438 /* Initialize k with number of samples */
439 k = count;
440
441 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
442
443 while (k > 0U)
444 {
445 /* Perform the multiply-accumulate */
446 /* x[0] * y[srcBLen - 1] */
447 sum += (q31_t) ((q15_t) *px++ * *py++);
448
449 /* Decrement loop counter */
450 k--;
451 }
452
453 /* Store the result in the accumulator in the destination buffer. */
454 *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
455 /* Destination pointer is updated according to the address modifier, inc */
456 pOut += inc;
457
458 /* Update the inputA and inputB pointers for next MAC calculation */
459 py = pSrc1 - count;
460 px = pIn1;
461
462 /* Increment MAC count */
463 count++;
464
465 /* Decrement loop counter */
466 blockSize1--;
467 }
468
469 /* --------------------------
470 * Initializations of stage2
471 * ------------------------*/
472
473 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
474 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
475 * ....
476 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
477 */
478
479 /* Working pointer of inputA */
480 px = pIn1;
481
482 /* Working pointer of inputB */
483 py = pIn2;
484
485 /* count is index by which the pointer pIn1 to be incremented */
486 count = 0U;
487
488 /* -------------------
489 * Stage2 process
490 * ------------------*/
491
492 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
493 * So, to loop unroll over blockSize2,
494 * srcBLen should be greater than or equal to 4 */
495 if (srcBLen >= 4U)
496 {
497 #if defined (ARM_MATH_LOOPUNROLL)
498
499 /* Loop unrolling: Compute 4 outputs at a time */
500 blkCnt = blockSize2 >> 2U;
501
502 while (blkCnt > 0U)
503 {
504 /* Set all accumulators to zero */
505 acc0 = 0;
506 acc1 = 0;
507 acc2 = 0;
508 acc3 = 0;
509
510 /* read x[0], x[1], x[2] samples */
511 x0 = *px++;
512 x1 = *px++;
513 x2 = *px++;
514
515 /* Apply loop unrolling and compute 4 MACs simultaneously. */
516 k = srcBLen >> 2U;
517
518 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
519 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
520 do
521 {
522 /* Read y[0] sample */
523 c0 = *py++;
524 /* Read y[1] sample */
525 c1 = *py++;
526
527 /* Read x[3] sample */
528 x3 = *px++;
529
530 /* x[0] and x[1] are packed */
531 in1 = (q15_t) x0;
532 in2 = (q15_t) x1;
533
534 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
535
536 /* y[0] and y[1] are packed */
537 in1 = (q15_t) c0;
538 in2 = (q15_t) c1;
539
540 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
541
542 /* acc0 += x[0] * y[0] + x[1] * y[1] */
543 acc0 = __SMLAD(input1, input2, acc0);
544
545 /* x[1] and x[2] are packed */
546 in1 = (q15_t) x1;
547 in2 = (q15_t) x2;
548
549 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
550
551 /* acc1 += x[1] * y[0] + x[2] * y[1] */
552 acc1 = __SMLAD(input1, input2, acc1);
553
554 /* x[2] and x[3] are packed */
555 in1 = (q15_t) x2;
556 in2 = (q15_t) x3;
557
558 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
559
560 /* acc2 += x[2] * y[0] + x[3] * y[1] */
561 acc2 = __SMLAD(input1, input2, acc2);
562
563 /* Read x[4] sample */
564 x0 = *px++;
565
566 /* x[3] and x[4] are packed */
567 in1 = (q15_t) x3;
568 in2 = (q15_t) x0;
569
570 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
571
572 /* acc3 += x[3] * y[0] + x[4] * y[1] */
573 acc3 = __SMLAD(input1, input2, acc3);
574
575 /* Read y[2] sample */
576 c0 = *py++;
577 /* Read y[3] sample */
578 c1 = *py++;
579
580 /* Read x[5] sample */
581 x1 = *px++;
582
583 /* x[2] and x[3] are packed */
584 in1 = (q15_t) x2;
585 in2 = (q15_t) x3;
586
587 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
588
589 /* y[2] and y[3] are packed */
590 in1 = (q15_t) c0;
591 in2 = (q15_t) c1;
592
593 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
594
595 /* acc0 += x[2] * y[2] + x[3] * y[3] */
596 acc0 = __SMLAD(input1, input2, acc0);
597
598 /* x[3] and x[4] are packed */
599 in1 = (q15_t) x3;
600 in2 = (q15_t) x0;
601
602 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
603
604 /* acc1 += x[3] * y[2] + x[4] * y[3] */
605 acc1 = __SMLAD(input1, input2, acc1);
606
607 /* x[4] and x[5] are packed */
608 in1 = (q15_t) x0;
609 in2 = (q15_t) x1;
610
611 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
612
613 /* acc2 += x[4] * y[2] + x[5] * y[3] */
614 acc2 = __SMLAD(input1, input2, acc2);
615
616 /* Read x[6] sample */
617 x2 = *px++;
618
619 /* x[5] and x[6] are packed */
620 in1 = (q15_t) x1;
621 in2 = (q15_t) x2;
622
623 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
624
625 /* acc3 += x[5] * y[2] + x[6] * y[3] */
626 acc3 = __SMLAD(input1, input2, acc3);
627
628 } while (--k);
629
630 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
631 ** No loop unrolling is used. */
632 k = srcBLen % 0x4U;
633
634 while (k > 0U)
635 {
636 /* Read y[4] sample */
637 c0 = *py++;
638 /* Read x[7] sample */
639 x3 = *px++;
640
641 /* Perform the multiply-accumulates */
642 /* acc0 += x[4] * y[4] */
643 acc0 += ((q15_t) x0 * c0);
644 /* acc1 += x[5] * y[4] */
645 acc1 += ((q15_t) x1 * c0);
646 /* acc2 += x[6] * y[4] */
647 acc2 += ((q15_t) x2 * c0);
648 /* acc3 += x[7] * y[4] */
649 acc3 += ((q15_t) x3 * c0);
650
651 /* Reuse the present samples for the next MAC */
652 x0 = x1;
653 x1 = x2;
654 x2 = x3;
655
656 /* Decrement loop counter */
657 k--;
658 }
659
660 /* Store the result in the accumulator in the destination buffer. */
661 *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
662 /* Destination pointer is updated according to the address modifier, inc */
663 pOut += inc;
664
665 *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
666 pOut += inc;
667
668 *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
669 pOut += inc;
670
671 *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
672 pOut += inc;
673
674 count += 4U;
675 /* Update the inputA and inputB pointers for next MAC calculation */
676 px = pIn1 + count;
677 py = pIn2;
678
679 /* Decrement loop counter */
680 blkCnt--;
681 }
682
683 /* Loop unrolling: Compute remaining outputs */
684 blkCnt = blockSize2 % 0x4U;
685
686 #else
687
688 /* Initialize blkCnt with number of samples */
689 blkCnt = blockSize2;
690
691 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
692
693 while (blkCnt > 0U)
694 {
695 /* Accumulator is made zero for every iteration */
696 sum = 0;
697
698 #if defined (ARM_MATH_LOOPUNROLL)
699
700 /* Loop unrolling: Compute 4 outputs at a time */
701 k = srcBLen >> 2U;
702
703 while (k > 0U)
704 {
705
706 /* Reading two inputs of SrcA buffer and packing */
707 in1 = (q15_t) *px++;
708 in2 = (q15_t) *px++;
709 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
710
711 /* Reading two inputs of SrcB buffer and packing */
712 in1 = (q15_t) *py++;
713 in2 = (q15_t) *py++;
714 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
715
716 /* Perform the multiply-accumulate */
717 sum = __SMLAD(input1, input2, sum);
718
719 /* Reading two inputs of SrcA buffer and packing */
720 in1 = (q15_t) *px++;
721 in2 = (q15_t) *px++;
722 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
723
724 /* Reading two inputs of SrcB buffer and packing */
725 in1 = (q15_t) *py++;
726 in2 = (q15_t) *py++;
727 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
728
729 /* Perform the multiply-accumulate */
730 sum = __SMLAD(input1, input2, sum);
731
732 /* Decrement loop counter */
733 k--;
734 }
735
736 /* Loop unrolling: Compute remaining outputs */
737 k = srcBLen % 0x4U;
738
739 #else
740
741 /* Initialize blkCnt with number of samples */
742 k = srcBLen;
743
744 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
745
746 while (k > 0U)
747 {
748 /* Perform the multiply-accumulate */
749 sum += ((q15_t) *px++ * *py++);
750
751 /* Decrement the loop counter */
752 k--;
753 }
754
755 /* Store the result in the accumulator in the destination buffer. */
756 *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
757 /* Destination pointer is updated according to the address modifier, inc */
758 pOut += inc;
759
760 /* Increment the pointer pIn1 index, count by 1 */
761 count++;
762
763 /* Update the inputA and inputB pointers for next MAC calculation */
764 px = pIn1 + count;
765 py = pIn2;
766
767 /* Decrement the loop counter */
768 blkCnt--;
769 }
770 }
771 else
772 {
773 /* If the srcBLen is not a multiple of 4,
774 * the blockSize2 loop cannot be unrolled by 4 */
775 blkCnt = blockSize2;
776
777 while (blkCnt > 0U)
778 {
779 /* Accumulator is made zero for every iteration */
780 sum = 0;
781
782 /* srcBLen number of MACS should be performed */
783 k = srcBLen;
784
785 while (k > 0U)
786 {
787 /* Perform the multiply-accumulate */
788 sum += ((q15_t) *px++ * *py++);
789
790 /* Decrement the loop counter */
791 k--;
792 }
793
794 /* Store the result in the accumulator in the destination buffer. */
795 *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
796 /* Destination pointer is updated according to the address modifier, inc */
797 pOut += inc;
798
799 /* Increment the MAC count */
800 count++;
801
802 /* Update the inputA and inputB pointers for next MAC calculation */
803 px = pIn1 + count;
804 py = pIn2;
805
806 /* Decrement loop counter */
807 blkCnt--;
808 }
809 }
810
811
812 /* --------------------------
813 * Initializations of stage3
814 * -------------------------*/
815
816 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
817 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
818 * ....
819 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
820 * sum += x[srcALen-1] * y[0]
821 */
822
823 /* In this stage the MAC operations are decreased by 1 for every iteration.
824 The count variable holds the number of MAC operations performed */
825 count = srcBLen - 1U;
826
827 /* Working pointer of inputA */
828 pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
829 px = pSrc1;
830
831 /* Working pointer of inputB */
832 py = pIn2;
833
834 /* -------------------
835 * Stage3 process
836 * ------------------*/
837
838 while (blockSize3 > 0U)
839 {
840 /* Accumulator is made zero for every iteration */
841 sum = 0;
842
843 #if defined (ARM_MATH_LOOPUNROLL)
844
845 /* Loop unrolling: Compute 4 outputs at a time */
846 k = count >> 2U;
847
848 while (k > 0U)
849 {
850 /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */
851 in1 = (q15_t) *px++;
852 in2 = (q15_t) *px++;
853 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
854
855 /* y[0] , y[1] */
856 in1 = (q15_t) *py++;
857 in2 = (q15_t) *py++;
858 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
859
860 /* sum += x[srcALen - srcBLen + 1] * y[0] */
861 /* sum += x[srcALen - srcBLen + 2] * y[1] */
862 sum = __SMLAD(input1, input2, sum);
863
864 /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
865 in1 = (q15_t) *px++;
866 in2 = (q15_t) *px++;
867 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
868
869 /* y[2] , y[3] */
870 in1 = (q15_t) *py++;
871 in2 = (q15_t) *py++;
872 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
873
874 /* sum += x[srcALen - srcBLen + 3] * y[2] */
875 /* sum += x[srcALen - srcBLen + 4] * y[3] */
876 sum = __SMLAD(input1, input2, sum);
877
878 /* Decrement loop counter */
879 k--;
880 }
881
882 /* Loop unrolling: Compute remaining outputs */
883 k = count % 0x4U;
884
885 #else
886
887 /* Initialize blkCnt with number of samples */
888 k = count;
889
890 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
891
892 while (k > 0U)
893 {
894 /* Perform the multiply-accumulate */
895 sum += ((q15_t) *px++ * *py++);
896
897 /* Decrement loop counter */
898 k--;
899 }
900
901 /* Store the result in the accumulator in the destination buffer. */
902 *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
903 /* Destination pointer is updated according to the address modifier, inc */
904 pOut += inc;
905
906 /* Update the inputA and inputB pointers for next MAC calculation */
907 px = ++pSrc1;
908 py = pIn2;
909
910 /* Decrement MAC count */
911 count--;
912
913 /* Decrement loop counter */
914 blockSize3--;
915 }
916
917 #else
918 /* alternate version for CM0_FAMILY */
919
920 const q7_t *pIn1 = pSrcA; /* InputA pointer */
921 const q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
922 q31_t sum; /* Accumulator */
923 uint32_t i = 0U, j; /* Loop counters */
924 int32_t inc = 1; /* Destination address modifier */
925 uint32_t tot = 0U; /* Length */
926
927 /* The algorithm implementation is based on the lengths of the inputs. */
928 /* srcB is always made to slide across srcA. */
929 /* So srcBLen is always considered as shorter or equal to srcALen */
930 /* But CORR(x, y) is reverse of CORR(y, x) */
931 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
932 /* and a varaible, inc is set to -1 */
933 /* If lengths are not equal then zero pad has to be done to make the two
934 * inputs of same length. But to improve the performance, we include zeroes
935 * in the output instead of zero padding either of the the inputs*/
936 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
937 * starting of the output buffer */
938 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
939 * ending of the output buffer */
940 /* Once the zero padding is done the remaining of the output is calcualted
941 * using convolution but with the shorter signal time shifted. */
942
943 /* Calculate the length of the remaining sequence */
944 tot = ((srcALen + srcBLen) - 2U);
945
946 if (srcALen > srcBLen)
947 {
948 /* Calculating the number of zeros to be padded to the output */
949 j = srcALen - srcBLen;
950
951 /* Initialise the pointer after zero padding */
952 pDst += j;
953 }
954
955 else if (srcALen < srcBLen)
956 {
957 /* Initialization to inputB pointer */
958 pIn1 = pSrcB;
959
960 /* Initialization to the end of inputA pointer */
961 pIn2 = pSrcA + (srcALen - 1U);
962
963 /* Initialisation of the pointer after zero padding */
964 pDst = pDst + tot;
965
966 /* Swapping the lengths */
967 j = srcALen;
968 srcALen = srcBLen;
969 srcBLen = j;
970
971 /* Filling destination in reverse order */
972 inc = -1;
973 }
974
975 /* Loop to calculate convolution for output length number of times */
976 for (i = 0U; i <= tot; i++)
977 {
978 /* Initialize sum with zero to carry out MAC operations */
979 sum = 0;
980
981 /* Loop to perform MAC operations according to convolution equation */
982 for (j = 0U; j <= i; j++)
983 {
984 /* Check the array limitations */
985 if (((i - j) < srcBLen) && (j < srcALen))
986 {
987 /* z[i] += x[i-j] * y[j] */
988 sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
989 }
990 }
991
992 /* Store the output in the destination buffer */
993 *pDst = (q7_t) __SSAT((sum >> 7U), 8U);
994 pDst += inc;
995 }
996
997 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
998
999 }
1000 #endif /* defined(ARM_MATH_MVEI) */
1001
1002 /**
1003 @} end of Corr group
1004 */
1005