1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_correlate_fast_q15.c
4 * Description: Fast Q15 Correlation
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Corr
37 @{
38 */
39
40 /**
41 @brief Correlation of Q15 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
47
48 @par Scaling and Overflow Behavior
49 This fast version uses a 32-bit accumulator with 2.30 format.
50 The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
51 There is no saturation on intermediate additions.
52 Thus, if the accumulator overflows it wraps around and distorts the result.
53 The input signals should be scaled down to avoid intermediate overflows.
54 Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
55 maximum of min(srcALen, srcBLen) number of additions is carried internally.
56 The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
57
58 @remark
59 Refer to \ref arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
60 */
61
arm_correlate_fast_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)62 void arm_correlate_fast_q15(
63 const q15_t * pSrcA,
64 uint32_t srcALen,
65 const q15_t * pSrcB,
66 uint32_t srcBLen,
67 q15_t * pDst)
68 {
69 const q15_t *pIn1; /* InputA pointer */
70 const q15_t *pIn2; /* InputB pointer */
71 q15_t *pOut = pDst; /* Output pointer */
72 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
73 const q15_t *px; /* Intermediate inputA pointer */
74 const q15_t *py; /* Intermediate inputB pointer */
75 const q15_t *pSrc1; /* Intermediate pointers */
76 q31_t x0, x1, x2, x3, c0; /* Temporary variables for holding input and coefficient values */
77 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
78 uint32_t j, k, count, blkCnt; /* Loop counters */
79 uint32_t outBlockSize;
80 int32_t inc = 1; /* Destination address modifier */
81
82
83 /* The algorithm implementation is based on the lengths of the inputs. */
84 /* srcB is always made to slide across srcA. */
85 /* So srcBLen is always considered as shorter or equal to srcALen */
86 /* But CORR(x, y) is reverse of CORR(y, x) */
87 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
88 /* and the destination pointer modifier, inc is set to -1 */
89 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
90 /* But to improve the performance,
91 * we include zeroes in the output instead of zero padding either of the the inputs*/
92 /* If srcALen > srcBLen,
93 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
94 /* If srcALen < srcBLen,
95 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
96 if (srcALen >= srcBLen)
97 {
98 /* Initialization of inputA pointer */
99 pIn1 = pSrcA;
100
101 /* Initialization of inputB pointer */
102 pIn2 = pSrcB;
103
104 /* Number of output samples is calculated */
105 outBlockSize = (2U * srcALen) - 1U;
106
107 /* When srcALen > srcBLen, zero padding is done to srcB
108 * to make their lengths equal.
109 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
110 * number of output samples are made zero */
111 j = outBlockSize - (srcALen + (srcBLen - 1U));
112
113 /* Updating the pointer position to non zero value */
114 pOut += j;
115
116 }
117 else
118 {
119 /* Initialization of inputA pointer */
120 pIn1 = pSrcB;
121
122 /* Initialization of inputB pointer */
123 pIn2 = pSrcA;
124
125 /* srcBLen is always considered as shorter or equal to srcALen */
126 j = srcBLen;
127 srcBLen = srcALen;
128 srcALen = j;
129
130 /* CORR(x, y) = Reverse order(CORR(y, x)) */
131 /* Hence set the destination pointer to point to the last output sample */
132 pOut = pDst + ((srcALen + srcBLen) - 2U);
133
134 /* Destination address modifier is set to -1 */
135 inc = -1;
136
137 }
138
139 /* The function is internally
140 * divided into three stages according to the number of multiplications that has to be
141 * taken place between inputA samples and inputB samples. In the first stage of the
142 * algorithm, the multiplications increase by one for every iteration.
143 * In the second stage of the algorithm, srcBLen number of multiplications are done.
144 * In the third stage of the algorithm, the multiplications decrease by one
145 * for every iteration. */
146
147 /* The algorithm is implemented in three stages.
148 The loop counters of each stage is initiated here. */
149 blockSize1 = srcBLen - 1U;
150 blockSize2 = srcALen - (srcBLen - 1U);
151 blockSize3 = blockSize1;
152
153 /* --------------------------
154 * Initializations of stage1
155 * -------------------------*/
156
157 /* sum = x[0] * y[srcBlen - 1]
158 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
159 * ....
160 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
161 */
162
163 /* In this stage the MAC operations are increased by 1 for every iteration.
164 The count variable holds the number of MAC operations performed */
165 count = 1U;
166
167 /* Working pointer of inputA */
168 px = pIn1;
169
170 /* Working pointer of inputB */
171 pSrc1 = pIn2 + (srcBLen - 1U);
172 py = pSrc1;
173
174 /* ------------------------
175 * Stage1 process
176 * ----------------------*/
177
178 /* The first loop starts here */
179 while (blockSize1 > 0U)
180 {
181 /* Accumulator is made zero for every iteration */
182 sum = 0;
183
184 /* Apply loop unrolling and compute 4 MACs simultaneously. */
185 k = count >> 2U;
186
187 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
188 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
189 while (k > 0U)
190 {
191 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
192 sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
193 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
194 sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
195
196 /* Decrement loop counter */
197 k--;
198 }
199
200 /* If the count is not a multiple of 4, compute any remaining MACs here.
201 No loop unrolling is used. */
202 k = count % 0x4U;
203
204 while (k > 0U)
205 {
206 /* Perform the multiply-accumulates */
207 /* x[0] * y[srcBLen - 1] */
208 sum = __SMLAD(*px++, *py++, sum);
209
210 /* Decrement the loop counter */
211 k--;
212 }
213
214 /* Store the result in the accumulator in the destination buffer. */
215 *pOut = (q15_t) (sum >> 15);
216 /* Destination pointer is updated according to the address modifier, inc */
217 pOut += inc;
218
219 /* Update the inputA and inputB pointers for next MAC calculation */
220 py = pSrc1 - count;
221 px = pIn1;
222
223 /* Increment MAC count */
224 count++;
225
226 /* Decrement loop counter */
227 blockSize1--;
228 }
229
230 /* --------------------------
231 * Initializations of stage2
232 * ------------------------*/
233
234 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
235 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
236 * ....
237 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
238 */
239
240 /* Working pointer of inputA */
241 px = pIn1;
242
243 /* Working pointer of inputB */
244 py = pIn2;
245
246 /* count is the index by which the pointer pIn1 to be incremented */
247 count = 0U;
248
249 /* --------------------
250 * Stage2 process
251 * -------------------*/
252
253 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
254 * So, to loop unroll over blockSize2,
255 * srcBLen should be greater than or equal to 4 */
256 if (srcBLen >= 4U)
257 {
258 /* Loop unroll over blockSize2, by 4 */
259 blkCnt = blockSize2 >> 2U;
260
261 while (blkCnt > 0U)
262 {
263 /* Set all accumulators to zero */
264 acc0 = 0;
265 acc1 = 0;
266 acc2 = 0;
267 acc3 = 0;
268
269 /* read x[0], x[1] samples */
270 x0 = read_q15x2 ((q15_t *) px);
271 /* read x[1], x[2] samples */
272 x1 = read_q15x2 ((q15_t *) px + 1);
273 px += 2U;
274
275 /* Apply loop unrolling and compute 4 MACs simultaneously. */
276 k = srcBLen >> 2U;
277
278 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
279 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
280 do
281 {
282 /* Read the first two inputB samples using SIMD:
283 * y[0] and y[1] */
284 c0 = read_q15x2_ia ((q15_t **) &py);
285
286 /* acc0 += x[0] * y[0] + x[1] * y[1] */
287 acc0 = __SMLAD(x0, c0, acc0);
288
289 /* acc1 += x[1] * y[0] + x[2] * y[1] */
290 acc1 = __SMLAD(x1, c0, acc1);
291
292 /* Read x[2], x[3] */
293 x2 = read_q15x2 ((q15_t *) px);
294
295 /* Read x[3], x[4] */
296 x3 = read_q15x2 ((q15_t *) px + 1);
297
298 /* acc2 += x[2] * y[0] + x[3] * y[1] */
299 acc2 = __SMLAD(x2, c0, acc2);
300
301 /* acc3 += x[3] * y[0] + x[4] * y[1] */
302 acc3 = __SMLAD(x3, c0, acc3);
303
304 /* Read y[2] and y[3] */
305 c0 = read_q15x2_ia ((q15_t **) &py);
306
307 /* acc0 += x[2] * y[2] + x[3] * y[3] */
308 acc0 = __SMLAD(x2, c0, acc0);
309
310 /* acc1 += x[3] * y[2] + x[4] * y[3] */
311 acc1 = __SMLAD(x3, c0, acc1);
312
313 /* Read x[4], x[5] */
314 x0 = read_q15x2 ((q15_t *) px + 2);
315
316 /* Read x[5], x[6] */
317 x1 = read_q15x2 ((q15_t *) px + 3);
318 px += 4U;
319
320 /* acc2 += x[4] * y[2] + x[5] * y[3] */
321 acc2 = __SMLAD(x0, c0, acc2);
322
323 /* acc3 += x[5] * y[2] + x[6] * y[3] */
324 acc3 = __SMLAD(x1, c0, acc3);
325
326 } while (--k);
327
328 /* For the next MAC operations, SIMD is not used
329 * So, the 16 bit pointer if inputB, py is updated */
330
331 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
332 ** No loop unrolling is used. */
333 k = srcBLen % 0x4U;
334
335 if (k == 1U)
336 {
337 /* Read y[4] */
338 c0 = *py;
339
340 #ifdef ARM_MATH_BIG_ENDIAN
341 c0 = c0 << 16U;
342 #else
343 c0 = c0 & 0x0000FFFF;
344 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
345
346 /* Read x[7] */
347 x3 = read_q15x2 ((q15_t *) px);
348 px++;
349
350 /* Perform the multiply-accumulates */
351 acc0 = __SMLAD (x0, c0, acc0);
352 acc1 = __SMLAD (x1, c0, acc1);
353 acc2 = __SMLADX(x1, c0, acc2);
354 acc3 = __SMLADX(x3, c0, acc3);
355 }
356
357 if (k == 2U)
358 {
359 /* Read y[4], y[5] */
360 c0 = read_q15x2 ((q15_t *) py);
361
362 /* Read x[7], x[8] */
363 x3 = read_q15x2 ((q15_t *) px);
364
365 /* Read x[9] */
366 x2 = read_q15x2 ((q15_t *) px + 1);
367 px += 2U;
368
369 /* Perform the multiply-accumulates */
370 acc0 = __SMLAD(x0, c0, acc0);
371 acc1 = __SMLAD(x1, c0, acc1);
372 acc2 = __SMLAD(x3, c0, acc2);
373 acc3 = __SMLAD(x2, c0, acc3);
374 }
375
376 if (k == 3U)
377 {
378 /* Read y[4], y[5] */
379 c0 = read_q15x2_ia ((q15_t **) &py);
380
381 /* Read x[7], x[8] */
382 x3 = read_q15x2 ((q15_t *) px);
383
384 /* Read x[9] */
385 x2 = read_q15x2 ((q15_t *) px + 1);
386
387 /* Perform the multiply-accumulates */
388 acc0 = __SMLAD(x0, c0, acc0);
389 acc1 = __SMLAD(x1, c0, acc1);
390 acc2 = __SMLAD(x3, c0, acc2);
391 acc3 = __SMLAD(x2, c0, acc3);
392
393 c0 = (*py);
394 /* Read y[6] */
395 #ifdef ARM_MATH_BIG_ENDIAN
396 c0 = c0 << 16U;
397 #else
398 c0 = c0 & 0x0000FFFF;
399 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
400
401 /* Read x[10] */
402 x3 = read_q15x2 ((q15_t *) px + 2);
403 px += 3U;
404
405 /* Perform the multiply-accumulates */
406 acc0 = __SMLADX(x1, c0, acc0);
407 acc1 = __SMLAD (x2, c0, acc1);
408 acc2 = __SMLADX(x2, c0, acc2);
409 acc3 = __SMLADX(x3, c0, acc3);
410 }
411
412 /* Store the result in the accumulator in the destination buffer. */
413 *pOut = (q15_t) (acc0 >> 15);
414 /* Destination pointer is updated according to the address modifier, inc */
415 pOut += inc;
416
417 *pOut = (q15_t) (acc1 >> 15);
418 pOut += inc;
419
420 *pOut = (q15_t) (acc2 >> 15);
421 pOut += inc;
422
423 *pOut = (q15_t) (acc3 >> 15);
424 pOut += inc;
425
426 /* Increment the pointer pIn1 index, count by 4 */
427 count += 4U;
428
429 /* Update the inputA and inputB pointers for next MAC calculation */
430 px = pIn1 + count;
431 py = pIn2;
432
433 /* Decrement loop counter */
434 blkCnt--;
435 }
436
437 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
438 ** No loop unrolling is used. */
439 blkCnt = blockSize2 % 0x4U;
440
441 while (blkCnt > 0U)
442 {
443 /* Accumulator is made zero for every iteration */
444 sum = 0;
445
446 /* Apply loop unrolling and compute 4 MACs simultaneously. */
447 k = srcBLen >> 2U;
448
449 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
450 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
451 while (k > 0U)
452 {
453 /* Perform the multiply-accumulates */
454 sum += ((q31_t) *px++ * *py++);
455 sum += ((q31_t) *px++ * *py++);
456 sum += ((q31_t) *px++ * *py++);
457 sum += ((q31_t) *px++ * *py++);
458
459 /* Decrement loop counter */
460 k--;
461 }
462
463 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
464 ** No loop unrolling is used. */
465 k = srcBLen % 0x4U;
466
467 while (k > 0U)
468 {
469 /* Perform the multiply-accumulates */
470 sum += ((q31_t) * px++ * *py++);
471
472 /* Decrement loop counter */
473 k--;
474 }
475
476 /* Store the result in the accumulator in the destination buffer. */
477 *pOut = (q15_t) (sum >> 15);
478 /* Destination pointer is updated according to the address modifier, inc */
479 pOut += inc;
480
481 /* Increment the pointer pIn1 index, count by 1 */
482 count++;
483
484 /* Update the inputA and inputB pointers for next MAC calculation */
485 px = pIn1 + count;
486 py = pIn2;
487
488 /* Decrement loop counter */
489 blkCnt--;
490 }
491 }
492 else
493 {
494 /* If the srcBLen is not a multiple of 4,
495 * the blockSize2 loop cannot be unrolled by 4 */
496 blkCnt = blockSize2;
497
498 while (blkCnt > 0U)
499 {
500 /* Accumulator is made zero for every iteration */
501 sum = 0;
502
503 /* srcBLen number of MACS should be performed */
504 k = srcBLen;
505
506 while (k > 0U)
507 {
508 /* Perform the multiply-accumulate */
509 sum += ((q31_t) *px++ * *py++);
510
511 /* Decrement loop counter */
512 k--;
513 }
514
515 /* Store the result in the accumulator in the destination buffer. */
516 *pOut = (q15_t) (sum >> 15);
517 /* Destination pointer is updated according to the address modifier, inc */
518 pOut += inc;
519
520 /* Increment MAC count */
521 count++;
522
523 /* Update the inputA and inputB pointers for next MAC calculation */
524 px = pIn1 + count;
525 py = pIn2;
526
527 /* Decrement loop counter */
528 blkCnt--;
529 }
530 }
531
532 /* --------------------------
533 * Initializations of stage3
534 * -------------------------*/
535
536 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
537 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
538 * ....
539 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
540 * sum += x[srcALen-1] * y[0]
541 */
542
543 /* In this stage the MAC operations are decreased by 1 for every iteration.
544 The count variable holds the number of MAC operations performed */
545 count = srcBLen - 1U;
546
547 /* Working pointer of inputA */
548 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
549 px = pSrc1;
550
551 /* Working pointer of inputB */
552 py = pIn2;
553
554 /* -------------------
555 * Stage3 process
556 * ------------------*/
557
558 while (blockSize3 > 0U)
559 {
560 /* Accumulator is made zero for every iteration */
561 sum = 0;
562
563 /* Apply loop unrolling and compute 4 MACs simultaneously. */
564 k = count >> 2U;
565
566 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
567 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
568 while (k > 0U)
569 {
570 /* Perform the multiply-accumulates */
571 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
572 sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
573 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
574 sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
575
576 /* Decrement loop counter */
577 k--;
578 }
579
580 /* If the count is not a multiple of 4, compute any remaining MACs here.
581 ** No loop unrolling is used. */
582 k = count % 0x4U;
583
584 while (k > 0U)
585 {
586 /* Perform the multiply-accumulates */
587 sum = __SMLAD(*px++, *py++, sum);
588
589 /* Decrement loop counter */
590 k--;
591 }
592
593 /* Store the result in the accumulator in the destination buffer. */
594 *pOut = (q15_t) (sum >> 15);
595 /* Destination pointer is updated according to the address modifier, inc */
596 pOut += inc;
597
598 /* Update the inputA and inputB pointers for next MAC calculation */
599 px = ++pSrc1;
600 py = pIn2;
601
602 /* Decrement the MAC count */
603 count--;
604
605 /* Decrement the loop counter */
606 blockSize3--;
607 }
608
609 }
610
611 /**
612 @} end of Corr group
613 */
614