1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_fast_q15.c
4  * Description:  Fast Q15 Correlation
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q15 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47 
48   @par           Scaling and Overflow Behavior
49                    This fast version uses a 32-bit accumulator with 2.30 format.
50                    The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
51                    There is no saturation on intermediate additions.
52                    Thus, if the accumulator overflows it wraps around and distorts the result.
53                    The input signals should be scaled down to avoid intermediate overflows.
54                    Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
55                    maximum of min(srcALen, srcBLen) number of additions is carried internally.
56                    The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
57 
58   @remark
59                    Refer to \ref arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
60  */
61 
arm_correlate_fast_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)62 void arm_correlate_fast_q15(
63   const q15_t * pSrcA,
64         uint32_t srcALen,
65   const q15_t * pSrcB,
66         uint32_t srcBLen,
67         q15_t * pDst)
68 {
69   const q15_t *pIn1;                                   /* InputA pointer */
70   const q15_t *pIn2;                                   /* InputB pointer */
71         q15_t *pOut = pDst;                            /* Output pointer */
72         q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
73   const q15_t *px;                                     /* Intermediate inputA pointer */
74   const q15_t *py;                                     /* Intermediate inputB pointer */
75   const q15_t *pSrc1;                                  /* Intermediate pointers */
76         q31_t x0, x1, x2, x3, c0;                      /* Temporary variables for holding input and coefficient values */
77         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
78         uint32_t j, k, count, blkCnt;                  /* Loop counters */
79         uint32_t outBlockSize;
80         int32_t inc = 1;                               /* Destination address modifier */
81 
82 
83   /* The algorithm implementation is based on the lengths of the inputs. */
84   /* srcB is always made to slide across srcA. */
85   /* So srcBLen is always considered as shorter or equal to srcALen */
86   /* But CORR(x, y) is reverse of CORR(y, x) */
87   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
88   /* and the destination pointer modifier, inc is set to -1 */
89   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
90   /* But to improve the performance,
91    * we include zeroes in the output instead of zero padding either of the the inputs*/
92   /* If srcALen > srcBLen,
93    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
94   /* If srcALen < srcBLen,
95    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
96   if (srcALen >= srcBLen)
97   {
98     /* Initialization of inputA pointer */
99     pIn1 = pSrcA;
100 
101     /* Initialization of inputB pointer */
102     pIn2 = pSrcB;
103 
104     /* Number of output samples is calculated */
105     outBlockSize = (2U * srcALen) - 1U;
106 
107     /* When srcALen > srcBLen, zero padding is done to srcB
108      * to make their lengths equal.
109      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
110      * number of output samples are made zero */
111     j = outBlockSize - (srcALen + (srcBLen - 1U));
112 
113     /* Updating the pointer position to non zero value */
114     pOut += j;
115 
116   }
117   else
118   {
119     /* Initialization of inputA pointer */
120     pIn1 = pSrcB;
121 
122     /* Initialization of inputB pointer */
123     pIn2 = pSrcA;
124 
125     /* srcBLen is always considered as shorter or equal to srcALen */
126     j = srcBLen;
127     srcBLen = srcALen;
128     srcALen = j;
129 
130     /* CORR(x, y) = Reverse order(CORR(y, x)) */
131     /* Hence set the destination pointer to point to the last output sample */
132     pOut = pDst + ((srcALen + srcBLen) - 2U);
133 
134     /* Destination address modifier is set to -1 */
135     inc = -1;
136 
137   }
138 
139   /* The function is internally
140    * divided into three stages according to the number of multiplications that has to be
141    * taken place between inputA samples and inputB samples. In the first stage of the
142    * algorithm, the multiplications increase by one for every iteration.
143    * In the second stage of the algorithm, srcBLen number of multiplications are done.
144    * In the third stage of the algorithm, the multiplications decrease by one
145    * for every iteration. */
146 
147   /* The algorithm is implemented in three stages.
148      The loop counters of each stage is initiated here. */
149   blockSize1 = srcBLen - 1U;
150   blockSize2 = srcALen - (srcBLen - 1U);
151   blockSize3 = blockSize1;
152 
153   /* --------------------------
154    * Initializations of stage1
155    * -------------------------*/
156 
157   /* sum = x[0] * y[srcBlen - 1]
158    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
159    * ....
160    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
161    */
162 
163   /* In this stage the MAC operations are increased by 1 for every iteration.
164      The count variable holds the number of MAC operations performed */
165   count = 1U;
166 
167   /* Working pointer of inputA */
168   px = pIn1;
169 
170   /* Working pointer of inputB */
171   pSrc1 = pIn2 + (srcBLen - 1U);
172   py = pSrc1;
173 
174   /* ------------------------
175    * Stage1 process
176    * ----------------------*/
177 
178   /* The first loop starts here */
179   while (blockSize1 > 0U)
180   {
181     /* Accumulator is made zero for every iteration */
182     sum = 0;
183 
184     /* Apply loop unrolling and compute 4 MACs simultaneously. */
185     k = count >> 2U;
186 
187     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
188      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
189     while (k > 0U)
190     {
191       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
192       sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
193       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
194       sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
195 
196       /* Decrement loop counter */
197       k--;
198     }
199 
200     /* If the count is not a multiple of 4, compute any remaining MACs here.
201        No loop unrolling is used. */
202     k = count % 0x4U;
203 
204     while (k > 0U)
205     {
206       /* Perform the multiply-accumulates */
207       /* x[0] * y[srcBLen - 1] */
208       sum = __SMLAD(*px++, *py++, sum);
209 
210       /* Decrement the loop counter */
211       k--;
212     }
213 
214     /* Store the result in the accumulator in the destination buffer. */
215     *pOut = (q15_t) (sum >> 15);
216     /* Destination pointer is updated according to the address modifier, inc */
217     pOut += inc;
218 
219     /* Update the inputA and inputB pointers for next MAC calculation */
220     py = pSrc1 - count;
221     px = pIn1;
222 
223     /* Increment MAC count */
224     count++;
225 
226     /* Decrement loop counter */
227     blockSize1--;
228   }
229 
230   /* --------------------------
231    * Initializations of stage2
232    * ------------------------*/
233 
234   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
235    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
236    * ....
237    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
238    */
239 
240   /* Working pointer of inputA */
241   px = pIn1;
242 
243   /* Working pointer of inputB */
244   py = pIn2;
245 
246   /* count is the index by which the pointer pIn1 to be incremented */
247   count = 0U;
248 
249   /* --------------------
250    * Stage2 process
251    * -------------------*/
252 
253   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
254    * So, to loop unroll over blockSize2,
255    * srcBLen should be greater than or equal to 4 */
256   if (srcBLen >= 4U)
257   {
258     /* Loop unroll over blockSize2, by 4 */
259     blkCnt = blockSize2 >> 2U;
260 
261     while (blkCnt > 0U)
262     {
263       /* Set all accumulators to zero */
264       acc0 = 0;
265       acc1 = 0;
266       acc2 = 0;
267       acc3 = 0;
268 
269       /* read x[0], x[1] samples */
270       x0 = read_q15x2 ((q15_t *) px);
271       /* read x[1], x[2] samples */
272       x1 = read_q15x2 ((q15_t *) px + 1);
273 	  px += 2U;
274 
275       /* Apply loop unrolling and compute 4 MACs simultaneously. */
276       k = srcBLen >> 2U;
277 
278       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
279        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
280       do
281       {
282         /* Read the first two inputB samples using SIMD:
283          * y[0] and y[1] */
284         c0 = read_q15x2_ia ((q15_t **) &py);
285 
286         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
287         acc0 = __SMLAD(x0, c0, acc0);
288 
289         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
290         acc1 = __SMLAD(x1, c0, acc1);
291 
292         /* Read x[2], x[3] */
293         x2 = read_q15x2 ((q15_t *) px);
294 
295         /* Read x[3], x[4] */
296         x3 = read_q15x2 ((q15_t *) px + 1);
297 
298         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
299         acc2 = __SMLAD(x2, c0, acc2);
300 
301         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
302         acc3 = __SMLAD(x3, c0, acc3);
303 
304         /* Read y[2] and y[3] */
305         c0 = read_q15x2_ia ((q15_t **) &py);
306 
307         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
308         acc0 = __SMLAD(x2, c0, acc0);
309 
310         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
311         acc1 = __SMLAD(x3, c0, acc1);
312 
313         /* Read x[4], x[5] */
314         x0 = read_q15x2 ((q15_t *) px + 2);
315 
316         /* Read x[5], x[6] */
317         x1 = read_q15x2 ((q15_t *) px + 3);
318 		px += 4U;
319 
320         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
321         acc2 = __SMLAD(x0, c0, acc2);
322 
323         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
324         acc3 = __SMLAD(x1, c0, acc3);
325 
326       } while (--k);
327 
328       /* For the next MAC operations, SIMD is not used
329        * So, the 16 bit pointer if inputB, py is updated */
330 
331       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
332        ** No loop unrolling is used. */
333       k = srcBLen % 0x4U;
334 
335       if (k == 1U)
336       {
337         /* Read y[4] */
338         c0 = *py;
339 
340 #ifdef  ARM_MATH_BIG_ENDIAN
341         c0 = c0 << 16U;
342 #else
343         c0 = c0 & 0x0000FFFF;
344 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
345 
346         /* Read x[7] */
347         x3 = read_q15x2 ((q15_t *) px);
348 		px++;
349 
350         /* Perform the multiply-accumulates */
351         acc0 = __SMLAD (x0, c0, acc0);
352         acc1 = __SMLAD (x1, c0, acc1);
353         acc2 = __SMLADX(x1, c0, acc2);
354         acc3 = __SMLADX(x3, c0, acc3);
355       }
356 
357       if (k == 2U)
358       {
359         /* Read y[4], y[5] */
360         c0 = read_q15x2 ((q15_t *) py);
361 
362         /* Read x[7], x[8] */
363         x3 = read_q15x2 ((q15_t *) px);
364 
365         /* Read x[9] */
366         x2 = read_q15x2 ((q15_t *) px + 1);
367 		px += 2U;
368 
369         /* Perform the multiply-accumulates */
370         acc0 = __SMLAD(x0, c0, acc0);
371         acc1 = __SMLAD(x1, c0, acc1);
372         acc2 = __SMLAD(x3, c0, acc2);
373         acc3 = __SMLAD(x2, c0, acc3);
374       }
375 
376       if (k == 3U)
377       {
378         /* Read y[4], y[5] */
379         c0 = read_q15x2_ia ((q15_t **) &py);
380 
381         /* Read x[7], x[8] */
382         x3 = read_q15x2 ((q15_t *) px);
383 
384         /* Read x[9] */
385         x2 = read_q15x2 ((q15_t *) px + 1);
386 
387         /* Perform the multiply-accumulates */
388         acc0 = __SMLAD(x0, c0, acc0);
389         acc1 = __SMLAD(x1, c0, acc1);
390         acc2 = __SMLAD(x3, c0, acc2);
391         acc3 = __SMLAD(x2, c0, acc3);
392 
393         c0 = (*py);
394         /* Read y[6] */
395 #ifdef  ARM_MATH_BIG_ENDIAN
396         c0 = c0 << 16U;
397 #else
398         c0 = c0 & 0x0000FFFF;
399 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
400 
401         /* Read x[10] */
402         x3 = read_q15x2 ((q15_t *) px + 2);
403 		px += 3U;
404 
405         /* Perform the multiply-accumulates */
406         acc0 = __SMLADX(x1, c0, acc0);
407         acc1 = __SMLAD (x2, c0, acc1);
408         acc2 = __SMLADX(x2, c0, acc2);
409         acc3 = __SMLADX(x3, c0, acc3);
410       }
411 
412       /* Store the result in the accumulator in the destination buffer. */
413       *pOut = (q15_t) (acc0 >> 15);
414       /* Destination pointer is updated according to the address modifier, inc */
415       pOut += inc;
416 
417       *pOut = (q15_t) (acc1 >> 15);
418       pOut += inc;
419 
420       *pOut = (q15_t) (acc2 >> 15);
421       pOut += inc;
422 
423       *pOut = (q15_t) (acc3 >> 15);
424       pOut += inc;
425 
426       /* Increment the pointer pIn1 index, count by 4 */
427       count += 4U;
428 
429       /* Update the inputA and inputB pointers for next MAC calculation */
430       px = pIn1 + count;
431       py = pIn2;
432 
433       /* Decrement loop counter */
434       blkCnt--;
435     }
436 
437     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
438      ** No loop unrolling is used. */
439     blkCnt = blockSize2 % 0x4U;
440 
441     while (blkCnt > 0U)
442     {
443       /* Accumulator is made zero for every iteration */
444       sum = 0;
445 
446       /* Apply loop unrolling and compute 4 MACs simultaneously. */
447       k = srcBLen >> 2U;
448 
449       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
450        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
451       while (k > 0U)
452       {
453         /* Perform the multiply-accumulates */
454         sum += ((q31_t) *px++ * *py++);
455         sum += ((q31_t) *px++ * *py++);
456         sum += ((q31_t) *px++ * *py++);
457         sum += ((q31_t) *px++ * *py++);
458 
459         /* Decrement loop counter */
460         k--;
461       }
462 
463       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
464        ** No loop unrolling is used. */
465       k = srcBLen % 0x4U;
466 
467       while (k > 0U)
468       {
469         /* Perform the multiply-accumulates */
470         sum += ((q31_t) * px++ * *py++);
471 
472         /* Decrement loop counter */
473         k--;
474       }
475 
476       /* Store the result in the accumulator in the destination buffer. */
477       *pOut = (q15_t) (sum >> 15);
478       /* Destination pointer is updated according to the address modifier, inc */
479       pOut += inc;
480 
481       /* Increment the pointer pIn1 index, count by 1 */
482       count++;
483 
484       /* Update the inputA and inputB pointers for next MAC calculation */
485       px = pIn1 + count;
486       py = pIn2;
487 
488       /* Decrement loop counter */
489       blkCnt--;
490     }
491   }
492   else
493   {
494     /* If the srcBLen is not a multiple of 4,
495      * the blockSize2 loop cannot be unrolled by 4 */
496     blkCnt = blockSize2;
497 
498     while (blkCnt > 0U)
499     {
500       /* Accumulator is made zero for every iteration */
501       sum = 0;
502 
503       /* srcBLen number of MACS should be performed */
504       k = srcBLen;
505 
506       while (k > 0U)
507       {
508         /* Perform the multiply-accumulate */
509         sum += ((q31_t) *px++ * *py++);
510 
511         /* Decrement loop counter */
512         k--;
513       }
514 
515       /* Store the result in the accumulator in the destination buffer. */
516       *pOut = (q15_t) (sum >> 15);
517       /* Destination pointer is updated according to the address modifier, inc */
518       pOut += inc;
519 
520       /* Increment MAC count */
521       count++;
522 
523       /* Update the inputA and inputB pointers for next MAC calculation */
524       px = pIn1 + count;
525       py = pIn2;
526 
527       /* Decrement loop counter */
528       blkCnt--;
529     }
530   }
531 
532   /* --------------------------
533    * Initializations of stage3
534    * -------------------------*/
535 
536   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
537    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
538    * ....
539    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
540    * sum +=  x[srcALen-1] * y[0]
541    */
542 
543   /* In this stage the MAC operations are decreased by 1 for every iteration.
544      The count variable holds the number of MAC operations performed */
545   count = srcBLen - 1U;
546 
547   /* Working pointer of inputA */
548   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
549   px = pSrc1;
550 
551   /* Working pointer of inputB */
552   py = pIn2;
553 
554   /* -------------------
555    * Stage3 process
556    * ------------------*/
557 
558   while (blockSize3 > 0U)
559   {
560     /* Accumulator is made zero for every iteration */
561     sum = 0;
562 
563     /* Apply loop unrolling and compute 4 MACs simultaneously. */
564     k = count >> 2U;
565 
566     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
567      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
568     while (k > 0U)
569     {
570       /* Perform the multiply-accumulates */
571       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
572       sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
573       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
574       sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
575 
576       /* Decrement loop counter */
577       k--;
578     }
579 
580     /* If the count is not a multiple of 4, compute any remaining MACs here.
581      ** No loop unrolling is used. */
582     k = count % 0x4U;
583 
584     while (k > 0U)
585     {
586       /* Perform the multiply-accumulates */
587       sum = __SMLAD(*px++, *py++, sum);
588 
589       /* Decrement loop counter */
590       k--;
591     }
592 
593     /* Store the result in the accumulator in the destination buffer. */
594     *pOut = (q15_t) (sum >> 15);
595     /* Destination pointer is updated according to the address modifier, inc */
596     pOut += inc;
597 
598     /* Update the inputA and inputB pointers for next MAC calculation */
599     px = ++pSrc1;
600     py = pIn2;
601 
602     /* Decrement the MAC count */
603     count--;
604 
605     /* Decrement the loop counter */
606     blockSize3--;
607   }
608 
609 }
610 
611 /**
612   @} end of Corr group
613  */
614