1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_fast_opt_q15.c
4  * Description:  Fast Q15 Correlation
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q15 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence.
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47   @param[in]     pScratch   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48 
49   @par           Scaling and Overflow Behavior
50                    This fast version uses a 32-bit accumulator with 2.30 format.
51                    The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
52                    There is no saturation on intermediate additions.
53                    Thus, if the accumulator overflows it wraps around and distorts the result.
54                    The input signals should be scaled down to avoid intermediate overflows.
55                    Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
56                    maximum of min(srcALen, srcBLen) number of additions is carried internally.
57                    The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
58 
59   @remark
60                    Refer to \ref arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
61  */
62 
arm_correlate_fast_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,q15_t * pScratch)63 ARM_DSP_ATTRIBUTE void arm_correlate_fast_opt_q15(
64   const q15_t * pSrcA,
65         uint32_t srcALen,
66   const q15_t * pSrcB,
67         uint32_t srcBLen,
68         q15_t * pDst,
69         q15_t * pScratch)
70 {
71   const q15_t *pIn1;                                   /* InputA pointer */
72   const q15_t *pIn2;                                   /* InputB pointer */
73         q31_t acc0;                                    /* Accumulators */
74         q15_t *pOut = pDst;                            /* Output pointer */
75         q15_t *pScr1 = pScratch;                       /* Temporary pointer for scratch */
76   const q15_t *py;                                     /* Intermediate inputB pointer */
77         uint32_t j, blkCnt, outBlockSize;              /* Loop counter */
78         int32_t inc = 1;                               /* Destination address modifier */
79         uint32_t tapCnt;                               /* Loop count */
80 
81 #if defined (ARM_MATH_LOOPUNROLL)
82         q31_t acc1, acc2, acc3;                        /* Accumulators */
83         q31_t x1, x2, x3;                              /* Temporary variables for holding input and coefficient values */
84         q31_t y1, y2;                                  /* State variables */
85 #endif
86 
87   /* The algorithm implementation is based on the lengths of the inputs. */
88   /* srcB is always made to slide across srcA. */
89   /* So srcBLen is always considered as shorter or equal to srcALen */
90   /* But CORR(x, y) is reverse of CORR(y, x) */
91   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
92   /* and the destination pointer modifier, inc is set to -1 */
93   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
94   /* But to improve the performance,
95    * we include zeroes in the output instead of zero padding either of the the inputs*/
96   /* If srcALen > srcBLen,
97    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
98   /* If srcALen < srcBLen,
99    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
100   if (srcALen >= srcBLen)
101   {
102     /* Initialization of inputA pointer */
103     pIn1 = pSrcA;
104 
105     /* Initialization of inputB pointer */
106     pIn2 = pSrcB;
107 
108     /* Number of output samples is calculated */
109     outBlockSize = (2U * srcALen) - 1U;
110 
111     /* When srcALen > srcBLen, zero padding is done to srcB
112      * to make their lengths equal.
113      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
114      * number of output samples are made zero */
115     j = outBlockSize - (srcALen + (srcBLen - 1U));
116 
117     /* Updating the pointer position to non zero value */
118     pOut += j;
119   }
120   else
121   {
122     /* Initialization of inputA pointer */
123     pIn1 = pSrcB;
124 
125     /* Initialization of inputB pointer */
126     pIn2 = pSrcA;
127 
128     /* srcBLen is always considered as shorter or equal to srcALen */
129     j = srcBLen;
130     srcBLen = srcALen;
131     srcALen = j;
132 
133     /* CORR(x, y) = Reverse order(CORR(y, x)) */
134     /* Hence set the destination pointer to point to the last output sample */
135     pOut = pDst + ((srcALen + srcBLen) - 2U);
136 
137     /* Destination address modifier is set to -1 */
138     inc = -1;
139   }
140 
141   pScr1 = pScratch;
142 
143   /* Fill (srcBLen - 1U) zeros in scratch buffer */
144   arm_fill_q15(0, pScr1, (srcBLen - 1U));
145 
146   /* Update temporary scratch pointer */
147   pScr1 += (srcBLen - 1U);
148 
149 
150   /* Copy (srcALen) samples in scratch buffer */
151   arm_copy_q15(pIn1, pScr1, srcALen);
152 
153   /* Update pointers */
154   pScr1 += srcALen;
155 
156 
157   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
158   arm_fill_q15(0, pScr1, (srcBLen - 1U));
159 
160   /* Update pointer */
161   pScr1 += (srcBLen - 1U);
162 
163   /* Temporary pointer for scratch2 */
164   py = pIn2;
165 
166 
167   /* Actual correlation process starts here */
168 
169 #if defined (ARM_MATH_LOOPUNROLL)
170 
171   /* Loop unrolling: Compute 4 outputs at a time */
172   blkCnt = (srcALen + srcBLen - 1U) >> 2;
173 
174   while (blkCnt > 0)
175   {
176     /* Initialze temporary scratch pointer as scratch1 */
177     pScr1 = pScratch;
178 
179     /* Clear Accumlators */
180     acc0 = 0;
181     acc1 = 0;
182     acc2 = 0;
183     acc3 = 0;
184 
185     /* Read two samples from scratch buffer */
186     x1 = read_q15x2_ia (&pScr1);
187 
188     /* Read next two samples from scratch buffer */
189     x2 = read_q15x2_ia (&pScr1);
190 
191     tapCnt = (srcBLen) >> 2U;
192 
193     while (tapCnt > 0U)
194     {
195       /* Read four samples from smaller buffer */
196       y1 = read_q15x2_ia ((q15_t **) &pIn2);
197       y2 = read_q15x2_ia ((q15_t **) &pIn2);
198 
199       /* multiply and accumulate */
200       acc0 = __SMLAD(x1, y1, acc0);
201       acc2 = __SMLAD(x2, y1, acc2);
202 
203       /* pack input data */
204 #ifndef ARM_MATH_BIG_ENDIAN
205       x3 = __PKHBT(x2, x1, 0);
206 #else
207       x3 = __PKHBT(x1, x2, 0);
208 #endif
209 
210       /* multiply and accumulate */
211       acc1 = __SMLADX(x3, y1, acc1);
212 
213       /* Read next two samples from scratch buffer */
214       x1 = read_q15x2_ia (&pScr1);
215 
216       /* multiply and accumulate */
217       acc0 = __SMLAD(x2, y2, acc0);
218       acc2 = __SMLAD(x1, y2, acc2);
219 
220       /* pack input data */
221 #ifndef ARM_MATH_BIG_ENDIAN
222       x3 = __PKHBT(x1, x2, 0);
223 #else
224       x3 = __PKHBT(x2, x1, 0);
225 #endif
226 
227       acc3 = __SMLADX(x3, y1, acc3);
228       acc1 = __SMLADX(x3, y2, acc1);
229 
230       x2 = read_q15x2_ia (&pScr1);
231 
232 #ifndef ARM_MATH_BIG_ENDIAN
233       x3 = __PKHBT(x2, x1, 0);
234 #else
235       x3 = __PKHBT(x1, x2, 0);
236 #endif
237 
238       acc3 = __SMLADX(x3, y2, acc3);
239 
240       /* Decrement loop counter */
241       tapCnt--;
242     }
243 
244     /* Update scratch pointer for remaining samples of smaller length sequence */
245     pScr1 -= 4U;
246 
247     /* apply same above for remaining samples of smaller length sequence */
248     tapCnt = (srcBLen) & 3U;
249 
250     while (tapCnt > 0U)
251     {
252       /* accumulate the results */
253       acc0 += (*pScr1++ * *pIn2);
254       acc1 += (*pScr1++ * *pIn2);
255       acc2 += (*pScr1++ * *pIn2);
256       acc3 += (*pScr1++ * *pIn2++);
257 
258       pScr1 -= 3U;
259 
260       /* Decrement loop counter */
261       tapCnt--;
262     }
263 
264     blkCnt--;
265 
266     /* Store the results in the accumulators in the destination buffer. */
267     *pOut = (__SSAT(acc0 >> 15U, 16));
268     pOut += inc;
269     *pOut = (__SSAT(acc1 >> 15U, 16));
270     pOut += inc;
271     *pOut = (__SSAT(acc2 >> 15U, 16));
272     pOut += inc;
273     *pOut = (__SSAT(acc3 >> 15U, 16));
274     pOut += inc;
275 
276     /* Initialization of inputB pointer */
277     pIn2 = py;
278 
279     pScratch += 4U;
280   }
281 
282   /* Loop unrolling: Compute remaining outputs */
283   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
284 
285 #else
286 
287   /* Initialize blkCnt with number of samples */
288   blkCnt = (srcALen + srcBLen - 1U);
289 
290 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
291 
292   /* Calculate correlation for remaining samples of Bigger length sequence */
293   while (blkCnt > 0)
294   {
295     /* Initialze temporary scratch pointer as scratch1 */
296     pScr1 = pScratch;
297 
298     /* Clear Accumlators */
299     acc0 = 0;
300 
301     tapCnt = (srcBLen) >> 1U;
302 
303     while (tapCnt > 0U)
304     {
305 
306       /* Read next two samples from scratch buffer */
307       acc0 += (*pScr1++ * *pIn2++);
308       acc0 += (*pScr1++ * *pIn2++);
309 
310       /* Decrement loop counter */
311       tapCnt--;
312     }
313 
314     tapCnt = (srcBLen) & 1U;
315 
316     /* apply same above for remaining samples of smaller length sequence */
317     while (tapCnt > 0U)
318     {
319 
320       /* accumulate the results */
321       acc0 += (*pScr1++ * *pIn2++);
322 
323       /* Decrement loop counter */
324       tapCnt--;
325     }
326 
327     blkCnt--;
328 
329     /* The result is in 2.30 format.  Convert to 1.15 with saturation.
330      ** Then store the output in the destination buffer. */
331     *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
332     pOut += inc;
333 
334     /* Initialization of inputB pointer */
335     pIn2 = py;
336 
337     pScratch += 1U;
338   }
339 
340 }
341 
342 /**
343   @} end of Corr group
344  */
345