1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_opt_q15.c
4  * Description:  Correlation of Q15 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q15 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47   @param[in]     pScratch   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using a 64-bit internal accumulator.
51                    Both inputs are in 1.15 format and multiplications yield a 2.30 result.
52                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
53                    This approach provides 33 guard bits and there is no risk of overflow.
54                    The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
55 
56  @remark
57                    Refer to \ref arm_correlate_fast_q15() for a faster but less precise version of this function.
58  */
59 
arm_correlate_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,q15_t * pScratch)60 ARM_DSP_ATTRIBUTE void arm_correlate_opt_q15(
61   const q15_t * pSrcA,
62         uint32_t srcALen,
63   const q15_t * pSrcB,
64         uint32_t srcBLen,
65         q15_t * pDst,
66         q15_t * pScratch)
67 {
68         q63_t acc0;                                    /* Accumulators */
69         q15_t *pOut = pDst;                            /* Output pointer */
70         q15_t *pScr1;                                  /* Temporary pointer for scratch1 */
71   const q15_t *pIn1;                                   /* InputA pointer */
72   const q15_t *pIn2;                                   /* InputB pointer */
73   const q15_t *py;                                     /* Intermediate inputB pointer */
74         uint32_t j, blkCnt, outBlockSize;              /* Loop counter */
75         int32_t inc = 1;                               /* Output pointer increment */
76         uint32_t tapCnt;
77 
78 #if defined (ARM_MATH_LOOPUNROLL)
79         q63_t acc1, acc2, acc3;                        /* Accumulators */
80         q31_t x1, x2, x3;                              /* Temporary variables for holding input1 and input2 values */
81         q31_t y1, y2;                                  /* State variables */
82 #endif
83 
84   /* The algorithm implementation is based on the lengths of the inputs. */
85   /* srcB is always made to slide across srcA. */
86   /* So srcBLen is always considered as shorter or equal to srcALen */
87   /* But CORR(x, y) is reverse of CORR(y, x) */
88   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
89   /* and the destination pointer modifier, inc is set to -1 */
90   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
91   /* But to improve the performance,
92    * we include zeroes in the output instead of zero padding either of the the inputs*/
93   /* If srcALen > srcBLen,
94    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
95   /* If srcALen < srcBLen,
96    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
97   if (srcALen >= srcBLen)
98   {
99     /* Initialization of inputA pointer */
100     pIn1 = pSrcA;
101 
102     /* Initialization of inputB pointer */
103     pIn2 = pSrcB;
104 
105     /* Number of output samples is calculated */
106     outBlockSize = (srcALen * 2U) - 1U;
107 
108     /* When srcALen > srcBLen, zero padding is done to srcB
109      * to make their lengths equal.
110      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
111      * number of output samples are made zero */
112     j = outBlockSize - (srcALen + (srcBLen - 1U));
113 
114     /* Updating the pointer position to non zero value */
115     pOut += j;
116   }
117   else
118   {
119     /* Initialization of inputA pointer */
120     pIn1 = pSrcB;
121 
122     /* Initialization of inputB pointer */
123     pIn2 = pSrcA;
124 
125     /* srcBLen is always considered as shorter or equal to srcALen */
126     j = srcBLen;
127     srcBLen = srcALen;
128     srcALen = j;
129 
130     /* CORR(x, y) = Reverse order(CORR(y, x)) */
131     /* Hence set the destination pointer to point to the last output sample */
132     pOut = pDst + ((srcALen + srcBLen) - 2U);
133 
134     /* Destination address modifier is set to -1 */
135     inc = -1;
136   }
137 
138   pScr1 = pScratch;
139 
140   /* Fill (srcBLen - 1U) zeros in scratch buffer */
141   arm_fill_q15(0, pScr1, (srcBLen - 1U));
142 
143   /* Update temporary scratch pointer */
144   pScr1 += (srcBLen - 1U);
145 
146   /* Copy (srcALen) samples in scratch buffer */
147   arm_copy_q15(pIn1, pScr1, srcALen);
148 
149   /* Update pointers */
150   pScr1 += srcALen;
151 
152 
153   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
154   arm_fill_q15(0, pScr1, (srcBLen - 1U));
155 
156   /* Update pointer */
157   pScr1 += (srcBLen - 1U);
158 
159   /* Temporary pointer for scratch2 */
160   py = pIn2;
161 
162 
163   /* Actual correlation process starts here */
164 #if defined (ARM_MATH_LOOPUNROLL)
165 
166   /* Loop unrolling: Compute 4 outputs at a time */
167   blkCnt = (srcALen + srcBLen - 1U) >> 2;
168 
169   while (blkCnt > 0)
170   {
171     /* Initialze temporary scratch pointer as scratch1 */
172     pScr1 = pScratch;
173 
174     /* Clear Accumlators */
175     acc0 = 0;
176     acc1 = 0;
177     acc2 = 0;
178     acc3 = 0;
179 
180     /* Read two samples from scratch1 buffer */
181     x1 = read_q15x2_ia (&pScr1);
182 
183     /* Read next two samples from scratch1 buffer */
184     x2 = read_q15x2_ia (&pScr1);
185 
186     tapCnt = (srcBLen) >> 2U;
187 
188     while (tapCnt > 0U)
189     {
190       /* Read four samples from smaller buffer */
191       y1 = read_q15x2_ia ((q15_t **) &pIn2);
192       y2 = read_q15x2_ia ((q15_t **) &pIn2);
193 
194       /* multiply and accumulate */
195       acc0 = __SMLALD(x1, y1, acc0);
196       acc2 = __SMLALD(x2, y1, acc2);
197 
198       /* pack input data */
199 #ifndef ARM_MATH_BIG_ENDIAN
200       x3 = __PKHBT(x2, x1, 0);
201 #else
202       x3 = __PKHBT(x1, x2, 0);
203 #endif
204 
205       /* multiply and accumulate */
206       acc1 = __SMLALDX(x3, y1, acc1);
207 
208       /* Read next two samples from scratch1 buffer */
209       x1 = read_q15x2_ia (&pScr1);
210 
211       /* multiply and accumulate */
212       acc0 = __SMLALD(x2, y2, acc0);
213       acc2 = __SMLALD(x1, y2, acc2);
214 
215       /* pack input data */
216 #ifndef ARM_MATH_BIG_ENDIAN
217       x3 = __PKHBT(x1, x2, 0);
218 #else
219       x3 = __PKHBT(x2, x1, 0);
220 #endif
221 
222       acc3 = __SMLALDX(x3, y1, acc3);
223       acc1 = __SMLALDX(x3, y2, acc1);
224 
225       x2 = read_q15x2_ia (&pScr1);
226 
227 #ifndef ARM_MATH_BIG_ENDIAN
228       x3 = __PKHBT(x2, x1, 0);
229 #else
230       x3 = __PKHBT(x1, x2, 0);
231 #endif
232 
233       acc3 = __SMLALDX(x3, y2, acc3);
234 
235       /* Decrement loop counter */
236       tapCnt--;
237     }
238 
239     /* Update scratch pointer for remaining samples of smaller length sequence */
240     pScr1 -= 4U;
241 
242     /* apply same above for remaining samples of smaller length sequence */
243     tapCnt = (srcBLen) & 3U;
244 
245     while (tapCnt > 0U)
246     {
247       /* accumulate the results */
248       acc0 += (*pScr1++ * *pIn2);
249       acc1 += (*pScr1++ * *pIn2);
250       acc2 += (*pScr1++ * *pIn2);
251       acc3 += (*pScr1++ * *pIn2++);
252 
253       pScr1 -= 3U;
254 
255       /* Decrement loop counter */
256       tapCnt--;
257     }
258 
259     blkCnt--;
260 
261 
262     /* Store the results in the accumulators in the destination buffer. */
263     *pOut = (__SSAT(acc0 >> 15U, 16));
264     pOut += inc;
265     *pOut = (__SSAT(acc1 >> 15U, 16));
266     pOut += inc;
267     *pOut = (__SSAT(acc2 >> 15U, 16));
268     pOut += inc;
269     *pOut = (__SSAT(acc3 >> 15U, 16));
270     pOut += inc;
271 
272     /* Initialization of inputB pointer */
273     pIn2 = py;
274 
275     pScratch += 4U;
276   }
277 
278 
279   /* Loop unrolling: Compute remaining outputs */
280   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
281 
282 #else
283 
284   /* Initialize blkCnt with number of samples */
285   blkCnt = (srcALen + srcBLen - 1U);
286 
287 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
288 
289   /* Calculate correlation for remaining samples of Bigger length sequence */
290   while (blkCnt > 0)
291   {
292     /* Initialze temporary scratch pointer as scratch1 */
293     pScr1 = pScratch;
294 
295     /* Clear Accumlators */
296     acc0 = 0;
297 
298     tapCnt = (srcBLen) >> 1U;
299 
300     while (tapCnt > 0U)
301     {
302 
303       /* Read next two samples from scratch1 buffer */
304       acc0 += (*pScr1++ * *pIn2++);
305       acc0 += (*pScr1++ * *pIn2++);
306 
307       /* Decrement loop counter */
308       tapCnt--;
309     }
310 
311     tapCnt = (srcBLen) & 1U;
312 
313     /* apply same above for remaining samples of smaller length sequence */
314     while (tapCnt > 0U)
315     {
316       /* accumulate the results */
317       acc0 += (*pScr1++ * *pIn2++);
318 
319       /* Decrement loop counter */
320       tapCnt--;
321     }
322 
323     blkCnt--;
324 
325     /* The result is in 2.30 format.  Convert to 1.15 with saturation.
326        Then store the output in the destination buffer. */
327     *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
328     pOut += inc;
329 
330     /* Initialization of inputB pointer */
331     pIn2 = py;
332 
333     pScratch += 1U;
334   }
335 
336 }
337 
338 /**
339   @} end of Corr group
340  */
341