1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_f64.c
4  * Description:  Correlation of floating-point sequences
5  *
6  * $Date:        10 August 2022
7  * $Revision:    V1.10.1
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of floating-point sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47   @return        none
48  */
49 
arm_correlate_f64(const float64_t * pSrcA,uint32_t srcALen,const float64_t * pSrcB,uint32_t srcBLen,float64_t * pDst)50 void arm_correlate_f64(
51     const float64_t * pSrcA,
52     uint32_t srcALen,
53     const float64_t * pSrcB,
54     uint32_t srcBLen,
55     float64_t * pDst)
56 {
57     const float64_t *pIn1;                               /* InputA pointer */
58     const float64_t *pIn2;                               /* InputB pointer */
59     float64_t *pOut = pDst;                        /* Output pointer */
60     const float64_t *px;                                 /* Intermediate inputA pointer */
61     const float64_t *py;                                 /* Intermediate inputB pointer */
62     const float64_t *pSrc1;
63     float64_t sum;
64     uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
65     uint32_t j, k, count, blkCnt;                  /* Loop counters */
66     uint32_t outBlockSize;                         /* Loop counter */
67     int32_t inc = 1;                               /* Destination address modifier */
68 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
69     float64x2_t sumV,pxV,pyV ;
70 #endif
71 
72     /* The algorithm implementation is based on the lengths of the inputs. */
73     /* srcB is always made to slide across srcA. */
74     /* So srcBLen is always considered as shorter or equal to srcALen */
75     /* But CORR(x, y) is reverse of CORR(y, x) */
76     /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
77     /* and the destination pointer modifier, inc is set to -1 */
78     /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
79     /* But to improve the performance,
80      * we assume zeroes in the output instead of zero padding either of the the inputs*/
81     /* If srcALen > srcBLen,
82      * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
83     /* If srcALen < srcBLen,
84      * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
85     if (srcALen >= srcBLen)
86     {
87         /* Initialization of inputA pointer */
88         pIn1 = pSrcA;
89 
90         /* Initialization of inputB pointer */
91         pIn2 = pSrcB;
92 
93         /* Number of output samples is calculated */
94         outBlockSize = (2U * srcALen) - 1U;
95 
96         /* When srcALen > srcBLen, zero padding has to be done to srcB
97          * to make their lengths equal.
98          * Instead, (outBlockSize - (srcALen + srcBLen - 1))
99          * number of output samples are made zero */
100         j = outBlockSize - (srcALen + (srcBLen - 1U));
101 
102         /* Updating the pointer position to non zero value */
103         pOut += j;
104     }
105     else
106     {
107         /* Initialization of inputA pointer */
108         pIn1 = pSrcB;
109 
110         /* Initialization of inputB pointer */
111         pIn2 = pSrcA;
112 
113         /* srcBLen is always considered as shorter or equal to srcALen */
114         j = srcBLen;
115         srcBLen = srcALen;
116         srcALen = j;
117 
118         /* CORR(x, y) = Reverse order(CORR(y, x)) */
119         /* Hence set the destination pointer to point to the last output sample */
120         pOut = pDst + ((srcALen + srcBLen) - 2U);
121 
122         /* Destination address modifier is set to -1 */
123         inc = -1;
124     }
125 
126     /* The function is internally
127      * divided into three stages according to the number of multiplications that has to be
128      * taken place between inputA samples and inputB samples. In the first stage of the
129      * algorithm, the multiplications increase by one for every iteration.
130      * In the second stage of the algorithm, srcBLen number of multiplications are done.
131      * In the third stage of the algorithm, the multiplications decrease by one
132      * for every iteration. */
133 
134     /* The algorithm is implemented in three stages.
135      The loop counters of each stage is initiated here. */
136     blockSize1 = srcBLen - 1U;
137     blockSize2 = srcALen - (srcBLen - 1U);
138     blockSize3 = blockSize1;
139 
140     /* --------------------------
141      * Initializations of stage1
142      * -------------------------*/
143 
144     /* sum = x[0] * y[srcBlen - 1]
145      * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
146      * ....
147      * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
148      */
149 
150     /* In this stage the MAC operations are increased by 1 for every iteration.
151      The count variable holds the number of MAC operations performed */
152     count = 1U;
153 
154     /* Working pointer of inputA */
155     px = pIn1;
156 
157     /* Working pointer of inputB */
158     pSrc1 = pIn2 + (srcBLen - 1U);
159     py = pSrc1;
160 
161     /* ------------------------
162      * Stage1 process
163      * ----------------------*/
164 
165     /* The first stage starts here */
166     while (blockSize1 > 0U)
167     {
168         /* Accumulator is made zero for every iteration */
169         sum = 0.;
170 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
171         sumV = vdupq_n_f64(0.0);
172         k = count >> 1U ;
173 
174         while(k > 0U)
175         {
176             pxV = vld1q_f64(px);
177             pyV = vld1q_f64(py);
178             sumV = vmlaq_f64(sumV, pxV, pyV);
179             px+=2;
180             py+=2;
181             k--;
182         }
183         sum =vaddvq_f64(sumV);
184         k = count & 1 ;
185 #else
186         /* Initialize k with number of samples */
187         k = count;
188 #endif
189         while (k > 0U)
190         {
191             /* Perform the multiply-accumulate */
192             /* x[0] * y[srcBLen - 1] */
193             sum += *px++ * *py++;
194 
195             /* Decrement loop counter */
196             k--;
197         }
198 
199         /* Store the result in the accumulator in the destination buffer. */
200 
201         *pOut = sum;
202         /* Destination pointer is updated according to the address modifier, inc */
203         pOut += inc;
204 
205         /* Update the inputA and inputB pointers for next MAC calculation */
206         py = pSrc1 - count;
207         px = pIn1;
208 
209         /* Increment MAC count */
210         count++;
211 
212         /* Decrement loop counter */
213         blockSize1--;
214 
215     }
216 
217     /* --------------------------
218      * Initializations of stage2
219      * ------------------------*/
220 
221     /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
222      * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen]   * y[srcBLen-1]
223      * ....
224      * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
225      */
226 
227     /* Working pointer of inputA */
228     px = pIn1;
229 
230     /* Working pointer of inputB */
231     py = pIn2;
232 
233     /* count is index by which the pointer pIn1 to be incremented */
234     count = 0U;
235 
236     /* -------------------
237      * Stage2 process
238      * ------------------*/
239 
240     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
241      * So, to loop unroll over blockSize2,
242      * srcBLen should be greater than or equal to 4 */
243     if (srcBLen >= 4U)
244     {
245         /* Initialize blkCnt with number of samples */
246         blkCnt = blockSize2;
247 
248         while (blkCnt > 0U)
249         {
250             /* Accumulator is made zero for every iteration */
251             sum = 0.;
252 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
253             sumV = vdupq_n_f64(0.0);
254             k = srcBLen >> 1U ;
255             while(k > 0U)
256             {
257                 pxV = vld1q_f64(px);
258                 pyV = vld1q_f64(py);
259                 sumV = vmlaq_f64(sumV, pxV, pyV);
260                 px+=2;
261                 py+=2;
262                 k--;
263             }
264             sum =vaddvq_f64(sumV);
265             k = srcBLen & 1 ;
266 
267 #else
268 
269             /* Initialize blkCnt with number of samples */
270             k = srcBLen;
271 #endif
272             while (k > 0U)
273             {
274                 /* Perform the multiply-accumulate */
275                 sum += *px++ * *py++;
276 
277                 /* Decrement the loop counter */
278                 k--;
279             }
280 
281             /* Store the result in the accumulator in the destination buffer. */
282             *pOut = sum;
283 
284             /* Destination pointer is updated according to the address modifier, inc */
285             pOut += inc;
286 
287             /* Increment the pointer pIn1 index, count by 1 */
288             count++;
289 
290             /* Update the inputA and inputB pointers for next MAC calculation */
291             px = pIn1 + count;
292             py = pIn2;
293 
294             /* Decrement the loop counter */
295             blkCnt--;
296         }
297     }
298     else
299     {
300         /* If the srcBLen is not a multiple of 4,
301          * the blockSize2 loop cannot be unrolled by 4 */
302         blkCnt = blockSize2;
303 
304         while (blkCnt > 0U)
305         {
306             /* Accumulator is made zero for every iteration */
307             sum = 0.;
308 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
309             sumV = vdupq_n_f64(0.0);
310             k = srcBLen >> 1U ;
311             while(k > 0U)
312             {
313                 pxV = vld1q_f64(px);
314                 pyV = vld1q_f64(py);
315                 sumV = vmlaq_f64(sumV, pxV, pyV);
316                 px+=2;
317                 py+=2;
318                 k--;
319             }
320             sum =vaddvq_f64(sumV);
321             k = srcBLen & 1 ;
322 
323 #else
324 
325             /* Loop over srcBLen */
326             k = srcBLen;
327 #endif
328             while (k > 0U)
329             {
330                 /* Perform the multiply-accumulate */
331                 sum += *px++ * *py++;
332 
333                 /* Decrement the loop counter */
334                 k--;
335             }
336 
337             /* Store the result in the accumulator in the destination buffer. */
338             *pOut = sum;
339             /* Destination pointer is updated according to the address modifier, inc */
340             pOut += inc;
341 
342             /* Increment the pointer pIn1 index, count by 1 */
343             count++;
344 
345             /* Update the inputA and inputB pointers for next MAC calculation */
346             px = pIn1 + count;
347             py = pIn2;
348 
349             /* Decrement the loop counter */
350             blkCnt--;
351         }
352     }
353 
354 
355     /* --------------------------
356      * Initializations of stage3
357      * -------------------------*/
358 
359     /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
360      * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
361      * ....
362      * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
363      * sum +=  x[srcALen-1] * y[0]
364      */
365 
366     /* In this stage the MAC operations are decreased by 1 for every iteration.
367      The count variable holds the number of MAC operations performed */
368     count = srcBLen - 1U;
369 
370     /* Working pointer of inputA */
371     pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
372     px = pSrc1;
373 
374     /* Working pointer of inputB */
375     py = pIn2;
376 
377     /* -------------------
378      * Stage3 process
379      * ------------------*/
380 
381     while (blockSize3 > 0U)
382     {
383         /* Accumulator is made zero for every iteration */
384         sum = 0.;
385 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
386         sumV = vdupq_n_f64(0.0);
387         k = count >> 1U ;
388 
389         while(k > 0U)
390         {
391             pxV = vld1q_f64(px);
392             pyV = vld1q_f64(py);
393             sumV = vmlaq_f64(sumV, pxV, pyV);
394             px+=2;
395             py+=2;
396             k--;
397         }
398         sum =vaddvq_f64(sumV);
399         k = count & 1 ;
400 #else
401 
402         /* Initialize blkCnt with number of samples */
403         k = count;
404 #endif
405         while (k > 0U)
406         {
407             /* Perform the multiply-accumulate */
408             sum += *px++ * *py++;
409 
410             /* Decrement loop counter */
411             k--;
412         }
413 
414         /* Store the result in the accumulator in the destination buffer. */
415         *pOut = sum;
416         /* Destination pointer is updated according to the address modifier, inc */
417         pOut += inc;
418 
419         /* Update the inputA and inputB pointers for next MAC calculation */
420         px = ++pSrc1;
421         py = pIn2;
422 
423         /* Decrement MAC count */
424         count--;
425 
426         /* Decrement the loop counter */
427         blockSize3--;
428     }
429 }
430 
431 /**
432   @} end of Corr group
433  */
434