1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_f64.c
4  * Description:  Correlation of floating-point sequences
5  *
6  * $Date:        10 August 2022
7  * $Revision:    V1.10.1
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of floating-point sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47  */
48 
arm_correlate_f64(const float64_t * pSrcA,uint32_t srcALen,const float64_t * pSrcB,uint32_t srcBLen,float64_t * pDst)49 ARM_DSP_ATTRIBUTE void arm_correlate_f64(
50     const float64_t * pSrcA,
51     uint32_t srcALen,
52     const float64_t * pSrcB,
53     uint32_t srcBLen,
54     float64_t * pDst)
55 {
56     const float64_t *pIn1;                               /* InputA pointer */
57     const float64_t *pIn2;                               /* InputB pointer */
58     float64_t *pOut = pDst;                        /* Output pointer */
59     const float64_t *px;                                 /* Intermediate inputA pointer */
60     const float64_t *py;                                 /* Intermediate inputB pointer */
61     const float64_t *pSrc1;
62     float64_t sum;
63     uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
64     uint32_t j, k, count, blkCnt;                  /* Loop counters */
65     uint32_t outBlockSize;                         /* Loop counter */
66     int32_t inc = 1;                               /* Destination address modifier */
67 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
68     float64x2_t sumV,pxV,pyV ;
69 #endif
70 
71     /* The algorithm implementation is based on the lengths of the inputs. */
72     /* srcB is always made to slide across srcA. */
73     /* So srcBLen is always considered as shorter or equal to srcALen */
74     /* But CORR(x, y) is reverse of CORR(y, x) */
75     /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
76     /* and the destination pointer modifier, inc is set to -1 */
77     /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
78     /* But to improve the performance,
79      * we assume zeroes in the output instead of zero padding either of the the inputs*/
80     /* If srcALen > srcBLen,
81      * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
82     /* If srcALen < srcBLen,
83      * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
84     if (srcALen >= srcBLen)
85     {
86         /* Initialization of inputA pointer */
87         pIn1 = pSrcA;
88 
89         /* Initialization of inputB pointer */
90         pIn2 = pSrcB;
91 
92         /* Number of output samples is calculated */
93         outBlockSize = (2U * srcALen) - 1U;
94 
95         /* When srcALen > srcBLen, zero padding has to be done to srcB
96          * to make their lengths equal.
97          * Instead, (outBlockSize - (srcALen + srcBLen - 1))
98          * number of output samples are made zero */
99         j = outBlockSize - (srcALen + (srcBLen - 1U));
100 
101         /* Updating the pointer position to non zero value */
102         pOut += j;
103     }
104     else
105     {
106         /* Initialization of inputA pointer */
107         pIn1 = pSrcB;
108 
109         /* Initialization of inputB pointer */
110         pIn2 = pSrcA;
111 
112         /* srcBLen is always considered as shorter or equal to srcALen */
113         j = srcBLen;
114         srcBLen = srcALen;
115         srcALen = j;
116 
117         /* CORR(x, y) = Reverse order(CORR(y, x)) */
118         /* Hence set the destination pointer to point to the last output sample */
119         pOut = pDst + ((srcALen + srcBLen) - 2U);
120 
121         /* Destination address modifier is set to -1 */
122         inc = -1;
123     }
124 
125     /* The function is internally
126      * divided into three stages according to the number of multiplications that has to be
127      * taken place between inputA samples and inputB samples. In the first stage of the
128      * algorithm, the multiplications increase by one for every iteration.
129      * In the second stage of the algorithm, srcBLen number of multiplications are done.
130      * In the third stage of the algorithm, the multiplications decrease by one
131      * for every iteration. */
132 
133     /* The algorithm is implemented in three stages.
134      The loop counters of each stage is initiated here. */
135     blockSize1 = srcBLen - 1U;
136     blockSize2 = srcALen - (srcBLen - 1U);
137     blockSize3 = blockSize1;
138 
139     /* --------------------------
140      * Initializations of stage1
141      * -------------------------*/
142 
143     /* sum = x[0] * y[srcBlen - 1]
144      * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
145      * ....
146      * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
147      */
148 
149     /* In this stage the MAC operations are increased by 1 for every iteration.
150      The count variable holds the number of MAC operations performed */
151     count = 1U;
152 
153     /* Working pointer of inputA */
154     px = pIn1;
155 
156     /* Working pointer of inputB */
157     pSrc1 = pIn2 + (srcBLen - 1U);
158     py = pSrc1;
159 
160     /* ------------------------
161      * Stage1 process
162      * ----------------------*/
163 
164     /* The first stage starts here */
165     while (blockSize1 > 0U)
166     {
167         /* Accumulator is made zero for every iteration */
168         sum = 0.;
169 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
170         sumV = vdupq_n_f64(0.0);
171         k = count >> 1U ;
172 
173         while(k > 0U)
174         {
175             pxV = vld1q_f64(px);
176             pyV = vld1q_f64(py);
177             sumV = vmlaq_f64(sumV, pxV, pyV);
178             px+=2;
179             py+=2;
180             k--;
181         }
182         sum =vaddvq_f64(sumV);
183         k = count & 1 ;
184 #else
185         /* Initialize k with number of samples */
186         k = count;
187 #endif
188         while (k > 0U)
189         {
190             /* Perform the multiply-accumulate */
191             /* x[0] * y[srcBLen - 1] */
192             sum += *px++ * *py++;
193 
194             /* Decrement loop counter */
195             k--;
196         }
197 
198         /* Store the result in the accumulator in the destination buffer. */
199 
200         *pOut = sum;
201         /* Destination pointer is updated according to the address modifier, inc */
202         pOut += inc;
203 
204         /* Update the inputA and inputB pointers for next MAC calculation */
205         py = pSrc1 - count;
206         px = pIn1;
207 
208         /* Increment MAC count */
209         count++;
210 
211         /* Decrement loop counter */
212         blockSize1--;
213 
214     }
215 
216     /* --------------------------
217      * Initializations of stage2
218      * ------------------------*/
219 
220     /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
221      * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen]   * y[srcBLen-1]
222      * ....
223      * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
224      */
225 
226     /* Working pointer of inputA */
227     px = pIn1;
228 
229     /* Working pointer of inputB */
230     py = pIn2;
231 
232     /* count is index by which the pointer pIn1 to be incremented */
233     count = 0U;
234 
235     /* -------------------
236      * Stage2 process
237      * ------------------*/
238 
239     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
240      * So, to loop unroll over blockSize2,
241      * srcBLen should be greater than or equal to 4 */
242     if (srcBLen >= 4U)
243     {
244         /* Initialize blkCnt with number of samples */
245         blkCnt = blockSize2;
246 
247         while (blkCnt > 0U)
248         {
249             /* Accumulator is made zero for every iteration */
250             sum = 0.;
251 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
252             sumV = vdupq_n_f64(0.0);
253             k = srcBLen >> 1U ;
254             while(k > 0U)
255             {
256                 pxV = vld1q_f64(px);
257                 pyV = vld1q_f64(py);
258                 sumV = vmlaq_f64(sumV, pxV, pyV);
259                 px+=2;
260                 py+=2;
261                 k--;
262             }
263             sum =vaddvq_f64(sumV);
264             k = srcBLen & 1 ;
265 
266 #else
267 
268             /* Initialize blkCnt with number of samples */
269             k = srcBLen;
270 #endif
271             while (k > 0U)
272             {
273                 /* Perform the multiply-accumulate */
274                 sum += *px++ * *py++;
275 
276                 /* Decrement the loop counter */
277                 k--;
278             }
279 
280             /* Store the result in the accumulator in the destination buffer. */
281             *pOut = sum;
282 
283             /* Destination pointer is updated according to the address modifier, inc */
284             pOut += inc;
285 
286             /* Increment the pointer pIn1 index, count by 1 */
287             count++;
288 
289             /* Update the inputA and inputB pointers for next MAC calculation */
290             px = pIn1 + count;
291             py = pIn2;
292 
293             /* Decrement the loop counter */
294             blkCnt--;
295         }
296     }
297     else
298     {
299         /* If the srcBLen is not a multiple of 4,
300          * the blockSize2 loop cannot be unrolled by 4 */
301         blkCnt = blockSize2;
302 
303         while (blkCnt > 0U)
304         {
305             /* Accumulator is made zero for every iteration */
306             sum = 0.;
307 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
308             sumV = vdupq_n_f64(0.0);
309             k = srcBLen >> 1U ;
310             while(k > 0U)
311             {
312                 pxV = vld1q_f64(px);
313                 pyV = vld1q_f64(py);
314                 sumV = vmlaq_f64(sumV, pxV, pyV);
315                 px+=2;
316                 py+=2;
317                 k--;
318             }
319             sum =vaddvq_f64(sumV);
320             k = srcBLen & 1 ;
321 
322 #else
323 
324             /* Loop over srcBLen */
325             k = srcBLen;
326 #endif
327             while (k > 0U)
328             {
329                 /* Perform the multiply-accumulate */
330                 sum += *px++ * *py++;
331 
332                 /* Decrement the loop counter */
333                 k--;
334             }
335 
336             /* Store the result in the accumulator in the destination buffer. */
337             *pOut = sum;
338             /* Destination pointer is updated according to the address modifier, inc */
339             pOut += inc;
340 
341             /* Increment the pointer pIn1 index, count by 1 */
342             count++;
343 
344             /* Update the inputA and inputB pointers for next MAC calculation */
345             px = pIn1 + count;
346             py = pIn2;
347 
348             /* Decrement the loop counter */
349             blkCnt--;
350         }
351     }
352 
353 
354     /* --------------------------
355      * Initializations of stage3
356      * -------------------------*/
357 
358     /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
359      * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
360      * ....
361      * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
362      * sum +=  x[srcALen-1] * y[0]
363      */
364 
365     /* In this stage the MAC operations are decreased by 1 for every iteration.
366      The count variable holds the number of MAC operations performed */
367     count = srcBLen - 1U;
368 
369     /* Working pointer of inputA */
370     pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
371     px = pSrc1;
372 
373     /* Working pointer of inputB */
374     py = pIn2;
375 
376     /* -------------------
377      * Stage3 process
378      * ------------------*/
379 
380     while (blockSize3 > 0U)
381     {
382         /* Accumulator is made zero for every iteration */
383         sum = 0.;
384 #if defined(ARM_MATH_NEON) && defined(__aarch64__)
385         sumV = vdupq_n_f64(0.0);
386         k = count >> 1U ;
387 
388         while(k > 0U)
389         {
390             pxV = vld1q_f64(px);
391             pyV = vld1q_f64(py);
392             sumV = vmlaq_f64(sumV, pxV, pyV);
393             px+=2;
394             py+=2;
395             k--;
396         }
397         sum =vaddvq_f64(sumV);
398         k = count & 1 ;
399 #else
400 
401         /* Initialize blkCnt with number of samples */
402         k = count;
403 #endif
404         while (k > 0U)
405         {
406             /* Perform the multiply-accumulate */
407             sum += *px++ * *py++;
408 
409             /* Decrement loop counter */
410             k--;
411         }
412 
413         /* Store the result in the accumulator in the destination buffer. */
414         *pOut = sum;
415         /* Destination pointer is updated according to the address modifier, inc */
416         pOut += inc;
417 
418         /* Update the inputA and inputB pointers for next MAC calculation */
419         px = ++pSrc1;
420         py = pIn2;
421 
422         /* Decrement MAC count */
423         count--;
424 
425         /* Decrement the loop counter */
426         blockSize3--;
427     }
428 }
429 
430 /**
431   @} end of Corr group
432  */
433