1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_correlate_f16.c
4 * Description: Correlation of floating-point sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33 @ingroup groupFilters
34 */
35
36
37
38 /**
39 @addtogroup Corr
40 @{
41 */
42
43 /**
44 @brief Correlation of floating-point sequences.
45 @param[in] pSrcA points to the first input sequence
46 @param[in] srcALen length of the first input sequence
47 @param[in] pSrcB points to the second input sequence
48 @param[in] srcBLen length of the second input sequence
49 @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
50 */
51
52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
53
54 #include "arm_helium_utils.h"
55 #include "arm_vec_filtering.h"
56
57 #define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F16(acc0, acc1, pX, pY, count) \
58 { \
59 float16_t const *pSrcX, *pSrcY; \
60 f16x8_t acc0Vec, acc1Vec, xVec, yVec; \
61 uint32_t k; \
62 \
63 acc0Vec = vdupq_n_f16(0.0f); \
64 acc1Vec = vdupq_n_f16(0.0f); \
65 pSrcX = (float16_t const *) pX; \
66 pSrcY = (float16_t const *) pY; \
67 k = count >> 3; \
68 while (k > 0U) \
69 { \
70 xVec = vld1q(pSrcX); pSrcX += 8; \
71 yVec = vldrhq_f16(&pSrcY[-1]); \
72 acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec); \
73 yVec = vld1q(pSrcY); pSrcY += 8; \
74 acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec); \
75 /* Decrement the loop counter */ \
76 k--; \
77 } \
78 k = count % 0x8U; \
79 /* use predication to finalize MAC sum */ \
80 /* acc1 requires 1 additional sample */ \
81 /* so add 1 to unmask an extra lane in final MAC computation */ \
82 mve_pred16_t p0 = vctp16q(k+1); \
83 xVec = vld1q(pSrcX); pSrcX += 8; \
84 yVec = vldrhq_f16(&pSrcY[-1]); \
85 acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec,p0); \
86 /* acc0 requires exact number of sample */ \
87 /* disable extra lanes in final MAC computation */ \
88 p0 = vctp16q(k); \
89 yVec = vld1q(pSrcY); pSrcY += 8; \
90 acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec,p0); \
91 \
92 acc0 = vecAddAcrossF16Mve(acc0Vec); \
93 acc1 = vecAddAcrossF16Mve(acc1Vec); \
94 }
95
/**
 * MVE helper: single-accumulator dot product of `count` float16 samples.
 *   acc = sum( pX[i] * pY[i] ) for i = 0 .. count-1
 * Processes 8 lanes per iteration; the 1..7-sample tail is handled with a
 * predicated (masked) FMA so no scalar cleanup loop is needed.
 */
#define MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count)            \
{                                                               \
    float16_t const *pSrcX, *pSrcY;                             \
    f16x8_t accVec, xVec, yVec;                                 \
    uint16_t k;                                                 \
                                                                \
    accVec = vdupq_n_f16(0.0f);                                 \
    pSrcX = (float16_t const *) pX;                             \
    pSrcY = (float16_t const *) pY;                             \
    k = count >> 3;        /* full 8-lane vector iterations */  \
                                                                \
    while (k > 0U)                                              \
    {                                                           \
        yVec = vld1q(pSrcY);  pSrcY += 8;                       \
        xVec = vld1q(pSrcX);  pSrcX += 8;                       \
        accVec = vfmaq(accVec, xVec, yVec);                     \
        /* Decrement the loop counter */                        \
        k--;                                                    \
    }                                                           \
    /* Loop with tail predication expected here */              \
    k = count % 0x8U;                                           \
    if (k > 0U)                                                 \
    {                                                           \
        /* mask off the lanes beyond the remaining k samples */ \
        mve_pred16_t p0 = vctp16q(k);                           \
        yVec = vld1q(pSrcY);  pSrcY += 8;                       \
        xVec = vld1q(pSrcX);  pSrcX += 8;                       \
        accVec = vfmaq_m(accVec, xVec, yVec, p0);               \
    }                                                           \
                                                                \
    /* horizontal reduction to a scalar */                      \
    acc = vecAddAcrossF16Mve(accVec);                           \
}
127
/**
 * MVE helper: four correlation accumulators sharing one Y stream.
 *   accN = sum( pX[i+N] * pY[i] ) for i = 0 .. count-1, N = 0..3
 * Used in stage 2 where the overlap size is fixed (srcBLen) and four
 * consecutive outputs differ only by a one-sample shift of X, so the Y
 * vector is loaded once and reused against four shifted X loads.
 * The tail (count % 8) is handled with a single predicated FMA per
 * accumulator.
 */
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F16(acc0, acc1, acc2, acc3, pX, pY, count) \
{                                                                   \
    float16_t const *pSrcX, *pSrcY;                                 \
    f16x8_t acc0Vec, acc1Vec, acc2Vec, acc3Vec, xVec, yVec;         \
    uint32_t k;                                                     \
                                                                    \
    acc0Vec = vdupq_n_f16(0.0f);                                    \
    acc1Vec = vdupq_n_f16(0.0f);                                    \
    acc2Vec = vdupq_n_f16(0.0f);                                    \
    acc3Vec = vdupq_n_f16(0.0f);                                    \
    pSrcX = (float16_t const *) pX;                                 \
    pSrcY = (float16_t const *) pY;                                 \
    k = count >> 3;        /* full 8-lane vector iterations */      \
                                                                    \
    while (k > 0U)                                                  \
    {                                                               \
        yVec = vld1q(pSrcY);  pSrcY += 8;                           \
        /* unaligned reloads of X at +1/+2/+3 sample offsets */     \
        xVec = vldrhq_f16(&pSrcX[1]);                               \
        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                   \
        xVec = vldrhq_f16(&pSrcX[2]);                               \
        acc2Vec = vfmaq_f16(acc2Vec, xVec, yVec);                   \
        xVec = vldrhq_f16(&pSrcX[3]);                               \
        acc3Vec = vfmaq_f16(acc3Vec, xVec, yVec);                   \
        xVec = vld1q(pSrcX);  pSrcX += 8;                           \
        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                   \
        /* Decrement the loop counter */                            \
        k--;                                                        \
    }                                                               \
    /* loop + tail predication expected here */                     \
    k = count % 0x8U;                                               \
    if (k > 0U)                                                     \
    {                                                               \
        mve_pred16_t p0 = vctp16q(k);                               \
        yVec = vld1q(pSrcY);  pSrcY += 8;                           \
        xVec = vldrhq_f16(&pSrcX[1]);                               \
        acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);             \
        xVec = vldrhq_f16(&pSrcX[2]);                               \
        acc2Vec = vfmaq_m_f16(acc2Vec, xVec, yVec, p0);             \
        xVec = vldrhq_f16(&pSrcX[3]);                               \
        acc3Vec = vfmaq_m_f16(acc3Vec, xVec, yVec, p0);             \
        xVec = vld1q(pSrcX);  pSrcX += 8;                           \
        acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);             \
    }                                                               \
                                                                    \
    /* horizontal reductions to scalars */                          \
    acc0 = vecAddAcrossF16Mve(acc0Vec);                             \
    acc1 = vecAddAcrossF16Mve(acc1Vec);                             \
    acc2 = vecAddAcrossF16Mve(acc2Vec);                             \
    acc3 = vecAddAcrossF16Mve(acc3Vec);                             \
}
177
/**
 * MVE helper: two correlation accumulators sharing one Y stream.
 *   acc0 = sum( pX[i]   * pY[i] ) for i = 0 .. count-1
 *   acc1 = sum( pX[i+1] * pY[i] ) for i = 0 .. count-1
 * Stage-2 variant of the QUAD macro for the 2-output remainder: the
 * overlap size is fixed and the two outputs differ by a one-sample shift
 * of X.  The tail (count % 8) is handled with one predicated FMA per
 * accumulator.
 * Fix: removed a stray double semicolon after the tail Y load (harmless
 * empty statement, but a lint-level defect and inconsistent with the
 * sibling macros).
 */
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F16(acc0, acc1, pX, pY, count) \
{                                                                   \
    float16_t const *pSrcX, *pSrcY;                                 \
    f16x8_t acc0Vec, acc1Vec, xVec, yVec;                           \
    uint32_t k;                                                     \
                                                                    \
    acc0Vec = vdupq_n_f16(0.0f);                                    \
    acc1Vec = vdupq_n_f16(0.0f);                                    \
    pSrcX = (float16_t const *) pX;                                 \
    pSrcY = (float16_t const *) pY;                                 \
    k = count >> 3;        /* full 8-lane vector iterations */      \
                                                                    \
    while (k > 0U)                                                  \
    {                                                               \
        yVec = vld1q(pSrcY);  pSrcY += 8;                           \
        xVec = vldrhq_f16(&pSrcX[1]); /* X shifted by one sample */ \
        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                   \
        xVec = vld1q(pSrcX);  pSrcX += 8;                           \
        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                   \
        /* Decrement the loop counter */                            \
        k--;                                                        \
    }                                                               \
    /* loop + tail predication expected here */                     \
    k = count % 0x8U;                                               \
    if (k > 0U)                                                     \
    {                                                               \
        mve_pred16_t p0 = vctp16q(k);                               \
        yVec = vld1q(pSrcY);  pSrcY += 8;                           \
        xVec = vldrhq_f16(&pSrcX[1]);                               \
        acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);             \
        xVec = vld1q(pSrcX);  pSrcX += 8;                           \
        acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);             \
    }                                                               \
                                                                    \
    /* horizontal reductions to scalars */                          \
    acc0 = vecAddAcrossF16Mve(acc0Vec);                             \
    acc1 = vecAddAcrossF16Mve(acc1Vec);                             \
}
215
/**
 * MVE helper: two correlation accumulators with shrinking overlap.
 *   acc0 = sum( pX[i]   * pY[i] ) for i = 0 .. count-1
 *   acc1 = sum( pX[i+1] * pY[i] ) for i = 0 .. count-2  (one sample fewer)
 * Used in stage 3 where the overlap length shrinks by one per output and
 * the X pointer advances.  The vector loop covers (count-1) samples; the
 * tail predication then gives acc1 exactly (count-1) samples and unmasks
 * one extra lane (vctp16q(k+1)) for acc0, which needs `count` samples.
 */
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F16(acc0, acc1, pX, pY, count)  \
{                                                                   \
    float16_t const *pSrcX, *pSrcY;                                 \
    f16x8_t acc0Vec, acc1Vec, xVec, yVec;                           \
    uint32_t k;                                                     \
                                                                    \
    acc0Vec = vdupq_n_f16(0.0f);                                    \
    acc1Vec = vdupq_n_f16(0.0f);                                    \
    pSrcX = (float16_t const *) pX;                                 \
    pSrcY = (float16_t const *) pY;                                 \
    k = (count-1) >> 3;    /* full vector iterations over count-1 */ \
                                                                    \
    while (k > 0U)                                                  \
    {                                                               \
        yVec = vld1q(pSrcY);  pSrcY += 8;                           \
        xVec = vldrhq_f16(&pSrcX[1]); /* X shifted by one sample */ \
        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                   \
        xVec = vld1q(pSrcX);  pSrcX += 8;                           \
        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                   \
        /* Decrement the loop counter */                            \
        k--;                                                        \
    }                                                               \
    /* use predication to finalize MAC sum */                       \
    /* acc1 requires exact number of sample (count-1) */            \
    /* disable extra lanes in final MAC computation */              \
    k = (count-1) % 0x8U;                                           \
    mve_pred16_t p0 = vctp16q(k);                                   \
    yVec = vld1q(pSrcY);  pSrcY += 8;                               \
    xVec = vldrhq_f16(&pSrcX[1]);                                   \
    acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);                 \
    /* acc0 requires 1 additional sample (count) */                 \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp16q(k+1);                                              \
    xVec = vld1q(pSrcX);  pSrcX += 8;                               \
    acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);                 \
                                                                    \
    /* horizontal reductions to scalars */                          \
    acc0 = vecAddAcrossF16Mve(acc0Vec);                             \
    acc1 = vecAddAcrossF16Mve(acc1Vec);                             \
}
255
256
257
/**
 * @brief  Correlation of two float16 sequences (Helium/MVE implementation).
 *
 * @param[in]  pSrcA    points to the first input sequence
 * @param[in]  srcALen  length of the first input sequence
 * @param[in]  pSrcB    points to the second input sequence
 * @param[in]  srcBLen  length of the second input sequence
 * @param[out] pDst     output buffer, length 2 * max(srcALen, srcBLen) - 1
 *
 * The implementation normalizes so that the longer sequence plays the role
 * of A; CORR(x, y) is the reverse of CORR(y, x), so when the inputs are
 * swapped the output is written back-to-front (incr = -1).  The work is
 * split into three phases with growing / fixed / shrinking overlap, each
 * mapped onto the dual/quad MVE kernels above.
 * NOTE(review): only the non-zero (srcALen + srcBLen - 1) output samples
 * are written; when srcALen != srcBLen the zero-padding region of pDst is
 * skipped over, not cleared — presumably callers pre-zero pDst; confirm
 * against the f32 variant's contract.
 */
ARM_DSP_ATTRIBUTE void arm_correlate_f16(
  const float16_t * pSrcA,
        uint32_t srcALen,
  const float16_t * pSrcB,
        uint32_t srcBLen,
        float16_t * pDst)
{
    float16_t *pIn1 = (float16_t *)pSrcA;                  /* inputA pointer */
    float16_t *pIn2 = (float16_t *)pSrcB + (srcBLen - 1U); /* inputB pointer (last sample of B) */
    float16_t *pX;
    float16_t *pY;
    float16_t *pA;
    float16_t *pB;
    int32_t   i = 0U, j = 0;    /* loop counters */
    int32_t   inv = 2U;         /* Reverse order flag: +2 forward, -2 reversed */
    uint32_t  tot = 0U;         /* Length */
    int32_t   block1, block2, block3;
    int32_t   incr;             /* output stride: inv / 2, i.e. +1 or -1 */

    tot = ((srcALen + srcBLen) - 2U);   /* index of the last output sample */
    if (srcALen > srcBLen)
    {
        /*
         * Calculating the number of zeros to be padded to the output
         */
        j = srcALen - srcBLen;
        /*
         * Initialize the pointer after zero padding
         */
        pDst += j;
    }
    else if (srcALen < srcBLen)
    {
        /*
         * Initialization to inputB pointer
         */
        pIn1 = (float16_t *)pSrcB;
        /*
         * Initialization to the end of inputA pointer
         */
        pIn2 = (float16_t *)pSrcA + (srcALen - 1U);
        /*
         * Initialisation of the pointer after zero padding
         */
        pDst = pDst + tot;
        /*
         * Swapping the lengths
         */

        j = srcALen;
        srcALen = srcBLen;
        srcBLen = j;
        /*
         * Setting the reverse flag: output is written back-to-front
         */
        inv = -2;

    }

    /* phase sizes: growing overlap, full overlap, shrinking overlap */
    block1 = srcBLen - 1;
    block2 = srcALen - srcBLen + 1;
    block3 = srcBLen - 1;

    pA = pIn1;
    pB = pIn2;
    incr = inv / 2;

    /* Phase 1: overlap grows from 1 to srcBLen-1 samples; 2 outputs per pass */
    for (i = 0U; i <= block1 - 2; i += 2)
    {
        uint32_t count = i + 1;
        _Float16 acc0;
        _Float16 acc1;
        /*
         * compute 2 accumulators per loop
         * size is incrementing for second accumulator
         * Y pointer is decrementing for second accumulator
         */
        pX = pA;
        pY = pB;
        MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F16(acc0, acc1, pX, pY, count);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        pB -= 2;
    }
    /* Phase 1 remainder: at most one leftover output */
    for (; i < block1; i++)
    {
        uint32_t count = i + 1;
        _Float16 acc;

        pX = pA;
        pY = pB;
        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count);

        *pDst = acc;
        pDst += incr;
        pB--;
    }

    /* Phase 2: full srcBLen overlap; 4 outputs per pass */
    for (i = 0U; i <= block2 - 4; i += 4)
    {
        _Float16 acc0;
        _Float16 acc1;
        _Float16 acc2;
        _Float16 acc3;

        pX = pA;
        pY = pB;
        /*
         * compute 4 accumulators per loop
         * size is fixed for all accumulators
         * X pointer is incrementing for successive accumulators
         */
        MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F16(acc0, acc1, acc2, acc3, pX, pY, srcBLen);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        *pDst = acc2;
        pDst += incr;
        *pDst = acc3;
        pDst += incr;
        pA += 4;
    }

    /* Phase 2 remainder: 2 outputs per pass */
    for (; i <= block2 - 2; i += 2)
    {
        _Float16 acc0;
        _Float16 acc1;

        pX = pA;
        pY = pB;
        /*
         * compute 2 accumulators per loop
         * size is fixed for all accumulators
         * X pointer is incrementing for second accumulator
         */
        MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F16(acc0, acc1, pX, pY, srcBLen);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        pA += 2;
    }

    /* Phase 2 remainder: final single output if block2 is odd */
    if (block2 & 1)
    {
        _Float16 acc;

        pX = pA;
        pY = pB;
        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, srcBLen);

        *pDst = acc;
        pDst += incr;
        pA++;
    }

    /* Phase 3: overlap shrinks from srcBLen-1 down to 1; 2 outputs per pass */
    for (i = block3 - 1; i > 0; i -= 2)
    {

        uint32_t count = (i + 1);
        _Float16 acc0;
        _Float16 acc1;

        pX = pA;
        pY = pB;
        /*
         * compute 2 accumulators per loop
         * size is decrementing for second accumulator
         * X pointer is incrementing for second accumulator
         */
        MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F16(acc0, acc1, pX, pY, count);

        *pDst = acc0;
        pDst += incr;
        *pDst = acc1;
        pDst += incr;
        pA += 2;

    }
    /* Phase 3 remainder: i is 0 (one output left) or -1 (none) here */
    for (; i >= 0; i--)
    {
        uint32_t count = (i + 1);
        _Float16 acc;

        pX = pA;
        pY = pB;
        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count);

        *pDst = acc;
        pDst += incr;
        pA++;
    }
}
457
458 #else
/**
 * @brief  Correlation of two float16 sequences (scalar implementation).
 *
 * @param[in]  pSrcA    points to the first input sequence
 * @param[in]  srcALen  length of the first input sequence
 * @param[in]  pSrcB    points to the second input sequence
 * @param[in]  srcBLen  length of the second input sequence
 * @param[out] pDst     output buffer, length 2 * max(srcALen, srcBLen) - 1
 *
 * Two code paths:
 *  - ARM_MATH_DSP (and not auto-vectorized): three-stage algorithm with
 *    growing / fixed / shrinking overlap, optionally unrolled 4x.
 *  - otherwise (e.g. Cortex-M0 family): plain O(n*m) double loop.
 * NOTE(review): the zero-padding region of pDst is skipped, not written —
 * presumably callers pre-zero the output; confirm against the library docs.
 */
ARM_DSP_ATTRIBUTE void arm_correlate_f16(
  const float16_t * pSrcA,
        uint32_t srcALen,
  const float16_t * pSrcB,
        uint32_t srcBLen,
        float16_t * pDst)
{

#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE)

  const float16_t *pIn1;                               /* InputA pointer */
  const float16_t *pIn2;                               /* InputB pointer */
        float16_t *pOut = pDst;                        /* Output pointer */
  const float16_t *px;                                 /* Intermediate inputA pointer */
  const float16_t *py;                                 /* Intermediate inputB pointer */
  const float16_t *pSrc1;
        _Float16 sum;
        uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
        uint32_t j, k, count, blkCnt;                  /* Loop counters */
        uint32_t outBlockSize;                         /* Loop counter */
        int32_t inc = 1;                               /* Destination address modifier */

#if defined (ARM_MATH_LOOPUNROLL)
        _Float16 acc0, acc1, acc2, acc3, c0;           /* Accumulators */
        _Float16 x0, x1, x2, x3;                       /* temporary variables for holding input and coefficient values */
#endif

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter or equal to srcALen */
  /* But CORR(x, y) is reverse of CORR(y, x) */
  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  /* and the destination pointer modifier, inc is set to -1 */
  /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  /* But to improve the performance,
   * we assume zeroes in the output instead of zero padding either of the the inputs*/
  /* If srcALen > srcBLen,
   * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
  /* If srcALen < srcBLen,
   * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
  if (srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;

    /* Number of output samples is calculated */
    outBlockSize = (2U * srcALen) - 1U;

    /* When srcALen > srcBLen, zero padding has to be done to srcB
     * to make their lengths equal.
     * Instead, (outBlockSize - (srcALen + srcBLen - 1))
     * number of output samples are made zero */
    j = outBlockSize - (srcALen + (srcBLen - 1U));

    /* Updating the pointer position to non zero value */
    pOut += j;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* srcBLen is always considered as shorter or equal to srcALen */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = Reverse order(CORR(y, x)) */
    /* Hence set the destination pointer to point to the last output sample */
    pOut = pDst + ((srcALen + srcBLen) - 2U);

    /* Destination address modifier is set to -1 */
    inc = -1;
  }

  /* The function is internally
   * divided into three stages according to the number of multiplications that has to be
   * taken place between inputA samples and inputB samples. In the first stage of the
   * algorithm, the multiplications increase by one for every iteration.
   * In the second stage of the algorithm, srcBLen number of multiplications are done.
   * In the third stage of the algorithm, the multiplications decrease by one
   * for every iteration. */

  /* The algorithm is implemented in three stages.
     The loop counters of each stage is initiated here. */
  blockSize1 = srcBLen - 1U;
  blockSize2 = srcALen - (srcBLen - 1U);
  blockSize3 = blockSize1;

  /* --------------------------
   * Initializations of stage1
   * -------------------------*/

  /* sum = x[0] * y[srcBlen - 1]
   * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
   * ....
   * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
   */

  /* In this stage the MAC operations are increased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
  count = 1U;

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  pSrc1 = pIn2 + (srcBLen - 1U);
  py = pSrc1;

  /* ------------------------
   * Stage1 process
   * ----------------------*/

  /* The first stage starts here */
  while (blockSize1 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f16;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Compute 4 outputs at a time */
    k = count >> 2U;


    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* x[0] * y[srcBLen - 4] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* x[1] * y[srcBLen - 3] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* x[2] * y[srcBLen - 2] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* x[3] * y[srcBLen - 1] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* Decrement loop counter */
      k--;
    }

    /* Loop unrolling: Compute remaining outputs */
    k = count % 0x4U;

#else

    /* Initialize k with number of samples */
    k = count;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      /* x[0] * y[srcBLen - 1] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* Decrement loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut = sum;
    /* Destination pointer is updated according to the address modifier, inc */
    pOut += inc;

    /* Update the inputA and inputB pointers for next MAC calculation */
    py = pSrc1 - count;
    px = pIn1;

    /* Increment MAC count */
    count++;

    /* Decrement loop counter */
    blockSize1--;
  }

  /* --------------------------
   * Initializations of stage2
   * ------------------------*/

  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
   * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
   * ....
   * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
   */

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  py = pIn2;

  /* count is index by which the pointer pIn1 to be incremented */
  count = 0U;

  /* -------------------
   * Stage2 process
   * ------------------*/

  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
   * So, to loop unroll over blockSize2,
   * srcBLen should be greater than or equal to 4 */
  if (srcBLen >= 4U)
  {
#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = blockSize2 >> 2U;

    while (blkCnt > 0U)
    {
      /* Set all accumulators to zero */
      acc0 = 0.0f16;
      acc1 = 0.0f16;
      acc2 = 0.0f16;
      acc3 = 0.0f16;


      /* read x[0], x[1], x[2] samples */
      x0 = *px++;
      x1 = *px++;
      x2 = *px++;

      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      /* x0..x3 form a 4-sample sliding window over X, rotated each step so
         each X sample is loaded only once per group of four outputs. */
      do
      {
        /* Read y[0] sample */
        c0 = *(py++);
        /* Read x[3] sample */
        x3 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[0] * y[0] */
        acc0 += (_Float16)x0 * (_Float16)c0;
        /* acc1 +=  x[1] * y[0] */
        acc1 += (_Float16)x1 * (_Float16)c0;
        /* acc2 +=  x[2] * y[0] */
        acc2 += (_Float16)x2 * (_Float16)c0;
        /* acc3 +=  x[3] * y[0] */
        acc3 += (_Float16)x3 * (_Float16)c0;

        /* Read y[1] sample */
        c0 = *(py++);
        /* Read x[4] sample */
        x0 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[1] * y[1] */
        acc0 += (_Float16)x1 * (_Float16)c0;
        /* acc1 +=  x[2] * y[1] */
        acc1 += (_Float16)x2 * (_Float16)c0;
        /* acc2 +=  x[3] * y[1] */
        acc2 += (_Float16)x3 * (_Float16)c0;
        /* acc3 +=  x[4] * y[1] */
        acc3 += (_Float16)x0 * (_Float16)c0;

        /* Read y[2] sample */
        c0 = *(py++);
        /* Read x[5] sample */
        x1 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[2] * y[2] */
        acc0 += (_Float16)x2 * (_Float16)c0;
        /* acc1 +=  x[3] * y[2] */
        acc1 += (_Float16)x3 * (_Float16)c0;
        /* acc2 +=  x[4] * y[2] */
        acc2 += (_Float16)x0 * (_Float16)c0;
        /* acc3 +=  x[5] * y[2] */
        acc3 += (_Float16)x1 * (_Float16)c0;

        /* Read y[3] sample */
        c0 = *(py++);
        /* Read x[6] sample */
        x2 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[3] * y[3] */
        acc0 += (_Float16)x3 * (_Float16)c0;
        /* acc1 +=  x[4] * y[3] */
        acc1 += (_Float16)x0 * (_Float16)c0;
        /* acc2 +=  x[5] * y[3] */
        acc2 += (_Float16)x1 * (_Float16)c0;
        /* acc3 +=  x[6] * y[3] */
        acc3 += (_Float16)x2 * (_Float16)c0;

      } while (--k);

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen % 0x4U;

      while (k > 0U)
      {
        /* Read y[4] sample */
        c0 = *(py++);
        /* Read x[7] sample */
        x3 = *(px++);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[4] * y[4] */
        acc0 += (_Float16)x0 * (_Float16)c0;
        /* acc1 +=  x[5] * y[4] */
        acc1 += (_Float16)x1 * (_Float16)c0;
        /* acc2 +=  x[6] * y[4] */
        acc2 += (_Float16)x2 * (_Float16)c0;
        /* acc3 +=  x[7] * y[4] */
        acc3 += (_Float16)x3 * (_Float16)c0;

        /* Reuse the present samples for the next MAC */
        x0 = x1;
        x1 = x2;
        x2 = x3;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut = acc0;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      *pOut = acc1;
      pOut += inc;

      *pOut = acc2;
      pOut += inc;

      *pOut = acc3;
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 4 */
      count += 4U;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement loop counter */
      blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs */
    blkCnt = blockSize2 % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize2;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f16;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Compute 4 outputs at a time */
      k = srcBLen >> 2U;


      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += (_Float16)*px++ * (_Float16)*py++;
        sum += (_Float16)*px++ * (_Float16)*py++;
        sum += (_Float16)*px++ * (_Float16)*py++;
        sum += (_Float16)*px++ * (_Float16)*py++;

        /* Decrement loop counter */
        k--;
      }
      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen % 0x4U;
#else

      /* Initialize blkCnt with number of samples */
      k = srcBLen;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += (_Float16)*px++ * (_Float16)*py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut = sum;

      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 1 */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* If the srcBLen is not a multiple of 4,
     * the blockSize2 loop cannot be unrolled by 4 */
    blkCnt = blockSize2;

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f16;

      /* Loop over srcBLen */
      k = srcBLen;

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += (_Float16)*px++ * (_Float16)*py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut = sum;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 1 */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }


  /* --------------------------
   * Initializations of stage3
   * -------------------------*/

  /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
   * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
   * ....
   * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
   * sum +=  x[srcALen-1] * y[0]
   */

  /* In this stage the MAC operations are decreased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
  count = srcBLen - 1U;

  /* Working pointer of inputA */
  pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
  px = pSrc1;

  /* Working pointer of inputB */
  py = pIn2;

  /* -------------------
   * Stage3 process
   * ------------------*/

  while (blockSize3 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f16;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Compute 4 outputs at a time */
    k = count >> 2U;


    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      /* sum += x[srcALen - srcBLen + 4] * y[3] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* sum += x[srcALen - srcBLen + 3] * y[2] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* sum += x[srcALen - srcBLen + 2] * y[1] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* sum += x[srcALen - srcBLen + 1] * y[0] */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* Decrement loop counter */
      k--;
    }

    /* Loop unrolling: Compute remaining outputs */
    k = count % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    k = count;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      sum += (_Float16)*px++ * (_Float16)*py++;

      /* Decrement loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut = sum;
    /* Destination pointer is updated according to the address modifier, inc */
    pOut += inc;

    /* Update the inputA and inputB pointers for next MAC calculation */
    px = ++pSrc1;
    py = pIn2;

    /* Decrement MAC count */
    count--;

    /* Decrement the loop counter */
    blockSize3--;
  }

#else
/* alternate version for CM0_FAMILY */

  const float16_t *pIn1 = pSrcA;                       /* inputA pointer */
  const float16_t *pIn2 = pSrcB + (srcBLen - 1U);      /* inputB pointer (last sample of B) */
        _Float16 sum;                                  /* Accumulator */
        uint32_t i = 0U, j;                            /* Loop counters */
        uint32_t inv = 0U;                             /* Reverse order flag */
        uint32_t tot = 0U;                             /* Length */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter or equal to srcALen */
  /* But CORR(x, y) is reverse of CORR(y, x) */
  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  /* and a varaible, inv is set to 1 */
  /* If lengths are not equal then zero pad has to be done to make the two
   * inputs of same length. But to improve the performance, we assume zeroes
   * in the output instead of zero padding either of the the inputs*/
  /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
   * starting of the output buffer */
  /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
   * ending of the output buffer */
  /* Once the zero padding is done the remaining of the output is calcualted
   * using convolution but with the shorter signal time shifted. */

  /* Calculate the length of the remaining sequence */
  tot = ((srcALen + srcBLen) - 2U);

  if (srcALen > srcBLen)
  {
    /* Calculating the number of zeros to be padded to the output */
    j = srcALen - srcBLen;

    /* Initialise the pointer after zero padding */
    pDst += j;
  }

  else if (srcALen < srcBLen)
  {
    /* Initialization to inputB pointer */
    pIn1 = pSrcB;

    /* Initialization to the end of inputA pointer */
    pIn2 = pSrcA + (srcALen - 1U);

    /* Initialisation of the pointer after zero padding */
    pDst = pDst + tot;

    /* Swapping the lengths */
    j = srcALen;
    srcALen = srcBLen;
    srcBLen = j;

    /* Setting the reverse flag */
    inv = 1;

  }

  /* Loop to calculate convolution for output length number of times */
  /* tot + 1 == srcALen + srcBLen - 1, the number of non-zero outputs */
  for (i = 0U; i <= tot; i++)
  {
    /* Initialize sum with zero to carry out MAC operations */
    sum = 0.0f16;

    /* Loop to perform MAC operations according to convolution equation */
    for (j = 0U; j <= i; j++)
    {
      /* Check the array limitations */
      if ((((i - j) < srcBLen) && (j < srcALen)))
      {
        /* z[i] += x[i-j] * y[j] */
        sum += (_Float16)pIn1[j] * (_Float16)pIn2[-((int32_t) i - (int32_t) j)];
      }
    }

    /* Store the output in the destination buffer */
    if (inv == 1)
      *pDst-- = sum;
    else
      *pDst++ = sum;
  }

#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */

}
1108 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1109
1110 /**
1111 @} end of Corr group
1112 */
1113
1114 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
1115