1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_f32.c
4 * Description: Partial convolution of floating-point sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup PartialConv Partial Convolution
37
38 Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
39 Each function has two additional arguments.
40 <code>firstIndex</code> specifies the starting index of the subset of output samples.
41 <code>numPoints</code> is the number of output samples to compute.
42 The function computes the output in the range
43 <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.
44 The output array <code>pDst</code> contains <code>numPoints</code> values.
45
46 The allowable range of output indices is [0 srcALen+srcBLen-2].
47 If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
48 Otherwise the functions return ARM_MATH_SUCCESS.
49 \note Refer to \ref arm_conv_f32() for details on fixed point behavior.
50
51 @par Fast Versions
52 Fast versions are supported for Q31 and Q15 of partial convolution.
53 Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires
54 the input signals should be scaled down to avoid intermediate overflows.
55
56 @par Opt Versions
57 Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
58 These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions of partial convolution
59
60 @par Long versions:
61 For convolution of long vectors, those functions are
62 no more adapted and will be very slow.
63 An implementation based upon FFTs should be used.
64
65 */
66
67 /**
68 @addtogroup PartialConv
69 @{
70 */
71
72 /**
73 @brief Partial convolution of floating-point sequences.
74 @param[in] pSrcA points to the first input sequence
75 @param[in] srcALen length of the first input sequence
76 @param[in] pSrcB points to the second input sequence
77 @param[in] srcBLen length of the second input sequence
78 @param[out] pDst points to the location where the output result is written
79 @param[in] firstIndex is the first output sample to start with
80 @param[in] numPoints is the number of output points to be computed
81 @return execution status
82 - \ref ARM_MATH_SUCCESS : Operation successful
83 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
84 */
85
arm_conv_partial_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst,uint32_t firstIndex,uint32_t numPoints)86 arm_status arm_conv_partial_f32(
87 const float32_t * pSrcA,
88 uint32_t srcALen,
89 const float32_t * pSrcB,
90 uint32_t srcBLen,
91 float32_t * pDst,
92 uint32_t firstIndex,
93 uint32_t numPoints)
94 {
95 #if defined (ARM_MATH_DSP)
96 const float32_t *pIn1 = pSrcA; /* InputA pointer */
97 const float32_t *pIn2 = pSrcB; /* InputB pointer */
98 float32_t *pOut = pDst; /* Output pointer */
99 const float32_t *px; /* Intermediate inputA pointer */
100 const float32_t *py; /* Intermediate inputB pointer */
101 const float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
102 float32_t sum; /* Accumulator */
103 uint32_t j, k, count, blkCnt, check;
104 int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
105 arm_status status; /* Status of Partial convolution */
106
107 #if defined (ARM_MATH_LOOPUNROLL)
108 float32_t acc0, acc1, acc2, acc3; /* Accumulator */
109 float32_t x0, x1, x2, x3, c0; /* Temporary variables */
110 #endif
111
112 /* Check for range of output samples to be calculated */
113 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
114 {
115 /* Set status as ARM_MATH_ARGUMENT_ERROR */
116 status = ARM_MATH_ARGUMENT_ERROR;
117 }
118 else
119 {
120 /* The algorithm implementation is based on the lengths of the inputs. */
121 /* srcB is always made to slide across srcA. */
122 /* So srcBLen is always considered as shorter or equal to srcALen */
123 if (srcALen >= srcBLen)
124 {
125 /* Initialization of inputA pointer */
126 pIn1 = pSrcA;
127
128 /* Initialization of inputB pointer */
129 pIn2 = pSrcB;
130 }
131 else
132 {
133 /* Initialization of inputA pointer */
134 pIn1 = pSrcB;
135
136 /* Initialization of inputB pointer */
137 pIn2 = pSrcA;
138
139 /* srcBLen is always considered as shorter or equal to srcALen */
140 j = srcBLen;
141 srcBLen = srcALen;
142 srcALen = j;
143 }
144
145 /* Conditions to check which loopCounter holds
146 * the first and last indices of the output samples to be calculated. */
147 check = firstIndex + numPoints;
148 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
149 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
150 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
151 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
152 blockSize2 = ((int32_t) check - blockSize3) - (blockSize1 + (int32_t) firstIndex);
153 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
154
155 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
156 /* The function is internally
157 * divided into three stages according to the number of multiplications that has to be
158 * taken place between inputA samples and inputB samples. In the first stage of the
159 * algorithm, the multiplications increase by one for every iteration.
160 * In the second stage of the algorithm, srcBLen number of multiplications are done.
161 * In the third stage of the algorithm, the multiplications decrease by one
162 * for every iteration. */
163
164 /* Set the output pointer to point to the firstIndex
165 * of the output sample to be calculated. */
166 pOut = pDst + firstIndex;
167
168 /* --------------------------
169 * Initializations of stage1
170 * -------------------------*/
171
172 /* sum = x[0] * y[0]
173 * sum = x[0] * y[1] + x[1] * y[0]
174 * ....
175 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
176 */
177
178 /* In this stage the MAC operations are increased by 1 for every iteration.
179 The count variable holds the number of MAC operations performed.
180 Since the partial convolution starts from firstIndex
181 Number of Macs to be performed is firstIndex + 1 */
182 count = 1U + firstIndex;
183
184 /* Working pointer of inputA */
185 px = pIn1;
186
187 /* Working pointer of inputB */
188 pSrc1 = pIn2 + firstIndex;
189 py = pSrc1;
190
191 /* ------------------------
192 * Stage1 process
193 * ----------------------*/
194
195 /* The first stage starts here */
196 while (blockSize1 > 0)
197 {
198 /* Accumulator is made zero for every iteration */
199 sum = 0.0f;
200
201 #if defined (ARM_MATH_LOOPUNROLL)
202
203 /* Loop unrolling: Compute 4 outputs at a time */
204 k = count >> 2U;
205
206 while (k > 0U)
207 {
208 /* x[0] * y[srcBLen - 1] */
209 sum += *px++ * *py--;
210
211 /* x[1] * y[srcBLen - 2] */
212 sum += *px++ * *py--;
213
214 /* x[2] * y[srcBLen - 3] */
215 sum += *px++ * *py--;
216
217 /* x[3] * y[srcBLen - 4] */
218 sum += *px++ * *py--;
219
220 /* Decrement loop counter */
221 k--;
222 }
223
224 /* Loop unrolling: Compute remaining outputs */
225 k = count % 0x4U;
226
227 #else
228
229 /* Initialize k with number of samples */
230 k = count;
231
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233
234 while (k > 0U)
235 {
236 /* Perform the multiply-accumulate */
237 sum += *px++ * *py--;
238
239 /* Decrement loop counter */
240 k--;
241 }
242
243 /* Store the result in the accumulator in the destination buffer. */
244 *pOut++ = sum;
245
246 /* Update the inputA and inputB pointers for next MAC calculation */
247 py = ++pSrc1;
248 px = pIn1;
249
250 /* Increment MAC count */
251 count++;
252
253 /* Decrement loop counter */
254 blockSize1--;
255 }
256
257 /* --------------------------
258 * Initializations of stage2
259 * ------------------------*/
260
261 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
262 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
263 * ....
264 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
265 */
266
267 /* Working pointer of inputA */
268 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
269 {
270 pSrc1 = pIn1 + firstIndex - srcBLen + 1;
271 }
272 else
273 {
274 pSrc1 = pIn1;
275 }
276 px = pSrc1;
277
278 /* Working pointer of inputB */
279 pSrc2 = pIn2 + (srcBLen - 1U);
280 py = pSrc2;
281
282 /* count is index by which the pointer pIn1 to be incremented */
283 count = 0U;
284
285 /* -------------------
286 * Stage2 process
287 * ------------------*/
288
289 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
290 * So, to loop unroll over blockSize2,
291 * srcBLen should be greater than or equal to 4 */
292 if (srcBLen >= 4U)
293 {
294 #if defined (ARM_MATH_LOOPUNROLL)
295
296 /* Loop unrolling: Compute 4 outputs at a time */
297 blkCnt = ((uint32_t) blockSize2 >> 2U);
298
299 while (blkCnt > 0U)
300 {
301 /* Set all accumulators to zero */
302 acc0 = 0.0f;
303 acc1 = 0.0f;
304 acc2 = 0.0f;
305 acc3 = 0.0f;
306
307 /* read x[0], x[1], x[2] samples */
308 x0 = *px++;
309 x1 = *px++;
310 x2 = *px++;
311
312 /* Apply loop unrolling and compute 4 MACs simultaneously. */
313 k = srcBLen >> 2U;
314
315 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
316 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
317 do
318 {
319 /* Read y[srcBLen - 1] sample */
320 c0 = *py--;
321 /* Read x[3] sample */
322 x3 = *px++;
323
324 /* Perform the multiply-accumulate */
325 /* acc0 += x[0] * y[srcBLen - 1] */
326 acc0 += x0 * c0;
327 /* acc1 += x[1] * y[srcBLen - 1] */
328 acc1 += x1 * c0;
329 /* acc2 += x[2] * y[srcBLen - 1] */
330 acc2 += x2 * c0;
331 /* acc3 += x[3] * y[srcBLen - 1] */
332 acc3 += x3 * c0;
333
334 /* Read y[srcBLen - 2] sample */
335 c0 = *py--;
336 /* Read x[4] sample */
337 x0 = *px++;
338
339 /* Perform the multiply-accumulate */
340 /* acc0 += x[1] * y[srcBLen - 2] */
341 acc0 += x1 * c0;
342 /* acc1 += x[2] * y[srcBLen - 2] */
343 acc1 += x2 * c0;
344 /* acc2 += x[3] * y[srcBLen - 2] */
345 acc2 += x3 * c0;
346 /* acc3 += x[4] * y[srcBLen - 2] */
347 acc3 += x0 * c0;
348
349 /* Read y[srcBLen - 3] sample */
350 c0 = *py--;
351 /* Read x[5] sample */
352 x1 = *px++;
353
354 /* Perform the multiply-accumulate */
355 /* acc0 += x[2] * y[srcBLen - 3] */
356 acc0 += x2 * c0;
357 /* acc1 += x[3] * y[srcBLen - 2] */
358 acc1 += x3 * c0;
359 /* acc2 += x[4] * y[srcBLen - 2] */
360 acc2 += x0 * c0;
361 /* acc3 += x[5] * y[srcBLen - 2] */
362 acc3 += x1 * c0;
363
364 /* Read y[srcBLen - 4] sample */
365 c0 = *py--;
366 /* Read x[6] sample */
367 x2 = *px++;
368
369 /* Perform the multiply-accumulate */
370 /* acc0 += x[3] * y[srcBLen - 4] */
371 acc0 += x3 * c0;
372 /* acc1 += x[4] * y[srcBLen - 4] */
373 acc1 += x0 * c0;
374 /* acc2 += x[5] * y[srcBLen - 4] */
375 acc2 += x1 * c0;
376 /* acc3 += x[6] * y[srcBLen - 4] */
377 acc3 += x2 * c0;
378
379 } while (--k);
380
381 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
382 ** No loop unrolling is used. */
383 k = srcBLen % 0x4U;
384
385 while (k > 0U)
386 {
387 /* Read y[srcBLen - 5] sample */
388 c0 = *py--;
389 /* Read x[7] sample */
390 x3 = *px++;
391
392 /* Perform the multiply-accumulates */
393 /* acc0 += x[4] * y[srcBLen - 5] */
394 acc0 += x0 * c0;
395 /* acc1 += x[5] * y[srcBLen - 5] */
396 acc1 += x1 * c0;
397 /* acc2 += x[6] * y[srcBLen - 5] */
398 acc2 += x2 * c0;
399 /* acc3 += x[7] * y[srcBLen - 5] */
400 acc3 += x3 * c0;
401
402 /* Reuse the present samples for the next MAC */
403 x0 = x1;
404 x1 = x2;
405 x2 = x3;
406
407 /* Decrement the loop counter */
408 k--;
409 }
410
411 /* Store the result in the accumulator in the destination buffer. */
412 *pOut++ = acc0;
413 *pOut++ = acc1;
414 *pOut++ = acc2;
415 *pOut++ = acc3;
416
417 /* Increment the pointer pIn1 index, count by 4 */
418 count += 4U;
419
420 /* Update the inputA and inputB pointers for next MAC calculation */
421 px = pSrc1 + count;
422 py = pSrc2;
423
424 /* Decrement loop counter */
425 blkCnt--;
426 }
427
428 /* Loop unrolling: Compute remaining outputs */
429 blkCnt = (uint32_t) blockSize2 % 0x4U;
430
431 #else
432
433 /* Initialize blkCnt with number of samples */
434 blkCnt = blockSize2;
435
436 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
437
438 while (blkCnt > 0U)
439 {
440 /* Accumulator is made zero for every iteration */
441 sum = 0.0f;
442
443 #if defined (ARM_MATH_LOOPUNROLL)
444
445 /* Loop unrolling: Compute 4 outputs at a time */
446 k = srcBLen >> 2U;
447
448 while (k > 0U)
449 {
450 /* Perform the multiply-accumulates */
451 sum += *px++ * *py--;
452 sum += *px++ * *py--;
453 sum += *px++ * *py--;
454 sum += *px++ * *py--;
455
456 /* Decrement loop counter */
457 k--;
458 }
459
460 /* Loop unrolling: Compute remaining outputs */
461 k = srcBLen % 0x4U;
462
463 #else
464
465 /* Initialize blkCnt with number of samples */
466 k = srcBLen;
467
468 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
469
470 while (k > 0U)
471 {
472 /* Perform the multiply-accumulate */
473 sum += *px++ * *py--;
474
475 /* Decrement loop counter */
476 k--;
477 }
478
479 /* Store the result in the accumulator in the destination buffer. */
480 *pOut++ = sum;
481
482 /* Increment MAC count */
483 count++;
484
485 /* Update the inputA and inputB pointers for next MAC calculation */
486 px = pSrc1 + count;
487 py = pSrc2;
488
489 /* Decrement loop counter */
490 blkCnt--;
491 }
492 }
493 else
494 {
495 /* If the srcBLen is not a multiple of 4,
496 * the blockSize2 loop cannot be unrolled by 4 */
497 blkCnt = (uint32_t) blockSize2;
498
499 while (blkCnt > 0U)
500 {
501 /* Accumulator is made zero for every iteration */
502 sum = 0.0f;
503
504 /* srcBLen number of MACS should be performed */
505 k = srcBLen;
506
507 while (k > 0U)
508 {
509 /* Perform the multiply-accumulate */
510 sum += *px++ * *py--;
511
512 /* Decrement loop counter */
513 k--;
514 }
515
516 /* Store the result in the accumulator in the destination buffer. */
517 *pOut++ = sum;
518
519 /* Increment the MAC count */
520 count++;
521
522 /* Update the inputA and inputB pointers for next MAC calculation */
523 px = pSrc1 + count;
524 py = pSrc2;
525
526 /* Decrement the loop counter */
527 blkCnt--;
528 }
529 }
530
531
532 /* --------------------------
533 * Initializations of stage3
534 * -------------------------*/
535
536 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
537 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
538 * ....
539 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
540 * sum += x[srcALen-1] * y[srcBLen-1]
541 */
542
543 /* In this stage the MAC operations are decreased by 1 for every iteration.
544 The blockSize3 variable holds the number of MAC operations performed */
545 count = srcBLen - 1U;
546
547 /* Working pointer of inputA */
548 if (firstIndex > srcALen)
549 {
550 pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
551 }
552 else
553 {
554 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
555 }
556 px = pSrc1;
557
558 /* Working pointer of inputB */
559 pSrc2 = pIn2 + (srcBLen - 1U);
560 py = pSrc2;
561
562 /* -------------------
563 * Stage3 process
564 * ------------------*/
565
566 while (blockSize3 > 0)
567 {
568 /* Accumulator is made zero for every iteration */
569 sum = 0.0f;
570
571 #if defined (ARM_MATH_LOOPUNROLL)
572
573 /* Loop unrolling: Compute 4 outputs at a time */
574 k = count >> 2U;
575
576 while (k > 0U)
577 {
578 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
579 sum += *px++ * *py--;
580
581 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
582 sum += *px++ * *py--;
583
584 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
585 sum += *px++ * *py--;
586
587 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
588 sum += *px++ * *py--;
589
590 /* Decrement loop counter */
591 k--;
592 }
593
594 /* Loop unrolling: Compute remaining outputs */
595 k = count % 0x4U;
596
597 #else
598
599 /* Initialize blkCnt with number of samples */
600 k = count;
601
602 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
603
604 while (k > 0U)
605 {
606 /* Perform the multiply-accumulate */
607 /* sum += x[srcALen-1] * y[srcBLen-1] */
608 sum += *px++ * *py--;
609
610 /* Decrement loop counter */
611 k--;
612 }
613
614 /* Store the result in the accumulator in the destination buffer. */
615 *pOut++ = sum;
616
617 /* Update the inputA and inputB pointers for next MAC calculation */
618 px = ++pSrc1;
619 py = pSrc2;
620
621 /* Decrement MAC count */
622 count--;
623
624 /* Decrement the loop counter */
625 blockSize3--;
626 }
627
628 /* Set status as ARM_MATH_SUCCESS */
629 status = ARM_MATH_SUCCESS;
630 }
631
632 /* Return to application */
633 return (status);
634
635 #else
636 /* alternate version for CM0_FAMILY */
637
638 const float32_t *pIn1 = pSrcA; /* InputA pointer */
639 const float32_t *pIn2 = pSrcB; /* InputB pointer */
640 float32_t sum; /* Accumulator */
641 uint32_t i, j; /* Loop counters */
642 arm_status status; /* Status of Partial convolution */
643 /* Check for range of output samples to be calculated */
644 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
645 {
646 /* Set status as ARM_MATH_ARGUMENT_ERROR */
647 status = ARM_MATH_ARGUMENT_ERROR;
648 }
649 else
650 {
651 /* Loop to calculate convolution for output length number of values */
652 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
653 {
654 /* Initialize sum with zero to carry on MAC operations */
655 sum = 0.0f;
656
657 /* Loop to perform MAC operations according to convolution equation */
658 for (j = 0U; j <= i; j++)
659 {
660 /* Check the array limitations */
661 if (((i - j) < srcBLen) && (j < srcALen))
662 {
663 /* z[i] += x[i-j] * y[j] */
664 sum += ( pIn1[j] * pIn2[i - j]);
665 }
666 }
667
668 /* Store the output in the destination buffer */
669 pDst[i] = sum;
670 }
671
672 /* Set status as ARM_SUCCESS */
673 status = ARM_MATH_SUCCESS;
674 }
675
676 /* Return to application */
677 return (status);
678
679 #endif /* defined(ARM_MATH_DSP) */
680 }
681
682 /**
683 @} end of PartialConv group
684 */
685