1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_partial_f32.c
4  * Description:  Partial convolution of floating-point sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup PartialConv Partial Convolution
37 
38   Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
39   Each function has two additional arguments.
40   <code>firstIndex</code> specifies the starting index of the subset of output samples.
41   <code>numPoints</code> is the number of output samples to compute.
42   The function computes the output in the range
43   <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.
44   The output array <code>pDst</code> contains <code>numPoints</code> values.
45 
46   The allowable range of output indices is [0 srcALen+srcBLen-2].
47   If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
48   Otherwise the functions return ARM_MATH_SUCCESS.
49   \note Refer to \ref arm_conv_f32() for details on fixed point behavior.
50 
51   @par           Fast Versions
52                    Fast versions are supported for Q31 and Q15 of partial convolution.
53                    Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires
54                    the input signals should be scaled down to avoid intermediate overflows.
55 
56   @par           Opt Versions
57                    Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
58                    These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions of partial convolution
59 
60   @par           Long versions:
61                    For convolution of long vectors, those functions are
62                    no more adapted and will be very slow.
63                    An implementation based upon FFTs should be used.
64 
65  */
66 
67 /**
68   @addtogroup PartialConv
69   @{
70  */
71 
72 /**
73   @brief         Partial convolution of floating-point sequences.
74   @param[in]     pSrcA       points to the first input sequence
75   @param[in]     srcALen     length of the first input sequence
76   @param[in]     pSrcB       points to the second input sequence
77   @param[in]     srcBLen     length of the second input sequence
78   @param[out]    pDst        points to the location where the output result is written
79   @param[in]     firstIndex  is the first output sample to start with
80   @param[in]     numPoints   is the number of output points to be computed
81   @return        execution status
82                    - \ref ARM_MATH_SUCCESS        : Operation successful
83                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
84  */
85 
arm_conv_partial_f32(const float32_t * pSrcA,uint32_t srcALen,const float32_t * pSrcB,uint32_t srcBLen,float32_t * pDst,uint32_t firstIndex,uint32_t numPoints)86 arm_status arm_conv_partial_f32(
87   const float32_t * pSrcA,
88         uint32_t srcALen,
89   const float32_t * pSrcB,
90         uint32_t srcBLen,
91         float32_t * pDst,
92         uint32_t firstIndex,
93         uint32_t numPoints)
94 {
95 #if defined (ARM_MATH_DSP)
96   const float32_t *pIn1 = pSrcA;                       /* InputA pointer */
97   const float32_t *pIn2 = pSrcB;                       /* InputB pointer */
98         float32_t *pOut = pDst;                        /* Output pointer */
99   const float32_t *px;                                 /* Intermediate inputA pointer */
100   const float32_t *py;                                 /* Intermediate inputB pointer */
101   const float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
102         float32_t sum;                                 /* Accumulator */
103         uint32_t j, k, count, blkCnt, check;
104         int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
105         arm_status status;                             /* Status of Partial convolution */
106 
107 #if defined (ARM_MATH_LOOPUNROLL)
108         float32_t acc0, acc1, acc2, acc3;              /* Accumulator */
109         float32_t x0, x1, x2, x3, c0;                  /* Temporary variables */
110 #endif
111 
112   /* Check for range of output samples to be calculated */
113   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
114   {
115     /* Set status as ARM_MATH_ARGUMENT_ERROR */
116     status = ARM_MATH_ARGUMENT_ERROR;
117   }
118   else
119   {
120     /* The algorithm implementation is based on the lengths of the inputs. */
121     /* srcB is always made to slide across srcA. */
122     /* So srcBLen is always considered as shorter or equal to srcALen */
123     if (srcALen >= srcBLen)
124     {
125       /* Initialization of inputA pointer */
126       pIn1 = pSrcA;
127 
128       /* Initialization of inputB pointer */
129       pIn2 = pSrcB;
130     }
131     else
132     {
133       /* Initialization of inputA pointer */
134       pIn1 = pSrcB;
135 
136       /* Initialization of inputB pointer */
137       pIn2 = pSrcA;
138 
139       /* srcBLen is always considered as shorter or equal to srcALen */
140       j = srcBLen;
141       srcBLen = srcALen;
142       srcALen = j;
143     }
144 
145     /* Conditions to check which loopCounter holds
146      * the first and last indices of the output samples to be calculated. */
147     check = firstIndex + numPoints;
148     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
149     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
150     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
151     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
152     blockSize2 = ((int32_t) check - blockSize3) - (blockSize1 + (int32_t) firstIndex);
153     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
154 
155     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
156     /* The function is internally
157      * divided into three stages according to the number of multiplications that has to be
158      * taken place between inputA samples and inputB samples. In the first stage of the
159      * algorithm, the multiplications increase by one for every iteration.
160      * In the second stage of the algorithm, srcBLen number of multiplications are done.
161      * In the third stage of the algorithm, the multiplications decrease by one
162      * for every iteration. */
163 
164     /* Set the output pointer to point to the firstIndex
165      * of the output sample to be calculated. */
166     pOut = pDst + firstIndex;
167 
168     /* --------------------------
169      * Initializations of stage1
170      * -------------------------*/
171 
172     /* sum = x[0] * y[0]
173      * sum = x[0] * y[1] + x[1] * y[0]
174      * ....
175      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
176      */
177 
178     /* In this stage the MAC operations are increased by 1 for every iteration.
179        The count variable holds the number of MAC operations performed.
180        Since the partial convolution starts from firstIndex
181        Number of Macs to be performed is firstIndex + 1 */
182     count = 1U + firstIndex;
183 
184     /* Working pointer of inputA */
185     px = pIn1;
186 
187     /* Working pointer of inputB */
188     pSrc1 = pIn2 + firstIndex;
189     py = pSrc1;
190 
191     /* ------------------------
192      * Stage1 process
193      * ----------------------*/
194 
195     /* The first stage starts here */
196     while (blockSize1 > 0)
197     {
198       /* Accumulator is made zero for every iteration */
199       sum = 0.0f;
200 
201 #if defined (ARM_MATH_LOOPUNROLL)
202 
203       /* Loop unrolling: Compute 4 outputs at a time */
204       k = count >> 2U;
205 
206       while (k > 0U)
207       {
208         /* x[0] * y[srcBLen - 1] */
209         sum += *px++ * *py--;
210 
211         /* x[1] * y[srcBLen - 2] */
212         sum += *px++ * *py--;
213 
214         /* x[2] * y[srcBLen - 3] */
215         sum += *px++ * *py--;
216 
217         /* x[3] * y[srcBLen - 4] */
218         sum += *px++ * *py--;
219 
220         /* Decrement loop counter */
221         k--;
222       }
223 
224       /* Loop unrolling: Compute remaining outputs */
225       k = count % 0x4U;
226 
227 #else
228 
229       /* Initialize k with number of samples */
230       k = count;
231 
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233 
234       while (k > 0U)
235       {
236         /* Perform the multiply-accumulate */
237         sum += *px++ * *py--;
238 
239         /* Decrement loop counter */
240         k--;
241       }
242 
243       /* Store the result in the accumulator in the destination buffer. */
244       *pOut++ = sum;
245 
246       /* Update the inputA and inputB pointers for next MAC calculation */
247       py = ++pSrc1;
248       px = pIn1;
249 
250       /* Increment MAC count */
251       count++;
252 
253       /* Decrement loop counter */
254       blockSize1--;
255     }
256 
257     /* --------------------------
258      * Initializations of stage2
259      * ------------------------*/
260 
261     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
262      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
263      * ....
264      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
265      */
266 
267     /* Working pointer of inputA */
268     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
269     {
270       pSrc1 = pIn1 + firstIndex - srcBLen + 1;
271     }
272     else
273     {
274       pSrc1 = pIn1;
275     }
276     px = pSrc1;
277 
278     /* Working pointer of inputB */
279     pSrc2 = pIn2 + (srcBLen - 1U);
280     py = pSrc2;
281 
282     /* count is index by which the pointer pIn1 to be incremented */
283     count = 0U;
284 
285     /* -------------------
286      * Stage2 process
287      * ------------------*/
288 
289     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
290      * So, to loop unroll over blockSize2,
291      * srcBLen should be greater than or equal to 4 */
292     if (srcBLen >= 4U)
293     {
294 #if defined (ARM_MATH_LOOPUNROLL)
295 
296       /* Loop unrolling: Compute 4 outputs at a time */
297       blkCnt = ((uint32_t) blockSize2 >> 2U);
298 
299       while (blkCnt > 0U)
300       {
301         /* Set all accumulators to zero */
302         acc0 = 0.0f;
303         acc1 = 0.0f;
304         acc2 = 0.0f;
305         acc3 = 0.0f;
306 
307         /* read x[0], x[1], x[2] samples */
308         x0 = *px++;
309         x1 = *px++;
310         x2 = *px++;
311 
312         /* Apply loop unrolling and compute 4 MACs simultaneously. */
313         k = srcBLen >> 2U;
314 
315         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
316          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
317         do
318         {
319           /* Read y[srcBLen - 1] sample */
320           c0 = *py--;
321           /* Read x[3] sample */
322           x3 = *px++;
323 
324           /* Perform the multiply-accumulate */
325           /* acc0 +=  x[0] * y[srcBLen - 1] */
326           acc0 += x0 * c0;
327           /* acc1 +=  x[1] * y[srcBLen - 1] */
328           acc1 += x1 * c0;
329           /* acc2 +=  x[2] * y[srcBLen - 1] */
330           acc2 += x2 * c0;
331           /* acc3 +=  x[3] * y[srcBLen - 1] */
332           acc3 += x3 * c0;
333 
334           /* Read y[srcBLen - 2] sample */
335           c0 = *py--;
336           /* Read x[4] sample */
337           x0 = *px++;
338 
339           /* Perform the multiply-accumulate */
340           /* acc0 +=  x[1] * y[srcBLen - 2] */
341           acc0 += x1 * c0;
342           /* acc1 +=  x[2] * y[srcBLen - 2] */
343           acc1 += x2 * c0;
344           /* acc2 +=  x[3] * y[srcBLen - 2] */
345           acc2 += x3 * c0;
346           /* acc3 +=  x[4] * y[srcBLen - 2] */
347           acc3 += x0 * c0;
348 
349           /* Read y[srcBLen - 3] sample */
350           c0 = *py--;
351           /* Read x[5] sample */
352           x1 = *px++;
353 
354           /* Perform the multiply-accumulate */
355           /* acc0 +=  x[2] * y[srcBLen - 3] */
356           acc0 += x2 * c0;
357           /* acc1 +=  x[3] * y[srcBLen - 2] */
358           acc1 += x3 * c0;
359           /* acc2 +=  x[4] * y[srcBLen - 2] */
360           acc2 += x0 * c0;
361           /* acc3 +=  x[5] * y[srcBLen - 2] */
362           acc3 += x1 * c0;
363 
364           /* Read y[srcBLen - 4] sample */
365           c0 = *py--;
366           /* Read x[6] sample */
367           x2 = *px++;
368 
369           /* Perform the multiply-accumulate */
370           /* acc0 +=  x[3] * y[srcBLen - 4] */
371           acc0 += x3 * c0;
372           /* acc1 +=  x[4] * y[srcBLen - 4] */
373           acc1 += x0 * c0;
374           /* acc2 +=  x[5] * y[srcBLen - 4] */
375           acc2 += x1 * c0;
376           /* acc3 +=  x[6] * y[srcBLen - 4] */
377           acc3 += x2 * c0;
378 
379         } while (--k);
380 
381         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
382          ** No loop unrolling is used. */
383         k = srcBLen % 0x4U;
384 
385         while (k > 0U)
386         {
387           /* Read y[srcBLen - 5] sample */
388           c0 = *py--;
389           /* Read x[7] sample */
390           x3 = *px++;
391 
392           /* Perform the multiply-accumulates */
393           /* acc0 +=  x[4] * y[srcBLen - 5] */
394           acc0 += x0 * c0;
395           /* acc1 +=  x[5] * y[srcBLen - 5] */
396           acc1 += x1 * c0;
397           /* acc2 +=  x[6] * y[srcBLen - 5] */
398           acc2 += x2 * c0;
399           /* acc3 +=  x[7] * y[srcBLen - 5] */
400           acc3 += x3 * c0;
401 
402           /* Reuse the present samples for the next MAC */
403           x0 = x1;
404           x1 = x2;
405           x2 = x3;
406 
407           /* Decrement the loop counter */
408           k--;
409         }
410 
411         /* Store the result in the accumulator in the destination buffer. */
412         *pOut++ = acc0;
413         *pOut++ = acc1;
414         *pOut++ = acc2;
415         *pOut++ = acc3;
416 
417         /* Increment the pointer pIn1 index, count by 4 */
418         count += 4U;
419 
420         /* Update the inputA and inputB pointers for next MAC calculation */
421         px = pSrc1 + count;
422         py = pSrc2;
423 
424         /* Decrement loop counter */
425         blkCnt--;
426       }
427 
428       /* Loop unrolling: Compute remaining outputs */
429       blkCnt = (uint32_t) blockSize2 % 0x4U;
430 
431 #else
432 
433       /* Initialize blkCnt with number of samples */
434       blkCnt = blockSize2;
435 
436 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
437 
438       while (blkCnt > 0U)
439       {
440         /* Accumulator is made zero for every iteration */
441         sum = 0.0f;
442 
443 #if defined (ARM_MATH_LOOPUNROLL)
444 
445         /* Loop unrolling: Compute 4 outputs at a time */
446         k = srcBLen >> 2U;
447 
448         while (k > 0U)
449         {
450           /* Perform the multiply-accumulates */
451           sum += *px++ * *py--;
452           sum += *px++ * *py--;
453           sum += *px++ * *py--;
454           sum += *px++ * *py--;
455 
456           /* Decrement loop counter */
457           k--;
458         }
459 
460         /* Loop unrolling: Compute remaining outputs */
461         k = srcBLen % 0x4U;
462 
463 #else
464 
465         /* Initialize blkCnt with number of samples */
466         k = srcBLen;
467 
468 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
469 
470         while (k > 0U)
471         {
472           /* Perform the multiply-accumulate */
473           sum += *px++ * *py--;
474 
475           /* Decrement loop counter */
476           k--;
477         }
478 
479         /* Store the result in the accumulator in the destination buffer. */
480         *pOut++ = sum;
481 
482         /* Increment MAC count */
483         count++;
484 
485         /* Update the inputA and inputB pointers for next MAC calculation */
486         px = pSrc1 + count;
487         py = pSrc2;
488 
489         /* Decrement loop counter */
490         blkCnt--;
491       }
492     }
493     else
494     {
495       /* If the srcBLen is not a multiple of 4,
496        * the blockSize2 loop cannot be unrolled by 4 */
497       blkCnt = (uint32_t) blockSize2;
498 
499       while (blkCnt > 0U)
500       {
501         /* Accumulator is made zero for every iteration */
502         sum = 0.0f;
503 
504         /* srcBLen number of MACS should be performed */
505         k = srcBLen;
506 
507         while (k > 0U)
508         {
509           /* Perform the multiply-accumulate */
510           sum += *px++ * *py--;
511 
512           /* Decrement loop counter */
513           k--;
514         }
515 
516         /* Store the result in the accumulator in the destination buffer. */
517         *pOut++ = sum;
518 
519         /* Increment the MAC count */
520         count++;
521 
522         /* Update the inputA and inputB pointers for next MAC calculation */
523         px = pSrc1 + count;
524         py = pSrc2;
525 
526         /* Decrement the loop counter */
527         blkCnt--;
528       }
529     }
530 
531 
532     /* --------------------------
533      * Initializations of stage3
534      * -------------------------*/
535 
536     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
537      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
538      * ....
539      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
540      * sum +=  x[srcALen-1] * y[srcBLen-1]
541      */
542 
543     /* In this stage the MAC operations are decreased by 1 for every iteration.
544        The blockSize3 variable holds the number of MAC operations performed */
545     count = srcBLen - 1U;
546 
547     /* Working pointer of inputA */
548     if (firstIndex > srcALen)
549     {
550        pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
551     }
552     else
553     {
554        pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
555     }
556     px = pSrc1;
557 
558     /* Working pointer of inputB */
559     pSrc2 = pIn2 + (srcBLen - 1U);
560     py = pSrc2;
561 
562     /* -------------------
563      * Stage3 process
564      * ------------------*/
565 
566     while (blockSize3 > 0)
567     {
568       /* Accumulator is made zero for every iteration */
569       sum = 0.0f;
570 
571 #if defined (ARM_MATH_LOOPUNROLL)
572 
573       /* Loop unrolling: Compute 4 outputs at a time */
574       k = count >> 2U;
575 
576       while (k > 0U)
577       {
578         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
579         sum += *px++ * *py--;
580 
581         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
582         sum += *px++ * *py--;
583 
584         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
585         sum += *px++ * *py--;
586 
587         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
588         sum += *px++ * *py--;
589 
590         /* Decrement loop counter */
591         k--;
592       }
593 
594       /* Loop unrolling: Compute remaining outputs */
595       k = count % 0x4U;
596 
597 #else
598 
599       /* Initialize blkCnt with number of samples */
600       k = count;
601 
602 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
603 
604       while (k > 0U)
605       {
606         /* Perform the multiply-accumulate */
607         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
608         sum += *px++ * *py--;
609 
610         /* Decrement loop counter */
611         k--;
612       }
613 
614       /* Store the result in the accumulator in the destination buffer. */
615       *pOut++ = sum;
616 
617       /* Update the inputA and inputB pointers for next MAC calculation */
618       px = ++pSrc1;
619       py = pSrc2;
620 
621       /* Decrement MAC count */
622       count--;
623 
624       /* Decrement the loop counter */
625       blockSize3--;
626     }
627 
628     /* Set status as ARM_MATH_SUCCESS */
629     status = ARM_MATH_SUCCESS;
630   }
631 
632   /* Return to application */
633   return (status);
634 
635 #else
636 /* alternate version for CM0_FAMILY */
637 
638   const float32_t *pIn1 = pSrcA;                       /* InputA pointer */
639   const float32_t *pIn2 = pSrcB;                       /* InputB pointer */
640         float32_t sum;                                 /* Accumulator */
641         uint32_t i, j;                                 /* Loop counters */
642         arm_status status;                             /* Status of Partial convolution */
643   /* Check for range of output samples to be calculated */
644   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
645   {
646     /* Set status as ARM_MATH_ARGUMENT_ERROR */
647     status = ARM_MATH_ARGUMENT_ERROR;
648   }
649   else
650   {
651     /* Loop to calculate convolution for output length number of values */
652     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
653     {
654       /* Initialize sum with zero to carry on MAC operations */
655       sum = 0.0f;
656 
657       /* Loop to perform MAC operations according to convolution equation */
658       for (j = 0U; j <= i; j++)
659       {
660         /* Check the array limitations */
661         if (((i - j) < srcBLen) && (j < srcALen))
662         {
663           /* z[i] += x[i-j] * y[j] */
664           sum += ( pIn1[j] * pIn2[i - j]);
665         }
666       }
667 
668       /* Store the output in the destination buffer */
669       pDst[i] = sum;
670     }
671 
672     /* Set status as ARM_SUCCESS */
673     status = ARM_MATH_SUCCESS;
674   }
675 
676   /* Return to application */
677   return (status);
678 
679 #endif /* defined(ARM_MATH_DSP) */
680 }
681 
682 /**
683   @} end of PartialConv group
684  */
685