1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_partial_fast_q31.c
4  * Description:  Fast Q31 Partial convolution
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup PartialConv
37   @{
38  */
39 
40 /**
41   @brief         Partial convolution of Q31 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written
47   @param[in]     firstIndex is the first output sample to start with
48   @param[in]     numPoints  is the number of output points to be computed
49   @return        execution status
50                    - \ref ARM_MATH_SUCCESS        : Operation successful
51                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52 
53   @remark
54                    Refer to \ref arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
55  */
56 
arm_conv_partial_fast_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst,uint32_t firstIndex,uint32_t numPoints)57 arm_status arm_conv_partial_fast_q31(
58   const q31_t * pSrcA,
59         uint32_t srcALen,
60   const q31_t * pSrcB,
61         uint32_t srcBLen,
62         q31_t * pDst,
63         uint32_t firstIndex,
64         uint32_t numPoints)
65 {
66   const q31_t *pIn1;                                   /* InputA pointer */
67   const q31_t *pIn2;                                   /* InputB pointer */
68         q31_t *pOut = pDst;                            /* Output pointer */
69   const q31_t *px;                                     /* Intermediate inputA pointer */
70   const q31_t *py;                                     /* Intermediate inputB pointer */
71   const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
72         q31_t sum;                                     /* Accumulators */
73         uint32_t j, k, count, check, blkCnt;
74         int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
75         arm_status status;                             /* Status of Partial convolution */
76 
77 #if defined (ARM_MATH_LOOPUNROLL)
78         q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
79         q31_t x0, x1, x2, x3, c0;
80 #endif
81 
82   /* Check for range of output samples to be calculated */
83   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
84   {
85     /* Set status as ARM_MATH_ARGUMENT_ERROR */
86     status = ARM_MATH_ARGUMENT_ERROR;
87   }
88   else
89   {
90     /* The algorithm implementation is based on the lengths of the inputs. */
91     /* srcB is always made to slide across srcA. */
92     /* So srcBLen is always considered as shorter or equal to srcALen */
93     if (srcALen >= srcBLen)
94     {
95       /* Initialization of inputA pointer */
96       pIn1 = pSrcA;
97 
98       /* Initialization of inputB pointer */
99       pIn2 = pSrcB;
100     }
101     else
102     {
103       /* Initialization of inputA pointer */
104       pIn1 = pSrcB;
105 
106       /* Initialization of inputB pointer */
107       pIn2 = pSrcA;
108 
109       /* srcBLen is always considered as shorter or equal to srcALen */
110       j = srcBLen;
111       srcBLen = srcALen;
112       srcALen = j;
113     }
114 
115     /* Conditions to check which loopCounter holds
116      * the first and last indices of the output samples to be calculated. */
117     check = firstIndex + numPoints;
118     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
119     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
120     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
121     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  (int32_t)numPoints) : 0;
122     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
123     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
124 
125     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
126     /* The function is internally
127      * divided into three stages according to the number of multiplications that has to be
128      * taken place between inputA samples and inputB samples. In the first stage of the
129      * algorithm, the multiplications increase by one for every iteration.
130      * In the second stage of the algorithm, srcBLen number of multiplications are done.
131      * In the third stage of the algorithm, the multiplications decrease by one
132      * for every iteration. */
133 
134     /* Set the output pointer to point to the firstIndex
135      * of the output sample to be calculated. */
136     pOut = pDst + firstIndex;
137 
138     /* --------------------------
139      * Initializations of stage1
140      * -------------------------*/
141 
142     /* sum = x[0] * y[0]
143      * sum = x[0] * y[1] + x[1] * y[0]
144      * ....
145      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
146      */
147 
148     /* In this stage the MAC operations are increased by 1 for every iteration.
149        The count variable holds the number of MAC operations performed.
150        Since the partial convolution starts from firstIndex
151        Number of Macs to be performed is firstIndex + 1 */
152     count = 1U + firstIndex;
153 
154     /* Working pointer of inputA */
155     px = pIn1;
156 
157     /* Working pointer of inputB */
158     pSrc2 = pIn2 + firstIndex;
159     py = pSrc2;
160 
161     /* ------------------------
162      * Stage1 process
163      * ----------------------*/
164 
165     /* The first stage starts here */
166     while (blockSize1 > 0)
167     {
168       /* Accumulator is made zero for every iteration */
169       sum = 0;
170 
171 #if defined (ARM_MATH_LOOPUNROLL)
172 
173       /* Loop unrolling: Compute 4 outputs at a time */
174       k = count >> 2U;
175 
176       while (k > 0U)
177       {
178         /* x[0] * y[srcBLen - 1] */
179         sum = (q31_t) ((((q63_t) sum << 32) +
180                         ((q63_t) *px++ * (*py--))) >> 32);
181 
182         /* x[1] * y[srcBLen - 2] */
183         sum = (q31_t) ((((q63_t) sum << 32) +
184                         ((q63_t) *px++ * (*py--))) >> 32);
185 
186         /* x[2] * y[srcBLen - 3] */
187         sum = (q31_t) ((((q63_t) sum << 32) +
188                         ((q63_t) *px++ * (*py--))) >> 32);
189 
190         /* x[3] * y[srcBLen - 4] */
191         sum = (q31_t) ((((q63_t) sum << 32) +
192                         ((q63_t) *px++ * (*py--))) >> 32);
193 
194         /* Decrement loop counter */
195         k--;
196       }
197 
198       /* Loop unrolling: Compute remaining outputs */
199       k = count % 0x4U;
200 
201 #else
202 
203       /* Initialize k with number of samples */
204       k = count;
205 
206 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
207 
208       while (k > 0U)
209       {
210         /* Perform the multiply-accumulate */
211         sum = (q31_t) ((((q63_t) sum << 32) +
212                         ((q63_t) *px++ * (*py--))) >> 32);
213 
214         /* Decrement loop counter */
215         k--;
216       }
217 
218       /* Store the result in the accumulator in the destination buffer. */
219       *pOut++ = sum << 1;
220 
221       /* Update the inputA and inputB pointers for next MAC calculation */
222       py = ++pSrc2;
223       px = pIn1;
224 
225       /* Increment MAC count */
226       count++;
227 
228       /* Decrement loop counter */
229       blockSize1--;
230     }
231 
232     /* --------------------------
233      * Initializations of stage2
234      * ------------------------*/
235 
236     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
237      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
238      * ....
239      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
240      */
241 
242     /* Working pointer of inputA */
243     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
244     {
245       pSrc1 = pIn1 + firstIndex - srcBLen + 1;
246     }
247     else
248     {
249       pSrc1 = pIn1;
250     }
251     px = pSrc1;
252 
253     /* Working pointer of inputB */
254     pSrc2 = pIn2 + (srcBLen - 1U);
255     py = pSrc2;
256 
257     /* count is index by which the pointer pIn1 to be incremented */
258     count = 0U;
259 
260     /* -------------------
261      * Stage2 process
262      * ------------------*/
263 
264     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
265      * So, to loop unroll over blockSize2,
266      * srcBLen should be greater than or equal to 4 */
267     if (srcBLen >= 4U)
268     {
269 #if defined (ARM_MATH_LOOPUNROLL)
270 
271     /* Loop unrolling: Compute 4 outputs at a time */
272       blkCnt = ((uint32_t) blockSize2 >> 2U);
273 
274       while (blkCnt > 0U)
275       {
276         /* Set all accumulators to zero */
277         acc0 = 0;
278         acc1 = 0;
279         acc2 = 0;
280         acc3 = 0;
281 
282         /* read x[0], x[1], x[2] samples */
283         x0 = *px++;
284         x1 = *px++;
285         x2 = *px++;
286 
287         /* Apply loop unrolling and compute 4 MACs simultaneously. */
288         k = srcBLen >> 2U;
289 
290         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
291          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
292         do
293         {
294           /* Read y[srcBLen - 1] sample */
295           c0 = *py--;
296           /* Read x[3] sample */
297           x3 = *px++;
298 
299           /* Perform the multiply-accumulate */
300           /* acc0 +=  x[0] * y[srcBLen - 1] */
301           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
302           /* acc1 +=  x[1] * y[srcBLen - 1] */
303           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
304           /* acc2 +=  x[2] * y[srcBLen - 1] */
305           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
306           /* acc3 +=  x[3] * y[srcBLen - 1] */
307           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
308 
309           /* Read y[srcBLen - 2] sample */
310           c0 = *py--;
311           /* Read x[4] sample */
312           x0 = *px++;
313 
314           /* Perform the multiply-accumulate */
315           /* acc0 +=  x[1] * y[srcBLen - 2] */
316           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
317           /* acc1 +=  x[2] * y[srcBLen - 2] */
318           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
319           /* acc2 +=  x[3] * y[srcBLen - 2] */
320           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
321           /* acc3 +=  x[4] * y[srcBLen - 2] */
322           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
323 
324           /* Read y[srcBLen - 3] sample */
325           c0 = *py--;
326           /* Read x[5] sample */
327           x1 = *px++;
328 
329           /* Perform the multiply-accumulates */
330           /* acc0 +=  x[2] * y[srcBLen - 3] */
331           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
332           /* acc1 +=  x[3] * y[srcBLen - 2] */
333           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
334           /* acc2 +=  x[4] * y[srcBLen - 2] */
335           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
336           /* acc3 +=  x[5] * y[srcBLen - 2] */
337           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
338 
339           /* Read y[srcBLen - 4] sample */
340           c0 = *py--;
341           /* Read x[6] sample */
342           x2 = *px++;
343 
344           /* Perform the multiply-accumulates */
345           /* acc0 +=  x[3] * y[srcBLen - 4] */
346           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
347           /* acc1 +=  x[4] * y[srcBLen - 4] */
348           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
349           /* acc2 +=  x[5] * y[srcBLen - 4] */
350           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
351           /* acc3 +=  x[6] * y[srcBLen - 4] */
352           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
353 
354         } while (--k);
355 
356         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
357          ** No loop unrolling is used. */
358         k = srcBLen % 0x4U;
359 
360         while (k > 0U)
361         {
362           /* Read y[srcBLen - 5] sample */
363           c0 = *py--;
364           /* Read x[7] sample */
365           x3 = *px++;
366 
367           /* Perform the multiply-accumulates */
368           /* acc0 +=  x[4] * y[srcBLen - 5] */
369           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
370           /* acc1 +=  x[5] * y[srcBLen - 5] */
371           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
372           /* acc2 +=  x[6] * y[srcBLen - 5] */
373           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
374           /* acc3 +=  x[7] * y[srcBLen - 5] */
375           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
376 
377           /* Reuse the present samples for the next MAC */
378           x0 = x1;
379           x1 = x2;
380           x2 = x3;
381 
382           /* Decrement the loop counter */
383           k--;
384         }
385 
386         /* Store the result in the accumulator in the destination buffer. */
387         *pOut++ = (q31_t) (acc0 << 1);
388         *pOut++ = (q31_t) (acc1 << 1);
389         *pOut++ = (q31_t) (acc2 << 1);
390         *pOut++ = (q31_t) (acc3 << 1);
391 
392         /* Increment the pointer pIn1 index, count by 4 */
393         count += 4U;
394 
395         /* Update the inputA and inputB pointers for next MAC calculation */
396         px = pSrc1 + count;
397         py = pSrc2;
398 
399         /* Decrement loop counter */
400         blkCnt--;
401       }
402 
403       /* Loop unrolling: Compute remaining outputs */
404       blkCnt = (uint32_t) blockSize2 % 0x4U;
405 
406 #else
407 
408       /* Initialize blkCnt with number of samples */
409       blkCnt = blockSize2;
410 
411 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
412 
413       while (blkCnt > 0U)
414       {
415         /* Accumulator is made zero for every iteration */
416         sum = 0;
417 
418 #if defined (ARM_MATH_LOOPUNROLL)
419 
420         /* Loop unrolling: Compute 4 outputs at a time */
421         k = srcBLen >> 2U;
422 
423         while (k > 0U)
424         {
425           /* Perform the multiply-accumulates */
426           sum = (q31_t) ((((q63_t) sum << 32) +
427                           ((q63_t) * px++ * (*py--))) >> 32);
428           sum = (q31_t) ((((q63_t) sum << 32) +
429                           ((q63_t) * px++ * (*py--))) >> 32);
430           sum = (q31_t) ((((q63_t) sum << 32) +
431                           ((q63_t) * px++ * (*py--))) >> 32);
432           sum = (q31_t) ((((q63_t) sum << 32) +
433                           ((q63_t) * px++ * (*py--))) >> 32);
434 
435           /* Decrement loop counter */
436           k--;
437         }
438 
439         /* Loop unrolling: Compute remaining outputs */
440         k = srcBLen % 0x4U;
441 
442 #else
443 
444         /* Initialize blkCnt with number of samples */
445         k = srcBLen;
446 
447 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
448 
449         while (k > 0U)
450         {
451           /* Perform the multiply-accumulate */
452           sum = (q31_t) ((((q63_t) sum << 32) +
453                           ((q63_t) *px++ * (*py--))) >> 32);
454 
455           /* Decrement loop counter */
456           k--;
457         }
458 
459         /* Store the result in the accumulator in the destination buffer. */
460         *pOut++ = sum << 1;
461 
462         /* Increment MAC count */
463         count++;
464 
465         /* Update the inputA and inputB pointers for next MAC calculation */
466         px = pSrc1 + count;
467         py = pSrc2;
468 
469         /* Decrement loop counter */
470         blkCnt--;
471       }
472     }
473     else
474     {
475       /* If the srcBLen is not a multiple of 4,
476        * the blockSize2 loop cannot be unrolled by 4 */
477       blkCnt = (uint32_t) blockSize2;
478 
479       while (blkCnt > 0U)
480       {
481         /* Accumulator is made zero for every iteration */
482         sum = 0;
483 
484         /* srcBLen number of MACS should be performed */
485         k = srcBLen;
486 
487         while (k > 0U)
488         {
489           /* Perform the multiply-accumulate */
490           sum = (q31_t) ((((q63_t) sum << 32) +
491                           ((q63_t) *px++ * (*py--))) >> 32);
492 
493           /* Decrement loop counter */
494           k--;
495         }
496 
497         /* Store the result in the accumulator in the destination buffer. */
498         *pOut++ = sum << 1;
499 
500         /* Increment the MAC count */
501         count++;
502 
503         /* Update the inputA and inputB pointers for next MAC calculation */
504         px = pSrc1 + count;
505         py = pSrc2;
506 
507         /* Decrement the loop counter */
508         blkCnt--;
509       }
510     }
511 
512 
513     /* --------------------------
514      * Initializations of stage3
515      * -------------------------*/
516 
517     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
518      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
519      * ....
520      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
521      * sum +=  x[srcALen-1] * y[srcBLen-1]
522      */
523 
524     /* In this stage the MAC operations are decreased by 1 for every iteration.
525        The count variable holds the number of MAC operations performed */
526     count = srcBLen - 1U;
527 
528     /* Working pointer of inputA */
529     if (firstIndex > srcALen)
530     {
531        pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
532     }
533     else
534     {
535        pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
536     }
537     px = pSrc1;
538 
539     /* Working pointer of inputB */
540     pSrc2 = pIn2 + (srcBLen - 1U);
541     py = pSrc2;
542 
543     /* -------------------
544      * Stage3 process
545      * ------------------*/
546 
547     while (blockSize3 > 0)
548     {
549       /* Accumulator is made zero for every iteration */
550       sum = 0;
551 
552 #if defined (ARM_MATH_LOOPUNROLL)
553 
554       /* Loop unrolling: Compute 4 outputs at a time */
555       k = count >> 2U;
556 
557       while (k > 0U)
558       {
559         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
560         sum = (q31_t) ((((q63_t) sum << 32) +
561                         ((q63_t) *px++ * (*py--))) >> 32);
562 
563         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
564         sum = (q31_t) ((((q63_t) sum << 32) +
565                         ((q63_t) *px++ * (*py--))) >> 32);
566 
567         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
568         sum = (q31_t) ((((q63_t) sum << 32) +
569                         ((q63_t) *px++ * (*py--))) >> 32);
570 
571         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
572         sum = (q31_t) ((((q63_t) sum << 32) +
573                         ((q63_t) *px++ * (*py--))) >> 32);
574 
575         /* Decrement loop counter */
576         k--;
577       }
578 
579       /* Loop unrolling: Compute remaining outputs */
580       k = count % 0x4U;
581 
582 #else
583 
584       /* Initialize blkCnt with number of samples */
585       k = count;
586 
587 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
588 
589       while (k > 0U)
590       {
591         /* Perform the multiply-accumulates */
592         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
593         sum = (q31_t) ((((q63_t) sum << 32) +
594                         ((q63_t) *px++ * (*py--))) >> 32);
595 
596         /* Decrement loop counter */
597         k--;
598       }
599 
600       /* Store the result in the accumulator in the destination buffer. */
601       *pOut++ = sum << 1;
602 
603       /* Update the inputA and inputB pointers for next MAC calculation */
604       px = ++pSrc1;
605       py = pSrc2;
606 
607       /* Decrement MAC count */
608       count--;
609 
610       /* Decrement the loop counter */
611       blockSize3--;
612     }
613 
614     /* Set status as ARM_MATH_SUCCESS */
615     status = ARM_MATH_SUCCESS;
616   }
617 
618   /* Return to application */
619   return (status);
620 
621 }
622 
623 /**
624   @} end of PartialConv group
625  */
626