1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_partial_q31.c
4  * Description:  Partial convolution of Q31 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup PartialConv
37   @{
38  */
39 
40 /**
41   @brief         Partial convolution of Q31 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written
47   @param[in]     firstIndex is the first output sample to start with
48   @param[in]     numPoints  is the number of output points to be computed
49   @return        execution status
50                    - \ref ARM_MATH_SUCCESS        : Operation successful
51                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52 
53   @remark
54                    Refer to \ref arm_conv_partial_fast_q31() for a faster but less precise implementation of this function.
55  */
56 
arm_conv_partial_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst,uint32_t firstIndex,uint32_t numPoints)57 arm_status arm_conv_partial_q31(
58   const q31_t * pSrcA,
59         uint32_t srcALen,
60   const q31_t * pSrcB,
61         uint32_t srcBLen,
62         q31_t * pDst,
63         uint32_t firstIndex,
64         uint32_t numPoints)
65 {
66 
67 #if defined(ARM_MATH_DSP)
68 
69   const q31_t *pIn1;                                   /* InputA pointer */
70   const q31_t *pIn2;                                   /* InputB pointer */
71         q31_t *pOut = pDst;                            /* Output pointer */
72   const q31_t *px;                                     /* Intermediate inputA pointer */
73   const q31_t *py;                                     /* Intermediate inputB pointer */
74   const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
75         q63_t sum;                                     /* Accumulator */
76         uint32_t j, k, count, blkCnt, check;
77         int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
78         arm_status status;                             /* Status of Partial convolution */
79 
80 #if defined (ARM_MATH_LOOPUNROLL)
81         q63_t acc0, acc1, acc2;                        /* Accumulator */
82         q31_t x0, x1, x2, c0;                          /* Temporary variables */
83 #endif
84 
85   /* Check for range of output samples to be calculated */
86   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
87   {
88     /* Set status as ARM_MATH_ARGUMENT_ERROR */
89     status = ARM_MATH_ARGUMENT_ERROR;
90   }
91   else
92   {
93     /* The algorithm implementation is based on the lengths of the inputs. */
94     /* srcB is always made to slide across srcA. */
95     /* So srcBLen is always considered as shorter or equal to srcALen */
96     if (srcALen >= srcBLen)
97     {
98       /* Initialization of inputA pointer */
99       pIn1 = pSrcA;
100 
101       /* Initialization of inputB pointer */
102       pIn2 = pSrcB;
103     }
104     else
105     {
106       /* Initialization of inputA pointer */
107       pIn1 = pSrcB;
108 
109       /* Initialization of inputB pointer */
110       pIn2 = pSrcA;
111 
112       /* srcBLen is always considered as shorter or equal to srcALen */
113       j = srcBLen;
114       srcBLen = srcALen;
115       srcALen = j;
116     }
117 
118     /* Conditions to check which loopCounter holds
119      * the first and last indices of the output samples to be calculated. */
120     check = firstIndex + numPoints;
121     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
122     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
123     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
124     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  (int32_t)numPoints) : 0;
125     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
126     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
127 
128     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
129     /* The function is internally
130      * divided into three stages according to the number of multiplications that has to be
131      * taken place between inputA samples and inputB samples. In the first stage of the
132      * algorithm, the multiplications increase by one for every iteration.
133      * In the second stage of the algorithm, srcBLen number of multiplications are done.
134      * In the third stage of the algorithm, the multiplications decrease by one
135      * for every iteration. */
136 
137     /* Set the output pointer to point to the firstIndex
138      * of the output sample to be calculated. */
139     pOut = pDst + firstIndex;
140 
141     /* --------------------------
142      * Initializations of stage1
143      * -------------------------*/
144 
145     /* sum = x[0] * y[0]
146      * sum = x[0] * y[1] + x[1] * y[0]
147      * ....
148      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
149      */
150 
151     /* In this stage the MAC operations are increased by 1 for every iteration.
152        The count variable holds the number of MAC operations performed.
153        Since the partial convolution starts from firstIndex
154        Number of Macs to be performed is firstIndex + 1 */
155     count = 1U + firstIndex;
156 
157     /* Working pointer of inputA */
158     px = pIn1;
159 
160     /* Working pointer of inputB */
161     pSrc2 = pIn2 + firstIndex;
162     py = pSrc2;
163 
164     /* ------------------------
165      * Stage1 process
166      * ----------------------*/
167 
168     /* The first stage starts here */
169     while (blockSize1 > 0)
170     {
171       /* Accumulator is made zero for every iteration */
172       sum = 0;
173 
174 #if defined (ARM_MATH_LOOPUNROLL)
175 
176       /* Loop unrolling: Compute 4 outputs at a time */
177       k = count >> 2U;
178 
179       while (k > 0U)
180       {
181         /* x[0] * y[srcBLen - 1] */
182         sum += (q63_t) *px++ * (*py--);
183 
184         /* x[1] * y[srcBLen - 2] */
185         sum += (q63_t) *px++ * (*py--);
186 
187         /* x[2] * y[srcBLen - 3] */
188         sum += (q63_t) *px++ * (*py--);
189 
190         /* x[3] * y[srcBLen - 4] */
191         sum += (q63_t) *px++ * (*py--);
192 
193         /* Decrement loop counter */
194         k--;
195       }
196 
197       /* Loop unrolling: Compute remaining outputs */
198       k = count % 0x4U;
199 
200 #else
201 
202       /* Initialize k with number of samples */
203       k = count;
204 
205 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
206 
207       while (k > 0U)
208       {
209         /* Perform the multiply-accumulate */
210         sum += (q63_t) *px++ * (*py--);
211 
212         /* Decrement loop counter */
213         k--;
214       }
215 
216       /* Store the result in the accumulator in the destination buffer. */
217       *pOut++ = (q31_t) (sum >> 31);
218 
219       /* Update the inputA and inputB pointers for next MAC calculation */
220       py = ++pSrc2;
221       px = pIn1;
222 
223       /* Increment MAC count */
224       count++;
225 
226       /* Decrement loop counter */
227       blockSize1--;
228     }
229 
230     /* --------------------------
231      * Initializations of stage2
232      * ------------------------*/
233 
234     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
235      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
236      * ....
237      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
238      */
239 
240     /* Working pointer of inputA */
241     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
242     {
243       pSrc1 = pIn1 + firstIndex - srcBLen + 1;
244     }
245     else
246     {
247       pSrc1 = pIn1;
248     }
249     px = pSrc1;
250 
251     /* Working pointer of inputB */
252     pSrc2 = pIn2 + (srcBLen - 1U);
253     py = pSrc2;
254 
255     /* count is index by which the pointer pIn1 to be incremented */
256     count = 0U;
257 
258     /* -------------------
259      * Stage2 process
260      * ------------------*/
261 
262     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
263      * So, to loop unroll over blockSize2,
264      * srcBLen should be greater than or equal to 4 */
265     if (srcBLen >= 4U)
266     {
267 #if defined (ARM_MATH_LOOPUNROLL)
268 
269       /* Loop unroll over blkCnt */
270       blkCnt = blockSize2 / 3;
271 
272       while (blkCnt > 0U)
273       {
274         /* Set all accumulators to zero */
275         acc0 = 0;
276         acc1 = 0;
277         acc2 = 0;
278 
279         /* read x[0], x[1] samples */
280         x0 = *px++;
281         x1 = *px++;
282 
283         /* Apply loop unrolling and compute 3 MACs simultaneously. */
284         k = srcBLen / 3;
285 
286         /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.
287          ** a second loop below computes MACs for the remaining 1 to 2 samples. */
288         do
289         {
290           /* Read y[srcBLen - 1] sample */
291           c0 = *(py);
292 
293           /* Read x[2] sample */
294           x2 = *(px);
295 
296           /* Perform the multiply-accumulate */
297           /* acc0 +=  x[0] * y[srcBLen - 1] */
298           acc0 += (q63_t) x0 * c0;
299           /* acc1 +=  x[1] * y[srcBLen - 1] */
300           acc1 += (q63_t) x1 * c0;
301           /* acc2 +=  x[2] * y[srcBLen - 1] */
302           acc2 += (q63_t) x2 * c0;
303 
304           /* Read y[srcBLen - 2] sample */
305           c0 = *(py - 1U);
306 
307           /* Read x[3] sample */
308           x0 = *(px + 1U);
309 
310           /* Perform the multiply-accumulate */
311           /* acc0 +=  x[1] * y[srcBLen - 2] */
312           acc0 += (q63_t) x1 * c0;
313           /* acc1 +=  x[2] * y[srcBLen - 2] */
314           acc1 += (q63_t) x2 * c0;
315           /* acc2 +=  x[3] * y[srcBLen - 2] */
316           acc2 += (q63_t) x0 * c0;
317 
318           /* Read y[srcBLen - 3] sample */
319           c0 = *(py - 2U);
320 
321           /* Read x[4] sample */
322           x1 = *(px + 2U);
323 
324           /* Perform the multiply-accumulate */
325           /* acc0 +=  x[2] * y[srcBLen - 3] */
326           acc0 += (q63_t) x2 * c0;
327           /* acc1 +=  x[3] * y[srcBLen - 2] */
328           acc1 += (q63_t) x0 * c0;
329           /* acc2 +=  x[4] * y[srcBLen - 2] */
330           acc2 += (q63_t) x1 * c0;
331 
332 
333           px += 3U;
334 
335           py -= 3U;
336 
337         } while (--k);
338 
339         /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
340          ** No loop unrolling is used. */
341         k = srcBLen - (3 * (srcBLen / 3));
342 
343         while (k > 0U)
344         {
345           /* Read y[srcBLen - 5] sample */
346           c0 = *py--;
347           /* Read x[7] sample */
348           x2 = *px++;
349 
350           /* Perform the multiply-accumulates */
351           /* acc0 +=  x[4] * y[srcBLen - 5] */
352           acc0 += (q63_t) x0 * c0;
353           /* acc1 +=  x[5] * y[srcBLen - 5] */
354           acc1 += (q63_t) x1 * c0;
355           /* acc2 +=  x[6] * y[srcBLen - 5] */
356           acc2 += (q63_t) x2 * c0;
357 
358           /* Reuse the present samples for the next MAC */
359           x0 = x1;
360           x1 = x2;
361 
362           /* Decrement the loop counter */
363           k--;
364         }
365 
366         /* Store the result in the accumulator in the destination buffer. */
367         *pOut++ = (q31_t) (acc0 >> 31);
368         *pOut++ = (q31_t) (acc1 >> 31);
369         *pOut++ = (q31_t) (acc2 >> 31);
370 
371         /* Increment the pointer pIn1 index, count by 3 */
372         count += 3U;
373 
374         /* Update the inputA and inputB pointers for next MAC calculation */
375         px = pSrc1 + count;
376         py = pSrc2;
377 
378         /* Decrement loop counter */
379         blkCnt--;
380       }
381 
382       /* Loop unrolling: Compute remaining outputs */
383       blkCnt = blockSize2 - 3 * (blockSize2 / 3);
384 
385 #else
386 
387       /* Initialize blkCnt with number of samples */
388       blkCnt = blockSize2;
389 
390 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
391 
392       while (blkCnt > 0U)
393       {
394         /* Accumulator is made zero for every iteration */
395         sum = 0;
396 
397 #if defined (ARM_MATH_LOOPUNROLL)
398 
399         /* Loop unrolling: Compute 4 outputs at a time */
400         k = srcBLen >> 2U;
401 
402         while (k > 0U)
403         {
404           /* Perform the multiply-accumulates */
405           sum += (q63_t) *px++ * (*py--);
406           sum += (q63_t) *px++ * (*py--);
407           sum += (q63_t) *px++ * (*py--);
408           sum += (q63_t) *px++ * (*py--);
409 
410           /* Decrement loop counter */
411           k--;
412         }
413 
414         /* Loop unrolling: Compute remaining outputs */
415         k = srcBLen % 0x4U;
416 
417 #else
418 
419         /* Initialize blkCnt with number of samples */
420         k = srcBLen;
421 
422 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
423 
424         while (k > 0U)
425         {
426           /* Perform the multiply-accumulate */
427           sum += (q63_t) *px++ * *py--;
428 
429           /* Decrement loop counter */
430           k--;
431         }
432 
433         /* Store the result in the accumulator in the destination buffer. */
434         *pOut++ = (q31_t) (sum >> 31);
435 
436         /* Increment MAC count */
437         count++;
438 
439         /* Update the inputA and inputB pointers for next MAC calculation */
440         px = pSrc1 + count;
441         py = pSrc2;
442 
443         /* Decrement loop counter */
444         blkCnt--;
445       }
446     }
447     else
448     {
449       /* If the srcBLen is not a multiple of 4,
450        * the blockSize2 loop cannot be unrolled by 4 */
451       blkCnt = (uint32_t) blockSize2;
452 
453       while (blkCnt > 0U)
454       {
455         /* Accumulator is made zero for every iteration */
456         sum = 0;
457 
458         /* srcBLen number of MACS should be performed */
459         k = srcBLen;
460 
461         while (k > 0U)
462         {
463           /* Perform the multiply-accumulate */
464           sum += (q63_t) *px++ * *py--;
465 
466           /* Decrement loop counter */
467           k--;
468         }
469 
470         /* Store the result in the accumulator in the destination buffer. */
471         *pOut++ = (q31_t) (sum >> 31);
472 
473         /* Increment the MAC count */
474         count++;
475 
476         /* Update the inputA and inputB pointers for next MAC calculation */
477         px = pSrc1 + count;
478         py = pSrc2;
479 
480         /* Decrement the loop counter */
481         blkCnt--;
482       }
483     }
484 
485 
486     /* --------------------------
487      * Initializations of stage3
488      * -------------------------*/
489 
490     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
491      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
492      * ....
493      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
494      * sum +=  x[srcALen-1] * y[srcBLen-1]
495      */
496 
497     /* In this stage the MAC operations are decreased by 1 for every iteration.
498        The blockSize3 variable holds the number of MAC operations performed */
499     count = srcBLen - 1U;
500 
501     /* Working pointer of inputA */
502     if (firstIndex > srcALen)
503     {
504        pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
505     }
506     else
507     {
508        pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
509     }
510     px = pSrc1;
511 
512     /* Working pointer of inputB */
513     pSrc2 = pIn2 + (srcBLen - 1U);
514     py = pSrc2;
515 
516     /* -------------------
517      * Stage3 process
518      * ------------------*/
519 
520     while (blockSize3 > 0)
521     {
522       /* Accumulator is made zero for every iteration */
523       sum = 0;
524 
525 #if defined (ARM_MATH_LOOPUNROLL)
526 
527       /* Loop unrolling: Compute 4 outputs at a time */
528       k = count >> 2U;
529 
530       while (k > 0U)
531       {
532         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
533         sum += (q63_t) *px++ * *py--;
534 
535         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
536         sum += (q63_t) *px++ * *py--;
537 
538         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
539         sum += (q63_t) *px++ * *py--;
540 
541         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
542         sum += (q63_t) *px++ * *py--;
543 
544         /* Decrement loop counter */
545         k--;
546       }
547 
548       /* Loop unrolling: Compute remaining outputs */
549       k = count % 0x4U;
550 
551 #else
552 
553       /* Initialize blkCnt with number of samples */
554       k = count;
555 
556 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
557 
558       while (k > 0U)
559       {
560         /* Perform the multiply-accumulate */
561         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
562         sum += (q63_t) *px++ * *py--;
563 
564         /* Decrement loop counter */
565         k--;
566       }
567 
568       /* Store the result in the accumulator in the destination buffer. */
569       *pOut++ = (q31_t) (sum >> 31);
570 
571       /* Update the inputA and inputB pointers for next MAC calculation */
572       px = ++pSrc1;
573       py = pSrc2;
574 
575       /* Decrement MAC count */
576       count--;
577 
578       /* Decrement the loop counter */
579       blockSize3--;
580     }
581 
582     /* Set status as ARM_MATH_SUCCESS */
583     status = ARM_MATH_SUCCESS;
584   }
585 
586   /* Return to application */
587   return (status);
588 
589 #else
590 /* alternate version for CM0_FAMILY */
591 
592   const q31_t *pIn1 = pSrcA;                           /* InputA pointer */
593   const q31_t *pIn2 = pSrcB;                           /* InputB pointer */
594         q63_t sum;                                     /* Accumulator */
595         uint32_t i, j;                                 /* Loop counters */
596         arm_status status;                             /* Status of Partial convolution */
597 
598   /* Check for range of output samples to be calculated */
599   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
600   {
601     /* Set status as ARM_MATH_ARGUMENT_ERROR */
602     status = ARM_MATH_ARGUMENT_ERROR;
603   }
604   else
605   {
606     /* Loop to calculate convolution for output length number of values */
607     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
608     {
609       /* Initialize sum with zero to carry on MAC operations */
610       sum = 0;
611 
612       /* Loop to perform MAC operations according to convolution equation */
613       for (j = 0U; j <= i; j++)
614       {
615         /* Check the array limitations */
616         if (((i - j) < srcBLen) && (j < srcALen))
617         {
618           /* z[i] += x[i-j] * y[j] */
619           sum += ((q63_t) pIn1[j] * pIn2[i - j]);
620         }
621       }
622 
623       /* Store the output in the destination buffer */
624       pDst[i] = (q31_t) (sum >> 31U);
625     }
626 
627     /* Set status as ARM_MATH_SUCCESS */
628     status = ARM_MATH_SUCCESS;
629   }
630 
631   /* Return to application */
632   return (status);
633 
634 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
635 
636 }
637 
638 /**
639   @} end of PartialConv group
640  */
641