1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_fast_q31.c
4  * Description:  Fast Q31 Convolution
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q31 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence.
43   @param[in]     srcALen    length of the first input sequence.
44   @param[in]     pSrcB      points to the second input sequence.
45   @param[in]     srcBLen    length of the second input sequence.
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
47 
48   @par           Scaling and Overflow Behavior
49                    This function is optimized for speed at the expense of fixed-point precision and overflow protection.
50                    The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
51                    These intermediate results are accumulated in a 32-bit register in 2.30 format.
52                    Finally, the accumulator is saturated and converted to a 1.31 result.
53   @par
54                    The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
55                    In order to avoid overflows completely the input signals must be scaled down.
56                    Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
57                    as maximum of min(srcALen, srcBLen) number of additions are carried internally.
58   @remark
59                    Refer to \ref arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
60  */
61 
arm_conv_fast_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)62 void arm_conv_fast_q31(
63   const q31_t * pSrcA,
64         uint32_t srcALen,
65   const q31_t * pSrcB,
66         uint32_t srcBLen,
67         q31_t * pDst)
68 {
69   const q31_t *pIn1;                                   /* InputA pointer */
70   const q31_t *pIn2;                                   /* InputB pointer */
71         q31_t *pOut = pDst;                            /* Output pointer */
72   const q31_t *px;                                     /* Intermediate inputA pointer */
73   const q31_t *py;                                     /* Intermediate inputB pointer */
74   const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
75         q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
76         q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
77         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
78         uint32_t j, k, count, blkCnt;                  /* Loop counters */
79 
80   /* The algorithm implementation is based on the lengths of the inputs. */
81   /* srcB is always made to slide across srcA. */
82   /* So srcBLen is always considered as shorter or equal to srcALen */
83   if (srcALen >= srcBLen)
84   {
85     /* Initialization of inputA pointer */
86     pIn1 = pSrcA;
87 
88     /* Initialization of inputB pointer */
89     pIn2 = pSrcB;
90   }
91   else
92   {
93     /* Initialization of inputA pointer */
94     pIn1 = pSrcB;
95 
96     /* Initialization of inputB pointer */
97     pIn2 = pSrcA;
98 
99     /* srcBLen is always considered as shorter or equal to srcALen */
100     j = srcBLen;
101     srcBLen = srcALen;
102     srcALen = j;
103   }
104 
105   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
106   /* The function is internally
107    * divided into three stages according to the number of multiplications that has to be
108    * taken place between inputA samples and inputB samples. In the first stage of the
109    * algorithm, the multiplications increase by one for every iteration.
110    * In the second stage of the algorithm, srcBLen number of multiplications are done.
111    * In the third stage of the algorithm, the multiplications decrease by one
112    * for every iteration. */
113 
114   /* The algorithm is implemented in three stages.
115      The loop counters of each stage is initiated here. */
116   blockSize1 = srcBLen - 1U;
117   blockSize2 = srcALen - (srcBLen - 1U);
118   blockSize3 = blockSize1;
119 
120   /* --------------------------
121    * Initializations of stage1
122    * -------------------------*/
123 
124   /* sum = x[0] * y[0]
125    * sum = x[0] * y[1] + x[1] * y[0]
126    * ....
127    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
128    */
129 
130   /* In this stage the MAC operations are increased by 1 for every iteration.
131      The count variable holds the number of MAC operations performed */
132   count = 1U;
133 
134   /* Working pointer of inputA */
135   px = pIn1;
136 
137   /* Working pointer of inputB */
138   py = pIn2;
139 
140 
141   /* ------------------------
142    * Stage1 process
143    * ----------------------*/
144 
145   /* The first stage starts here */
146   while (blockSize1 > 0U)
147   {
148     /* Accumulator is made zero for every iteration */
149     sum = 0;
150 
151     /* Apply loop unrolling and compute 4 MACs simultaneously. */
152     k = count >> 2U;
153 
154     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
155      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
156     while (k > 0U)
157     {
158       /* x[0] * y[srcBLen - 1] */
159       sum = (q31_t) ((((q63_t) sum << 32) +
160                       ((q63_t) *px++ * (*py--))) >> 32);
161 
162       /* x[1] * y[srcBLen - 2] */
163       sum = (q31_t) ((((q63_t) sum << 32) +
164                       ((q63_t) *px++ * (*py--))) >> 32);
165 
166       /* x[2] * y[srcBLen - 3] */
167       sum = (q31_t) ((((q63_t) sum << 32) +
168                       ((q63_t) *px++ * (*py--))) >> 32);
169 
170       /* x[3] * y[srcBLen - 4] */
171       sum = (q31_t) ((((q63_t) sum << 32) +
172                       ((q63_t) *px++ * (*py--))) >> 32);
173 
174       /* Decrement loop counter */
175       k--;
176     }
177 
178     /* If the count is not a multiple of 4, compute any remaining MACs here.
179      ** No loop unrolling is used. */
180     k = count % 0x4U;
181 
182     while (k > 0U)
183     {
184       /* Perform the multiply-accumulate */
185       sum = (q31_t) ((((q63_t) sum << 32) +
186                       ((q63_t) *px++ * (*py--))) >> 32);
187 
188       /* Decrement loop counter */
189       k--;
190     }
191 
192     /* Store the result in the accumulator in the destination buffer. */
193     *pOut++ = sum << 1;
194 
195     /* Update the inputA and inputB pointers for next MAC calculation */
196     py = pIn2 + count;
197     px = pIn1;
198 
199     /* Increment MAC count */
200     count++;
201 
202     /* Decrement loop counter */
203     blockSize1--;
204   }
205 
206   /* --------------------------
207    * Initializations of stage2
208    * ------------------------*/
209 
210   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
211    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
212    * ....
213    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
214    */
215 
216   /* Working pointer of inputA */
217   px = pIn1;
218 
219   /* Working pointer of inputB */
220   pSrc2 = pIn2 + (srcBLen - 1U);
221   py = pSrc2;
222 
223   /* count is index by which the pointer pIn1 to be incremented */
224   count = 0U;
225 
226   /* -------------------
227    * Stage2 process
228    * ------------------*/
229 
230   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
231    * So, to loop unroll over blockSize2,
232    * srcBLen should be greater than or equal to 4 */
233   if (srcBLen >= 4U)
234   {
235     /* Loop unroll over blockSize2, by 4 */
236     blkCnt = blockSize2 >> 2U;
237 
238     while (blkCnt > 0U)
239     {
240       /* Set all accumulators to zero */
241       acc0 = 0;
242       acc1 = 0;
243       acc2 = 0;
244       acc3 = 0;
245 
246       /* read x[0], x[1], x[2] samples */
247       x0 = *px++;
248       x1 = *px++;
249       x2 = *px++;
250 
251       /* Apply loop unrolling and compute 4 MACs simultaneously. */
252       k = srcBLen >> 2U;
253 
254       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
255        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
256       do
257       {
258         /* Read y[srcBLen - 1] sample */
259         c0 = *py--;
260         /* Read x[3] sample */
261         x3 = *px++;
262 
263         /* Perform the multiply-accumulate */
264         /* acc0 +=  x[0] * y[srcBLen - 1] */
265         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
266         /* acc1 +=  x[1] * y[srcBLen - 1] */
267         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
268         /* acc2 +=  x[2] * y[srcBLen - 1] */
269         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
270         /* acc3 +=  x[3] * y[srcBLen - 1] */
271         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
272 
273 
274         /* Read y[srcBLen - 2] sample */
275         c0 = *py--;
276         /* Read x[4] sample */
277         x0 = *px++;
278 
279         /* Perform the multiply-accumulate */
280         /* acc0 +=  x[1] * y[srcBLen - 2] */
281         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
282         /* acc1 +=  x[2] * y[srcBLen - 2] */
283         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
284         /* acc2 +=  x[3] * y[srcBLen - 2] */
285         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
286         /* acc3 +=  x[4] * y[srcBLen - 2] */
287         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
288 
289 
290         /* Read y[srcBLen - 3] sample */
291         c0 = *py--;
292         /* Read x[5] sample */
293         x1 = *px++;
294 
295         /* Perform the multiply-accumulates */
296         /* acc0 +=  x[2] * y[srcBLen - 3] */
297         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
298         /* acc1 +=  x[3] * y[srcBLen - 3] */
299         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
300         /* acc2 +=  x[4] * y[srcBLen - 3] */
301         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
302         /* acc3 +=  x[5] * y[srcBLen - 3] */
303         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
304 
305 
306         /* Read y[srcBLen - 4] sample */
307         c0 = *py--;
308         /* Read x[6] sample */
309         x2 = *px++;
310 
311         /* Perform the multiply-accumulates */
312         /* acc0 +=  x[3] * y[srcBLen - 4] */
313         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
314         /* acc1 +=  x[4] * y[srcBLen - 4] */
315         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
316         /* acc2 +=  x[5] * y[srcBLen - 4] */
317         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
318         /* acc3 +=  x[6] * y[srcBLen - 4] */
319         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
320 
321 
322       } while (--k);
323 
324       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
325        ** No loop unrolling is used. */
326       k = srcBLen % 0x4U;
327 
328       while (k > 0U)
329       {
330         /* Read y[srcBLen - 5] sample */
331         c0 = *py--;
332         /* Read x[7] sample */
333         x3 = *px++;
334 
335         /* Perform the multiply-accumulates */
336         /* acc0 +=  x[4] * y[srcBLen - 5] */
337         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
338         /* acc1 +=  x[5] * y[srcBLen - 5] */
339         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
340         /* acc2 +=  x[6] * y[srcBLen - 5] */
341         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
342         /* acc3 +=  x[7] * y[srcBLen - 5] */
343         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
344 
345         /* Reuse the present samples for the next MAC */
346         x0 = x1;
347         x1 = x2;
348         x2 = x3;
349 
350         /* Decrement loop counter */
351         k--;
352       }
353 
354       /* Store the result in the accumulator in the destination buffer. */
355       *pOut++ = (q31_t) (acc0 << 1);
356       *pOut++ = (q31_t) (acc1 << 1);
357       *pOut++ = (q31_t) (acc2 << 1);
358       *pOut++ = (q31_t) (acc3 << 1);
359 
360       /* Increment the pointer pIn1 index, count by 4 */
361       count += 4U;
362 
363       /* Update the inputA and inputB pointers for next MAC calculation */
364       px = pIn1 + count;
365       py = pSrc2;
366 
367       /* Decrement loop counter */
368       blkCnt--;
369     }
370 
371     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
372      ** No loop unrolling is used. */
373     blkCnt = blockSize2 % 0x4U;
374 
375     while (blkCnt > 0U)
376     {
377       /* Accumulator is made zero for every iteration */
378       sum = 0;
379 
380       /* Apply loop unrolling and compute 4 MACs simultaneously. */
381       k = srcBLen >> 2U;
382 
383       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
384        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
385       while (k > 0U)
386       {
387         /* Perform the multiply-accumulates */
388         sum = (q31_t) ((((q63_t) sum << 32) +
389                         ((q63_t) *px++ * (*py--))) >> 32);
390         sum = (q31_t) ((((q63_t) sum << 32) +
391                         ((q63_t) *px++ * (*py--))) >> 32);
392         sum = (q31_t) ((((q63_t) sum << 32) +
393                         ((q63_t) *px++ * (*py--))) >> 32);
394         sum = (q31_t) ((((q63_t) sum << 32) +
395                         ((q63_t) *px++ * (*py--))) >> 32);
396 
397         /* Decrement loop counter */
398         k--;
399       }
400 
401       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
402        ** No loop unrolling is used. */
403       k = srcBLen % 0x4U;
404 
405       while (k > 0U)
406       {
407         /* Perform the multiply-accumulate */
408         sum = (q31_t) ((((q63_t) sum << 32) +
409                         ((q63_t) *px++ * (*py--))) >> 32);
410 
411         /* Decrement loop counter */
412         k--;
413       }
414 
415       /* Store the result in the accumulator in the destination buffer. */
416       *pOut++ = sum << 1;
417 
418       /* Increment MAC count */
419       count++;
420 
421       /* Update the inputA and inputB pointers for next MAC calculation */
422       px = pIn1 + count;
423       py = pSrc2;
424 
425       /* Decrement loop counter */
426       blkCnt--;
427     }
428   }
429   else
430   {
431     /* If the srcBLen is not a multiple of 4,
432      * the blockSize2 loop cannot be unrolled by 4 */
433     blkCnt = blockSize2;
434 
435     while (blkCnt > 0U)
436     {
437       /* Accumulator is made zero for every iteration */
438       sum = 0;
439 
440       /* srcBLen number of MACS should be performed */
441       k = srcBLen;
442 
443       while (k > 0U)
444       {
445         /* Perform the multiply-accumulate */
446         sum = (q31_t) ((((q63_t) sum << 32) +
447                         ((q63_t) *px++ * (*py--))) >> 32);
448 
449         /* Decrement loop counter */
450         k--;
451       }
452 
453       /* Store the result in the accumulator in the destination buffer. */
454       *pOut++ = sum << 1;
455 
456       /* Increment MAC count */
457       count++;
458 
459       /* Update the inputA and inputB pointers for next MAC calculation */
460       px = pIn1 + count;
461       py = pSrc2;
462 
463       /* Decrement loop counter */
464       blkCnt--;
465     }
466   }
467 
468 
469   /* --------------------------
470    * Initializations of stage3
471    * -------------------------*/
472 
473   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
474    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
475    * ....
476    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
477    * sum +=  x[srcALen-1] * y[srcBLen-1]
478    */
479 
480   /* In this stage the MAC operations are decreased by 1 for every iteration.
481      The blockSize3 variable holds the number of MAC operations performed */
482 
483   /* Working pointer of inputA */
484   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
485   px = pSrc1;
486 
487   /* Working pointer of inputB */
488   pSrc2 = pIn2 + (srcBLen - 1U);
489   py = pSrc2;
490 
491   /* -------------------
492    * Stage3 process
493    * ------------------*/
494 
495   while (blockSize3 > 0U)
496   {
497     /* Accumulator is made zero for every iteration */
498     sum = 0;
499 
500     /* Apply loop unrolling and compute 4 MACs simultaneously. */
501     k = blockSize3 >> 2U;
502 
503     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
504      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
505     while (k > 0U)
506     {
507       /* Perform the multiply-accumulate */
508       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
509       sum = (q31_t) ((((q63_t) sum << 32) +
510                       ((q63_t) *px++ * (*py--))) >> 32);
511 
512       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
513       sum = (q31_t) ((((q63_t) sum << 32) +
514                       ((q63_t) *px++ * (*py--))) >> 32);
515 
516       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
517       sum = (q31_t) ((((q63_t) sum << 32) +
518                       ((q63_t) *px++ * (*py--))) >> 32);
519 
520       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
521       sum = (q31_t) ((((q63_t) sum << 32) +
522                       ((q63_t) *px++ * (*py--))) >> 32);
523 
524       /* Decrement loop counter */
525       k--;
526     }
527 
528     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
529      ** No loop unrolling is used. */
530     k = blockSize3 % 0x4U;
531 
532     while (k > 0U)
533     {
534       /* Perform the multiply-accumulate */
535       sum = (q31_t) ((((q63_t) sum << 32) +
536                       ((q63_t) *px++ * (*py--))) >> 32);
537 
538       /* Decrement loop counter */
539       k--;
540     }
541 
542     /* Store the result in the accumulator in the destination buffer. */
543     *pOut++ = sum << 1;
544 
545     /* Update the inputA and inputB pointers for next MAC calculation */
546     px = ++pSrc1;
547     py = pSrc2;
548 
549     /* Decrement loop counter */
550     blockSize3--;
551   }
552 
553 }
554 
555 /**
556   @} end of Conv group
557  */
558