1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_fast_opt_q15.c
4  * Description:  Fast Q15 Convolution
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q15 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1
47   @param[in]     pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
  @param[in]     pScratch2  points to scratch buffer of size min(srcALen, srcBLen)
49 
50   @par           Scaling and Overflow Behavior
51                    This fast version uses a 32-bit accumulator with 2.30 format.
52                    The accumulator maintains full precision of the intermediate multiplication results
53                    but provides only a single guard bit. There is no saturation on intermediate additions.
54                    Thus, if the accumulator overflows it wraps around and distorts the result.
55                    The input signals should be scaled down to avoid intermediate overflows.
56                    Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
                   as a maximum of min(srcALen, srcBLen) additions are carried out internally.
58                    The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
59 
60   @remark
61                    Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
62  */
63 
arm_conv_fast_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,q15_t * pScratch1,q15_t * pScratch2)64 ARM_DSP_ATTRIBUTE void arm_conv_fast_opt_q15(
65   const q15_t * pSrcA,
66         uint32_t srcALen,
67   const q15_t * pSrcB,
68         uint32_t srcBLen,
69         q15_t * pDst,
70         q15_t * pScratch1,
71         q15_t * pScratch2)
72 {
73         q31_t acc0;                                    /* Accumulators */
74   const q15_t *pIn1;                                   /* InputA pointer */
75   const q15_t *pIn2;                                   /* InputB pointer */
76         q15_t *pOut = pDst;                            /* Output pointer */
77         q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
78         q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch1 */
79   const q15_t *px;                                     /* Intermediate inputA pointer */
80         q15_t *py;                                     /* Intermediate inputB pointer */
81         uint32_t j, k, blkCnt;                         /* Loop counter */
82         uint32_t tapCnt;                               /* Loop count */
83 
84 #if defined (ARM_MATH_LOOPUNROLL)
85         q31_t acc1, acc2, acc3;                        /* Accumulators */
86         q31_t x1, x2, x3;                              /* Temporary variables to hold state and coefficient values */
87         q31_t y1, y2;                                  /* State variables */
88 #endif
89 
90 
91   /* The algorithm implementation is based on the lengths of the inputs. */
92   /* srcB is always made to slide across srcA. */
93   /* So srcBLen is always considered as shorter or equal to srcALen */
94   if (srcALen >= srcBLen)
95   {
96     /* Initialization of inputA pointer */
97     pIn1 = pSrcA;
98 
99     /* Initialization of inputB pointer */
100     pIn2 = pSrcB;
101   }
102   else
103   {
104     /* Initialization of inputA pointer */
105     pIn1 = pSrcB;
106 
107     /* Initialization of inputB pointer */
108     pIn2 = pSrcA;
109 
110     /* srcBLen is always considered as shorter or equal to srcALen */
111     j = srcBLen;
112     srcBLen = srcALen;
113     srcALen = j;
114   }
115 
116   /* Pointer to take end of scratch2 buffer */
117   pScr2 = pScratch2 + srcBLen - 1;
118 
119   /* points to smaller length sequence */
120   px = pIn2;
121 
122 #if defined (ARM_MATH_LOOPUNROLL)
123 
124   /* Loop unrolling: Compute 4 outputs at a time */
125   k = srcBLen >> 2U;
126 
127   /* Copy smaller length input sequence in reverse order into second scratch buffer */
128   while (k > 0U)
129   {
130     /* copy second buffer in reversal manner */
131     *pScr2-- = *px++;
132     *pScr2-- = *px++;
133     *pScr2-- = *px++;
134     *pScr2-- = *px++;
135 
136     /* Decrement loop counter */
137     k--;
138   }
139 
140   /* Loop unrolling: Compute remaining outputs */
141   k = srcBLen % 0x4U;
142 
143 #else
144 
145   /* Initialize k with number of samples */
146   k = srcBLen;
147 
148 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
149 
150   while (k > 0U)
151   {
152     /* copy second buffer in reversal manner for remaining samples */
153     *pScr2-- = *px++;
154 
155     /* Decrement loop counter */
156     k--;
157   }
158 
159   /* Initialze temporary scratch pointer */
160   pScr1 = pScratch1;
161 
162   /* Assuming scratch1 buffer is aligned by 32-bit */
163   /* Fill (srcBLen - 1U) zeros in scratch1 buffer */
164   arm_fill_q15(0, pScr1, (srcBLen - 1U));
165 
166   /* Update temporary scratch pointer */
167   pScr1 += (srcBLen - 1U);
168 
169   /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
170 
171   /* Copy (srcALen) samples in scratch buffer */
172   arm_copy_q15(pIn1, pScr1, srcALen);
173 
174   /* Update pointers */
175   pScr1 += srcALen;
176 
177 
178   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
179   arm_fill_q15(0, pScr1, (srcBLen - 1U));
180 
181   /* Update pointer */
182   pScr1 += (srcBLen - 1U);
183 
184   /* Temporary pointer for scratch2 */
185   py = pScratch2;
186 
187 
188   /* Initialization of pIn2 pointer */
189   pIn2 = py;
190 
191 #if defined (ARM_MATH_LOOPUNROLL)
192 
193   /* Loop unrolling: Compute 4 outputs at a time */
194   blkCnt = (srcALen + srcBLen - 1U) >> 2;
195 
196   while (blkCnt > 0)
197   {
198     /* Initialze temporary scratch pointer as scratch1 */
199     pScr1 = pScratch1;
200 
201     /* Clear Accumlators */
202     acc0 = 0;
203     acc1 = 0;
204     acc2 = 0;
205     acc3 = 0;
206 
207     /* Read two samples from scratch1 buffer */
208     x1 = read_q15x2_ia (&pScr1);
209 
210     /* Read next two samples from scratch1 buffer */
211     x2 = read_q15x2_ia (&pScr1);
212 
213     tapCnt = (srcBLen) >> 2U;
214 
215     while (tapCnt > 0U)
216     {
217 
218       /* Read four samples from smaller buffer */
219       y1 = read_q15x2_ia ((q15_t **) &pIn2);
220       y2 = read_q15x2_ia ((q15_t **) &pIn2);
221 
222       /* multiply and accumulate */
223       acc0 = __SMLAD(x1, y1, acc0);
224       acc2 = __SMLAD(x2, y1, acc2);
225 
226       /* pack input data */
227 #ifndef ARM_MATH_BIG_ENDIAN
228       x3 = __PKHBT(x2, x1, 0);
229 #else
230       x3 = __PKHBT(x1, x2, 0);
231 #endif
232 
233       /* multiply and accumulate */
234       acc1 = __SMLADX(x3, y1, acc1);
235 
236       /* Read next two samples from scratch1 buffer */
237       x1 = read_q15x2_ia (&pScr1);
238 
239       /* multiply and accumulate */
240       acc0 = __SMLAD(x2, y2, acc0);
241       acc2 = __SMLAD(x1, y2, acc2);
242 
243       /* pack input data */
244 #ifndef ARM_MATH_BIG_ENDIAN
245       x3 = __PKHBT(x1, x2, 0);
246 #else
247       x3 = __PKHBT(x2, x1, 0);
248 #endif
249 
250       acc3 = __SMLADX(x3, y1, acc3);
251       acc1 = __SMLADX(x3, y2, acc1);
252 
253       x2 = read_q15x2_ia (&pScr1);
254 
255 #ifndef ARM_MATH_BIG_ENDIAN
256       x3 = __PKHBT(x2, x1, 0);
257 #else
258       x3 = __PKHBT(x1, x2, 0);
259 #endif
260 
261       acc3 = __SMLADX(x3, y2, acc3);
262 
263       /* Decrement loop counter */
264       tapCnt--;
265     }
266 
267     /* Update scratch pointer for remaining samples of smaller length sequence */
268     pScr1 -= 4U;
269 
270     /* apply same above for remaining samples of smaller length sequence */
271     tapCnt = (srcBLen) & 3U;
272 
273     while (tapCnt > 0U)
274     {
275       /* accumulate the results */
276       acc0 += (*pScr1++ * *pIn2);
277       acc1 += (*pScr1++ * *pIn2);
278       acc2 += (*pScr1++ * *pIn2);
279       acc3 += (*pScr1++ * *pIn2++);
280 
281       pScr1 -= 3U;
282 
283       /* Decrement loop counter */
284       tapCnt--;
285     }
286 
287     blkCnt--;
288 
289     /* Store the results in the accumulators in the destination buffer. */
290 #ifndef ARM_MATH_BIG_ENDIAN
291     write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
292     write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
293 #else
294     write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
295     write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
296 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
297 
298     /* Initialization of inputB pointer */
299     pIn2 = py;
300 
301     pScratch1 += 4U;
302   }
303 
304   /* Loop unrolling: Compute remaining outputs */
305   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
306 
307 #else
308 
309   /* Initialize blkCnt with number of samples */
310   blkCnt = (srcALen + srcBLen - 1U);
311 
312 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
313 
314   /* Calculate convolution for remaining samples of Bigger length sequence */
315   while (blkCnt > 0)
316   {
317     /* Initialze temporary scratch pointer as scratch1 */
318     pScr1 = pScratch1;
319 
320     /* Clear Accumlators */
321     acc0 = 0;
322 
323     tapCnt = (srcBLen) >> 1U;
324 
325     while (tapCnt > 0U)
326     {
327 
328       /* Read next two samples from scratch1 buffer */
329       acc0 += (*pScr1++ * *pIn2++);
330       acc0 += (*pScr1++ * *pIn2++);
331 
332       /* Decrement loop counter */
333       tapCnt--;
334     }
335 
336     tapCnt = (srcBLen) & 1U;
337 
338     /* apply same above for remaining samples of smaller length sequence */
339     while (tapCnt > 0U)
340     {
341 
342       /* accumulate the results */
343       acc0 += (*pScr1++ * *pIn2++);
344 
345       /* Decrement loop counter */
346       tapCnt--;
347     }
348 
349     blkCnt--;
350 
351     /* The result is in 2.30 format.  Convert to 1.15 with saturation.
352        Then store the output in the destination buffer. */
353     *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
354 
355     /* Initialization of inputB pointer */
356     pIn2 = py;
357 
358     pScratch1 += 1U;
359   }
360 
361 }
362 
363 /**
364   @} end of Conv group
365  */
366