1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_opt_q15.c
4  * Description:  Convolution of Q15 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q15 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
47   @param[in]     pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48   @param[in]     pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
49 
50   @par           Scaling and Overflow Behavior
51                    The function is implemented using a 64-bit internal accumulator.
52                    Both inputs are in 1.15 format and multiplications yield a 2.30 result.
53                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
54                    This approach provides 33 guard bits and there is no risk of overflow.
55                    The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
56   @remark
57                    Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
58  */
59 
void arm_conv_opt_q15(
  const q15_t * pSrcA,
        uint32_t srcALen,
  const q15_t * pSrcB,
        uint32_t srcBLen,
        q15_t * pDst,
        q15_t * pScratch1,
        q15_t * pScratch2)
{
  /*
   * Outline:
   *   1. Treat the longer input as the "signal" and the shorter one as the
   *      "taps" (lengths are swapped locally if needed).
   *   2. Store the taps in reverse order in scratch2.
   *   3. Store the signal in scratch1 with (taps - 1) zeros before and after it.
   *   4. Each output sample is the dot product of scratch2 with a window of
   *      scratch1; the window advances by one sample per output.
   *
   * Nothing is written to pDst or to the scratch buffers when either input
   * length is zero.
   */
        q63_t acc0;                                    /* Accumulator */
  const q15_t *pIn1;                                   /* Longer input sequence */
  const q15_t *pIn2;                                   /* Shorter input sequence */
        q15_t *pOut = pDst;                            /* Output pointer */
        q15_t *pScr1;                                  /* Working pointer into scratch1 */
        q15_t *pScr2;                                  /* Working pointer into scratch2 */
        q15_t *pWin = pScratch1;                       /* Start of the scratch1 window for the current output(s) */
        uint32_t k, blkCnt;                            /* Loop counters */
        uint32_t tapCnt;                               /* Tap loop counter */
        uint32_t outLen;                               /* Number of output samples */

#if defined (ARM_MATH_LOOPUNROLL)
        q63_t acc1, acc2, acc3;                        /* Accumulators */
        q31_t x1, x2, x3;                              /* Packed pairs of scratch1 samples */
        q31_t y1, y2;                                  /* Packed pairs of reversed taps */
#endif

  /* An empty input has no output. Without this guard (srcBLen - 1U) below
     would wrap around and the fill calls would be asked to write ~4G samples. */
  if ((srcALen == 0U) || (srcBLen == 0U))
  {
    return;
  }

  /* The shorter sequence always slides across the longer one, so from here on
     srcBLen <= srcALen holds. */
  if (srcALen >= srcBLen)
  {
    pIn1 = pSrcA;
    pIn2 = pSrcB;
  }
  else
  {
    pIn1 = pSrcB;
    pIn2 = pSrcA;

    /* Swap the lengths to match the swapped pointers */
    k = srcBLen;
    srcBLen = srcALen;
    srcALen = k;
  }

  outLen = srcALen + srcBLen - 1U;

  /* Copy the shorter sequence into scratch2 in reverse order.
     Start one past the last slot and pre-decrement so that the pointer never
     moves below the start of the buffer. */
  pScr2 = pScratch2 + srcBLen;

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: copy 4 samples at a time */
  k = srcBLen >> 2U;

  while (k > 0U)
  {
    *--pScr2 = *pIn2++;
    *--pScr2 = *pIn2++;
    *--pScr2 = *pIn2++;
    *--pScr2 = *pIn2++;

    /* Decrement loop counter */
    k--;
  }

  /* Loop unrolling: copy remaining samples */
  k = srcBLen & 0x3U;

#else

  /* Initialize k with number of samples */
  k = srcBLen;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (k > 0U)
  {
    *--pScr2 = *pIn2++;

    /* Decrement loop counter */
    k--;
  }

  /* Build scratch1: (srcBLen - 1) zeros, the longer sequence, (srcBLen - 1) zeros. */
  /* Assuming scratch1 buffer is aligned by 32-bit */
  pScr1 = pScratch1;
  arm_fill_q15(0, pScr1, (srcBLen - 1U));
  pScr1 += (srcBLen - 1U);

  arm_copy_q15(pIn1, pScr1, srcALen);
  pScr1 += srcALen;

  arm_fill_q15(0, pScr1, (srcBLen - 1U));

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = outLen >> 2U;

  while (blkCnt > 0U)
  {
    /* Restart at the current window of scratch1 and at the first reversed tap */
    pScr1 = pWin;
    pScr2 = pScratch2;

    /* Clear Accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Read two samples from scratch1 buffer */
    x1 = read_q15x2_ia (&pScr1);

    /* Read next two samples from scratch1 buffer */
    x2 = read_q15x2_ia (&pScr1);

    tapCnt = srcBLen >> 2U;

    /* NOTE(review): when srcBLen and the output length are both multiples of 4
       (e.g. srcALen = 5, srcBLen = 4), the last paired read of x2 in the final
       block touches one sample past the documented scratch1 size. The value is
       never used in a result; confirm the caller's buffer tolerates the read. */
    while (tapCnt > 0U)
    {

      /* Read four samples from smaller buffer */
      y1 = read_q15x2_ia (&pScr2);
      y2 = read_q15x2_ia (&pScr2);

      /* multiply and accumulate */
      acc0 = __SMLALD(x1, y1, acc0);
      acc2 = __SMLALD(x2, y1, acc2);

      /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      /* multiply and accumulate */
      acc1 = __SMLALDX(x3, y1, acc1);

      /* Read next two samples from scratch1 buffer */
      x1 = read_q15x2_ia (&pScr1);

      /* multiply and accumulate */
      acc0 = __SMLALD(x2, y2, acc0);
      acc2 = __SMLALD(x1, y2, acc2);

      /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);
      acc1 = __SMLALDX(x3, y2, acc1);

      x2 = read_q15x2_ia (&pScr1);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);

      /* Decrement loop counter */
      tapCnt--;
    }

    /* x1/x2 were pre-loaded four samples ahead; step back so the scalar loop
       below starts at the sample that pairs with the next tap. */
    pScr1 -= 4U;

    /* Handle the remaining (srcBLen % 4) taps one at a time */
    tapCnt = srcBLen & 3U;

    while (tapCnt > 0U)
    {
      /* One tap contributes to four consecutive outputs */
      acc0 += (*pScr1++ * *pScr2);
      acc1 += (*pScr1++ * *pScr2);
      acc2 += (*pScr1++ * *pScr2);
      acc3 += (*pScr1++ * *pScr2++);

      /* Net advance of one sample per tap */
      pScr1 -= 3U;

      /* Decrement loop counter */
      tapCnt--;
    }

    /* Drop the low 15 bits, saturate to 16 bits and store two results per write. */
#ifndef ARM_MATH_BIG_ENDIAN
    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
#else
    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* Slide the window past the four outputs just produced */
    pWin += 4U;

    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = outLen & 0x3U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = outLen;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* Compute the remaining outputs one at a time */
  while (blkCnt > 0U)
  {
    /* Restart at the current window of scratch1 and at the first reversed tap */
    pScr1 = pWin;
    pScr2 = pScratch2;

    /* Clear Accumulator */
    acc0 = 0;

    /* Two taps per iteration */
    tapCnt = srcBLen >> 1U;

    while (tapCnt > 0U)
    {
      acc0 += (*pScr1++ * *pScr2++);
      acc0 += (*pScr1++ * *pScr2++);

      /* Decrement loop counter */
      tapCnt--;
    }

    /* Odd tap count: one tap is left over */
    if ((srcBLen & 1U) != 0U)
    {
      acc0 += (*pScr1++ * *pScr2++);
    }

    /* Drop the low 15 bits of the accumulator and saturate to 16 bits,
       then store the output in the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));

    /* Slide the window by one sample for the next output */
    pWin += 1U;

    blkCnt--;
  }

}
358 
359 /**
360   @} end of Conv group
361  */
362