1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_opt_q7.c
4  * Description:  Convolution of Q7 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q7 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
47   @param[in]     pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48   @param[in]     pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49 
50   @par           Scaling and Overflow Behavior
51                    The function is implemented using a 32-bit internal accumulator.
52                    Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
53                    The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
54                    This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
55                    The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
56  */
57 
arm_conv_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,q15_t * pScratch1,q15_t * pScratch2)58 ARM_DSP_ATTRIBUTE void arm_conv_opt_q7(
59   const q7_t * pSrcA,
60         uint32_t srcALen,
61   const q7_t * pSrcB,
62         uint32_t srcBLen,
63         q7_t * pDst,
64         q15_t * pScratch1,
65         q15_t * pScratch2)
66 {
67         q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch */
68         q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch */
69         q15_t x4;                                      /* Temporary input variable */
70         q15_t *py;                                     /* Temporary input2 pointer */
71         q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
72   const q7_t *pIn1, *pIn2;                             /* InputA and inputB pointer */
73         uint32_t j, k, blkCnt, tapCnt;                 /* Loop counter */
74         q31_t x1, x2, x3, y1;                          /* Temporary input variables */
75   const q7_t *px;                                      /* Temporary input1 pointer */
76         q7_t *pOut = pDst;                             /* Output pointer */
77         q7_t out0, out1, out2, out3;                   /* Temporary variables */
78 
79   /* The algorithm implementation is based on the lengths of the inputs. */
80   /* srcB is always made to slide across srcA. */
81   /* So srcBLen is always considered as shorter or equal to srcALen */
82   if (srcALen >= srcBLen)
83   {
84     /* Initialization of inputA pointer */
85     pIn1 = pSrcA;
86 
87     /* Initialization of inputB pointer */
88     pIn2 = pSrcB;
89   }
90   else
91   {
92     /* Initialization of inputA pointer */
93     pIn1 = pSrcB;
94 
95     /* Initialization of inputB pointer */
96     pIn2 = pSrcA;
97 
98     /* srcBLen is always considered as shorter or equal to srcALen */
99     j = srcBLen;
100     srcBLen = srcALen;
101     srcALen = j;
102   }
103 
104   /* points to smaller length sequence */
105   px = pIn2 + srcBLen - 1;
106 
107   /* Apply loop unrolling and do 4 Copies simultaneously. */
108   k = srcBLen >> 2U;
109 
110   /* First part of the processing with loop unrolling copies 4 data points at a time.
111    ** a second loop below copies for the remaining 1 to 3 samples. */
112   while (k > 0U)
113   {
114     /* copy second buffer in reversal manner */
115     x4 = (q15_t) *px--;
116     *pScr2++ = x4;
117     x4 = (q15_t) *px--;
118     *pScr2++ = x4;
119     x4 = (q15_t) *px--;
120     *pScr2++ = x4;
121     x4 = (q15_t) *px--;
122     *pScr2++ = x4;
123 
124     /* Decrement loop counter */
125     k--;
126   }
127 
128   /* If the count is not a multiple of 4, copy remaining samples here.
129    ** No loop unrolling is used. */
130   k = srcBLen % 0x4U;
131 
132   while (k > 0U)
133   {
134     /* copy second buffer in reversal manner for remaining samples */
135     x4 = (q15_t) *px--;
136     *pScr2++ = x4;
137 
138     /* Decrement loop counter */
139     k--;
140   }
141 
142   /* Fill (srcBLen - 1U) zeros in scratch buffer */
143   arm_fill_q15(0, pScr1, (srcBLen - 1U));
144 
145   /* Update temporary scratch pointer */
146   pScr1 += (srcBLen - 1U);
147 
148   /* Copy (srcALen) samples in scratch buffer */
149   /* Apply loop unrolling and do 4 Copies simultaneously. */
150   k = srcALen >> 2U;
151 
152   /* First part of the processing with loop unrolling copies 4 data points at a time.
153    ** a second loop below copies for the remaining 1 to 3 samples. */
154   while (k > 0U)
155   {
156     /* copy second buffer in reversal manner */
157     x4 = (q15_t) *pIn1++;
158     *pScr1++ = x4;
159     x4 = (q15_t) *pIn1++;
160     *pScr1++ = x4;
161     x4 = (q15_t) *pIn1++;
162     *pScr1++ = x4;
163     x4 = (q15_t) *pIn1++;
164     *pScr1++ = x4;
165 
166     /* Decrement loop counter */
167     k--;
168   }
169 
170   /* If the count is not a multiple of 4, copy remaining samples here.
171    ** No loop unrolling is used. */
172   k = srcALen % 0x4U;
173 
174   while (k > 0U)
175   {
176     /* copy second buffer in reversal manner for remaining samples */
177     x4 = (q15_t) * pIn1++;
178     *pScr1++ = x4;
179 
180     /* Decrement the loop counter */
181     k--;
182   }
183 
184   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
185   arm_fill_q15(0, pScr1, (srcBLen - 1U));
186 
187   /* Update pointer */
188   pScr1 += (srcBLen - 1U);
189 
190   /* Temporary pointer for scratch2 */
191   py = pScratch2;
192 
193   /* Initialization of pIn2 pointer */
194   pIn2 = (q7_t *) py;
195 
196   pScr2 = py;
197 
198   /* Actual convolution process starts here */
199   blkCnt = (srcALen + srcBLen - 1U) >> 2U;
200 
201   while (blkCnt > 0)
202   {
203     /* Initialze temporary scratch pointer as scratch1 */
204     pScr1 = pScratch1;
205 
206     /* Clear Accumlators */
207     acc0 = 0;
208     acc1 = 0;
209     acc2 = 0;
210     acc3 = 0;
211 
212     /* Read two samples from scratch1 buffer */
213     x1 = read_q15x2_ia (&pScr1);
214 
215     /* Read next two samples from scratch1 buffer */
216     x2 = read_q15x2_ia (&pScr1);
217 
218     tapCnt = (srcBLen) >> 2U;
219 
220     while (tapCnt > 0U)
221     {
222       /* Read four samples from smaller buffer */
223       y1 = read_q15x2_ia (&pScr2);
224 
225       /* multiply and accumulate */
226       acc0 = __SMLAD(x1, y1, acc0);
227       acc2 = __SMLAD(x2, y1, acc2);
228 
229       /* pack input data */
230 #ifndef ARM_MATH_BIG_ENDIAN
231       x3 = __PKHBT(x2, x1, 0);
232 #else
233       x3 = __PKHBT(x1, x2, 0);
234 #endif
235 
236       /* multiply and accumulate */
237       acc1 = __SMLADX(x3, y1, acc1);
238 
239       /* Read next two samples from scratch1 buffer */
240       x1 = read_q15x2_ia (&pScr1);
241 
242       /* pack input data */
243 #ifndef ARM_MATH_BIG_ENDIAN
244       x3 = __PKHBT(x1, x2, 0);
245 #else
246       x3 = __PKHBT(x2, x1, 0);
247 #endif
248 
249       acc3 = __SMLADX(x3, y1, acc3);
250 
251       /* Read four samples from smaller buffer */
252       y1 = read_q15x2_ia (&pScr2);
253 
254       acc0 = __SMLAD(x2, y1, acc0);
255 
256       acc2 = __SMLAD(x1, y1, acc2);
257 
258       acc1 = __SMLADX(x3, y1, acc1);
259 
260       x2 = read_q15x2_ia (&pScr1);
261 
262 #ifndef ARM_MATH_BIG_ENDIAN
263       x3 = __PKHBT(x2, x1, 0);
264 #else
265       x3 = __PKHBT(x1, x2, 0);
266 #endif
267 
268       acc3 = __SMLADX(x3, y1, acc3);
269 
270       /* Decrement loop counter */
271       tapCnt--;
272     }
273 
274     /* Update scratch pointer for remaining samples of smaller length sequence */
275     pScr1 -= 4U;
276 
277     /* apply same above for remaining samples of smaller length sequence */
278     tapCnt = (srcBLen) & 3U;
279 
280     while (tapCnt > 0U)
281     {
282       /* accumulate the results */
283       acc0 += (*pScr1++ * *pScr2);
284       acc1 += (*pScr1++ * *pScr2);
285       acc2 += (*pScr1++ * *pScr2);
286       acc3 += (*pScr1++ * *pScr2++);
287 
288       pScr1 -= 3U;
289 
290       /* Decrement loop counter */
291       tapCnt--;
292     }
293 
294     blkCnt--;
295 
296     /* Store the result in the accumulator in the destination buffer. */
297     out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
298     out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
299     out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
300     out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
301 
302     write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
303 
304     /* Initialization of inputB pointer */
305     pScr2 = py;
306 
307     pScratch1 += 4U;
308   }
309 
310   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
311 
312   /* Calculate convolution for remaining samples of Bigger length sequence */
313   while (blkCnt > 0)
314   {
315     /* Initialze temporary scratch pointer as scratch1 */
316     pScr1 = pScratch1;
317 
318     /* Clear Accumlators */
319     acc0 = 0;
320 
321     tapCnt = (srcBLen) >> 1U;
322 
323     while (tapCnt > 0U)
324     {
325       acc0 += (*pScr1++ * *pScr2++);
326       acc0 += (*pScr1++ * *pScr2++);
327 
328       /* Decrement loop counter */
329       tapCnt--;
330     }
331 
332     tapCnt = (srcBLen) & 1U;
333 
334     /* apply same above for remaining samples of smaller length sequence */
335     while (tapCnt > 0U)
336     {
337       /* accumulate the results */
338       acc0 += (*pScr1++ * *pScr2++);
339 
340       /* Decrement loop counter */
341       tapCnt--;
342     }
343 
344     blkCnt--;
345 
346     /* Store the result in the accumulator in the destination buffer. */
347     *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
348 
349     /* Initialization of inputB pointer */
350     pScr2 = py;
351 
352     pScratch1 += 1U;
353   }
354 
355 }
356 
357 /**
358   @} end of Conv group
359  */
360