1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_opt_q15.c
4 * Description: Convolution of Q15 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q15 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47 @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48 @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
49
50 @par Scaling and Overflow Behavior
51 The function is implemented using a 64-bit internal accumulator.
52 Both inputs are in 1.15 format and multiplications yield a 2.30 result.
53 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
54 This approach provides 33 guard bits and there is no risk of overflow.
55 The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
56 @remark
57 Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
58 */
59
arm_conv_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,q15_t * pScratch1,q15_t * pScratch2)60 void arm_conv_opt_q15(
61 const q15_t * pSrcA,
62 uint32_t srcALen,
63 const q15_t * pSrcB,
64 uint32_t srcBLen,
65 q15_t * pDst,
66 q15_t * pScratch1,
67 q15_t * pScratch2)
68 {
69 q63_t acc0; /* Accumulators */
70 const q15_t *pIn1; /* InputA pointer */
71 const q15_t *pIn2; /* InputB pointer */
72 q15_t *pOut = pDst; /* Output pointer */
73 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
74 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
75 const q15_t *px; /* Intermediate inputA pointer */
76 q15_t *py; /* Intermediate inputB pointer */
77 uint32_t j, k, blkCnt; /* Loop counter */
78 uint32_t tapCnt; /* Loop count */
79
80 #if defined (ARM_MATH_LOOPUNROLL)
81 q63_t acc1, acc2, acc3; /* Accumulators */
82 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
83 q31_t y1, y2; /* State variables */
84 #endif
85
86
87 /* The algorithm implementation is based on the lengths of the inputs. */
88 /* srcB is always made to slide across srcA. */
89 /* So srcBLen is always considered as shorter or equal to srcALen */
90 if (srcALen >= srcBLen)
91 {
92 /* Initialization of inputA pointer */
93 pIn1 = pSrcA;
94
95 /* Initialization of inputB pointer */
96 pIn2 = pSrcB;
97 }
98 else
99 {
100 /* Initialization of inputA pointer */
101 pIn1 = pSrcB;
102
103 /* Initialization of inputB pointer */
104 pIn2 = pSrcA;
105
106 /* srcBLen is always considered as shorter or equal to srcALen */
107 j = srcBLen;
108 srcBLen = srcALen;
109 srcALen = j;
110 }
111
112 /* Pointer to take end of scratch2 buffer */
113 pScr2 = pScratch2 + srcBLen - 1;
114
115 /* points to smaller length sequence */
116 px = pIn2;
117
118 #if defined (ARM_MATH_LOOPUNROLL)
119
120 /* Loop unrolling: Compute 4 outputs at a time */
121 k = srcBLen >> 2U;
122
123 /* Copy smaller length input sequence in reverse order into second scratch buffer */
124 while (k > 0U)
125 {
126 /* copy second buffer in reversal manner */
127 *pScr2-- = *px++;
128 *pScr2-- = *px++;
129 *pScr2-- = *px++;
130 *pScr2-- = *px++;
131
132 /* Decrement loop counter */
133 k--;
134 }
135
136 /* Loop unrolling: Compute remaining outputs */
137 k = srcBLen % 0x4U;
138
139 #else
140
141 /* Initialize k with number of samples */
142 k = srcBLen;
143
144 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
145
146 while (k > 0U)
147 {
148 /* copy second buffer in reversal manner for remaining samples */
149 *pScr2-- = *px++;
150
151 /* Decrement loop counter */
152 k--;
153 }
154
155 /* Initialze temporary scratch pointer */
156 pScr1 = pScratch1;
157
158 /* Assuming scratch1 buffer is aligned by 32-bit */
159 /* Fill (srcBLen - 1U) zeros in scratch1 buffer */
160 arm_fill_q15(0, pScr1, (srcBLen - 1U));
161
162 /* Update temporary scratch pointer */
163 pScr1 += (srcBLen - 1U);
164
165 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
166
167 /* Copy (srcALen) samples in scratch buffer */
168 arm_copy_q15(pIn1, pScr1, srcALen);
169
170 /* Update pointers */
171 pScr1 += srcALen;
172
173
174 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
175 arm_fill_q15(0, pScr1, (srcBLen - 1U));
176
177 /* Update pointer */
178 pScr1 += (srcBLen - 1U);
179
180 /* Temporary pointer for scratch2 */
181 py = pScratch2;
182
183
184 /* Initialization of pIn2 pointer */
185 pIn2 = py;
186
187 #if defined (ARM_MATH_LOOPUNROLL)
188
189 /* Loop unrolling: Compute 4 outputs at a time */
190 blkCnt = (srcALen + srcBLen - 1U) >> 2;
191
192 while (blkCnt > 0)
193 {
194 /* Initialze temporary scratch pointer as scratch1 */
195 pScr1 = pScratch1;
196
197 /* Clear Accumlators */
198 acc0 = 0;
199 acc1 = 0;
200 acc2 = 0;
201 acc3 = 0;
202
203 /* Read two samples from scratch1 buffer */
204 x1 = read_q15x2_ia (&pScr1);
205
206 /* Read next two samples from scratch1 buffer */
207 x2 = read_q15x2_ia (&pScr1);
208
209 tapCnt = (srcBLen) >> 2U;
210
211 while (tapCnt > 0U)
212 {
213
214 /* Read four samples from smaller buffer */
215 y1 = read_q15x2_ia ((q15_t **) &pIn2);
216 y2 = read_q15x2_ia ((q15_t **) &pIn2);
217
218 /* multiply and accumulate */
219 acc0 = __SMLALD(x1, y1, acc0);
220 acc2 = __SMLALD(x2, y1, acc2);
221
222 /* pack input data */
223 #ifndef ARM_MATH_BIG_ENDIAN
224 x3 = __PKHBT(x2, x1, 0);
225 #else
226 x3 = __PKHBT(x1, x2, 0);
227 #endif
228
229 /* multiply and accumulate */
230 acc1 = __SMLALDX(x3, y1, acc1);
231
232 /* Read next two samples from scratch1 buffer */
233 x1 = read_q15x2_ia (&pScr1);
234
235 /* multiply and accumulate */
236 acc0 = __SMLALD(x2, y2, acc0);
237 acc2 = __SMLALD(x1, y2, acc2);
238
239 /* pack input data */
240 #ifndef ARM_MATH_BIG_ENDIAN
241 x3 = __PKHBT(x1, x2, 0);
242 #else
243 x3 = __PKHBT(x2, x1, 0);
244 #endif
245
246 acc3 = __SMLALDX(x3, y1, acc3);
247 acc1 = __SMLALDX(x3, y2, acc1);
248
249 x2 = read_q15x2_ia (&pScr1);
250
251 #ifndef ARM_MATH_BIG_ENDIAN
252 x3 = __PKHBT(x2, x1, 0);
253 #else
254 x3 = __PKHBT(x1, x2, 0);
255 #endif
256
257 acc3 = __SMLALDX(x3, y2, acc3);
258
259 /* Decrement loop counter */
260 tapCnt--;
261 }
262
263 /* Update scratch pointer for remaining samples of smaller length sequence */
264 pScr1 -= 4U;
265
266 /* apply same above for remaining samples of smaller length sequence */
267 tapCnt = (srcBLen) & 3U;
268
269 while (tapCnt > 0U)
270 {
271 /* accumulate the results */
272 acc0 += (*pScr1++ * *pIn2);
273 acc1 += (*pScr1++ * *pIn2);
274 acc2 += (*pScr1++ * *pIn2);
275 acc3 += (*pScr1++ * *pIn2++);
276
277 pScr1 -= 3U;
278
279 /* Decrement loop counter */
280 tapCnt--;
281 }
282
283 blkCnt--;
284
285 /* Store the results in the accumulators in the destination buffer. */
286 #ifndef ARM_MATH_BIG_ENDIAN
287 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
288 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
289 #else
290 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
291 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
292 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
293
294 /* Initialization of inputB pointer */
295 pIn2 = py;
296
297 pScratch1 += 4U;
298 }
299
300 /* Loop unrolling: Compute remaining outputs */
301 blkCnt = (srcALen + srcBLen - 1U) & 0x3;
302
303 #else
304
305 /* Initialize blkCnt with number of samples */
306 blkCnt = (srcALen + srcBLen - 1U);
307
308 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
309
310 /* Calculate convolution for remaining samples of Bigger length sequence */
311 while (blkCnt > 0)
312 {
313 /* Initialze temporary scratch pointer as scratch1 */
314 pScr1 = pScratch1;
315
316 /* Clear Accumlators */
317 acc0 = 0;
318
319 tapCnt = (srcBLen) >> 1U;
320
321 while (tapCnt > 0U)
322 {
323
324 /* Read next two samples from scratch1 buffer */
325 acc0 += (*pScr1++ * *pIn2++);
326 acc0 += (*pScr1++ * *pIn2++);
327
328 /* Decrement loop counter */
329 tapCnt--;
330 }
331
332 tapCnt = (srcBLen) & 1U;
333
334 /* apply same above for remaining samples of smaller length sequence */
335 while (tapCnt > 0U)
336 {
337
338 /* accumulate the results */
339 acc0 += (*pScr1++ * *pIn2++);
340
341 /* Decrement loop counter */
342 tapCnt--;
343 }
344
345 blkCnt--;
346
347 /* The result is in 2.30 format. Convert to 1.15 with saturation.
348 Then store the output in the destination buffer. */
349 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
350
351 /* Initialization of inputB pointer */
352 pIn2 = py;
353
354 pScratch1 += 1U;
355 }
356
357 }
358
359 /**
360 @} end of Conv group
361 */
362