1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_fast_opt_q15.c
4 * Description: Fast Q15 Convolution
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q15 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1
47 @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
48 @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen)
49
50 @par Scaling and Overflow Behavior
51 This fast version uses a 32-bit accumulator with 2.30 format.
52 The accumulator maintains full precision of the intermediate multiplication results
53 but provides only a single guard bit. There is no saturation on intermediate additions.
54 Thus, if the accumulator overflows it wraps around and distorts the result.
55 The input signals should be scaled down to avoid intermediate overflows.
56 Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to the base 2) to avoid overflows,
57 as at most min(srcALen, srcBLen) additions are carried out internally.
58 The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
59
60 @remark
61 Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
62 */
63
arm_conv_fast_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,q15_t * pScratch1,q15_t * pScratch2)64 ARM_DSP_ATTRIBUTE void arm_conv_fast_opt_q15(
65 const q15_t * pSrcA,
66 uint32_t srcALen,
67 const q15_t * pSrcB,
68 uint32_t srcBLen,
69 q15_t * pDst,
70 q15_t * pScratch1,
71 q15_t * pScratch2)
72 {
73 q31_t acc0; /* Accumulators */
74 const q15_t *pIn1; /* InputA pointer */
75 const q15_t *pIn2; /* InputB pointer */
76 q15_t *pOut = pDst; /* Output pointer */
77 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
78 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
79 const q15_t *px; /* Intermediate inputA pointer */
80 q15_t *py; /* Intermediate inputB pointer */
81 uint32_t j, k, blkCnt; /* Loop counter */
82 uint32_t tapCnt; /* Loop count */
83
84 #if defined (ARM_MATH_LOOPUNROLL)
85 q31_t acc1, acc2, acc3; /* Accumulators */
86 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
87 q31_t y1, y2; /* State variables */
88 #endif
89
90
91 /* The algorithm implementation is based on the lengths of the inputs. */
92 /* srcB is always made to slide across srcA. */
93 /* So srcBLen is always considered as shorter or equal to srcALen */
94 if (srcALen >= srcBLen)
95 {
96 /* Initialization of inputA pointer */
97 pIn1 = pSrcA;
98
99 /* Initialization of inputB pointer */
100 pIn2 = pSrcB;
101 }
102 else
103 {
104 /* Initialization of inputA pointer */
105 pIn1 = pSrcB;
106
107 /* Initialization of inputB pointer */
108 pIn2 = pSrcA;
109
110 /* srcBLen is always considered as shorter or equal to srcALen */
111 j = srcBLen;
112 srcBLen = srcALen;
113 srcALen = j;
114 }
115
116 /* Pointer to take end of scratch2 buffer */
117 pScr2 = pScratch2 + srcBLen - 1;
118
119 /* points to smaller length sequence */
120 px = pIn2;
121
122 #if defined (ARM_MATH_LOOPUNROLL)
123
124 /* Loop unrolling: Compute 4 outputs at a time */
125 k = srcBLen >> 2U;
126
127 /* Copy smaller length input sequence in reverse order into second scratch buffer */
128 while (k > 0U)
129 {
130 /* copy second buffer in reversal manner */
131 *pScr2-- = *px++;
132 *pScr2-- = *px++;
133 *pScr2-- = *px++;
134 *pScr2-- = *px++;
135
136 /* Decrement loop counter */
137 k--;
138 }
139
140 /* Loop unrolling: Compute remaining outputs */
141 k = srcBLen % 0x4U;
142
143 #else
144
145 /* Initialize k with number of samples */
146 k = srcBLen;
147
148 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
149
150 while (k > 0U)
151 {
152 /* copy second buffer in reversal manner for remaining samples */
153 *pScr2-- = *px++;
154
155 /* Decrement loop counter */
156 k--;
157 }
158
159 /* Initialze temporary scratch pointer */
160 pScr1 = pScratch1;
161
162 /* Assuming scratch1 buffer is aligned by 32-bit */
163 /* Fill (srcBLen - 1U) zeros in scratch1 buffer */
164 arm_fill_q15(0, pScr1, (srcBLen - 1U));
165
166 /* Update temporary scratch pointer */
167 pScr1 += (srcBLen - 1U);
168
169 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
170
171 /* Copy (srcALen) samples in scratch buffer */
172 arm_copy_q15(pIn1, pScr1, srcALen);
173
174 /* Update pointers */
175 pScr1 += srcALen;
176
177
178 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
179 arm_fill_q15(0, pScr1, (srcBLen - 1U));
180
181 /* Update pointer */
182 pScr1 += (srcBLen - 1U);
183
184 /* Temporary pointer for scratch2 */
185 py = pScratch2;
186
187
188 /* Initialization of pIn2 pointer */
189 pIn2 = py;
190
191 #if defined (ARM_MATH_LOOPUNROLL)
192
193 /* Loop unrolling: Compute 4 outputs at a time */
194 blkCnt = (srcALen + srcBLen - 1U) >> 2;
195
196 while (blkCnt > 0)
197 {
198 /* Initialze temporary scratch pointer as scratch1 */
199 pScr1 = pScratch1;
200
201 /* Clear Accumlators */
202 acc0 = 0;
203 acc1 = 0;
204 acc2 = 0;
205 acc3 = 0;
206
207 /* Read two samples from scratch1 buffer */
208 x1 = read_q15x2_ia (&pScr1);
209
210 /* Read next two samples from scratch1 buffer */
211 x2 = read_q15x2_ia (&pScr1);
212
213 tapCnt = (srcBLen) >> 2U;
214
215 while (tapCnt > 0U)
216 {
217
218 /* Read four samples from smaller buffer */
219 y1 = read_q15x2_ia ((q15_t **) &pIn2);
220 y2 = read_q15x2_ia ((q15_t **) &pIn2);
221
222 /* multiply and accumulate */
223 acc0 = __SMLAD(x1, y1, acc0);
224 acc2 = __SMLAD(x2, y1, acc2);
225
226 /* pack input data */
227 #ifndef ARM_MATH_BIG_ENDIAN
228 x3 = __PKHBT(x2, x1, 0);
229 #else
230 x3 = __PKHBT(x1, x2, 0);
231 #endif
232
233 /* multiply and accumulate */
234 acc1 = __SMLADX(x3, y1, acc1);
235
236 /* Read next two samples from scratch1 buffer */
237 x1 = read_q15x2_ia (&pScr1);
238
239 /* multiply and accumulate */
240 acc0 = __SMLAD(x2, y2, acc0);
241 acc2 = __SMLAD(x1, y2, acc2);
242
243 /* pack input data */
244 #ifndef ARM_MATH_BIG_ENDIAN
245 x3 = __PKHBT(x1, x2, 0);
246 #else
247 x3 = __PKHBT(x2, x1, 0);
248 #endif
249
250 acc3 = __SMLADX(x3, y1, acc3);
251 acc1 = __SMLADX(x3, y2, acc1);
252
253 x2 = read_q15x2_ia (&pScr1);
254
255 #ifndef ARM_MATH_BIG_ENDIAN
256 x3 = __PKHBT(x2, x1, 0);
257 #else
258 x3 = __PKHBT(x1, x2, 0);
259 #endif
260
261 acc3 = __SMLADX(x3, y2, acc3);
262
263 /* Decrement loop counter */
264 tapCnt--;
265 }
266
267 /* Update scratch pointer for remaining samples of smaller length sequence */
268 pScr1 -= 4U;
269
270 /* apply same above for remaining samples of smaller length sequence */
271 tapCnt = (srcBLen) & 3U;
272
273 while (tapCnt > 0U)
274 {
275 /* accumulate the results */
276 acc0 += (*pScr1++ * *pIn2);
277 acc1 += (*pScr1++ * *pIn2);
278 acc2 += (*pScr1++ * *pIn2);
279 acc3 += (*pScr1++ * *pIn2++);
280
281 pScr1 -= 3U;
282
283 /* Decrement loop counter */
284 tapCnt--;
285 }
286
287 blkCnt--;
288
289 /* Store the results in the accumulators in the destination buffer. */
290 #ifndef ARM_MATH_BIG_ENDIAN
291 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
292 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
293 #else
294 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
295 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
296 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
297
298 /* Initialization of inputB pointer */
299 pIn2 = py;
300
301 pScratch1 += 4U;
302 }
303
304 /* Loop unrolling: Compute remaining outputs */
305 blkCnt = (srcALen + srcBLen - 1U) & 0x3;
306
307 #else
308
309 /* Initialize blkCnt with number of samples */
310 blkCnt = (srcALen + srcBLen - 1U);
311
312 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
313
314 /* Calculate convolution for remaining samples of Bigger length sequence */
315 while (blkCnt > 0)
316 {
317 /* Initialze temporary scratch pointer as scratch1 */
318 pScr1 = pScratch1;
319
320 /* Clear Accumlators */
321 acc0 = 0;
322
323 tapCnt = (srcBLen) >> 1U;
324
325 while (tapCnt > 0U)
326 {
327
328 /* Read next two samples from scratch1 buffer */
329 acc0 += (*pScr1++ * *pIn2++);
330 acc0 += (*pScr1++ * *pIn2++);
331
332 /* Decrement loop counter */
333 tapCnt--;
334 }
335
336 tapCnt = (srcBLen) & 1U;
337
338 /* apply same above for remaining samples of smaller length sequence */
339 while (tapCnt > 0U)
340 {
341
342 /* accumulate the results */
343 acc0 += (*pScr1++ * *pIn2++);
344
345 /* Decrement loop counter */
346 tapCnt--;
347 }
348
349 blkCnt--;
350
351 /* The result is in 2.30 format. Convert to 1.15 with saturation.
352 Then store the output in the destination buffer. */
353 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
354
355 /* Initialization of inputB pointer */
356 pIn2 = py;
357
358 pScratch1 += 1U;
359 }
360
361 }
362
363 /**
364 @} end of Conv group
365 */
366