1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_fast_q31.c
4 * Description: Fast Q31 Convolution
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q31 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence.
43 @param[in] srcALen length of the first input sequence.
44 @param[in] pSrcB points to the second input sequence.
45 @param[in] srcBLen length of the second input sequence.
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47
48 @par Scaling and Overflow Behavior
49 This function is optimized for speed at the expense of fixed-point precision and overflow protection.
50 The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
51 These intermediate results are accumulated in a 32-bit register in 2.30 format.
52 Finally, the accumulator is saturated and converted to a 1.31 result.
53 @par
54 The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
55 In order to avoid overflows completely the input signals must be scaled down.
56 Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
57 as maximum of min(srcALen, srcBLen) number of additions are carried internally.
58 @remark
59 Refer to \ref arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
60 */
61
arm_conv_fast_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)62 void arm_conv_fast_q31(
63 const q31_t * pSrcA,
64 uint32_t srcALen,
65 const q31_t * pSrcB,
66 uint32_t srcBLen,
67 q31_t * pDst)
68 {
69 const q31_t *pIn1; /* InputA pointer */
70 const q31_t *pIn2; /* InputB pointer */
71 q31_t *pOut = pDst; /* Output pointer */
72 const q31_t *px; /* Intermediate inputA pointer */
73 const q31_t *py; /* Intermediate inputB pointer */
74 const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
75 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
76 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
77 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
78 uint32_t j, k, count, blkCnt; /* Loop counters */
79
80 /* The algorithm implementation is based on the lengths of the inputs. */
81 /* srcB is always made to slide across srcA. */
82 /* So srcBLen is always considered as shorter or equal to srcALen */
83 if (srcALen >= srcBLen)
84 {
85 /* Initialization of inputA pointer */
86 pIn1 = pSrcA;
87
88 /* Initialization of inputB pointer */
89 pIn2 = pSrcB;
90 }
91 else
92 {
93 /* Initialization of inputA pointer */
94 pIn1 = pSrcB;
95
96 /* Initialization of inputB pointer */
97 pIn2 = pSrcA;
98
99 /* srcBLen is always considered as shorter or equal to srcALen */
100 j = srcBLen;
101 srcBLen = srcALen;
102 srcALen = j;
103 }
104
105 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
106 /* The function is internally
107 * divided into three stages according to the number of multiplications that has to be
108 * taken place between inputA samples and inputB samples. In the first stage of the
109 * algorithm, the multiplications increase by one for every iteration.
110 * In the second stage of the algorithm, srcBLen number of multiplications are done.
111 * In the third stage of the algorithm, the multiplications decrease by one
112 * for every iteration. */
113
114 /* The algorithm is implemented in three stages.
115 The loop counters of each stage is initiated here. */
116 blockSize1 = srcBLen - 1U;
117 blockSize2 = srcALen - (srcBLen - 1U);
118 blockSize3 = blockSize1;
119
120 /* --------------------------
121 * Initializations of stage1
122 * -------------------------*/
123
124 /* sum = x[0] * y[0]
125 * sum = x[0] * y[1] + x[1] * y[0]
126 * ....
127 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
128 */
129
130 /* In this stage the MAC operations are increased by 1 for every iteration.
131 The count variable holds the number of MAC operations performed */
132 count = 1U;
133
134 /* Working pointer of inputA */
135 px = pIn1;
136
137 /* Working pointer of inputB */
138 py = pIn2;
139
140
141 /* ------------------------
142 * Stage1 process
143 * ----------------------*/
144
145 /* The first stage starts here */
146 while (blockSize1 > 0U)
147 {
148 /* Accumulator is made zero for every iteration */
149 sum = 0;
150
151 /* Apply loop unrolling and compute 4 MACs simultaneously. */
152 k = count >> 2U;
153
154 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
155 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
156 while (k > 0U)
157 {
158 /* x[0] * y[srcBLen - 1] */
159 sum = (q31_t) ((((q63_t) sum << 32) +
160 ((q63_t) *px++ * (*py--))) >> 32);
161
162 /* x[1] * y[srcBLen - 2] */
163 sum = (q31_t) ((((q63_t) sum << 32) +
164 ((q63_t) *px++ * (*py--))) >> 32);
165
166 /* x[2] * y[srcBLen - 3] */
167 sum = (q31_t) ((((q63_t) sum << 32) +
168 ((q63_t) *px++ * (*py--))) >> 32);
169
170 /* x[3] * y[srcBLen - 4] */
171 sum = (q31_t) ((((q63_t) sum << 32) +
172 ((q63_t) *px++ * (*py--))) >> 32);
173
174 /* Decrement loop counter */
175 k--;
176 }
177
178 /* If the count is not a multiple of 4, compute any remaining MACs here.
179 ** No loop unrolling is used. */
180 k = count % 0x4U;
181
182 while (k > 0U)
183 {
184 /* Perform the multiply-accumulate */
185 sum = (q31_t) ((((q63_t) sum << 32) +
186 ((q63_t) *px++ * (*py--))) >> 32);
187
188 /* Decrement loop counter */
189 k--;
190 }
191
192 /* Store the result in the accumulator in the destination buffer. */
193 *pOut++ = sum << 1;
194
195 /* Update the inputA and inputB pointers for next MAC calculation */
196 py = pIn2 + count;
197 px = pIn1;
198
199 /* Increment MAC count */
200 count++;
201
202 /* Decrement loop counter */
203 blockSize1--;
204 }
205
206 /* --------------------------
207 * Initializations of stage2
208 * ------------------------*/
209
210 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
211 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
212 * ....
213 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
214 */
215
216 /* Working pointer of inputA */
217 px = pIn1;
218
219 /* Working pointer of inputB */
220 pSrc2 = pIn2 + (srcBLen - 1U);
221 py = pSrc2;
222
223 /* count is index by which the pointer pIn1 to be incremented */
224 count = 0U;
225
226 /* -------------------
227 * Stage2 process
228 * ------------------*/
229
230 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
231 * So, to loop unroll over blockSize2,
232 * srcBLen should be greater than or equal to 4 */
233 if (srcBLen >= 4U)
234 {
235 /* Loop unroll over blockSize2, by 4 */
236 blkCnt = blockSize2 >> 2U;
237
238 while (blkCnt > 0U)
239 {
240 /* Set all accumulators to zero */
241 acc0 = 0;
242 acc1 = 0;
243 acc2 = 0;
244 acc3 = 0;
245
246 /* read x[0], x[1], x[2] samples */
247 x0 = *px++;
248 x1 = *px++;
249 x2 = *px++;
250
251 /* Apply loop unrolling and compute 4 MACs simultaneously. */
252 k = srcBLen >> 2U;
253
254 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
255 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
256 do
257 {
258 /* Read y[srcBLen - 1] sample */
259 c0 = *py--;
260 /* Read x[3] sample */
261 x3 = *px++;
262
263 /* Perform the multiply-accumulate */
264 /* acc0 += x[0] * y[srcBLen - 1] */
265 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
266 /* acc1 += x[1] * y[srcBLen - 1] */
267 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
268 /* acc2 += x[2] * y[srcBLen - 1] */
269 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
270 /* acc3 += x[3] * y[srcBLen - 1] */
271 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
272
273
274 /* Read y[srcBLen - 2] sample */
275 c0 = *py--;
276 /* Read x[4] sample */
277 x0 = *px++;
278
279 /* Perform the multiply-accumulate */
280 /* acc0 += x[1] * y[srcBLen - 2] */
281 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
282 /* acc1 += x[2] * y[srcBLen - 2] */
283 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
284 /* acc2 += x[3] * y[srcBLen - 2] */
285 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
286 /* acc3 += x[4] * y[srcBLen - 2] */
287 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
288
289
290 /* Read y[srcBLen - 3] sample */
291 c0 = *py--;
292 /* Read x[5] sample */
293 x1 = *px++;
294
295 /* Perform the multiply-accumulates */
296 /* acc0 += x[2] * y[srcBLen - 3] */
297 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
298 /* acc1 += x[3] * y[srcBLen - 3] */
299 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
300 /* acc2 += x[4] * y[srcBLen - 3] */
301 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
302 /* acc3 += x[5] * y[srcBLen - 3] */
303 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
304
305
306 /* Read y[srcBLen - 4] sample */
307 c0 = *py--;
308 /* Read x[6] sample */
309 x2 = *px++;
310
311 /* Perform the multiply-accumulates */
312 /* acc0 += x[3] * y[srcBLen - 4] */
313 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
314 /* acc1 += x[4] * y[srcBLen - 4] */
315 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
316 /* acc2 += x[5] * y[srcBLen - 4] */
317 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
318 /* acc3 += x[6] * y[srcBLen - 4] */
319 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
320
321
322 } while (--k);
323
324 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
325 ** No loop unrolling is used. */
326 k = srcBLen % 0x4U;
327
328 while (k > 0U)
329 {
330 /* Read y[srcBLen - 5] sample */
331 c0 = *py--;
332 /* Read x[7] sample */
333 x3 = *px++;
334
335 /* Perform the multiply-accumulates */
336 /* acc0 += x[4] * y[srcBLen - 5] */
337 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
338 /* acc1 += x[5] * y[srcBLen - 5] */
339 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
340 /* acc2 += x[6] * y[srcBLen - 5] */
341 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
342 /* acc3 += x[7] * y[srcBLen - 5] */
343 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
344
345 /* Reuse the present samples for the next MAC */
346 x0 = x1;
347 x1 = x2;
348 x2 = x3;
349
350 /* Decrement loop counter */
351 k--;
352 }
353
354 /* Store the result in the accumulator in the destination buffer. */
355 *pOut++ = (q31_t) (acc0 << 1);
356 *pOut++ = (q31_t) (acc1 << 1);
357 *pOut++ = (q31_t) (acc2 << 1);
358 *pOut++ = (q31_t) (acc3 << 1);
359
360 /* Increment the pointer pIn1 index, count by 4 */
361 count += 4U;
362
363 /* Update the inputA and inputB pointers for next MAC calculation */
364 px = pIn1 + count;
365 py = pSrc2;
366
367 /* Decrement loop counter */
368 blkCnt--;
369 }
370
371 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
372 ** No loop unrolling is used. */
373 blkCnt = blockSize2 % 0x4U;
374
375 while (blkCnt > 0U)
376 {
377 /* Accumulator is made zero for every iteration */
378 sum = 0;
379
380 /* Apply loop unrolling and compute 4 MACs simultaneously. */
381 k = srcBLen >> 2U;
382
383 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
384 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
385 while (k > 0U)
386 {
387 /* Perform the multiply-accumulates */
388 sum = (q31_t) ((((q63_t) sum << 32) +
389 ((q63_t) *px++ * (*py--))) >> 32);
390 sum = (q31_t) ((((q63_t) sum << 32) +
391 ((q63_t) *px++ * (*py--))) >> 32);
392 sum = (q31_t) ((((q63_t) sum << 32) +
393 ((q63_t) *px++ * (*py--))) >> 32);
394 sum = (q31_t) ((((q63_t) sum << 32) +
395 ((q63_t) *px++ * (*py--))) >> 32);
396
397 /* Decrement loop counter */
398 k--;
399 }
400
401 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
402 ** No loop unrolling is used. */
403 k = srcBLen % 0x4U;
404
405 while (k > 0U)
406 {
407 /* Perform the multiply-accumulate */
408 sum = (q31_t) ((((q63_t) sum << 32) +
409 ((q63_t) *px++ * (*py--))) >> 32);
410
411 /* Decrement loop counter */
412 k--;
413 }
414
415 /* Store the result in the accumulator in the destination buffer. */
416 *pOut++ = sum << 1;
417
418 /* Increment MAC count */
419 count++;
420
421 /* Update the inputA and inputB pointers for next MAC calculation */
422 px = pIn1 + count;
423 py = pSrc2;
424
425 /* Decrement loop counter */
426 blkCnt--;
427 }
428 }
429 else
430 {
431 /* If the srcBLen is not a multiple of 4,
432 * the blockSize2 loop cannot be unrolled by 4 */
433 blkCnt = blockSize2;
434
435 while (blkCnt > 0U)
436 {
437 /* Accumulator is made zero for every iteration */
438 sum = 0;
439
440 /* srcBLen number of MACS should be performed */
441 k = srcBLen;
442
443 while (k > 0U)
444 {
445 /* Perform the multiply-accumulate */
446 sum = (q31_t) ((((q63_t) sum << 32) +
447 ((q63_t) *px++ * (*py--))) >> 32);
448
449 /* Decrement loop counter */
450 k--;
451 }
452
453 /* Store the result in the accumulator in the destination buffer. */
454 *pOut++ = sum << 1;
455
456 /* Increment MAC count */
457 count++;
458
459 /* Update the inputA and inputB pointers for next MAC calculation */
460 px = pIn1 + count;
461 py = pSrc2;
462
463 /* Decrement loop counter */
464 blkCnt--;
465 }
466 }
467
468
469 /* --------------------------
470 * Initializations of stage3
471 * -------------------------*/
472
473 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
474 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
475 * ....
476 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
477 * sum += x[srcALen-1] * y[srcBLen-1]
478 */
479
480 /* In this stage the MAC operations are decreased by 1 for every iteration.
481 The blockSize3 variable holds the number of MAC operations performed */
482
483 /* Working pointer of inputA */
484 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
485 px = pSrc1;
486
487 /* Working pointer of inputB */
488 pSrc2 = pIn2 + (srcBLen - 1U);
489 py = pSrc2;
490
491 /* -------------------
492 * Stage3 process
493 * ------------------*/
494
495 while (blockSize3 > 0U)
496 {
497 /* Accumulator is made zero for every iteration */
498 sum = 0;
499
500 /* Apply loop unrolling and compute 4 MACs simultaneously. */
501 k = blockSize3 >> 2U;
502
503 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
504 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
505 while (k > 0U)
506 {
507 /* Perform the multiply-accumulate */
508 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
509 sum = (q31_t) ((((q63_t) sum << 32) +
510 ((q63_t) *px++ * (*py--))) >> 32);
511
512 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
513 sum = (q31_t) ((((q63_t) sum << 32) +
514 ((q63_t) *px++ * (*py--))) >> 32);
515
516 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
517 sum = (q31_t) ((((q63_t) sum << 32) +
518 ((q63_t) *px++ * (*py--))) >> 32);
519
520 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
521 sum = (q31_t) ((((q63_t) sum << 32) +
522 ((q63_t) *px++ * (*py--))) >> 32);
523
524 /* Decrement loop counter */
525 k--;
526 }
527
528 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
529 ** No loop unrolling is used. */
530 k = blockSize3 % 0x4U;
531
532 while (k > 0U)
533 {
534 /* Perform the multiply-accumulate */
535 sum = (q31_t) ((((q63_t) sum << 32) +
536 ((q63_t) *px++ * (*py--))) >> 32);
537
538 /* Decrement loop counter */
539 k--;
540 }
541
542 /* Store the result in the accumulator in the destination buffer. */
543 *pOut++ = sum << 1;
544
545 /* Update the inputA and inputB pointers for next MAC calculation */
546 px = ++pSrc1;
547 py = pSrc2;
548
549 /* Decrement loop counter */
550 blockSize3--;
551 }
552
553 }
554
555 /**
556 @} end of Conv group
557 */
558