1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_fast_q15.c
4 * Description: Fast Q15 Convolution
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q15 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1
47
48 @par Scaling and Overflow Behavior
49 This fast version uses a 32-bit accumulator with 2.30 format.
50 The accumulator maintains full precision of the intermediate multiplication results
51 but provides only a single guard bit. There is no saturation on intermediate additions.
52 Thus, if the accumulator overflows it wraps around and distorts the result.
53 The input signals should be scaled down to avoid intermediate overflows.
54 Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
55 as maximum of min(srcALen, srcBLen) number of additions are carried internally.
56 The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
57
58 @remark
59 Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
60 */
61
arm_conv_fast_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)62 ARM_DSP_ATTRIBUTE void arm_conv_fast_q15(
63 const q15_t * pSrcA,
64 uint32_t srcALen,
65 const q15_t * pSrcB,
66 uint32_t srcBLen,
67 q15_t * pDst)
68 {
69 const q15_t *pIn1; /* InputA pointer */
70 const q15_t *pIn2; /* InputB pointer */
71 q15_t *pOut = pDst; /* Output pointer */
72 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
73 const q15_t *px; /* Intermediate inputA pointer */
74 const q15_t *py; /* Intermediate inputB pointer */
75 const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
76 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
77 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
78 uint32_t j, k, count, blkCnt; /* Loop counters */
79
80 /* The algorithm implementation is based on the lengths of the inputs. */
81 /* srcB is always made to slide across srcA. */
82 /* So srcBLen is always considered as shorter or equal to srcALen */
83 if (srcALen >= srcBLen)
84 {
85 /* Initialization of inputA pointer */
86 pIn1 = pSrcA;
87
88 /* Initialization of inputB pointer */
89 pIn2 = pSrcB;
90 }
91 else
92 {
93 /* Initialization of inputA pointer */
94 pIn1 = pSrcB;
95
96 /* Initialization of inputB pointer */
97 pIn2 = pSrcA;
98
99 /* srcBLen is always considered as shorter or equal to srcALen */
100 j = srcBLen;
101 srcBLen = srcALen;
102 srcALen = j;
103 }
104
105 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
106 /* The function is internally
107 * divided into three stages according to the number of multiplications that has to be
108 * taken place between inputA samples and inputB samples. In the first stage of the
109 * algorithm, the multiplications increase by one for every iteration.
110 * In the second stage of the algorithm, srcBLen number of multiplications are done.
111 * In the third stage of the algorithm, the multiplications decrease by one
112 * for every iteration. */
113
114 /* The algorithm is implemented in three stages.
115 The loop counters of each stage is initiated here. */
116 blockSize1 = srcBLen - 1U;
117 blockSize2 = srcALen - (srcBLen - 1U);
118 blockSize3 = blockSize1;
119
120 /* --------------------------
121 * Initializations of stage1
122 * -------------------------*/
123
124 /* sum = x[0] * y[0]
125 * sum = x[0] * y[1] + x[1] * y[0]
126 * ....
127 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
128 */
129
130 /* In this stage the MAC operations are increased by 1 for every iteration.
131 The count variable holds the number of MAC operations performed */
132 count = 1U;
133
134 /* Working pointer of inputA */
135 px = pIn1;
136
137 /* Working pointer of inputB */
138 py = pIn2;
139
140
141 /* ------------------------
142 * Stage1 process
143 * ----------------------*/
144
145 /* For loop unrolling by 4, this stage is divided into two. */
146 /* First part of this stage computes the MAC operations less than 4 */
147 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
148
149 /* The first part of the stage starts here */
150 while ((count < 4U) && (blockSize1 > 0U))
151 {
152 /* Accumulator is made zero for every iteration */
153 sum = 0;
154
155 /* Loop over number of MAC operations between
156 * inputA samples and inputB samples */
157 k = count;
158
159 while (k > 0U)
160 {
161 /* Perform the multiply-accumulates */
162 sum = __SMLAD(*px++, *py--, sum);
163
164 /* Decrement the loop counter */
165 k--;
166 }
167
168 /* Store the result in the accumulator in the destination buffer. */
169 *pOut++ = (q15_t) (sum >> 15);
170
171 /* Update the inputA and inputB pointers for next MAC calculation */
172 py = pIn2 + count;
173 px = pIn1;
174
175 /* Increment MAC count */
176 count++;
177
178 /* Decrement loop counter */
179 blockSize1--;
180 }
181
182 /* The second part of the stage starts here */
183 /* The internal loop, over count, is unrolled by 4 */
184 /* To, read the last two inputB samples using SIMD:
185 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
186 py = py - 1;
187
188 while (blockSize1 > 0U)
189 {
190 /* Accumulator is made zero for every iteration */
191 sum = 0;
192
193 /* Apply loop unrolling and compute 4 MACs simultaneously. */
194 k = count >> 2U;
195
196 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
197 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
198 while (k > 0U)
199 {
200 /* Perform the multiply-accumulates */
201 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
202 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
203 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
204 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
205
206 /* Decrement loop counter */
207 k--;
208 }
209
210 /* For the next MAC operations, the pointer py is used without SIMD
211 * So, py is incremented by 1 */
212 py = py + 1U;
213
214 /* If the count is not a multiple of 4, compute any remaining MACs here.
215 ** No loop unrolling is used. */
216 k = count % 0x4U;
217
218 while (k > 0U)
219 {
220 /* Perform the multiply-accumulates */
221 sum = __SMLAD(*px++, *py--, sum);
222
223 /* Decrement the loop counter */
224 k--;
225 }
226
227 /* Store the result in the accumulator in the destination buffer. */
228 *pOut++ = (q15_t) (sum >> 15);
229
230 /* Update the inputA and inputB pointers for next MAC calculation */
231 py = pIn2 + (count - 1U);
232 px = pIn1;
233
234 /* Increment MAC count */
235 count++;
236
237 /* Decrement loop counter */
238 blockSize1--;
239 }
240
241 /* --------------------------
242 * Initializations of stage2
243 * ------------------------*/
244
245 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
246 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
247 * ....
248 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
249 */
250
251 /* Working pointer of inputA */
252 px = pIn1;
253
254 /* Working pointer of inputB */
255 pSrc2 = pIn2 + (srcBLen - 1U);
256 py = pSrc2;
257
258 /* count is the index by which the pointer pIn1 to be incremented */
259 count = 0U;
260
261 /* --------------------
262 * Stage2 process
263 * -------------------*/
264
265 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
266 * So, to loop unroll over blockSize2,
267 * srcBLen should be greater than or equal to 4 */
268 if (srcBLen >= 4U)
269 {
270 /* Loop unroll over blockSize2, by 4 */
271 blkCnt = blockSize2 >> 2U;
272
273 while (blkCnt > 0U)
274 {
275 py = py - 1U;
276
277 /* Set all accumulators to zero */
278 acc0 = 0;
279 acc1 = 0;
280 acc2 = 0;
281 acc3 = 0;
282
283 /* read x[0], x[1] samples */
284 x0 = read_q15x2 ((q15_t *) px);
285 /* read x[1], x[2] samples */
286 x1 = read_q15x2 ((q15_t *) px + 1);
287 px += 2U;
288
289 /* Apply loop unrolling and compute 4 MACs simultaneously. */
290 k = srcBLen >> 2U;
291
292 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
293 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
294 do
295 {
296 /* Read the last two inputB samples using SIMD:
297 * y[srcBLen - 1] and y[srcBLen - 2] */
298 c0 = read_q15x2_da ((q15_t **) &py);
299
300 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
301 acc0 = __SMLADX(x0, c0, acc0);
302
303 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
304 acc1 = __SMLADX(x1, c0, acc1);
305
306 /* Read x[2], x[3] */
307 x2 = read_q15x2 ((q15_t *) px);
308
309 /* Read x[3], x[4] */
310 x3 = read_q15x2 ((q15_t *) px + 1);
311
312 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
313 acc2 = __SMLADX(x2, c0, acc2);
314
315 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
316 acc3 = __SMLADX(x3, c0, acc3);
317
318 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
319 c0 = read_q15x2_da ((q15_t **) &py);
320
321 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
322 acc0 = __SMLADX(x2, c0, acc0);
323
324 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
325 acc1 = __SMLADX(x3, c0, acc1);
326
327 /* Read x[4], x[5] */
328 x0 = read_q15x2 ((q15_t *) px + 2);
329
330 /* Read x[5], x[6] */
331 x1 = read_q15x2 ((q15_t *) px + 3);
332 px += 4U;
333
334 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
335 acc2 = __SMLADX(x0, c0, acc2);
336
337 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
338 acc3 = __SMLADX(x1, c0, acc3);
339
340 } while (--k);
341
342 /* For the next MAC operations, SIMD is not used
343 * So, the 16 bit pointer if inputB, py is updated */
344
345 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
346 ** No loop unrolling is used. */
347 k = srcBLen % 0x4U;
348
349 if (k == 1U)
350 {
351 /* Read y[srcBLen - 5] */
352 c0 = *(py+1);
353
354 #ifdef ARM_MATH_BIG_ENDIAN
355 c0 = c0 << 16U;
356 #else
357 c0 = c0 & 0x0000FFFF;
358 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
359
360 /* Read x[7] */
361 x3 = read_q15x2 ((q15_t *) px);
362 px++;
363
364 /* Perform the multiply-accumulates */
365 acc0 = __SMLAD(x0, c0, acc0);
366 acc1 = __SMLAD(x1, c0, acc1);
367 acc2 = __SMLADX(x1, c0, acc2);
368 acc3 = __SMLADX(x3, c0, acc3);
369 }
370
371 if (k == 2U)
372 {
373 /* Read y[srcBLen - 5], y[srcBLen - 6] */
374 c0 = read_q15x2 ((q15_t *) py);
375
376 /* Read x[7], x[8] */
377 x3 = read_q15x2 ((q15_t *) px);
378
379 /* Read x[9] */
380 x2 = read_q15x2 ((q15_t *) px + 1);
381 px += 2U;
382
383 /* Perform the multiply-accumulates */
384 acc0 = __SMLADX(x0, c0, acc0);
385 acc1 = __SMLADX(x1, c0, acc1);
386 acc2 = __SMLADX(x3, c0, acc2);
387 acc3 = __SMLADX(x2, c0, acc3);
388 }
389
390 if (k == 3U)
391 {
392 /* Read y[srcBLen - 5], y[srcBLen - 6] */
393 c0 = read_q15x2 ((q15_t *) py);
394
395 /* Read x[7], x[8] */
396 x3 = read_q15x2 ((q15_t *) px);
397
398 /* Read x[9] */
399 x2 = read_q15x2 ((q15_t *) px + 1);
400
401 /* Perform the multiply-accumulates */
402 acc0 = __SMLADX(x0, c0, acc0);
403 acc1 = __SMLADX(x1, c0, acc1);
404 acc2 = __SMLADX(x3, c0, acc2);
405 acc3 = __SMLADX(x2, c0, acc3);
406
407 /* Read y[srcBLen - 7] */
408 c0 = *(py-1);
409 #ifdef ARM_MATH_BIG_ENDIAN
410 c0 = c0 << 16U;
411 #else
412 c0 = c0 & 0x0000FFFF;
413 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
414
415 /* Read x[10] */
416 x3 = read_q15x2 ((q15_t *) px + 2);
417 px += 3U;
418
419 /* Perform the multiply-accumulates */
420 acc0 = __SMLADX(x1, c0, acc0);
421 acc1 = __SMLAD(x2, c0, acc1);
422 acc2 = __SMLADX(x2, c0, acc2);
423 acc3 = __SMLADX(x3, c0, acc3);
424 }
425
426 /* Store the result in the accumulator in the destination buffer. */
427 #ifndef ARM_MATH_BIG_ENDIAN
428 write_q15x2_ia (&pOut, __PKHBT((acc0 >> 15), (acc1 >> 15), 16));
429 write_q15x2_ia (&pOut, __PKHBT((acc2 >> 15), (acc3 >> 15), 16));
430 #else
431 write_q15x2_ia (&pOut, __PKHBT((acc1 >> 15), (acc0 >> 15), 16));
432 write_q15x2_ia (&pOut, __PKHBT((acc3 >> 15), (acc2 >> 15), 16));
433 #endif /*#ifndef ARM_MATH_BIG_ENDIAN*/
434
435 /* Increment the pointer pIn1 index, count by 4 */
436 count += 4U;
437
438 /* Update the inputA and inputB pointers for next MAC calculation */
439 px = pIn1 + count;
440 py = pSrc2;
441
442 /* Decrement loop counter */
443 blkCnt--;
444 }
445
446 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
447 ** No loop unrolling is used. */
448 blkCnt = blockSize2 % 0x4U;
449
450 while (blkCnt > 0U)
451 {
452 /* Accumulator is made zero for every iteration */
453 sum = 0;
454
455 /* Apply loop unrolling and compute 4 MACs simultaneously. */
456 k = srcBLen >> 2U;
457
458 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
459 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
460 while (k > 0U)
461 {
462 /* Perform the multiply-accumulates */
463 sum += ((q31_t) *px++ * *py--);
464 sum += ((q31_t) *px++ * *py--);
465 sum += ((q31_t) *px++ * *py--);
466 sum += ((q31_t) *px++ * *py--);
467
468 /* Decrement loop counter */
469 k--;
470 }
471
472 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
473 ** No loop unrolling is used. */
474 k = srcBLen % 0x4U;
475
476 while (k > 0U)
477 {
478 /* Perform the multiply-accumulates */
479 sum += ((q31_t) *px++ * *py--);
480
481 /* Decrement loop counter */
482 k--;
483 }
484
485 /* Store the result in the accumulator in the destination buffer. */
486 *pOut++ = (q15_t) (sum >> 15);
487
488 /* Increment the pointer pIn1 index, count by 1 */
489 count++;
490
491 /* Update the inputA and inputB pointers for next MAC calculation */
492 px = pIn1 + count;
493 py = pSrc2;
494
495 /* Decrement loop counter */
496 blkCnt--;
497 }
498 }
499 else
500 {
501 /* If the srcBLen is not a multiple of 4,
502 * the blockSize2 loop cannot be unrolled by 4 */
503 blkCnt = blockSize2;
504
505 while (blkCnt > 0U)
506 {
507 /* Accumulator is made zero for every iteration */
508 sum = 0;
509
510 /* srcBLen number of MACS should be performed */
511 k = srcBLen;
512
513 while (k > 0U)
514 {
515 /* Perform the multiply-accumulate */
516 sum += ((q31_t) *px++ * *py--);
517
518 /* Decrement loop counter */
519 k--;
520 }
521
522 /* Store the result in the accumulator in the destination buffer. */
523 *pOut++ = (q15_t) (sum >> 15);
524
525 /* Increment MAC count */
526 count++;
527
528 /* Update the inputA and inputB pointers for next MAC calculation */
529 px = pIn1 + count;
530 py = pSrc2;
531
532 /* Decrement loop counter */
533 blkCnt--;
534 }
535 }
536
537 /* --------------------------
538 * Initializations of stage3
539 * -------------------------*/
540
541 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
542 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
543 * ....
544 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
545 * sum += x[srcALen-1] * y[srcBLen-1]
546 */
547
548 /* In this stage the MAC operations are decreased by 1 for every iteration.
549 The blockSize3 variable holds the number of MAC operations performed */
550
551 /* Working pointer of inputA */
552 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
553 px = pSrc1;
554
555 /* Working pointer of inputB */
556 pSrc2 = pIn2 + (srcBLen - 1U);
557 pIn2 = pSrc2 - 1U;
558 py = pIn2;
559
560 /* -------------------
561 * Stage3 process
562 * ------------------*/
563
564 /* For loop unrolling by 4, this stage is divided into two. */
565 /* First part of this stage computes the MAC operations greater than 4 */
566 /* Second part of this stage computes the MAC operations less than or equal to 4 */
567
568 /* The first part of the stage starts here */
569 j = blockSize3 >> 2U;
570
571 while ((j > 0U) && (blockSize3 > 0U))
572 {
573 /* Accumulator is made zero for every iteration */
574 sum = 0;
575
576 /* Apply loop unrolling and compute 4 MACs simultaneously. */
577 k = blockSize3 >> 2U;
578
579 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
580 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
581 while (k > 0U)
582 {
583 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
584 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
585 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
586 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
587 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
588 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
589
590 /* Decrement loop counter */
591 k--;
592 }
593
594 /* For the next MAC operations, the pointer py is used without SIMD
595 * So, py is incremented by 1 */
596 py = py + 1U;
597
598 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
599 ** No loop unrolling is used. */
600 k = blockSize3 % 0x4U;
601
602 while (k > 0U)
603 {
604 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
605 sum = __SMLAD(*px++, *py--, sum);
606
607 /* Decrement loop counter */
608 k--;
609 }
610
611 /* Store the result in the accumulator in the destination buffer. */
612 *pOut++ = (q15_t) (sum >> 15);
613
614 /* Update the inputA and inputB pointers for next MAC calculation */
615 px = ++pSrc1;
616 py = pIn2;
617
618 /* Decrement loop counter */
619 blockSize3--;
620
621 j--;
622 }
623
624 /* The second part of the stage starts here */
625 /* SIMD is not used for the next MAC operations,
626 * so pointer py is updated to read only one sample at a time */
627 py = py + 1U;
628
629 while (blockSize3 > 0U)
630 {
631 /* Accumulator is made zero for every iteration */
632 sum = 0;
633
634 /* Apply loop unrolling and compute 4 MACs simultaneously. */
635 k = blockSize3;
636
637 while (k > 0U)
638 {
639 /* Perform the multiply-accumulates */
640 /* sum += x[srcALen-1] * y[srcBLen-1] */
641 sum = __SMLAD(*px++, *py--, sum);
642
643 /* Decrement loop counter */
644 k--;
645 }
646
647 /* Store the result in the accumulator in the destination buffer. */
648 *pOut++ = (q15_t) (sum >> 15);
649
650 /* Update the inputA and inputB pointers for next MAC calculation */
651 px = ++pSrc1;
652 py = pSrc2;
653
654 /* Decrement the loop counter */
655 blockSize3--;
656 }
657
658 }
659
660 /**
661 @} end of Conv group
662 */
663