1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_q15.c
4 * Description: Convolution of Q15 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q15 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using a 64-bit internal accumulator.
50 Both inputs are in 1.15 format and multiplications yield a 2.30 result.
51 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
52 This approach provides 33 guard bits and there is no risk of overflow.
53 The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
54
55 @remark
56 Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
57 @remark
58 Refer to \ref arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
59 */
60
61
62 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
63 #include "arm_helium_utils.h"
64 #include "arm_vec_filtering.h"
65
66
arm_conv_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)67 ARM_DSP_ATTRIBUTE void arm_conv_q15(
68 const q15_t * pSrcA,
69 uint32_t srcALen,
70 const q15_t * pSrcB,
71 uint32_t srcBLen,
72 q15_t * pDst)
73 {
74 const q15_t *pIn1 = pSrcA; /* inputA pointer */
75 const q15_t *pIn2 = pSrcB; /* inputB pointer */
76 /*
77 * Loop to perform MAC operations according to correlation equation
78 */
79 const q15_t *pX;
80 const q15_t *pY;
81 const q15_t *pA;
82 const q15_t *pB;
83 int32_t i = 0U, j = 0; /* loop counters */
84 int32_t block1, block2, block3;
85
86
87
88 uint16x8_t decrIdxVec = vddupq_u16(7, 1);
89
90
91 if (srcALen < srcBLen)
92 {
93 /*
94 * Initialization to inputB pointer
95 */
96 pIn1 = pSrcB;
97 /*
98 * Initialization to the end of inputA pointer
99 */
100 pIn2 = pSrcA;
101 /*
102 * Swapping the lengths
103 */
104 j = srcALen;
105 srcALen = srcBLen;
106 srcBLen = j;
107 }
108
109 block1 = srcBLen - 1;
110 block2 = srcALen - srcBLen + 1;
111 block3 = srcBLen - 1;
112
113
114 pA = pIn1;
115 pB = pIn2 - 7;
116
117 for (i = 0; i <= block1 - 2; i += 2)
118 {
119 uint32_t count = i + 1;
120 int64_t acc0 = 0LL;
121 int64_t acc1 = 0LL;
122
123 pX = pA;
124 pY = pB;
125
126 MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count);
127 *pDst++ = (q15_t) acc0;
128 *pDst++ = (q15_t) acc1;
129 pB += 2;
130 }
131 for (; i < block1; i++)
132 {
133 uint32_t count = i + 1;
134 int64_t acc = 0LL;
135
136 pX = pA;
137 pY = pB;
138
139 MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
140 *pDst++ = (q15_t) acc;
141 pB++;
142 }
143
144 for (i = 0; i <= block2 - 4; i += 4)
145 {
146 uint32_t count = srcBLen;
147 int64_t acc0 = 0LL;
148 int64_t acc1 = 0LL;
149 int64_t acc2 = 0LL;
150 int64_t acc3 = 0LL;
151
152 pX = pA;
153 pY = pB;
154 /*
155 * compute 4 accumulators per loop
156 * size is fixed for all accumulators
157 * X pointer is incrementing for successive accumulators
158 */
159 MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count);
160 *pDst++ = (q15_t) acc0;
161 *pDst++ = (q15_t) acc1;
162 *pDst++ = (q15_t) acc2;
163 *pDst++ = (q15_t) acc3;
164
165 pA += 4;
166 }
167 for (; i <= block2 - 2; i += 2)
168 {
169 uint32_t count = srcBLen;
170 int64_t acc0 = 0LL;
171 int64_t acc1 = 0LL;
172
173 pX = pA;
174 pY = pB;
175 /*
176 * compute 2 accumulators per loop
177 * size is fixed for all accumulators
178 * X pointer is incrementing for successive accumulators
179 */
180 MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count);
181 *pDst++ = (q15_t) acc0;
182 *pDst++ = (q15_t) acc1;
183
184 pA += 2;
185 }
186 if (block2 & 1)
187 {
188 uint32_t count = srcBLen;
189 int64_t acc = 0LL;
190
191 pX = pA;
192 pY = pB;
193
194 MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
195 *pDst++ = (q15_t) acc;
196 pA++;
197 }
198
199 for (i = block3; i >= 2; i -= 2)
200 {
201 uint32_t count = i;
202 int64_t acc0 = 0LL;
203 int64_t acc1 = 0LL;
204
205 pX = pA;
206 pY = pB;
207
208 MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count);
209 *pDst++ = (q15_t) acc0;
210 *pDst++ = (q15_t) acc1;
211 pA += 2;
212 }
213 for (; i > 0; i--)
214 {
215 uint32_t count = i;
216 int64_t acc = 0LL;
217
218 pX = pA;
219 pY = pB;
220
221 MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
222 *pDst++ = (q15_t) acc;
223 pA++;
224 }
225
226
227 }
228 #else
arm_conv_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst)229 ARM_DSP_ATTRIBUTE void arm_conv_q15(
230 const q15_t * pSrcA,
231 uint32_t srcALen,
232 const q15_t * pSrcB,
233 uint32_t srcBLen,
234 q15_t * pDst)
235 {
236
237 #if defined (ARM_MATH_DSP)
238
239 const q15_t *pIn1; /* InputA pointer */
240 const q15_t *pIn2; /* InputB pointer */
241 q15_t *pOut = pDst; /* Output pointer */
242 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
243 const q15_t *px; /* Intermediate inputA pointer */
244 const q15_t *py; /* Intermediate inputB pointer */
245 const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
246 q31_t x0, x1, x2, x3, c0; /* Temporary input variables to hold state and coefficient values */
247 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
248 uint32_t j, k, count, blkCnt; /* Loop counters */
249
250
251
252 /* The algorithm implementation is based on the lengths of the inputs. */
253 /* srcB is always made to slide across srcA. */
254 /* So srcBLen is always considered as shorter or equal to srcALen */
255 if (srcALen >= srcBLen)
256 {
257 /* Initialization of inputA pointer */
258 pIn1 = pSrcA;
259
260 /* Initialization of inputB pointer */
261 pIn2 = pSrcB;
262 }
263 else
264 {
265 /* Initialization of inputA pointer */
266 pIn1 = pSrcB;
267
268 /* Initialization of inputB pointer */
269 pIn2 = pSrcA;
270
271 /* srcBLen is always considered as shorter or equal to srcALen */
272 j = srcBLen;
273 srcBLen = srcALen;
274 srcALen = j;
275 }
276
277 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
278 /* The function is internally
279 * divided into three stages according to the number of multiplications that has to be
280 * taken place between inputA samples and inputB samples. In the first stage of the
281 * algorithm, the multiplications increase by one for every iteration.
282 * In the second stage of the algorithm, srcBLen number of multiplications are done.
283 * In the third stage of the algorithm, the multiplications decrease by one
284 * for every iteration. */
285
286 /* The algorithm is implemented in three stages.
287 The loop counters of each stage is initiated here. */
288 blockSize1 = srcBLen - 1U;
289 blockSize2 = srcALen - (srcBLen - 1U);
290
291
292
293
294 /* --------------------------
295 * Initializations of stage1
296 * -------------------------*/
297
298 /* sum = x[0] * y[0]
299 * sum = x[0] * y[1] + x[1] * y[0]
300 * ....
301 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
302 */
303
304 /* In this stage the MAC operations are increased by 1 for every iteration.
305 The count variable holds the number of MAC operations performed */
306 count = 1U;
307
308 /* Working pointer of inputA */
309 px = pIn1;
310
311 /* Working pointer of inputB */
312 py = pIn2;
313
314 /* ------------------------
315 * Stage1 process
316 * ----------------------*/
317
318 /* For loop unrolling by 4, this stage is divided into two. */
319 /* First part of this stage computes the MAC operations less than 4 */
320 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
321
322 /* The first part of the stage starts here */
323 while ((count < 4U) && (blockSize1 > 0U))
324 {
325 /* Accumulator is made zero for every iteration */
326 sum = 0;
327
328 /* Loop over number of MAC operations between
329 * inputA samples and inputB samples */
330 k = count;
331
332 while (k > 0U)
333 {
334 /* Perform the multiply-accumulates */
335 sum = __SMLALD(*px++, *py--, sum);
336
337 /* Decrement loop counter */
338 k--;
339 }
340
341 /* Store the result in the accumulator in the destination buffer. */
342 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
343
344 /* Update the inputA and inputB pointers for next MAC calculation */
345 py = pIn2 + count;
346 px = pIn1;
347
348 /* Increment MAC count */
349 count++;
350
351 /* Decrement loop counter */
352 blockSize1--;
353 }
354
355 /* The second part of the stage starts here */
356 /* The internal loop, over count, is unrolled by 4 */
357 /* To, read the last two inputB samples using SIMD:
358 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
359 py = py - 1;
360
361 while (blockSize1 > 0U)
362 {
363 /* Accumulator is made zero for every iteration */
364 sum = 0;
365
366 /* Apply loop unrolling and compute 4 MACs simultaneously. */
367 k = count >> 2U;
368
369 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
370 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
371 while (k > 0U)
372 {
373 /* Perform the multiply-accumulate */
374 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
375 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
376 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
377 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
378
379 /* Decrement loop counter */
380 k--;
381 }
382
383 /* For the next MAC operations, the pointer py is used without SIMD
384 * So, py is incremented by 1 */
385 py = py + 1U;
386
387 /* If the count is not a multiple of 4, compute any remaining MACs here.
388 ** No loop unrolling is used. */
389 k = count % 0x4U;
390
391 while (k > 0U)
392 {
393 /* Perform the multiply-accumulate */
394 sum = __SMLALD(*px++, *py--, sum);
395
396 /* Decrement loop counter */
397 k--;
398 }
399
400 /* Store the result in the accumulator in the destination buffer. */
401 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
402
403 /* Update the inputA and inputB pointers for next MAC calculation */
404 py = pIn2 + (count - 1U);
405 px = pIn1;
406
407 /* Increment MAC count */
408 count++;
409
410 /* Decrement loop counter */
411 blockSize1--;
412 }
413
414 /* --------------------------
415 * Initializations of stage2
416 * ------------------------*/
417
418 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
419 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
420 * ....
421 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
422 */
423
424 /* Working pointer of inputA */
425 px = pIn1;
426
427 /* Working pointer of inputB */
428 pSrc2 = pIn2 + (srcBLen - 1U);
429 py = pSrc2;
430
431 /* count is the index by which the pointer pIn1 to be incremented */
432 count = 0U;
433
434 /* -------------------
435 * Stage2 process
436 * ------------------*/
437
438 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
439 * So, to loop unroll over blockSize2,
440 * srcBLen should be greater than or equal to 4 */
441 if (srcBLen >= 4U)
442 {
443 /* Loop unrolling: Compute 4 outputs at a time */
444 blkCnt = blockSize2 >> 2U;
445
446 while (blkCnt > 0U)
447 {
448 py = py - 1U;
449
450 /* Set all accumulators to zero */
451 acc0 = 0;
452 acc1 = 0;
453 acc2 = 0;
454 acc3 = 0;
455
456 /* read x[0], x[1] samples */
457 x0 = read_q15x2 ((q15_t *) px);
458
459 /* read x[1], x[2] samples */
460 x1 = read_q15x2 ((q15_t *) px + 1);
461 px += 2U;
462
463 /* Apply loop unrolling and compute 4 MACs simultaneously. */
464 k = srcBLen >> 2U;
465
466 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
467 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
468 do
469 {
470 /* Read the last two inputB samples using SIMD:
471 * y[srcBLen - 1] and y[srcBLen - 2] */
472 c0 = read_q15x2_da ((q15_t **) &py);
473
474 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
475 acc0 = __SMLALDX(x0, c0, acc0);
476
477 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
478 acc1 = __SMLALDX(x1, c0, acc1);
479
480 /* Read x[2], x[3] */
481 x2 = read_q15x2 ((q15_t *) px);
482
483 /* Read x[3], x[4] */
484 x3 = read_q15x2 ((q15_t *) px + 1);
485
486 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
487 acc2 = __SMLALDX(x2, c0, acc2);
488
489 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
490 acc3 = __SMLALDX(x3, c0, acc3);
491
492 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
493 c0 = read_q15x2_da ((q15_t **) &py);
494
495 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
496 acc0 = __SMLALDX(x2, c0, acc0);
497
498 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
499 acc1 = __SMLALDX(x3, c0, acc1);
500
501 /* Read x[4], x[5] */
502 x0 = read_q15x2 ((q15_t *) px + 2);
503
504 /* Read x[5], x[6] */
505 x1 = read_q15x2 ((q15_t *) px + 3);
506
507 px += 4U;
508
509 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
510 acc2 = __SMLALDX(x0, c0, acc2);
511
512 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
513 acc3 = __SMLALDX(x1, c0, acc3);
514
515 } while (--k);
516
517 /* For the next MAC operations, SIMD is not used
518 * So, the 16 bit pointer if inputB, py is updated */
519
520 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
521 ** No loop unrolling is used. */
522 k = srcBLen % 0x4U;
523
524 if (k == 1U)
525 {
526 /* Read y[srcBLen - 5] */
527 c0 = *(py + 1);
528 #ifdef ARM_MATH_BIG_ENDIAN
529 c0 = c0 << 16U;
530 #else
531 c0 = c0 & 0x0000FFFF;
532 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
533
534 /* Read x[7] */
535 x3 = read_q15x2 ((q15_t *) px);
536 px++;
537
538 /* Perform the multiply-accumulate */
539 acc0 = __SMLALD(x0, c0, acc0);
540 acc1 = __SMLALD(x1, c0, acc1);
541 acc2 = __SMLALDX(x1, c0, acc2);
542 acc3 = __SMLALDX(x3, c0, acc3);
543 }
544
545 if (k == 2U)
546 {
547 /* Read y[srcBLen - 5], y[srcBLen - 6] */
548 c0 = read_q15x2 ((q15_t *) py);
549
550 /* Read x[7], x[8] */
551 x3 = read_q15x2 ((q15_t *) px);
552
553 /* Read x[9] */
554 x2 = read_q15x2 ((q15_t *) px + 1);
555 px += 2U;
556
557 /* Perform the multiply-accumulate */
558 acc0 = __SMLALDX(x0, c0, acc0);
559 acc1 = __SMLALDX(x1, c0, acc1);
560 acc2 = __SMLALDX(x3, c0, acc2);
561 acc3 = __SMLALDX(x2, c0, acc3);
562 }
563
564 if (k == 3U)
565 {
566 /* Read y[srcBLen - 5], y[srcBLen - 6] */
567 c0 = read_q15x2 ((q15_t *) py);
568
569 /* Read x[7], x[8] */
570 x3 = read_q15x2 ((q15_t *) px);
571
572 /* Read x[9] */
573 x2 = read_q15x2 ((q15_t *) px + 1);
574
575 /* Perform the multiply-accumulate */
576 acc0 = __SMLALDX(x0, c0, acc0);
577 acc1 = __SMLALDX(x1, c0, acc1);
578 acc2 = __SMLALDX(x3, c0, acc2);
579 acc3 = __SMLALDX(x2, c0, acc3);
580
581 c0 = *(py-1);
582 #ifdef ARM_MATH_BIG_ENDIAN
583 c0 = c0 << 16U;
584 #else
585 c0 = c0 & 0x0000FFFF;
586 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
587
588 /* Read x[10] */
589 x3 = read_q15x2 ((q15_t *) px + 2);
590 px += 3U;
591
592 /* Perform the multiply-accumulates */
593 acc0 = __SMLALDX(x1, c0, acc0);
594 acc1 = __SMLALD(x2, c0, acc1);
595 acc2 = __SMLALDX(x2, c0, acc2);
596 acc3 = __SMLALDX(x3, c0, acc3);
597 }
598
599 /* Store the result in the accumulator in the destination buffer. */
600 {
601 int32_t sat0 = __SSAT((acc0 >> 15), 16);
602 int32_t sat1 = __SSAT((acc1 >> 15), 16);
603 int32_t sat2 = __SSAT((acc2 >> 15), 16);
604 int32_t sat3 = __SSAT((acc3 >> 15), 16);
605 #ifndef ARM_MATH_BIG_ENDIAN
606 write_q15x2_ia (&pOut, __PKHBT(sat0, sat1, 16));
607 write_q15x2_ia (&pOut, __PKHBT(sat2, sat3, 16));
608 #else
609 write_q15x2_ia (&pOut, __PKHBT(sat1, sat0, 16));
610 write_q15x2_ia (&pOut, __PKHBT(sat3, sat2, 16));
611 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
612 }
613 /* Increment the pointer pIn1 index, count by 4 */
614 count += 4U;
615
616 /* Update the inputA and inputB pointers for next MAC calculation */
617 px = pIn1 + count;
618 py = pSrc2;
619
620 /* Decrement loop counter */
621 blkCnt--;
622 }
623
624 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
625 ** No loop unrolling is used. */
626 blkCnt = blockSize2 % 0x4U;
627
628 while (blkCnt > 0U)
629 {
630 /* Accumulator is made zero for every iteration */
631 sum = 0;
632
633 /* Apply loop unrolling and compute 4 MACs simultaneously. */
634 k = srcBLen >> 2U;
635
636 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
637 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
638 while (k > 0U)
639 {
640 /* Perform the multiply-accumulates */
641 sum += (q63_t) ((q31_t) *px++ * *py--);
642 sum += (q63_t) ((q31_t) *px++ * *py--);
643 sum += (q63_t) ((q31_t) *px++ * *py--);
644 sum += (q63_t) ((q31_t) *px++ * *py--);
645
646 /* Decrement loop counter */
647 k--;
648 }
649
650 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
651 ** No loop unrolling is used. */
652 k = srcBLen % 0x4U;
653
654 while (k > 0U)
655 {
656 /* Perform the multiply-accumulates */
657 sum += (q63_t) ((q31_t) *px++ * *py--);
658
659 /* Decrement the loop counter */
660 k--;
661 }
662
663 /* Store the result in the accumulator in the destination buffer. */
664 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
665
666 /* Increment the pointer pIn1 index, count by 1 */
667 count++;
668
669 /* Update the inputA and inputB pointers for next MAC calculation */
670 px = pIn1 + count;
671 py = pSrc2;
672
673 /* Decrement the loop counter */
674 blkCnt--;
675 }
676 }
677 else
678 {
679 /* If the srcBLen is not a multiple of 4,
680 * the blockSize2 loop cannot be unrolled by 4 */
681 blkCnt = blockSize2;
682
683 while (blkCnt > 0U)
684 {
685 /* Accumulator is made zero for every iteration */
686 sum = 0;
687
688 /* srcBLen number of MACS should be performed */
689 k = srcBLen;
690
691 while (k > 0U)
692 {
693 /* Perform the multiply-accumulate */
694 sum += (q63_t) ((q31_t) *px++ * *py--);
695
696 /* Decrement the loop counter */
697 k--;
698 }
699
700 /* Store the result in the accumulator in the destination buffer. */
701 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
702
703 /* Increment the MAC count */
704 count++;
705
706 /* Update the inputA and inputB pointers for next MAC calculation */
707 px = pIn1 + count;
708 py = pSrc2;
709
710 /* Decrement the loop counter */
711 blkCnt--;
712 }
713 }
714
715
716 /* --------------------------
717 * Initializations of stage3
718 * -------------------------*/
719
720 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
721 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
722 * ....
723 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
724 * sum += x[srcALen-1] * y[srcBLen-1]
725 */
726
727 /* In this stage the MAC operations are decreased by 1 for every iteration.
728 The blockSize3 variable holds the number of MAC operations performed */
729 blockSize3 = srcBLen - 1U;
730
731 /* Working pointer of inputA */
732 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
733 px = pSrc1;
734
735 /* Working pointer of inputB */
736 pSrc2 = pIn2 + (srcBLen - 1U);
737 pIn2 = pSrc2 - 1U;
738 py = pIn2;
739
740 /* -------------------
741 * Stage3 process
742 * ------------------*/
743
744 /* For loop unrolling by 4, this stage is divided into two. */
745 /* First part of this stage computes the MAC operations greater than 4 */
746 /* Second part of this stage computes the MAC operations less than or equal to 4 */
747
748 /* The first part of the stage starts here */
749 j = blockSize3 >> 2U;
750
751 while ((j > 0U) && (blockSize3 > 0U))
752 {
753 /* Accumulator is made zero for every iteration */
754 sum = 0;
755
756 /* Apply loop unrolling and compute 4 MACs simultaneously. */
757 k = blockSize3 >> 2U;
758
759 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
760 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
761 while (k > 0U)
762 {
763 /* Perform the multiply-accumulate */
764 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
765 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
766 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
767 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
768 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
769 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
770
771 /* Decrement loop counter */
772 k--;
773 }
774
775 /* For the next MAC operations, the pointer py is used without SIMD
776 * So, py is incremented by 1 */
777 py = py + 1U;
778
779 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
780 ** No loop unrolling is used. */
781 k = blockSize3 % 0x4U;
782
783 while (k > 0U)
784 {
785 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
786 sum = __SMLALD(*px++, *py--, sum);
787
788 /* Decrement loop counter */
789 k--;
790 }
791
792 /* Store the result in the accumulator in the destination buffer. */
793 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
794
795 /* Update the inputA and inputB pointers for next MAC calculation */
796 px = ++pSrc1;
797 py = pIn2;
798
799 /* Decrement loop counter */
800 blockSize3--;
801
802 j--;
803 }
804
805 /* The second part of the stage starts here */
806 /* SIMD is not used for the next MAC operations,
807 * so pointer py is updated to read only one sample at a time */
808 py = py + 1U;
809
810 while (blockSize3 > 0U)
811 {
812 /* Accumulator is made zero for every iteration */
813 sum = 0;
814
815 /* Apply loop unrolling and compute 4 MACs simultaneously. */
816 k = blockSize3;
817
818 while (k > 0U)
819 {
820 /* Perform the multiply-accumulates */
821 /* sum += x[srcALen-1] * y[srcBLen-1] */
822 sum = __SMLALD(*px++, *py--, sum);
823
824 /* Decrement loop counter */
825 k--;
826 }
827
828 /* Store the result in the accumulator in the destination buffer. */
829 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
830
831 /* Update the inputA and inputB pointers for next MAC calculation */
832 px = ++pSrc1;
833 py = pSrc2;
834
835 /* Decrement loop counter */
836 blockSize3--;
837 }
838
839 #else /* #if defined (ARM_MATH_DSP) */
840
841 const q15_t *pIn1 = pSrcA; /* InputA pointer */
842 const q15_t *pIn2 = pSrcB; /* InputB pointer */
843 q63_t sum; /* Accumulator */
844 uint32_t i, j; /* Loop counters */
845
846 /* Loop to calculate convolution for output length number of values */
847 for (i = 0; i < (srcALen + srcBLen - 1); i++)
848 {
849 /* Initialize sum with zero to carry on MAC operations */
850 sum = 0;
851
852 /* Loop to perform MAC operations according to convolution equation */
853 for (j = 0U; j <= i; j++)
854 {
855 /* Check the array limitations */
856 if (((i - j) < srcBLen) && (j < srcALen))
857 {
858 /* z[i] += x[i-j] * y[j] */
859 sum += ((q31_t) pIn1[j] * pIn2[i - j]);
860 }
861 }
862
863 /* Store the output in the destination buffer */
864 pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
865 }
866
867 #endif /* #if defined (ARM_MATH_DSP) */
868
869 }
870 #endif /* defined(ARM_MATH_MVEI) */
871
872 /**
873 @} end of Conv group
874 */
875