1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_q7.c
4 * Description: Convolution of Q7 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using a 32-bit internal accumulator.
50 Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
51 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
52 This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
53 The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
54 @remark
55 Refer to \ref arm_conv_opt_q7() for a faster implementation of this function.
56 */
57
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59 #include "arm_helium_utils.h"
60
61 #include "arm_vec_filtering.h"
62
arm_conv_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst)63 ARM_DSP_ATTRIBUTE void arm_conv_q7(
64 const q7_t * pSrcA,
65 uint32_t srcALen,
66 const q7_t * pSrcB,
67 uint32_t srcBLen,
68 q7_t * pDst)
69 {
70 const q7_t *pIn1 = pSrcA; /* inputA pointer */
71 const q7_t *pIn2 = pSrcB; /* inputB pointer */
72 /*
73 * Loop to perform MAC operations according to correlation equation
74 */
75 const q7_t *pX;
76 const q7_t *pY;
77 const q7_t *pA;
78 const q7_t *pB;
79 int32_t i = 0U, j = 0; /* loop counters */
80 int32_t block1, block2, block3;
81 uint8_t vddupStartIdx = 15;
82 uint8x16_t decrIdxVec = vddupq_u8(vddupStartIdx, 1);
83
84 if (srcALen < srcBLen)
85 {
86 /*
87 * Initialization to inputB pointer
88 */
89 pIn1 = pSrcB;
90 /*
91 * Initialization to the end of inputA pointer
92 */
93 pIn2 = pSrcA;
94 /*
95 * Swapping the lengths
96 */
97 j = srcALen;
98 srcALen = srcBLen;
99 srcBLen = j;
100 }
101
102 block1 = srcBLen - 1;
103 block2 = srcALen - srcBLen + 1;
104 block3 = srcBLen - 1;
105
106 pA = pIn1;
107 pB = pIn2 - 15;
108
109 for (i = 0; i <= block1 - 2; i += 2)
110 {
111 uint32_t count = i + 1;
112 int32_t acc0 = 0;
113 int32_t acc1 = 0;
114
115 pX = pA;
116 pY = pB;
117
118 MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count);
119 *pDst++ = (q7_t) acc0;
120 *pDst++ = (q7_t) acc1;
121 pB += 2;
122 }
123 for (; i < block1; i++)
124 {
125 uint32_t count = i + 1;
126 int32_t acc = 0;
127
128 pX = pA;
129 pY = pB;
130
131 MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
132 *pDst++ = (q7_t) acc;
133 pB++;
134 }
135
136 for (i = 0; i <= block2 - 4; i += 4)
137 {
138 uint32_t count = srcBLen;
139 int32_t acc0 = 0;
140 int32_t acc1 = 0;
141 int32_t acc2 = 0;
142 int32_t acc3 = 0;
143
144 pX = pA;
145 pY = pB;
146 /*
147 * compute 4 accumulators per loop
148 * size is fixed for all accumulators
149 * X pointer is incrementing for successive accumulators
150 */
151 MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count);
152 *pDst++ = (q7_t) acc0;
153 *pDst++ = (q7_t) acc1;
154 *pDst++ = (q7_t) acc2;
155 *pDst++ = (q7_t) acc3;
156 pA += 4;
157 }
158 for (; i <= block2 - 2; i += 2)
159 {
160 uint32_t count = srcBLen;
161 int32_t acc0 = 0;
162 int32_t acc1 = 0;
163
164 pX = pA;
165 pY = pB;
166 /*
167 * compute 2 accumulators per loop
168 * size is fixed for all accumulators
169 * X pointer is incrementing for successive accumulators
170 */
171 MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count);
172 *pDst++ = (q7_t) acc0;
173 *pDst++ = (q7_t) acc1;
174 pA += 2;
175 }
176 if (block2 & 1)
177 {
178 uint32_t count = srcBLen;
179 int32_t acc = 0;
180
181 pX = pA;
182 pY = pB;
183
184 MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
185 *pDst++ = (q7_t) acc;
186 pA++;
187 }
188
189 for (i = block3; i >= 2; i -= 2)
190 {
191 uint32_t count = i;
192 int32_t acc0 = 0;
193 int32_t acc1 = 0;
194
195 pX = pA;
196 pY = pB;
197
198 MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count);
199 *pDst++ = (q7_t) acc0;
200 *pDst++ = (q7_t) acc1;
201 pA += 2;
202 }
203 for (; i > 0; i--)
204 {
205 uint32_t count = i;
206 int32_t acc = 0;
207
208 pX = pA;
209 pY = pB;
210
211 MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
212 *pDst++ = (q7_t) acc;
213 pA++;
214 }
215
216 }
217
218 #else
arm_conv_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst)219 ARM_DSP_ATTRIBUTE void arm_conv_q7(
220 const q7_t * pSrcA,
221 uint32_t srcALen,
222 const q7_t * pSrcB,
223 uint32_t srcBLen,
224 q7_t * pDst)
225 {
226
227 #if (1)
228 //#if !defined(ARM_MATH_CM0_FAMILY)
229
230 const q7_t *pIn1; /* InputA pointer */
231 const q7_t *pIn2; /* InputB pointer */
232 q7_t *pOut = pDst; /* Output pointer */
233 const q7_t *px; /* Intermediate inputA pointer */
234 const q7_t *py; /* Intermediate inputB pointer */
235 const q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
236 q31_t sum; /* Accumulators */
237 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
238 uint32_t j, k, count, blkCnt; /* Loop counters */
239
240 #if defined (ARM_MATH_LOOPUNROLL)
241 q31_t acc0, acc1, acc2, acc3; /* Accumulators */
242 q31_t input1, input2; /* Temporary input variables */
243 q15_t in1, in2; /* Temporary input variables */
244 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
245 #endif
246
247 /* The algorithm implementation is based on the lengths of the inputs. */
248 /* srcB is always made to slide across srcA. */
249 /* So srcBLen is always considered as shorter or equal to srcALen */
250 if (srcALen >= srcBLen)
251 {
252 /* Initialization of inputA pointer */
253 pIn1 = pSrcA;
254
255 /* Initialization of inputB pointer */
256 pIn2 = pSrcB;
257 }
258 else
259 {
260 /* Initialization of inputA pointer */
261 pIn1 = pSrcB;
262
263 /* Initialization of inputB pointer */
264 pIn2 = pSrcA;
265
266 /* srcBLen is always considered as shorter or equal to srcALen */
267 j = srcBLen;
268 srcBLen = srcALen;
269 srcALen = j;
270 }
271
272 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
273 /* The function is internally
274 * divided into three stages according to the number of multiplications that has to be
275 * taken place between inputA samples and inputB samples. In the first stage of the
276 * algorithm, the multiplications increase by one for every iteration.
277 * In the second stage of the algorithm, srcBLen number of multiplications are done.
278 * In the third stage of the algorithm, the multiplications decrease by one
279 * for every iteration. */
280
281 /* The algorithm is implemented in three stages.
282 The loop counters of each stage is initiated here. */
283 blockSize1 = srcBLen - 1U;
284 blockSize2 = srcALen - (srcBLen - 1U);
285 blockSize3 = blockSize1;
286
287 /* --------------------------
288 * Initializations of stage1
289 * -------------------------*/
290
291 /* sum = x[0] * y[0]
292 * sum = x[0] * y[1] + x[1] * y[0]
293 * ....
294 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
295 */
296
297 /* In this stage the MAC operations are increased by 1 for every iteration.
298 The count variable holds the number of MAC operations performed */
299 count = 1U;
300
301 /* Working pointer of inputA */
302 px = pIn1;
303
304 /* Working pointer of inputB */
305 py = pIn2;
306
307
308 /* ------------------------
309 * Stage1 process
310 * ----------------------*/
311
312 /* The first stage starts here */
313 while (blockSize1 > 0U)
314 {
315 /* Accumulator is made zero for every iteration */
316 sum = 0;
317
318 #if defined (ARM_MATH_LOOPUNROLL)
319
320 /* Loop unrolling: Compute 4 outputs at a time */
321 k = count >> 2U;
322
323 while (k > 0U)
324 {
325 /* x[0] , x[1] */
326 in1 = (q15_t) *px++;
327 in2 = (q15_t) *px++;
328 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
329
330 /* y[srcBLen - 1] , y[srcBLen - 2] */
331 in1 = (q15_t) *py--;
332 in2 = (q15_t) *py--;
333 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
334
335 /* x[0] * y[srcBLen - 1] */
336 /* x[1] * y[srcBLen - 2] */
337 sum = __SMLAD(input1, input2, sum);
338
339 /* x[2] , x[3] */
340 in1 = (q15_t) *px++;
341 in2 = (q15_t) *px++;
342 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
343
344 /* y[srcBLen - 3] , y[srcBLen - 4] */
345 in1 = (q15_t) *py--;
346 in2 = (q15_t) *py--;
347 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
348
349 /* x[2] * y[srcBLen - 3] */
350 /* x[3] * y[srcBLen - 4] */
351 sum = __SMLAD(input1, input2, sum);
352
353 /* Decrement loop counter */
354 k--;
355 }
356
357 /* Loop unrolling: Compute remaining outputs */
358 k = count % 0x4U;
359
360 #else
361
362 /* Initialize k with number of samples */
363 k = count;
364
365 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
366
367 while (k > 0U)
368 {
369 /* Perform the multiply-accumulate */
370 sum += ((q15_t) *px++ * *py--);
371
372 /* Decrement loop counter */
373 k--;
374 }
375
376 /* Store the result in the accumulator in the destination buffer. */
377 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
378
379 /* Update the inputA and inputB pointers for next MAC calculation */
380 py = pIn2 + count;
381 px = pIn1;
382
383 /* Increment MAC count */
384 count++;
385
386 /* Decrement loop counter */
387 blockSize1--;
388 }
389
390 /* --------------------------
391 * Initializations of stage2
392 * ------------------------*/
393
394 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
395 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
396 * ....
397 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
398 */
399
400 /* Working pointer of inputA */
401 px = pIn1;
402
403 /* Working pointer of inputB */
404 pSrc2 = pIn2 + (srcBLen - 1U);
405 py = pSrc2;
406
407 /* count is index by which the pointer pIn1 to be incremented */
408 count = 0U;
409
410 /* -------------------
411 * Stage2 process
412 * ------------------*/
413
414 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
415 * So, to loop unroll over blockSize2,
416 * srcBLen should be greater than or equal to 4 */
417 if (srcBLen >= 4U)
418 {
419 #if defined (ARM_MATH_LOOPUNROLL)
420
421 /* Loop unrolling: Compute 4 outputs at a time */
422 blkCnt = blockSize2 >> 2U;
423
424 while (blkCnt > 0U)
425 {
426 /* Set all accumulators to zero */
427 acc0 = 0;
428 acc1 = 0;
429 acc2 = 0;
430 acc3 = 0;
431
432 /* read x[0], x[1], x[2] samples */
433 x0 = *px++;
434 x1 = *px++;
435 x2 = *px++;
436
437 /* Apply loop unrolling and compute 4 MACs simultaneously. */
438 k = srcBLen >> 2U;
439
440 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
441 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
442 do
443 {
444 /* Read y[srcBLen - 1] sample */
445 c0 = *py--;
446 /* Read y[srcBLen - 2] sample */
447 c1 = *py--;
448
449 /* Read x[3] sample */
450 x3 = *px++;
451
452 /* x[0] and x[1] are packed */
453 in1 = (q15_t) x0;
454 in2 = (q15_t) x1;
455
456 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
457
458 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
459 in1 = (q15_t) c0;
460 in2 = (q15_t) c1;
461
462 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
463
464 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
465 acc0 = __SMLAD(input1, input2, acc0);
466
467 /* x[1] and x[2] are packed */
468 in1 = (q15_t) x1;
469 in2 = (q15_t) x2;
470
471 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
472
473 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
474 acc1 = __SMLAD(input1, input2, acc1);
475
476 /* x[2] and x[3] are packed */
477 in1 = (q15_t) x2;
478 in2 = (q15_t) x3;
479
480 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
481
482 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
483 acc2 = __SMLAD(input1, input2, acc2);
484
485 /* Read x[4] sample */
486 x0 = *px++;
487
488 /* x[3] and x[4] are packed */
489 in1 = (q15_t) x3;
490 in2 = (q15_t) x0;
491
492 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
493
494 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
495 acc3 = __SMLAD(input1, input2, acc3);
496
497 /* Read y[srcBLen - 3] sample */
498 c0 = *py--;
499 /* Read y[srcBLen - 4] sample */
500 c1 = *py--;
501
502 /* Read x[5] sample */
503 x1 = *px++;
504
505 /* x[2] and x[3] are packed */
506 in1 = (q15_t) x2;
507 in2 = (q15_t) x3;
508
509 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
510
511 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
512 in1 = (q15_t) c0;
513 in2 = (q15_t) c1;
514
515 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
516
517 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
518 acc0 = __SMLAD(input1, input2, acc0);
519
520 /* x[3] and x[4] are packed */
521 in1 = (q15_t) x3;
522 in2 = (q15_t) x0;
523
524 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
525
526 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
527 acc1 = __SMLAD(input1, input2, acc1);
528
529 /* x[4] and x[5] are packed */
530 in1 = (q15_t) x0;
531 in2 = (q15_t) x1;
532
533 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
534
535 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
536 acc2 = __SMLAD(input1, input2, acc2);
537
538 /* Read x[6] sample */
539 x2 = *px++;
540
541 /* x[5] and x[6] are packed */
542 in1 = (q15_t) x1;
543 in2 = (q15_t) x2;
544
545 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
546
547 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
548 acc3 = __SMLAD(input1, input2, acc3);
549
550 } while (--k);
551
552 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
553 ** No loop unrolling is used. */
554 k = srcBLen % 0x4U;
555
556 while (k > 0U)
557 {
558 /* Read y[srcBLen - 5] sample */
559 c0 = *py--;
560 /* Read x[7] sample */
561 x3 = *px++;
562
563 /* Perform the multiply-accumulates */
564 /* acc0 += x[4] * y[srcBLen - 5] */
565 acc0 += ((q15_t) x0 * c0);
566 /* acc1 += x[5] * y[srcBLen - 5] */
567 acc1 += ((q15_t) x1 * c0);
568 /* acc2 += x[6] * y[srcBLen - 5] */
569 acc2 += ((q15_t) x2 * c0);
570 /* acc3 += x[7] * y[srcBLen - 5] */
571 acc3 += ((q15_t) x3 * c0);
572
573 /* Reuse the present samples for the next MAC */
574 x0 = x1;
575 x1 = x2;
576 x2 = x3;
577
578 /* Decrement loop counter */
579 k--;
580 }
581
582 /* Store the result in the accumulator in the destination buffer. */
583 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
584 *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
585 *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
586 *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
587
588 /* Increment the pointer pIn1 index, count by 4 */
589 count += 4U;
590
591 /* Update the inputA and inputB pointers for next MAC calculation */
592 px = pIn1 + count;
593 py = pSrc2;
594
595 /* Decrement loop counter */
596 blkCnt--;
597 }
598
599 /* Loop unrolling: Compute remaining outputs */
600 blkCnt = blockSize2 % 0x4U;
601
602 #else
603
604 /* Initialize blkCnt with number of samples */
605 blkCnt = blockSize2;
606
607 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
608
609 while (blkCnt > 0U)
610 {
611 /* Accumulator is made zero for every iteration */
612 sum = 0;
613
614 #if defined (ARM_MATH_LOOPUNROLL)
615
616 /* Loop unrolling: Compute 4 outputs at a time */
617 k = srcBLen >> 2U;
618
619 while (k > 0U)
620 {
621
622 /* Reading two inputs of SrcA buffer and packing */
623 in1 = (q15_t) *px++;
624 in2 = (q15_t) *px++;
625 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
626
627 /* Reading two inputs of SrcB buffer and packing */
628 in1 = (q15_t) *py--;
629 in2 = (q15_t) *py--;
630 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
631
632 /* Perform the multiply-accumulate */
633 sum = __SMLAD(input1, input2, sum);
634
635 /* Reading two inputs of SrcA buffer and packing */
636 in1 = (q15_t) *px++;
637 in2 = (q15_t) *px++;
638 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
639
640 /* Reading two inputs of SrcB buffer and packing */
641 in1 = (q15_t) *py--;
642 in2 = (q15_t) *py--;
643 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
644
645 /* Perform the multiply-accumulate */
646 sum = __SMLAD(input1, input2, sum);
647
648 /* Decrement loop counter */
649 k--;
650 }
651
652 /* Loop unrolling: Compute remaining outputs */
653 k = srcBLen % 0x4U;
654
655 #else
656
657 /* Initialize blkCnt with number of samples */
658 k = srcBLen;
659
660 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
661
662 while (k > 0U)
663 {
664 /* Perform the multiply-accumulate */
665 sum += ((q15_t) *px++ * *py--);
666
667 /* Decrement the loop counter */
668 k--;
669 }
670
671 /* Store the result in the accumulator in the destination buffer. */
672 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
673
674 /* Increment the pointer pIn1 index, count by 1 */
675 count++;
676
677 /* Update the inputA and inputB pointers for next MAC calculation */
678 px = pIn1 + count;
679 py = pSrc2;
680
681 /* Decrement the loop counter */
682 blkCnt--;
683 }
684 }
685 else
686 {
687 /* If the srcBLen is not a multiple of 4,
688 * the blockSize2 loop cannot be unrolled by 4 */
689 blkCnt = blockSize2;
690
691 while (blkCnt > 0U)
692 {
693 /* Accumulator is made zero for every iteration */
694 sum = 0;
695
696 /* srcBLen number of MACS should be performed */
697 k = srcBLen;
698
699 while (k > 0U)
700 {
701 /* Perform the multiply-accumulate */
702 sum += ((q15_t) *px++ * *py--);
703
704 /* Decrement the loop counter */
705 k--;
706 }
707
708 /* Store the result in the accumulator in the destination buffer. */
709 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
710
711 /* Increment the MAC count */
712 count++;
713
714 /* Update the inputA and inputB pointers for next MAC calculation */
715 px = pIn1 + count;
716 py = pSrc2;
717
718 /* Decrement loop counter */
719 blkCnt--;
720 }
721 }
722
723
724 /* --------------------------
725 * Initializations of stage3
726 * -------------------------*/
727
728 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
729 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
730 * ....
731 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
732 * sum += x[srcALen-1] * y[srcBLen-1]
733 */
734
735 /* In this stage the MAC operations are decreased by 1 for every iteration.
736 The blockSize3 variable holds the number of MAC operations performed */
737
738 /* Working pointer of inputA */
739 pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
740 px = pSrc1;
741
742 /* Working pointer of inputB */
743 pSrc2 = pIn2 + (srcBLen - 1U);
744 py = pSrc2;
745
746 /* -------------------
747 * Stage3 process
748 * ------------------*/
749
750 while (blockSize3 > 0U)
751 {
752 /* Accumulator is made zero for every iteration */
753 sum = 0;
754
755 #if defined (ARM_MATH_LOOPUNROLL)
756
757 /* Loop unrolling: Compute 4 outputs at a time */
758 k = blockSize3 >> 2U;
759
760 while (k > 0U)
761 {
762 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
763 in1 = (q15_t) *px++;
764 in2 = (q15_t) *px++;
765 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
766
767 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
768 in1 = (q15_t) *py--;
769 in2 = (q15_t) *py--;
770 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
771
772 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
773 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
774 sum = __SMLAD(input1, input2, sum);
775
776 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
777 in1 = (q15_t) *px++;
778 in2 = (q15_t) *px++;
779 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
780
781 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
782 in1 = (q15_t) *py--;
783 in2 = (q15_t) *py--;
784 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
785
786 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
787 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
788 sum = __SMLAD(input1, input2, sum);
789
790 /* Decrement loop counter */
791 k--;
792 }
793
794 /* Loop unrolling: Compute remaining outputs */
795 k = blockSize3 % 0x4U;
796
797 #else
798
799 /* Initialize blkCnt with number of samples */
800 k = blockSize3;
801
802 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
803
804 while (k > 0U)
805 {
806 /* Perform the multiply-accumulate */
807 /* sum += x[srcALen-1] * y[srcBLen-1] */
808 sum += ((q15_t) *px++ * *py--);
809
810 /* Decrement loop counter */
811 k--;
812 }
813
814 /* Store the result in the accumulator in the destination buffer. */
815 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
816
817 /* Update the inputA and inputB pointers for next MAC calculation */
818 px = ++pSrc1;
819 py = pSrc2;
820
821 /* Decrement loop counter */
822 blockSize3--;
823 }
824
825 #else
826 /* alternate version for CM0_FAMILY */
827
828 const q7_t *pIn1 = pSrcA; /* InputA pointer */
829 const q7_t *pIn2 = pSrcB; /* InputB pointer */
830 q31_t sum; /* Accumulator */
831 uint32_t i, j; /* Loop counters */
832
833 /* Loop to calculate convolution for output length number of times */
834 for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
835 {
836 /* Initialize sum with zero to carry out MAC operations */
837 sum = 0;
838
839 /* Loop to perform MAC operations according to convolution equation */
840 for (j = 0U; j <= i; j++)
841 {
842 /* Check the array limitations */
843 if (((i - j) < srcBLen) && (j < srcALen))
844 {
845 /* z[i] += x[i-j] * y[j] */
846 sum += ((q15_t) pIn1[j] * pIn2[i - j]);
847 }
848 }
849
850 /* Store the output in the destination buffer */
851 pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
852 }
853
854 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
855
856 }
857 #endif /* defined(ARM_MATH_MVEI) */
858
859 /**
860 @} end of Conv group
861 */
862