/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_conv_partial_q7.c
 * Description:  Partial convolution of Q7 sequences
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/filtering_functions.h"

/**
  @ingroup groupFilters
 */

/**
  @addtogroup PartialConv
  @{
 */

40 /**
41 @brief Partial convolution of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @return execution status
50 - \ref ARM_MATH_SUCCESS : Operation successful
51 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52
53 @remark
54 Refer to \ref arm_conv_partial_opt_q7() for a faster implementation of this function.
55 */
56
arm_conv_partial_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,uint32_t firstIndex,uint32_t numPoints)57 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_q7(
58 const q7_t * pSrcA,
59 uint32_t srcALen,
60 const q7_t * pSrcB,
61 uint32_t srcBLen,
62 q7_t * pDst,
63 uint32_t firstIndex,
64 uint32_t numPoints)
65 {
66
67 #if defined(ARM_MATH_DSP)
68
69 const q7_t *pIn1; /* InputA pointer */
70 const q7_t *pIn2; /* InputB pointer */
71 q7_t *pOut = pDst; /* Output pointer */
72 const q7_t *px; /* Intermediate inputA pointer */
73 const q7_t *py; /* Intermediate inputB pointer */
74 const q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
75 q31_t sum; /* Accumulator */
76 uint32_t j, k, count, blkCnt, check; /* Loop counters */
77 int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
78 arm_status status; /* Status of Partial convolution */
79
80 #if defined (ARM_MATH_LOOPUNROLL)
81 q31_t acc0, acc1, acc2, acc3; /* Accumulator */
82 q31_t input1, input2; /* Temporary input variables */
83 q15_t in1, in2; /* Temporary input variables */
84 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
85 #endif
86
87 /* Check for range of output samples to be calculated */
88 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
89 {
90 /* Set status as ARM_MATH_ARGUMENT_ERROR */
91 status = ARM_MATH_ARGUMENT_ERROR;
92 }
93 else
94 {
95 /* The algorithm implementation is based on the lengths of the inputs. */
96 /* srcB is always made to slide across srcA. */
97 /* So srcBLen is always considered as shorter or equal to srcALen */
98 if (srcALen >= srcBLen)
99 {
100 /* Initialization of inputA pointer */
101 pIn1 = pSrcA;
102
103 /* Initialization of inputB pointer */
104 pIn2 = pSrcB;
105 }
106 else
107 {
108 /* Initialization of inputA pointer */
109 pIn1 = pSrcB;
110
111 /* Initialization of inputB pointer */
112 pIn2 = pSrcA;
113
114 /* srcBLen is always considered as shorter or equal to srcALen */
115 j = srcBLen;
116 srcBLen = srcALen;
117 srcALen = j;
118 }
119
120 /* Conditions to check which loopCounter holds
121 * the first and last indices of the output samples to be calculated. */
122 check = firstIndex + numPoints;
123 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
124 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
125 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
126 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
127 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
128 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
129
130 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
131 /* The function is internally
132 * divided into three stages according to the number of multiplications that has to be
133 * taken place between inputA samples and inputB samples. In the first stage of the
134 * algorithm, the multiplications increase by one for every iteration.
135 * In the second stage of the algorithm, srcBLen number of multiplications are done.
136 * In the third stage of the algorithm, the multiplications decrease by one
137 * for every iteration. */
138
139 /* Set the output pointer to point to the firstIndex
140 * of the output sample to be calculated. */
141 pOut = pDst + firstIndex;
142
143 /* --------------------------
144 * Initializations of stage1
145 * -------------------------*/
146
147 /* sum = x[0] * y[0]
148 * sum = x[0] * y[1] + x[1] * y[0]
149 * ....
150 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
151 */
152
153 /* In this stage the MAC operations are increased by 1 for every iteration.
154 The count variable holds the number of MAC operations performed.
155 Since the partial convolution starts from firstIndex
156 Number of Macs to be performed is firstIndex + 1 */
157 count = 1U + firstIndex;
158
159 /* Working pointer of inputA */
160 px = pIn1;
161
162 /* Working pointer of inputB */
163 pSrc2 = pIn2 + firstIndex;
164 py = pSrc2;
165
166 /* ------------------------
167 * Stage1 process
168 * ----------------------*/
169
170 /* The first stage starts here */
171 while (blockSize1 > 0)
172 {
173 /* Accumulator is made zero for every iteration */
174 sum = 0;
175
176 #if defined (ARM_MATH_LOOPUNROLL)
177
178 /* Loop unrolling: Compute 4 outputs at a time */
179 k = count >> 2U;
180
181 while (k > 0U)
182 {
183 /* x[0] , x[1] */
184 in1 = (q15_t) *px++;
185 in2 = (q15_t) *px++;
186 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
187
188 /* y[srcBLen - 1] , y[srcBLen - 2] */
189 in1 = (q15_t) *py--;
190 in2 = (q15_t) *py--;
191 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
192
193 /* x[0] * y[srcBLen - 1] */
194 /* x[1] * y[srcBLen - 2] */
195 sum = __SMLAD(input1, input2, sum);
196
197 /* x[2] , x[3] */
198 in1 = (q15_t) *px++;
199 in2 = (q15_t) *px++;
200 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
201
202 /* y[srcBLen - 3] , y[srcBLen - 4] */
203 in1 = (q15_t) *py--;
204 in2 = (q15_t) *py--;
205 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
206
207 /* x[2] * y[srcBLen - 3] */
208 /* x[3] * y[srcBLen - 4] */
209 sum = __SMLAD(input1, input2, sum);
210
211 /* Decrement loop counter */
212 k--;
213 }
214
215 /* Loop unrolling: Compute remaining outputs */
216 k = count % 0x4U;
217
218 #else
219
220 /* Initialize k with number of samples */
221 k = count;
222
223 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
224
225 while (k > 0U)
226 {
227 /* Perform the multiply-accumulate */
228 sum += ((q31_t) * px++ * *py--);
229
230 /* Decrement loop counter */
231 k--;
232 }
233
234 /* Store the result in the accumulator in the destination buffer. */
235 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
236
237 /* Update the inputA and inputB pointers for next MAC calculation */
238 py = ++pSrc2;
239 px = pIn1;
240
241 /* Increment MAC count */
242 count++;
243
244 /* Decrement loop counter */
245 blockSize1--;
246 }
247
248 /* --------------------------
249 * Initializations of stage2
250 * ------------------------*/
251
252 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
253 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
254 * ....
255 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
256 */
257
258 /* Working pointer of inputA */
259 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
260 {
261 pSrc1 = pIn1 + firstIndex - srcBLen + 1;
262 }
263 else
264 {
265 pSrc1 = pIn1;
266 }
267 px = pSrc1;
268
269 /* Working pointer of inputB */
270 pSrc2 = pIn2 + (srcBLen - 1U);
271 py = pSrc2;
272
273 /* count is the index by which the pointer pIn1 to be incremented */
274 count = 0U;
275
276 /* -------------------
277 * Stage2 process
278 * ------------------*/
279
280 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
281 * So, to loop unroll over blockSize2,
282 * srcBLen should be greater than or equal to 4 */
283 if (srcBLen >= 4U)
284 {
285 #if defined (ARM_MATH_LOOPUNROLL)
286
287 /* Loop unrolling: Compute 4 outputs at a time */
288 blkCnt = ((uint32_t) blockSize2 >> 2U);
289
290 while (blkCnt > 0U)
291 {
292 /* Set all accumulators to zero */
293 acc0 = 0;
294 acc1 = 0;
295 acc2 = 0;
296 acc3 = 0;
297
298 /* read x[0], x[1], x[2] samples */
299 x0 = *px++;
300 x1 = *px++;
301 x2 = *px++;
302
303 /* Apply loop unrolling and compute 4 MACs simultaneously. */
304 k = srcBLen >> 2U;
305
306 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
307 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
308 do
309 {
310 /* Read y[srcBLen - 1] sample */
311 c0 = *py--;
312 /* Read y[srcBLen - 2] sample */
313 c1 = *py--;
314
315 /* Read x[3] sample */
316 x3 = *px++;
317
318 /* x[0] and x[1] are packed */
319 in1 = (q15_t) x0;
320 in2 = (q15_t) x1;
321
322 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
323
324 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
325 in1 = (q15_t) c0;
326 in2 = (q15_t) c1;
327
328 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
329
330 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
331 acc0 = __SMLAD(input1, input2, acc0);
332
333 /* x[1] and x[2] are packed */
334 in1 = (q15_t) x1;
335 in2 = (q15_t) x2;
336
337 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
338
339 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
340 acc1 = __SMLAD(input1, input2, acc1);
341
342 /* x[2] and x[3] are packed */
343 in1 = (q15_t) x2;
344 in2 = (q15_t) x3;
345
346 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
347
348 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
349 acc2 = __SMLAD(input1, input2, acc2);
350
351 /* Read x[4] sample */
352 x0 = *px++;
353
354 /* x[3] and x[4] are packed */
355 in1 = (q15_t) x3;
356 in2 = (q15_t) x0;
357
358 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
359
360 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
361 acc3 = __SMLAD(input1, input2, acc3);
362
363 /* Read y[srcBLen - 3] sample */
364 c0 = *py--;
365 /* Read y[srcBLen - 4] sample */
366 c1 = *py--;
367
368 /* Read x[5] sample */
369 x1 = *px++;
370
371 /* x[2] and x[3] are packed */
372 in1 = (q15_t) x2;
373 in2 = (q15_t) x3;
374
375 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
376
377 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
378 in1 = (q15_t) c0;
379 in2 = (q15_t) c1;
380
381 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
382
383 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
384 acc0 = __SMLAD(input1, input2, acc0);
385
386 /* x[3] and x[4] are packed */
387 in1 = (q15_t) x3;
388 in2 = (q15_t) x0;
389
390 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
391
392 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
393 acc1 = __SMLAD(input1, input2, acc1);
394
395 /* x[4] and x[5] are packed */
396 in1 = (q15_t) x0;
397 in2 = (q15_t) x1;
398
399 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
400
401 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
402 acc2 = __SMLAD(input1, input2, acc2);
403
404 /* Read x[6] sample */
405 x2 = *px++;
406
407 /* x[5] and x[6] are packed */
408 in1 = (q15_t) x1;
409 in2 = (q15_t) x2;
410
411 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
412
413 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
414 acc3 = __SMLAD(input1, input2, acc3);
415
416 } while (--k);
417
418 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
419 ** No loop unrolling is used. */
420 k = srcBLen % 0x4U;
421
422 while (k > 0U)
423 {
424 /* Read y[srcBLen - 5] sample */
425 c0 = *py--;
426 /* Read x[7] sample */
427 x3 = *px++;
428
429 /* Perform the multiply-accumulates */
430 /* acc0 += x[4] * y[srcBLen - 5] */
431 acc0 += ((q31_t) x0 * c0);
432 /* acc1 += x[5] * y[srcBLen - 5] */
433 acc1 += ((q31_t) x1 * c0);
434 /* acc2 += x[6] * y[srcBLen - 5] */
435 acc2 += ((q31_t) x2 * c0);
436 /* acc3 += x[7] * y[srcBLen - 5] */
437 acc3 += ((q31_t) x3 * c0);
438
439 /* Reuse the present samples for the next MAC */
440 x0 = x1;
441 x1 = x2;
442 x2 = x3;
443
444 /* Decrement the loop counter */
445 k--;
446 }
447
448 /* Store the result in the accumulator in the destination buffer. */
449 *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
450 *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
451 *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
452 *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
453
454 /* Increment the pointer pIn1 index, count by 4 */
455 count += 4U;
456
457 /* Update the inputA and inputB pointers for next MAC calculation */
458 px = pSrc1 + count;
459 py = pSrc2;
460
461 /* Decrement loop counter */
462 blkCnt--;
463 }
464
465 /* Loop unrolling: Compute remaining outputs */
466 blkCnt = (uint32_t) blockSize2 % 0x4U;
467
468 #else
469
470 /* Initialize blkCnt with number of samples */
471 blkCnt = blockSize2;
472
473 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
474
475 while (blkCnt > 0U)
476 {
477 /* Accumulator is made zero for every iteration */
478 sum = 0;
479
480 #if defined (ARM_MATH_LOOPUNROLL)
481
482 /* Loop unrolling: Compute 4 outputs at a time */
483 k = srcBLen >> 2U;
484
485 while (k > 0U)
486 {
487 /* Reading two inputs of SrcA buffer and packing */
488 in1 = (q15_t) *px++;
489 in2 = (q15_t) *px++;
490 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
491
492 /* Reading two inputs of SrcB buffer and packing */
493 in1 = (q15_t) *py--;
494 in2 = (q15_t) *py--;
495 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
496
497 /* Perform the multiply-accumulate */
498 sum = __SMLAD(input1, input2, sum);
499
500 /* Reading two inputs of SrcA buffer and packing */
501 in1 = (q15_t) *px++;
502 in2 = (q15_t) *px++;
503 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
504
505 /* Reading two inputs of SrcB buffer and packing */
506 in1 = (q15_t) *py--;
507 in2 = (q15_t) *py--;
508 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
509
510 /* Perform the multiply-accumulate */
511 sum = __SMLAD(input1, input2, sum);
512
513 /* Decrement loop counter */
514 k--;
515 }
516
517 /* Loop unrolling: Compute remaining outputs */
518 k = srcBLen % 0x4U;
519
520 #else
521
522 /* Initialize blkCnt with number of samples */
523 k = srcBLen;
524
525 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
526
527 while (k > 0U)
528 {
529 /* Perform the multiply-accumulate */
530 sum += ((q31_t) * px++ * *py--);
531
532 /* Decrement loop counter */
533 k--;
534 }
535
536 /* Store the result in the accumulator in the destination buffer. */
537 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
538
539 /* Increment the pointer pIn1 index, count by 1 */
540 count++;
541
542 /* Update the inputA and inputB pointers for next MAC calculation */
543 px = pSrc1 + count;
544 py = pSrc2;
545
546 /* Decrement loop counter */
547 blkCnt--;
548 }
549 }
550 else
551 {
552 /* If the srcBLen is not a multiple of 4,
553 * the blockSize2 loop cannot be unrolled by 4 */
554 blkCnt = (uint32_t) blockSize2;
555
556 while (blkCnt > 0U)
557 {
558 /* Accumulator is made zero for every iteration */
559 sum = 0;
560
561 /* srcBLen number of MACS should be performed */
562 k = srcBLen;
563
564 while (k > 0U)
565 {
566 /* Perform the multiply-accumulate */
567 sum += ((q31_t) * px++ * *py--);
568
569 /* Decrement loop counter */
570 k--;
571 }
572
573 /* Store the result in the accumulator in the destination buffer. */
574 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
575
576 /* Increment the MAC count */
577 count++;
578
579 /* Update the inputA and inputB pointers for next MAC calculation */
580 px = pSrc1 + count;
581 py = pSrc2;
582
583 /* Decrement the loop counter */
584 blkCnt--;
585 }
586 }
587
588
589 /* --------------------------
590 * Initializations of stage3
591 * -------------------------*/
592
593 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
594 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
595 * ....
596 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
597 * sum += x[srcALen-1] * y[srcBLen-1]
598 */
599
600 /* In this stage the MAC operations are decreased by 1 for every iteration.
601 The count variable holds the number of MAC operations performed */
602 count = srcBLen - 1U;
603
604 /* Working pointer of inputA */
605 if (firstIndex > srcALen)
606 {
607 pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
608 }
609 else
610 {
611 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
612 }
613 px = pSrc1;
614
615 /* Working pointer of inputB */
616 pSrc2 = pIn2 + (srcBLen - 1U);
617 py = pSrc2;
618
619 /* -------------------
620 * Stage3 process
621 * ------------------*/
622
623 while (blockSize3 > 0)
624 {
625 /* Accumulator is made zero for every iteration */
626 sum = 0;
627
628 #if defined (ARM_MATH_LOOPUNROLL)
629
630 /* Loop unrolling: Compute 4 outputs at a time */
631 k = count >> 2U;
632
633 while (k > 0U)
634 {
635 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
636 in1 = (q15_t) *px++;
637 in2 = (q15_t) *px++;
638 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
639
640 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
641 in1 = (q15_t) *py--;
642 in2 = (q15_t) *py--;
643 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
644
645 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
646 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
647 sum = __SMLAD(input1, input2, sum);
648
649 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
650 in1 = (q15_t) *px++;
651 in2 = (q15_t) *px++;
652 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
653
654 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
655 in1 = (q15_t) *py--;
656 in2 = (q15_t) *py--;
657 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
658
659 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
660 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
661 sum = __SMLAD(input1, input2, sum);
662
663 /* Decrement loop counter */
664 k--;
665 }
666
667 /* Loop unrolling: Compute remaining outputs */
668 k = count % 0x4U;
669
670 #else
671
672 /* Initialize blkCnt with number of samples */
673 k = count;
674
675 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
676
677 while (k > 0U)
678 {
679 /* Perform the multiply-accumulates */
680 /* sum += x[srcALen-1] * y[srcBLen-1] */
681 sum += ((q31_t) * px++ * *py--);
682
683 /* Decrement loop counter */
684 k--;
685 }
686
687 /* Store the result in the accumulator in the destination buffer. */
688 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
689
690 /* Update the inputA and inputB pointers for next MAC calculation */
691 px = ++pSrc1;
692 py = pSrc2;
693
694 /* Decrement MAC count */
695 count--;
696
697 /* Decrement the loop counter */
698 blockSize3--;
699 }
700
701 /* Set status as ARM_MATH_SUCCESS */
702 status = ARM_MATH_SUCCESS;
703 }
704
705 /* Return to application */
706 return (status);
707
708 #else
709 /* alternate version for CM0_FAMILY */
710
711 const q7_t *pIn1 = pSrcA; /* InputA pointer */
712 const q7_t *pIn2 = pSrcB; /* InputB pointer */
713 q31_t sum; /* Accumulator */
714 uint32_t i, j; /* Loop counters */
715 arm_status status; /* Status of Partial convolution */
716
717 /* Check for range of output samples to be calculated */
718 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
719 {
720 /* Set status as ARM_MATH_ARGUMENT_ERROR */
721 status = ARM_MATH_ARGUMENT_ERROR;
722 }
723 else
724 {
725 /* Loop to calculate convolution for output length number of values */
726 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
727 {
728 /* Initialize sum with zero to carry on MAC operations */
729 sum = 0;
730
731 /* Loop to perform MAC operations according to convolution equation */
732 for (j = 0U; j <= i; j++)
733 {
734 /* Check the array limitations */
735 if (((i - j) < srcBLen) && (j < srcALen))
736 {
737 /* z[i] += x[i-j] * y[j] */
738 sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
739 }
740 }
741
742 /* Store the output in the destination buffer */
743 pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
744 }
745
746 /* Set status as ARM_MATH_SUCCESS */
747 status = ARM_MATH_SUCCESS;
748 }
749
750 /* Return to application */
751 return (status);
752
753 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
754
755 }

/**
  @} end of PartialConv group
 */
