1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_q15.c
4 * Description: Partial convolution of Q15 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup PartialConv
37 @{
38 */
39
40 /**
41 @brief Partial convolution of Q15 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @return execution status
50 - \ref ARM_MATH_SUCCESS : Operation successful
51 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52
53 @remark
54 Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
55 @remark
56 Refer to \ref arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers.
57 */
58
arm_conv_partial_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints)59 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_q15(
60 const q15_t * pSrcA,
61 uint32_t srcALen,
62 const q15_t * pSrcB,
63 uint32_t srcBLen,
64 q15_t * pDst,
65 uint32_t firstIndex,
66 uint32_t numPoints)
67 {
68
69 #if defined (ARM_MATH_DSP)
70
71 const q15_t *pIn1; /* InputA pointer */
72 const q15_t *pIn2; /* InputB pointer */
73 q15_t *pOut = pDst; /* Output pointer */
74 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
75 const q15_t *px; /* Intermediate inputA pointer */
76 const q15_t *py; /* Intermediate inputB pointer */
77 const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
78 q31_t x0, x1, x2, x3, c0; /* Temporary input variables to hold state and coefficient values */
79 int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
80 uint32_t j, k, count, blkCnt, check;
81 arm_status status; /* Status of Partial convolution */
82
83 /* Check for range of output samples to be calculated */
84 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
85 {
86 /* Set status as ARM_MATH_ARGUMENT_ERROR */
87 status = ARM_MATH_ARGUMENT_ERROR;
88 }
89 else
90 {
91 /* The algorithm implementation is based on the lengths of the inputs. */
92 /* srcB is always made to slide across srcA. */
93 /* So srcBLen is always considered as shorter or equal to srcALen */
94 if (srcALen >= srcBLen)
95 {
96 /* Initialization of inputA pointer */
97 pIn1 = pSrcA;
98
99 /* Initialization of inputB pointer */
100 pIn2 = pSrcB;
101 }
102 else
103 {
104 /* Initialization of inputA pointer */
105 pIn1 = pSrcB;
106
107 /* Initialization of inputB pointer */
108 pIn2 = pSrcA;
109
110 /* srcBLen is always considered as shorter or equal to srcALen */
111 j = srcBLen;
112 srcBLen = srcALen;
113 srcALen = j;
114 }
115
116 /* Conditions to check which loopCounter holds
117 * the first and last indices of the output samples to be calculated. */
118 check = firstIndex + numPoints;
119 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
120 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
121 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
122 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
123 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
124 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
125
126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
127 /* The function is internally
128 * divided into three stages according to the number of multiplications that has to be
129 * taken place between inputA samples and inputB samples. In the first stage of the
130 * algorithm, the multiplications increase by one for every iteration.
131 * In the second stage of the algorithm, srcBLen number of multiplications are done.
132 * In the third stage of the algorithm, the multiplications decrease by one
133 * for every iteration. */
134
135 /* Set the output pointer to point to the firstIndex
136 * of the output sample to be calculated. */
137 pOut = pDst + firstIndex;
138
139 /* --------------------------
140 * Initializations of stage1
141 * -------------------------*/
142
143 /* sum = x[0] * y[0]
144 * sum = x[0] * y[1] + x[1] * y[0]
145 * ....
146 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
147 */
148
149 /* In this stage the MAC operations are increased by 1 for every iteration.
150 The count variable holds the number of MAC operations performed.
151 Since the partial convolution starts from firstIndex
152 Number of Macs to be performed is firstIndex + 1 */
153 count = 1U + firstIndex;
154
155 /* Working pointer of inputA */
156 px = pIn1;
157
158 /* Working pointer of inputB */
159 pSrc2 = pIn2 + firstIndex;
160 py = pSrc2;
161
162 /* ------------------------
163 * Stage1 process
164 * ----------------------*/
165
166 /* For loop unrolling by 4, this stage is divided into two. */
167 /* First part of this stage computes the MAC operations less than 4 */
168 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
169
170 /* The first part of the stage starts here */
171 while ((count < 4U) && (blockSize1 > 0))
172 {
173 /* Accumulator is made zero for every iteration */
174 sum = 0;
175
176 /* Loop over number of MAC operations between
177 * inputA samples and inputB samples */
178 k = count;
179
180 while (k > 0U)
181 {
182 /* Perform the multiply-accumulates */
183 sum = __SMLALD(*px++, *py--, sum);
184
185 /* Decrement loop counter */
186 k--;
187 }
188
189 /* Store the result in the accumulator in the destination buffer. */
190 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
191
192 /* Update the inputA and inputB pointers for next MAC calculation */
193 py = ++pSrc2;
194 px = pIn1;
195
196 /* Increment MAC count */
197 count++;
198
199 /* Decrement loop counter */
200 blockSize1--;
201 }
202
203 /* The second part of the stage starts here */
204 /* The internal loop, over count, is unrolled by 4 */
205 /* To, read the last two inputB samples using SIMD:
206 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
207 py = py - 1;
208
209 while (blockSize1 > 0)
210 {
211 /* Accumulator is made zero for every iteration */
212 sum = 0;
213
214 /* Apply loop unrolling and compute 4 MACs simultaneously. */
215 k = count >> 2U;
216
217 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
218 a second loop below computes MACs for the remaining 1 to 3 samples. */
219 while (k > 0U)
220 {
221 /* Perform the multiply-accumulate */
222 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
223 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
224 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
225 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
226
227 /* Decrement loop counter */
228 k--;
229 }
230
231 /* For the next MAC operations, the pointer py is used without SIMD
232 * So, py is incremented by 1 */
233 py = py + 1U;
234
235 /* If the count is not a multiple of 4, compute any remaining MACs here.
236 No loop unrolling is used. */
237 k = count % 0x4U;
238
239 while (k > 0U)
240 {
241 /* Perform the multiply-accumulates */
242 sum = __SMLALD(*px++, *py--, sum);
243
244 /* Decrement loop counter */
245 k--;
246 }
247
248 /* Store the result in the accumulator in the destination buffer. */
249 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
250
251 /* Update the inputA and inputB pointers for next MAC calculation */
252 py = ++pSrc2 - 1U;
253 px = pIn1;
254
255 /* Increment MAC count */
256 count++;
257
258 /* Decrement loop counter */
259 blockSize1--;
260 }
261
262 /* --------------------------
263 * Initializations of stage2
264 * ------------------------*/
265
266 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
267 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
268 * ....
269 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
270 */
271
272 /* Working pointer of inputA */
273 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
274 {
275 pSrc1 = pIn1 + firstIndex - srcBLen + 1;
276 }
277 else
278 {
279 pSrc1 = pIn1;
280 }
281 px = pSrc1;
282
283 /* Working pointer of inputB */
284 pSrc2 = pIn2 + (srcBLen - 1U);
285 py = pSrc2;
286
287 /* count is the index by which the pointer pIn1 to be incremented */
288 count = 0U;
289
290 /* -------------------
291 * Stage2 process
292 * ------------------*/
293
294 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
295 * So, to loop unroll over blockSize2,
296 * srcBLen should be greater than or equal to 4 */
297 if (srcBLen >= 4U)
298 {
299 /* Loop unrolling: Compute 4 outputs at a time */
300 blkCnt = ((uint32_t) blockSize2 >> 2U);
301
302 while (blkCnt > 0U)
303 {
304 py = py - 1U;
305
306 /* Set all accumulators to zero */
307 acc0 = 0;
308 acc1 = 0;
309 acc2 = 0;
310 acc3 = 0;
311
312
313 /* read x[0], x[1] samples */
314 x0 = read_q15x2 ((q15_t *) px);
315 /* read x[1], x[2] samples */
316 x1 = read_q15x2 ((q15_t *) px + 1);
317 px += 2U;
318
319
320 /* Apply loop unrolling and compute 4 MACs simultaneously. */
321 k = srcBLen >> 2U;
322
323 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
324 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
325 do
326 {
327 /* Read the last two inputB samples using SIMD:
328 * y[srcBLen - 1] and y[srcBLen - 2] */
329 c0 = read_q15x2_da ((q15_t **) &py);
330
331 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
332 acc0 = __SMLALDX(x0, c0, acc0);
333
334 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
335 acc1 = __SMLALDX(x1, c0, acc1);
336
337 /* Read x[2], x[3] */
338 x2 = read_q15x2 ((q15_t *) px);
339
340 /* Read x[3], x[4] */
341 x3 = read_q15x2 ((q15_t *) px + 1);
342
343 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
344 acc2 = __SMLALDX(x2, c0, acc2);
345
346 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
347 acc3 = __SMLALDX(x3, c0, acc3);
348
349 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
350 c0 = read_q15x2_da ((q15_t **) &py);
351
352 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
353 acc0 = __SMLALDX(x2, c0, acc0);
354
355 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
356 acc1 = __SMLALDX(x3, c0, acc1);
357
358 /* Read x[4], x[5] */
359 x0 = read_q15x2 ((q15_t *) px + 2);
360
361 /* Read x[5], x[6] */
362 x1 = read_q15x2 ((q15_t *) px + 3);
363 px += 4U;
364
365 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
366 acc2 = __SMLALDX(x0, c0, acc2);
367
368 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
369 acc3 = __SMLALDX(x1, c0, acc3);
370
371 } while (--k);
372
373 /* For the next MAC operations, SIMD is not used
374 * So, the 16 bit pointer if inputB, py is updated */
375
376 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
377 ** No loop unrolling is used. */
378 k = srcBLen % 0x4U;
379
380 if (k == 1U)
381 {
382 /* Read y[srcBLen - 5] */
383 c0 = *(py+1);
384 #ifdef ARM_MATH_BIG_ENDIAN
385 c0 = c0 << 16U;
386 #else
387 c0 = c0 & 0x0000FFFF;
388 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
389
390 /* Read x[7] */
391 x3 = read_q15x2 ((q15_t *) px);
392 px++;
393
394 /* Perform the multiply-accumulate */
395 acc0 = __SMLALD (x0, c0, acc0);
396 acc1 = __SMLALD (x1, c0, acc1);
397 acc2 = __SMLALDX(x1, c0, acc2);
398 acc3 = __SMLALDX(x3, c0, acc3);
399 }
400
401 if (k == 2U)
402 {
403 /* Read y[srcBLen - 5], y[srcBLen - 6] */
404 c0 = read_q15x2 ((q15_t *) py);
405
406 /* Read x[7], x[8] */
407 x3 = read_q15x2 ((q15_t *) px);
408
409 /* Read x[9] */
410 x2 = read_q15x2 ((q15_t *) px + 1);
411 px += 2U;
412
413 /* Perform the multiply-accumulate */
414 acc0 = __SMLALDX(x0, c0, acc0);
415 acc1 = __SMLALDX(x1, c0, acc1);
416 acc2 = __SMLALDX(x3, c0, acc2);
417 acc3 = __SMLALDX(x2, c0, acc3);
418 }
419
420 if (k == 3U)
421 {
422 /* Read y[srcBLen - 5], y[srcBLen - 6] */
423 c0 = read_q15x2 ((q15_t *) py);
424
425 /* Read x[7], x[8] */
426 x3 = read_q15x2 ((q15_t *) px);
427
428 /* Read x[9] */
429 x2 = read_q15x2 ((q15_t *) px + 1);
430
431 /* Perform the multiply-accumulate */
432 acc0 = __SMLALDX(x0, c0, acc0);
433 acc1 = __SMLALDX(x1, c0, acc1);
434 acc2 = __SMLALDX(x3, c0, acc2);
435 acc3 = __SMLALDX(x2, c0, acc3);
436
437 c0 = *(py-1);
438 #ifdef ARM_MATH_BIG_ENDIAN
439 c0 = c0 << 16U;
440 #else
441 c0 = c0 & 0x0000FFFF;
442 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
443
444 /* Read x[10] */
445 x3 = read_q15x2 ((q15_t *) px + 2);
446 px += 3U;
447
448 /* Perform the multiply-accumulates */
449 acc0 = __SMLALDX(x1, c0, acc0);
450 acc1 = __SMLALD (x2, c0, acc1);
451 acc2 = __SMLALDX(x2, c0, acc2);
452 acc3 = __SMLALDX(x3, c0, acc3);
453 }
454
455 /* Store the results in the accumulators in the destination buffer. */
456 {
457 int32_t sat0 = __SSAT((acc0 >> 15), 16);
458 int32_t sat1 = __SSAT((acc1 >> 15), 16);
459 int32_t sat2 = __SSAT((acc2 >> 15), 16);
460 int32_t sat3 = __SSAT((acc3 >> 15), 16);
461 #ifndef ARM_MATH_BIG_ENDIAN
462 write_q15x2_ia (&pOut, __PKHBT(sat0, sat1, 16));
463 write_q15x2_ia (&pOut, __PKHBT(sat2, sat3, 16));
464 #else
465 write_q15x2_ia (&pOut, __PKHBT(sat1, sat0, 16));
466 write_q15x2_ia (&pOut, __PKHBT(sat3, sat2, 16));
467 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
468 }
469
470 /* Increment the pointer pIn1 index, count by 4 */
471 count += 4U;
472
473 /* Update the inputA and inputB pointers for next MAC calculation */
474 px = pSrc1 + count;
475 py = pSrc2;
476
477 /* Decrement loop counter */
478 blkCnt--;
479 }
480
481 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
482 No loop unrolling is used. */
483 blkCnt = (uint32_t) blockSize2 % 0x4U;
484
485 while (blkCnt > 0U)
486 {
487 /* Accumulator is made zero for every iteration */
488 sum = 0;
489
490 /* Apply loop unrolling and compute 4 MACs simultaneously. */
491 k = srcBLen >> 2U;
492
493 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
494 a second loop below computes MACs for the remaining 1 to 3 samples. */
495 while (k > 0U)
496 {
497 /* Perform the multiply-accumulates */
498 sum += (q63_t) ((q31_t) *px++ * *py--);
499 sum += (q63_t) ((q31_t) *px++ * *py--);
500 sum += (q63_t) ((q31_t) *px++ * *py--);
501 sum += (q63_t) ((q31_t) *px++ * *py--);
502
503 /* Decrement loop counter */
504 k--;
505 }
506
507 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
508 ** No loop unrolling is used. */
509 k = srcBLen % 0x4U;
510
511 while (k > 0U)
512 {
513 /* Perform the multiply-accumulate */
514 sum += (q63_t) ((q31_t) *px++ * *py--);
515
516 /* Decrement loop counter */
517 k--;
518 }
519
520 /* Store the result in the accumulator in the destination buffer. */
521 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
522
523 /* Increment the pointer pIn1 index, count by 1 */
524 count++;
525
526 /* Update the inputA and inputB pointers for next MAC calculation */
527 px = pSrc1 + count;
528 py = pSrc2;
529
530 /* Decrement loop counter */
531 blkCnt--;
532 }
533 }
534 else
535 {
536 /* If the srcBLen is not a multiple of 4,
537 * the blockSize2 loop cannot be unrolled by 4 */
538 blkCnt = (uint32_t) blockSize2;
539
540 while (blkCnt > 0U)
541 {
542 /* Accumulator is made zero for every iteration */
543 sum = 0;
544
545 /* srcBLen number of MACS should be performed */
546 k = srcBLen;
547
548 while (k > 0U)
549 {
550 /* Perform the multiply-accumulate */
551 sum += (q63_t) ((q31_t) *px++ * *py--);
552
553 /* Decrement the loop counter */
554 k--;
555 }
556
557 /* Store the result in the accumulator in the destination buffer. */
558 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
559
560 /* Increment the MAC count */
561 count++;
562
563 /* Update the inputA and inputB pointers for next MAC calculation */
564 px = pSrc1 + count;
565 py = pSrc2;
566
567 /* Decrement the loop counter */
568 blkCnt--;
569 }
570 }
571
572
573 /* --------------------------
574 * Initializations of stage3
575 * -------------------------*/
576
577 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
578 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
579 * ....
580 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
581 * sum += x[srcALen-1] * y[srcBLen-1]
582 */
583
584 /* In this stage the MAC operations are decreased by 1 for every iteration.
585 The count variable holds the number of MAC operations performed */
586 count = srcBLen - 1U;
587
588 /* Working pointer of inputA */
589 if (firstIndex > srcALen)
590 {
591 pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
592 }
593 else
594 {
595 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
596 }
597 px = pSrc1;
598
599 /* Working pointer of inputB */
600 pSrc2 = pIn2 + (srcBLen - 1U);
601 pIn2 = pSrc2 - 1U;
602 py = pIn2;
603
604 /* -------------------
605 * Stage3 process
606 * ------------------*/
607
608 /* For loop unrolling by 4, this stage is divided into two. */
609 /* First part of this stage computes the MAC operations greater than 4 */
610 /* Second part of this stage computes the MAC operations less than or equal to 4 */
611
612 /* The first part of the stage starts here */
613 j = count >> 2U;
614
615 while ((j > 0U) && (blockSize3 > 0))
616 {
617 /* Accumulator is made zero for every iteration */
618 sum = 0;
619
620 /* Apply loop unrolling and compute 4 MACs simultaneously. */
621 k = count >> 2U;
622
623 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
624 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
625 while (k > 0U)
626 {
627 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
628 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
629 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
630 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
631 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
632 sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
633
634 /* Decrement loop counter */
635 k--;
636 }
637
638 /* For the next MAC operations, the pointer py is used without SIMD
639 * So, py is incremented by 1 */
640 py = py + 1U;
641
642 /* If the count is not a multiple of 4, compute any remaining MACs here.
643 ** No loop unrolling is used. */
644 k = count % 0x4U;
645
646 while (k > 0U)
647 {
648 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
649 sum = __SMLALD(*px++, *py--, sum);
650
651 /* Decrement loop counter */
652 k--;
653 }
654
655 /* Store the result in the accumulator in the destination buffer. */
656 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
657
658 /* Update the inputA and inputB pointers for next MAC calculation */
659 px = ++pSrc1;
660 py = pIn2;
661
662 /* Decrement MAC count */
663 count--;
664
665 /* Decrement loop counter */
666 blockSize3--;
667
668 j--;
669 }
670
671 /* The second part of the stage starts here */
672 /* SIMD is not used for the next MAC operations,
673 * so pointer py is updated to read only one sample at a time */
674 py = py + 1U;
675
676 while (blockSize3 > 0)
677 {
678 /* Accumulator is made zero for every iteration */
679 sum = 0;
680
681 /* Apply loop unrolling and compute 4 MACs simultaneously. */
682 k = count;
683
684 while (k > 0U)
685 {
686 /* Perform the multiply-accumulates */
687 /* sum += x[srcALen-1] * y[srcBLen-1] */
688 sum = __SMLALD(*px++, *py--, sum);
689
690 /* Decrement loop counter */
691 k--;
692 }
693
694 /* Store the result in the accumulator in the destination buffer. */
695 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
696
697 /* Update the inputA and inputB pointers for next MAC calculation */
698 px = ++pSrc1;
699 py = pSrc2;
700
701 /* Decrement MAC count */
702 count--;
703
704 /* Decrement the loop counter */
705 blockSize3--;
706 }
707
708 /* Set status as ARM_MATH_SUCCESS */
709 status = ARM_MATH_SUCCESS;
710 }
711
712 /* Return to application */
713 return (status);
714
715 #else /* #if defined (ARM_MATH_DSP) */
716
717 const q15_t *pIn1 = pSrcA; /* InputA pointer */
718 const q15_t *pIn2 = pSrcB; /* InputB pointer */
719 q63_t sum; /* Accumulator */
720 uint32_t i, j; /* Loop counters */
721 arm_status status; /* Status of Partial convolution */
722
723 /* Check for range of output samples to be calculated */
724 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
725 {
726 /* Set status as ARM_MATH_ARGUMENT_ERROR */
727 status = ARM_MATH_ARGUMENT_ERROR;
728 }
729 else
730 {
731 /* Loop to calculate convolution for output length number of values */
732 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
733 {
734 /* Initialize sum with zero to carry on MAC operations */
735 sum = 0;
736
737 /* Loop to perform MAC operations according to convolution equation */
738 for (j = 0U; j <= i; j++)
739 {
740 /* Check the array limitations */
741 if (((i - j) < srcBLen) && (j < srcALen))
742 {
743 /* z[i] += x[i-j] * y[j] */
744 sum += ((q31_t) pIn1[j] * pIn2[i - j]);
745 }
746 }
747
748 /* Store the output in the destination buffer */
749 pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
750 }
751
752 /* Set status as ARM_MATH_SUCCESS */
753 status = ARM_MATH_SUCCESS;
754 }
755
756 /* Return to application */
757 return (status);
758
759 #endif /* #if defined (ARM_MATH_DSP) */
760
761 }
762
763 /**
764 @} end of PartialConv group
765 */
766