1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_fast_q15.c
4 * Description: Fast Q15 Partial convolution
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup PartialConv
37 @{
38 */
39
40 /**
41 @brief Partial convolution of Q15 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @return execution status
50 - \ref ARM_MATH_SUCCESS : Operation successful
51 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52 @remark
53 Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
54 */
55
arm_conv_partial_fast_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints)56 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_fast_q15(
57 const q15_t * pSrcA,
58 uint32_t srcALen,
59 const q15_t * pSrcB,
60 uint32_t srcBLen,
61 q15_t * pDst,
62 uint32_t firstIndex,
63 uint32_t numPoints)
64 {
65 const q15_t *pIn1; /* InputA pointer */
66 const q15_t *pIn2; /* InputB pointer */
67 q15_t *pOut = pDst; /* Output pointer */
68 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
69 const q15_t *px; /* Intermediate inputA pointer */
70 const q15_t *py; /* Intermediate inputB pointer */
71 const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
72 q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
73 uint32_t j, k, count, blkCnt, check;
74 int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
75 arm_status status; /* Status of Partial convolution */
76
77 /* Check for range of output samples to be calculated */
78 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
79 {
80 /* Set status as ARM_MATH_ARGUMENT_ERROR */
81 status = ARM_MATH_ARGUMENT_ERROR;
82 }
83 else
84 {
85 /* The algorithm implementation is based on the lengths of the inputs. */
86 /* srcB is always made to slide across srcA. */
87 /* So srcBLen is always considered as shorter or equal to srcALen */
88 if (srcALen >= srcBLen)
89 {
90 /* Initialization of inputA pointer */
91 pIn1 = pSrcA;
92
93 /* Initialization of inputB pointer */
94 pIn2 = pSrcB;
95 }
96 else
97 {
98 /* Initialization of inputA pointer */
99 pIn1 = pSrcB;
100
101 /* Initialization of inputB pointer */
102 pIn2 = pSrcA;
103
104 /* srcBLen is always considered as shorter or equal to srcALen */
105 j = srcBLen;
106 srcBLen = srcALen;
107 srcALen = j;
108 }
109
110 /* Conditions to check which loopCounter holds
111 * the first and last indices of the output samples to be calculated. */
112 check = firstIndex + numPoints;
113 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
114 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
115 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
116 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
117 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
118 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
119
120 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
121 /* The function is internally
122 * divided into three stages according to the number of multiplications that has to be
123 * taken place between inputA samples and inputB samples. In the first stage of the
124 * algorithm, the multiplications increase by one for every iteration.
125 * In the second stage of the algorithm, srcBLen number of multiplications are done.
126 * In the third stage of the algorithm, the multiplications decrease by one
127 * for every iteration. */
128
129 /* Set the output pointer to point to the firstIndex
130 * of the output sample to be calculated. */
131 pOut = pDst + firstIndex;
132
133 /* --------------------------
134 * Initializations of stage1
135 * -------------------------*/
136
137 /* sum = x[0] * y[0]
138 * sum = x[0] * y[1] + x[1] * y[0]
139 * ....
140 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
141 */
142
143 /* In this stage the MAC operations are increased by 1 for every iteration.
144 The count variable holds the number of MAC operations performed.
145 Since the partial convolution starts from firstIndex
146 Number of Macs to be performed is firstIndex + 1 */
147 count = 1U + firstIndex;
148
149 /* Working pointer of inputA */
150 px = pIn1;
151
152 /* Working pointer of inputB */
153 pSrc2 = pIn2 + firstIndex;
154 py = pSrc2;
155
156 /* ------------------------
157 * Stage1 process
158 * ----------------------*/
159
160 /* For loop unrolling by 4, this stage is divided into two. */
161 /* First part of this stage computes the MAC operations less than 4 */
162 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
163
164 /* The first part of the stage starts here */
165 while ((count < 4U) && (blockSize1 > 0))
166 {
167 /* Accumulator is made zero for every iteration */
168 sum = 0;
169
170 /* Loop over number of MAC operations between
171 * inputA samples and inputB samples */
172 k = count;
173
174 while (k > 0U)
175 {
176 /* Perform the multiply-accumulates */
177 sum = __SMLAD(*px++, *py--, sum);
178
179 /* Decrement loop counter */
180 k--;
181 }
182
183 /* Store the result in the accumulator in the destination buffer. */
184 *pOut++ = (q15_t) (sum >> 15);
185
186 /* Update the inputA and inputB pointers for next MAC calculation */
187 py = ++pSrc2;
188 px = pIn1;
189
190 /* Increment MAC count */
191 count++;
192
193 /* Decrement loop counter */
194 blockSize1--;
195 }
196
197 /* The second part of the stage starts here */
198 /* The internal loop, over count, is unrolled by 4 */
199 /* To, read the last two inputB samples using SIMD:
200 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
201 py = py - 1;
202
203 while (blockSize1 > 0)
204 {
205 /* Accumulator is made zero for every iteration */
206 sum = 0;
207
208 /* Apply loop unrolling and compute 4 MACs simultaneously. */
209 k = count >> 2U;
210
211 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
212 a second loop below computes MACs for the remaining 1 to 3 samples. */
213 while (k > 0U)
214 {
215 /* Perform the multiply-accumulate */
216 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
217 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
218 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
219 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
220
221 /* Decrement loop counter */
222 k--;
223 }
224
225 /* For the next MAC operations, the pointer py is used without SIMD
226 So, py is incremented by 1 */
227 py = py + 1U;
228
229 /* If the count is not a multiple of 4, compute any remaining MACs here.
230 No loop unrolling is used. */
231 k = count % 0x4U;
232
233 while (k > 0U)
234 {
235 /* Perform the multiply-accumulates */
236 sum = __SMLAD(*px++, *py--, sum);
237
238 /* Decrement loop counter */
239 k--;
240 }
241
242 /* Store the result in the accumulator in the destination buffer. */
243 *pOut++ = (q15_t) (sum >> 15);
244
245 /* Update the inputA and inputB pointers for next MAC calculation */
246 py = ++pSrc2 - 1U;
247 px = pIn1;
248
249 /* Increment MAC count */
250 count++;
251
252 /* Decrement loop counter */
253 blockSize1--;
254 }
255
256 /* --------------------------
257 * Initializations of stage2
258 * ------------------------*/
259
260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
262 * ....
263 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
264 */
265
266 /* Working pointer of inputA */
267 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
268 {
269 pSrc1 = pIn1 + firstIndex - srcBLen + 1;
270 }
271 else
272 {
273 pSrc1 = pIn1;
274 }
275 px = pSrc1;
276
277 /* Working pointer of inputB */
278 pSrc2 = pIn2 + (srcBLen - 1U);
279 py = pSrc2;
280
281 /* count is the index by which the pointer pIn1 to be incremented */
282 count = 0U;
283
284 /* -------------------
285 * Stage2 process
286 * ------------------*/
287
288 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
289 * So, to loop unroll over blockSize2,
290 * srcBLen should be greater than or equal to 4 */
291 if (srcBLen >= 4U)
292 {
293 /* Loop unrolling: Compute 4 outputs at a time */
294 blkCnt = ((uint32_t) blockSize2 >> 2U);
295
296 while (blkCnt > 0U)
297 {
298 py = py - 1U;
299
300 /* Set all accumulators to zero */
301 acc0 = 0;
302 acc1 = 0;
303 acc2 = 0;
304 acc3 = 0;
305
306
307 /* read x[0], x[1] samples */
308 x0 = read_q15x2 ((q15_t *) px);
309 /* read x[1], x[2] samples */
310 x1 = read_q15x2 ((q15_t *) px + 1);
311 px += 2U;
312
313
314 /* Apply loop unrolling and compute 4 MACs simultaneously. */
315 k = srcBLen >> 2U;
316
317 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
318 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
319 do
320 {
321 /* Read the last two inputB samples using SIMD:
322 * y[srcBLen - 1] and y[srcBLen - 2] */
323 c0 = read_q15x2_da ((q15_t **) &py);
324
325 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
326 acc0 = __SMLADX(x0, c0, acc0);
327
328 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
329 acc1 = __SMLADX(x1, c0, acc1);
330
331 /* Read x[2], x[3] */
332 x2 = read_q15x2 ((q15_t *) px);
333
334 /* Read x[3], x[4] */
335 x3 = read_q15x2 ((q15_t *) px + 1);
336
337 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
338 acc2 = __SMLADX(x2, c0, acc2);
339
340 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
341 acc3 = __SMLADX(x3, c0, acc3);
342
343 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
344 c0 = read_q15x2_da ((q15_t **) &py);
345
346 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
347 acc0 = __SMLADX(x2, c0, acc0);
348
349 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
350 acc1 = __SMLADX(x3, c0, acc1);
351
352 /* Read x[4], x[5] */
353 x0 = read_q15x2 ((q15_t *) px + 2);
354
355 /* Read x[5], x[6] */
356 x1 = read_q15x2 ((q15_t *) px + 3);
357 px += 4U;
358
359 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
360 acc2 = __SMLADX(x0, c0, acc2);
361
362 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
363 acc3 = __SMLADX(x1, c0, acc3);
364
365 } while (--k);
366
367 /* For the next MAC operations, SIMD is not used
368 So, the 16 bit pointer if inputB, py is updated */
369
370 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
371 No loop unrolling is used. */
372 k = srcBLen % 0x4U;
373
374 if (k == 1U)
375 {
376 /* Read y[srcBLen - 5] */
377 c0 = *(py + 1);
378 #ifdef ARM_MATH_BIG_ENDIAN
379 c0 = c0 << 16U;
380 #else
381 c0 = c0 & 0x0000FFFF;
382 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
383
384 /* Read x[7] */
385 x3 = read_q15x2 ((q15_t *) px);
386 px++;
387
388 /* Perform the multiply-accumulate */
389 acc0 = __SMLAD (x0, c0, acc0);
390 acc1 = __SMLAD (x1, c0, acc1);
391 acc2 = __SMLADX(x1, c0, acc2);
392 acc3 = __SMLADX(x3, c0, acc3);
393 }
394
395 if (k == 2U)
396 {
397 /* Read y[srcBLen - 5], y[srcBLen - 6] */
398 c0 = read_q15x2 ((q15_t *) py);
399
400 /* Read x[7], x[8] */
401 x3 = read_q15x2 ((q15_t *) px);
402
403 /* Read x[9] */
404 x2 = read_q15x2 ((q15_t *) px + 1);
405 px += 2U;
406
407 /* Perform the multiply-accumulate */
408 acc0 = __SMLADX(x0, c0, acc0);
409 acc1 = __SMLADX(x1, c0, acc1);
410 acc2 = __SMLADX(x3, c0, acc2);
411 acc3 = __SMLADX(x2, c0, acc3);
412 }
413
414 if (k == 3U)
415 {
416 /* Read y[srcBLen - 5], y[srcBLen - 6] */
417 c0 = read_q15x2 ((q15_t *) py);
418
419 /* Read x[7], x[8] */
420 x3 = read_q15x2 ((q15_t *) px);
421
422 /* Read x[9] */
423 x2 = read_q15x2 ((q15_t *) px + 1);
424
425 /* Perform the multiply-accumulate */
426 acc0 = __SMLADX(x0, c0, acc0);
427 acc1 = __SMLADX(x1, c0, acc1);
428 acc2 = __SMLADX(x3, c0, acc2);
429 acc3 = __SMLADX(x2, c0, acc3);
430
431 c0 = *(py-1);
432 #ifdef ARM_MATH_BIG_ENDIAN
433 c0 = c0 << 16U;
434 #else
435 c0 = c0 & 0x0000FFFF;
436 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
437
438 /* Read x[10] */
439 x3 = read_q15x2 ((q15_t *) px + 2);
440 px += 3U;
441
442 /* Perform the multiply-accumulates */
443 acc0 = __SMLADX(x1, c0, acc0);
444 acc1 = __SMLAD (x2, c0, acc1);
445 acc2 = __SMLADX(x2, c0, acc2);
446 acc3 = __SMLADX(x3, c0, acc3);
447 }
448
449 /* Store the results in the accumulators in the destination buffer. */
450 #ifndef ARM_MATH_BIG_ENDIAN
451 write_q15x2_ia (&pOut, __PKHBT(acc0 >> 15, acc1 >> 15, 16));
452 write_q15x2_ia (&pOut, __PKHBT(acc2 >> 15, acc3 >> 15, 16));
453 #else
454 write_q15x2_ia (&pOut, __PKHBT(acc1 >> 15, acc0 >> 15, 16));
455 write_q15x2_ia (&pOut, __PKHBT(acc3 >> 15, acc2 >> 15, 16));
456 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
457
458 /* Increment the pointer pIn1 index, count by 4 */
459 count += 4U;
460
461 /* Update the inputA and inputB pointers for next MAC calculation */
462 px = pSrc1 + count;
463 py = pSrc2;
464
465 /* Decrement the loop counter */
466 blkCnt--;
467 }
468
469 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
470 No loop unrolling is used. */
471 blkCnt = (uint32_t) blockSize2 % 0x4U;
472
473 while (blkCnt > 0U)
474 {
475 /* Accumulator is made zero for every iteration */
476 sum = 0;
477
478 /* Apply loop unrolling and compute 4 MACs simultaneously. */
479 k = srcBLen >> 2U;
480
481 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
482 a second loop below computes MACs for the remaining 1 to 3 samples. */
483 while (k > 0U)
484 {
485 /* Perform the multiply-accumulates */
486 sum += ((q31_t) *px++ * *py--);
487 sum += ((q31_t) *px++ * *py--);
488 sum += ((q31_t) *px++ * *py--);
489 sum += ((q31_t) *px++ * *py--);
490
491 /* Decrement loop counter */
492 k--;
493 }
494
495 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
496 ** No loop unrolling is used. */
497 k = srcBLen % 0x4U;
498
499 while (k > 0U)
500 {
501 /* Perform the multiply-accumulates */
502 sum += ((q31_t) *px++ * *py--);
503
504 /* Decrement the loop counter */
505 k--;
506 }
507
508 /* Store the result in the accumulator in the destination buffer. */
509 *pOut++ = (q15_t) (sum >> 15);
510
511 /* Increment the pointer pIn1 index, count by 1 */
512 count++;
513
514 /* Update the inputA and inputB pointers for next MAC calculation */
515 px = pSrc1 + count;
516 py = pSrc2;
517
518 /* Decrement loop counter */
519 blkCnt--;
520 }
521 }
522 else
523 {
524 /* If the srcBLen is not a multiple of 4,
525 * the blockSize2 loop cannot be unrolled by 4 */
526 blkCnt = (uint32_t) blockSize2;
527
528 while (blkCnt > 0U)
529 {
530 /* Accumulator is made zero for every iteration */
531 sum = 0;
532
533 /* srcBLen number of MACS should be performed */
534 k = srcBLen;
535
536 while (k > 0U)
537 {
538 /* Perform the multiply-accumulate */
539 sum += ((q31_t) *px++ * *py--);
540
541 /* Decrement the loop counter */
542 k--;
543 }
544
545 /* Store the result in the accumulator in the destination buffer. */
546 *pOut++ = (q15_t) (sum >> 15);
547
548 /* Increment the MAC count */
549 count++;
550
551 /* Update the inputA and inputB pointers for next MAC calculation */
552 px = pSrc1 + count;
553 py = pSrc2;
554
555 /* Decrement the loop counter */
556 blkCnt--;
557 }
558 }
559
560
561 /* --------------------------
562 * Initializations of stage3
563 * -------------------------*/
564
565 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
566 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
567 * ....
568 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
569 * sum += x[srcALen-1] * y[srcBLen-1]
570 */
571
572 /* In this stage the MAC operations are decreased by 1 for every iteration.
573 The count variable holds the number of MAC operations performed */
574 count = srcBLen - 1U;
575
576 /* Working pointer of inputA */
577 if (firstIndex > srcALen)
578 {
579 pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
580 }
581 else
582 {
583 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
584 }
585 px = pSrc1;
586
587 /* Working pointer of inputB */
588 pSrc2 = pIn2 + (srcBLen - 1U);
589 pIn2 = pSrc2 - 1U;
590 py = pIn2;
591
592 /* -------------------
593 * Stage3 process
594 * ------------------*/
595
596 /* For loop unrolling by 4, this stage is divided into two. */
597 /* First part of this stage computes the MAC operations greater than 4 */
598 /* Second part of this stage computes the MAC operations less than or equal to 4 */
599
600 /* The first part of the stage starts here */
601 j = count >> 2U;
602
603 while ((j > 0U) && (blockSize3 > 0))
604 {
605 /* Accumulator is made zero for every iteration */
606 sum = 0;
607
608 /* Apply loop unrolling and compute 4 MACs simultaneously. */
609 k = count >> 2U;
610
611 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
612 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
613 while (k > 0U)
614 {
615 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
616 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
617 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
618 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
619 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
620 sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
621
622 /* Decrement loop counter */
623 k--;
624 }
625
626 /* For the next MAC operations, the pointer py is used without SIMD
627 So, py is incremented by 1 */
628 py = py + 1U;
629
630 /* If the count is not a multiple of 4, compute any remaining MACs here.
631 No loop unrolling is used. */
632 k = count % 0x4U;
633
634 while (k > 0U)
635 {
636 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
637 sum = __SMLAD(*px++, *py--, sum);
638
639 /* Decrement the loop counter */
640 k--;
641 }
642
643 /* Store the result in the accumulator in the destination buffer. */
644 *pOut++ = (q15_t) (sum >> 15);
645
646 /* Update the inputA and inputB pointers for next MAC calculation */
647 px = ++pSrc1;
648 py = pIn2;
649
650 /* Decrement the MAC count */
651 count--;
652
653 /* Decrement the loop counter */
654 blockSize3--;
655
656 j--;
657 }
658
659 /* The second part of the stage starts here */
660 /* SIMD is not used for the next MAC operations,
661 * so pointer py is updated to read only one sample at a time */
662 py = py + 1U;
663
664 while (blockSize3 > 0)
665 {
666 /* Accumulator is made zero for every iteration */
667 sum = 0;
668
669 /* Apply loop unrolling and compute 4 MACs simultaneously. */
670 k = count;
671
672 while (k > 0U)
673 {
674 /* Perform the multiply-accumulates */
675 /* sum += x[srcALen-1] * y[srcBLen-1] */
676 sum = __SMLAD(*px++, *py--, sum);
677
678 /* Decrement the loop counter */
679 k--;
680 }
681
682 /* Store the result in the accumulator in the destination buffer. */
683 *pOut++ = (q15_t) (sum >> 15);
684
685 /* Update the inputA and inputB pointers for next MAC calculation */
686 px = ++pSrc1;
687 py = pSrc2;
688
689 /* Decrement the MAC count */
690 count--;
691
692 /* Decrement the loop counter */
693 blockSize3--;
694 }
695
696 /* Set status as ARM_MATH_SUCCESS */
697 status = ARM_MATH_SUCCESS;
698 }
699
700 /* Return to application */
701 return (status);
702
703 }
704
705 /**
706 @} end of PartialConv group
707 */
708