/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_conv_partial_fast_q15.c
 * Description:  Fast Q15 Partial convolution
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
28 
#include "dsp/filtering_functions.h"
30 
/**
  @ingroup groupFilters
 */

/**
  @addtogroup PartialConv
  @{
 */
39 
/**
  @brief         Partial convolution of Q15 sequences (fast version).
  @param[in]     pSrcA      points to the first input sequence
  @param[in]     srcALen    length of the first input sequence
  @param[in]     pSrcB      points to the second input sequence
  @param[in]     srcBLen    length of the second input sequence
  @param[out]    pDst       points to the location where the output result is written
  @param[in]     firstIndex is the first output sample to start with
  @param[in]     numPoints  is the number of output points to be computed
  @return        execution status
                   - \ref ARM_MATH_SUCCESS        : Operation successful
                   - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0, srcALen+srcBLen-2]
  @remark
                   Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
 */
55 
arm_conv_partial_fast_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints)56 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_fast_q15(
57   const q15_t * pSrcA,
58         uint32_t srcALen,
59   const q15_t * pSrcB,
60         uint32_t srcBLen,
61         q15_t * pDst,
62         uint32_t firstIndex,
63         uint32_t numPoints)
64 {
65   const q15_t *pIn1;                                   /* InputA pointer */
66   const q15_t *pIn2;                                   /* InputB pointer */
67         q15_t *pOut = pDst;                            /* Output pointer */
68         q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
69   const q15_t *px;                                     /* Intermediate inputA pointer */
70   const q15_t *py;                                     /* Intermediate inputB pointer */
71   const q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
72         q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */
73         uint32_t j, k, count, blkCnt, check;
74         int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
75         arm_status status;                             /* Status of Partial convolution */
76 
77   /* Check for range of output samples to be calculated */
78   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
79   {
80     /* Set status as ARM_MATH_ARGUMENT_ERROR */
81     status = ARM_MATH_ARGUMENT_ERROR;
82   }
83   else
84   {
85     /* The algorithm implementation is based on the lengths of the inputs. */
86     /* srcB is always made to slide across srcA. */
87     /* So srcBLen is always considered as shorter or equal to srcALen */
88     if (srcALen >= srcBLen)
89     {
90       /* Initialization of inputA pointer */
91       pIn1 = pSrcA;
92 
93       /* Initialization of inputB pointer */
94       pIn2 = pSrcB;
95     }
96     else
97     {
98       /* Initialization of inputA pointer */
99       pIn1 = pSrcB;
100 
101       /* Initialization of inputB pointer */
102       pIn2 = pSrcA;
103 
104       /* srcBLen is always considered as shorter or equal to srcALen */
105       j = srcBLen;
106       srcBLen = srcALen;
107       srcALen = j;
108     }
109 
110     /* Conditions to check which loopCounter holds
111      * the first and last indices of the output samples to be calculated. */
112     check = firstIndex + numPoints;
113     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
114     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
115     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
116     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
117     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
118     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
119 
120     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
121     /* The function is internally
122      * divided into three stages according to the number of multiplications that has to be
123      * taken place between inputA samples and inputB samples. In the first stage of the
124      * algorithm, the multiplications increase by one for every iteration.
125      * In the second stage of the algorithm, srcBLen number of multiplications are done.
126      * In the third stage of the algorithm, the multiplications decrease by one
127      * for every iteration. */
128 
129     /* Set the output pointer to point to the firstIndex
130      * of the output sample to be calculated. */
131     pOut = pDst + firstIndex;
132 
133     /* --------------------------
134      * Initializations of stage1
135      * -------------------------*/
136 
137     /* sum = x[0] * y[0]
138      * sum = x[0] * y[1] + x[1] * y[0]
139      * ....
140      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
141      */
142 
143     /* In this stage the MAC operations are increased by 1 for every iteration.
144        The count variable holds the number of MAC operations performed.
145        Since the partial convolution starts from firstIndex
146        Number of Macs to be performed is firstIndex + 1 */
147     count = 1U + firstIndex;
148 
149     /* Working pointer of inputA */
150     px = pIn1;
151 
152     /* Working pointer of inputB */
153     pSrc2 = pIn2 + firstIndex;
154     py = pSrc2;
155 
156     /* ------------------------
157      * Stage1 process
158      * ----------------------*/
159 
160     /* For loop unrolling by 4, this stage is divided into two. */
161     /* First part of this stage computes the MAC operations less than 4 */
162     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
163 
164     /* The first part of the stage starts here */
165     while ((count < 4U) && (blockSize1 > 0))
166     {
167       /* Accumulator is made zero for every iteration */
168       sum = 0;
169 
170       /* Loop over number of MAC operations between
171        * inputA samples and inputB samples */
172       k = count;
173 
174       while (k > 0U)
175       {
176         /* Perform the multiply-accumulates */
177         sum = __SMLAD(*px++, *py--, sum);
178 
179         /* Decrement loop counter */
180         k--;
181       }
182 
183       /* Store the result in the accumulator in the destination buffer. */
184       *pOut++ = (q15_t) (sum >> 15);
185 
186       /* Update the inputA and inputB pointers for next MAC calculation */
187       py = ++pSrc2;
188       px = pIn1;
189 
190       /* Increment MAC count */
191       count++;
192 
193       /* Decrement loop counter */
194       blockSize1--;
195     }
196 
197     /* The second part of the stage starts here */
198     /* The internal loop, over count, is unrolled by 4 */
199     /* To, read the last two inputB samples using SIMD:
200      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
201     py = py - 1;
202 
203     while (blockSize1 > 0)
204     {
205       /* Accumulator is made zero for every iteration */
206       sum = 0;
207 
208       /* Apply loop unrolling and compute 4 MACs simultaneously. */
209       k = count >> 2U;
210 
211       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
212          a second loop below computes MACs for the remaining 1 to 3 samples. */
213       while (k > 0U)
214       {
215         /* Perform the multiply-accumulate */
216         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
217         sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
218         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
219         sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
220 
221         /* Decrement loop counter */
222         k--;
223       }
224 
225       /* For the next MAC operations, the pointer py is used without SIMD
226          So, py is incremented by 1 */
227       py = py + 1U;
228 
229       /* If the count is not a multiple of 4, compute any remaining MACs here.
230          No loop unrolling is used. */
231       k = count % 0x4U;
232 
233       while (k > 0U)
234       {
235         /* Perform the multiply-accumulates */
236         sum = __SMLAD(*px++, *py--, sum);
237 
238         /* Decrement loop counter */
239         k--;
240       }
241 
242       /* Store the result in the accumulator in the destination buffer. */
243       *pOut++ = (q15_t) (sum >> 15);
244 
245       /* Update the inputA and inputB pointers for next MAC calculation */
246       py = ++pSrc2 - 1U;
247       px = pIn1;
248 
249       /* Increment MAC count */
250       count++;
251 
252       /* Decrement loop counter */
253       blockSize1--;
254     }
255 
256     /* --------------------------
257      * Initializations of stage2
258      * ------------------------*/
259 
260     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
261      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
262      * ....
263      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
264      */
265 
266     /* Working pointer of inputA */
267     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
268     {
269       pSrc1 = pIn1 + firstIndex - srcBLen + 1;
270     }
271     else
272     {
273       pSrc1 = pIn1;
274     }
275     px = pSrc1;
276 
277     /* Working pointer of inputB */
278     pSrc2 = pIn2 + (srcBLen - 1U);
279     py = pSrc2;
280 
281     /* count is the index by which the pointer pIn1 to be incremented */
282     count = 0U;
283 
284     /* -------------------
285      * Stage2 process
286      * ------------------*/
287 
288     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
289      * So, to loop unroll over blockSize2,
290      * srcBLen should be greater than or equal to 4 */
291     if (srcBLen >= 4U)
292     {
293       /* Loop unrolling: Compute 4 outputs at a time */
294       blkCnt = ((uint32_t) blockSize2 >> 2U);
295 
296       while (blkCnt > 0U)
297       {
298         py = py - 1U;
299 
300         /* Set all accumulators to zero */
301         acc0 = 0;
302         acc1 = 0;
303         acc2 = 0;
304         acc3 = 0;
305 
306 
307         /* read x[0], x[1] samples */
308         x0 = read_q15x2 ((q15_t *) px);
309         /* read x[1], x[2] samples */
310         x1 = read_q15x2 ((q15_t *) px + 1);
311         px += 2U;
312 
313 
314         /* Apply loop unrolling and compute 4 MACs simultaneously. */
315         k = srcBLen >> 2U;
316 
317         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
318          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
319         do
320         {
321           /* Read the last two inputB samples using SIMD:
322            * y[srcBLen - 1] and y[srcBLen - 2] */
323           c0 = read_q15x2_da ((q15_t **) &py);
324 
325           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
326           acc0 = __SMLADX(x0, c0, acc0);
327 
328           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
329           acc1 = __SMLADX(x1, c0, acc1);
330 
331           /* Read x[2], x[3] */
332           x2 = read_q15x2 ((q15_t *) px);
333 
334           /* Read x[3], x[4] */
335           x3 = read_q15x2 ((q15_t *) px + 1);
336 
337           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
338           acc2 = __SMLADX(x2, c0, acc2);
339 
340           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
341           acc3 = __SMLADX(x3, c0, acc3);
342 
343           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
344           c0 = read_q15x2_da ((q15_t **) &py);
345 
346           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
347           acc0 = __SMLADX(x2, c0, acc0);
348 
349           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
350           acc1 = __SMLADX(x3, c0, acc1);
351 
352           /* Read x[4], x[5] */
353           x0 = read_q15x2 ((q15_t *) px + 2);
354 
355           /* Read x[5], x[6] */
356           x1 = read_q15x2 ((q15_t *) px + 3);
357           px += 4U;
358 
359           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
360           acc2 = __SMLADX(x0, c0, acc2);
361 
362           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
363           acc3 = __SMLADX(x1, c0, acc3);
364 
365         } while (--k);
366 
367         /* For the next MAC operations, SIMD is not used
368            So, the 16 bit pointer if inputB, py is updated */
369 
370         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
371            No loop unrolling is used. */
372         k = srcBLen % 0x4U;
373 
374         if (k == 1U)
375         {
376           /* Read y[srcBLen - 5] */
377           c0 = *(py + 1);
378 #ifdef  ARM_MATH_BIG_ENDIAN
379           c0 = c0 << 16U;
380 #else
381           c0 = c0 & 0x0000FFFF;
382 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
383 
384           /* Read x[7] */
385           x3 = read_q15x2 ((q15_t *) px);
386           px++;
387 
388           /* Perform the multiply-accumulate */
389           acc0 = __SMLAD (x0, c0, acc0);
390           acc1 = __SMLAD (x1, c0, acc1);
391           acc2 = __SMLADX(x1, c0, acc2);
392           acc3 = __SMLADX(x3, c0, acc3);
393         }
394 
395         if (k == 2U)
396         {
397           /* Read y[srcBLen - 5], y[srcBLen - 6] */
398           c0 = read_q15x2 ((q15_t *) py);
399 
400           /* Read x[7], x[8] */
401           x3 = read_q15x2 ((q15_t *) px);
402 
403           /* Read x[9] */
404           x2 = read_q15x2 ((q15_t *) px + 1);
405           px += 2U;
406 
407           /* Perform the multiply-accumulate */
408           acc0 = __SMLADX(x0, c0, acc0);
409           acc1 = __SMLADX(x1, c0, acc1);
410           acc2 = __SMLADX(x3, c0, acc2);
411           acc3 = __SMLADX(x2, c0, acc3);
412         }
413 
414         if (k == 3U)
415         {
416           /* Read y[srcBLen - 5], y[srcBLen - 6] */
417           c0 = read_q15x2 ((q15_t *) py);
418 
419           /* Read x[7], x[8] */
420           x3 = read_q15x2 ((q15_t *) px);
421 
422           /* Read x[9] */
423           x2 = read_q15x2 ((q15_t *) px + 1);
424 
425           /* Perform the multiply-accumulate */
426           acc0 = __SMLADX(x0, c0, acc0);
427           acc1 = __SMLADX(x1, c0, acc1);
428           acc2 = __SMLADX(x3, c0, acc2);
429           acc3 = __SMLADX(x2, c0, acc3);
430 
431           c0 = *(py-1);
432 #ifdef  ARM_MATH_BIG_ENDIAN
433           c0 = c0 << 16U;
434 #else
435           c0 = c0 & 0x0000FFFF;
436 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
437 
438           /* Read x[10] */
439           x3 =  read_q15x2 ((q15_t *) px + 2);
440           px += 3U;
441 
442           /* Perform the multiply-accumulates */
443           acc0 = __SMLADX(x1, c0, acc0);
444           acc1 = __SMLAD (x2, c0, acc1);
445           acc2 = __SMLADX(x2, c0, acc2);
446           acc3 = __SMLADX(x3, c0, acc3);
447         }
448 
449         /* Store the results in the accumulators in the destination buffer. */
450 #ifndef ARM_MATH_BIG_ENDIAN
451         write_q15x2_ia (&pOut, __PKHBT(acc0 >> 15, acc1 >> 15, 16));
452         write_q15x2_ia (&pOut, __PKHBT(acc2 >> 15, acc3 >> 15, 16));
453 #else
454         write_q15x2_ia (&pOut, __PKHBT(acc1 >> 15, acc0 >> 15, 16));
455         write_q15x2_ia (&pOut, __PKHBT(acc3 >> 15, acc2 >> 15, 16));
456 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
457 
458         /* Increment the pointer pIn1 index, count by 4 */
459         count += 4U;
460 
461         /* Update the inputA and inputB pointers for next MAC calculation */
462         px = pSrc1 + count;
463         py = pSrc2;
464 
465         /* Decrement the loop counter */
466         blkCnt--;
467       }
468 
469       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
470          No loop unrolling is used. */
471       blkCnt = (uint32_t) blockSize2 % 0x4U;
472 
473       while (blkCnt > 0U)
474       {
475         /* Accumulator is made zero for every iteration */
476         sum = 0;
477 
478         /* Apply loop unrolling and compute 4 MACs simultaneously. */
479         k = srcBLen >> 2U;
480 
481         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
482            a second loop below computes MACs for the remaining 1 to 3 samples. */
483         while (k > 0U)
484         {
485           /* Perform the multiply-accumulates */
486           sum += ((q31_t) *px++ * *py--);
487           sum += ((q31_t) *px++ * *py--);
488           sum += ((q31_t) *px++ * *py--);
489           sum += ((q31_t) *px++ * *py--);
490 
491           /* Decrement loop counter */
492           k--;
493         }
494 
495         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
496          ** No loop unrolling is used. */
497         k = srcBLen % 0x4U;
498 
499         while (k > 0U)
500         {
501           /* Perform the multiply-accumulates */
502           sum += ((q31_t) *px++ * *py--);
503 
504           /* Decrement the loop counter */
505           k--;
506         }
507 
508         /* Store the result in the accumulator in the destination buffer. */
509         *pOut++ = (q15_t) (sum >> 15);
510 
511         /* Increment the pointer pIn1 index, count by 1 */
512         count++;
513 
514         /* Update the inputA and inputB pointers for next MAC calculation */
515         px = pSrc1 + count;
516         py = pSrc2;
517 
518         /* Decrement loop counter */
519         blkCnt--;
520       }
521     }
522     else
523     {
524       /* If the srcBLen is not a multiple of 4,
525        * the blockSize2 loop cannot be unrolled by 4 */
526       blkCnt = (uint32_t) blockSize2;
527 
528       while (blkCnt > 0U)
529       {
530         /* Accumulator is made zero for every iteration */
531         sum = 0;
532 
533         /* srcBLen number of MACS should be performed */
534         k = srcBLen;
535 
536         while (k > 0U)
537         {
538           /* Perform the multiply-accumulate */
539           sum += ((q31_t) *px++ * *py--);
540 
541           /* Decrement the loop counter */
542           k--;
543         }
544 
545         /* Store the result in the accumulator in the destination buffer. */
546         *pOut++ = (q15_t) (sum >> 15);
547 
548         /* Increment the MAC count */
549         count++;
550 
551         /* Update the inputA and inputB pointers for next MAC calculation */
552         px = pSrc1 + count;
553         py = pSrc2;
554 
555         /* Decrement the loop counter */
556         blkCnt--;
557       }
558     }
559 
560 
561     /* --------------------------
562      * Initializations of stage3
563      * -------------------------*/
564 
565     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
566      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
567      * ....
568      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
569      * sum +=  x[srcALen-1] * y[srcBLen-1]
570      */
571 
572     /* In this stage the MAC operations are decreased by 1 for every iteration.
573        The count variable holds the number of MAC operations performed */
574     count = srcBLen - 1U;
575 
576     /* Working pointer of inputA */
577     if (firstIndex > srcALen)
578     {
579        pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
580     }
581     else
582     {
583        pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
584     }
585     px = pSrc1;
586 
587     /* Working pointer of inputB */
588     pSrc2 = pIn2 + (srcBLen - 1U);
589     pIn2 = pSrc2 - 1U;
590     py = pIn2;
591 
592     /* -------------------
593      * Stage3 process
594      * ------------------*/
595 
596     /* For loop unrolling by 4, this stage is divided into two. */
597     /* First part of this stage computes the MAC operations greater than 4 */
598     /* Second part of this stage computes the MAC operations less than or equal to 4 */
599 
600     /* The first part of the stage starts here */
601     j = count >> 2U;
602 
603     while ((j > 0U) && (blockSize3 > 0))
604     {
605       /* Accumulator is made zero for every iteration */
606       sum = 0;
607 
608       /* Apply loop unrolling and compute 4 MACs simultaneously. */
609       k = count >> 2U;
610 
611       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
612        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
613       while (k > 0U)
614       {
615         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
616          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
617         sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
618         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
619          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
620         sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
621 
622         /* Decrement loop counter */
623         k--;
624       }
625 
626       /* For the next MAC operations, the pointer py is used without SIMD
627          So, py is incremented by 1 */
628       py = py + 1U;
629 
630       /* If the count is not a multiple of 4, compute any remaining MACs here.
631          No loop unrolling is used. */
632       k = count % 0x4U;
633 
634       while (k > 0U)
635       {
636         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
637         sum = __SMLAD(*px++, *py--, sum);
638 
639         /* Decrement the loop counter */
640         k--;
641       }
642 
643       /* Store the result in the accumulator in the destination buffer. */
644       *pOut++ = (q15_t) (sum >> 15);
645 
646       /* Update the inputA and inputB pointers for next MAC calculation */
647       px = ++pSrc1;
648       py = pIn2;
649 
650       /* Decrement the MAC count */
651       count--;
652 
653       /* Decrement the loop counter */
654       blockSize3--;
655 
656       j--;
657     }
658 
659     /* The second part of the stage starts here */
660     /* SIMD is not used for the next MAC operations,
661      * so pointer py is updated to read only one sample at a time */
662     py = py + 1U;
663 
664     while (blockSize3 > 0)
665     {
666       /* Accumulator is made zero for every iteration */
667       sum = 0;
668 
669       /* Apply loop unrolling and compute 4 MACs simultaneously. */
670       k = count;
671 
672       while (k > 0U)
673       {
674         /* Perform the multiply-accumulates */
675         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
676         sum = __SMLAD(*px++, *py--, sum);
677 
678         /* Decrement the loop counter */
679         k--;
680       }
681 
682       /* Store the result in the accumulator in the destination buffer. */
683       *pOut++ = (q15_t) (sum >> 15);
684 
685       /* Update the inputA and inputB pointers for next MAC calculation */
686       px = ++pSrc1;
687       py = pSrc2;
688 
689       /* Decrement the MAC count */
690       count--;
691 
692       /* Decrement the loop counter */
693       blockSize3--;
694     }
695 
696     /* Set status as ARM_MATH_SUCCESS */
697     status = ARM_MATH_SUCCESS;
698   }
699 
700   /* Return to application */
701   return (status);
702 
703 }
704 
/**
  @} end of PartialConv group
 */
708