1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_partial_q7.c
4  * Description:  Partial convolution of Q7 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup PartialConv
37   @{
38  */
39 
40 /**
41   @brief         Partial convolution of Q7 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written
47   @param[in]     firstIndex is the first output sample to start with
48   @param[in]     numPoints  is the number of output points to be computed
49   @return        execution status
50                    - \ref ARM_MATH_SUCCESS        : Operation successful
51                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52 
53   @remark
54                    Refer to \ref arm_conv_partial_opt_q7() for a faster implementation of this function.
55  */
56 
arm_conv_partial_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,uint32_t firstIndex,uint32_t numPoints)57 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_q7(
58   const q7_t * pSrcA,
59         uint32_t srcALen,
60   const q7_t * pSrcB,
61         uint32_t srcBLen,
62         q7_t * pDst,
63         uint32_t firstIndex,
64         uint32_t numPoints)
65 {
66 
67 #if defined(ARM_MATH_DSP)
68 
69   const q7_t *pIn1;                                    /* InputA pointer */
70   const q7_t *pIn2;                                    /* InputB pointer */
71         q7_t *pOut = pDst;                             /* Output pointer */
72   const q7_t *px;                                      /* Intermediate inputA pointer */
73   const q7_t *py;                                      /* Intermediate inputB pointer */
74   const q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
75         q31_t sum;                                     /* Accumulator */
76         uint32_t j, k, count, blkCnt, check;           /* Loop counters */
77         int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
78         arm_status status;                             /* Status of Partial convolution */
79 
80 #if defined (ARM_MATH_LOOPUNROLL)
81         q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
82         q31_t input1, input2;                          /* Temporary input variables */
83         q15_t in1, in2;                                /* Temporary input variables */
84         q7_t x0, x1, x2, x3, c0, c1;                   /* Temporary variables to hold state and coefficient values */
85 #endif
86 
87   /* Check for range of output samples to be calculated */
88   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
89   {
90     /* Set status as ARM_MATH_ARGUMENT_ERROR */
91     status = ARM_MATH_ARGUMENT_ERROR;
92   }
93   else
94   {
95     /* The algorithm implementation is based on the lengths of the inputs. */
96     /* srcB is always made to slide across srcA. */
97     /* So srcBLen is always considered as shorter or equal to srcALen */
98     if (srcALen >= srcBLen)
99     {
100       /* Initialization of inputA pointer */
101       pIn1 = pSrcA;
102 
103       /* Initialization of inputB pointer */
104       pIn2 = pSrcB;
105     }
106     else
107     {
108       /* Initialization of inputA pointer */
109       pIn1 = pSrcB;
110 
111       /* Initialization of inputB pointer */
112       pIn2 = pSrcA;
113 
114       /* srcBLen is always considered as shorter or equal to srcALen */
115       j = srcBLen;
116       srcBLen = srcALen;
117       srcALen = j;
118     }
119 
120     /* Conditions to check which loopCounter holds
121      * the first and last indices of the output samples to be calculated. */
122     check = firstIndex + numPoints;
123     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
124     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
125     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
126     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
127     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
128     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
129 
130     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
131     /* The function is internally
132      * divided into three stages according to the number of multiplications that has to be
133      * taken place between inputA samples and inputB samples. In the first stage of the
134      * algorithm, the multiplications increase by one for every iteration.
135      * In the second stage of the algorithm, srcBLen number of multiplications are done.
136      * In the third stage of the algorithm, the multiplications decrease by one
137      * for every iteration. */
138 
139     /* Set the output pointer to point to the firstIndex
140      * of the output sample to be calculated. */
141     pOut = pDst + firstIndex;
142 
143     /* --------------------------
144      * Initializations of stage1
145      * -------------------------*/
146 
147     /* sum = x[0] * y[0]
148      * sum = x[0] * y[1] + x[1] * y[0]
149      * ....
150      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
151      */
152 
153     /* In this stage the MAC operations are increased by 1 for every iteration.
154        The count variable holds the number of MAC operations performed.
155        Since the partial convolution starts from firstIndex
156        Number of Macs to be performed is firstIndex + 1 */
157     count = 1U + firstIndex;
158 
159     /* Working pointer of inputA */
160     px = pIn1;
161 
162     /* Working pointer of inputB */
163     pSrc2 = pIn2 + firstIndex;
164     py = pSrc2;
165 
166     /* ------------------------
167      * Stage1 process
168      * ----------------------*/
169 
170     /* The first stage starts here */
171     while (blockSize1 > 0)
172     {
173       /* Accumulator is made zero for every iteration */
174       sum = 0;
175 
176 #if defined (ARM_MATH_LOOPUNROLL)
177 
178       /* Loop unrolling: Compute 4 outputs at a time */
179       k = count >> 2U;
180 
181       while (k > 0U)
182       {
183         /* x[0] , x[1] */
184         in1 = (q15_t) *px++;
185         in2 = (q15_t) *px++;
186         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
187 
188         /* y[srcBLen - 1] , y[srcBLen - 2] */
189         in1 = (q15_t) *py--;
190         in2 = (q15_t) *py--;
191         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
192 
193         /* x[0] * y[srcBLen - 1] */
194         /* x[1] * y[srcBLen - 2] */
195         sum = __SMLAD(input1, input2, sum);
196 
197         /* x[2] , x[3] */
198         in1 = (q15_t) *px++;
199         in2 = (q15_t) *px++;
200         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
201 
202         /* y[srcBLen - 3] , y[srcBLen - 4] */
203         in1 = (q15_t) *py--;
204         in2 = (q15_t) *py--;
205         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
206 
207         /* x[2] * y[srcBLen - 3] */
208         /* x[3] * y[srcBLen - 4] */
209         sum = __SMLAD(input1, input2, sum);
210 
211         /* Decrement loop counter */
212         k--;
213       }
214 
215       /* Loop unrolling: Compute remaining outputs */
216       k = count % 0x4U;
217 
218 #else
219 
220       /* Initialize k with number of samples */
221       k = count;
222 
223 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
224 
225       while (k > 0U)
226       {
227         /* Perform the multiply-accumulate */
228         sum += ((q31_t) * px++ * *py--);
229 
230         /* Decrement loop counter */
231         k--;
232       }
233 
234       /* Store the result in the accumulator in the destination buffer. */
235       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
236 
237       /* Update the inputA and inputB pointers for next MAC calculation */
238       py = ++pSrc2;
239       px = pIn1;
240 
241       /* Increment MAC count */
242       count++;
243 
244       /* Decrement loop counter */
245       blockSize1--;
246     }
247 
248     /* --------------------------
249      * Initializations of stage2
250      * ------------------------*/
251 
252     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
253      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
254      * ....
255      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
256      */
257 
258     /* Working pointer of inputA */
259     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
260     {
261       pSrc1 = pIn1 + firstIndex - srcBLen + 1;
262     }
263     else
264     {
265       pSrc1 = pIn1;
266     }
267     px = pSrc1;
268 
269     /* Working pointer of inputB */
270     pSrc2 = pIn2 + (srcBLen - 1U);
271     py = pSrc2;
272 
273     /* count is the index by which the pointer pIn1 to be incremented */
274     count = 0U;
275 
276     /* -------------------
277      * Stage2 process
278      * ------------------*/
279 
280     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
281      * So, to loop unroll over blockSize2,
282      * srcBLen should be greater than or equal to 4 */
283     if (srcBLen >= 4U)
284     {
285 #if defined (ARM_MATH_LOOPUNROLL)
286 
287       /* Loop unrolling: Compute 4 outputs at a time */
288       blkCnt = ((uint32_t) blockSize2 >> 2U);
289 
290       while (blkCnt > 0U)
291       {
292         /* Set all accumulators to zero */
293         acc0 = 0;
294         acc1 = 0;
295         acc2 = 0;
296         acc3 = 0;
297 
298         /* read x[0], x[1], x[2] samples */
299         x0 = *px++;
300         x1 = *px++;
301         x2 = *px++;
302 
303         /* Apply loop unrolling and compute 4 MACs simultaneously. */
304         k = srcBLen >> 2U;
305 
306         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
307          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
308         do
309         {
310           /* Read y[srcBLen - 1] sample */
311           c0 = *py--;
312           /* Read y[srcBLen - 2] sample */
313           c1 = *py--;
314 
315           /* Read x[3] sample */
316           x3 = *px++;
317 
318           /* x[0] and x[1] are packed */
319           in1 = (q15_t) x0;
320           in2 = (q15_t) x1;
321 
322           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
323 
324           /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
325           in1 = (q15_t) c0;
326           in2 = (q15_t) c1;
327 
328           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
329 
330           /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
331           acc0 = __SMLAD(input1, input2, acc0);
332 
333           /* x[1] and x[2] are packed */
334           in1 = (q15_t) x1;
335           in2 = (q15_t) x2;
336 
337           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
338 
339           /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
340           acc1 = __SMLAD(input1, input2, acc1);
341 
342           /* x[2] and x[3] are packed */
343           in1 = (q15_t) x2;
344           in2 = (q15_t) x3;
345 
346           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
347 
348           /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
349           acc2 = __SMLAD(input1, input2, acc2);
350 
351           /* Read x[4] sample */
352           x0 = *px++;
353 
354           /* x[3] and x[4] are packed */
355           in1 = (q15_t) x3;
356           in2 = (q15_t) x0;
357 
358           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
359 
360           /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
361           acc3 = __SMLAD(input1, input2, acc3);
362 
363           /* Read y[srcBLen - 3] sample */
364           c0 = *py--;
365           /* Read y[srcBLen - 4] sample */
366           c1 = *py--;
367 
368           /* Read x[5] sample */
369           x1 = *px++;
370 
371           /* x[2] and x[3] are packed */
372           in1 = (q15_t) x2;
373           in2 = (q15_t) x3;
374 
375           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
376 
377           /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
378           in1 = (q15_t) c0;
379           in2 = (q15_t) c1;
380 
381           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
382 
383           /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
384           acc0 = __SMLAD(input1, input2, acc0);
385 
386           /* x[3] and x[4] are packed */
387           in1 = (q15_t) x3;
388           in2 = (q15_t) x0;
389 
390           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
391 
392           /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
393           acc1 = __SMLAD(input1, input2, acc1);
394 
395           /* x[4] and x[5] are packed */
396           in1 = (q15_t) x0;
397           in2 = (q15_t) x1;
398 
399           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
400 
401           /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
402           acc2 = __SMLAD(input1, input2, acc2);
403 
404           /* Read x[6] sample */
405           x2 = *px++;
406 
407           /* x[5] and x[6] are packed */
408           in1 = (q15_t) x1;
409           in2 = (q15_t) x2;
410 
411           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
412 
413           /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
414           acc3 = __SMLAD(input1, input2, acc3);
415 
416         } while (--k);
417 
418         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
419          ** No loop unrolling is used. */
420         k = srcBLen % 0x4U;
421 
422         while (k > 0U)
423         {
424           /* Read y[srcBLen - 5] sample */
425           c0 = *py--;
426           /* Read x[7] sample */
427           x3 = *px++;
428 
429           /* Perform the multiply-accumulates */
430           /* acc0 +=  x[4] * y[srcBLen - 5] */
431           acc0 += ((q31_t) x0 * c0);
432           /* acc1 +=  x[5] * y[srcBLen - 5] */
433           acc1 += ((q31_t) x1 * c0);
434           /* acc2 +=  x[6] * y[srcBLen - 5] */
435           acc2 += ((q31_t) x2 * c0);
436           /* acc3 +=  x[7] * y[srcBLen - 5] */
437           acc3 += ((q31_t) x3 * c0);
438 
439           /* Reuse the present samples for the next MAC */
440           x0 = x1;
441           x1 = x2;
442           x2 = x3;
443 
444           /* Decrement the loop counter */
445           k--;
446         }
447 
448         /* Store the result in the accumulator in the destination buffer. */
449         *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
450         *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
451         *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
452         *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
453 
454         /* Increment the pointer pIn1 index, count by 4 */
455         count += 4U;
456 
457         /* Update the inputA and inputB pointers for next MAC calculation */
458         px = pSrc1 + count;
459         py = pSrc2;
460 
461         /* Decrement loop counter */
462         blkCnt--;
463       }
464 
465       /* Loop unrolling: Compute remaining outputs */
466       blkCnt = (uint32_t) blockSize2 % 0x4U;
467 
468 #else
469 
470       /* Initialize blkCnt with number of samples */
471       blkCnt = blockSize2;
472 
473 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
474 
475       while (blkCnt > 0U)
476       {
477         /* Accumulator is made zero for every iteration */
478         sum = 0;
479 
480 #if defined (ARM_MATH_LOOPUNROLL)
481 
482         /* Loop unrolling: Compute 4 outputs at a time */
483         k = srcBLen >> 2U;
484 
485         while (k > 0U)
486         {
487           /* Reading two inputs of SrcA buffer and packing */
488           in1 = (q15_t) *px++;
489           in2 = (q15_t) *px++;
490           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
491 
492           /* Reading two inputs of SrcB buffer and packing */
493           in1 = (q15_t) *py--;
494           in2 = (q15_t) *py--;
495           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
496 
497           /* Perform the multiply-accumulate */
498           sum = __SMLAD(input1, input2, sum);
499 
500           /* Reading two inputs of SrcA buffer and packing */
501           in1 = (q15_t) *px++;
502           in2 = (q15_t) *px++;
503           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
504 
505           /* Reading two inputs of SrcB buffer and packing */
506           in1 = (q15_t) *py--;
507           in2 = (q15_t) *py--;
508           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
509 
510           /* Perform the multiply-accumulate */
511           sum = __SMLAD(input1, input2, sum);
512 
513           /* Decrement loop counter */
514           k--;
515         }
516 
517         /* Loop unrolling: Compute remaining outputs */
518         k = srcBLen % 0x4U;
519 
520 #else
521 
522         /* Initialize blkCnt with number of samples */
523         k = srcBLen;
524 
525 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
526 
527         while (k > 0U)
528         {
529           /* Perform the multiply-accumulate */
530           sum += ((q31_t) * px++ * *py--);
531 
532           /* Decrement loop counter */
533           k--;
534         }
535 
536         /* Store the result in the accumulator in the destination buffer. */
537         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
538 
539         /* Increment the pointer pIn1 index, count by 1 */
540         count++;
541 
542         /* Update the inputA and inputB pointers for next MAC calculation */
543         px = pSrc1 + count;
544         py = pSrc2;
545 
546         /* Decrement loop counter */
547         blkCnt--;
548       }
549     }
550     else
551     {
552       /* If the srcBLen is not a multiple of 4,
553        * the blockSize2 loop cannot be unrolled by 4 */
554       blkCnt = (uint32_t) blockSize2;
555 
556       while (blkCnt > 0U)
557       {
558         /* Accumulator is made zero for every iteration */
559         sum = 0;
560 
561         /* srcBLen number of MACS should be performed */
562         k = srcBLen;
563 
564         while (k > 0U)
565         {
566           /* Perform the multiply-accumulate */
567           sum += ((q31_t) * px++ * *py--);
568 
569           /* Decrement loop counter */
570           k--;
571         }
572 
573         /* Store the result in the accumulator in the destination buffer. */
574         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
575 
576         /* Increment the MAC count */
577         count++;
578 
579         /* Update the inputA and inputB pointers for next MAC calculation */
580         px = pSrc1 + count;
581         py = pSrc2;
582 
583         /* Decrement the loop counter */
584         blkCnt--;
585       }
586     }
587 
588 
589     /* --------------------------
590      * Initializations of stage3
591      * -------------------------*/
592 
593     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
594      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
595      * ....
596      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
597      * sum +=  x[srcALen-1] * y[srcBLen-1]
598      */
599 
600     /* In this stage the MAC operations are decreased by 1 for every iteration.
601        The count variable holds the number of MAC operations performed */
602     count = srcBLen - 1U;
603 
604     /* Working pointer of inputA */
605     if (firstIndex > srcALen)
606     {
607        pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
608     }
609     else
610     {
611        pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
612     }
613     px = pSrc1;
614 
615     /* Working pointer of inputB */
616     pSrc2 = pIn2 + (srcBLen - 1U);
617     py = pSrc2;
618 
619     /* -------------------
620      * Stage3 process
621      * ------------------*/
622 
623     while (blockSize3 > 0)
624     {
625       /* Accumulator is made zero for every iteration */
626       sum = 0;
627 
628 #if defined (ARM_MATH_LOOPUNROLL)
629 
630       /* Loop unrolling: Compute 4 outputs at a time */
631       k = count >> 2U;
632 
633       while (k > 0U)
634       {
635         /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
636         in1 = (q15_t) *px++;
637         in2 = (q15_t) *px++;
638         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
639 
640         /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
641         in1 = (q15_t) *py--;
642         in2 = (q15_t) *py--;
643         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
644 
645         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
646         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
647         sum = __SMLAD(input1, input2, sum);
648 
649         /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
650         in1 = (q15_t) *px++;
651         in2 = (q15_t) *px++;
652         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
653 
654         /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
655         in1 = (q15_t) *py--;
656         in2 = (q15_t) *py--;
657         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
658 
659         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
660         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
661         sum = __SMLAD(input1, input2, sum);
662 
663         /* Decrement loop counter */
664         k--;
665       }
666 
667       /* Loop unrolling: Compute remaining outputs */
668       k = count % 0x4U;
669 
670 #else
671 
672       /* Initialize blkCnt with number of samples */
673       k = count;
674 
675 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
676 
677       while (k > 0U)
678       {
679         /* Perform the multiply-accumulates */
680         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
681         sum += ((q31_t) * px++ * *py--);
682 
683         /* Decrement loop counter */
684         k--;
685       }
686 
687       /* Store the result in the accumulator in the destination buffer. */
688       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
689 
690       /* Update the inputA and inputB pointers for next MAC calculation */
691       px = ++pSrc1;
692       py = pSrc2;
693 
694       /* Decrement MAC count */
695       count--;
696 
697       /* Decrement the loop counter */
698       blockSize3--;
699     }
700 
701     /* Set status as ARM_MATH_SUCCESS */
702     status = ARM_MATH_SUCCESS;
703   }
704 
705   /* Return to application */
706   return (status);
707 
708 #else
709 /* alternate version for CM0_FAMILY */
710 
711   const q7_t *pIn1 = pSrcA;                            /* InputA pointer */
712   const q7_t *pIn2 = pSrcB;                            /* InputB pointer */
713         q31_t sum;                                     /* Accumulator */
714         uint32_t i, j;                                 /* Loop counters */
715         arm_status status;                             /* Status of Partial convolution */
716 
717   /* Check for range of output samples to be calculated */
718   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
719   {
720     /* Set status as ARM_MATH_ARGUMENT_ERROR */
721     status = ARM_MATH_ARGUMENT_ERROR;
722   }
723   else
724   {
725     /* Loop to calculate convolution for output length number of values */
726     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
727     {
728       /* Initialize sum with zero to carry on MAC operations */
729       sum = 0;
730 
731       /* Loop to perform MAC operations according to convolution equation */
732       for (j = 0U; j <= i; j++)
733       {
734         /* Check the array limitations */
735         if (((i - j) < srcBLen) && (j < srcALen))
736         {
737           /* z[i] += x[i-j] * y[j] */
738           sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
739         }
740       }
741 
742       /* Store the output in the destination buffer */
743       pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
744     }
745 
746     /* Set status as ARM_MATH_SUCCESS */
747     status = ARM_MATH_SUCCESS;
748   }
749 
750   /* Return to application */
751   return (status);
752 
753 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
754 
755 }
756 
757 /**
758   @} end of PartialConv group
759  */
760