1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_partial_q15.c
4  * Description:  Partial convolution of Q15 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup PartialConv
37   @{
38  */
39 
40 /**
41   @brief         Partial convolution of Q15 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written
47   @param[in]     firstIndex is the first output sample to start with
48   @param[in]     numPoints  is the number of output points to be computed
49   @return        execution status
50                    - \ref ARM_MATH_SUCCESS        : Operation successful
51                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52 
53   @remark
54                    Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
55   @remark
56                    Refer to \ref arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers.
57  */
58 
arm_conv_partial_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints)59 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_q15(
60   const q15_t * pSrcA,
61         uint32_t srcALen,
62   const q15_t * pSrcB,
63         uint32_t srcBLen,
64         q15_t * pDst,
65         uint32_t firstIndex,
66         uint32_t numPoints)
67 {
68 
69 #if defined (ARM_MATH_DSP)
70 
71   const q15_t *pIn1;                                   /* InputA pointer */
72   const q15_t *pIn2;                                   /* InputB pointer */
73         q15_t *pOut = pDst;                            /* Output pointer */
74         q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
75   const q15_t *px;                                     /* Intermediate inputA pointer */
76   const q15_t *py;                                     /* Intermediate inputB pointer */
77   const q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
78         q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables to hold state and coefficient values */
79         int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
80         uint32_t j, k, count, blkCnt, check;
81         arm_status status;                             /* Status of Partial convolution */
82 
83   /* Check for range of output samples to be calculated */
84   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
85   {
86     /* Set status as ARM_MATH_ARGUMENT_ERROR */
87     status = ARM_MATH_ARGUMENT_ERROR;
88   }
89   else
90   {
91     /* The algorithm implementation is based on the lengths of the inputs. */
92     /* srcB is always made to slide across srcA. */
93     /* So srcBLen is always considered as shorter or equal to srcALen */
94     if (srcALen >= srcBLen)
95     {
96       /* Initialization of inputA pointer */
97       pIn1 = pSrcA;
98 
99       /* Initialization of inputB pointer */
100       pIn2 = pSrcB;
101     }
102     else
103     {
104       /* Initialization of inputA pointer */
105       pIn1 = pSrcB;
106 
107       /* Initialization of inputB pointer */
108       pIn2 = pSrcA;
109 
110       /* srcBLen is always considered as shorter or equal to srcALen */
111       j = srcBLen;
112       srcBLen = srcALen;
113       srcALen = j;
114     }
115 
116     /* Conditions to check which loopCounter holds
117      * the first and last indices of the output samples to be calculated. */
118     check = firstIndex + numPoints;
119     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
120     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
121     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
122     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  (int32_t)numPoints) : 0;
123     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
124     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
125 
126     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
127     /* The function is internally
128      * divided into three stages according to the number of multiplications that has to be
129      * taken place between inputA samples and inputB samples. In the first stage of the
130      * algorithm, the multiplications increase by one for every iteration.
131      * In the second stage of the algorithm, srcBLen number of multiplications are done.
132      * In the third stage of the algorithm, the multiplications decrease by one
133      * for every iteration. */
134 
135     /* Set the output pointer to point to the firstIndex
136      * of the output sample to be calculated. */
137     pOut = pDst + firstIndex;
138 
139     /* --------------------------
140      * Initializations of stage1
141      * -------------------------*/
142 
143     /* sum = x[0] * y[0]
144      * sum = x[0] * y[1] + x[1] * y[0]
145      * ....
146      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
147      */
148 
149     /* In this stage the MAC operations are increased by 1 for every iteration.
150        The count variable holds the number of MAC operations performed.
151        Since the partial convolution starts from firstIndex
152        Number of Macs to be performed is firstIndex + 1 */
153     count = 1U + firstIndex;
154 
155     /* Working pointer of inputA */
156     px = pIn1;
157 
158     /* Working pointer of inputB */
159     pSrc2 = pIn2 + firstIndex;
160     py = pSrc2;
161 
162     /* ------------------------
163      * Stage1 process
164      * ----------------------*/
165 
166     /* For loop unrolling by 4, this stage is divided into two. */
167     /* First part of this stage computes the MAC operations less than 4 */
168     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
169 
170     /* The first part of the stage starts here */
171     while ((count < 4U) && (blockSize1 > 0))
172     {
173       /* Accumulator is made zero for every iteration */
174       sum = 0;
175 
176       /* Loop over number of MAC operations between
177        * inputA samples and inputB samples */
178       k = count;
179 
180       while (k > 0U)
181       {
182         /* Perform the multiply-accumulates */
183         sum = __SMLALD(*px++, *py--, sum);
184 
185         /* Decrement loop counter */
186         k--;
187       }
188 
189       /* Store the result in the accumulator in the destination buffer. */
190       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
191 
192       /* Update the inputA and inputB pointers for next MAC calculation */
193       py = ++pSrc2;
194       px = pIn1;
195 
196       /* Increment MAC count */
197       count++;
198 
199       /* Decrement loop counter */
200       blockSize1--;
201     }
202 
203     /* The second part of the stage starts here */
204     /* The internal loop, over count, is unrolled by 4 */
205     /* To, read the last two inputB samples using SIMD:
206      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
207     py = py - 1;
208 
209     while (blockSize1 > 0)
210     {
211       /* Accumulator is made zero for every iteration */
212       sum = 0;
213 
214       /* Apply loop unrolling and compute 4 MACs simultaneously. */
215       k = count >> 2U;
216 
217       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
218          a second loop below computes MACs for the remaining 1 to 3 samples. */
219       while (k > 0U)
220       {
221         /* Perform the multiply-accumulate */
222         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
223         sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
224         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
225         sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
226 
227         /* Decrement loop counter */
228         k--;
229       }
230 
231       /* For the next MAC operations, the pointer py is used without SIMD
232        * So, py is incremented by 1 */
233       py = py + 1U;
234 
235       /* If the count is not a multiple of 4, compute any remaining MACs here.
236          No loop unrolling is used. */
237       k = count % 0x4U;
238 
239       while (k > 0U)
240       {
241         /* Perform the multiply-accumulates */
242         sum = __SMLALD(*px++, *py--, sum);
243 
244         /* Decrement loop counter */
245         k--;
246       }
247 
248       /* Store the result in the accumulator in the destination buffer. */
249       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
250 
251       /* Update the inputA and inputB pointers for next MAC calculation */
252       py = ++pSrc2 - 1U;
253       px = pIn1;
254 
255       /* Increment MAC count */
256       count++;
257 
258       /* Decrement loop counter */
259       blockSize1--;
260     }
261 
262     /* --------------------------
263      * Initializations of stage2
264      * ------------------------*/
265 
266     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
267      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
268      * ....
269      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
270      */
271 
272     /* Working pointer of inputA */
273     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
274     {
275       pSrc1 = pIn1 + firstIndex - srcBLen + 1;
276     }
277     else
278     {
279       pSrc1 = pIn1;
280     }
281     px = pSrc1;
282 
283     /* Working pointer of inputB */
284     pSrc2 = pIn2 + (srcBLen - 1U);
285     py = pSrc2;
286 
287     /* count is the index by which the pointer pIn1 to be incremented */
288     count = 0U;
289 
290     /* -------------------
291      * Stage2 process
292      * ------------------*/
293 
294     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
295      * So, to loop unroll over blockSize2,
296      * srcBLen should be greater than or equal to 4 */
297     if (srcBLen >= 4U)
298     {
299       /* Loop unrolling: Compute 4 outputs at a time */
300       blkCnt = ((uint32_t) blockSize2 >> 2U);
301 
302       while (blkCnt > 0U)
303       {
304         py = py - 1U;
305 
306         /* Set all accumulators to zero */
307         acc0 = 0;
308         acc1 = 0;
309         acc2 = 0;
310         acc3 = 0;
311 
312 
313         /* read x[0], x[1] samples */
314         x0 = read_q15x2 ((q15_t *) px);
315         /* read x[1], x[2] samples */
316         x1 = read_q15x2 ((q15_t *) px + 1);
317         px += 2U;
318 
319 
320         /* Apply loop unrolling and compute 4 MACs simultaneously. */
321         k = srcBLen >> 2U;
322 
323         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
324          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
325         do
326         {
327           /* Read the last two inputB samples using SIMD:
328            * y[srcBLen - 1] and y[srcBLen - 2] */
329           c0 = read_q15x2_da ((q15_t **) &py);
330 
331           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
332           acc0 = __SMLALDX(x0, c0, acc0);
333 
334           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
335           acc1 = __SMLALDX(x1, c0, acc1);
336 
337           /* Read x[2], x[3] */
338           x2 = read_q15x2 ((q15_t *) px);
339 
340           /* Read x[3], x[4] */
341           x3 = read_q15x2 ((q15_t *) px + 1);
342 
343           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
344           acc2 = __SMLALDX(x2, c0, acc2);
345 
346           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
347           acc3 = __SMLALDX(x3, c0, acc3);
348 
349           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
350           c0 = read_q15x2_da ((q15_t **) &py);
351 
352           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
353           acc0 = __SMLALDX(x2, c0, acc0);
354 
355           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
356           acc1 = __SMLALDX(x3, c0, acc1);
357 
358           /* Read x[4], x[5] */
359           x0 = read_q15x2 ((q15_t *) px + 2);
360 
361           /* Read x[5], x[6] */
362           x1 = read_q15x2 ((q15_t *) px + 3);
363           px += 4U;
364 
365           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
366           acc2 = __SMLALDX(x0, c0, acc2);
367 
368           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
369           acc3 = __SMLALDX(x1, c0, acc3);
370 
371         } while (--k);
372 
373         /* For the next MAC operations, SIMD is not used
374          * So, the 16 bit pointer if inputB, py is updated */
375 
376         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
377          ** No loop unrolling is used. */
378         k = srcBLen % 0x4U;
379 
380         if (k == 1U)
381         {
382           /* Read y[srcBLen - 5] */
383           c0 = *(py+1);
384 #ifdef  ARM_MATH_BIG_ENDIAN
385           c0 = c0 << 16U;
386 #else
387           c0 = c0 & 0x0000FFFF;
388 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
389 
390           /* Read x[7] */
391           x3 = read_q15x2 ((q15_t *) px);
392           px++;
393 
394           /* Perform the multiply-accumulate */
395           acc0 = __SMLALD (x0, c0, acc0);
396           acc1 = __SMLALD (x1, c0, acc1);
397           acc2 = __SMLALDX(x1, c0, acc2);
398           acc3 = __SMLALDX(x3, c0, acc3);
399         }
400 
401         if (k == 2U)
402         {
403           /* Read y[srcBLen - 5], y[srcBLen - 6] */
404           c0 = read_q15x2 ((q15_t *) py);
405 
406           /* Read x[7], x[8] */
407           x3 = read_q15x2 ((q15_t *) px);
408 
409           /* Read x[9] */
410           x2 = read_q15x2 ((q15_t *) px + 1);
411           px += 2U;
412 
413           /* Perform the multiply-accumulate */
414           acc0 = __SMLALDX(x0, c0, acc0);
415           acc1 = __SMLALDX(x1, c0, acc1);
416           acc2 = __SMLALDX(x3, c0, acc2);
417           acc3 = __SMLALDX(x2, c0, acc3);
418         }
419 
420         if (k == 3U)
421         {
422           /* Read y[srcBLen - 5], y[srcBLen - 6] */
423           c0 = read_q15x2 ((q15_t *) py);
424 
425           /* Read x[7], x[8] */
426           x3 = read_q15x2 ((q15_t *) px);
427 
428           /* Read x[9] */
429           x2 = read_q15x2 ((q15_t *) px + 1);
430 
431           /* Perform the multiply-accumulate */
432           acc0 = __SMLALDX(x0, c0, acc0);
433           acc1 = __SMLALDX(x1, c0, acc1);
434           acc2 = __SMLALDX(x3, c0, acc2);
435           acc3 = __SMLALDX(x2, c0, acc3);
436 
437           c0 = *(py-1);
438 #ifdef  ARM_MATH_BIG_ENDIAN
439           c0 = c0 << 16U;
440 #else
441           c0 = c0 & 0x0000FFFF;
442 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
443 
444           /* Read x[10] */
445           x3 =  read_q15x2 ((q15_t *) px + 2);
446           px += 3U;
447 
448           /* Perform the multiply-accumulates */
449           acc0 = __SMLALDX(x1, c0, acc0);
450           acc1 = __SMLALD (x2, c0, acc1);
451           acc2 = __SMLALDX(x2, c0, acc2);
452           acc3 = __SMLALDX(x3, c0, acc3);
453         }
454 
455         /* Store the results in the accumulators in the destination buffer. */
456         {
457           int32_t sat0 = __SSAT((acc0 >> 15), 16);
458           int32_t sat1 = __SSAT((acc1 >> 15), 16);
459           int32_t sat2 = __SSAT((acc2 >> 15), 16);
460           int32_t sat3 = __SSAT((acc3 >> 15), 16);
461 #ifndef  ARM_MATH_BIG_ENDIAN
462           write_q15x2_ia (&pOut, __PKHBT(sat0, sat1, 16));
463           write_q15x2_ia (&pOut, __PKHBT(sat2, sat3, 16));
464 #else
465           write_q15x2_ia (&pOut, __PKHBT(sat1, sat0, 16));
466           write_q15x2_ia (&pOut, __PKHBT(sat3, sat2, 16));
467 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
468         }
469 
470         /* Increment the pointer pIn1 index, count by 4 */
471         count += 4U;
472 
473         /* Update the inputA and inputB pointers for next MAC calculation */
474         px = pSrc1 + count;
475         py = pSrc2;
476 
477         /* Decrement loop counter */
478         blkCnt--;
479       }
480 
481       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
482          No loop unrolling is used. */
483       blkCnt = (uint32_t) blockSize2 % 0x4U;
484 
485       while (blkCnt > 0U)
486       {
487         /* Accumulator is made zero for every iteration */
488         sum = 0;
489 
490         /* Apply loop unrolling and compute 4 MACs simultaneously. */
491         k = srcBLen >> 2U;
492 
493         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
494            a second loop below computes MACs for the remaining 1 to 3 samples. */
495         while (k > 0U)
496         {
497           /* Perform the multiply-accumulates */
498           sum += (q63_t) ((q31_t) *px++ * *py--);
499           sum += (q63_t) ((q31_t) *px++ * *py--);
500           sum += (q63_t) ((q31_t) *px++ * *py--);
501           sum += (q63_t) ((q31_t) *px++ * *py--);
502 
503           /* Decrement loop counter */
504           k--;
505         }
506 
507         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
508          ** No loop unrolling is used. */
509         k = srcBLen % 0x4U;
510 
511         while (k > 0U)
512         {
513           /* Perform the multiply-accumulate */
514           sum += (q63_t) ((q31_t) *px++ * *py--);
515 
516           /* Decrement loop counter */
517           k--;
518         }
519 
520         /* Store the result in the accumulator in the destination buffer. */
521         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
522 
523         /* Increment the pointer pIn1 index, count by 1 */
524         count++;
525 
526         /* Update the inputA and inputB pointers for next MAC calculation */
527         px = pSrc1 + count;
528         py = pSrc2;
529 
530         /* Decrement loop counter */
531         blkCnt--;
532       }
533     }
534     else
535     {
536       /* If the srcBLen is not a multiple of 4,
537        * the blockSize2 loop cannot be unrolled by 4 */
538       blkCnt = (uint32_t) blockSize2;
539 
540       while (blkCnt > 0U)
541       {
542         /* Accumulator is made zero for every iteration */
543         sum = 0;
544 
545         /* srcBLen number of MACS should be performed */
546         k = srcBLen;
547 
548         while (k > 0U)
549         {
550           /* Perform the multiply-accumulate */
551           sum += (q63_t) ((q31_t) *px++ * *py--);
552 
553           /* Decrement the loop counter */
554           k--;
555         }
556 
557         /* Store the result in the accumulator in the destination buffer. */
558         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
559 
560         /* Increment the MAC count */
561         count++;
562 
563         /* Update the inputA and inputB pointers for next MAC calculation */
564         px = pSrc1 + count;
565         py = pSrc2;
566 
567         /* Decrement the loop counter */
568         blkCnt--;
569       }
570     }
571 
572 
573     /* --------------------------
574      * Initializations of stage3
575      * -------------------------*/
576 
577     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
578      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
579      * ....
580      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
581      * sum +=  x[srcALen-1] * y[srcBLen-1]
582      */
583 
584     /* In this stage the MAC operations are decreased by 1 for every iteration.
585        The count variable holds the number of MAC operations performed */
586     count = srcBLen - 1U;
587 
588     /* Working pointer of inputA */
589     if (firstIndex > srcALen)
590     {
591        pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
592     }
593     else
594     {
595        pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
596     }
597     px = pSrc1;
598 
599     /* Working pointer of inputB */
600     pSrc2 = pIn2 + (srcBLen - 1U);
601     pIn2 = pSrc2 - 1U;
602     py = pIn2;
603 
604     /* -------------------
605      * Stage3 process
606      * ------------------*/
607 
608     /* For loop unrolling by 4, this stage is divided into two. */
609     /* First part of this stage computes the MAC operations greater than 4 */
610     /* Second part of this stage computes the MAC operations less than or equal to 4 */
611 
612     /* The first part of the stage starts here */
613     j = count >> 2U;
614 
615     while ((j > 0U) && (blockSize3 > 0))
616     {
617       /* Accumulator is made zero for every iteration */
618       sum = 0;
619 
620       /* Apply loop unrolling and compute 4 MACs simultaneously. */
621       k = count >> 2U;
622 
623       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
624        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
625       while (k > 0U)
626       {
627         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
628          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
629         sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
630         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
631          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
632         sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
633 
634         /* Decrement loop counter */
635         k--;
636       }
637 
638       /* For the next MAC operations, the pointer py is used without SIMD
639        * So, py is incremented by 1 */
640       py = py + 1U;
641 
642       /* If the count is not a multiple of 4, compute any remaining MACs here.
643        ** No loop unrolling is used. */
644       k = count % 0x4U;
645 
646       while (k > 0U)
647       {
648         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
649         sum = __SMLALD(*px++, *py--, sum);
650 
651         /* Decrement loop counter */
652         k--;
653       }
654 
655       /* Store the result in the accumulator in the destination buffer. */
656       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
657 
658       /* Update the inputA and inputB pointers for next MAC calculation */
659       px = ++pSrc1;
660       py = pIn2;
661 
662       /* Decrement MAC count */
663       count--;
664 
665       /* Decrement loop counter */
666       blockSize3--;
667 
668       j--;
669     }
670 
671     /* The second part of the stage starts here */
672     /* SIMD is not used for the next MAC operations,
673      * so pointer py is updated to read only one sample at a time */
674     py = py + 1U;
675 
676     while (blockSize3 > 0)
677     {
678       /* Accumulator is made zero for every iteration */
679       sum = 0;
680 
681       /* Apply loop unrolling and compute 4 MACs simultaneously. */
682       k = count;
683 
684       while (k > 0U)
685       {
686         /* Perform the multiply-accumulates */
687         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
688         sum = __SMLALD(*px++, *py--, sum);
689 
690         /* Decrement loop counter */
691         k--;
692       }
693 
694       /* Store the result in the accumulator in the destination buffer. */
695       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
696 
697       /* Update the inputA and inputB pointers for next MAC calculation */
698       px = ++pSrc1;
699       py = pSrc2;
700 
701       /* Decrement MAC count */
702       count--;
703 
704       /* Decrement the loop counter */
705       blockSize3--;
706     }
707 
708     /* Set status as ARM_MATH_SUCCESS */
709     status = ARM_MATH_SUCCESS;
710   }
711 
712   /* Return to application */
713   return (status);
714 
715 #else /* #if defined (ARM_MATH_DSP) */
716 
717   const q15_t *pIn1 = pSrcA;                           /* InputA pointer */
718   const q15_t *pIn2 = pSrcB;                           /* InputB pointer */
719         q63_t sum;                                     /* Accumulator */
720         uint32_t i, j;                                 /* Loop counters */
721         arm_status status;                             /* Status of Partial convolution */
722 
723   /* Check for range of output samples to be calculated */
724   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
725   {
726     /* Set status as ARM_MATH_ARGUMENT_ERROR */
727     status = ARM_MATH_ARGUMENT_ERROR;
728   }
729   else
730   {
731     /* Loop to calculate convolution for output length number of values */
732     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
733     {
734       /* Initialize sum with zero to carry on MAC operations */
735       sum = 0;
736 
737       /* Loop to perform MAC operations according to convolution equation */
738       for (j = 0U; j <= i; j++)
739       {
740         /* Check the array limitations */
741         if (((i - j) < srcBLen) && (j < srcALen))
742         {
743           /* z[i] += x[i-j] * y[j] */
744           sum += ((q31_t) pIn1[j] * pIn2[i - j]);
745         }
746       }
747 
748       /* Store the output in the destination buffer */
749       pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
750     }
751 
752     /* Set status as ARM_MATH_SUCCESS */
753     status = ARM_MATH_SUCCESS;
754   }
755 
756   /* Return to application */
757   return (status);
758 
759 #endif /* #if defined (ARM_MATH_DSP) */
760 
761 }
762 
763 /**
764   @} end of PartialConv group
765  */
766