/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_conv_partial_opt_q15.c
 * Description:  Partial convolution of Q15 sequences
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
28 
29 #include "dsp/filtering_functions.h"
30 
/**
  @ingroup groupFilters
 */
34 
/**
  @addtogroup PartialConv
  @{
 */
39 
40 /**
41   @brief         Partial convolution of Q15 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written
47   @param[in]     firstIndex is the first output sample to start with
48   @param[in]     numPoints  is the number of output points to be computed
49   @param[in]     pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
50   @param[in]     pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
51   @return        execution status
52                    - \ref ARM_MATH_SUCCESS        : Operation successful
53                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
54 
55   @remark
56                    Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
57  */
58 
arm_conv_partial_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints,q15_t * pScratch1,q15_t * pScratch2)59 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_opt_q15(
60   const q15_t * pSrcA,
61         uint32_t srcALen,
62   const q15_t * pSrcB,
63         uint32_t srcBLen,
64         q15_t * pDst,
65         uint32_t firstIndex,
66         uint32_t numPoints,
67         q15_t * pScratch1,
68         q15_t * pScratch2)
69 {
70 
71         q15_t *pOut = pDst;                            /* Output pointer */
72         q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
73         q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch1 */
74         q63_t acc0;                                    /* Accumulator */
75         q31_t x1;                                      /* Temporary variables to hold state and coefficient values */
76         q31_t y1;                                      /* State variables */
77   const q15_t *pIn1;                                   /* InputA pointer */
78   const q15_t *pIn2;                                   /* InputB pointer */
79   const q15_t *px;                                     /* Intermediate inputA pointer */
80         q15_t *py;                                     /* Intermediate inputB pointer */
81         uint32_t j, k, blkCnt;                         /* Loop counter */
82         uint32_t tapCnt;                               /* Loop count */
83         arm_status status;                             /* Status variable */
84 
85 #if defined (ARM_MATH_LOOPUNROLL)
86         q63_t acc1, acc2, acc3;                        /* Accumulator */
87         q31_t x2, x3;                                  /* Temporary variables to hold state and coefficient values */
88         q31_t y2;                                      /* State variables */
89 #endif
90 
91   /* Check for range of output samples to be calculated */
92   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
93   {
94     /* Set status as ARM_MATH_ARGUMENT_ERROR */
95     status = ARM_MATH_ARGUMENT_ERROR;
96   }
97   else
98   {
99     /* The algorithm implementation is based on the lengths of the inputs. */
100     /* srcB is always made to slide across srcA. */
101     /* So srcBLen is always considered as shorter or equal to srcALen */
102     if (srcALen >= srcBLen)
103     {
104       /* Initialization of inputA pointer */
105       pIn1 = pSrcA;
106 
107       /* Initialization of inputB pointer */
108       pIn2 = pSrcB;
109     }
110     else
111     {
112       /* Initialization of inputA pointer */
113       pIn1 = pSrcB;
114 
115       /* Initialization of inputB pointer */
116       pIn2 = pSrcA;
117 
118       /* srcBLen is always considered as shorter or equal to srcALen */
119       j = srcBLen;
120       srcBLen = srcALen;
121       srcALen = j;
122     }
123 
124     /* Temporary pointer for scratch2 */
125     py = pScratch2;
126 
127     /* pointer to take end of scratch2 buffer */
128     pScr2 = pScratch2 + srcBLen - 1;
129 
130     /* points to smaller length sequence */
131     px = pIn2;
132 
133 #if defined (ARM_MATH_LOOPUNROLL)
134 
135     /* Loop unrolling: Compute 4 outputs at a time */
136     k = srcBLen >> 2U;
137 
138     /* Copy smaller length input sequence in reverse order into second scratch buffer */
139     while (k > 0U)
140     {
141       /* copy second buffer in reversal manner */
142       *pScr2-- = *px++;
143       *pScr2-- = *px++;
144       *pScr2-- = *px++;
145       *pScr2-- = *px++;
146 
147       /* Decrement loop counter */
148       k--;
149     }
150 
151     /* Loop unrolling: Compute remaining outputs */
152     k = srcBLen % 0x4U;
153 
154 #else
155 
156     /* Initialize k with number of samples */
157     k = srcBLen;
158 
159 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
160 
161     while (k > 0U)
162     {
163       /* copy second buffer in reversal manner for remaining samples */
164       *pScr2-- = *px++;
165 
166       /* Decrement loop counter */
167       k--;
168     }
169 
170     /* Initialze temporary scratch pointer */
171     pScr1 = pScratch1;
172 
173     /* Assuming scratch1 buffer is aligned by 32-bit */
174     /* Fill (srcBLen - 1U) zeros in scratch buffer */
175     arm_fill_q15(0, pScr1, (srcBLen - 1U));
176 
177     /* Update temporary scratch pointer */
178     pScr1 += (srcBLen - 1U);
179 
180     /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
181 
182     /* Copy (srcALen) samples in scratch buffer */
183     arm_copy_q15(pIn1, pScr1, srcALen);
184 
185     /* Update pointers */
186     pScr1 += srcALen;
187 
188     /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
189     arm_fill_q15(0, pScr1, (srcBLen - 1U));
190 
191     /* Update pointer */
192     pScr1 += (srcBLen - 1U);
193 
194     /* Initialization of pIn2 pointer */
195     pIn2 = py;
196 
197     pScratch1 += firstIndex;
198 
199     pOut = pDst + firstIndex;
200 
201     /* Actual convolution process starts here */
202 
203 #if defined (ARM_MATH_LOOPUNROLL)
204 
205     /* Loop unrolling: Compute 4 outputs at a time */
206     blkCnt = (numPoints) >> 2;
207 
208     while (blkCnt > 0)
209     {
210       /* Initialze temporary scratch pointer as scratch1 */
211       pScr1 = pScratch1;
212 
213       /* Clear Accumlators */
214       acc0 = 0;
215       acc1 = 0;
216       acc2 = 0;
217       acc3 = 0;
218 
219       /* Read two samples from scratch1 buffer */
220       x1 = read_q15x2_ia (&pScr1);
221 
222       /* Read next two samples from scratch1 buffer */
223       x2 = read_q15x2_ia (&pScr1);
224 
225       tapCnt = (srcBLen) >> 2U;
226 
227       while (tapCnt > 0U)
228       {
229 
230         /* Read four samples from smaller buffer */
231         y1 = read_q15x2_ia ((q15_t **) &pIn2);
232         y2 = read_q15x2_ia ((q15_t **) &pIn2);
233 
234         /* multiply and accumulate */
235         acc0 = __SMLALD(x1, y1, acc0);
236         acc2 = __SMLALD(x2, y1, acc2);
237 
238         /* pack input data */
239 #ifndef ARM_MATH_BIG_ENDIAN
240         x3 = __PKHBT(x2, x1, 0);
241 #else
242         x3 = __PKHBT(x1, x2, 0);
243 #endif
244 
245         /* multiply and accumulate */
246         acc1 = __SMLALDX(x3, y1, acc1);
247 
248         /* Read next two samples from scratch1 buffer */
249         x1 = read_q15x2_ia (&pScr1);
250 
251         /* multiply and accumulate */
252         acc0 = __SMLALD(x2, y2, acc0);
253         acc2 = __SMLALD(x1, y2, acc2);
254 
255         /* pack input data */
256 #ifndef ARM_MATH_BIG_ENDIAN
257         x3 = __PKHBT(x1, x2, 0);
258 #else
259         x3 = __PKHBT(x2, x1, 0);
260 #endif
261 
262         acc3 = __SMLALDX(x3, y1, acc3);
263         acc1 = __SMLALDX(x3, y2, acc1);
264 
265         x2 = read_q15x2_ia (&pScr1);
266 
267 #ifndef ARM_MATH_BIG_ENDIAN
268         x3 = __PKHBT(x2, x1, 0);
269 #else
270         x3 = __PKHBT(x1, x2, 0);
271 #endif
272 
273         acc3 = __SMLALDX(x3, y2, acc3);
274 
275         /* Decrement loop counter */
276         tapCnt--;
277       }
278 
279       /* Update scratch pointer for remaining samples of smaller length sequence */
280       pScr1 -= 4U;
281 
282       /* apply same above for remaining samples of smaller length sequence */
283       tapCnt = (srcBLen) & 3U;
284 
285       while (tapCnt > 0U)
286       {
287         /* accumulate the results */
288         acc0 += (*pScr1++ * *pIn2);
289         acc1 += (*pScr1++ * *pIn2);
290         acc2 += (*pScr1++ * *pIn2);
291         acc3 += (*pScr1++ * *pIn2++);
292 
293         pScr1 -= 3U;
294 
295         /* Decrement loop counter */
296         tapCnt--;
297       }
298 
299       blkCnt--;
300 
301       /* Store the results in the accumulators in the destination buffer. */
302 #ifndef  ARM_MATH_BIG_ENDIAN
303       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
304       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
305 #else
306       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
307       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
308 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
309 
310       /* Initialization of inputB pointer */
311       pIn2 = py;
312 
313       pScratch1 += 4U;
314     }
315 
316     /* Loop unrolling: Compute remaining outputs */
317     blkCnt = numPoints & 0x3;
318 
319 #else
320 
321     /* Initialize blkCnt with number of samples */
322     blkCnt = numPoints;
323 
324 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
325 
326     /* Calculate convolution for remaining samples of Bigger length sequence */
327     while (blkCnt > 0)
328     {
329       /* Initialze temporary scratch pointer as scratch1 */
330       pScr1 = pScratch1;
331 
332       /* Clear Accumlators */
333       acc0 = 0;
334 
335       tapCnt = (srcBLen) >> 1U;
336 
337       while (tapCnt > 0U)
338       {
339         /* Read next two samples from scratch1 buffer */
340         x1 = read_q15x2_ia (&pScr1);
341 
342         /* Read two samples from smaller buffer */
343         y1 = read_q15x2_ia ((q15_t **) &pIn2);
344 
345         acc0 = __SMLALD(x1, y1, acc0);
346 
347         /* Decrement the loop counter */
348         tapCnt--;
349       }
350 
351       tapCnt = (srcBLen) & 1U;
352 
353       /* apply same above for remaining samples of smaller length sequence */
354       while (tapCnt > 0U)
355       {
356         /* accumulate the results */
357         acc0 += (*pScr1++ * *pIn2++);
358 
359         /* Decrement loop counter */
360         tapCnt--;
361       }
362 
363       blkCnt--;
364 
365       /* The result is in 2.30 format.  Convert to 1.15 with saturation.
366        ** Then store the output in the destination buffer. */
367       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
368 
369       /* Initialization of inputB pointer */
370       pIn2 = py;
371 
372       pScratch1 += 1U;
373 
374     }
375 
376     /* Set status as ARM_MATH_SUCCESS */
377     status = ARM_MATH_SUCCESS;
378   }
379 
380   /* Return to application */
381   return (status);
382 }
383 
/**
  @} end of PartialConv group
 */
387