1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_partial_fast_opt_q15.c
4  * Description:  Fast Q15 Partial convolution
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup PartialConv
37   @{
38  */
39 
40 /**
41   @brief         Partial convolution of Q15 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written
47   @param[in]     firstIndex is the first output sample to start with
48   @param[in]     numPoints  is the number of output points to be computed
49   @param[in]     pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
50   @param[in]     pScratch2  points to scratch buffer of size min(srcALen, srcBLen)
51   @return        execution status
52                    - \ref ARM_MATH_SUCCESS        : Operation successful
53                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
54 
55   @remark
56                    Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
57  */
58 
arm_conv_partial_fast_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints,q15_t * pScratch1,q15_t * pScratch2)59 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_fast_opt_q15(
60   const q15_t * pSrcA,
61         uint32_t srcALen,
62   const q15_t * pSrcB,
63         uint32_t srcBLen,
64         q15_t * pDst,
65         uint32_t firstIndex,
66         uint32_t numPoints,
67         q15_t * pScratch1,
68         q15_t * pScratch2)
69 {
70         q15_t *pOut = pDst;                            /* Output pointer */
71         q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
72         q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch1 */
73         q31_t acc0;                                    /* Accumulator */
74   const q15_t *pIn1;                                   /* InputA pointer */
75   const q15_t *pIn2;                                   /* InputB pointer */
76   const q15_t *px;                                     /* Intermediate inputA pointer */
77         q15_t *py;                                     /* Intermediate inputB pointer */
78         uint32_t j, k, blkCnt;                         /* Loop counter */
79         uint32_t tapCnt;                               /* Loop count */
80         arm_status status;                             /* Status variable */
81         q31_t x1;                                      /* Temporary variables to hold state and coefficient values */
82         q31_t y1;                                      /* State variables */
83 
84 #if defined (ARM_MATH_LOOPUNROLL)
85         q31_t acc1, acc2, acc3;                        /* Accumulator */
86         q31_t x2, x3;                                  /* Temporary variables to hold state and coefficient values */
87         q31_t y2;                                      /* State variables */
88 #endif
89 
90   /* Check for range of output samples to be calculated */
91   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
92   {
93     /* Set status as ARM_MATH_ARGUMENT_ERROR */
94     status = ARM_MATH_ARGUMENT_ERROR;
95   }
96   else
97   {
98     /* The algorithm implementation is based on the lengths of the inputs. */
99     /* srcB is always made to slide across srcA. */
100     /* So srcBLen is always considered as shorter or equal to srcALen */
101     if (srcALen >= srcBLen)
102     {
103       /* Initialization of inputA pointer */
104       pIn1 = pSrcA;
105 
106       /* Initialization of inputB pointer */
107       pIn2 = pSrcB;
108     }
109     else
110     {
111       /* Initialization of inputA pointer */
112       pIn1 = pSrcB;
113 
114       /* Initialization of inputB pointer */
115       pIn2 = pSrcA;
116 
117       /* srcBLen is always considered as shorter or equal to srcALen */
118       j = srcBLen;
119       srcBLen = srcALen;
120       srcALen = j;
121     }
122 
123     /* Temporary pointer for scratch2 */
124     py = pScratch2;
125 
126     /* pointer to take end of scratch2 buffer */
127     pScr2 = pScratch2 + srcBLen - 1;
128 
129     /* points to smaller length sequence */
130     px = pIn2;
131 
132 #if defined (ARM_MATH_LOOPUNROLL)
133 
134     /* Loop unrolling: Compute 4 outputs at a time */
135     k = srcBLen >> 2U;
136 
137     /* Copy smaller length input sequence in reverse order into second scratch buffer */
138     while (k > 0U)
139     {
140       /* copy second buffer in reversal manner */
141       *pScr2-- = *px++;
142       *pScr2-- = *px++;
143       *pScr2-- = *px++;
144       *pScr2-- = *px++;
145 
146       /* Decrement loop counter */
147       k--;
148     }
149 
150     /* Loop unrolling: Compute remaining outputs */
151     k = srcBLen % 0x4U;
152 
153 #else
154 
155     /* Initialize k with number of samples */
156     k = srcBLen;
157 
158 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
159 
160     while (k > 0U)
161     {
162       /* copy second buffer in reversal manner for remaining samples */
163       *pScr2-- = *px++;
164 
165       /* Decrement loop counter */
166       k--;
167     }
168 
169     /* Initialze temporary scratch pointer */
170     pScr1 = pScratch1;
171 
172     /* Assuming scratch1 buffer is aligned by 32-bit */
173     /* Fill (srcBLen - 1U) zeros in scratch buffer */
174     arm_fill_q15(0, pScr1, (srcBLen - 1U));
175 
176     /* Update temporary scratch pointer */
177     pScr1 += (srcBLen - 1U);
178 
179     /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
180 
181     /* Copy (srcALen) samples in scratch buffer */
182     arm_copy_q15(pIn1, pScr1, srcALen);
183 
184     /* Update pointers */
185     pScr1 += srcALen;
186 
187     /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
188     arm_fill_q15(0, pScr1, (srcBLen - 1U));
189 
190     /* Update pointer */
191     pScr1 += (srcBLen - 1U);
192 
193     /* Initialization of pIn2 pointer */
194     pIn2 = py;
195 
196     pScratch1 += firstIndex;
197 
198     pOut = pDst + firstIndex;
199 
200     /* Actual convolution process starts here */
201 
202 #if defined (ARM_MATH_LOOPUNROLL)
203 
204     /* Loop unrolling: Compute 4 outputs at a time */
205     blkCnt = (numPoints) >> 2;
206 
207     while (blkCnt > 0)
208     {
209       /* Initialze temporary scratch pointer as scratch1 */
210       pScr1 = pScratch1;
211 
212       /* Clear Accumlators */
213       acc0 = 0;
214       acc1 = 0;
215       acc2 = 0;
216       acc3 = 0;
217 
218       /* Read two samples from scratch1 buffer */
219       x1 = read_q15x2_ia (&pScr1);
220 
221       /* Read next two samples from scratch1 buffer */
222       x2 = read_q15x2_ia (&pScr1);
223 
224       tapCnt = (srcBLen) >> 2U;
225 
226       while (tapCnt > 0U)
227       {
228 
229         /* Read four samples from smaller buffer */
230         y1 = read_q15x2_ia ((q15_t **) &pIn2);
231         y2 = read_q15x2_ia ((q15_t **) &pIn2);
232 
233         /* multiply and accumulate */
234         acc0 = __SMLAD(x1, y1, acc0);
235         acc2 = __SMLAD(x2, y1, acc2);
236 
237         /* pack input data */
238 #ifndef ARM_MATH_BIG_ENDIAN
239         x3 = __PKHBT(x2, x1, 0);
240 #else
241         x3 = __PKHBT(x1, x2, 0);
242 #endif
243 
244         /* multiply and accumulate */
245         acc1 = __SMLADX(x3, y1, acc1);
246 
247         /* Read next two samples from scratch1 buffer */
248         x1 = read_q15x2_ia (&pScr1);
249 
250         /* multiply and accumulate */
251         acc0 = __SMLAD(x2, y2, acc0);
252         acc2 = __SMLAD(x1, y2, acc2);
253 
254         /* pack input data */
255 #ifndef ARM_MATH_BIG_ENDIAN
256         x3 = __PKHBT(x1, x2, 0);
257 #else
258         x3 = __PKHBT(x2, x1, 0);
259 #endif
260 
261         acc3 = __SMLADX(x3, y1, acc3);
262         acc1 = __SMLADX(x3, y2, acc1);
263 
264         x2 = read_q15x2_ia (&pScr1);
265 
266 #ifndef ARM_MATH_BIG_ENDIAN
267         x3 = __PKHBT(x2, x1, 0);
268 #else
269         x3 = __PKHBT(x1, x2, 0);
270 #endif
271 
272         /* multiply and accumulate */
273         acc3 = __SMLADX(x3, y2, acc3);
274 
275         /* Decrement loop counter */
276         tapCnt--;
277       }
278 
279       /* Update scratch pointer for remaining samples of smaller length sequence */
280       pScr1 -= 4U;
281 
282       /* apply same above for remaining samples of smaller length sequence */
283       tapCnt = (srcBLen) & 3U;
284 
285       while (tapCnt > 0U)
286       {
287         /* accumulate the results */
288         acc0 += (*pScr1++ * *pIn2);
289         acc1 += (*pScr1++ * *pIn2);
290         acc2 += (*pScr1++ * *pIn2);
291         acc3 += (*pScr1++ * *pIn2++);
292 
293         pScr1 -= 3U;
294 
295         /* Decrement loop counter */
296         tapCnt--;
297       }
298 
299       blkCnt--;
300 
301       /* Store the results in the accumulators in the destination buffer. */
302 #ifndef  ARM_MATH_BIG_ENDIAN
303       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
304       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
305 #else
306       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
307       write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
308 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
309 
310       /* Initialization of inputB pointer */
311       pIn2 = py;
312 
313       pScratch1 += 4U;
314     }
315 
316     /* Loop unrolling: Compute remaining outputs */
317     blkCnt = numPoints & 0x3;
318 
319 #else
320 
321     /* Initialize blkCnt with number of samples */
322     blkCnt = numPoints;
323 
324 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
325 
326     /* Calculate convolution for remaining samples of Bigger length sequence */
327     while (blkCnt > 0)
328     {
329       /* Initialze temporary scratch pointer as scratch1 */
330       pScr1 = pScratch1;
331 
332       /* Clear Accumlators */
333       acc0 = 0;
334 
335       tapCnt = (srcBLen) >> 1U;
336 
337       while (tapCnt > 0U)
338       {
339         /* Read next two samples from scratch1 buffer */
340         x1 = read_q15x2_ia (&pScr1);
341 
342         /* Read two samples from smaller buffer */
343         y1 = read_q15x2_ia ((q15_t **) &pIn2);
344 
345         /* multiply and accumulate */
346         acc0 = __SMLAD(x1, y1, acc0);
347 
348         /* Decrement loop counter */
349         tapCnt--;
350       }
351 
352       tapCnt = (srcBLen) & 1U;
353 
354       /* apply same above for remaining samples of smaller length sequence */
355       while (tapCnt > 0U)
356       {
357         /* accumulate the results */
358         acc0 += (*pScr1++ * *pIn2++);
359 
360         /* Decrement loop counter */
361         tapCnt--;
362       }
363 
364       blkCnt--;
365 
366       /* The result is in 2.30 format.  Convert to 1.15 with saturation.
367        ** Then store the output in the destination buffer. */
368       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
369 
370       /* Initialization of inputB pointer */
371       pIn2 = py;
372 
373       pScratch1 += 1U;
374 
375     }
376 
377     /* Set status as ARM_MATH_SUCCESS */
378     status = ARM_MATH_SUCCESS;
379   }
380 
381   /* Return to application */
382   return (status);
383 }
384 
385 /**
386   @} end of PartialConv group
387  */
388