1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_partial_opt_q7.c
4  * Description:  Partial convolution of Q7 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup PartialConv
37   @{
38  */
39 
40 /**
41   @brief         Partial convolution of Q7 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written
47   @param[in]     firstIndex is the first output sample to start with
48   @param[in]     numPoints  is the number of output points to be computed
49   @param[in]     pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
50   @param[in]     pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
51   @return        execution status
52                    - \ref ARM_MATH_SUCCESS        : Operation successful
53                    - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
54  */
55 
arm_conv_partial_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,uint32_t firstIndex,uint32_t numPoints,q15_t * pScratch1,q15_t * pScratch2)56 arm_status arm_conv_partial_opt_q7(
57   const q7_t * pSrcA,
58         uint32_t srcALen,
59   const q7_t * pSrcB,
60         uint32_t srcBLen,
61         q7_t * pDst,
62         uint32_t firstIndex,
63         uint32_t numPoints,
64         q15_t * pScratch1,
65         q15_t * pScratch2)
66 {
67         q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
68         q15_t x4;                                      /* Temporary input variable */
69   const q7_t *pIn1, *pIn2;                             /* InputA and inputB pointer */
70         uint32_t j, k, blkCnt, tapCnt;                 /* Loop counter */
71   const q7_t *px;                                      /* Temporary input1 pointer */
72         q15_t *py;                                     /* Temporary input2 pointer */
73         q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
74         q31_t x1, x2, x3, y1;                          /* Temporary input variables */
75         arm_status status;
76         q7_t *pOut = pDst;                             /* Output pointer */
77         q7_t out0, out1, out2, out3;                   /* Temporary variables */
78 
79   /* Check for range of output samples to be calculated */
80   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
81   {
82     /* Set status as ARM_MATH_ARGUMENT_ERROR */
83     status = ARM_MATH_ARGUMENT_ERROR;
84   }
85   else
86   {
87     /* The algorithm implementation is based on the lengths of the inputs. */
88     /* srcB is always made to slide across srcA. */
89     /* So srcBLen is always considered as shorter or equal to srcALen */
90     if (srcALen >= srcBLen)
91     {
92       /* Initialization of inputA pointer */
93       pIn1 = pSrcA;
94 
95       /* Initialization of inputB pointer */
96       pIn2 = pSrcB;
97     }
98     else
99     {
100       /* Initialization of inputA pointer */
101       pIn1 = pSrcB;
102 
103       /* Initialization of inputB pointer */
104       pIn2 = pSrcA;
105 
106       /* srcBLen is always considered as shorter or equal to srcALen */
107       j = srcBLen;
108       srcBLen = srcALen;
109       srcALen = j;
110     }
111 
112     /* pointer to take end of scratch2 buffer */
113     pScr2 = pScratch2;
114 
115     /* points to smaller length sequence */
116     px = pIn2 + srcBLen - 1;
117 
118     /* Apply loop unrolling and do 4 Copies simultaneously. */
119     k = srcBLen >> 2U;
120 
121     /* First part of the processing with loop unrolling copies 4 data points at a time.
122      ** a second loop below copies for the remaining 1 to 3 samples. */
123     while (k > 0U)
124     {
125       /* copy second buffer in reversal manner */
126       x4 = (q15_t) *px--;
127       *pScr2++ = x4;
128       x4 = (q15_t) *px--;
129       *pScr2++ = x4;
130       x4 = (q15_t) *px--;
131       *pScr2++ = x4;
132       x4 = (q15_t) *px--;
133       *pScr2++ = x4;
134 
135       /* Decrement loop counter */
136       k--;
137     }
138 
139     /* If the count is not a multiple of 4, copy remaining samples here.
140      ** No loop unrolling is used. */
141     k = srcBLen % 0x4U;
142 
143     while (k > 0U)
144     {
145       /* copy second buffer in reversal manner for remaining samples */
146       x4 = (q15_t) *px--;
147       *pScr2++ = x4;
148 
149       /* Decrement loop counter */
150       k--;
151     }
152 
153     /* Initialze temporary scratch pointer */
154     pScr1 = pScratch1;
155 
156     /* Fill (srcBLen - 1U) zeros in scratch buffer */
157     arm_fill_q15(0, pScr1, (srcBLen - 1U));
158 
159     /* Update temporary scratch pointer */
160     pScr1 += (srcBLen - 1U);
161 
162     /* Copy (srcALen) samples in scratch buffer */
163     /* Apply loop unrolling and do 4 Copies simultaneously. */
164     k = srcALen >> 2U;
165 
166     /* First part of the processing with loop unrolling copies 4 data points at a time.
167      ** a second loop below copies for the remaining 1 to 3 samples. */
168     while (k > 0U)
169     {
170       /* copy second buffer in reversal manner */
171       x4 = (q15_t) *pIn1++;
172       *pScr1++ = x4;
173       x4 = (q15_t) *pIn1++;
174       *pScr1++ = x4;
175       x4 = (q15_t) *pIn1++;
176       *pScr1++ = x4;
177       x4 = (q15_t) *pIn1++;
178       *pScr1++ = x4;
179 
180       /* Decrement loop counter */
181       k--;
182     }
183 
184     /* If the count is not a multiple of 4, copy remaining samples here.
185      ** No loop unrolling is used. */
186     k = srcALen % 0x4U;
187 
188     while (k > 0U)
189     {
190       /* copy second buffer in reversal manner for remaining samples */
191       x4 = (q15_t) *pIn1++;
192       *pScr1++ = x4;
193 
194       /* Decrement the loop counter */
195       k--;
196     }
197 
198     /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
199     arm_fill_q15(0, pScr1, (srcBLen - 1U));
200 
201     /* Update pointer */
202     pScr1 += (srcBLen - 1U);
203 
204 
205     /* Temporary pointer for scratch2 */
206     py = pScratch2;
207 
208     /* Initialization of pIn2 pointer */
209     pIn2 = (q7_t *) py;
210 
211     pScr2 = py;
212 
213     pOut = pDst + firstIndex;
214 
215     pScratch1 += firstIndex;
216 
217     /* Actual convolution process starts here */
218     blkCnt = (numPoints) >> 2;
219 
220     while (blkCnt > 0)
221     {
222       /* Initialize temporary scratch pointer as scratch1 */
223       pScr1 = pScratch1;
224 
225       /* Clear Accumulators */
226       acc0 = 0;
227       acc1 = 0;
228       acc2 = 0;
229       acc3 = 0;
230 
231       /* Read two samples from scratch1 buffer */
232       x1 = read_q15x2_ia (&pScr1);
233 
234       /* Read next two samples from scratch1 buffer */
235       x2 = read_q15x2_ia (&pScr1);
236 
237       tapCnt = (srcBLen) >> 2U;
238 
239       while (tapCnt > 0U)
240       {
241         /* Read four samples from smaller buffer */
242         y1 = read_q15x2_ia (&pScr2);
243 
244         /* multiply and accumulate */
245         acc0 = __SMLAD(x1, y1, acc0);
246         acc2 = __SMLAD(x2, y1, acc2);
247 
248         /* pack input data */
249 #ifndef ARM_MATH_BIG_ENDIAN
250         x3 = __PKHBT(x2, x1, 0);
251 #else
252         x3 = __PKHBT(x1, x2, 0);
253 #endif
254 
255         /* multiply and accumulate */
256         acc1 = __SMLADX(x3, y1, acc1);
257 
258         /* Read next two samples from scratch1 buffer */
259         x1 = read_q15x2_ia (&pScr1);
260 
261         /* pack input data */
262 #ifndef ARM_MATH_BIG_ENDIAN
263         x3 = __PKHBT(x1, x2, 0);
264 #else
265         x3 = __PKHBT(x2, x1, 0);
266 #endif
267 
268         acc3 = __SMLADX(x3, y1, acc3);
269 
270         /* Read four samples from smaller buffer */
271         y1 = read_q15x2_ia (&pScr2);
272 
273         acc0 = __SMLAD(x2, y1, acc0);
274 
275         acc2 = __SMLAD(x1, y1, acc2);
276 
277         acc1 = __SMLADX(x3, y1, acc1);
278 
279         x2 = read_q15x2_ia (&pScr1);
280 
281 #ifndef ARM_MATH_BIG_ENDIAN
282         x3 = __PKHBT(x2, x1, 0);
283 #else
284         x3 = __PKHBT(x1, x2, 0);
285 #endif
286 
287         acc3 = __SMLADX(x3, y1, acc3);
288 
289         /* Decrement loop counter */
290         tapCnt--;
291       }
292 
293       /* Update scratch pointer for remaining samples of smaller length sequence */
294       pScr1 -= 4U;
295 
296       /* apply same above for remaining samples of smaller length sequence */
297       tapCnt = (srcBLen) & 3U;
298 
299       while (tapCnt > 0U)
300       {
301         /* accumulate the results */
302         acc0 += (*pScr1++ * *pScr2);
303         acc1 += (*pScr1++ * *pScr2);
304         acc2 += (*pScr1++ * *pScr2);
305         acc3 += (*pScr1++ * *pScr2++);
306 
307         pScr1 -= 3U;
308 
309         /* Decrement loop counter */
310         tapCnt--;
311       }
312 
313       blkCnt--;
314 
315       /* Store the result in the accumulator in the destination buffer. */
316       out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
317       out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
318       out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
319       out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
320 
321       write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
322 
323       /* Initialization of inputB pointer */
324       pScr2 = py;
325 
326       pScratch1 += 4U;
327     }
328 
329     blkCnt = (numPoints) & 0x3;
330 
331     /* Calculate convolution for remaining samples of Bigger length sequence */
332     while (blkCnt > 0)
333     {
334       /* Initialze temporary scratch pointer as scratch1 */
335       pScr1 = pScratch1;
336 
337       /* Clear Accumlators */
338       acc0 = 0;
339 
340       tapCnt = (srcBLen) >> 1U;
341 
342       while (tapCnt > 0U)
343       {
344 
345         /* Read next two samples from scratch1 buffer */
346         x1 = read_q15x2_ia (&pScr1);
347 
348         /* Read two samples from smaller buffer */
349         y1 = read_q15x2_ia (&pScr2);
350 
351         acc0 = __SMLAD(x1, y1, acc0);
352 
353         /* Decrement the loop counter */
354         tapCnt--;
355       }
356 
357       tapCnt = (srcBLen) & 1U;
358 
359       /* apply same above for remaining samples of smaller length sequence */
360       while (tapCnt > 0U)
361       {
362 
363         /* accumulate the results */
364         acc0 += (*pScr1++ * *pScr2++);
365 
366         /* Decrement loop counter */
367         tapCnt--;
368       }
369 
370       blkCnt--;
371 
372       /* Store the result in the accumulator in the destination buffer. */
373       *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
374 
375       /* Initialization of inputB pointer */
376       pScr2 = py;
377 
378       pScratch1 += 1U;
379     }
380 
381     /* Set status as ARM_MATH_SUCCESS */
382     status = ARM_MATH_SUCCESS;
383   }
384 
385   return (status);
386 }
387 
388 /**
389   @} end of PartialConv group
390  */
391