1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_fast_opt_q15.c
4 * Description: Fast Q15 Partial convolution
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup PartialConv
37 @{
38 */
39
40 /**
41 @brief Partial convolution of Q15 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
50 @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen)
51 @return execution status
52 - \ref ARM_MATH_SUCCESS : Operation successful
53 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
54
55 @remark
56 Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
57 */
58
arm_conv_partial_fast_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints,q15_t * pScratch1,q15_t * pScratch2)59 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_fast_opt_q15(
60 const q15_t * pSrcA,
61 uint32_t srcALen,
62 const q15_t * pSrcB,
63 uint32_t srcBLen,
64 q15_t * pDst,
65 uint32_t firstIndex,
66 uint32_t numPoints,
67 q15_t * pScratch1,
68 q15_t * pScratch2)
69 {
70 q15_t *pOut = pDst; /* Output pointer */
71 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
72 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
73 q31_t acc0; /* Accumulator */
74 const q15_t *pIn1; /* InputA pointer */
75 const q15_t *pIn2; /* InputB pointer */
76 const q15_t *px; /* Intermediate inputA pointer */
77 q15_t *py; /* Intermediate inputB pointer */
78 uint32_t j, k, blkCnt; /* Loop counter */
79 uint32_t tapCnt; /* Loop count */
80 arm_status status; /* Status variable */
81 q31_t x1; /* Temporary variables to hold state and coefficient values */
82 q31_t y1; /* State variables */
83
84 #if defined (ARM_MATH_LOOPUNROLL)
85 q31_t acc1, acc2, acc3; /* Accumulator */
86 q31_t x2, x3; /* Temporary variables to hold state and coefficient values */
87 q31_t y2; /* State variables */
88 #endif
89
90 /* Check for range of output samples to be calculated */
91 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
92 {
93 /* Set status as ARM_MATH_ARGUMENT_ERROR */
94 status = ARM_MATH_ARGUMENT_ERROR;
95 }
96 else
97 {
98 /* The algorithm implementation is based on the lengths of the inputs. */
99 /* srcB is always made to slide across srcA. */
100 /* So srcBLen is always considered as shorter or equal to srcALen */
101 if (srcALen >= srcBLen)
102 {
103 /* Initialization of inputA pointer */
104 pIn1 = pSrcA;
105
106 /* Initialization of inputB pointer */
107 pIn2 = pSrcB;
108 }
109 else
110 {
111 /* Initialization of inputA pointer */
112 pIn1 = pSrcB;
113
114 /* Initialization of inputB pointer */
115 pIn2 = pSrcA;
116
117 /* srcBLen is always considered as shorter or equal to srcALen */
118 j = srcBLen;
119 srcBLen = srcALen;
120 srcALen = j;
121 }
122
123 /* Temporary pointer for scratch2 */
124 py = pScratch2;
125
126 /* pointer to take end of scratch2 buffer */
127 pScr2 = pScratch2 + srcBLen - 1;
128
129 /* points to smaller length sequence */
130 px = pIn2;
131
132 #if defined (ARM_MATH_LOOPUNROLL)
133
134 /* Loop unrolling: Compute 4 outputs at a time */
135 k = srcBLen >> 2U;
136
137 /* Copy smaller length input sequence in reverse order into second scratch buffer */
138 while (k > 0U)
139 {
140 /* copy second buffer in reversal manner */
141 *pScr2-- = *px++;
142 *pScr2-- = *px++;
143 *pScr2-- = *px++;
144 *pScr2-- = *px++;
145
146 /* Decrement loop counter */
147 k--;
148 }
149
150 /* Loop unrolling: Compute remaining outputs */
151 k = srcBLen % 0x4U;
152
153 #else
154
155 /* Initialize k with number of samples */
156 k = srcBLen;
157
158 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
159
160 while (k > 0U)
161 {
162 /* copy second buffer in reversal manner for remaining samples */
163 *pScr2-- = *px++;
164
165 /* Decrement loop counter */
166 k--;
167 }
168
169 /* Initialze temporary scratch pointer */
170 pScr1 = pScratch1;
171
172 /* Assuming scratch1 buffer is aligned by 32-bit */
173 /* Fill (srcBLen - 1U) zeros in scratch buffer */
174 arm_fill_q15(0, pScr1, (srcBLen - 1U));
175
176 /* Update temporary scratch pointer */
177 pScr1 += (srcBLen - 1U);
178
179 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
180
181 /* Copy (srcALen) samples in scratch buffer */
182 arm_copy_q15(pIn1, pScr1, srcALen);
183
184 /* Update pointers */
185 pScr1 += srcALen;
186
187 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
188 arm_fill_q15(0, pScr1, (srcBLen - 1U));
189
190 /* Update pointer */
191 pScr1 += (srcBLen - 1U);
192
193 /* Initialization of pIn2 pointer */
194 pIn2 = py;
195
196 pScratch1 += firstIndex;
197
198 pOut = pDst + firstIndex;
199
200 /* Actual convolution process starts here */
201
202 #if defined (ARM_MATH_LOOPUNROLL)
203
204 /* Loop unrolling: Compute 4 outputs at a time */
205 blkCnt = (numPoints) >> 2;
206
207 while (blkCnt > 0)
208 {
209 /* Initialze temporary scratch pointer as scratch1 */
210 pScr1 = pScratch1;
211
212 /* Clear Accumlators */
213 acc0 = 0;
214 acc1 = 0;
215 acc2 = 0;
216 acc3 = 0;
217
218 /* Read two samples from scratch1 buffer */
219 x1 = read_q15x2_ia (&pScr1);
220
221 /* Read next two samples from scratch1 buffer */
222 x2 = read_q15x2_ia (&pScr1);
223
224 tapCnt = (srcBLen) >> 2U;
225
226 while (tapCnt > 0U)
227 {
228
229 /* Read four samples from smaller buffer */
230 y1 = read_q15x2_ia ((q15_t **) &pIn2);
231 y2 = read_q15x2_ia ((q15_t **) &pIn2);
232
233 /* multiply and accumulate */
234 acc0 = __SMLAD(x1, y1, acc0);
235 acc2 = __SMLAD(x2, y1, acc2);
236
237 /* pack input data */
238 #ifndef ARM_MATH_BIG_ENDIAN
239 x3 = __PKHBT(x2, x1, 0);
240 #else
241 x3 = __PKHBT(x1, x2, 0);
242 #endif
243
244 /* multiply and accumulate */
245 acc1 = __SMLADX(x3, y1, acc1);
246
247 /* Read next two samples from scratch1 buffer */
248 x1 = read_q15x2_ia (&pScr1);
249
250 /* multiply and accumulate */
251 acc0 = __SMLAD(x2, y2, acc0);
252 acc2 = __SMLAD(x1, y2, acc2);
253
254 /* pack input data */
255 #ifndef ARM_MATH_BIG_ENDIAN
256 x3 = __PKHBT(x1, x2, 0);
257 #else
258 x3 = __PKHBT(x2, x1, 0);
259 #endif
260
261 acc3 = __SMLADX(x3, y1, acc3);
262 acc1 = __SMLADX(x3, y2, acc1);
263
264 x2 = read_q15x2_ia (&pScr1);
265
266 #ifndef ARM_MATH_BIG_ENDIAN
267 x3 = __PKHBT(x2, x1, 0);
268 #else
269 x3 = __PKHBT(x1, x2, 0);
270 #endif
271
272 /* multiply and accumulate */
273 acc3 = __SMLADX(x3, y2, acc3);
274
275 /* Decrement loop counter */
276 tapCnt--;
277 }
278
279 /* Update scratch pointer for remaining samples of smaller length sequence */
280 pScr1 -= 4U;
281
282 /* apply same above for remaining samples of smaller length sequence */
283 tapCnt = (srcBLen) & 3U;
284
285 while (tapCnt > 0U)
286 {
287 /* accumulate the results */
288 acc0 += (*pScr1++ * *pIn2);
289 acc1 += (*pScr1++ * *pIn2);
290 acc2 += (*pScr1++ * *pIn2);
291 acc3 += (*pScr1++ * *pIn2++);
292
293 pScr1 -= 3U;
294
295 /* Decrement loop counter */
296 tapCnt--;
297 }
298
299 blkCnt--;
300
301 /* Store the results in the accumulators in the destination buffer. */
302 #ifndef ARM_MATH_BIG_ENDIAN
303 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
304 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
305 #else
306 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
307 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
308 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
309
310 /* Initialization of inputB pointer */
311 pIn2 = py;
312
313 pScratch1 += 4U;
314 }
315
316 /* Loop unrolling: Compute remaining outputs */
317 blkCnt = numPoints & 0x3;
318
319 #else
320
321 /* Initialize blkCnt with number of samples */
322 blkCnt = numPoints;
323
324 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
325
326 /* Calculate convolution for remaining samples of Bigger length sequence */
327 while (blkCnt > 0)
328 {
329 /* Initialze temporary scratch pointer as scratch1 */
330 pScr1 = pScratch1;
331
332 /* Clear Accumlators */
333 acc0 = 0;
334
335 tapCnt = (srcBLen) >> 1U;
336
337 while (tapCnt > 0U)
338 {
339 /* Read next two samples from scratch1 buffer */
340 x1 = read_q15x2_ia (&pScr1);
341
342 /* Read two samples from smaller buffer */
343 y1 = read_q15x2_ia ((q15_t **) &pIn2);
344
345 /* multiply and accumulate */
346 acc0 = __SMLAD(x1, y1, acc0);
347
348 /* Decrement loop counter */
349 tapCnt--;
350 }
351
352 tapCnt = (srcBLen) & 1U;
353
354 /* apply same above for remaining samples of smaller length sequence */
355 while (tapCnt > 0U)
356 {
357 /* accumulate the results */
358 acc0 += (*pScr1++ * *pIn2++);
359
360 /* Decrement loop counter */
361 tapCnt--;
362 }
363
364 blkCnt--;
365
366 /* The result is in 2.30 format. Convert to 1.15 with saturation.
367 ** Then store the output in the destination buffer. */
368 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
369
370 /* Initialization of inputB pointer */
371 pIn2 = py;
372
373 pScratch1 += 1U;
374
375 }
376
377 /* Set status as ARM_MATH_SUCCESS */
378 status = ARM_MATH_SUCCESS;
379 }
380
381 /* Return to application */
382 return (status);
383 }
384
385 /**
386 @} end of PartialConv group
387 */
388