1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_opt_q15.c
4 * Description: Partial convolution of Q15 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup PartialConv
37 @{
38 */
39
40 /**
41 @brief Partial convolution of Q15 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
50 @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
51 @return execution status
52 - \ref ARM_MATH_SUCCESS : Operation successful
53 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
54
55 @remark
56 Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
57 */
58
arm_conv_partial_opt_q15(const q15_t * pSrcA,uint32_t srcALen,const q15_t * pSrcB,uint32_t srcBLen,q15_t * pDst,uint32_t firstIndex,uint32_t numPoints,q15_t * pScratch1,q15_t * pScratch2)59 ARM_DSP_ATTRIBUTE arm_status arm_conv_partial_opt_q15(
60 const q15_t * pSrcA,
61 uint32_t srcALen,
62 const q15_t * pSrcB,
63 uint32_t srcBLen,
64 q15_t * pDst,
65 uint32_t firstIndex,
66 uint32_t numPoints,
67 q15_t * pScratch1,
68 q15_t * pScratch2)
69 {
70
71 q15_t *pOut = pDst; /* Output pointer */
72 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
73 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
74 q63_t acc0; /* Accumulator */
75 q31_t x1; /* Temporary variables to hold state and coefficient values */
76 q31_t y1; /* State variables */
77 const q15_t *pIn1; /* InputA pointer */
78 const q15_t *pIn2; /* InputB pointer */
79 const q15_t *px; /* Intermediate inputA pointer */
80 q15_t *py; /* Intermediate inputB pointer */
81 uint32_t j, k, blkCnt; /* Loop counter */
82 uint32_t tapCnt; /* Loop count */
83 arm_status status; /* Status variable */
84
85 #if defined (ARM_MATH_LOOPUNROLL)
86 q63_t acc1, acc2, acc3; /* Accumulator */
87 q31_t x2, x3; /* Temporary variables to hold state and coefficient values */
88 q31_t y2; /* State variables */
89 #endif
90
91 /* Check for range of output samples to be calculated */
92 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
93 {
94 /* Set status as ARM_MATH_ARGUMENT_ERROR */
95 status = ARM_MATH_ARGUMENT_ERROR;
96 }
97 else
98 {
99 /* The algorithm implementation is based on the lengths of the inputs. */
100 /* srcB is always made to slide across srcA. */
101 /* So srcBLen is always considered as shorter or equal to srcALen */
102 if (srcALen >= srcBLen)
103 {
104 /* Initialization of inputA pointer */
105 pIn1 = pSrcA;
106
107 /* Initialization of inputB pointer */
108 pIn2 = pSrcB;
109 }
110 else
111 {
112 /* Initialization of inputA pointer */
113 pIn1 = pSrcB;
114
115 /* Initialization of inputB pointer */
116 pIn2 = pSrcA;
117
118 /* srcBLen is always considered as shorter or equal to srcALen */
119 j = srcBLen;
120 srcBLen = srcALen;
121 srcALen = j;
122 }
123
124 /* Temporary pointer for scratch2 */
125 py = pScratch2;
126
127 /* pointer to take end of scratch2 buffer */
128 pScr2 = pScratch2 + srcBLen - 1;
129
130 /* points to smaller length sequence */
131 px = pIn2;
132
133 #if defined (ARM_MATH_LOOPUNROLL)
134
135 /* Loop unrolling: Compute 4 outputs at a time */
136 k = srcBLen >> 2U;
137
138 /* Copy smaller length input sequence in reverse order into second scratch buffer */
139 while (k > 0U)
140 {
141 /* copy second buffer in reversal manner */
142 *pScr2-- = *px++;
143 *pScr2-- = *px++;
144 *pScr2-- = *px++;
145 *pScr2-- = *px++;
146
147 /* Decrement loop counter */
148 k--;
149 }
150
151 /* Loop unrolling: Compute remaining outputs */
152 k = srcBLen % 0x4U;
153
154 #else
155
156 /* Initialize k with number of samples */
157 k = srcBLen;
158
159 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
160
161 while (k > 0U)
162 {
163 /* copy second buffer in reversal manner for remaining samples */
164 *pScr2-- = *px++;
165
166 /* Decrement loop counter */
167 k--;
168 }
169
170 /* Initialze temporary scratch pointer */
171 pScr1 = pScratch1;
172
173 /* Assuming scratch1 buffer is aligned by 32-bit */
174 /* Fill (srcBLen - 1U) zeros in scratch buffer */
175 arm_fill_q15(0, pScr1, (srcBLen - 1U));
176
177 /* Update temporary scratch pointer */
178 pScr1 += (srcBLen - 1U);
179
180 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
181
182 /* Copy (srcALen) samples in scratch buffer */
183 arm_copy_q15(pIn1, pScr1, srcALen);
184
185 /* Update pointers */
186 pScr1 += srcALen;
187
188 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
189 arm_fill_q15(0, pScr1, (srcBLen - 1U));
190
191 /* Update pointer */
192 pScr1 += (srcBLen - 1U);
193
194 /* Initialization of pIn2 pointer */
195 pIn2 = py;
196
197 pScratch1 += firstIndex;
198
199 pOut = pDst + firstIndex;
200
201 /* Actual convolution process starts here */
202
203 #if defined (ARM_MATH_LOOPUNROLL)
204
205 /* Loop unrolling: Compute 4 outputs at a time */
206 blkCnt = (numPoints) >> 2;
207
208 while (blkCnt > 0)
209 {
210 /* Initialze temporary scratch pointer as scratch1 */
211 pScr1 = pScratch1;
212
213 /* Clear Accumlators */
214 acc0 = 0;
215 acc1 = 0;
216 acc2 = 0;
217 acc3 = 0;
218
219 /* Read two samples from scratch1 buffer */
220 x1 = read_q15x2_ia (&pScr1);
221
222 /* Read next two samples from scratch1 buffer */
223 x2 = read_q15x2_ia (&pScr1);
224
225 tapCnt = (srcBLen) >> 2U;
226
227 while (tapCnt > 0U)
228 {
229
230 /* Read four samples from smaller buffer */
231 y1 = read_q15x2_ia ((q15_t **) &pIn2);
232 y2 = read_q15x2_ia ((q15_t **) &pIn2);
233
234 /* multiply and accumulate */
235 acc0 = __SMLALD(x1, y1, acc0);
236 acc2 = __SMLALD(x2, y1, acc2);
237
238 /* pack input data */
239 #ifndef ARM_MATH_BIG_ENDIAN
240 x3 = __PKHBT(x2, x1, 0);
241 #else
242 x3 = __PKHBT(x1, x2, 0);
243 #endif
244
245 /* multiply and accumulate */
246 acc1 = __SMLALDX(x3, y1, acc1);
247
248 /* Read next two samples from scratch1 buffer */
249 x1 = read_q15x2_ia (&pScr1);
250
251 /* multiply and accumulate */
252 acc0 = __SMLALD(x2, y2, acc0);
253 acc2 = __SMLALD(x1, y2, acc2);
254
255 /* pack input data */
256 #ifndef ARM_MATH_BIG_ENDIAN
257 x3 = __PKHBT(x1, x2, 0);
258 #else
259 x3 = __PKHBT(x2, x1, 0);
260 #endif
261
262 acc3 = __SMLALDX(x3, y1, acc3);
263 acc1 = __SMLALDX(x3, y2, acc1);
264
265 x2 = read_q15x2_ia (&pScr1);
266
267 #ifndef ARM_MATH_BIG_ENDIAN
268 x3 = __PKHBT(x2, x1, 0);
269 #else
270 x3 = __PKHBT(x1, x2, 0);
271 #endif
272
273 acc3 = __SMLALDX(x3, y2, acc3);
274
275 /* Decrement loop counter */
276 tapCnt--;
277 }
278
279 /* Update scratch pointer for remaining samples of smaller length sequence */
280 pScr1 -= 4U;
281
282 /* apply same above for remaining samples of smaller length sequence */
283 tapCnt = (srcBLen) & 3U;
284
285 while (tapCnt > 0U)
286 {
287 /* accumulate the results */
288 acc0 += (*pScr1++ * *pIn2);
289 acc1 += (*pScr1++ * *pIn2);
290 acc2 += (*pScr1++ * *pIn2);
291 acc3 += (*pScr1++ * *pIn2++);
292
293 pScr1 -= 3U;
294
295 /* Decrement loop counter */
296 tapCnt--;
297 }
298
299 blkCnt--;
300
301 /* Store the results in the accumulators in the destination buffer. */
302 #ifndef ARM_MATH_BIG_ENDIAN
303 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
304 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
305 #else
306 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
307 write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
308 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
309
310 /* Initialization of inputB pointer */
311 pIn2 = py;
312
313 pScratch1 += 4U;
314 }
315
316 /* Loop unrolling: Compute remaining outputs */
317 blkCnt = numPoints & 0x3;
318
319 #else
320
321 /* Initialize blkCnt with number of samples */
322 blkCnt = numPoints;
323
324 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
325
326 /* Calculate convolution for remaining samples of Bigger length sequence */
327 while (blkCnt > 0)
328 {
329 /* Initialze temporary scratch pointer as scratch1 */
330 pScr1 = pScratch1;
331
332 /* Clear Accumlators */
333 acc0 = 0;
334
335 tapCnt = (srcBLen) >> 1U;
336
337 while (tapCnt > 0U)
338 {
339 /* Read next two samples from scratch1 buffer */
340 x1 = read_q15x2_ia (&pScr1);
341
342 /* Read two samples from smaller buffer */
343 y1 = read_q15x2_ia ((q15_t **) &pIn2);
344
345 acc0 = __SMLALD(x1, y1, acc0);
346
347 /* Decrement the loop counter */
348 tapCnt--;
349 }
350
351 tapCnt = (srcBLen) & 1U;
352
353 /* apply same above for remaining samples of smaller length sequence */
354 while (tapCnt > 0U)
355 {
356 /* accumulate the results */
357 acc0 += (*pScr1++ * *pIn2++);
358
359 /* Decrement loop counter */
360 tapCnt--;
361 }
362
363 blkCnt--;
364
365 /* The result is in 2.30 format. Convert to 1.15 with saturation.
366 ** Then store the output in the destination buffer. */
367 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
368
369 /* Initialization of inputB pointer */
370 pIn2 = py;
371
372 pScratch1 += 1U;
373
374 }
375
376 /* Set status as ARM_MATH_SUCCESS */
377 status = ARM_MATH_SUCCESS;
378 }
379
380 /* Return to application */
381 return (status);
382 }
383
384 /**
385 @} end of PartialConv group
386 */
387