1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_fast_q31.c
4 * Description: Fast Q31 Partial convolution
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup PartialConv
37 @{
38 */
39
40 /**
41 @brief Partial convolution of Q31 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @return execution status
50 - \ref ARM_MATH_SUCCESS : Operation successful
51 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52
53 @remark
54 Refer to \ref arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
55 */
56
arm_conv_partial_fast_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst,uint32_t firstIndex,uint32_t numPoints)57 arm_status arm_conv_partial_fast_q31(
58 const q31_t * pSrcA,
59 uint32_t srcALen,
60 const q31_t * pSrcB,
61 uint32_t srcBLen,
62 q31_t * pDst,
63 uint32_t firstIndex,
64 uint32_t numPoints)
65 {
66 const q31_t *pIn1; /* InputA pointer */
67 const q31_t *pIn2; /* InputB pointer */
68 q31_t *pOut = pDst; /* Output pointer */
69 const q31_t *px; /* Intermediate inputA pointer */
70 const q31_t *py; /* Intermediate inputB pointer */
71 const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
72 q31_t sum; /* Accumulators */
73 uint32_t j, k, count, check, blkCnt;
74 int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
75 arm_status status; /* Status of Partial convolution */
76
77 #if defined (ARM_MATH_LOOPUNROLL)
78 q31_t acc0, acc1, acc2, acc3; /* Accumulators */
79 q31_t x0, x1, x2, x3, c0;
80 #endif
81
82 /* Check for range of output samples to be calculated */
83 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
84 {
85 /* Set status as ARM_MATH_ARGUMENT_ERROR */
86 status = ARM_MATH_ARGUMENT_ERROR;
87 }
88 else
89 {
90 /* The algorithm implementation is based on the lengths of the inputs. */
91 /* srcB is always made to slide across srcA. */
92 /* So srcBLen is always considered as shorter or equal to srcALen */
93 if (srcALen >= srcBLen)
94 {
95 /* Initialization of inputA pointer */
96 pIn1 = pSrcA;
97
98 /* Initialization of inputB pointer */
99 pIn2 = pSrcB;
100 }
101 else
102 {
103 /* Initialization of inputA pointer */
104 pIn1 = pSrcB;
105
106 /* Initialization of inputB pointer */
107 pIn2 = pSrcA;
108
109 /* srcBLen is always considered as shorter or equal to srcALen */
110 j = srcBLen;
111 srcBLen = srcALen;
112 srcALen = j;
113 }
114
115 /* Conditions to check which loopCounter holds
116 * the first and last indices of the output samples to be calculated. */
117 check = firstIndex + numPoints;
118 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
119 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
120 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
121 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
122 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
123 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
124
125 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
126 /* The function is internally
127 * divided into three stages according to the number of multiplications that has to be
128 * taken place between inputA samples and inputB samples. In the first stage of the
129 * algorithm, the multiplications increase by one for every iteration.
130 * In the second stage of the algorithm, srcBLen number of multiplications are done.
131 * In the third stage of the algorithm, the multiplications decrease by one
132 * for every iteration. */
133
134 /* Set the output pointer to point to the firstIndex
135 * of the output sample to be calculated. */
136 pOut = pDst + firstIndex;
137
138 /* --------------------------
139 * Initializations of stage1
140 * -------------------------*/
141
142 /* sum = x[0] * y[0]
143 * sum = x[0] * y[1] + x[1] * y[0]
144 * ....
145 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
146 */
147
148 /* In this stage the MAC operations are increased by 1 for every iteration.
149 The count variable holds the number of MAC operations performed.
150 Since the partial convolution starts from firstIndex
151 Number of Macs to be performed is firstIndex + 1 */
152 count = 1U + firstIndex;
153
154 /* Working pointer of inputA */
155 px = pIn1;
156
157 /* Working pointer of inputB */
158 pSrc2 = pIn2 + firstIndex;
159 py = pSrc2;
160
161 /* ------------------------
162 * Stage1 process
163 * ----------------------*/
164
165 /* The first stage starts here */
166 while (blockSize1 > 0)
167 {
168 /* Accumulator is made zero for every iteration */
169 sum = 0;
170
171 #if defined (ARM_MATH_LOOPUNROLL)
172
173 /* Loop unrolling: Compute 4 outputs at a time */
174 k = count >> 2U;
175
176 while (k > 0U)
177 {
178 /* x[0] * y[srcBLen - 1] */
179 sum = (q31_t) ((((q63_t) sum << 32) +
180 ((q63_t) *px++ * (*py--))) >> 32);
181
182 /* x[1] * y[srcBLen - 2] */
183 sum = (q31_t) ((((q63_t) sum << 32) +
184 ((q63_t) *px++ * (*py--))) >> 32);
185
186 /* x[2] * y[srcBLen - 3] */
187 sum = (q31_t) ((((q63_t) sum << 32) +
188 ((q63_t) *px++ * (*py--))) >> 32);
189
190 /* x[3] * y[srcBLen - 4] */
191 sum = (q31_t) ((((q63_t) sum << 32) +
192 ((q63_t) *px++ * (*py--))) >> 32);
193
194 /* Decrement loop counter */
195 k--;
196 }
197
198 /* Loop unrolling: Compute remaining outputs */
199 k = count % 0x4U;
200
201 #else
202
203 /* Initialize k with number of samples */
204 k = count;
205
206 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
207
208 while (k > 0U)
209 {
210 /* Perform the multiply-accumulate */
211 sum = (q31_t) ((((q63_t) sum << 32) +
212 ((q63_t) *px++ * (*py--))) >> 32);
213
214 /* Decrement loop counter */
215 k--;
216 }
217
218 /* Store the result in the accumulator in the destination buffer. */
219 *pOut++ = sum << 1;
220
221 /* Update the inputA and inputB pointers for next MAC calculation */
222 py = ++pSrc2;
223 px = pIn1;
224
225 /* Increment MAC count */
226 count++;
227
228 /* Decrement loop counter */
229 blockSize1--;
230 }
231
232 /* --------------------------
233 * Initializations of stage2
234 * ------------------------*/
235
236 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
237 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
238 * ....
239 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
240 */
241
242 /* Working pointer of inputA */
243 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
244 {
245 pSrc1 = pIn1 + firstIndex - srcBLen + 1;
246 }
247 else
248 {
249 pSrc1 = pIn1;
250 }
251 px = pSrc1;
252
253 /* Working pointer of inputB */
254 pSrc2 = pIn2 + (srcBLen - 1U);
255 py = pSrc2;
256
257 /* count is index by which the pointer pIn1 to be incremented */
258 count = 0U;
259
260 /* -------------------
261 * Stage2 process
262 * ------------------*/
263
264 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
265 * So, to loop unroll over blockSize2,
266 * srcBLen should be greater than or equal to 4 */
267 if (srcBLen >= 4U)
268 {
269 #if defined (ARM_MATH_LOOPUNROLL)
270
271 /* Loop unrolling: Compute 4 outputs at a time */
272 blkCnt = ((uint32_t) blockSize2 >> 2U);
273
274 while (blkCnt > 0U)
275 {
276 /* Set all accumulators to zero */
277 acc0 = 0;
278 acc1 = 0;
279 acc2 = 0;
280 acc3 = 0;
281
282 /* read x[0], x[1], x[2] samples */
283 x0 = *px++;
284 x1 = *px++;
285 x2 = *px++;
286
287 /* Apply loop unrolling and compute 4 MACs simultaneously. */
288 k = srcBLen >> 2U;
289
290 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
291 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
292 do
293 {
294 /* Read y[srcBLen - 1] sample */
295 c0 = *py--;
296 /* Read x[3] sample */
297 x3 = *px++;
298
299 /* Perform the multiply-accumulate */
300 /* acc0 += x[0] * y[srcBLen - 1] */
301 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
302 /* acc1 += x[1] * y[srcBLen - 1] */
303 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
304 /* acc2 += x[2] * y[srcBLen - 1] */
305 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
306 /* acc3 += x[3] * y[srcBLen - 1] */
307 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
308
309 /* Read y[srcBLen - 2] sample */
310 c0 = *py--;
311 /* Read x[4] sample */
312 x0 = *px++;
313
314 /* Perform the multiply-accumulate */
315 /* acc0 += x[1] * y[srcBLen - 2] */
316 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
317 /* acc1 += x[2] * y[srcBLen - 2] */
318 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
319 /* acc2 += x[3] * y[srcBLen - 2] */
320 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
321 /* acc3 += x[4] * y[srcBLen - 2] */
322 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
323
324 /* Read y[srcBLen - 3] sample */
325 c0 = *py--;
326 /* Read x[5] sample */
327 x1 = *px++;
328
329 /* Perform the multiply-accumulates */
330 /* acc0 += x[2] * y[srcBLen - 3] */
331 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
332 /* acc1 += x[3] * y[srcBLen - 2] */
333 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
334 /* acc2 += x[4] * y[srcBLen - 2] */
335 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
336 /* acc3 += x[5] * y[srcBLen - 2] */
337 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
338
339 /* Read y[srcBLen - 4] sample */
340 c0 = *py--;
341 /* Read x[6] sample */
342 x2 = *px++;
343
344 /* Perform the multiply-accumulates */
345 /* acc0 += x[3] * y[srcBLen - 4] */
346 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
347 /* acc1 += x[4] * y[srcBLen - 4] */
348 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
349 /* acc2 += x[5] * y[srcBLen - 4] */
350 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
351 /* acc3 += x[6] * y[srcBLen - 4] */
352 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
353
354 } while (--k);
355
356 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
357 ** No loop unrolling is used. */
358 k = srcBLen % 0x4U;
359
360 while (k > 0U)
361 {
362 /* Read y[srcBLen - 5] sample */
363 c0 = *py--;
364 /* Read x[7] sample */
365 x3 = *px++;
366
367 /* Perform the multiply-accumulates */
368 /* acc0 += x[4] * y[srcBLen - 5] */
369 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
370 /* acc1 += x[5] * y[srcBLen - 5] */
371 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
372 /* acc2 += x[6] * y[srcBLen - 5] */
373 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
374 /* acc3 += x[7] * y[srcBLen - 5] */
375 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
376
377 /* Reuse the present samples for the next MAC */
378 x0 = x1;
379 x1 = x2;
380 x2 = x3;
381
382 /* Decrement the loop counter */
383 k--;
384 }
385
386 /* Store the result in the accumulator in the destination buffer. */
387 *pOut++ = (q31_t) (acc0 << 1);
388 *pOut++ = (q31_t) (acc1 << 1);
389 *pOut++ = (q31_t) (acc2 << 1);
390 *pOut++ = (q31_t) (acc3 << 1);
391
392 /* Increment the pointer pIn1 index, count by 4 */
393 count += 4U;
394
395 /* Update the inputA and inputB pointers for next MAC calculation */
396 px = pSrc1 + count;
397 py = pSrc2;
398
399 /* Decrement loop counter */
400 blkCnt--;
401 }
402
403 /* Loop unrolling: Compute remaining outputs */
404 blkCnt = (uint32_t) blockSize2 % 0x4U;
405
406 #else
407
408 /* Initialize blkCnt with number of samples */
409 blkCnt = blockSize2;
410
411 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
412
413 while (blkCnt > 0U)
414 {
415 /* Accumulator is made zero for every iteration */
416 sum = 0;
417
418 #if defined (ARM_MATH_LOOPUNROLL)
419
420 /* Loop unrolling: Compute 4 outputs at a time */
421 k = srcBLen >> 2U;
422
423 while (k > 0U)
424 {
425 /* Perform the multiply-accumulates */
426 sum = (q31_t) ((((q63_t) sum << 32) +
427 ((q63_t) * px++ * (*py--))) >> 32);
428 sum = (q31_t) ((((q63_t) sum << 32) +
429 ((q63_t) * px++ * (*py--))) >> 32);
430 sum = (q31_t) ((((q63_t) sum << 32) +
431 ((q63_t) * px++ * (*py--))) >> 32);
432 sum = (q31_t) ((((q63_t) sum << 32) +
433 ((q63_t) * px++ * (*py--))) >> 32);
434
435 /* Decrement loop counter */
436 k--;
437 }
438
439 /* Loop unrolling: Compute remaining outputs */
440 k = srcBLen % 0x4U;
441
442 #else
443
444 /* Initialize blkCnt with number of samples */
445 k = srcBLen;
446
447 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
448
449 while (k > 0U)
450 {
451 /* Perform the multiply-accumulate */
452 sum = (q31_t) ((((q63_t) sum << 32) +
453 ((q63_t) *px++ * (*py--))) >> 32);
454
455 /* Decrement loop counter */
456 k--;
457 }
458
459 /* Store the result in the accumulator in the destination buffer. */
460 *pOut++ = sum << 1;
461
462 /* Increment MAC count */
463 count++;
464
465 /* Update the inputA and inputB pointers for next MAC calculation */
466 px = pSrc1 + count;
467 py = pSrc2;
468
469 /* Decrement loop counter */
470 blkCnt--;
471 }
472 }
473 else
474 {
475 /* If the srcBLen is not a multiple of 4,
476 * the blockSize2 loop cannot be unrolled by 4 */
477 blkCnt = (uint32_t) blockSize2;
478
479 while (blkCnt > 0U)
480 {
481 /* Accumulator is made zero for every iteration */
482 sum = 0;
483
484 /* srcBLen number of MACS should be performed */
485 k = srcBLen;
486
487 while (k > 0U)
488 {
489 /* Perform the multiply-accumulate */
490 sum = (q31_t) ((((q63_t) sum << 32) +
491 ((q63_t) *px++ * (*py--))) >> 32);
492
493 /* Decrement loop counter */
494 k--;
495 }
496
497 /* Store the result in the accumulator in the destination buffer. */
498 *pOut++ = sum << 1;
499
500 /* Increment the MAC count */
501 count++;
502
503 /* Update the inputA and inputB pointers for next MAC calculation */
504 px = pSrc1 + count;
505 py = pSrc2;
506
507 /* Decrement the loop counter */
508 blkCnt--;
509 }
510 }
511
512
513 /* --------------------------
514 * Initializations of stage3
515 * -------------------------*/
516
517 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
518 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
519 * ....
520 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
521 * sum += x[srcALen-1] * y[srcBLen-1]
522 */
523
524 /* In this stage the MAC operations are decreased by 1 for every iteration.
525 The count variable holds the number of MAC operations performed */
526 count = srcBLen - 1U;
527
528 /* Working pointer of inputA */
529 if (firstIndex > srcALen)
530 {
531 pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
532 }
533 else
534 {
535 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
536 }
537 px = pSrc1;
538
539 /* Working pointer of inputB */
540 pSrc2 = pIn2 + (srcBLen - 1U);
541 py = pSrc2;
542
543 /* -------------------
544 * Stage3 process
545 * ------------------*/
546
547 while (blockSize3 > 0)
548 {
549 /* Accumulator is made zero for every iteration */
550 sum = 0;
551
552 #if defined (ARM_MATH_LOOPUNROLL)
553
554 /* Loop unrolling: Compute 4 outputs at a time */
555 k = count >> 2U;
556
557 while (k > 0U)
558 {
559 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
560 sum = (q31_t) ((((q63_t) sum << 32) +
561 ((q63_t) *px++ * (*py--))) >> 32);
562
563 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
564 sum = (q31_t) ((((q63_t) sum << 32) +
565 ((q63_t) *px++ * (*py--))) >> 32);
566
567 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
568 sum = (q31_t) ((((q63_t) sum << 32) +
569 ((q63_t) *px++ * (*py--))) >> 32);
570
571 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
572 sum = (q31_t) ((((q63_t) sum << 32) +
573 ((q63_t) *px++ * (*py--))) >> 32);
574
575 /* Decrement loop counter */
576 k--;
577 }
578
579 /* Loop unrolling: Compute remaining outputs */
580 k = count % 0x4U;
581
582 #else
583
584 /* Initialize blkCnt with number of samples */
585 k = count;
586
587 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
588
589 while (k > 0U)
590 {
591 /* Perform the multiply-accumulates */
592 /* sum += x[srcALen-1] * y[srcBLen-1] */
593 sum = (q31_t) ((((q63_t) sum << 32) +
594 ((q63_t) *px++ * (*py--))) >> 32);
595
596 /* Decrement loop counter */
597 k--;
598 }
599
600 /* Store the result in the accumulator in the destination buffer. */
601 *pOut++ = sum << 1;
602
603 /* Update the inputA and inputB pointers for next MAC calculation */
604 px = ++pSrc1;
605 py = pSrc2;
606
607 /* Decrement MAC count */
608 count--;
609
610 /* Decrement the loop counter */
611 blockSize3--;
612 }
613
614 /* Set status as ARM_MATH_SUCCESS */
615 status = ARM_MATH_SUCCESS;
616 }
617
618 /* Return to application */
619 return (status);
620
621 }
622
623 /**
624 @} end of PartialConv group
625 */
626