1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_q31.c
4 * Description: Partial convolution of Q31 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup PartialConv
37 @{
38 */
39
40 /**
41 @brief Partial convolution of Q31 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @return execution status
50 - \ref ARM_MATH_SUCCESS : Operation successful
51 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
52
53 @remark
54 Refer to \ref arm_conv_partial_fast_q31() for a faster but less precise implementation of this function.
55 */
56
arm_conv_partial_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst,uint32_t firstIndex,uint32_t numPoints)57 arm_status arm_conv_partial_q31(
58 const q31_t * pSrcA,
59 uint32_t srcALen,
60 const q31_t * pSrcB,
61 uint32_t srcBLen,
62 q31_t * pDst,
63 uint32_t firstIndex,
64 uint32_t numPoints)
65 {
66
67 #if defined(ARM_MATH_DSP)
68
69 const q31_t *pIn1; /* InputA pointer */
70 const q31_t *pIn2; /* InputB pointer */
71 q31_t *pOut = pDst; /* Output pointer */
72 const q31_t *px; /* Intermediate inputA pointer */
73 const q31_t *py; /* Intermediate inputB pointer */
74 const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
75 q63_t sum; /* Accumulator */
76 uint32_t j, k, count, blkCnt, check;
77 int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
78 arm_status status; /* Status of Partial convolution */
79
80 #if defined (ARM_MATH_LOOPUNROLL)
81 q63_t acc0, acc1, acc2; /* Accumulator */
82 q31_t x0, x1, x2, c0; /* Temporary variables */
83 #endif
84
85 /* Check for range of output samples to be calculated */
86 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
87 {
88 /* Set status as ARM_MATH_ARGUMENT_ERROR */
89 status = ARM_MATH_ARGUMENT_ERROR;
90 }
91 else
92 {
93 /* The algorithm implementation is based on the lengths of the inputs. */
94 /* srcB is always made to slide across srcA. */
95 /* So srcBLen is always considered as shorter or equal to srcALen */
96 if (srcALen >= srcBLen)
97 {
98 /* Initialization of inputA pointer */
99 pIn1 = pSrcA;
100
101 /* Initialization of inputB pointer */
102 pIn2 = pSrcB;
103 }
104 else
105 {
106 /* Initialization of inputA pointer */
107 pIn1 = pSrcB;
108
109 /* Initialization of inputB pointer */
110 pIn2 = pSrcA;
111
112 /* srcBLen is always considered as shorter or equal to srcALen */
113 j = srcBLen;
114 srcBLen = srcALen;
115 srcALen = j;
116 }
117
118 /* Conditions to check which loopCounter holds
119 * the first and last indices of the output samples to be calculated. */
120 check = firstIndex + numPoints;
121 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
122 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
123 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
124 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
125 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
126 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
127
128 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
129 /* The function is internally
130 * divided into three stages according to the number of multiplications that has to be
131 * taken place between inputA samples and inputB samples. In the first stage of the
132 * algorithm, the multiplications increase by one for every iteration.
133 * In the second stage of the algorithm, srcBLen number of multiplications are done.
134 * In the third stage of the algorithm, the multiplications decrease by one
135 * for every iteration. */
136
137 /* Set the output pointer to point to the firstIndex
138 * of the output sample to be calculated. */
139 pOut = pDst + firstIndex;
140
141 /* --------------------------
142 * Initializations of stage1
143 * -------------------------*/
144
145 /* sum = x[0] * y[0]
146 * sum = x[0] * y[1] + x[1] * y[0]
147 * ....
148 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
149 */
150
151 /* In this stage the MAC operations are increased by 1 for every iteration.
152 The count variable holds the number of MAC operations performed.
153 Since the partial convolution starts from firstIndex
154 Number of Macs to be performed is firstIndex + 1 */
155 count = 1U + firstIndex;
156
157 /* Working pointer of inputA */
158 px = pIn1;
159
160 /* Working pointer of inputB */
161 pSrc2 = pIn2 + firstIndex;
162 py = pSrc2;
163
164 /* ------------------------
165 * Stage1 process
166 * ----------------------*/
167
168 /* The first stage starts here */
169 while (blockSize1 > 0)
170 {
171 /* Accumulator is made zero for every iteration */
172 sum = 0;
173
174 #if defined (ARM_MATH_LOOPUNROLL)
175
176 /* Loop unrolling: Compute 4 outputs at a time */
177 k = count >> 2U;
178
179 while (k > 0U)
180 {
181 /* x[0] * y[srcBLen - 1] */
182 sum += (q63_t) *px++ * (*py--);
183
184 /* x[1] * y[srcBLen - 2] */
185 sum += (q63_t) *px++ * (*py--);
186
187 /* x[2] * y[srcBLen - 3] */
188 sum += (q63_t) *px++ * (*py--);
189
190 /* x[3] * y[srcBLen - 4] */
191 sum += (q63_t) *px++ * (*py--);
192
193 /* Decrement loop counter */
194 k--;
195 }
196
197 /* Loop unrolling: Compute remaining outputs */
198 k = count % 0x4U;
199
200 #else
201
202 /* Initialize k with number of samples */
203 k = count;
204
205 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
206
207 while (k > 0U)
208 {
209 /* Perform the multiply-accumulate */
210 sum += (q63_t) *px++ * (*py--);
211
212 /* Decrement loop counter */
213 k--;
214 }
215
216 /* Store the result in the accumulator in the destination buffer. */
217 *pOut++ = (q31_t) (sum >> 31);
218
219 /* Update the inputA and inputB pointers for next MAC calculation */
220 py = ++pSrc2;
221 px = pIn1;
222
223 /* Increment MAC count */
224 count++;
225
226 /* Decrement loop counter */
227 blockSize1--;
228 }
229
230 /* --------------------------
231 * Initializations of stage2
232 * ------------------------*/
233
234 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
235 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
236 * ....
237 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
238 */
239
240 /* Working pointer of inputA */
241 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
242 {
243 pSrc1 = pIn1 + firstIndex - srcBLen + 1;
244 }
245 else
246 {
247 pSrc1 = pIn1;
248 }
249 px = pSrc1;
250
251 /* Working pointer of inputB */
252 pSrc2 = pIn2 + (srcBLen - 1U);
253 py = pSrc2;
254
255 /* count is index by which the pointer pIn1 to be incremented */
256 count = 0U;
257
258 /* -------------------
259 * Stage2 process
260 * ------------------*/
261
262 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
263 * So, to loop unroll over blockSize2,
264 * srcBLen should be greater than or equal to 4 */
265 if (srcBLen >= 4U)
266 {
267 #if defined (ARM_MATH_LOOPUNROLL)
268
269 /* Loop unroll over blkCnt */
270 blkCnt = blockSize2 / 3;
271
272 while (blkCnt > 0U)
273 {
274 /* Set all accumulators to zero */
275 acc0 = 0;
276 acc1 = 0;
277 acc2 = 0;
278
279 /* read x[0], x[1] samples */
280 x0 = *px++;
281 x1 = *px++;
282
283 /* Apply loop unrolling and compute 3 MACs simultaneously. */
284 k = srcBLen / 3;
285
286 /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
287 ** a second loop below computes MACs for the remaining 1 to 2 samples. */
288 do
289 {
290 /* Read y[srcBLen - 1] sample */
291 c0 = *(py);
292
293 /* Read x[2] sample */
294 x2 = *(px);
295
296 /* Perform the multiply-accumulate */
297 /* acc0 += x[0] * y[srcBLen - 1] */
298 acc0 += (q63_t) x0 * c0;
299 /* acc1 += x[1] * y[srcBLen - 1] */
300 acc1 += (q63_t) x1 * c0;
301 /* acc2 += x[2] * y[srcBLen - 1] */
302 acc2 += (q63_t) x2 * c0;
303
304 /* Read y[srcBLen - 2] sample */
305 c0 = *(py - 1U);
306
307 /* Read x[3] sample */
308 x0 = *(px + 1U);
309
310 /* Perform the multiply-accumulate */
311 /* acc0 += x[1] * y[srcBLen - 2] */
312 acc0 += (q63_t) x1 * c0;
313 /* acc1 += x[2] * y[srcBLen - 2] */
314 acc1 += (q63_t) x2 * c0;
315 /* acc2 += x[3] * y[srcBLen - 2] */
316 acc2 += (q63_t) x0 * c0;
317
318 /* Read y[srcBLen - 3] sample */
319 c0 = *(py - 2U);
320
321 /* Read x[4] sample */
322 x1 = *(px + 2U);
323
324 /* Perform the multiply-accumulate */
325 /* acc0 += x[2] * y[srcBLen - 3] */
326 acc0 += (q63_t) x2 * c0;
327 /* acc1 += x[3] * y[srcBLen - 2] */
328 acc1 += (q63_t) x0 * c0;
329 /* acc2 += x[4] * y[srcBLen - 2] */
330 acc2 += (q63_t) x1 * c0;
331
332
333 px += 3U;
334
335 py -= 3U;
336
337 } while (--k);
338
339 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
340 ** No loop unrolling is used. */
341 k = srcBLen - (3 * (srcBLen / 3));
342
343 while (k > 0U)
344 {
345 /* Read y[srcBLen - 5] sample */
346 c0 = *py--;
347 /* Read x[7] sample */
348 x2 = *px++;
349
350 /* Perform the multiply-accumulates */
351 /* acc0 += x[4] * y[srcBLen - 5] */
352 acc0 += (q63_t) x0 * c0;
353 /* acc1 += x[5] * y[srcBLen - 5] */
354 acc1 += (q63_t) x1 * c0;
355 /* acc2 += x[6] * y[srcBLen - 5] */
356 acc2 += (q63_t) x2 * c0;
357
358 /* Reuse the present samples for the next MAC */
359 x0 = x1;
360 x1 = x2;
361
362 /* Decrement the loop counter */
363 k--;
364 }
365
366 /* Store the result in the accumulator in the destination buffer. */
367 *pOut++ = (q31_t) (acc0 >> 31);
368 *pOut++ = (q31_t) (acc1 >> 31);
369 *pOut++ = (q31_t) (acc2 >> 31);
370
371 /* Increment the pointer pIn1 index, count by 3 */
372 count += 3U;
373
374 /* Update the inputA and inputB pointers for next MAC calculation */
375 px = pSrc1 + count;
376 py = pSrc2;
377
378 /* Decrement loop counter */
379 blkCnt--;
380 }
381
382 /* Loop unrolling: Compute remaining outputs */
383 blkCnt = blockSize2 - 3 * (blockSize2 / 3);
384
385 #else
386
387 /* Initialize blkCnt with number of samples */
388 blkCnt = blockSize2;
389
390 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
391
392 while (blkCnt > 0U)
393 {
394 /* Accumulator is made zero for every iteration */
395 sum = 0;
396
397 #if defined (ARM_MATH_LOOPUNROLL)
398
399 /* Loop unrolling: Compute 4 outputs at a time */
400 k = srcBLen >> 2U;
401
402 while (k > 0U)
403 {
404 /* Perform the multiply-accumulates */
405 sum += (q63_t) *px++ * (*py--);
406 sum += (q63_t) *px++ * (*py--);
407 sum += (q63_t) *px++ * (*py--);
408 sum += (q63_t) *px++ * (*py--);
409
410 /* Decrement loop counter */
411 k--;
412 }
413
414 /* Loop unrolling: Compute remaining outputs */
415 k = srcBLen % 0x4U;
416
417 #else
418
419 /* Initialize blkCnt with number of samples */
420 k = srcBLen;
421
422 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
423
424 while (k > 0U)
425 {
426 /* Perform the multiply-accumulate */
427 sum += (q63_t) *px++ * *py--;
428
429 /* Decrement loop counter */
430 k--;
431 }
432
433 /* Store the result in the accumulator in the destination buffer. */
434 *pOut++ = (q31_t) (sum >> 31);
435
436 /* Increment MAC count */
437 count++;
438
439 /* Update the inputA and inputB pointers for next MAC calculation */
440 px = pSrc1 + count;
441 py = pSrc2;
442
443 /* Decrement loop counter */
444 blkCnt--;
445 }
446 }
447 else
448 {
449 /* If the srcBLen is not a multiple of 4,
450 * the blockSize2 loop cannot be unrolled by 4 */
451 blkCnt = (uint32_t) blockSize2;
452
453 while (blkCnt > 0U)
454 {
455 /* Accumulator is made zero for every iteration */
456 sum = 0;
457
458 /* srcBLen number of MACS should be performed */
459 k = srcBLen;
460
461 while (k > 0U)
462 {
463 /* Perform the multiply-accumulate */
464 sum += (q63_t) *px++ * *py--;
465
466 /* Decrement loop counter */
467 k--;
468 }
469
470 /* Store the result in the accumulator in the destination buffer. */
471 *pOut++ = (q31_t) (sum >> 31);
472
473 /* Increment the MAC count */
474 count++;
475
476 /* Update the inputA and inputB pointers for next MAC calculation */
477 px = pSrc1 + count;
478 py = pSrc2;
479
480 /* Decrement the loop counter */
481 blkCnt--;
482 }
483 }
484
485
486 /* --------------------------
487 * Initializations of stage3
488 * -------------------------*/
489
490 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
491 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
492 * ....
493 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
494 * sum += x[srcALen-1] * y[srcBLen-1]
495 */
496
497 /* In this stage the MAC operations are decreased by 1 for every iteration.
498 The blockSize3 variable holds the number of MAC operations performed */
499 count = srcBLen - 1U;
500
501 /* Working pointer of inputA */
502 if (firstIndex > srcALen)
503 {
504 pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
505 }
506 else
507 {
508 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
509 }
510 px = pSrc1;
511
512 /* Working pointer of inputB */
513 pSrc2 = pIn2 + (srcBLen - 1U);
514 py = pSrc2;
515
516 /* -------------------
517 * Stage3 process
518 * ------------------*/
519
520 while (blockSize3 > 0)
521 {
522 /* Accumulator is made zero for every iteration */
523 sum = 0;
524
525 #if defined (ARM_MATH_LOOPUNROLL)
526
527 /* Loop unrolling: Compute 4 outputs at a time */
528 k = count >> 2U;
529
530 while (k > 0U)
531 {
532 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
533 sum += (q63_t) *px++ * *py--;
534
535 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
536 sum += (q63_t) *px++ * *py--;
537
538 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
539 sum += (q63_t) *px++ * *py--;
540
541 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
542 sum += (q63_t) *px++ * *py--;
543
544 /* Decrement loop counter */
545 k--;
546 }
547
548 /* Loop unrolling: Compute remaining outputs */
549 k = count % 0x4U;
550
551 #else
552
553 /* Initialize blkCnt with number of samples */
554 k = count;
555
556 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
557
558 while (k > 0U)
559 {
560 /* Perform the multiply-accumulate */
561 /* sum += x[srcALen-1] * y[srcBLen-1] */
562 sum += (q63_t) *px++ * *py--;
563
564 /* Decrement loop counter */
565 k--;
566 }
567
568 /* Store the result in the accumulator in the destination buffer. */
569 *pOut++ = (q31_t) (sum >> 31);
570
571 /* Update the inputA and inputB pointers for next MAC calculation */
572 px = ++pSrc1;
573 py = pSrc2;
574
575 /* Decrement MAC count */
576 count--;
577
578 /* Decrement the loop counter */
579 blockSize3--;
580 }
581
582 /* Set status as ARM_MATH_SUCCESS */
583 status = ARM_MATH_SUCCESS;
584 }
585
586 /* Return to application */
587 return (status);
588
589 #else
590 /* alternate version for CM0_FAMILY */
591
592 const q31_t *pIn1 = pSrcA; /* InputA pointer */
593 const q31_t *pIn2 = pSrcB; /* InputB pointer */
594 q63_t sum; /* Accumulator */
595 uint32_t i, j; /* Loop counters */
596 arm_status status; /* Status of Partial convolution */
597
598 /* Check for range of output samples to be calculated */
599 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
600 {
601 /* Set status as ARM_MATH_ARGUMENT_ERROR */
602 status = ARM_MATH_ARGUMENT_ERROR;
603 }
604 else
605 {
606 /* Loop to calculate convolution for output length number of values */
607 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
608 {
609 /* Initialize sum with zero to carry on MAC operations */
610 sum = 0;
611
612 /* Loop to perform MAC operations according to convolution equation */
613 for (j = 0U; j <= i; j++)
614 {
615 /* Check the array limitations */
616 if (((i - j) < srcBLen) && (j < srcALen))
617 {
618 /* z[i] += x[i-j] * y[j] */
619 sum += ((q63_t) pIn1[j] * pIn2[i - j]);
620 }
621 }
622
623 /* Store the output in the destination buffer */
624 pDst[i] = (q31_t) (sum >> 31U);
625 }
626
627 /* Set status as ARM_MATH_SUCCESS */
628 status = ARM_MATH_SUCCESS;
629 }
630
631 /* Return to application */
632 return (status);
633
634 #endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
635
636 }
637
638 /**
639 @} end of PartialConv group
640 */
641