1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_partial_opt_q7.c
4 * Description: Partial convolution of Q7 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup PartialConv
37 @{
38 */
39
40 /**
41 @brief Partial convolution of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written
47 @param[in] firstIndex is the first output sample to start with
48 @param[in] numPoints is the number of output points to be computed
49 @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
50 @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
51 @return execution status
52 - \ref ARM_MATH_SUCCESS : Operation successful
53 - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
54 */
55
arm_conv_partial_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,uint32_t firstIndex,uint32_t numPoints,q15_t * pScratch1,q15_t * pScratch2)56 arm_status arm_conv_partial_opt_q7(
57 const q7_t * pSrcA,
58 uint32_t srcALen,
59 const q7_t * pSrcB,
60 uint32_t srcBLen,
61 q7_t * pDst,
62 uint32_t firstIndex,
63 uint32_t numPoints,
64 q15_t * pScratch1,
65 q15_t * pScratch2)
66 {
67 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
68 q15_t x4; /* Temporary input variable */
69 const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
70 uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
71 const q7_t *px; /* Temporary input1 pointer */
72 q15_t *py; /* Temporary input2 pointer */
73 q31_t acc0, acc1, acc2, acc3; /* Accumulator */
74 q31_t x1, x2, x3, y1; /* Temporary input variables */
75 arm_status status;
76 q7_t *pOut = pDst; /* Output pointer */
77 q7_t out0, out1, out2, out3; /* Temporary variables */
78
79 /* Check for range of output samples to be calculated */
80 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
81 {
82 /* Set status as ARM_MATH_ARGUMENT_ERROR */
83 status = ARM_MATH_ARGUMENT_ERROR;
84 }
85 else
86 {
87 /* The algorithm implementation is based on the lengths of the inputs. */
88 /* srcB is always made to slide across srcA. */
89 /* So srcBLen is always considered as shorter or equal to srcALen */
90 if (srcALen >= srcBLen)
91 {
92 /* Initialization of inputA pointer */
93 pIn1 = pSrcA;
94
95 /* Initialization of inputB pointer */
96 pIn2 = pSrcB;
97 }
98 else
99 {
100 /* Initialization of inputA pointer */
101 pIn1 = pSrcB;
102
103 /* Initialization of inputB pointer */
104 pIn2 = pSrcA;
105
106 /* srcBLen is always considered as shorter or equal to srcALen */
107 j = srcBLen;
108 srcBLen = srcALen;
109 srcALen = j;
110 }
111
112 /* pointer to take end of scratch2 buffer */
113 pScr2 = pScratch2;
114
115 /* points to smaller length sequence */
116 px = pIn2 + srcBLen - 1;
117
118 /* Apply loop unrolling and do 4 Copies simultaneously. */
119 k = srcBLen >> 2U;
120
121 /* First part of the processing with loop unrolling copies 4 data points at a time.
122 ** a second loop below copies for the remaining 1 to 3 samples. */
123 while (k > 0U)
124 {
125 /* copy second buffer in reversal manner */
126 x4 = (q15_t) *px--;
127 *pScr2++ = x4;
128 x4 = (q15_t) *px--;
129 *pScr2++ = x4;
130 x4 = (q15_t) *px--;
131 *pScr2++ = x4;
132 x4 = (q15_t) *px--;
133 *pScr2++ = x4;
134
135 /* Decrement loop counter */
136 k--;
137 }
138
139 /* If the count is not a multiple of 4, copy remaining samples here.
140 ** No loop unrolling is used. */
141 k = srcBLen % 0x4U;
142
143 while (k > 0U)
144 {
145 /* copy second buffer in reversal manner for remaining samples */
146 x4 = (q15_t) *px--;
147 *pScr2++ = x4;
148
149 /* Decrement loop counter */
150 k--;
151 }
152
153 /* Initialze temporary scratch pointer */
154 pScr1 = pScratch1;
155
156 /* Fill (srcBLen - 1U) zeros in scratch buffer */
157 arm_fill_q15(0, pScr1, (srcBLen - 1U));
158
159 /* Update temporary scratch pointer */
160 pScr1 += (srcBLen - 1U);
161
162 /* Copy (srcALen) samples in scratch buffer */
163 /* Apply loop unrolling and do 4 Copies simultaneously. */
164 k = srcALen >> 2U;
165
166 /* First part of the processing with loop unrolling copies 4 data points at a time.
167 ** a second loop below copies for the remaining 1 to 3 samples. */
168 while (k > 0U)
169 {
170 /* copy second buffer in reversal manner */
171 x4 = (q15_t) *pIn1++;
172 *pScr1++ = x4;
173 x4 = (q15_t) *pIn1++;
174 *pScr1++ = x4;
175 x4 = (q15_t) *pIn1++;
176 *pScr1++ = x4;
177 x4 = (q15_t) *pIn1++;
178 *pScr1++ = x4;
179
180 /* Decrement loop counter */
181 k--;
182 }
183
184 /* If the count is not a multiple of 4, copy remaining samples here.
185 ** No loop unrolling is used. */
186 k = srcALen % 0x4U;
187
188 while (k > 0U)
189 {
190 /* copy second buffer in reversal manner for remaining samples */
191 x4 = (q15_t) *pIn1++;
192 *pScr1++ = x4;
193
194 /* Decrement the loop counter */
195 k--;
196 }
197
198 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
199 arm_fill_q15(0, pScr1, (srcBLen - 1U));
200
201 /* Update pointer */
202 pScr1 += (srcBLen - 1U);
203
204
205 /* Temporary pointer for scratch2 */
206 py = pScratch2;
207
208 /* Initialization of pIn2 pointer */
209 pIn2 = (q7_t *) py;
210
211 pScr2 = py;
212
213 pOut = pDst + firstIndex;
214
215 pScratch1 += firstIndex;
216
217 /* Actual convolution process starts here */
218 blkCnt = (numPoints) >> 2;
219
220 while (blkCnt > 0)
221 {
222 /* Initialize temporary scratch pointer as scratch1 */
223 pScr1 = pScratch1;
224
225 /* Clear Accumulators */
226 acc0 = 0;
227 acc1 = 0;
228 acc2 = 0;
229 acc3 = 0;
230
231 /* Read two samples from scratch1 buffer */
232 x1 = read_q15x2_ia (&pScr1);
233
234 /* Read next two samples from scratch1 buffer */
235 x2 = read_q15x2_ia (&pScr1);
236
237 tapCnt = (srcBLen) >> 2U;
238
239 while (tapCnt > 0U)
240 {
241 /* Read four samples from smaller buffer */
242 y1 = read_q15x2_ia (&pScr2);
243
244 /* multiply and accumulate */
245 acc0 = __SMLAD(x1, y1, acc0);
246 acc2 = __SMLAD(x2, y1, acc2);
247
248 /* pack input data */
249 #ifndef ARM_MATH_BIG_ENDIAN
250 x3 = __PKHBT(x2, x1, 0);
251 #else
252 x3 = __PKHBT(x1, x2, 0);
253 #endif
254
255 /* multiply and accumulate */
256 acc1 = __SMLADX(x3, y1, acc1);
257
258 /* Read next two samples from scratch1 buffer */
259 x1 = read_q15x2_ia (&pScr1);
260
261 /* pack input data */
262 #ifndef ARM_MATH_BIG_ENDIAN
263 x3 = __PKHBT(x1, x2, 0);
264 #else
265 x3 = __PKHBT(x2, x1, 0);
266 #endif
267
268 acc3 = __SMLADX(x3, y1, acc3);
269
270 /* Read four samples from smaller buffer */
271 y1 = read_q15x2_ia (&pScr2);
272
273 acc0 = __SMLAD(x2, y1, acc0);
274
275 acc2 = __SMLAD(x1, y1, acc2);
276
277 acc1 = __SMLADX(x3, y1, acc1);
278
279 x2 = read_q15x2_ia (&pScr1);
280
281 #ifndef ARM_MATH_BIG_ENDIAN
282 x3 = __PKHBT(x2, x1, 0);
283 #else
284 x3 = __PKHBT(x1, x2, 0);
285 #endif
286
287 acc3 = __SMLADX(x3, y1, acc3);
288
289 /* Decrement loop counter */
290 tapCnt--;
291 }
292
293 /* Update scratch pointer for remaining samples of smaller length sequence */
294 pScr1 -= 4U;
295
296 /* apply same above for remaining samples of smaller length sequence */
297 tapCnt = (srcBLen) & 3U;
298
299 while (tapCnt > 0U)
300 {
301 /* accumulate the results */
302 acc0 += (*pScr1++ * *pScr2);
303 acc1 += (*pScr1++ * *pScr2);
304 acc2 += (*pScr1++ * *pScr2);
305 acc3 += (*pScr1++ * *pScr2++);
306
307 pScr1 -= 3U;
308
309 /* Decrement loop counter */
310 tapCnt--;
311 }
312
313 blkCnt--;
314
315 /* Store the result in the accumulator in the destination buffer. */
316 out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
317 out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
318 out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
319 out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
320
321 write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
322
323 /* Initialization of inputB pointer */
324 pScr2 = py;
325
326 pScratch1 += 4U;
327 }
328
329 blkCnt = (numPoints) & 0x3;
330
331 /* Calculate convolution for remaining samples of Bigger length sequence */
332 while (blkCnt > 0)
333 {
334 /* Initialze temporary scratch pointer as scratch1 */
335 pScr1 = pScratch1;
336
337 /* Clear Accumlators */
338 acc0 = 0;
339
340 tapCnt = (srcBLen) >> 1U;
341
342 while (tapCnt > 0U)
343 {
344
345 /* Read next two samples from scratch1 buffer */
346 x1 = read_q15x2_ia (&pScr1);
347
348 /* Read two samples from smaller buffer */
349 y1 = read_q15x2_ia (&pScr2);
350
351 acc0 = __SMLAD(x1, y1, acc0);
352
353 /* Decrement the loop counter */
354 tapCnt--;
355 }
356
357 tapCnt = (srcBLen) & 1U;
358
359 /* apply same above for remaining samples of smaller length sequence */
360 while (tapCnt > 0U)
361 {
362
363 /* accumulate the results */
364 acc0 += (*pScr1++ * *pScr2++);
365
366 /* Decrement loop counter */
367 tapCnt--;
368 }
369
370 blkCnt--;
371
372 /* Store the result in the accumulator in the destination buffer. */
373 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
374
375 /* Initialization of inputB pointer */
376 pScr2 = py;
377
378 pScratch1 += 1U;
379 }
380
381 /* Set status as ARM_MATH_SUCCESS */
382 status = ARM_MATH_SUCCESS;
383 }
384
385 return (status);
386 }
387
388 /**
389 @} end of PartialConv group
390 */
391