1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_opt_q7.c
4 * Description: Convolution of Q7 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47 @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48 @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49
50 @par Scaling and Overflow Behavior
51 The function is implemented using a 32-bit internal accumulator.
52 Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
53 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
54 This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
55 The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
56 */
57
arm_conv_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,q15_t * pScratch1,q15_t * pScratch2)58 ARM_DSP_ATTRIBUTE void arm_conv_opt_q7(
59 const q7_t * pSrcA,
60 uint32_t srcALen,
61 const q7_t * pSrcB,
62 uint32_t srcBLen,
63 q7_t * pDst,
64 q15_t * pScratch1,
65 q15_t * pScratch2)
66 {
67 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
68 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
69 q15_t x4; /* Temporary input variable */
70 q15_t *py; /* Temporary input2 pointer */
71 q31_t acc0, acc1, acc2, acc3; /* Accumulators */
72 const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
73 uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
74 q31_t x1, x2, x3, y1; /* Temporary input variables */
75 const q7_t *px; /* Temporary input1 pointer */
76 q7_t *pOut = pDst; /* Output pointer */
77 q7_t out0, out1, out2, out3; /* Temporary variables */
78
79 /* The algorithm implementation is based on the lengths of the inputs. */
80 /* srcB is always made to slide across srcA. */
81 /* So srcBLen is always considered as shorter or equal to srcALen */
82 if (srcALen >= srcBLen)
83 {
84 /* Initialization of inputA pointer */
85 pIn1 = pSrcA;
86
87 /* Initialization of inputB pointer */
88 pIn2 = pSrcB;
89 }
90 else
91 {
92 /* Initialization of inputA pointer */
93 pIn1 = pSrcB;
94
95 /* Initialization of inputB pointer */
96 pIn2 = pSrcA;
97
98 /* srcBLen is always considered as shorter or equal to srcALen */
99 j = srcBLen;
100 srcBLen = srcALen;
101 srcALen = j;
102 }
103
104 /* points to smaller length sequence */
105 px = pIn2 + srcBLen - 1;
106
107 /* Apply loop unrolling and do 4 Copies simultaneously. */
108 k = srcBLen >> 2U;
109
110 /* First part of the processing with loop unrolling copies 4 data points at a time.
111 ** a second loop below copies for the remaining 1 to 3 samples. */
112 while (k > 0U)
113 {
114 /* copy second buffer in reversal manner */
115 x4 = (q15_t) *px--;
116 *pScr2++ = x4;
117 x4 = (q15_t) *px--;
118 *pScr2++ = x4;
119 x4 = (q15_t) *px--;
120 *pScr2++ = x4;
121 x4 = (q15_t) *px--;
122 *pScr2++ = x4;
123
124 /* Decrement loop counter */
125 k--;
126 }
127
128 /* If the count is not a multiple of 4, copy remaining samples here.
129 ** No loop unrolling is used. */
130 k = srcBLen % 0x4U;
131
132 while (k > 0U)
133 {
134 /* copy second buffer in reversal manner for remaining samples */
135 x4 = (q15_t) *px--;
136 *pScr2++ = x4;
137
138 /* Decrement loop counter */
139 k--;
140 }
141
142 /* Fill (srcBLen - 1U) zeros in scratch buffer */
143 arm_fill_q15(0, pScr1, (srcBLen - 1U));
144
145 /* Update temporary scratch pointer */
146 pScr1 += (srcBLen - 1U);
147
148 /* Copy (srcALen) samples in scratch buffer */
149 /* Apply loop unrolling and do 4 Copies simultaneously. */
150 k = srcALen >> 2U;
151
152 /* First part of the processing with loop unrolling copies 4 data points at a time.
153 ** a second loop below copies for the remaining 1 to 3 samples. */
154 while (k > 0U)
155 {
156 /* copy second buffer in reversal manner */
157 x4 = (q15_t) *pIn1++;
158 *pScr1++ = x4;
159 x4 = (q15_t) *pIn1++;
160 *pScr1++ = x4;
161 x4 = (q15_t) *pIn1++;
162 *pScr1++ = x4;
163 x4 = (q15_t) *pIn1++;
164 *pScr1++ = x4;
165
166 /* Decrement loop counter */
167 k--;
168 }
169
170 /* If the count is not a multiple of 4, copy remaining samples here.
171 ** No loop unrolling is used. */
172 k = srcALen % 0x4U;
173
174 while (k > 0U)
175 {
176 /* copy second buffer in reversal manner for remaining samples */
177 x4 = (q15_t) * pIn1++;
178 *pScr1++ = x4;
179
180 /* Decrement the loop counter */
181 k--;
182 }
183
184 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
185 arm_fill_q15(0, pScr1, (srcBLen - 1U));
186
187 /* Update pointer */
188 pScr1 += (srcBLen - 1U);
189
190 /* Temporary pointer for scratch2 */
191 py = pScratch2;
192
193 /* Initialization of pIn2 pointer */
194 pIn2 = (q7_t *) py;
195
196 pScr2 = py;
197
198 /* Actual convolution process starts here */
199 blkCnt = (srcALen + srcBLen - 1U) >> 2U;
200
201 while (blkCnt > 0)
202 {
203 /* Initialze temporary scratch pointer as scratch1 */
204 pScr1 = pScratch1;
205
206 /* Clear Accumlators */
207 acc0 = 0;
208 acc1 = 0;
209 acc2 = 0;
210 acc3 = 0;
211
212 /* Read two samples from scratch1 buffer */
213 x1 = read_q15x2_ia (&pScr1);
214
215 /* Read next two samples from scratch1 buffer */
216 x2 = read_q15x2_ia (&pScr1);
217
218 tapCnt = (srcBLen) >> 2U;
219
220 while (tapCnt > 0U)
221 {
222 /* Read four samples from smaller buffer */
223 y1 = read_q15x2_ia (&pScr2);
224
225 /* multiply and accumulate */
226 acc0 = __SMLAD(x1, y1, acc0);
227 acc2 = __SMLAD(x2, y1, acc2);
228
229 /* pack input data */
230 #ifndef ARM_MATH_BIG_ENDIAN
231 x3 = __PKHBT(x2, x1, 0);
232 #else
233 x3 = __PKHBT(x1, x2, 0);
234 #endif
235
236 /* multiply and accumulate */
237 acc1 = __SMLADX(x3, y1, acc1);
238
239 /* Read next two samples from scratch1 buffer */
240 x1 = read_q15x2_ia (&pScr1);
241
242 /* pack input data */
243 #ifndef ARM_MATH_BIG_ENDIAN
244 x3 = __PKHBT(x1, x2, 0);
245 #else
246 x3 = __PKHBT(x2, x1, 0);
247 #endif
248
249 acc3 = __SMLADX(x3, y1, acc3);
250
251 /* Read four samples from smaller buffer */
252 y1 = read_q15x2_ia (&pScr2);
253
254 acc0 = __SMLAD(x2, y1, acc0);
255
256 acc2 = __SMLAD(x1, y1, acc2);
257
258 acc1 = __SMLADX(x3, y1, acc1);
259
260 x2 = read_q15x2_ia (&pScr1);
261
262 #ifndef ARM_MATH_BIG_ENDIAN
263 x3 = __PKHBT(x2, x1, 0);
264 #else
265 x3 = __PKHBT(x1, x2, 0);
266 #endif
267
268 acc3 = __SMLADX(x3, y1, acc3);
269
270 /* Decrement loop counter */
271 tapCnt--;
272 }
273
274 /* Update scratch pointer for remaining samples of smaller length sequence */
275 pScr1 -= 4U;
276
277 /* apply same above for remaining samples of smaller length sequence */
278 tapCnt = (srcBLen) & 3U;
279
280 while (tapCnt > 0U)
281 {
282 /* accumulate the results */
283 acc0 += (*pScr1++ * *pScr2);
284 acc1 += (*pScr1++ * *pScr2);
285 acc2 += (*pScr1++ * *pScr2);
286 acc3 += (*pScr1++ * *pScr2++);
287
288 pScr1 -= 3U;
289
290 /* Decrement loop counter */
291 tapCnt--;
292 }
293
294 blkCnt--;
295
296 /* Store the result in the accumulator in the destination buffer. */
297 out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
298 out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
299 out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
300 out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
301
302 write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
303
304 /* Initialization of inputB pointer */
305 pScr2 = py;
306
307 pScratch1 += 4U;
308 }
309
310 blkCnt = (srcALen + srcBLen - 1U) & 0x3;
311
312 /* Calculate convolution for remaining samples of Bigger length sequence */
313 while (blkCnt > 0)
314 {
315 /* Initialze temporary scratch pointer as scratch1 */
316 pScr1 = pScratch1;
317
318 /* Clear Accumlators */
319 acc0 = 0;
320
321 tapCnt = (srcBLen) >> 1U;
322
323 while (tapCnt > 0U)
324 {
325 acc0 += (*pScr1++ * *pScr2++);
326 acc0 += (*pScr1++ * *pScr2++);
327
328 /* Decrement loop counter */
329 tapCnt--;
330 }
331
332 tapCnt = (srcBLen) & 1U;
333
334 /* apply same above for remaining samples of smaller length sequence */
335 while (tapCnt > 0U)
336 {
337 /* accumulate the results */
338 acc0 += (*pScr1++ * *pScr2++);
339
340 /* Decrement loop counter */
341 tapCnt--;
342 }
343
344 blkCnt--;
345
346 /* Store the result in the accumulator in the destination buffer. */
347 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
348
349 /* Initialization of inputB pointer */
350 pScr2 = py;
351
352 pScratch1 += 1U;
353 }
354
355 }
356
357 /**
358 @} end of Conv group
359 */
360