1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_correlate_opt_q7.c
4 * Description: Correlation of Q7 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Corr
37 @{
38 */
39
40 /**
41 @brief Correlation of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
47 @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48 @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49
50 @par Scaling and Overflow Behavior
51 The function is implemented using a 32-bit internal accumulator.
52 Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
53 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
54 This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
55 The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
56 */
57
arm_correlate_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,q15_t * pScratch1,q15_t * pScratch2)58 ARM_DSP_ATTRIBUTE void arm_correlate_opt_q7(
59 const q7_t * pSrcA,
60 uint32_t srcALen,
61 const q7_t * pSrcB,
62 uint32_t srcBLen,
63 q7_t * pDst,
64 q15_t * pScratch1,
65 q15_t * pScratch2)
66 {
67 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
68 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
69 q15_t x4; /* Temporary input variable */
70 q15_t *py; /* Temporary input2 pointer */
71 q31_t acc0, acc1, acc2, acc3; /* Accumulators */
72 const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
73 uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
74 int32_t inc = 1; /* Output pointer increment */
75 uint32_t outBlockSize; /* Loop counter */
76 q31_t x1, x2, x3, y1; /* Temporary input variables */
77 q7_t *pOut = pDst; /* Output pointer */
78
79 /* The algorithm implementation is based on the lengths of the inputs. */
80 /* srcB is always made to slide across srcA. */
81 /* So srcBLen is always considered as shorter or equal to srcALen */
82 /* But CORR(x, y) is reverse of CORR(y, x) */
83 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
84 /* and the destination pointer modifier, inc is set to -1 */
85 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
86 /* But to improve the performance,
87 * we include zeroes in the output instead of zero padding either of the the inputs*/
88 /* If srcALen > srcBLen,
89 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
90 /* If srcALen < srcBLen,
91 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
92 if (srcALen >= srcBLen)
93 {
94 /* Initialization of inputA pointer */
95 pIn1 = pSrcA;
96
97 /* Initialization of inputB pointer */
98 pIn2 = pSrcB;
99
100 /* Number of output samples is calculated */
101 outBlockSize = (srcALen * 2U) - 1U;
102
103 /* When srcALen > srcBLen, zero padding is done to srcB
104 * to make their lengths equal.
105 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
106 * number of output samples are made zero */
107 j = outBlockSize - (srcALen + (srcBLen - 1U));
108
109 /* Updating the pointer position to non zero value */
110 pOut += j;
111 }
112 else
113 {
114 /* Initialization of inputA pointer */
115 pIn1 = pSrcB;
116
117 /* Initialization of inputB pointer */
118 pIn2 = pSrcA;
119
120 /* srcBLen is always considered as shorter or equal to srcALen */
121 j = srcBLen;
122 srcBLen = srcALen;
123 srcALen = j;
124
125 /* CORR(x, y) = Reverse order(CORR(y, x)) */
126 /* Hence set the destination pointer to point to the last output sample */
127 pOut = pDst + ((srcALen + srcBLen) - 2U);
128
129 /* Destination address modifier is set to -1 */
130 inc = -1;
131 }
132
133
134 /* Copy (srcBLen) samples in scratch buffer */
135 k = srcBLen >> 2U;
136
137 /* First part of the processing with loop unrolling copies 4 data points at a time.
138 a second loop below copies for the remaining 1 to 3 samples. */
139 while (k > 0U)
140 {
141 /* copy second buffer in reversal manner */
142 x4 = (q15_t) *pIn2++;
143 *pScr2++ = x4;
144 x4 = (q15_t) *pIn2++;
145 *pScr2++ = x4;
146 x4 = (q15_t) *pIn2++;
147 *pScr2++ = x4;
148 x4 = (q15_t) *pIn2++;
149 *pScr2++ = x4;
150
151 /* Decrement loop counter */
152 k--;
153 }
154
155 /* If the count is not a multiple of 4, copy remaining samples here.
156 No loop unrolling is used. */
157 k = srcBLen % 0x4U;
158
159 while (k > 0U)
160 {
161 /* copy second buffer in reversal manner for remaining samples */
162 x4 = (q15_t) *pIn2++;
163 *pScr2++ = x4;
164
165 /* Decrement loop counter */
166 k--;
167 }
168
169 /* Fill (srcBLen - 1U) zeros in scratch buffer */
170 arm_fill_q15(0, pScr1, (srcBLen - 1U));
171
172 /* Update temporary scratch pointer */
173 pScr1 += (srcBLen - 1U);
174
175 /* Copy (srcALen) samples in scratch buffer */
176 /* Apply loop unrolling and do 4 Copies simultaneously. */
177 k = srcALen >> 2U;
178
179 /* First part of the processing with loop unrolling copies 4 data points at a time.
180 a second loop below copies for the remaining 1 to 3 samples. */
181 while (k > 0U)
182 {
183 /* copy second buffer in reversal manner */
184 x4 = (q15_t) *pIn1++;
185 *pScr1++ = x4;
186 x4 = (q15_t) *pIn1++;
187 *pScr1++ = x4;
188 x4 = (q15_t) *pIn1++;
189 *pScr1++ = x4;
190 x4 = (q15_t) *pIn1++;
191 *pScr1++ = x4;
192
193 /* Decrement loop counter */
194 k--;
195 }
196
197 /* If the count is not a multiple of 4, copy remaining samples here.
198 No loop unrolling is used. */
199 k = srcALen % 0x4U;
200
201 while (k > 0U)
202 {
203 /* copy second buffer in reversal manner for remaining samples */
204 x4 = (q15_t) * pIn1++;
205 *pScr1++ = x4;
206
207 /* Decrement the loop counter */
208 k--;
209 }
210
211 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
212 arm_fill_q15(0, pScr1, (srcBLen - 1U));
213
214 /* Update pointer */
215 pScr1 += (srcBLen - 1U);
216
217 /* Temporary pointer for scratch2 */
218 py = pScratch2;
219
220 /* Initialization of pScr2 pointer */
221 pScr2 = pScratch2;
222
223 /* Actual correlation process starts here */
224 blkCnt = (srcALen + srcBLen - 1U) >> 2;
225
226 while (blkCnt > 0)
227 {
228 /* Initialze temporary scratch pointer as scratch1 */
229 pScr1 = pScratch1;
230
231 /* Clear Accumlators */
232 acc0 = 0;
233 acc1 = 0;
234 acc2 = 0;
235 acc3 = 0;
236
237 /* Read two samples from scratch1 buffer */
238 x1 = read_q15x2_ia (&pScr1);
239
240 /* Read next two samples from scratch1 buffer */
241 x2 = read_q15x2_ia (&pScr1);
242
243 tapCnt = (srcBLen) >> 2U;
244
245 while (tapCnt > 0U)
246 {
247 /* Read four samples from smaller buffer */
248 y1 = read_q15x2_ia (&pScr2);
249
250 /* multiply and accumulate */
251 acc0 = __SMLAD(x1, y1, acc0);
252 acc2 = __SMLAD(x2, y1, acc2);
253
254 /* pack input data */
255 #ifndef ARM_MATH_BIG_ENDIAN
256 x3 = __PKHBT(x2, x1, 0);
257 #else
258 x3 = __PKHBT(x1, x2, 0);
259 #endif
260
261 /* multiply and accumulate */
262 acc1 = __SMLADX(x3, y1, acc1);
263
264 /* Read next two samples from scratch1 buffer */
265 x1 = read_q15x2_ia (&pScr1);
266
267 /* pack input data */
268 #ifndef ARM_MATH_BIG_ENDIAN
269 x3 = __PKHBT(x1, x2, 0);
270 #else
271 x3 = __PKHBT(x2, x1, 0);
272 #endif
273
274 acc3 = __SMLADX(x3, y1, acc3);
275
276 /* Read four samples from smaller buffer */
277 y1 = read_q15x2_ia (&pScr2);
278
279 acc0 = __SMLAD(x2, y1, acc0);
280
281 acc2 = __SMLAD(x1, y1, acc2);
282
283 acc1 = __SMLADX(x3, y1, acc1);
284
285 x2 = read_q15x2_ia (&pScr1);
286
287 #ifndef ARM_MATH_BIG_ENDIAN
288 x3 = __PKHBT(x2, x1, 0);
289 #else
290 x3 = __PKHBT(x1, x2, 0);
291 #endif
292
293 acc3 = __SMLADX(x3, y1, acc3);
294
295 /* Decrement loop counter */
296 tapCnt--;
297 }
298
299 /* Update scratch pointer for remaining samples of smaller length sequence */
300 pScr1 -= 4U;
301
302 /* apply same above for remaining samples of smaller length sequence */
303 tapCnt = (srcBLen) & 3U;
304
305 while (tapCnt > 0U)
306 {
307 /* accumulate the results */
308 acc0 += (*pScr1++ * *pScr2);
309 acc1 += (*pScr1++ * *pScr2);
310 acc2 += (*pScr1++ * *pScr2);
311 acc3 += (*pScr1++ * *pScr2++);
312
313 pScr1 -= 3U;
314
315 /* Decrement loop counter */
316 tapCnt--;
317 }
318
319 blkCnt--;
320
321 /* Store the result in the accumulator in the destination buffer. */
322 *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
323 pOut += inc;
324 *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8));
325 pOut += inc;
326 *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8));
327 pOut += inc;
328 *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8));
329 pOut += inc;
330
331 /* Initialization of inputB pointer */
332 pScr2 = py;
333
334 pScratch1 += 4U;
335 }
336
337 blkCnt = (srcALen + srcBLen - 1U) & 0x3;
338
339 /* Calculate correlation for remaining samples of Bigger length sequence */
340 while (blkCnt > 0)
341 {
342 /* Initialze temporary scratch pointer as scratch1 */
343 pScr1 = pScratch1;
344
345 /* Clear Accumlators */
346 acc0 = 0;
347
348 tapCnt = (srcBLen) >> 1U;
349
350 while (tapCnt > 0U)
351 {
352 acc0 += (*pScr1++ * *pScr2++);
353 acc0 += (*pScr1++ * *pScr2++);
354
355 /* Decrement loop counter */
356 tapCnt--;
357 }
358
359 tapCnt = (srcBLen) & 1U;
360
361 /* apply same above for remaining samples of smaller length sequence */
362 while (tapCnt > 0U)
363 {
364 /* accumulate the results */
365 acc0 += (*pScr1++ * *pScr2++);
366
367 /* Decrement loop counter */
368 tapCnt--;
369 }
370
371 blkCnt--;
372
373 /* Store the result in the accumulator in the destination buffer. */
374 *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
375 pOut += inc;
376
377 /* Initialization of inputB pointer */
378 pScr2 = py;
379
380 pScratch1 += 1U;
381 }
382
383 }
384
385 /**
386 @} end of Corr group
387 */
388