1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_opt_q7.c
4  * Description:  Correlation of Q7 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q7 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47   @param[in]     pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48   @param[in]     pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49 
50   @par           Scaling and Overflow Behavior
51                    The function is implemented using a 32-bit internal accumulator.
52                    Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
53                    The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
54                    This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
55                    The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
56  */
57 
arm_correlate_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,q15_t * pScratch1,q15_t * pScratch2)58 ARM_DSP_ATTRIBUTE void arm_correlate_opt_q7(
59   const q7_t * pSrcA,
60         uint32_t srcALen,
61   const q7_t * pSrcB,
62         uint32_t srcBLen,
63         q7_t * pDst,
64         q15_t * pScratch1,
65         q15_t * pScratch2)
66 {
67         q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch */
68         q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch */
69         q15_t x4;                                      /* Temporary input variable */
70         q15_t *py;                                     /* Temporary input2 pointer */
71         q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
72   const q7_t *pIn1, *pIn2;                             /* InputA and inputB pointer */
73         uint32_t j, k, blkCnt, tapCnt;                 /* Loop counter */
74         int32_t inc = 1;                               /* Output pointer increment */
75         uint32_t outBlockSize;                         /* Loop counter */
76         q31_t x1, x2, x3, y1;                          /* Temporary input variables */
77         q7_t *pOut = pDst;                             /* Output pointer */
78 
79   /* The algorithm implementation is based on the lengths of the inputs. */
80   /* srcB is always made to slide across srcA. */
81   /* So srcBLen is always considered as shorter or equal to srcALen */
82   /* But CORR(x, y) is reverse of CORR(y, x) */
83   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
84   /* and the destination pointer modifier, inc is set to -1 */
85   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
86   /* But to improve the performance,
87    * we include zeroes in the output instead of zero padding either of the the inputs*/
88   /* If srcALen > srcBLen,
89    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
90   /* If srcALen < srcBLen,
91    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
92   if (srcALen >= srcBLen)
93   {
94     /* Initialization of inputA pointer */
95     pIn1 = pSrcA;
96 
97     /* Initialization of inputB pointer */
98     pIn2 = pSrcB;
99 
100     /* Number of output samples is calculated */
101     outBlockSize = (srcALen * 2U) - 1U;
102 
103     /* When srcALen > srcBLen, zero padding is done to srcB
104      * to make their lengths equal.
105      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
106      * number of output samples are made zero */
107     j = outBlockSize - (srcALen + (srcBLen - 1U));
108 
109     /* Updating the pointer position to non zero value */
110     pOut += j;
111   }
112   else
113   {
114     /* Initialization of inputA pointer */
115     pIn1 = pSrcB;
116 
117     /* Initialization of inputB pointer */
118     pIn2 = pSrcA;
119 
120     /* srcBLen is always considered as shorter or equal to srcALen */
121     j = srcBLen;
122     srcBLen = srcALen;
123     srcALen = j;
124 
125     /* CORR(x, y) = Reverse order(CORR(y, x)) */
126     /* Hence set the destination pointer to point to the last output sample */
127     pOut = pDst + ((srcALen + srcBLen) - 2U);
128 
129     /* Destination address modifier is set to -1 */
130     inc = -1;
131   }
132 
133 
134   /* Copy (srcBLen) samples in scratch buffer */
135   k = srcBLen >> 2U;
136 
137   /* First part of the processing with loop unrolling copies 4 data points at a time.
138      a second loop below copies for the remaining 1 to 3 samples. */
139   while (k > 0U)
140   {
141     /* copy second buffer in reversal manner */
142     x4 = (q15_t) *pIn2++;
143     *pScr2++ = x4;
144     x4 = (q15_t) *pIn2++;
145     *pScr2++ = x4;
146     x4 = (q15_t) *pIn2++;
147     *pScr2++ = x4;
148     x4 = (q15_t) *pIn2++;
149     *pScr2++ = x4;
150 
151     /* Decrement loop counter */
152     k--;
153   }
154 
155   /* If the count is not a multiple of 4, copy remaining samples here.
156      No loop unrolling is used. */
157   k = srcBLen % 0x4U;
158 
159   while (k > 0U)
160   {
161     /* copy second buffer in reversal manner for remaining samples */
162     x4 = (q15_t) *pIn2++;
163     *pScr2++ = x4;
164 
165     /* Decrement loop counter */
166     k--;
167   }
168 
169   /* Fill (srcBLen - 1U) zeros in scratch buffer */
170   arm_fill_q15(0, pScr1, (srcBLen - 1U));
171 
172   /* Update temporary scratch pointer */
173   pScr1 += (srcBLen - 1U);
174 
175   /* Copy (srcALen) samples in scratch buffer */
176   /* Apply loop unrolling and do 4 Copies simultaneously. */
177   k = srcALen >> 2U;
178 
179   /* First part of the processing with loop unrolling copies 4 data points at a time.
180      a second loop below copies for the remaining 1 to 3 samples. */
181   while (k > 0U)
182   {
183     /* copy second buffer in reversal manner */
184     x4 = (q15_t) *pIn1++;
185     *pScr1++ = x4;
186     x4 = (q15_t) *pIn1++;
187     *pScr1++ = x4;
188     x4 = (q15_t) *pIn1++;
189     *pScr1++ = x4;
190     x4 = (q15_t) *pIn1++;
191     *pScr1++ = x4;
192 
193     /* Decrement loop counter */
194     k--;
195   }
196 
197   /* If the count is not a multiple of 4, copy remaining samples here.
198      No loop unrolling is used. */
199   k = srcALen % 0x4U;
200 
201   while (k > 0U)
202   {
203     /* copy second buffer in reversal manner for remaining samples */
204     x4 = (q15_t) * pIn1++;
205     *pScr1++ = x4;
206 
207     /* Decrement the loop counter */
208     k--;
209   }
210 
211   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
212   arm_fill_q15(0, pScr1, (srcBLen - 1U));
213 
214   /* Update pointer */
215   pScr1 += (srcBLen - 1U);
216 
217   /* Temporary pointer for scratch2 */
218   py = pScratch2;
219 
220   /* Initialization of pScr2 pointer */
221   pScr2 = pScratch2;
222 
223   /* Actual correlation process starts here */
224   blkCnt = (srcALen + srcBLen - 1U) >> 2;
225 
226   while (blkCnt > 0)
227   {
228     /* Initialze temporary scratch pointer as scratch1 */
229     pScr1 = pScratch1;
230 
231     /* Clear Accumlators */
232     acc0 = 0;
233     acc1 = 0;
234     acc2 = 0;
235     acc3 = 0;
236 
237     /* Read two samples from scratch1 buffer */
238     x1 = read_q15x2_ia (&pScr1);
239 
240     /* Read next two samples from scratch1 buffer */
241     x2 = read_q15x2_ia (&pScr1);
242 
243     tapCnt = (srcBLen) >> 2U;
244 
245     while (tapCnt > 0U)
246     {
247       /* Read four samples from smaller buffer */
248       y1 = read_q15x2_ia (&pScr2);
249 
250       /* multiply and accumulate */
251       acc0 = __SMLAD(x1, y1, acc0);
252       acc2 = __SMLAD(x2, y1, acc2);
253 
254       /* pack input data */
255 #ifndef ARM_MATH_BIG_ENDIAN
256       x3 = __PKHBT(x2, x1, 0);
257 #else
258       x3 = __PKHBT(x1, x2, 0);
259 #endif
260 
261       /* multiply and accumulate */
262       acc1 = __SMLADX(x3, y1, acc1);
263 
264       /* Read next two samples from scratch1 buffer */
265       x1 = read_q15x2_ia (&pScr1);
266 
267       /* pack input data */
268 #ifndef ARM_MATH_BIG_ENDIAN
269       x3 = __PKHBT(x1, x2, 0);
270 #else
271       x3 = __PKHBT(x2, x1, 0);
272 #endif
273 
274       acc3 = __SMLADX(x3, y1, acc3);
275 
276       /* Read four samples from smaller buffer */
277       y1 = read_q15x2_ia (&pScr2);
278 
279       acc0 = __SMLAD(x2, y1, acc0);
280 
281       acc2 = __SMLAD(x1, y1, acc2);
282 
283       acc1 = __SMLADX(x3, y1, acc1);
284 
285       x2 = read_q15x2_ia (&pScr1);
286 
287 #ifndef ARM_MATH_BIG_ENDIAN
288       x3 = __PKHBT(x2, x1, 0);
289 #else
290       x3 = __PKHBT(x1, x2, 0);
291 #endif
292 
293       acc3 = __SMLADX(x3, y1, acc3);
294 
295       /* Decrement loop counter */
296       tapCnt--;
297     }
298 
299     /* Update scratch pointer for remaining samples of smaller length sequence */
300     pScr1 -= 4U;
301 
302     /* apply same above for remaining samples of smaller length sequence */
303     tapCnt = (srcBLen) & 3U;
304 
305     while (tapCnt > 0U)
306     {
307       /* accumulate the results */
308       acc0 += (*pScr1++ * *pScr2);
309       acc1 += (*pScr1++ * *pScr2);
310       acc2 += (*pScr1++ * *pScr2);
311       acc3 += (*pScr1++ * *pScr2++);
312 
313       pScr1 -= 3U;
314 
315       /* Decrement loop counter */
316       tapCnt--;
317     }
318 
319     blkCnt--;
320 
321     /* Store the result in the accumulator in the destination buffer. */
322     *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
323     pOut += inc;
324     *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8));
325     pOut += inc;
326     *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8));
327     pOut += inc;
328     *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8));
329     pOut += inc;
330 
331     /* Initialization of inputB pointer */
332     pScr2 = py;
333 
334     pScratch1 += 4U;
335   }
336 
337   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
338 
339   /* Calculate correlation for remaining samples of Bigger length sequence */
340   while (blkCnt > 0)
341   {
342     /* Initialze temporary scratch pointer as scratch1 */
343     pScr1 = pScratch1;
344 
345     /* Clear Accumlators */
346     acc0 = 0;
347 
348     tapCnt = (srcBLen) >> 1U;
349 
350     while (tapCnt > 0U)
351     {
352       acc0 += (*pScr1++ * *pScr2++);
353       acc0 += (*pScr1++ * *pScr2++);
354 
355       /* Decrement loop counter */
356       tapCnt--;
357     }
358 
359     tapCnt = (srcBLen) & 1U;
360 
361     /* apply same above for remaining samples of smaller length sequence */
362     while (tapCnt > 0U)
363     {
364       /* accumulate the results */
365       acc0 += (*pScr1++ * *pScr2++);
366 
367       /* Decrement loop counter */
368       tapCnt--;
369     }
370 
371     blkCnt--;
372 
373     /* Store the result in the accumulator in the destination buffer. */
374     *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
375     pOut += inc;
376 
377     /* Initialization of inputB pointer */
378     pScr2 = py;
379 
380     pScratch1 += 1U;
381   }
382 
383 }
384 
385 /**
386   @} end of Corr group
387  */
388