/******************************************************************************
 * @file     arm_vec_filtering.h
 * @brief    Private header file for CMSIS DSP Library
 * @version  V1.7.0
 * @date     30. October 2019
 ******************************************************************************/
/*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_VEC_FILTERING_H_
#define _ARM_VEC_FILTERING_H_

#include "arm_math.h"
#include "arm_helium_utils.h"

#ifdef   __cplusplus
extern "C"
{
#endif

#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)

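/*
 * Correlation, four lags in parallel, fixed size:
 *   accJ = sum of pX[n + J] * pY[n] for n = 0 .. count-1, J = 0..3.
 * The last (count % 4) samples are handled with tail predication and the
 * scalar results are written to acc0..acc3 via vecAddAcrossF32Mve.
 * pX and pY are read through local copies and are not modified.
 */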
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F32(acc0, acc1, acc2, acc3, pX, pY, count)\
{                                                                                     \
    float32_t const *pSrcX, *pSrcY;                                                   \
    f32x4_t   acc0Vec, acc1Vec, acc2Vec, acc3Vec, xVec, yVec;                         \
    uint32_t    k;                                                                    \
                                                                                      \
    acc0Vec = vdupq_n_f32(0.0f);                                                      \
    acc1Vec = vdupq_n_f32(0.0f);                                                      \
    acc2Vec = vdupq_n_f32(0.0f);                                                      \
    acc3Vec = vdupq_n_f32(0.0f);                                                      \
    pSrcX = (float32_t const *) pX;                                                   \
    pSrcY = (float32_t const *) pY;                                                   \
    k = count >> 2;                                                                   \
                                                                                      \
    while (k > 0U)                                                                    \
    {                                                                                 \
        yVec = vld1q(pSrcY);                                                          \
        pSrcY += 4;                                                                   \
        xVec = vldrwq_f32(&pSrcX[1]);                                                 \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                     \
        xVec = vldrwq_f32(&pSrcX[2]);                                                 \
        acc2Vec = vfmaq_f32(acc2Vec, xVec, yVec);                                     \
        xVec = vldrwq_f32(&pSrcX[3]);                                                 \
        acc3Vec = vfmaq_f32(acc3Vec, xVec, yVec);                                     \
        xVec = vld1q(pSrcX);                                                          \
        pSrcX += 4;                                                                   \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                     \
        /*  Decrement the loop counter   */                                           \
        k--;                                                                          \
    }                                                                                 \
    /* loop + tail predication expected here  */                                      \
    k = count % 0x4U;                                                                 \
    if (k > 0U)                                                                       \
    {                                                                                 \
        mve_pred16_t p0 = vctp32q(k);                                                 \
        yVec = vld1q(pSrcY);                                                          \
        pSrcY += 4;                                                                   \
        xVec = vldrwq_f32(&pSrcX[1]);                                                 \
        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                               \
        xVec = vldrwq_f32(&pSrcX[2]);                                                 \
        acc2Vec = vfmaq_m_f32(acc2Vec, xVec, yVec, p0);                               \
        xVec = vldrwq_f32(&pSrcX[3]);                                                 \
        acc3Vec = vfmaq_m_f32(acc3Vec, xVec, yVec, p0);                               \
        xVec = vld1q(pSrcX);                                                          \
        pSrcX += 4;                                                                   \
        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                               \
    }                                                                                 \
                                                                                      \
    acc0 = vecAddAcrossF32Mve(acc0Vec);                                               \
    acc1 = vecAddAcrossF32Mve(acc1Vec);                                               \
    acc2 = vecAddAcrossF32Mve(acc2Vec);                                               \
    acc3 = vecAddAcrossF32Mve(acc3Vec);                                               \
}

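/*
 * Single correlation accumulator:
 *   acc = sum of pX[n] * pY[n] for n = 0 .. count-1,
 * with tail predication for the last (count % 4) samples.
 */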
#define MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count) \
{                                                    \
    float32_t const *pSrcX, *pSrcY;                  \
    f32x4_t   accVec, xVec, yVec;                    \
    uint32_t    k;                                   \
                                                     \
    accVec = vdupq_n_f32(0.0f);                      \
    pSrcX = (float32_t const *) pX;                  \
    pSrcY = (float32_t const *) pY;                  \
    k = count >> 2;                                  \
                                                     \
    while (k > 0U)                                   \
    {                                                \
        yVec = vld1q(pSrcY);                         \
        pSrcY += 4;                                  \
        xVec = vld1q(pSrcX);                         \
        pSrcX += 4;                                  \
        accVec = vfmaq_f32(accVec, xVec, yVec);      \
        /*  Decrement the loop counter   */          \
        k--;                                         \
    }                                                \
    /* Loop with tail predication expected here  */  \
    k = count % 0x4U;                                \
    if (k > 0U)                                      \
    {                                                \
        mve_pred16_t p0 = vctp32q(k);                \
        yVec = vld1q(pSrcY);                         \
        pSrcY += 4;                                  \
        xVec = vld1q(pSrcX);                         \
        pSrcX += 4;                                  \
        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);\
    }                                                \
    acc = vecAddAcrossF32Mve(accVec);                \
}

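/*
 * Dual correlation accumulators, decreasing size:
 *   acc0 = sum of pX[n]     * pY[n] for n = 0 .. count-1   (count samples)
 *   acc1 = sum of pX[n + 1] * pY[n] for n = 0 .. count-2   (count-1 samples)
 * The final MACs are predicated so that acc0 takes one lane more than acc1.
 */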
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)\
{                                                                       \
    float32_t const *pSrcX, *pSrcY;                                     \
    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                             \
    uint32_t    k;                                                      \
                                                                        \
    acc0Vec = vdupq_n_f32(0.0f);                                        \
    acc1Vec = vdupq_n_f32(0.0f);                                        \
    pSrcX = (float32_t const *) pX;                                     \
    pSrcY = (float32_t const *) pY;                                     \
    k = (count-1) >> 2;                                                 \
                                                                        \
    while (k > 0U)                                                      \
    {                                                                   \
        yVec = vld1q(pSrcY);                                            \
        pSrcY += 4;                                                     \
        xVec = vldrwq_f32(&pSrcX[1]);                                   \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                       \
        xVec = vld1q(pSrcX);                                            \
        pSrcX += 4;                                                     \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                       \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc1 requires the exact number of samples (count-1) */           \
    /* disable extra lanes in the final MAC computation */              \
    k = (count-1) % 0x4U;                                               \
    mve_pred16_t p0 = vctp32q(k);                                       \
    yVec = vld1q(pSrcY);                                                \
    pSrcY += 4;                                                         \
    xVec = vldrwq_f32(&pSrcX[1]);                                       \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                     \
    /* acc0 requires 1 additional sample (count) */                     \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k+1);                                                  \
    xVec = vld1q(pSrcX);                                                \
    pSrcX += 4;                                                         \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                     \
                                                                        \
    acc0 = vecAddAcrossF32Mve(acc0Vec);                                 \
    acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
}

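/*
 * Dual correlation accumulators, fixed size:
 *   acc0 = sum of pX[n]     * pY[n] for n = 0 .. count-1
 *   acc1 = sum of pX[n + 1] * pY[n] for n = 0 .. count-1
 */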
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)\
{                                                                         \
    float32_t const *pSrcX, *pSrcY;                                       \
    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                               \
    uint32_t    k;                                                        \
                                                                          \
    acc0Vec = vdupq_n_f32(0.0f);                                          \
    acc1Vec = vdupq_n_f32(0.0f);                                          \
    pSrcX = (float32_t const *) pX;                                       \
    pSrcY = (float32_t const *) pY;                                       \
    k = count >> 2;                                                       \
                                                                          \
    while (k > 0U)                                                        \
    {                                                                     \
        yVec = vld1q(pSrcY);                                              \
        pSrcY += 4;                                                       \
        xVec = vldrwq_f32(&pSrcX[1]);                                     \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                         \
        xVec = vld1q(pSrcX);                                              \
        pSrcX += 4;                                                       \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                         \
        /*  Decrement the loop counter   */                               \
        k--;                                                              \
    }                                                                     \
    /* loop + tail predication expected here  */                          \
    k = count % 0x4U;                                                     \
    if (k > 0U)                                                           \
    {                                                                     \
        mve_pred16_t p0 = vctp32q(k);                                     \
        yVec = vld1q(pSrcY);                                              \
        pSrcY += 4;                                                       \
        xVec = vldrwq_f32(&pSrcX[1]);                                     \
        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                   \
        xVec = vld1q(pSrcX);                                              \
        pSrcX += 4;                                                       \
        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                   \
    }                                                                     \
                                                                          \
    acc0 = vecAddAcrossF32Mve(acc0Vec);                                   \
    acc1 = vecAddAcrossF32Mve(acc1Vec);                                   \
}

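/*
 * Dual correlation accumulators, increasing size:
 *   acc0 = sum of pX[n] * pY[n]     for n = 0 .. count-1   (count samples)
 *   acc1 = sum of pX[n] * pY[n - 1] for n = 0 .. count     (count+1 samples)
 * The predicated tail unmasks one extra lane for acc1.
 */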
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\
{                                                                       \
    float32_t const *pSrcX, *pSrcY;                                     \
    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                             \
    uint32_t    k;                                                      \
                                                                        \
    acc0Vec = vdupq_n_f32(0.0f);                                        \
    acc1Vec = vdupq_n_f32(0.0f);                                        \
    pSrcX = (float32_t const *) pX;                                     \
    pSrcY = (float32_t const *) pY;                                     \
    k = count >> 2;                                                     \
    while (k > 0U)                                                      \
    {                                                                   \
        xVec = vld1q(pSrcX);                                            \
        pSrcX += 4;                                                     \
        yVec = vldrwq_f32(&pSrcY[-1]);                                  \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                       \
        yVec = vld1q(pSrcY);                                            \
        pSrcY += 4;                                                     \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                       \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    k = count % 0x4U;                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc1 requires 1 additional sample */                             \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k+1);                                     \
    xVec = vld1q(pSrcX);                                                \
    pSrcX += 4;                                                         \
    yVec = vldrwq_f32(&pSrcY[-1]);                                      \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec,p0);                      \
    /* acc0 requires the exact number of samples */                     \
    /* disable extra lanes in the final MAC computation */              \
    p0 = vctp32q(k);                                                    \
    yVec = vld1q(pSrcY);                                                \
    pSrcY += 4;                                                         \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec,p0);                      \
                                                                        \
    acc0 = vecAddAcrossF32Mve(acc0Vec);                                 \
    acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
}

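/*
 * Dual convolution accumulators, decreasing size. pY is read backwards
 * through vldrwq_gather_shifted_offset_f32() and decrIdxVec, a vector of
 * descending word offsets that the calling function is expected to provide
 * (typically { 3, 2, 1, 0 }). pY is decremented in place, so the caller's
 * pointer is updated by this macro.
 *   acc0 accumulates count   products of pX[n]     with the reversed y data,
 *   acc1 accumulates count-1 products of pX[n + 1] with the reversed y data.
 */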
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)                             \
{                                                                                                    \
    float32_t const *pSrcX;                                                                          \
    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                          \
    uint32_t    k;                                                                                   \
                                                                                                     \
    acc0Vec = vdupq_n_f32(0.0f);                                                                     \
    acc1Vec = vdupq_n_f32(0.0f);                                                                     \
    pSrcX = (float32_t const *) pX;                                                                  \
    k = (count - 1) >> 2;                                                                            \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
        pY-=4;                                                                                       \
        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                    \
        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                    \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = (count - 1) % 0x4U;                                                                          \
    mve_pred16_t p0 = vctp32q(k);                                                                    \
    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                         \
    xVec = vldrwq_f32(&pSrcX[1]);                                                                    \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                  \
    xVec = vld1q(pSrcX);  pSrcX += 4;                                                                \
    p0 = vctp32q(k+1);                                                                               \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                  \
                                                                                                     \
    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                              \
    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                              \
}

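/*
 * Dual convolution accumulators, fixed size: acc0 and acc1 both accumulate
 * count products, with acc1 reading pX at an offset of +1. As above, y is
 * read in reverse order through decrIdxVec and pY is decremented in place.
 */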
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)                           \
{                                                                                                    \
    float32_t const *pSrcX;                                                                          \
    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                          \
    uint32_t    k;                                                                                   \
                                                                                                     \
    acc0Vec = vdupq_n_f32(0.0f);                                                                     \
    acc1Vec = vdupq_n_f32(0.0f);                                                                     \
    pSrcX = (float32_t const *) pX;                                                                  \
    k = count >> 2;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
        pY-=4;                                                                                       \
        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                    \
        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                    \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x4U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp32q(k);                                                                \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                              \
        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                              \
    }                                                                                                \
    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                              \
    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                              \
}

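/*
 * Dual convolution accumulators, increasing size:
 *   acc0 accumulates count   products of pX[n] with pY read in reverse,
 *   acc1 accumulates count+1 products of pX[n] with (pY + 1) read in reverse.
 * Both y pointers step backwards by 4 per iteration; pY is updated in place.
 */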
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\
{                                                                       \
    float32_t   const *pSrcX;                                           \
    const float32_t  *pY1 = pY + 1;                                     \
    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                             \
    uint32_t    k;                                                      \
                                                                        \
    acc0Vec = vdupq_n_f32(0.0f);                                        \
    acc1Vec = vdupq_n_f32(0.0f);                                        \
    pSrcX = (float32_t const *) pX;                                     \
    k = count >> 2;                                                     \
                                                                        \
    while (k > 0U)                                                      \
    {                                                                   \
        xVec = vld1q(pSrcX);  pSrcX += 4;                               \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);        \
        pY-=4;                                                          \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                       \
        yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);       \
        pY1-=4;                                                         \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                       \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    k = count % 0x4U;                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc0 requires the exact number of samples */                     \
    /* disable extra lanes in the final MAC computation */              \
    mve_pred16_t p0 = vctp32q(k);                                       \
    xVec = vld1q(pSrcX);  pSrcX += 4;                                   \
    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);            \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                     \
    yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);           \
    /* acc1 requires 1 additional sample */                             \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k+1);                                                  \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                     \
                                                                        \
    acc0 = vecAddAcrossF32Mve(acc0Vec);                                 \
    acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
}

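/*
 * Single convolution accumulator: acc accumulates count products of pX with
 * pY read in reverse order through decrIdxVec; pY is decremented in place.
 */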
#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count)                                                 \
{                                                                                                    \
    float32_t const *pSrcX;                                                                          \
    f32x4_t   accVec, xVec, yVec;                                                                    \
    uint32_t    k;                                                                                   \
                                                                                                     \
    accVec = vdupq_n_f32(0.0f);                                                                      \
    pSrcX = (float32_t const *) pX;                                                                  \
    k = count >> 2;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
        pY-=4;                                                                                       \
        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
        accVec = vfmaq_f32(accVec, xVec, yVec);                                                      \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x4U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp32q(k);                                                                \
        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);                                                \
    }                                                                                                \
    acc = vecAddAcrossF32Mve(accVec);                                                                \
}

#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/

#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))

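/*
 * Q31 variants: products are accumulated with vmlaldavaq into the 64-bit
 * accumulator(s) supplied by the caller (they are not cleared here), and the
 * result is scaled back with an arithmetic shift right by 31 bits (asrl).
 *
 * Single Q31 convolution accumulator; pY is read in reverse order through
 * decrIdxVec and decremented in place.
 */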
#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count)                                                 \
{                                                                                                    \
    q31_t const *pSrcX;                                                                              \
    q31x4_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q31_t const *) pX;                                                                      \
    k = count >> 2;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
        pY-=4;                                                                                       \
        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
        acc = vmlaldavaq(acc, xVec, yVec);                                                           \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x4U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp32q(k);                                                                \
        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                     \
    }                                                                                                \
    acc = asrl(acc, 31);                                                                             \
}


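/*
 * Dual Q31 convolution accumulators, increasing size:
 *   acc0 accumulates count   products of pX[n] with pY read in reverse,
 *   acc1 accumulates count+1 products of pX[n] with (pY + 1) read in reverse.
 * pY is decremented in place; both accumulators are shifted right by 31.
 */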
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
{                                                                       \
    q31_t const *pSrcX;                                                 \
    const q31_t       *pY1 = pY + 1;                                    \
    q31x4_t   xVec, yVec;                                               \
    uint32_t    k;                                                      \
                                                                        \
    pSrcX = (q31_t const *) pX;                                         \
    k = count >> 2;                                                     \
                                                                        \
    while (k > 0U)                                                      \
    {                                                                   \
        xVec = vld1q(pSrcX); pSrcX += 4;                                \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);        \
        pY-=4;                                                          \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
        yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);       \
        pY1-=4;                                                         \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    k = count % 0x4U;                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc0 requires the exact number of samples */                     \
    /* disable extra lanes in the final MAC computation */              \
    mve_pred16_t p0 = vctp32q(k);                                       \
    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);            \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
    yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);           \
    /* acc1 requires 1 additional sample */                             \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k+1);                                                  \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
                                                                        \
    acc0 = asrl(acc0, 31);                                              \
    acc1 = asrl(acc1, 31);                                              \
}


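/*
 * Dual Q31 convolution accumulators, decreasing size:
 *   acc0 accumulates count   products of pX[n]     with pY read in reverse,
 *   acc1 accumulates count-1 products of pX[n + 1] with pY read in reverse.
 * pY is decremented in place; both accumulators are shifted right by 31.
 */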
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \
{                                                                        \
    q31_t const *pSrcX;                                                  \
    q31x4_t   xVec, yVec;                                                \
    uint32_t    k;                                                       \
                                                                         \
    pSrcX = (q31_t const *) pX;                                          \
    k = (count-1) >> 2;                                                  \
                                                                         \
    while (k > 0U)                                                       \
    {                                                                    \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);         \
        pY-=4;                                                           \
        xVec = vldrwq_s32(&pSrcX[1]);                                    \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                             \
        xVec = vld1q(pSrcX);                                             \
        pSrcX += 4;                                                      \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                             \
        /*  Decrement the loop counter   */                              \
        k--;                                                             \
    }                                                                    \
    k = (count - 1) % 0x4U;                                              \
    /* use predication to finalize MAC sum */                            \
    /* acc1 requires the exact number of samples (count-1) */            \
    /* disable extra lanes in the final MAC computation */               \
    mve_pred16_t p0 = vctp32q(k);                                        \
    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);             \
    xVec = vldrwq_s32(&pSrcX[1]);                                        \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                           \
    /* acc0 requires 1 additional sample (count) */                      \
    /* so add 1 to unmask an extra lane in the final MAC computation */  \
    p0 = vctp32q(k+1);                                                   \
    xVec = vld1q(pSrcX);                                                 \
    pSrcX += 4;                                                          \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                           \
                                                                         \
    acc0 = asrl(acc0, 31);                                               \
    acc1 = asrl(acc1, 31);                                               \
}


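/*
 * Dual Q31 convolution accumulators, fixed size (both use count samples;
 * acc1 reads pX at an offset of +1); y is read in reverse order through
 * decrIdxVec and pY is decremented in place.
 */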
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                           \
{                                                                                                    \
    q31_t const *pSrcX;                                                                              \
    q31x4_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q31_t const *) pX;                                                                      \
    k = count >> 2;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
        pY-=4;                                                                                       \
        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x4U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp32q(k);                                                                \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
    }                                                                                                \
    acc0 = asrl(acc0, 31);                                                                           \
    acc1 = asrl(acc1, 31);                                                                           \
}


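/*
 * Quad Q31 convolution accumulators, fixed size: accJ accumulates count
 * products of pX[n + J] (J = 0..3) with pY read in reverse order; pY is
 * decremented in place and each accumulator is shifted right by 31.
 */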
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)               \
{                                                                                                    \
    q31_t const *pSrcX;                                                                              \
    q31x4_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q31_t const *) pX;                                                                      \
    k = count >> 2;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
        pY-=4;                                                                                       \
        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
        xVec = vldrwq_s32(&pSrcX[2]);                                                                \
        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                         \
        xVec = vldrwq_s32(&pSrcX[3]);                                                                \
        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                         \
        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x4U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp32q(k);                                                                \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
        xVec = vldrwq_s32(&pSrcX[2]);                                                                \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                   \
        xVec = vldrwq_s32(&pSrcX[3]);                                                                \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                   \
        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
    }                                                                                                \
    acc0 = asrl(acc0, 31);                                                                           \
    acc1 = asrl(acc1, 31);                                                                           \
    acc2 = asrl(acc2, 31);                                                                           \
    acc3 = asrl(acc3, 31);                                                                           \
}

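/*
 * Dual Q31 correlation accumulators, increasing size:
 *   acc0 += sum of pX[n] * pY[n]     for n = 0 .. count-1   (count samples)
 *   acc1 += sum of pX[n] * pY[n - 1] for n = 0 .. count     (count+1 samples)
 * Both accumulators are shifted right by 31 at the end.
 */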
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
{                                                                       \
    q31_t const *pSrcX, *pSrcY;                                         \
    q31x4_t   xVec, yVec;                                               \
    uint32_t    k;                                                      \
                                                                        \
    pSrcX = (q31_t const *) pX;                                         \
    pSrcY  = (q31_t const *) pY;                                        \
    k = count >> 2;                                                     \
                                                                        \
    while (k > 0U)                                                      \
    {                                                                   \
        xVec = vld1q(pSrcX); pSrcX += 4;                                \
        yVec = vldrwq_s32(&pSrcY[-1]);                                  \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
        yVec = vld1q(pSrcY); pSrcY += 4;                                \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    k = count % 0x4U;                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc1 requires 1 additional sample */                             \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k+1);                                     \
    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
    yVec = vldrwq_s32(&pSrcY[-1]);                                      \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                           \
    /* acc0 requires the exact number of samples */                     \
    /* disable extra lanes in the final MAC computation */              \
    p0 = vctp32q(k);                                                    \
    yVec = vld1q(pSrcY);  pSrcY += 4;                                   \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                           \
                                                                        \
    acc0 = asrl(acc0, 31);                                              \
    acc1 = asrl(acc1, 31);                                              \
}

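/*
 * Single Q31 correlation accumulator:
 *   acc += sum of pX[n] * pY[n] for n = 0 .. count-1, then shifted right by 31.
 */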
#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)\
{                                                   \
    q31_t const *pSrcX, *pSrcY;                     \
    q31x4_t   xVec, yVec;                           \
    uint32_t    k;                                  \
                                                    \
    pSrcX = (q31_t const *) pX;                     \
    pSrcY = (q31_t const *) pY;                     \
    k = count >> 2;                                 \
                                                    \
    while (k > 0U)                                  \
    {                                               \
        xVec = vld1q(pSrcX); pSrcX += 4;            \
        yVec = vld1q(pSrcY); pSrcY += 4;            \
        acc = vmlaldavaq(acc, xVec, yVec);          \
        /*  Decrement the loop counter   */         \
        k--;                                        \
    }                                               \
    /*  tail predication expected here  */          \
    k = count % 0x4U;                               \
    if (k > 0U)                                     \
    {                                               \
        mve_pred16_t p0 = vctp32q(k);               \
        xVec = vld1q(pSrcX); pSrcX += 4;            \
        yVec = vld1q(pSrcY); pSrcY += 4;            \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0);    \
    }                                               \
    acc = asrl(acc, 31);                            \
}

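/*
 * Quad Q31 correlation accumulators, fixed size:
 *   accJ += sum of pX[n + J] * pY[n] for n = 0 .. count-1, J = 0..3,
 * each shifted right by 31 at the end.
 */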
673 #define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)\
674 {                                                                                     \
    q31_t const *pSrcX, *pSrcY;                                                       \
    q31x4_t   xVec, yVec;                                                             \
    uint32_t    k;                                                                    \
                                                                                      \
    pSrcX = (q31_t const *) pX;                                                       \
    pSrcY = (q31_t const *) pY;                                                       \
    k = count >> 2;                                                                   \
                                                                                      \
    while (k > 0U)                                                                    \
    {                                                                                 \
        yVec = vld1q(pSrcY); pSrcY += 4;                                              \
        xVec = vldrwq_s32(&pSrcX[1]);                                                 \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                          \
        xVec = vldrwq_s32(&pSrcX[2]);                                                 \
        acc2 = vmlaldavaq(acc2, xVec, yVec);                                          \
        xVec = vldrwq_s32(&pSrcX[3]);                                                 \
        acc3 = vmlaldavaq(acc3, xVec, yVec);                                          \
        xVec = vld1q(pSrcX); pSrcX += 4;                                              \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                          \
        /*  Decrement the loop counter   */                                           \
        k--;                                                                          \
    }                                                                                 \
    /* loop + tail predication expected here  */                                      \
    k = count % 0x4U;                                                                 \
    if (k > 0U)                                                                       \
    {                                                                                 \
        mve_pred16_t p0 = vctp32q(k);                                                 \
        yVec = vld1q(pSrcY); pSrcY += 4;                                              \
        xVec = vldrwq_s32(&pSrcX[1]);                                                 \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                    \
        xVec = vldrwq_s32(&pSrcX[2]);                                                 \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                    \
        xVec = vldrwq_s32(&pSrcX[3]);                                                 \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                    \
        xVec = vld1q(pSrcX); pSrcX += 4;                                              \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                    \
    }                                                                                 \
                                                                                      \
    acc0 = asrl(acc0, 31);                                                            \
    acc1 = asrl(acc1, 31);                                                            \
    acc2 = asrl(acc2, 31);                                                            \
    acc3 = asrl(acc3, 31);                                                            \
}
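
/*
 * Usage sketch (illustrative only, not part of the library API): computing four
 * correlation outputs at adjacent lags with MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31.
 * The accumulators must be 64-bit and zero-initialised; the macro applies the
 * Q31 down-shift (asrl by 31) itself. Pointer and length names are hypothetical.
 *
 *     q63_t acc0 = 0LL, acc1 = 0LL, acc2 = 0LL, acc3 = 0LL;
 *     const q31_t *px = pSrcA;
 *     const q31_t *py = pSrcB;
 *     MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, px, py, srcBLen);
 *     pDst[0] = (q31_t) acc0;
 *     pDst[1] = (q31_t) acc1;
 *     pDst[2] = (q31_t) acc2;
 *     pDst[3] = (q31_t) acc3;
 */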

#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)\
{                                                                         \
    q31_t const *pSrcX, *pSrcY;                                           \
    q31x4_t   xVec, yVec;                                                 \
    uint32_t    k;                                                        \
                                                                          \
    pSrcX = (q31_t const *) pX;                                           \
    pSrcY = (q31_t const *) pY;                                           \
    k = count >> 2;                                                       \
                                                                          \
    while (k > 0U)                                                        \
    {                                                                     \
        yVec = vld1q(pSrcY); pSrcY += 4;                                  \
        xVec = vldrwq_s32(&pSrcX[1]);                                     \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                              \
        xVec = vld1q(pSrcX); pSrcX += 4;                                  \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                              \
        /*  Decrement the loop counter   */                               \
        k--;                                                              \
    }                                                                     \
    /* loop + tail predication expected here  */                          \
    k = count % 0x4U;                                                     \
    if (k > 0U)                                                           \
    {                                                                     \
        mve_pred16_t p0 = vctp32q(k);                                     \
        yVec = vld1q(pSrcY); pSrcY += 4;                                  \
        xVec = vldrwq_s32(&pSrcX[1]);                                     \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                        \
        xVec = vld1q(pSrcX); pSrcX += 4;                                  \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                        \
    }                                                                     \
                                                                          \
    acc0 = asrl(acc0, 31);                                                \
    acc1 = asrl(acc1, 31);                                                \
}

#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
{                                                                       \
    q31_t const *pSrcX, *pSrcY;                                         \
    q31x4_t   xVec, yVec;                                               \
    uint32_t    k;                                                      \
                                                                        \
    pSrcX = (q31_t const *) pX;                                         \
    pSrcY = (q31_t const *) pY;                                         \
    k = (count-1) >> 2;                                                 \
                                                                        \
    while (k > 0U)                                                      \
    {                                                                   \
        yVec = vld1q(pSrcY); pSrcY += 4;                                \
        xVec = vldrwq_s32(&pSrcX[1]);                                   \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
        xVec = vld1q(pSrcX); pSrcX += 4;                                \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc1 requires the exact number of samples (count-1), */          \
    /* so disable the extra lanes in the final MAC computation */       \
    k = (count-1) % 0x4U;                                               \
    mve_pred16_t p0 = vctp32q(k);                                       \
    yVec = vld1q(pSrcY); pSrcY += 4;                                    \
    xVec = vldrwq_s32(&pSrcX[1]);                                       \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
    /* acc0 requires 1 additional sample (count), */                    \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k+1);                                                  \
    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
                                                                        \
    acc0 = asrl(acc0, 31);                                              \
    acc1 = asrl(acc1, 31);                                              \
}
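
/*
 * Worked example of the tail handling above: for count = 7 the main loop runs
 * (7-1) >> 2 = 1 iteration, accumulating 4 samples into both acc0 and acc1.
 * The tail then uses k = (7-1) % 4 = 2: acc1 is updated with vctp32q(2), i.e.
 * 2 predicated lanes (6 samples in total, count-1), while acc0 is updated with
 * vctp32q(2+1), i.e. 3 lanes (7 samples in total, count).
 */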

#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
{                                                                       \
    q15_t const *pSrcX, *pSrcY;                                         \
    q15x8_t   xVec, yVec;                                               \
    uint32_t    k;                                                      \
                                                                        \
    pSrcX = (q15_t const *) pX;                                         \
    pSrcY = (q15_t const *) pY;                                         \
    k = count >> 3;                                                     \
    while (k > 0U)                                                      \
    {                                                                   \
        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
        yVec = vldrhq_s16(&pSrcY[-1]);                                  \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
        yVec = vld1q(pSrcY);  pSrcY += 8;                               \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    k = count % 0x8U;                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc1 requires 1 additional sample, */                            \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k+1);                                     \
    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
    yVec = vldrhq_s16(&pSrcY[-1]);                                      \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
    /* acc0 requires the exact number of samples, */                    \
    /* so disable the extra lanes in the final MAC computation */       \
    p0 = vctp16q(k);                                                    \
    yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
                                                                        \
    acc0 = asrl(acc0, 15);                                              \
    acc1 = asrl(acc1, 15);                                              \
    acc0 = __SSAT(acc0, 16);                                            \
    acc1 = __SSAT(acc1, 16);                                            \
}

#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)\
{                                                   \
    q15_t const *pSrcX, *pSrcY;                     \
    q15x8_t   xVec, yVec;                           \
    uint32_t    k;                                  \
                                                    \
    pSrcX = (q15_t const *) pX;                     \
    pSrcY = (q15_t const *) pY;                     \
    k = count >> 3;                                 \
    while (k > 0U)                                  \
    {                                               \
        xVec = vld1q(pSrcX);  pSrcX += 8;           \
        yVec = vld1q(pSrcY);  pSrcY += 8;           \
        acc = vmlaldavaq(acc, xVec, yVec);          \
        /*  Decrement the loop counter   */         \
        k--;                                        \
    }                                               \
    /*  tail predication expected here  */          \
    k = count % 0x8U;                               \
    if (k > 0U)                                     \
    {                                               \
        mve_pred16_t p0 = vctp16q(k);               \
        xVec = vld1q(pSrcX);  pSrcX += 8;           \
        yVec = vld1q(pSrcY);  pSrcY += 8;           \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0);    \
    }                                               \
    acc = asrl(acc, 15);                            \
    acc = __SSAT(acc, 16);                          \
}
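
/*
 * Usage sketch (illustrative only): producing a single Q15 correlation output
 * with MVE_INTR_CORR_SINGLE_Q15. The accumulator is a zero-initialised 64-bit
 * value; the macro shifts it right by 15 and saturates it to 16 bits on exit.
 * Pointer and length names are hypothetical.
 *
 *     q63_t acc = 0LL;
 *     const q15_t *px = pSrcA;
 *     const q15_t *py = pSrcB;
 *     MVE_INTR_CORR_SINGLE_Q15(acc, px, py, srcBLen);
 *     *pDst = (q15_t) acc;
 */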

#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)\
{                                                                                     \
    q15_t const *pSrcX, *pSrcY;                                                       \
    q15x8_t   xVec, yVec;                                                             \
    uint32_t    k;                                                                    \
                                                                                      \
    pSrcX = (q15_t const *) pX;                                                       \
    pSrcY = (q15_t const *) pY;                                                       \
    k = count >> 3;                                                                   \
                                                                                      \
    while (k > 0U)                                                                    \
    {                                                                                 \
        yVec = vld1q(pSrcY);  pSrcY += 8;                                             \
        xVec = vldrhq_s16(&pSrcX[1]);                                                 \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                          \
        xVec = vldrhq_s16(&pSrcX[2]);                                                 \
        acc2 = vmlaldavaq(acc2, xVec, yVec);                                          \
        xVec = vldrhq_s16(&pSrcX[3]);                                                 \
        acc3 = vmlaldavaq(acc3, xVec, yVec);                                          \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                             \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                          \
        /*  Decrement the loop counter   */                                           \
        k--;                                                                          \
    }                                                                                 \
    /* loop + tail predication expected here  */                                      \
    k = count % 0x8U;                                                                 \
    if (k > 0U)                                                                       \
    {                                                                                 \
        mve_pred16_t p0 = vctp16q(k);                                                 \
        yVec = vld1q(pSrcY);  pSrcY += 8;                                             \
        xVec = vldrhq_s16(&pSrcX[1]);                                                 \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                    \
        xVec = vldrhq_s16(&pSrcX[2]);                                                 \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                    \
        xVec = vldrhq_s16(&pSrcX[3]);                                                 \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                    \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                             \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                    \
    }                                                                                 \
                                                                                      \
    acc0 = asrl(acc0, 15);                                                            \
    acc1 = asrl(acc1, 15);                                                            \
    acc2 = asrl(acc2, 15);                                                            \
    acc3 = asrl(acc3, 15);                                                            \
    acc0 = __SSAT(acc0, 16);                                                          \
    acc1 = __SSAT(acc1, 16);                                                          \
    acc2 = __SSAT(acc2, 16);                                                          \
    acc3 = __SSAT(acc3, 16);                                                          \
}

#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)\
{                                                                         \
    q15_t const *pSrcX, *pSrcY;                                           \
    q15x8_t   xVec, yVec;                                                 \
    uint32_t    k;                                                        \
                                                                          \
    pSrcX = (q15_t const *) pX;                                           \
    pSrcY = (q15_t const *) pY;                                           \
    k = count >> 3;                                                       \
                                                                          \
    while (k > 0U)                                                        \
    {                                                                     \
        yVec = vld1q(pSrcY);  pSrcY += 8;                                 \
        xVec = vldrhq_s16(&pSrcX[1]);                                     \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                              \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                 \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                              \
        /*  Decrement the loop counter   */                               \
        k--;                                                              \
    }                                                                     \
    /* loop + tail predication expected here  */                          \
    k = count % 0x8U;                                                     \
    if (k > 0U)                                                           \
    {                                                                     \
        mve_pred16_t p0 = vctp16q(k);                                     \
        yVec = vld1q(pSrcY);  pSrcY += 8;                                 \
        xVec = vldrhq_s16(&pSrcX[1]);                                     \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                        \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                 \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                        \
    }                                                                     \
                                                                          \
    acc0 = asrl(acc0, 15);                                                \
    acc1 = asrl(acc1, 15);                                                \
    acc0 = __SSAT(acc0, 16);                                              \
    acc1 = __SSAT(acc1, 16);                                              \
}

#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)\
{                                                                       \
    q15_t const *pSrcX, *pSrcY;                                         \
    q15x8_t   xVec, yVec;                                               \
    uint32_t    k;                                                      \
                                                                        \
    pSrcX = (q15_t const *) pX;                                         \
    pSrcY = (q15_t const *) pY;                                         \
    k = (count-1) >> 3;                                                 \
                                                                        \
    while (k > 0U)                                                      \
    {                                                                   \
        yVec = vld1q(pSrcY);  pSrcY += 8;                               \
        xVec = vldrhq_s16(&pSrcX[1]);                                   \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc1 requires the exact number of samples (count-1), */          \
    /* so disable the extra lanes in the final MAC computation */       \
    k = (count-1) % 0x8U;                                               \
    mve_pred16_t p0 = vctp16q(k);                                       \
    yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
    xVec = vldrhq_s16(&pSrcX[1]);                                       \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
    /* acc0 requires 1 additional sample (count), */                    \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp16q(k+1);                                                  \
    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
                                                                        \
    acc0 = asrl(acc0, 15);                                              \
    acc1 = asrl(acc1, 15);                                              \
    acc0 = __SSAT(acc0, 16);                                            \
    acc1 = __SSAT(acc1, 16);                                            \
}
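
/*
 * The convolution macros below read the second operand backwards through gather
 * loads and expect the caller to provide 'decrIdxVec', a vector of descending
 * lane offsets. One possible way to build it for the Q15 variants (an
 * assumption, shown only as a sketch):
 *
 *     static const uint16_t offsets[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
 *     uint16x8_t decrIdxVec = vld1q_u16(offsets);
 */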

#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
{                                                                       \
    q15_t const *pSrcX;                                                 \
    const q15_t *pY1 = pY + 1;                                          \
    q15x8_t   xVec, yVec;                                               \
    uint32_t    k;                                                      \
                                                                        \
    pSrcX = (q15_t const *) pX;                                         \
    k = count >> 3;                                                     \
                                                                        \
    while (k > 0U)                                                      \
    {                                                                   \
        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);        \
        pY -= 8;                                                        \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);       \
        pY1 -= 8;                                                       \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
        /*  Decrement the loop counter   */                             \
        k--;                                                            \
    }                                                                   \
    k = count % 0x8U;                                                   \
    /* use predication to finalize MAC sum */                           \
    /* acc0 requires the exact number of samples, */                    \
    /* so disable the extra lanes in the final MAC computation */       \
    mve_pred16_t p0 = vctp16q(k);                                       \
    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);            \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);           \
    /* acc1 requires 1 additional sample, */                            \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp16q(k+1);                                                  \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
                                                                        \
    acc0 = asrl(acc0, 15);                                              \
    acc1 = asrl(acc1, 15);                                              \
    acc0 = __SSAT(acc0, 16);                                            \
    acc1 = __SSAT(acc1, 16);                                            \
}

#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count)                                                 \
{                                                                                                    \
    q15_t const *pSrcX;                                                                              \
    q15x8_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q15_t const *) pX;                                                                      \
    k = count >> 3;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
        pY -= 8;                                                                                     \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
        acc = vmlaldavaq(acc, xVec, yVec);                                                           \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x8U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp16q(k);                                                                \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                     \
    }                                                                                                \
    acc = asrl(acc, 15);                                                                             \
    acc = __SSAT(acc, 16);                                                                           \
}
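
/*
 * Usage sketch (illustrative only) for MVE_INTR_CONV_SINGLE_Q15: one Q15
 * convolution output. The macro decrements pY internally, so a modifiable
 * pointer must be passed; its starting position (such that pY[7]..pY[0] pair
 * with the first eight samples of pX) and decrIdxVec are the caller's
 * responsibility. Names are hypothetical.
 *
 *     q63_t acc = 0LL;
 *     const q15_t *px = pSrcA;
 *     const q15_t *py = pYStart;
 *     MVE_INTR_CONV_SINGLE_Q15(acc, px, py, count);
 *     *pDst = (q15_t) acc;
 */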

#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)               \
{                                                                                                    \
    q15_t const *pSrcX;                                                                              \
    q15x8_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q15_t const *) pX;                                                                      \
    k = count >> 3;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
        pY -= 8;                                                                                     \
        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
        xVec = vldrhq_s16(&pSrcX[2]);                                                                \
        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                         \
        xVec = vldrhq_s16(&pSrcX[3]);                                                                \
        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                         \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x8U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp16q(k);                                                                \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
        xVec = vldrhq_s16(&pSrcX[2]);                                                                \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                   \
        xVec = vldrhq_s16(&pSrcX[3]);                                                                \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                   \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
    }                                                                                                \
    acc0 = asrl(acc0, 15);                                                                           \
    acc1 = asrl(acc1, 15);                                                                           \
    acc2 = asrl(acc2, 15);                                                                           \
    acc3 = asrl(acc3, 15);                                                                           \
    acc0 = __SSAT(acc0, 16);                                                                         \
    acc1 = __SSAT(acc1, 16);                                                                         \
    acc2 = __SSAT(acc2, 16);                                                                         \
    acc3 = __SSAT(acc3, 16);                                                                         \
}

#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                           \
{                                                                                                    \
    q15_t const *pSrcX;                                                                              \
    q15x8_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q15_t const *) pX;                                                                      \
    k = count >> 3;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
        pY -= 8;                                                                                     \
        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x8U;                                                                                \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp16q(k);                                                                \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
    }                                                                                                \
    acc0 = asrl(acc0, 15);                                                                           \
    acc1 = asrl(acc1, 15);                                                                           \
    acc0 = __SSAT(acc0, 16);                                                                         \
    acc1 = __SSAT(acc1, 16);                                                                         \
}

#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                             \
{                                                                                                    \
    q15_t const *pSrcX;                                                                              \
    q15x8_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q15_t const *) pX;                                                                      \
    k = (count-1) >> 3;                                                                              \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
        pY -= 8;                                                                                     \
        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    k = (count-1) % 0x8U;                                                                            \
    /* use predication to finalize MAC sum */                                                        \
    /* acc1 requires the exact number of samples (count-1), */                                       \
    /* so disable the extra lanes in the final MAC computation */                                    \
    mve_pred16_t p0 = vctp16q(k);                                                                    \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                         \
    xVec = vldrhq_s16(&pSrcX[1]);                                                                    \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                       \
    /* acc0 requires 1 additional sample (count), */                                                 \
    /* so add 1 to unmask an extra lane in the final MAC computation */                              \
    p0 = vctp16q(k+1);                                                                               \
    xVec = vld1q(pSrcX);  pSrcX += 8;                                                                \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                       \
                                                                                                     \
    acc0 = asrl(acc0, 15);                                                                           \
    acc1 = asrl(acc1, 15);                                                                           \
    acc0 = __SSAT(acc0, 16);                                                                         \
    acc1 = __SSAT(acc1, 16);                                                                         \
}

#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
{                                                                      \
    q7_t const *pSrcX, *pSrcY;                                         \
    q7x16_t   xVec, yVec;                                              \
    uint32_t    k;                                                     \
                                                                       \
    pSrcX = (q7_t const *) pX;                                         \
    pSrcY = (q7_t const *) pY;                                         \
    k = count >> 4;                                                    \
    while (k > 0U)                                                     \
    {                                                                  \
        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
        yVec = vldrbq_s8(&pSrcY[-1]);                                  \
        acc1 = vmladavaq(acc1, xVec, yVec);                            \
        yVec = vld1q(pSrcY);  pSrcY += 16;                             \
        acc0 = vmladavaq(acc0, xVec, yVec);                            \
        /*  Decrement the loop counter   */                            \
        k--;                                                           \
    }                                                                  \
    k = count % 0x10U;                                                 \
    /* use predication to finalize MAC sum */                          \
    /* acc1 requires 1 additional sample, */                           \
    /* so add 1 to unmask an extra lane in the final MAC computation */\
    mve_pred16_t p0 = vctp8q(k+1);                                     \
    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
    yVec = vldrbq_s8(&pSrcY[-1]);                                      \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                          \
    /* acc0 requires the exact number of samples, */                   \
    /* so disable the extra lanes in the final MAC computation */      \
    p0 = vctp8q(k);                                                    \
    yVec = vld1q(pSrcY);  pSrcY += 16;                                 \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                          \
                                                                       \
    acc0 = (acc0 >> 7);                                                \
    acc1 = (acc1 >> 7);                                                \
    acc0 = __SSAT(acc0, 8);                                            \
    acc1 = __SSAT(acc1, 8);                                            \
}

#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)\
{                                                  \
    q7_t const *pSrcX, *pSrcY;                     \
    q7x16_t   xVec, yVec;                          \
    uint32_t    k;                                 \
                                                   \
    pSrcX = (q7_t const *) pX;                     \
    pSrcY = (q7_t const *) pY;                     \
    k = count >> 4;                                \
    while (k > 0U)                                 \
    {                                              \
        xVec = vld1q(pSrcX);  pSrcX += 16;         \
        yVec = vld1q(pSrcY);  pSrcY += 16;         \
        acc = vmladavaq(acc, xVec, yVec);          \
        /*  Decrement the loop counter   */        \
        k--;                                       \
    }                                              \
    /*  tail predication expected here  */         \
    k = count % 0x10U;                             \
    if (k > 0U)                                    \
    {                                              \
        mve_pred16_t p0 = vctp8q(k);               \
        xVec = vld1q(pSrcX);  pSrcX += 16;         \
        yVec = vld1q(pSrcY);  pSrcY += 16;         \
        acc = vmladavaq_p(acc, xVec, yVec, p0);    \
    }                                              \
    acc = (acc >> 7);                              \
    acc = __SSAT(acc, 8);                          \
}
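
/*
 * Usage sketch (illustrative only): a single Q7 correlation output with
 * MVE_INTR_CORR_SINGLE_Q7. Unlike the Q15/Q31 variants the accumulator is a
 * 32-bit value (vmladavaq accumulates into 32 bits); the macro shifts it right
 * by 7 and saturates it to 8 bits on exit. Names are hypothetical.
 *
 *     q31_t acc = 0;
 *     const q7_t *px = pSrcA;
 *     const q7_t *py = pSrcB;
 *     MVE_INTR_CORR_SINGLE_Q7(acc, px, py, srcBLen);
 *     *pDst = (q7_t) acc;
 */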

#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)\
{                                                                                    \
    q7_t const *pSrcX, *pSrcY;                                                       \
    q7x16_t   xVec, yVec;                                                            \
    uint32_t    k;                                                                   \
                                                                                     \
    pSrcX = (q7_t const *) pX;                                                       \
    pSrcY = (q7_t const *) pY;                                                       \
    k = count >> 4;                                                                  \
                                                                                     \
    while (k > 0U)                                                                   \
    {                                                                                \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                           \
        xVec = vldrbq_s8(&pSrcX[1]);                                                 \
        acc1 = vmladavaq(acc1, xVec, yVec);                                          \
        xVec = vldrbq_s8(&pSrcX[2]);                                                 \
        acc2 = vmladavaq(acc2, xVec, yVec);                                          \
        xVec = vldrbq_s8(&pSrcX[3]);                                                 \
        acc3 = vmladavaq(acc3, xVec, yVec);                                          \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                           \
        acc0 = vmladavaq(acc0, xVec, yVec);                                          \
        /*  Decrement the loop counter   */                                          \
        k--;                                                                         \
    }                                                                                \
    /* loop + tail predication expected here  */                                     \
    k = count % 0x10U;                                                               \
    if (k > 0U)                                                                      \
    {                                                                                \
        mve_pred16_t p0 = vctp8q(k);                                                 \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                           \
        xVec = vldrbq_s8(&pSrcX[1]);                                                 \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                    \
        xVec = vldrbq_s8(&pSrcX[2]);                                                 \
        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                    \
        xVec = vldrbq_s8(&pSrcX[3]);                                                 \
        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                    \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                           \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                    \
    }                                                                                \
                                                                                     \
    acc0 = (acc0 >> 7);                                                              \
    acc1 = (acc1 >> 7);                                                              \
    acc2 = (acc2 >> 7);                                                              \
    acc3 = (acc3 >> 7);                                                              \
    acc0 = __SSAT(acc0, 8);                                                          \
    acc1 = __SSAT(acc1, 8);                                                          \
    acc2 = __SSAT(acc2, 8);                                                          \
    acc3 = __SSAT(acc3, 8);                                                          \
}

#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)\
{                                                                        \
    q7_t const *pSrcX, *pSrcY;                                           \
    q7x16_t   xVec, yVec;                                                \
    uint32_t    k;                                                       \
                                                                         \
    pSrcX = (q7_t const *) pX;                                           \
    pSrcY = (q7_t const *) pY;                                           \
    k = count >> 4;                                                      \
                                                                         \
    while (k > 0U)                                                       \
    {                                                                    \
        yVec = vld1q(pSrcY);  pSrcY += 16;                               \
        xVec = vldrbq_s8(&pSrcX[1]);                                     \
        acc1 = vmladavaq(acc1, xVec, yVec);                              \
        xVec = vld1q(pSrcX);  pSrcX += 16;                               \
        acc0 = vmladavaq(acc0, xVec, yVec);                              \
        /*  Decrement the loop counter   */                              \
        k--;                                                             \
    }                                                                    \
    /* loop + tail predication expected here  */                         \
    k = count % 0x10U;                                                   \
    if (k > 0U)                                                          \
    {                                                                    \
        mve_pred16_t p0 = vctp8q(k);                                     \
        yVec = vld1q(pSrcY);  pSrcY += 16;                               \
        xVec = vldrbq_s8(&pSrcX[1]);                                     \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                        \
        xVec = vld1q(pSrcX);  pSrcX += 16;                               \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                        \
    }                                                                    \
                                                                         \
    acc0 = (acc0 >> 7);                                                  \
    acc1 = (acc1 >> 7);                                                  \
    acc0 = __SSAT(acc0, 8);                                              \
    acc1 = __SSAT(acc1, 8);                                              \
}

#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)\
{                                                                      \
    q7_t const *pSrcX, *pSrcY;                                         \
    q7x16_t   xVec, yVec;                                              \
    uint32_t    k;                                                     \
                                                                       \
    pSrcX = (q7_t const *) pX;                                         \
    pSrcY = (q7_t const *) pY;                                         \
    k = (count-1) >> 4;                                                \
                                                                       \
    while (k > 0U)                                                     \
    {                                                                  \
        yVec = vld1q(pSrcY);  pSrcY += 16;                             \
        xVec = vldrbq_s8(&pSrcX[1]);                                   \
        acc1 = vmladavaq(acc1, xVec, yVec);                            \
        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
        acc0 = vmladavaq(acc0, xVec, yVec);                            \
        /*  Decrement the loop counter   */                            \
        k--;                                                           \
    }                                                                  \
    /* use predication to finalize MAC sum */                          \
    /* acc1 requires the exact number of samples (count-1), */         \
    /* so disable the extra lanes in the final MAC computation */      \
    k = (count-1) % 0x10U;                                             \
    mve_pred16_t p0 = vctp8q(k);                                       \
    yVec = vld1q(pSrcY);  pSrcY += 16;                                 \
    xVec = vldrbq_s8(&pSrcX[1]);                                       \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                          \
    /* acc0 requires 1 additional sample (count), */                   \
    /* so add 1 to unmask an extra lane in the final MAC computation */\
    p0 = vctp8q(k+1);                                                  \
    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                          \
                                                                       \
    acc0 = (acc0 >> 7);                                                \
    acc1 = (acc1 >> 7);                                                \
    acc0 = __SSAT(acc0, 8);                                            \
    acc1 = __SSAT(acc1, 8);                                            \
}
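
/*
 * As with the Q15 convolution macros, the Q7 convolution helpers below read the
 * second operand backwards through byte gathers and rely on a caller-provided
 * 'decrIdxVec' of descending offsets. A possible setup (an assumption, shown
 * only as a sketch):
 *
 *     static const uint8_t offsets[16] = { 15, 14, 13, 12, 11, 10, 9, 8,
 *                                           7,  6,  5,  4,  3,  2, 1, 0 };
 *     uint8x16_t decrIdxVec = vld1q_u8(offsets);
 */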
1385 
1386 #define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
1387 {                                                                      \
1388     q7_t const *pSrcX;                                                 \
1389     const q7_t       *pY1 = pY + 1;                                    \
1390     q7x16_t   xVec, yVec;                                              \
1391     uint32_t    k;                                                     \
1392                                                                        \
1393     pSrcX = (q7_t const *) pX;                                         \
1394     k = count >> 4;                                                    \
1395                                                                        \
1396     while (k > 0U)                                                     \
1397     {                                                                  \
1398         xVec = vld1q(pSrcX);  pSrcX += 16;                             \
1399         yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                \
1400         pY-=16;                                                        \
1401         acc0 = vmladavaq(acc0, xVec, yVec);                            \
1402         yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);               \
1403         pY1-=16;                                                       \
1404         acc1 = vmladavaq(acc1, xVec, yVec);                            \
1405         /*  Decrement the loop counter   */                            \
1406         k--;                                                           \
1407     }                                                                  \
1408     k = count % 0x10U;                                                 \
1409     /* use predication to finalize MAC sum */                          \
1410     /* acc0 requires exact number of sample  */                        \
    /* disable extra lanes in final MAC computation  */                \
    mve_pred16_t p0 = vctp8q(k);                                       \
    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                    \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                          \
    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                   \
    /* acc1 requires 1 additional sample  */                           \
    /* so add 1 to unmask an extra lane  in final MAC computation  */  \
    p0 = vctp8q(k+1);                                                  \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                          \
                                                                       \
    acc0 = (acc0 >> 7);                                                \
    acc1 = (acc1 >> 7);                                                \
    acc0 = __SSAT(acc0, 8);                                            \
    acc1 = __SSAT(acc1, 8);                                            \
}
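
/*
 * Informal summary of MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7 above: one block
 * of x[] is multiplied against two reversed blocks of y[], starting at pY and
 * at pY + 1, so two adjacent convolution outputs (acc0, acc1) are accumulated
 * per pass. In the predicated tail, acc0 uses exactly (count % 16) lanes
 * while acc1 enables one extra lane via vctp8q(k + 1) because it consumes one
 * additional sample.
 */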

#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count)                                                  \
{                                                                                                    \
    q7_t const *pSrcX;                                                                               \
    q7x16_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q7_t const *) pX;                                                                       \
    k = count >> 4;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
        pY-=16;                                                                                      \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
        acc = vmladavaq(acc, xVec, yVec);                                                            \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x10U;                                                                               \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp8q(k);                                                                 \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                      \
    }                                                                                                \
    acc = __SSAT(acc >> 7, 8);                                                                       \
}
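
/*
 * Usage sketch for MVE_INTR_CONV_SINGLE_Q7 (illustrative only; the operand
 * names and pointer setup below are assumptions, not the library's actual
 * caller code, and count >= 16 is assumed so the pointer expression stays in
 * range). With decrIdxVec = {15, ..., 0}, pX = x and pY = &y[count - 16] the
 * macro accumulates sum(x[i] * y[count - 1 - i]) for i = 0 .. count - 1, then
 * shifts the sum right by 7 and saturates it to Q7:
 *
 *     static const uint8_t revOffsets[16] = { 15, 14, 13, 12, 11, 10, 9, 8,
 *                                              7,  6,  5,  4,  3,  2, 1, 0 };
 *     uint8x16_t  decrIdxVec = vld1q_u8(revOffsets); // required by the macro
 *     q31_t       acc = 0;                           // 32-bit MAC accumulator
 *     q7_t const *pX  = x;                           // x read forwards
 *     q7_t const *pY  = &y[count - 16];              // y gathered backwards
 *
 *     MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
 *     // acc now holds the saturated Q7 result; note that pY is modified by
 *     // the macro (it is decremented inside the vector loop).
 */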

#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)                \
{                                                                                                    \
    q7_t const *pSrcX;                                                                               \
    q7x16_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q7_t const *) pX;                                                                       \
    k = count >> 4;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
        pY-=16;                                                                                      \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
        xVec = vldrbq_s8(&pSrcX[2]);                                                                 \
        acc2 = vmladavaq(acc2, xVec, yVec);                                                          \
        xVec = vldrbq_s8(&pSrcX[3]);                                                                 \
        acc3 = vmladavaq(acc3, xVec, yVec);                                                          \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x10U;                                                                               \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp8q(k);                                                                 \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                    \
        xVec = vldrbq_s8(&pSrcX[2]);                                                                 \
        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                    \
        xVec = vldrbq_s8(&pSrcX[3]);                                                                 \
        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                    \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                    \
    }                                                                                                \
    acc0 = __SSAT(acc0 >> 7, 8);                                                                     \
    acc1 = __SSAT(acc1 >> 7, 8);                                                                     \
    acc2 = __SSAT(acc2 >> 7, 8);                                                                     \
    acc3 = __SSAT(acc3 >> 7, 8);                                                                     \
}
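
/*
 * Informal summary of MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7 above: a single
 * reversed block of y[] is reused against four x[] blocks offset by 0, 1, 2
 * and 3 samples (pSrcX, &pSrcX[1], &pSrcX[2], &pSrcX[3]), so four adjacent
 * convolution outputs (acc0 .. acc3) of the same length are produced in one
 * pass over the data, sharing the same lane predicate in the tail.
 */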

#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                            \
{                                                                                                    \
    q7_t const *pSrcX;                                                                               \
    q7x16_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q7_t const *) pX;                                                                       \
    k = count >> 4;                                                                                  \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
        pY-=16;                                                                                      \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    /* Loop with tail predication expected here  */                                                  \
    k = count % 0x10U;                                                                               \
    if (k > 0U)                                                                                      \
    {                                                                                                \
        mve_pred16_t p0 = vctp8q(k);                                                                 \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                    \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                    \
    }                                                                                                \
    acc0 = __SSAT(acc0 >> 7, 8);                                                                     \
    acc1 = __SSAT(acc1 >> 7, 8);                                                                     \
}


#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                              \
{                                                                                                    \
    q7_t const *pSrcX;                                                                               \
    q7x16_t   xVec, yVec;                                                                            \
    uint32_t    k;                                                                                   \
                                                                                                     \
    pSrcX = (q7_t const *) pX;                                                                       \
    k = (count-1) >> 4;                                                                              \
                                                                                                     \
    while (k > 0U)                                                                                   \
    {                                                                                                \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
        pY-=16;                                                                                      \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
        /*  Decrement the loop counter   */                                                          \
        k--;                                                                                         \
    }                                                                                                \
    k = (count - 1) % 0x10U;                                                                         \
    /* use predication to finalize MAC sum */                                                        \
    /* acc1 requires the exact number of samples (count-1) */                                        \
    /* disable extra lanes in final MAC computation  */                                              \
    mve_pred16_t p0 = vctp8q(k);                                                                     \
    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                  \
    xVec = vldrbq_s8(&pSrcX[1]);                                                                     \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                        \
    /* acc0 requires 1 additional sample  (count) */                                                 \
    /* so add 1 to unmask an extra lane  in final MAC computation  */                                \
    p0 = vctp8q(k+1);                                                                                \
    xVec = vld1q(pSrcX);  pSrcX += 16;                                                               \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                        \
                                                                                                     \
    acc0 = (acc0 >> 7);                                                                              \
    acc1 = (acc1 >> 7);                                                                              \
    acc0 = __SSAT(acc0, 8);                                                                          \
    acc1 = __SSAT(acc1, 8);                                                                          \
}
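
/*
 * Informal note on the _DEC_SIZE variants: the vector loop covers
 * (count - 1) >> 4 full 16-sample blocks, and the predicated tail first
 * enables (count - 1) % 16 lanes for acc1 (which needs count - 1 samples),
 * then widens the predicate by one lane with vctp8q(k + 1) so that acc0
 * accumulates the full count samples from the same loads; both results are
 * finally shifted right by 7 and saturated to Q7.
 */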

#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */

#ifdef   __cplusplus
}
#endif


#endif /* _ARM_VEC_FILTERING_H_ */
