1 /****************************************************************************** 2 * @file arm_vec_filtering.h 3 * @brief Private header file for CMSIS DSP Library 4 * @version V1.7.0 5 * @date 30. October 2019 6 ******************************************************************************/ 7 /* 8 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved. 9 * 10 * SPDX-License-Identifier: Apache-2.0 11 * 12 * Licensed under the Apache License, Version 2.0 (the License); you may 13 * not use this file except in compliance with the License. 14 * You may obtain a copy of the License at 15 * 16 * www.apache.org/licenses/LICENSE-2.0 17 * 18 * Unless required by applicable law or agreed to in writing, software 19 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 20 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 21 * See the License for the specific language governing permissions and 22 * limitations under the License. 23 */ 24 25 #ifndef _ARM_VEC_FILTERING_H_ 26 #define _ARM_VEC_FILTERING_H_ 27 28 #include "arm_math.h" 29 #include "arm_helium_utils.h" 30 31 #ifdef __cplusplus 32 extern "C" 33 { 34 #endif 35 36 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) 37 38 #define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F32(acc0, acc1, acc2, acc3, pX, pY, count)\ 39 { \ 40 float32_t const *pSrcX, *pSrcY; \ 41 f32x4_t acc0Vec, acc1Vec, acc2Vec, acc3Vec, xVec, yVec; \ 42 uint32_t k; \ 43 \ 44 acc0Vec = vdupq_n_f32(0.0f); \ 45 acc1Vec = vdupq_n_f32(0.0f); \ 46 acc2Vec = vdupq_n_f32(0.0f); \ 47 acc3Vec = vdupq_n_f32(0.0f); \ 48 pSrcX = (float32_t const *) pX; \ 49 pSrcY = (float32_t const *) pY; \ 50 k = count >> 2; \ 51 \ 52 while (k > 0U) \ 53 { \ 54 yVec = vld1q(pSrcY); \ 55 pSrcY += 4; \ 56 xVec = vldrwq_f32(&pSrcX[1]); \ 57 acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ 58 xVec = vldrwq_f32(&pSrcX[2]); \ 59 acc2Vec = vfmaq_f32(acc2Vec, xVec, yVec); \ 60 xVec = 
vldrwq_f32(&pSrcX[3]); \ 61 acc3Vec = vfmaq_f32(acc3Vec, xVec, yVec); \ 62 xVec = vld1q(pSrcX); \ 63 pSrcX += 4; \ 64 acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ 65 /* Decrement the loop counter */ \ 66 k--; \ 67 } \ 68 /* loop + tail predication expected here */ \ 69 k = count % 0x4U; \ 70 if (k > 0U) \ 71 { \ 72 mve_pred16_t p0 = vctp32q(k); \ 73 yVec = vld1q(pSrcY); \ 74 pSrcY += 4; \ 75 xVec = vldrwq_f32(&pSrcX[1]); \ 76 acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ 77 xVec = vldrwq_f32(&pSrcX[2]); \ 78 acc2Vec = vfmaq_m_f32(acc2Vec, xVec, yVec, p0); \ 79 xVec = vldrwq_f32(&pSrcX[3]); \ 80 acc3Vec = vfmaq_m_f32(acc3Vec, xVec, yVec, p0); \ 81 xVec = vld1q(pSrcX); \ 82 pSrcX += 4; \ 83 acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ 84 } \ 85 \ 86 acc0 = vecAddAcrossF32Mve(acc0Vec); \ 87 acc1 = vecAddAcrossF32Mve(acc1Vec); \ 88 acc2 = vecAddAcrossF32Mve(acc2Vec); \ 89 acc3 = vecAddAcrossF32Mve(acc3Vec); \ 90 } 91 92 #define MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count) \ 93 { \ 94 float32_t const *pSrcX, *pSrcY; \ 95 f32x4_t accVec, xVec, yVec; \ 96 uint32_t k; \ 97 \ 98 accVec = vdupq_n_f32(0.0f); \ 99 pSrcX = (float32_t const *) pX; \ 100 pSrcY = (float32_t const *) pY; \ 101 k = count >> 2; \ 102 \ 103 while (k > 0U) \ 104 { \ 105 yVec = vld1q(pSrcY); \ 106 pSrcY += 4; \ 107 xVec = vld1q(pSrcX); \ 108 pSrcX += 4; \ 109 accVec = vfmaq_f32(accVec, xVec, yVec); \ 110 /* Decrement the loop counter */ \ 111 k--; \ 112 } \ 113 /* Loop with tail predication expected here */ \ 114 k = count % 0x4U; \ 115 if (k > 0U) \ 116 { \ 117 mve_pred16_t p0 = vctp32q(k); \ 118 yVec = vld1q(pSrcY); \ 119 pSrcY += 4; \ 120 xVec = vld1q(pSrcX); \ 121 pSrcX += 4; \ 122 accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);\ 123 } \ 124 acc = vecAddAcrossF32Mve(accVec); \ 125 } 126 127 #define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)\ 128 { \ 129 float32_t const *pSrcX, *pSrcY; \ 130 f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ 131 uint32_t k; \ 132 \ 133 acc0Vec = 
vdupq_n_f32(0.0f); \ 134 acc1Vec = vdupq_n_f32(0.0f); \ 135 pSrcX = (float32_t const *) pX; \ 136 pSrcY = (float32_t const *) pY; \ 137 k = (count-1) >> 2; \ 138 \ 139 while (k > 0U) \ 140 { \ 141 yVec = vld1q(pSrcY); \ 142 pSrcY += 4; \ 143 xVec = vldrwq_f32(&pSrcX[1]); \ 144 acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ 145 xVec = vld1q(pSrcX); \ 146 pSrcX += 4; \ 147 acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ 148 /* Decrement the loop counter */ \ 149 k--; \ 150 } \ 151 /* use predication to finalize MAC sum */ \ 152 /* acc1 requires exact number of sample (count-1) */ \ 153 /* disable extra lanes in final MAC computation */ \ 154 k = (count-1) % 0x4U; \ 155 mve_pred16_t p0 = vctp32q(k); \ 156 yVec = vld1q(pSrcY); \ 157 pSrcY += 4; \ 158 xVec = vldrwq_f32(&pSrcX[1]); \ 159 acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ 160 /* acc0 requires 1 additional sample (count) */ \ 161 /* so add 1 to unmask an extra lane in final MAC computation */ \ 162 p0 = vctp32q(k+1); \ 163 xVec = vld1q(pSrcX); \ 164 pSrcX += 4; \ 165 acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ 166 \ 167 acc0 = vecAddAcrossF32Mve(acc0Vec); \ 168 acc1 = vecAddAcrossF32Mve(acc1Vec); \ 169 } 170 171 #define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)\ 172 { \ 173 float32_t const *pSrcX, *pSrcY; \ 174 f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ 175 uint32_t k; \ 176 \ 177 acc0Vec = vdupq_n_f32(0.0f); \ 178 acc1Vec = vdupq_n_f32(0.0f); \ 179 pSrcX = (float32_t const *) pX; \ 180 pSrcY = (float32_t const *) pY; \ 181 k = count >> 2; \ 182 \ 183 while (k > 0U) \ 184 { \ 185 yVec = vld1q(pSrcY); \ 186 pSrcY += 4; \ 187 xVec = vldrwq_f32(&pSrcX[1]); \ 188 acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ 189 xVec = vld1q(pSrcX); \ 190 pSrcX += 4; \ 191 acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ 192 /* Decrement the loop counter */ \ 193 k--; \ 194 } \ 195 /* loop + tail predication expected here */ \ 196 k = count % 0x4U; \ 197 if (k > 0U) \ 198 { \ 199 mve_pred16_t p0 = vctp32q(k); \ 
200 yVec = vld1q(pSrcY); \ 201 pSrcY += 4; \ 202 xVec = vldrwq_f32(&pSrcX[1]); \ 203 acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ 204 xVec = vld1q(pSrcX); \ 205 pSrcX += 4; \ 206 acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ 207 } \ 208 \ 209 acc0 = vecAddAcrossF32Mve(acc0Vec); \ 210 acc1 = vecAddAcrossF32Mve(acc1Vec); \ 211 } 212 213 #define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\ 214 { \ 215 float32_t const *pSrcX, *pSrcY; \ 216 f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ 217 uint32_t k; \ 218 \ 219 acc0Vec = vdupq_n_f32(0.0f); \ 220 acc1Vec = vdupq_n_f32(0.0f); \ 221 pSrcX = (float32_t const *) pX; \ 222 pSrcY = (float32_t const *) pY; \ 223 k = count >> 2; \ 224 while (k > 0U) \ 225 { \ 226 xVec = vld1q(pSrcX); \ 227 pSrcX += 4; \ 228 yVec = vldrwq_f32(&pSrcY[-1]); \ 229 acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ 230 yVec = vld1q(pSrcY); \ 231 pSrcY += 4; \ 232 acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ 233 /* Decrement the loop counter */ \ 234 k--; \ 235 } \ 236 k = count % 0x4U; \ 237 /* use predication to finalize MAC sum */ \ 238 /* acc1 requires 1 additional sample */ \ 239 /* so add 1 to unmask an extra lane in final MAC computation */ \ 240 mve_pred16_t p0 = vctp32q(k+1); \ 241 xVec = vld1q(pSrcX); \ 242 pSrcX += 4; \ 243 yVec = vldrwq_f32(&pSrcY[-1]); \ 244 acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec,p0); \ 245 /* acc0 requires exact number of sample */ \ 246 /* disable extra lanes in final MAC computation */ \ 247 p0 = vctp32q(k); \ 248 yVec = vld1q(pSrcY); \ 249 pSrcY += 4; \ 250 acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec,p0); \ 251 \ 252 acc0 = vecAddAcrossF32Mve(acc0Vec); \ 253 acc1 = vecAddAcrossF32Mve(acc1Vec); \ 254 } 255 256 #define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count) \ 257 { \ 258 float32_t const *pSrcX; \ 259 f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ 260 uint32_t k; \ 261 \ 262 acc0Vec = vdupq_n_f32(0.0f); \ 263 acc1Vec = vdupq_n_f32(0.0f); \ 264 pSrcX = (float32_t const *) pX; \ 265 k 
= (count - 1) >> 2; \ 266 \ 267 while (k > 0U) \ 268 { \ 269 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 270 pY-=4; \ 271 xVec = vldrwq_f32(&pSrcX[1]); \ 272 acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ 273 xVec = vld1q(pSrcX); pSrcX += 4; \ 274 acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ 275 /* Decrement the loop counter */ \ 276 k--; \ 277 } \ 278 /* Loop with tail predication expected here */ \ 279 k = (count - 1) % 0x4U; \ 280 mve_pred16_t p0 = vctp32q(k); \ 281 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 282 xVec = vldrwq_f32(&pSrcX[1]); \ 283 acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ 284 xVec = vld1q(pSrcX); pSrcX += 4; \ 285 p0 = vctp32q(k+1); \ 286 acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ 287 \ 288 acc0 = vecAddAcrossF32Mve(acc0Vec); \ 289 acc1 = vecAddAcrossF32Mve(acc1Vec); \ 290 } 291 292 #define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count) \ 293 { \ 294 float32_t const *pSrcX; \ 295 f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ 296 uint32_t k; \ 297 \ 298 acc0Vec = vdupq_n_f32(0.0f); \ 299 acc1Vec = vdupq_n_f32(0.0f); \ 300 pSrcX = (float32_t const *) pX; \ 301 k = count >> 2; \ 302 \ 303 while (k > 0U) \ 304 { \ 305 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 306 pY-=4; \ 307 xVec = vldrwq_f32(&pSrcX[1]); \ 308 acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ 309 xVec = vld1q(pSrcX); pSrcX += 4; \ 310 acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ 311 /* Decrement the loop counter */ \ 312 k--; \ 313 } \ 314 /* Loop with tail predication expected here */ \ 315 k = count % 0x4U; \ 316 if (k > 0U) \ 317 { \ 318 mve_pred16_t p0 = vctp32q(k); \ 319 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 320 xVec = vldrwq_f32(&pSrcX[1]); \ 321 acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ 322 xVec = vld1q(pSrcX); pSrcX += 4; \ 323 acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ 324 } \ 325 acc0 = vecAddAcrossF32Mve(acc0Vec); \ 326 acc1 = vecAddAcrossF32Mve(acc1Vec); \ 327 } 328 329 
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\ 330 { \ 331 float32_t const *pSrcX; \ 332 const float32_t *pY1 = pY + 1; \ 333 f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ 334 uint32_t k; \ 335 \ 336 acc0Vec = vdupq_n_f32(0.0f); \ 337 acc1Vec = vdupq_n_f32(0.0f); \ 338 pSrcX = (float32_t const *) pX; \ 339 k = count >> 2; \ 340 \ 341 while (k > 0U) \ 342 { \ 343 xVec = vld1q(pSrcX); pSrcX += 4; \ 344 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 345 pY-=4; \ 346 acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ 347 yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \ 348 pY1-=4; \ 349 acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ 350 /* Decrement the loop counter */ \ 351 k--; \ 352 } \ 353 k = count % 0x4U; \ 354 /* use predication to finalize MAC sum */ \ 355 /* acc0 requires exact number of sample */ \ 356 /* disable extra lanes in final MAC computation */ \ 357 mve_pred16_t p0 = vctp32q(k); \ 358 xVec = vld1q(pSrcX); pSrcX += 4; \ 359 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 360 acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ 361 yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \ 362 /* acc1 requires 1 additional sample */ \ 363 /* so add 1 to unmask an extra lane in final MAC computation */ \ 364 p0 = vctp32q(k+1); \ 365 acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ 366 \ 367 acc0 = vecAddAcrossF32Mve(acc0Vec); \ 368 acc1 = vecAddAcrossF32Mve(acc1Vec); \ 369 } 370 371 #define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count) \ 372 { \ 373 float32_t const *pSrcX; \ 374 f32x4_t accVec, xVec, yVec; \ 375 uint32_t k; \ 376 \ 377 accVec = vdupq_n_f32(0.0f); \ 378 pSrcX = (float32_t const *) pX; \ 379 k = count >> 2; \ 380 \ 381 while (k > 0U) \ 382 { \ 383 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 384 pY-=4; \ 385 xVec = vld1q(pSrcX); pSrcX += 4; \ 386 accVec = vfmaq_f32(accVec, xVec, yVec); \ 387 /* Decrement the loop counter */ \ 388 k--; \ 389 } \ 390 /* Loop with tail predication 
expected here */ \ 391 k = count % 0x4U; \ 392 if (k > 0U) \ 393 { \ 394 mve_pred16_t p0 = vctp32q(k); \ 395 xVec = vld1q(pSrcX); pSrcX += 4; \ 396 yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ 397 accVec = vfmaq_m_f32(accVec, xVec, yVec, p0); \ 398 } \ 399 acc = vecAddAcrossF32Mve(accVec); \ 400 } 401 402 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/ 403 404 #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) 405 406 #define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count) \ 407 { \ 408 q31_t const *pSrcX; \ 409 q31x4_t xVec, yVec; \ 410 uint32_t k; \ 411 \ 412 pSrcX = (q31_t const *) pX; \ 413 k = count >> 2; \ 414 \ 415 while (k > 0U) \ 416 { \ 417 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 418 pY-=4; \ 419 xVec = vld1q(pSrcX); pSrcX += 4; \ 420 acc = vmlaldavaq(acc, xVec, yVec); \ 421 /* Decrement the loop counter */ \ 422 k--; \ 423 } \ 424 /* Loop with tail predication expected here */ \ 425 k = count % 0x4U; \ 426 if (k > 0U) \ 427 { \ 428 mve_pred16_t p0 = vctp32q(k); \ 429 xVec = vld1q(pSrcX); pSrcX += 4; \ 430 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 431 acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ 432 } \ 433 acc = asrl(acc, 31); \ 434 } 435 436 437 438 #define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\ 439 { \ 440 q31_t const *pSrcX; \ 441 const q31_t *pY1 = pY + 1; \ 442 q31x4_t xVec, yVec; \ 443 uint32_t k; \ 444 \ 445 pSrcX = (q31_t const *) pX; \ 446 k = count >> 2; \ 447 \ 448 while (k > 0U) \ 449 { \ 450 xVec = vld1q(pSrcX); pSrcX += 4; \ 451 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 452 pY-=4; \ 453 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 454 yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \ 455 pY1-=4; \ 456 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 457 /* Decrement the loop counter */ \ 458 k--; \ 459 } \ 460 k = count % 0x4U; \ 461 /* use predication to finalize MAC sum */ \ 462 /* acc0 requires 
exact number of sample */ \ 463 /* disable extra lanes in final MAC computation */ \ 464 mve_pred16_t p0 = vctp32q(k); \ 465 xVec = vld1q(pSrcX); pSrcX += 4; \ 466 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 467 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 468 yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \ 469 /* acc1 requires 1 additional sample */ \ 470 /* so add 1 to unmask an extra lane in final MAC computation */ \ 471 p0 = vctp32q(k+1); \ 472 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 473 \ 474 acc0 = asrl(acc0, 31); \ 475 acc1 = asrl(acc1, 31); \ 476 } 477 478 479 480 481 #define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \ 482 { \ 483 q31_t const *pSrcX; \ 484 q31x4_t xVec, yVec; \ 485 uint32_t k; \ 486 \ 487 pSrcX = (q31_t const *) pX; \ 488 k = (count-1) >> 2; \ 489 \ 490 while (k > 0U) \ 491 { \ 492 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 493 pY-=4; \ 494 xVec = vldrwq_s32(&pSrcX[1]); \ 495 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 496 xVec = vld1q(pSrcX); \ 497 pSrcX += 4; \ 498 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 499 /* Decrement the loop counter */ \ 500 k--; \ 501 } \ 502 k = (count - 1) % 0x4U; \ 503 /* use predication to finalize MAC sum */ \ 504 /* acc1 requires exact number of sample (count-1) */ \ 505 /* disable extra lanes in final MAC computation */ \ 506 mve_pred16_t p0 = vctp32q(k); \ 507 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 508 xVec = vldrwq_s32(&pSrcX[1]); \ 509 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 510 /* acc0 requires 1 additional sample (count) */ \ 511 /* so add 1 to unmask an extra lane in final MAC computation */ \ 512 p0 = vctp32q(k+1); \ 513 xVec = vld1q(pSrcX); \ 514 pSrcX += 4; \ 515 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 516 \ 517 acc0 = asrl(acc0, 31); \ 518 acc1 = asrl(acc1, 31); \ 519 } 520 521 522 523 #define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \ 524 { \ 525 q31_t const *pSrcX; \ 526 
q31x4_t xVec, yVec; \ 527 uint32_t k; \ 528 \ 529 pSrcX = (q31_t const *) pX; \ 530 k = count >> 2; \ 531 \ 532 while (k > 0U) \ 533 { \ 534 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 535 pY-=4; \ 536 xVec = vldrwq_s32(&pSrcX[1]); \ 537 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 538 xVec = vld1q(pSrcX); pSrcX += 4; \ 539 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 540 /* Decrement the loop counter */ \ 541 k--; \ 542 } \ 543 /* Loop with tail predication expected here */ \ 544 k = count % 0x4U; \ 545 if (k > 0U) \ 546 { \ 547 mve_pred16_t p0 = vctp32q(k); \ 548 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 549 xVec = vldrwq_s32(&pSrcX[1]); \ 550 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 551 xVec = vld1q(pSrcX); pSrcX += 4; \ 552 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 553 } \ 554 acc0 = asrl(acc0, 31); \ 555 acc1 = asrl(acc1, 31); \ 556 } 557 558 559 560 #define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \ 561 { \ 562 q31_t const *pSrcX; \ 563 q31x4_t xVec, yVec; \ 564 uint32_t k; \ 565 \ 566 pSrcX = (q31_t const *) pX; \ 567 k = count >> 2; \ 568 \ 569 while (k > 0U) \ 570 { \ 571 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 572 pY-=4; \ 573 xVec = vldrwq_s32(&pSrcX[1]); \ 574 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 575 xVec = vldrwq_s32(&pSrcX[2]); \ 576 acc2 = vmlaldavaq(acc2, xVec, yVec); \ 577 xVec = vldrwq_s32(&pSrcX[3]); \ 578 acc3 = vmlaldavaq(acc3, xVec, yVec); \ 579 xVec = vld1q(pSrcX); pSrcX += 4; \ 580 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 581 /* Decrement the loop counter */ \ 582 k--; \ 583 } \ 584 /* Loop with tail predication expected here */ \ 585 k = count % 0x4U; \ 586 if (k > 0U) \ 587 { \ 588 mve_pred16_t p0 = vctp32q(k); \ 589 yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ 590 xVec = vldrwq_s32(&pSrcX[1]); \ 591 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 592 xVec = vldrwq_s32(&pSrcX[2]); \ 593 acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ 594 xVec = 
vldrwq_s32(&pSrcX[3]); \ 595 acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ 596 xVec = vld1q(pSrcX); pSrcX += 4; \ 597 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 598 } \ 599 acc0 = asrl(acc0, 31); \ 600 acc1 = asrl(acc1, 31); \ 601 acc2 = asrl(acc2, 31); \ 602 acc3 = asrl(acc3, 31); \ 603 } 604 605 #define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\ 606 { \ 607 q31_t const *pSrcX, *pSrcY; \ 608 q31x4_t xVec, yVec; \ 609 uint32_t k; \ 610 \ 611 pSrcX = (q31_t const *) pX; \ 612 pSrcY = (q31_t const *) pY; \ 613 k = count >> 2; \ 614 \ 615 while (k > 0U) \ 616 { \ 617 xVec = vld1q(pSrcX); pSrcX += 4; \ 618 yVec = vldrwq_s32(&pSrcY[-1]); \ 619 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 620 yVec = vld1q(pSrcY); pSrcY += 4; \ 621 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 622 /* Decrement the loop counter */ \ 623 k--; \ 624 } \ 625 k = count % 0x4U; \ 626 /* use predication to finalize MAC sum */ \ 627 /* acc1 requires 1 additional sample */ \ 628 /* so add 1 to unmask an extra lane in final MAC computation */ \ 629 mve_pred16_t p0 = vctp32q(k+1); \ 630 xVec = vld1q(pSrcX); pSrcX += 4; \ 631 yVec = vldrwq_s32(&pSrcY[-1]); \ 632 acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \ 633 /* acc0 requires exact number of sample */ \ 634 /* disable extra lanes in final MAC computation */ \ 635 p0 = vctp32q(k); \ 636 yVec = vld1q(pSrcY); pSrcY += 4; \ 637 acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \ 638 \ 639 acc0 = asrl(acc0, 31); \ 640 acc1 = asrl(acc1, 31); \ 641 } 642 643 #define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)\ 644 { \ 645 q31_t const *pSrcX, *pSrcY; \ 646 q31x4_t xVec, yVec; \ 647 uint32_t k; \ 648 \ 649 pSrcX = (q31_t const *) pX; \ 650 pSrcY = (q31_t const *) pY; \ 651 k = count >> 2; \ 652 \ 653 while (k > 0U) \ 654 { \ 655 xVec = vld1q(pSrcX); pSrcX += 4; \ 656 yVec = vld1q(pSrcY); pSrcY += 4; \ 657 acc = vmlaldavaq(acc, xVec, yVec); \ 658 /* Decrement the loop counter */ \ 659 k--; \ 660 } \ 661 /* tail predication expected here */ \ 662 k = count 
% 0x4U; \ 663 if (k > 0U) \ 664 { \ 665 mve_pred16_t p0 = vctp32q(k); \ 666 xVec = vld1q(pSrcX); pSrcX += 4; \ 667 yVec = vld1q(pSrcY); pSrcY += 4; \ 668 acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ 669 } \ 670 acc = asrl(acc, 31); \ 671 } 672 673 #define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)\ 674 { \ 675 q31_t const *pSrcX, *pSrcY; \ 676 q31x4_t xVec, yVec; \ 677 uint32_t k; \ 678 \ 679 pSrcX = (q31_t const *) pX; \ 680 pSrcY = (q31_t const *) pY; \ 681 k = count >> 2; \ 682 \ 683 while (k > 0U) \ 684 { \ 685 yVec = vld1q(pSrcY); pSrcY += 4; \ 686 xVec = vldrwq_s32(&pSrcX[1]); \ 687 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 688 xVec = vldrwq_s32(&pSrcX[2]); \ 689 acc2 = vmlaldavaq(acc2, xVec, yVec); \ 690 xVec = vldrwq_s32(&pSrcX[3]); \ 691 acc3 = vmlaldavaq(acc3, xVec, yVec); \ 692 xVec = vld1q(pSrcX); pSrcX += 4; \ 693 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 694 /* Decrement the loop counter */ \ 695 k--; \ 696 } \ 697 /* loop + tail predication expected here */ \ 698 k = count % 0x4U; \ 699 if (k > 0U) \ 700 { \ 701 mve_pred16_t p0 = vctp32q(k); \ 702 yVec = vld1q(pSrcY); pSrcY += 4; \ 703 xVec = vldrwq_s32(&pSrcX[1]); \ 704 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 705 xVec = vldrwq_s32(&pSrcX[2]); \ 706 acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ 707 xVec = vldrwq_s32(&pSrcX[3]); \ 708 acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ 709 xVec = vld1q(pSrcX); pSrcX += 4; \ 710 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 711 } \ 712 \ 713 acc0 = asrl(acc0, 31); \ 714 acc1 = asrl(acc1, 31); \ 715 acc2 = asrl(acc2, 31); \ 716 acc3 = asrl(acc3, 31); \ 717 } 718 719 #define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)\ 720 { \ 721 q31_t const *pSrcX, *pSrcY; \ 722 q31x4_t xVec, yVec; \ 723 uint32_t k; \ 724 \ 725 pSrcX = (q31_t const *) pX; \ 726 pSrcY = (q31_t const *) pY; \ 727 k = count >> 2; \ 728 \ 729 while (k > 0U) \ 730 { \ 731 yVec = vld1q(pSrcY); pSrcY += 4; \ 732 xVec = vldrwq_s32(&pSrcX[1]); \ 
733 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 734 xVec = vld1q(pSrcX); pSrcX += 4; \ 735 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 736 /* Decrement the loop counter */ \ 737 k--; \ 738 } \ 739 /* loop + tail predication expected here */ \ 740 k = count % 0x4U; \ 741 if (k > 0U) \ 742 { \ 743 mve_pred16_t p0 = vctp32q(k); \ 744 yVec = vld1q(pSrcY); pSrcY += 4; \ 745 xVec = vldrwq_s32(&pSrcX[1]); \ 746 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 747 xVec = vld1q(pSrcX); pSrcX += 4; \ 748 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 749 } \ 750 \ 751 acc0 = asrl(acc0, 31); \ 752 acc1 = asrl(acc1, 31); \ 753 } 754 755 #define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\ 756 { \ 757 q31_t const *pSrcX, *pSrcY; \ 758 q31x4_t xVec, yVec; \ 759 uint32_t k; \ 760 \ 761 pSrcX = (q31_t const *) pX; \ 762 pSrcY = (q31_t const *) pY; \ 763 k = (count-1) >> 2; \ 764 \ 765 while (k > 0U) \ 766 { \ 767 yVec = vld1q(pSrcY); pSrcY += 4; \ 768 xVec = vldrwq_s32(&pSrcX[1]); \ 769 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 770 xVec = vld1q(pSrcX); pSrcX += 4; \ 771 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 772 /* Decrement the loop counter */ \ 773 k--; \ 774 } \ 775 /* use predication to finalize MAC sum */ \ 776 /* acc1 requires exact number of sample (count-1) */ \ 777 /* disable extra lanes in final MAC computation */ \ 778 k = (count-1) % 0x4U; \ 779 mve_pred16_t p0 = vctp32q(k); \ 780 yVec = vld1q(pSrcY); pSrcY += 4; \ 781 xVec = vldrwq_s32(&pSrcX[1]); \ 782 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 783 /* acc0 requires 1 additional sample (count) */ \ 784 /* so add 1 to unmask an extra lane in final MAC computation */ \ 785 p0 = vctp32q(k+1); \ 786 xVec = vld1q(pSrcX); pSrcX += 4; \ 787 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 788 \ 789 acc0 = asrl(acc0, 31); \ 790 acc1 = asrl(acc1, 31); \ 791 } 792 793 #define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\ 794 { \ 795 q15_t const *pSrcX, *pSrcY; \ 796 q15x8_t xVec, yVec; \ 797 uint32_t k; \ 
798 \ 799 pSrcX = (q15_t const *) pX; \ 800 pSrcY = (q15_t const *) pY; \ 801 k = count >> 3; \ 802 while (k > 0U) \ 803 { \ 804 xVec = vld1q(pSrcX); pSrcX += 8; \ 805 yVec = vldrhq_s16(&pSrcY[-1]); \ 806 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 807 yVec = vld1q(pSrcY); pSrcY += 8; \ 808 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 809 /* Decrement the loop counter */ \ 810 k--; \ 811 } \ 812 k = count % 0x8U; \ 813 /* use predication to finalize MAC sum */ \ 814 /* acc1 requires 1 additional sample */ \ 815 /* so add 1 to unmask an extra lane in final MAC computation */ \ 816 mve_pred16_t p0 = vctp16q(k+1); \ 817 xVec = vld1q(pSrcX); pSrcX += 8; \ 818 yVec = vldrhq_s16(&pSrcY[-1]); \ 819 acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \ 820 /* acc0 requires exact number of sample */ \ 821 /* disable extra lanes in final MAC computation */ \ 822 p0 = vctp16q(k); \ 823 yVec = vld1q(pSrcY); pSrcY += 8; \ 824 acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \ 825 \ 826 acc0 = asrl(acc0, 15); \ 827 acc1 = asrl(acc1, 15); \ 828 acc0 = __SSAT(acc0, 16); \ 829 acc1 = __SSAT(acc1, 16); \ 830 } 831 832 #define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)\ 833 { \ 834 q15_t const *pSrcX, *pSrcY; \ 835 q15x8_t xVec, yVec; \ 836 uint32_t k; \ 837 \ 838 pSrcX = (q15_t const *) pX; \ 839 pSrcY = (q15_t const *) pY; \ 840 k = count >> 3; \ 841 while (k > 0U) \ 842 { \ 843 xVec = vld1q(pSrcX); pSrcX += 8; \ 844 yVec = vld1q(pSrcY); pSrcY += 8; \ 845 acc = vmlaldavaq(acc, xVec, yVec); \ 846 /* Decrement the loop counter */ \ 847 k--; \ 848 } \ 849 /* tail predication expected here */ \ 850 k = count % 0x8U; \ 851 if (k > 0U) \ 852 { \ 853 mve_pred16_t p0 = vctp16q(k); \ 854 xVec = vld1q(pSrcX); pSrcX += 8; \ 855 yVec = vld1q(pSrcY); pSrcY += 8; \ 856 acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ 857 } \ 858 acc = asrl(acc, 15); \ 859 acc = __SSAT(acc, 16); \ 860 } 861 862 #define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)\ 863 { \ 864 q15_t const *pSrcX, *pSrcY; \ 865 
q15x8_t xVec, yVec; \ 866 uint32_t k; \ 867 \ 868 pSrcX = (q15_t const *) pX; \ 869 pSrcY = (q15_t const *) pY; \ 870 k = count >> 3; \ 871 \ 872 while (k > 0U) \ 873 { \ 874 yVec = vld1q(pSrcY); pSrcY += 8; \ 875 xVec = vldrhq_s16(&pSrcX[1]); \ 876 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 877 xVec = vldrhq_s16(&pSrcX[2]); \ 878 acc2 = vmlaldavaq(acc2, xVec, yVec); \ 879 xVec = vldrhq_s16(&pSrcX[3]); \ 880 acc3 = vmlaldavaq(acc3, xVec, yVec); \ 881 xVec = vld1q(pSrcX); pSrcX += 8; \ 882 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 883 /* Decrement the loop counter */ \ 884 k--; \ 885 } \ 886 /* loop + tail predication expected here */ \ 887 k = count % 0x8U; \ 888 if (k > 0U) \ 889 { \ 890 mve_pred16_t p0 = vctp16q(k); \ 891 yVec = vld1q(pSrcY); pSrcY += 8; \ 892 xVec = vldrhq_s16(&pSrcX[1]); \ 893 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 894 xVec = vldrhq_s16(&pSrcX[2]); \ 895 acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ 896 xVec = vldrhq_s16(&pSrcX[3]); \ 897 acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ 898 xVec = vld1q(pSrcX); pSrcX += 8; \ 899 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 900 } \ 901 \ 902 acc0 = asrl(acc0, 15); \ 903 acc1 = asrl(acc1, 15); \ 904 acc2 = asrl(acc2, 15); \ 905 acc3 = asrl(acc3, 15); \ 906 acc0 = __SSAT(acc0, 16); \ 907 acc1 = __SSAT(acc1, 16); \ 908 acc2 = __SSAT(acc2, 16); \ 909 acc3 = __SSAT(acc3, 16); \ 910 } 911 912 #define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)\ 913 { \ 914 q15_t const *pSrcX, *pSrcY; \ 915 q15x8_t xVec, yVec; \ 916 uint32_t k; \ 917 \ 918 pSrcX = (q15_t const *) pX; \ 919 pSrcY = (q15_t const *) pY; \ 920 k = count >> 3; \ 921 \ 922 while (k > 0U) \ 923 { \ 924 yVec = vld1q(pSrcY); pSrcY += 8; \ 925 xVec = vldrhq_s16(&pSrcX[1]); \ 926 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 927 xVec = vld1q(pSrcX); pSrcX += 8; \ 928 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 929 /* Decrement the loop counter */ \ 930 k--; \ 931 } \ 932 /* loop + tail predication expected here */ \ 933 k = count % 0x8U; \ 
934 if (k > 0U) \ 935 { \ 936 mve_pred16_t p0 = vctp16q(k); \ 937 yVec = vld1q(pSrcY); pSrcY += 8; \ 938 xVec = vldrhq_s16(&pSrcX[1]); \ 939 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 940 xVec = vld1q(pSrcX); pSrcX += 8; \ 941 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 942 } \ 943 \ 944 acc0 = asrl(acc0, 15); \ 945 acc1 = asrl(acc1, 15); \ 946 acc0 = __SSAT(acc0, 16); \ 947 acc1 = __SSAT(acc1, 16); \ 948 } 949 950 #define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)\ 951 { \ 952 q15_t const *pSrcX, *pSrcY; \ 953 q15x8_t xVec, yVec; \ 954 uint32_t k; \ 955 \ 956 pSrcX = (q15_t const *) pX; \ 957 pSrcY = (q15_t const *) pY; \ 958 k = (count-1) >> 3; \ 959 \ 960 while (k > 0U) \ 961 { \ 962 yVec = vld1q(pSrcY); pSrcY += 8; \ 963 xVec = vldrhq_s16(&pSrcX[1]); \ 964 acc1 = vmlaldavaq(acc1, xVec, yVec); \ 965 xVec = vld1q(pSrcX); pSrcX += 8; \ 966 acc0 = vmlaldavaq(acc0, xVec, yVec); \ 967 /* Decrement the loop counter */ \ 968 k--; \ 969 } \ 970 /* use predication to finalize MAC sum */ \ 971 /* acc1 requires exact number of sample (count-1) */ \ 972 /* disable extra lanes in final MAC computation */ \ 973 k = (count-1) % 0x8U; \ 974 mve_pred16_t p0 = vctp16q(k); \ 975 yVec = vld1q(pSrcY); pSrcY += 8; \ 976 xVec = vldrhq_s16(&pSrcX[1]); \ 977 acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ 978 /* acc0 requires 1 additional sample (count) */ \ 979 /* so add 1 to unmask an extra lane in final MAC computation */ \ 980 p0 = vctp16q(k+1); \ 981 xVec = vld1q(pSrcX); pSrcX += 8; \ 982 acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ 983 \ 984 acc0 = asrl(acc0, 15); \ 985 acc1 = asrl(acc1, 15); \ 986 acc0 = __SSAT(acc0, 16); \ 987 acc1 = __SSAT(acc1, 16); \ 988 } 989 990 #define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\ 991 { \ 992 q15_t const *pSrcX; \ 993 const q15_t *pY1 = pY + 1; \ 994 q15x8_t xVec, yVec; \ 995 uint32_t k; \ 996 \ 997 pSrcX = (q15_t const *) pX; \ 998 k = count >> 3; \ 999 \ 1000 while (k > 0U) \ 1001 { \ 1002 xVec = 
vld1q(pSrcX); pSrcX += 8; \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
        pY1-=8; \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc0 requires exact number of sample */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k); \
    xVec = vld1q(pSrcX); pSrcX += 8; \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp16q(k+1); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}

/*
 * NOTE(review): every macro below expands inside a caller that must have a
 * gather-offset index vector named `decrIdxVec` in scope (holding decrementing
 * lane offsets used to read the y[] operand in reversed order for convolution).
 * The macros also advance the pointer arguments they receive by name (pX, pY),
 * and evaluate their arguments multiple times — pass plain lvalues, never
 * expressions with side effects.
 */

/*
 * Q15 convolution, single accumulator.
 * Accumulates sum(x[i] * y[-i]) over `count` samples into `acc` (64-bit MAC),
 * then narrows: arithmetic shift right by 15 and saturate to 16 bits.
 * The remainder (count % 8) is handled with tail predication (vctp16q).
 */
#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
\
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 15); \
    acc = __SSAT(acc, 16); \
}

/*
 * Q15 convolution, 4 accumulators, fixed-size MAC window.
 * Computes 4 adjacent output samples at once: acc0..acc3 use x[] shifted by
 * 0..3 elements against the same reversed y[] vector, so x[] is reloaded four
 * times per iteration at offsets &pSrcX[0..3]. All four MACs share one
 * tail-predication mask since each consumes exactly `count` samples.
 */
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
\
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc2 = asrl(acc2, 15); \
    acc3 = asrl(acc3, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
    acc2 = __SSAT(acc2, 16); \
    acc3 = __SSAT(acc3, 16); \
}

/*
 * Q15 convolution, 2 accumulators, fixed-size MAC window.
 * Same scheme as the QUAD variant above but for 2 adjacent outputs:
 * acc1 uses x[] advanced by one element (&pSrcX[1]), acc0 uses x[] as-is.
 * Both accumulators consume exactly `count` samples (shared tail mask).
 */
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
\
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}

/*
 * Q15 convolution, 2 accumulators, decrementing MAC-window size.
 * acc1 consumes (count-1) samples, acc0 consumes count samples, so the tail
 * runs unconditionally with two different predicates: vctp16q(k) masks acc1
 * to exactly (count-1) and vctp16q(k+1) unmasks one extra lane for acc0.
 */
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q15_t const *) pX; \
    k = (count-1) >> 3; \
\
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = (count - 1) % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires exact number of sample (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k); \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
    xVec = vldrhq_s16(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp16q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 8; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}

/*
 * Q7 correlation, 2 accumulators, decrementing y / incrementing window size.
 * acc1 correlates against y[] shifted back one element (&pSrcY[-1]) and needs
 * one sample more than acc0 (tail masks vctp8q(k+1) vs vctp8q(k)).
 * Results are narrowed with >> 7 then saturated to 8 bits.
 * 32-bit MAC (vmladavaq) processes 16 Q7 lanes per iteration.
 */
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        yVec = vldrbq_s8(&pSrcY[-1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x10U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    mve_pred16_t p0 = vctp8q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 16; \
    yVec = vldrbq_s8(&pSrcY[-1]); \
    acc1 = vmladavaq_p(acc1, xVec, yVec,p0); \
    /* acc0 requires exact number of sample */ \
    /* disable extra lanes in final MAC computation */ \
    p0 = vctp8q(k); \
    yVec = vld1q(pSrcY); pSrcY += 16; \
    acc0 = vmladavaq_p(acc0, xVec, yVec,p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}

/*
 * Q7 correlation, single accumulator.
 * Straight dot product of x[] and y[] over `count` samples (both read
 * forward — correlation needs no reversed gather), 16 lanes per iteration,
 * tail handled with vctp8q predication; result narrowed >> 7 and saturated.
 */
#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)\
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        acc = vmladavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        acc = vmladavaq_p(acc, xVec, yVec, p0); \
    } \
    acc =(acc >> 7); \
    acc = __SSAT(acc, 8); \
}

/*
 * Q7 correlation, 4 accumulators, fixed-size MAC window.
 * Computes 4 adjacent outputs per call: acc0..acc3 correlate x[] advanced by
 * 0..3 elements against the same y[] vector. All four accumulators consume
 * exactly `count` samples, so one tail mask is shared.
 */
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)\
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
\
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq(acc2, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc2 = (acc2 >> 7); \
    acc3 = (acc3 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
    acc2 = __SSAT(acc2, 8); \
    acc3 = __SSAT(acc3, 8); \
}

/*
 * Q7 correlation, 2 accumulators, fixed-size MAC window.
 * Same scheme as the QUAD variant but for 2 adjacent outputs:
 * acc1 uses x[] advanced by one element, acc0 uses x[] as-is.
 */
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)\
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
\
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}

/*
 * Q7 correlation, 2 accumulators, decrementing MAC-window size.
 * acc1 consumes (count-1) samples and acc0 consumes count, so the tail runs
 * unconditionally with two predicates: vctp8q(k) for acc1, vctp8q(k+1) for
 * acc0 (one extra lane).
 */
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)\
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = (count-1) >> 4; \
\
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires exact number of sample (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    k = (count-1) % 0x10U; \
    mve_pred16_t p0 = vctp8q(k); \
    yVec = vld1q(pSrcY); pSrcY += 16; \
    xVec = vldrbq_s8(&pSrcX[1]); \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp8q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 16; \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}

/*
 * Q7 convolution, 2 accumulators, incrementing y / incrementing window size.
 * y[] is read in reversed lane order via vldrbq_gather_offset_s8 with
 * decrIdxVec; pY1 = pY + 1 feeds acc1, which needs one sample more than acc0
 * (tail masks vctp8q(k+1) vs vctp8q(k)).
 */
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
{ \
    q7_t const *pSrcX; \
    const q7_t *pY1 = pY + 1; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
\
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY-=16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
        pY1-=16; \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x10U; \
    /* use predication to finalize MAC sum */ \
    /* acc0 requires exact number of sample */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp8q(k); \
    xVec = vld1q(pSrcX); pSrcX += 16; \
    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp8q(k+1); \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}

/*
 * Q7 convolution, single accumulator.
 * Accumulates sum(x[i] * y[-i]) over `count` samples (y[] read reversed via
 * decrIdxVec gather), tail handled with vctp8q predication; result is
 * narrowed >> 7 and saturated to 8 bits in one expression.
 */
#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
\
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY-=16; \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc = vmladavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        acc = vmladavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = __SSAT(acc >> 7, 8); \
}

/*
 * Q7 convolution, 4 accumulators, fixed-size MAC window.
 * 4 adjacent outputs per call: acc0..acc3 use x[] advanced by 0..3 elements
 * against the same reversed y[] vector; one shared tail mask since all
 * accumulators consume exactly `count` samples.
 */
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
\
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY-=16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq(acc2, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = __SSAT(acc0 >> 7, 8); \
    acc1 = __SSAT(acc1 >> 7, 8); \
    acc2 = __SSAT(acc2 >> 7, 8); \
    acc3 = __SSAT(acc3 >> 7, 8); \
}

/*
 * Q7 convolution, 2 accumulators, fixed-size MAC window.
 * Same scheme as the QUAD variant but for 2 adjacent outputs:
 * acc1 uses x[] advanced by one element, acc0 uses x[] as-is.
 */
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
\
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY-=16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = __SSAT(acc0 >> 7, 8); \
    acc1 = __SSAT(acc1 >> 7, 8); \
}


/*
 * Q7 convolution, 2 accumulators, decrementing MAC-window size.
 * acc1 consumes (count-1) samples and acc0 consumes count, so the tail runs
 * unconditionally with two predicates: vctp8q(k) for acc1, vctp8q(k+1) for
 * acc0 (one extra lane).
 */
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
\
    pSrcX = (q7_t const *) pX; \
    k = (count-1) >> 4; \
\
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY-=16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = (count - 1) % 0x10U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires exact number of sample (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp8q(k); \
    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
    xVec = vldrbq_s8(&pSrcX[1]); \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp8q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 16; \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}

#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */

#ifdef __cplusplus
}
#endif


#endif /* _ARM_VEC_FILTERING_H_ */