1 /****************************************************************************** 2 * @file matrix_utils.h 3 * @brief Public header file for CMSIS DSP Library 4 * @version V1.11.0 5 * @date 30 May 2022 6 * Target Processor: Cortex-M and Cortex-A cores 7 ******************************************************************************/ 8 /* 9 * Copyright (c) 2010-2022 Arm Limited or its affiliates. All rights reserved. 10 * 11 * SPDX-License-Identifier: Apache-2.0 12 * 13 * Licensed under the Apache License, Version 2.0 (the License); you may 14 * not use this file except in compliance with the License. 15 * You may obtain a copy of the License at 16 * 17 * www.apache.org/licenses/LICENSE-2.0 18 * 19 * Unless required by applicable law or agreed to in writing, software 20 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 21 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 * See the License for the specific language governing permissions and 23 * limitations under the License. 24 */ 25 26 27 #ifndef _MATRIX_UTILS_H_ 28 #define _MATRIX_UTILS_H_ 29 30 #include "arm_math_types.h" 31 #include "arm_math_memory.h" 32 33 #include "dsp/none.h" 34 #include "dsp/utils.h" 35 36 #ifdef __cplusplus 37 extern "C" 38 { 39 #endif 40 41 #define ELEM(A,ROW,COL) &((A)->pData[(A)->numCols* (ROW) + (COL)]) 42 43 #define SCALE_COL_T(T,CAST,A,ROW,v,i) \ 44 { \ 45 int32_t _w; \ 46 T *data = (A)->pData; \ 47 const int32_t _numCols = (A)->numCols; \ 48 const int32_t nb = (A)->numRows - ROW;\ 49 \ 50 data += i + _numCols * (ROW); \ 51 \ 52 for(_w=0;_w < nb; _w++) \ 53 { \ 54 *data *= CAST v; \ 55 data += _numCols; \ 56 } \ 57 } 58 59 #define COPY_COL_T(T,A,ROW,COL,DST) \ 60 { \ 61 uint32_t _row; \ 62 T *_pb=DST; \ 63 T *_pa = (A)->pData + ROW * (A)->numCols + COL;\ 64 for(_row = ROW; _row < (A)->numRows; _row ++) \ 65 { \ 66 *_pb++ = *_pa; \ 67 _pa += (A)->numCols; \ 68 } \ 69 } 70 71 #if defined(ARM_FLOAT16_SUPPORTED) 72 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 73 74 #define SWAP_ROWS_F16(A,COL,i,j) \ 75 { \ 76 int cnt = ((A)->numCols)-(COL); \ 77 int32_t _w; \ 78 float16_t *data = (A)->pData; \ 79 const int32_t _numCols = (A)->numCols; \ 80 \ 81 for(_w=(COL);_w < _numCols; _w+=8) \ 82 { \ 83 f16x8_t tmpa,tmpb; \ 84 mve_pred16_t p0 = vctp16q(cnt); \ 85 \ 86 tmpa=vldrhq_z_f16(&data[i*_numCols + _w],p0);\ 87 tmpb=vldrhq_z_f16(&data[j*_numCols + _w],p0);\ 88 \ 89 vstrhq_p(&data[i*_numCols + _w], tmpb, p0); \ 90 vstrhq_p(&data[j*_numCols + _w], tmpa, p0); \ 91 \ 92 cnt -= 8; \ 93 } \ 94 } 95 96 #define SCALE_ROW_F16(A,COL,v,i) \ 97 { \ 98 int cnt = ((A)->numCols)-(COL); \ 99 int32_t _w; \ 100 float16_t *data = (A)->pData; \ 101 const int32_t _numCols = (A)->numCols; \ 102 \ 103 for(_w=(COL);_w < _numCols; _w+=8) \ 104 { \ 105 f16x8_t tmpa; \ 106 mve_pred16_t p0 = vctp16q(cnt); \ 107 tmpa = vldrhq_z_f16(&data[i*_numCols + _w],p0);\ 108 tmpa = vmulq_n_f16(tmpa,(_Float16)v); \ 109 vstrhq_p(&data[i*_numCols + _w], tmpa, p0); \ 110 cnt -= 8; \ 111 } \ 112 \ 113 } 114 115 #define MAC_ROW_F16(COL,A,i,v,B,j) \ 116 { \ 117 int cnt = ((A)->numCols)-(COL); \ 118 int32_t _w; \ 119 float16_t *dataA = (A)->pData; \ 120 float16_t *dataB = (B)->pData; \ 121 const int32_t _numCols = (A)->numCols; \ 122 \ 123 for(_w=(COL);_w < _numCols; _w+=8) \ 124 { \ 125 f16x8_t tmpa,tmpb; \ 126 mve_pred16_t p0 = vctp16q(cnt); \ 127 tmpa = vldrhq_z_f16(&dataA[i*_numCols + _w],p0);\ 128 tmpb = vldrhq_z_f16(&dataB[j*_numCols + _w],p0);\ 129 tmpa = vfmaq_n_f16(tmpa,tmpb,v); \ 130 vstrhq_p(&dataA[i*_numCols + _w], tmpa, p0); \ 131 cnt -= 8; \ 132 } \ 133 \ 134 } 135 136 #define MAS_ROW_F16(COL,A,i,v,B,j) \ 137 { \ 138 int cnt = ((A)->numCols)-(COL); \ 139 int32_t _w; \ 140 float16_t *dataA = (A)->pData; \ 141 float16_t *dataB = (B)->pData; \ 142 const int32_t _numCols = (A)->numCols; \ 143 f16x8_t vec=vdupq_n_f16(v); \ 144 \ 145 for(_w=(COL);_w < _numCols; _w+=8) \ 146 { \ 147 f16x8_t tmpa,tmpb; \ 148 mve_pred16_t p0 = vctp16q(cnt); \ 149 tmpa = vldrhq_z_f16(&dataA[i*_numCols + _w],p0);\ 150 tmpb = vldrhq_z_f16(&dataB[j*_numCols + _w],p0);\ 151 tmpa = vfmsq_f16(tmpa,tmpb,vec); \ 152 vstrhq_p(&dataA[i*_numCols + _w], tmpa, p0); \ 153 cnt -= 8; \ 154 } \ 155 \ 156 } 157 158 #else 159 160 161 #define SWAP_ROWS_F16(A,COL,i,j) \ 162 { \ 163 int32_t _w; \ 164 float16_t *dataI = (A)->pData; \ 165 float16_t *dataJ = (A)->pData; \ 166 const int32_t _numCols = (A)->numCols;\ 167 const int32_t nb = _numCols-(COL); \ 168 \ 169 dataI += i*_numCols + (COL); \ 170 dataJ += j*_numCols + (COL); \ 171 \ 172 for(_w=0;_w < nb; _w++) \ 173 { \ 174 float16_t tmp; \ 175 tmp = *dataI; \ 176 *dataI++ = *dataJ; \ 177 *dataJ++ = tmp; \ 178 } \ 179 } 180 181 #define SCALE_ROW_F16(A,COL,v,i) \ 182 { \ 183 int32_t _w; \ 184 float16_t *data = (A)->pData; \ 185 const int32_t _numCols = (A)->numCols;\ 186 const int32_t nb = _numCols-(COL); \ 187 \ 188 data += i*_numCols + (COL); \ 189 \ 190 for(_w=0;_w < nb; _w++) \ 191 { \ 192 *data++ *= (_Float16)v; \ 193 } \ 194 } 195 196 197 #define MAC_ROW_F16(COL,A,i,v,B,j) \ 198 { \ 199 int32_t _w; \ 200 float16_t *dataA = (A)->pData; \ 201 float16_t *dataB = (B)->pData; \ 202 const int32_t _numCols = (A)->numCols; \ 203 const int32_t nb = _numCols-(COL); \ 204 \ 205 dataA += i*_numCols + (COL); \ 206 dataB += j*_numCols + (COL); \ 207 \ 208 for(_w=0;_w < nb; _w++) \ 209 { \ 210 *dataA++ += (_Float16)v * (_Float16)*dataB++;\ 211 } \ 212 } 213 214 #define MAS_ROW_F16(COL,A,i,v,B,j) \ 215 { \ 216 int32_t _w; \ 217 float16_t *dataA = (A)->pData; \ 218 float16_t *dataB = (B)->pData; \ 219 const int32_t _numCols = (A)->numCols; \ 220 const int32_t nb = _numCols-(COL); \ 221 \ 222 dataA += i*_numCols + (COL); \ 223 dataB += j*_numCols + (COL); \ 224 \ 225 for(_w=0;_w < nb; _w++) \ 226 { \ 227 *dataA++ -= (_Float16)v * (_Float16)*dataB++;\ 228 } \ 229 } 230 231 #endif /*defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/ 232 233 /* Functions with only a scalar version */ 234 #define COPY_COL_F16(A,ROW,COL,DST) \ 235 COPY_COL_T(float16_t,A,ROW,COL,DST) 236 237 #define SCALE_COL_F16(A,ROW,v,i) \ 238 SCALE_COL_T(float16_t,(_Float16),A,ROW,v,i) 239 240 #endif /* defined(ARM_FLOAT16_SUPPORTED)*/ 241 242 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 243 244 #define SWAP_ROWS_F32(A,COL,i,j) \ 245 { \ 246 int cnt = ((A)->numCols)-(COL); \ 247 float32_t *data = (A)->pData; \ 248 const int32_t _numCols = (A)->numCols; \ 249 int32_t _w; \ 250 \ 251 for(_w=(COL);_w < _numCols; _w+=4) \ 252 { \ 253 f32x4_t tmpa,tmpb; \ 254 mve_pred16_t p0 = vctp32q(cnt); \ 255 \ 256 tmpa=vldrwq_z_f32(&data[i*_numCols + _w],p0);\ 257 tmpb=vldrwq_z_f32(&data[j*_numCols + _w],p0);\ 258 \ 259 vstrwq_p(&data[i*_numCols + _w], tmpb, p0); \ 260 vstrwq_p(&data[j*_numCols + _w], tmpa, p0); \ 261 \ 262 cnt -= 4; \ 263 } \ 264 } 265 266 #define MAC_ROW_F32(COL,A,i,v,B,j) \ 267 { \ 268 int cnt = ((A)->numCols)-(COL); \ 269 float32_t *dataA = (A)->pData; \ 270 float32_t *dataB = (B)->pData; \ 271 const int32_t _numCols = (A)->numCols; \ 272 int32_t _w; \ 273 \ 274 for(_w=(COL);_w < _numCols; _w+=4) \ 275 { \ 276 f32x4_t tmpa,tmpb; \ 277 mve_pred16_t p0 = vctp32q(cnt); \ 278 tmpa = vldrwq_z_f32(&dataA[i*_numCols + _w],p0);\ 279 tmpb = vldrwq_z_f32(&dataB[j*_numCols + _w],p0);\ 280 tmpa = vfmaq_n_f32(tmpa,tmpb,v); \ 281 vstrwq_p(&dataA[i*_numCols + _w], tmpa, p0); \ 282 cnt -= 4; \ 283 } \ 284 \ 285 } 286 287 #define MAS_ROW_F32(COL,A,i,v,B,j) \ 288 { \ 289 int cnt = ((A)->numCols)-(COL); \ 290 float32_t *dataA = (A)->pData; \ 291 float32_t *dataB = (B)->pData; \ 292 const int32_t _numCols = (A)->numCols; \ 293 int32_t _w; \ 294 f32x4_t vec=vdupq_n_f32(v); \ 295 \ 296 for(_w=(COL);_w < _numCols; _w+=4) \ 297 { \ 298 f32x4_t tmpa,tmpb; \ 299 mve_pred16_t p0 = vctp32q(cnt); \ 300 tmpa = vldrwq_z_f32(&dataA[i*_numCols + _w],p0);\ 301 tmpb = vldrwq_z_f32(&dataB[j*_numCols + _w],p0);\ 302 tmpa = vfmsq_f32(tmpa,tmpb,vec); \ 303 vstrwq_p(&dataA[i*_numCols + _w], tmpa, p0); \ 304 cnt -= 4; \ 305 } \ 306 \ 307 } 308 309 #define SCALE_ROW_F32(A,COL,v,i) \ 310 { \ 311 int cnt = ((A)->numCols)-(COL); \ 312 float32_t *data = (A)->pData; \ 313 const int32_t _numCols = (A)->numCols; \ 314 int32_t _w; \ 315 \ 316 for(_w=(COL);_w < _numCols; _w+=4) \ 317 { \ 318 f32x4_t tmpa; \ 319 mve_pred16_t p0 = vctp32q(cnt); \ 320 tmpa = vldrwq_z_f32(&data[i*_numCols + _w],p0);\ 321 tmpa = vmulq_n_f32(tmpa,v); \ 322 vstrwq_p(&data[i*_numCols + _w], tmpa, p0); \ 323 cnt -= 4; \ 324 } \ 325 \ 326 } 327 328 #elif defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) 329 330 #define SWAP_ROWS_F32(A,COL,i,j) \ 331 { \ 332 int32_t _w; \ 333 float32_t *dataI = (A)->pData; \ 334 float32_t *dataJ = (A)->pData; \ 335 const int32_t _numCols = (A)->numCols;\ 336 const int32_t nb = _numCols - COL; \ 337 \ 338 dataI += i*_numCols + (COL); \ 339 dataJ += j*_numCols + (COL); \ 340 \ 341 float32_t tmp; \ 342 \ 343 for(_w=0;_w < nb; _w++) \ 344 { \ 345 tmp = *dataI; \ 346 *dataI++ = *dataJ; \ 347 *dataJ++ = tmp; \ 348 } \ 349 } 350 351 #define MAC_ROW_F32(COL,A,i,v,B,j) \ 352 { \ 353 float32_t *dataA = (A)->pData; \ 354 float32_t *dataB = (B)->pData; \ 355 const int32_t _numCols = (A)->numCols;\ 356 const int32_t nb = _numCols - (COL); \ 357 int32_t nbElems; \ 358 f32x4_t vec = vdupq_n_f32(v); \ 359 \ 360 nbElems = nb >> 2; \ 361 \ 362 dataA += i*_numCols + (COL); \ 363 dataB += j*_numCols + (COL); \ 364 \ 365 while(nbElems>0) \ 366 { \ 367 f32x4_t tmpa,tmpb; \ 368 tmpa = vld1q_f32(dataA,p0); \ 369 tmpb = vld1q_f32(dataB,p0); \ 370 tmpa = vmlaq_f32(tmpa,tmpb,vec);\ 371 vst1q_f32(dataA, tmpa, p0); \ 372 nbElems--; \ 373 dataA += 4; \ 374 dataB += 4; \ 375 } \ 376 \ 377 nbElems = nb & 3; \ 378 while(nbElems > 0) \ 379 { \ 380 *dataA++ += v* *dataB++; \ 381 nbElems--; \ 382 } \ 383 } 384 385 #define MAS_ROW_F32(COL,A,i,v,B,j) \ 386 { \ 387 float32_t *dataA = (A)->pData; \ 388 float32_t *dataB = (B)->pData; \ 389 const int32_t _numCols = (A)->numCols;\ 390 const int32_t nb = _numCols - (COL); \ 391 int32_t nbElems; \ 392 f32x4_t vec = vdupq_n_f32(v); \ 393 \ 394 nbElems = nb >> 2; \ 395 \ 396 dataA += i*_numCols + (COL); \ 397 dataB += j*_numCols + (COL); \ 398 \ 399 while(nbElems>0) \ 400 { \ 401 f32x4_t tmpa,tmpb; \ 402 tmpa = vld1q_f32(dataA); \ 403 tmpb = vld1q_f32(dataB); \ 404 tmpa = vmlsq_f32(tmpa,tmpb,vec);\ 405 vst1q_f32(dataA, tmpa); \ 406 nbElems--; \ 407 dataA += 4; \ 408 dataB += 4; \ 409 } \ 410 \ 411 nbElems = nb & 3; \ 412 while(nbElems > 0) \ 413 { \ 414 *dataA++ -= v* *dataB++; \ 415 nbElems--; \ 416 } \ 417 } 418 419 #define SCALE_ROW_F32(A,COL,v,i) \ 420 { \ 421 float32_t *data = (A)->pData; \ 422 const int32_t _numCols = (A)->numCols; \ 423 const int32_t nb = _numCols - (COL); \ 424 int32_t nbElems; \ 425 f32x4_t vec = vdupq_n_f32(v); \ 426 \ 427 nbElems = nb >> 2; \ 428 \ 429 data += i*_numCols + (COL); \ 430 while(nbElems>0) \ 431 { \ 432 f32x4_t tmpa; \ 433 tmpa = vld1q_f32(data); \ 434 tmpa = vmulq_f32(tmpa,vec); \ 435 vst1q_f32(data, tmpa); \ 436 data += 4; \ 437 nbElems --; \ 438 } \ 439 \ 440 nbElems = nb & 3; \ 441 while(nbElems > 0) \ 442 { \ 443 *data++ *= v; \ 444 nbElems--; \ 445 } \ 446 \ 447 } 448 449 #else 450 451 #define SWAP_ROWS_F32(A,COL,i,j) \ 452 { \ 453 int32_t _w; \ 454 float32_t tmp; \ 455 float32_t *dataI = (A)->pData; \ 456 float32_t *dataJ = (A)->pData; \ 457 const int32_t _numCols = (A)->numCols;\ 458 const int32_t nb = _numCols - COL; \ 459 \ 460 dataI += i*_numCols + (COL); \ 461 dataJ += j*_numCols + (COL); \ 462 \ 463 \ 464 for(_w=0;_w < nb; _w++) \ 465 { \ 466 tmp = *dataI; \ 467 *dataI++ = *dataJ; \ 468 *dataJ++ = tmp; \ 469 } \ 470 } 471 472 #define SCALE_ROW_F32(A,COL,v,i) \ 473 { \ 474 int32_t _w; \ 475 float32_t *data = (A)->pData; \ 476 const int32_t _numCols = (A)->numCols;\ 477 const int32_t nb = _numCols - COL; \ 478 \ 479 data += i*_numCols + (COL); \ 480 \ 481 for(_w=0;_w < nb; _w++) \ 482 { \ 483 *data++ *= v; \ 484 } \ 485 } 486 487 488 #define MAC_ROW_F32(COL,A,i,v,B,j) \ 489 { \ 490 int32_t _w; \ 491 float32_t *dataA = (A)->pData; \ 492 float32_t *dataB = (B)->pData; \ 493 const int32_t _numCols = (A)->numCols;\ 494 const int32_t nb = _numCols-(COL); \ 495 \ 496 dataA = dataA + i*_numCols + (COL); \ 497 dataB = dataB + j*_numCols + (COL); \ 498 \ 499 for(_w=0;_w < nb; _w++) \ 500 { \ 501 *dataA++ += v* *dataB++; \ 502 } \ 503 } 504 505 #define MAS_ROW_F32(COL,A,i,v,B,j) \ 506 { \ 507 int32_t _w; \ 508 float32_t *dataA = (A)->pData; \ 509 float32_t *dataB = (B)->pData; \ 510 const int32_t _numCols = (A)->numCols;\ 511 const int32_t nb = _numCols-(COL); \ 512 \ 513 dataA = dataA + i*_numCols + (COL); \ 514 dataB = dataB + j*_numCols + (COL); \ 515 \ 516 for(_w=0;_w < nb; _w++) \ 517 { \ 518 *dataA++ -= v* *dataB++; \ 519 } \ 520 } 521 522 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 523 524 525 /* Functions _with only a scalar version */ 526 527 #define COPY_COL_F32(A,ROW,COL,DST) \ 528 COPY_COL_T(float32_t,A,ROW,COL,DST) 529 530 #define COPY_COL_F64(A,ROW,COL,DST) \ 531 COPY_COL_T(float64_t,A,ROW,COL,DST) 532 533 #define SWAP_COLS_F32(A,COL,i,j) \ 534 { \ 535 int32_t _w; \ 536 float32_t *data = (A)->pData; \ 537 const int32_t _numCols = (A)->numCols; \ 538 for(_w=(COL);_w < _numCols; _w++) \ 539 { \ 540 float32_t tmp; \ 541 tmp = data[_w*_numCols + i]; \ 542 data[_w*_numCols + i] = data[_w*_numCols + j];\ 543 data[_w*_numCols + j] = tmp; \ 544 } \ 545 } 546 547 #define SCALE_COL_F32(A,ROW,v,i) \ 548 SCALE_COL_T(float32_t,,A,ROW,v,i) 549 550 #define SWAP_ROWS_F64(A,COL,i,j) \ 551 { \ 552 int32_t _w; \ 553 float64_t *dataI = (A)->pData; \ 554 float64_t *dataJ = (A)->pData; \ 555 const int32_t _numCols = (A)->numCols;\ 556 const int32_t nb = _numCols-(COL); \ 557 \ 558 dataI += i*_numCols + (COL); \ 559 dataJ += j*_numCols + (COL); \ 560 \ 561 for(_w=0;_w < nb; _w++) \ 562 { \ 563 float64_t tmp; \ 564 tmp = *dataI; \ 565 *dataI++ = *dataJ; \ 566 *dataJ++ = tmp; \ 567 } \ 568 } 569 570 #define SWAP_COLS_F64(A,COL,i,j) \ 571 { \ 572 int32_t _w; \ 573 float64_t *data = (A)->pData; \ 574 const int32_t _numCols = (A)->numCols; \ 575 for(_w=(COL);_w < _numCols; _w++) \ 576 { \ 577 float64_t tmp; \ 578 tmp = data[_w*_numCols + i]; \ 579 data[_w*_numCols + i] = data[_w*_numCols + j];\ 580 data[_w*_numCols + j] = tmp; \ 581 } \ 582 } 583 584 #define SCALE_ROW_F64(A,COL,v,i) \ 585 { \ 586 int32_t _w; \ 587 float64_t *data = (A)->pData; \ 588 const int32_t _numCols = (A)->numCols;\ 589 const int32_t nb = _numCols-(COL); \ 590 \ 591 data += i*_numCols + (COL); \ 592 \ 593 for(_w=0;_w < nb; _w++) \ 594 { \ 595 *data++ *= v; \ 596 } \ 597 } 598 599 #define SCALE_COL_F64(A,ROW,v,i) \ 600 SCALE_COL_T(float64_t,,A,ROW,v,i) 601 602 #define MAC_ROW_F64(COL,A,i,v,B,j) \ 603 { \ 604 int32_t _w; \ 605 float64_t *dataA = (A)->pData; \ 606 float64_t *dataB = (B)->pData; \ 607 const int32_t _numCols = (A)->numCols;\ 608 const int32_t nb = _numCols-(COL); \ 609 \ 610 dataA += i*_numCols + (COL); \ 611 dataB += j*_numCols + (COL); \ 612 \ 613 for(_w=0;_w < nb; _w++) \ 614 { \ 615 *dataA++ += v* *dataB++; \ 616 } \ 617 } 618 619 #define MAS_ROW_F64(COL,A,i,v,B,j) \ 620 { \ 621 int32_t _w; \ 622 float64_t *dataA = (A)->pData; \ 623 float64_t *dataB = (B)->pData; \ 624 const int32_t _numCols = (A)->numCols;\ 625 const int32_t nb = _numCols-(COL); \ 626 \ 627 dataA += i*_numCols + (COL); \ 628 dataB += j*_numCols + (COL); \ 629 \ 630 for(_w=0;_w < nb; _w++) \ 631 { \ 632 *dataA++ -= v* *dataB++; \ 633 } \ 634 } 635 636 #ifdef __cplusplus 637 } 638 #endif 639 640 #endif /* ifndef _MATRIX_UTILS_H_ */ 641