1 /******************************************************************************
2  * @file     matrix_utils.h
 * @brief    Matrix helper macros (element access, row/column swap, scale, multiply-accumulate) for the CMSIS DSP Library
4  * @version  V1.11.0
5  * @date     30 May 2022
6  * Target Processor: Cortex-M and Cortex-A cores
7  ******************************************************************************/
8 /*
9  * Copyright (c) 2010-2022 Arm Limited or its affiliates. All rights reserved.
10  *
11  * SPDX-License-Identifier: Apache-2.0
12  *
13  * Licensed under the Apache License, Version 2.0 (the License); you may
14  * not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  * www.apache.org/licenses/LICENSE-2.0
18  *
19  * Unless required by applicable law or agreed to in writing, software
20  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
21  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  * See the License for the specific language governing permissions and
23  * limitations under the License.
24  */
25 
26 
27 #ifndef _MATRIX_UTILS_H_
28 #define _MATRIX_UTILS_H_
29 
30 #include "arm_math_types.h"
31 #include "arm_math_memory.h"
32 
33 #include "dsp/none.h"
34 #include "dsp/utils.h"
35 
36 #ifdef   __cplusplus
37 extern "C"
38 {
39 #endif
40 
/**
 * @brief Address of element (ROW, COL) of a row-major matrix instance.
 *
 * @param A    Pointer to a matrix instance providing pData and numCols.
 * @param ROW  Row index.
 * @param COL  Column index.
 *
 * The whole expansion is parenthesised so that the resulting pointer can be
 * indexed or combined with other operators directly at the call site.
 */
#define ELEM(A,ROW,COL) (&((A)->pData[(A)->numCols * (ROW) + (COL)]))
42 
/**
 * @brief Scale in place the part of column i that starts at row ROW.
 *
 * Every element A[r][i] with ROW <= r < numRows is multiplied by v.
 *
 * @param T     Element type of the matrix data.
 * @param CAST  Cast applied to v before the multiplication (may be empty).
 * @param A     Pointer to the matrix instance (pData, numRows, numCols).
 * @param ROW   First row to scale.
 * @param v     Scale factor.
 * @param i     Column index.
 *
 * All arguments are parenthesised so that expressions such as k+1 can be
 * passed safely.
 */
#define SCALE_COL_T(T,CAST,A,ROW,v,i)                       \
{                                                           \
  const int32_t _numCols = (A)->numCols;                    \
  const int32_t _nbRows  = (A)->numRows - (ROW);            \
  T *_pCol = (A)->pData + ((i) + _numCols * (ROW));         \
  int32_t _w;                                               \
                                                            \
  for(_w = 0; _w < _nbRows; _w++)                           \
  {                                                         \
     *_pCol *= CAST (v);                                    \
     _pCol += _numCols;                                     \
  }                                                         \
}
58 
/**
 * @brief Copy the lower part of a matrix column into a contiguous buffer.
 *
 * Elements A[r][COL] for ROW <= r < numRows are written to DST[0],
 * DST[1], ... in that order.
 *
 * @param T    Element type of the matrix data and of DST.
 * @param A    Pointer to the matrix instance (pData, numRows, numCols).
 * @param ROW  First row to copy.
 * @param COL  Column to copy.
 * @param DST  Destination buffer; must hold at least numRows - ROW elements.
 *
 * All arguments are parenthesised so that expressions can be passed safely.
 */
#define COPY_COL_T(T,A,ROW,COL,DST)                          \
{                                                            \
    const uint32_t _stride = (A)->numCols;                   \
    const uint32_t _nbRows = (A)->numRows;                   \
    T *_src = (A)->pData + ((ROW) * _stride + (COL));        \
    T *_dst = (DST);                                         \
    uint32_t _row;                                           \
                                                             \
    for(_row = (ROW); _row < _nbRows; _row++)                \
    {                                                        \
         *_dst++ = *_src;                                    \
         _src += _stride;                                    \
    }                                                        \
}
70 
71 #if defined(ARM_FLOAT16_SUPPORTED)
72 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
73 
/**
 * @brief Swap rows i and j of a float16 matrix from column COL onwards
 *        (Helium/MVE version).
 *
 * Columns are processed 8 at a time. vctp16q builds a tail predicate from
 * the number of remaining columns, so the predicated loads and stores only
 * touch lanes that belong to the row.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Row indices are parenthesised and the row base pointers are computed
 * once, so expression arguments such as k+1 are handled correctly.
 */
#define SWAP_ROWS_F16(A,COL,i,j)                              \
  {                                                           \
    const int32_t _numCols = (A)->numCols;                    \
    float16_t *_rowI = (A)->pData + (i) * _numCols;           \
    float16_t *_rowJ = (A)->pData + (j) * _numCols;           \
    int32_t _remaining = _numCols - (COL);                    \
    int32_t _w;                                               \
                                                              \
    for(_w = (COL); _w < _numCols; _w += 8)                   \
    {                                                         \
       const mve_pred16_t _p0 = vctp16q(_remaining);          \
       const f16x8_t _va = vldrhq_z_f16(_rowI + _w, _p0);     \
       const f16x8_t _vb = vldrhq_z_f16(_rowJ + _w, _p0);     \
                                                              \
       vstrhq_p(_rowI + _w, _vb, _p0);                        \
       vstrhq_p(_rowJ + _w, _va, _p0);                        \
                                                              \
       _remaining -= 8;                                       \
    }                                                         \
  }
95 
/**
 * @brief Scale row i of a float16 matrix in place from column COL onwards
 *        (Helium/MVE version).
 *
 * A[i][c] *= v for COL <= c < numCols, 8 columns per iteration with
 * tail predication (vctp16q) on the final partial vector.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to scale.
 * @param v    Scale factor; converted to _Float16.
 * @param i    Row index.
 *
 * i and v are parenthesised so that expression arguments are safe.
 */
#define SCALE_ROW_F16(A,COL,v,i)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float16_t *_row = (A)->pData + (i) * _numCols;              \
  int32_t _remaining = _numCols - (COL);                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numCols; _w += 8)                     \
  {                                                           \
       const mve_pred16_t _p0 = vctp16q(_remaining);          \
       f16x8_t _vec = vldrhq_z_f16(_row + _w, _p0);           \
       _vec = vmulq_n_f16(_vec, (_Float16)(v));               \
       vstrhq_p(_row + _w, _vec, _p0);                        \
       _remaining -= 8;                                       \
  }                                                           \
}
114 
/**
 * @brief Multiply-accumulate a row of B into a row of A, float16
 *        (Helium/MVE version).
 *
 * A[i][c] += v * B[j][c] for COL <= c < numCols, 8 columns per iteration
 * with tail predication. Both matrices are indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAC_ROW_F16(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float16_t *_rowA = (A)->pData + (i) * _numCols;             \
  float16_t *_rowB = (B)->pData + (j) * _numCols;             \
  int32_t _remaining = _numCols - (COL);                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numCols; _w += 8)                     \
  {                                                           \
       const mve_pred16_t _p0 = vctp16q(_remaining);          \
       f16x8_t _va = vldrhq_z_f16(_rowA + _w, _p0);           \
       const f16x8_t _vb = vldrhq_z_f16(_rowB + _w, _p0);     \
       _va = vfmaq_n_f16(_va, _vb, (v));                      \
       vstrhq_p(_rowA + _w, _va, _p0);                        \
       _remaining -= 8;                                       \
  }                                                           \
}
135 
/**
 * @brief Multiply-subtract a row of B from a row of A, float16
 *        (Helium/MVE version).
 *
 * A[i][c] -= v * B[j][c] for COL <= c < numCols, 8 columns per iteration
 * with tail predication. Both matrices are indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier (broadcast into a vector once).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i and j are parenthesised so that expression arguments are safe.
 */
#define MAS_ROW_F16(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float16_t *_rowA = (A)->pData + (i) * _numCols;             \
  float16_t *_rowB = (B)->pData + (j) * _numCols;             \
  const f16x8_t _vscale = vdupq_n_f16(v);                     \
  int32_t _remaining = _numCols - (COL);                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numCols; _w += 8)                     \
  {                                                           \
       const mve_pred16_t _p0 = vctp16q(_remaining);          \
       f16x8_t _va = vldrhq_z_f16(_rowA + _w, _p0);           \
       const f16x8_t _vb = vldrhq_z_f16(_rowB + _w, _p0);     \
       _va = vfmsq_f16(_va, _vb, _vscale);                    \
       vstrhq_p(_rowA + _w, _va, _p0);                        \
       _remaining -= 8;                                       \
  }                                                           \
}
157 
158 #else
159 
160 
/**
 * @brief Swap rows i and j of a float16 matrix from column COL onwards
 *        (scalar version).
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Row indices are parenthesised so that expressions such as k+1 select the
 * intended row.
 */
#define SWAP_ROWS_F16(A,COL,i,j)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float16_t *_pI = (A)->pData + ((i) * _numCols + (COL));     \
  float16_t *_pJ = (A)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     const float16_t _held = *_pI;                            \
     *_pI++ = *_pJ;                                           \
     *_pJ++ = _held;                                          \
     _left--;                                                 \
  }                                                           \
}
180 
/**
 * @brief Scale row i of a float16 matrix in place from column COL onwards
 *        (scalar version).
 *
 * A[i][c] *= v for COL <= c < numCols.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to scale.
 * @param v    Scale factor; converted to _Float16.
 * @param i    Row index.
 *
 * i and v are parenthesised so the cast and the row offset apply to the
 * whole argument expression.
 */
#define SCALE_ROW_F16(A,COL,v,i)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float16_t *_p = (A)->pData + ((i) * _numCols + (COL));      \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_p++ *= (_Float16)(v);                                  \
     _left--;                                                 \
  }                                                           \
}
195 
196 
/**
 * @brief Multiply-accumulate a row of B into a row of A, float16
 *        (scalar version).
 *
 * A[i][c] += v * B[j][c] for COL <= c < numCols. Both matrices are
 * indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier; converted to _Float16.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAC_ROW_F16(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float16_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float16_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_pA++ += (_Float16)(v) * (_Float16)*_pB++;              \
     _left--;                                                 \
  }                                                           \
}
213 
/**
 * @brief Multiply-subtract a row of B from a row of A, float16
 *        (scalar version).
 *
 * A[i][c] -= v * B[j][c] for COL <= c < numCols. Both matrices are
 * indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier; converted to _Float16.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAS_ROW_F16(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float16_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float16_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_pA++ -= (_Float16)(v) * (_Float16)*_pB++;              \
     _left--;                                                 \
  }                                                           \
}
230 
231 #endif /*defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/
232 
233 /* Functions with only a scalar version */
/**
 * @brief float16 instantiation of COPY_COL_T: copies column COL of A,
 *        from row ROW to the last row, into DST.
 */
#define COPY_COL_F16(A,ROW,COL,DST) COPY_COL_T(float16_t,A,ROW,COL,DST)
236 
/**
 * @brief float16 instantiation of SCALE_COL_T: multiplies column i of A,
 *        from row ROW to the last row, by v cast to _Float16.
 */
#define SCALE_COL_F16(A,ROW,v,i) SCALE_COL_T(float16_t,(_Float16),A,ROW,v,i)
239 
240 #endif /* defined(ARM_FLOAT16_SUPPORTED)*/
241 
242 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
243 
/**
 * @brief Swap rows i and j of a float32 matrix from column COL onwards
 *        (Helium/MVE version).
 *
 * Columns are processed 4 at a time. vctp32q builds a tail predicate from
 * the number of remaining columns, so the predicated loads and stores only
 * touch lanes that belong to the row.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Row indices are parenthesised and the row base pointers are computed
 * once, so expression arguments such as k+1 are handled correctly.
 */
#define SWAP_ROWS_F32(A,COL,i,j)                              \
  {                                                           \
    const int32_t _numCols = (A)->numCols;                    \
    float32_t *_rowI = (A)->pData + (i) * _numCols;           \
    float32_t *_rowJ = (A)->pData + (j) * _numCols;           \
    int32_t _remaining = _numCols - (COL);                    \
    int32_t _w;                                               \
                                                              \
    for(_w = (COL); _w < _numCols; _w += 4)                   \
    {                                                         \
       const mve_pred16_t _p0 = vctp32q(_remaining);          \
       const f32x4_t _va = vldrwq_z_f32(_rowI + _w, _p0);     \
       const f32x4_t _vb = vldrwq_z_f32(_rowJ + _w, _p0);     \
                                                              \
       vstrwq_p(_rowI + _w, _vb, _p0);                        \
       vstrwq_p(_rowJ + _w, _va, _p0);                        \
                                                              \
       _remaining -= 4;                                       \
    }                                                         \
  }
265 
/**
 * @brief Multiply-accumulate a row of B into a row of A, float32
 *        (Helium/MVE version).
 *
 * A[i][c] += v * B[j][c] for COL <= c < numCols, 4 columns per iteration
 * with tail predication. Both matrices are indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAC_ROW_F32(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_rowA = (A)->pData + (i) * _numCols;             \
  float32_t *_rowB = (B)->pData + (j) * _numCols;             \
  int32_t _remaining = _numCols - (COL);                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numCols; _w += 4)                     \
  {                                                           \
       const mve_pred16_t _p0 = vctp32q(_remaining);          \
       f32x4_t _va = vldrwq_z_f32(_rowA + _w, _p0);           \
       const f32x4_t _vb = vldrwq_z_f32(_rowB + _w, _p0);     \
       _va = vfmaq_n_f32(_va, _vb, (v));                      \
       vstrwq_p(_rowA + _w, _va, _p0);                        \
       _remaining -= 4;                                       \
  }                                                           \
}
286 
/**
 * @brief Multiply-subtract a row of B from a row of A, float32
 *        (Helium/MVE version).
 *
 * A[i][c] -= v * B[j][c] for COL <= c < numCols, 4 columns per iteration
 * with tail predication. Both matrices are indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier (broadcast into a vector once).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i and j are parenthesised so that expression arguments are safe.
 */
#define MAS_ROW_F32(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_rowA = (A)->pData + (i) * _numCols;             \
  float32_t *_rowB = (B)->pData + (j) * _numCols;             \
  const f32x4_t _vscale = vdupq_n_f32(v);                     \
  int32_t _remaining = _numCols - (COL);                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numCols; _w += 4)                     \
  {                                                           \
       const mve_pred16_t _p0 = vctp32q(_remaining);          \
       f32x4_t _va = vldrwq_z_f32(_rowA + _w, _p0);           \
       const f32x4_t _vb = vldrwq_z_f32(_rowB + _w, _p0);     \
       _va = vfmsq_f32(_va, _vb, _vscale);                    \
       vstrwq_p(_rowA + _w, _va, _p0);                        \
       _remaining -= 4;                                       \
  }                                                           \
}
308 
/**
 * @brief Scale row i of a float32 matrix in place from column COL onwards
 *        (Helium/MVE version).
 *
 * A[i][c] *= v for COL <= c < numCols, 4 columns per iteration with
 * tail predication (vctp32q) on the final partial vector.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to scale.
 * @param v    Scale factor.
 * @param i    Row index.
 *
 * i is parenthesised so that expression arguments select the intended row.
 */
#define SCALE_ROW_F32(A,COL,v,i)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_row = (A)->pData + (i) * _numCols;              \
  int32_t _remaining = _numCols - (COL);                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numCols; _w += 4)                     \
  {                                                           \
       const mve_pred16_t _p0 = vctp32q(_remaining);          \
       f32x4_t _vec = vldrwq_z_f32(_row + _w, _p0);           \
       _vec = vmulq_n_f32(_vec, (v));                         \
       vstrwq_p(_row + _w, _vec, _p0);                        \
       _remaining -= 4;                                       \
  }                                                           \
}
327 
328 #elif defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
329 
/**
 * @brief Swap rows i and j of a float32 matrix from column COL onwards
 *        (variant selected for Neon builds; the body itself is scalar).
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * COL, i and j are parenthesised everywhere so that expressions such as
 * k+1 give the intended element count and row offsets.
 */
#define SWAP_ROWS_F32(A,COL,i,j)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_pI = (A)->pData + ((i) * _numCols + (COL));     \
  float32_t *_pJ = (A)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     const float32_t _held = *_pI;                            \
     *_pI++ = *_pJ;                                           \
     *_pJ++ = _held;                                          \
     _left--;                                                 \
  }                                                           \
}
350 
/**
 * @brief Multiply-accumulate a row of B into a row of A, float32
 *        (Neon version).
 *
 * A[i][c] += v * B[j][c] for COL <= c < numCols. Full groups of 4 columns
 * use Neon loads/stores and vmlaq_f32; the remaining 0..3 columns are
 * handled by a scalar loop. Both matrices are indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * The Neon loads and stores take no predicate argument; the stray p0
 * operand (an MVE-only concept, never declared here) has been removed.
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAC_ROW_F32(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  const int32_t _nb = _numCols - (COL);                       \
  float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  const f32x4_t _vscale = vdupq_n_f32(v);                     \
  int32_t _nbElems;                                           \
                                                              \
  for(_nbElems = _nb >> 2; _nbElems > 0; _nbElems--)          \
  {                                                           \
       f32x4_t _va = vld1q_f32(_pA);                          \
       const f32x4_t _vb = vld1q_f32(_pB);                    \
       _va = vmlaq_f32(_va, _vb, _vscale);                    \
       vst1q_f32(_pA, _va);                                   \
       _pA += 4;                                              \
       _pB += 4;                                              \
  }                                                           \
                                                              \
  for(_nbElems = _nb & 3; _nbElems > 0; _nbElems--)           \
  {                                                           \
     *_pA++ += (v) * *_pB++;                                  \
  }                                                           \
}
384 
/**
 * @brief Multiply-subtract a row of B from a row of A, float32
 *        (Neon version).
 *
 * A[i][c] -= v * B[j][c] for COL <= c < numCols. Full groups of 4 columns
 * use Neon loads/stores and vmlsq_f32; the remaining 0..3 columns are
 * handled by a scalar loop. Both matrices are indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAS_ROW_F32(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  const int32_t _nb = _numCols - (COL);                       \
  float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  const f32x4_t _vscale = vdupq_n_f32(v);                     \
  int32_t _nbElems;                                           \
                                                              \
  for(_nbElems = _nb >> 2; _nbElems > 0; _nbElems--)          \
  {                                                           \
       f32x4_t _va = vld1q_f32(_pA);                          \
       const f32x4_t _vb = vld1q_f32(_pB);                    \
       _va = vmlsq_f32(_va, _vb, _vscale);                    \
       vst1q_f32(_pA, _va);                                   \
       _pA += 4;                                              \
       _pB += 4;                                              \
  }                                                           \
                                                              \
  for(_nbElems = _nb & 3; _nbElems > 0; _nbElems--)           \
  {                                                           \
     *_pA++ -= (v) * *_pB++;                                  \
  }                                                           \
}
418 
/**
 * @brief Scale row i of a float32 matrix in place from column COL onwards
 *        (Neon version).
 *
 * A[i][c] *= v for COL <= c < numCols. Full groups of 4 columns use Neon
 * loads/stores and vmulq_f32; the remaining 0..3 columns are handled by a
 * scalar loop.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to scale.
 * @param v    Scale factor.
 * @param i    Row index.
 *
 * i and v are parenthesised so that expression arguments are safe.
 */
#define SCALE_ROW_F32(A,COL,v,i)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  const int32_t _nb = _numCols - (COL);                       \
  float32_t *_p = (A)->pData + ((i) * _numCols + (COL));      \
  const f32x4_t _vscale = vdupq_n_f32(v);                     \
  int32_t _nbElems;                                           \
                                                              \
  for(_nbElems = _nb >> 2; _nbElems > 0; _nbElems--)          \
  {                                                           \
       f32x4_t _va = vld1q_f32(_p);                           \
       _va = vmulq_f32(_va, _vscale);                         \
       vst1q_f32(_p, _va);                                    \
       _p += 4;                                               \
  }                                                           \
                                                              \
  for(_nbElems = _nb & 3; _nbElems > 0; _nbElems--)           \
  {                                                           \
     *_p++ *= (v);                                            \
  }                                                           \
}
448 
449 #else
450 
/**
 * @brief Swap rows i and j of a float32 matrix from column COL onwards
 *        (scalar version).
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * COL, i and j are parenthesised everywhere so that expressions such as
 * k+1 give the intended element count and row offsets.
 */
#define SWAP_ROWS_F32(A,COL,i,j)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_pI = (A)->pData + ((i) * _numCols + (COL));     \
  float32_t *_pJ = (A)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     const float32_t _held = *_pI;                            \
     *_pI++ = *_pJ;                                           \
     *_pJ++ = _held;                                          \
     _left--;                                                 \
  }                                                           \
}
471 
/**
 * @brief Scale row i of a float32 matrix in place from column COL onwards
 *        (scalar version).
 *
 * A[i][c] *= v for COL <= c < numCols.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to scale.
 * @param v    Scale factor.
 * @param i    Row index.
 *
 * COL, i and v are parenthesised so that expression arguments give the
 * intended element count, row offset and factor.
 */
#define SCALE_ROW_F32(A,COL,v,i)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_p = (A)->pData + ((i) * _numCols + (COL));      \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_p++ *= (v);                                            \
     _left--;                                                 \
  }                                                           \
}
486 
487 
/**
 * @brief Multiply-accumulate a row of B into a row of A, float32
 *        (scalar version).
 *
 * A[i][c] += v * B[j][c] for COL <= c < numCols. Both matrices are
 * indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAC_ROW_F32(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_pA++ += (v) * *_pB++;                                  \
     _left--;                                                 \
  }                                                           \
}
504 
/**
 * @brief Multiply-subtract a row of B from a row of A, float32
 *        (scalar version).
 *
 * A[i][c] -= v * B[j][c] for COL <= c < numCols. Both matrices are
 * indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAS_ROW_F32(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_pA++ -= (v) * *_pB++;                                  \
     _left--;                                                 \
  }                                                           \
}
521 
522 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
523 
524 
/* Functions with only a scalar version */
526 
/**
 * @brief float32 instantiation of COPY_COL_T: copies column COL of A,
 *        from row ROW to the last row, into DST.
 */
#define COPY_COL_F32(A,ROW,COL,DST) COPY_COL_T(float32_t,A,ROW,COL,DST)
529 
/**
 * @brief float64 instantiation of COPY_COL_T: copies column COL of A,
 *        from row ROW to the last row, into DST.
 */
#define COPY_COL_F64(A,ROW,COL,DST) COPY_COL_T(float64_t,A,ROW,COL,DST)
532 
/**
 * @brief Swap columns i and j of a float32 matrix, starting at a given row.
 *
 * For every row r with COL <= r < numRows, A[r][i] and A[r][j] are
 * exchanged.
 *
 * @param A    Pointer to the matrix instance (pData, numRows, numCols).
 * @param COL  Index of the first row to process (despite its name, this
 *             value is used as a row index).
 * @param i    Index of the first column.
 * @param j    Index of the second column.
 *
 * The row loop is bounded by numRows. The previous numCols bound was only
 * correct for square matrices: it skipped rows when numRows > numCols and
 * read past the data when numRows < numCols. Behaviour on square matrices
 * is unchanged. i and j are parenthesised so expression arguments are safe.
 */
#define SWAP_COLS_F32(A,COL,i,j)                              \
{                                                             \
  float32_t *_data = (A)->pData;                              \
  const int32_t _numCols = (A)->numCols;                      \
  const int32_t _numRows = (A)->numRows;                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numRows; _w++)                        \
  {                                                           \
     float32_t *_row = _data + _w * _numCols;                 \
     const float32_t _held = _row[(i)];                       \
     _row[(i)] = _row[(j)];                                   \
     _row[(j)] = _held;                                       \
  }                                                           \
}
546 
/**
 * @brief float32 instantiation of SCALE_COL_T: multiplies column i of A,
 *        from row ROW to the last row, by v (no cast is applied to v).
 */
#define SCALE_COL_F32(A,ROW,v,i) SCALE_COL_T(float32_t,,A,ROW,v,i)
549 
/**
 * @brief Swap rows i and j of a float64 matrix from column COL onwards.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Row indices are parenthesised so that expressions such as k+1 select the
 * intended row.
 */
#define SWAP_ROWS_F64(A,COL,i,j)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float64_t *_pI = (A)->pData + ((i) * _numCols + (COL));     \
  float64_t *_pJ = (A)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     const float64_t _held = *_pI;                            \
     *_pI++ = *_pJ;                                           \
     *_pJ++ = _held;                                          \
     _left--;                                                 \
  }                                                           \
}
569 
/**
 * @brief Swap columns i and j of a float64 matrix, starting at a given row.
 *
 * For every row r with COL <= r < numRows, A[r][i] and A[r][j] are
 * exchanged.
 *
 * @param A    Pointer to the matrix instance (pData, numRows, numCols).
 * @param COL  Index of the first row to process (despite its name, this
 *             value is used as a row index).
 * @param i    Index of the first column.
 * @param j    Index of the second column.
 *
 * The row loop is bounded by numRows. The previous numCols bound was only
 * correct for square matrices: it skipped rows when numRows > numCols and
 * read past the data when numRows < numCols. Behaviour on square matrices
 * is unchanged. i and j are parenthesised so expression arguments are safe.
 */
#define SWAP_COLS_F64(A,COL,i,j)                              \
{                                                             \
  float64_t *_data = (A)->pData;                              \
  const int32_t _numCols = (A)->numCols;                      \
  const int32_t _numRows = (A)->numRows;                      \
  int32_t _w;                                                 \
                                                              \
  for(_w = (COL); _w < _numRows; _w++)                        \
  {                                                           \
     float64_t *_row = _data + _w * _numCols;                 \
     const float64_t _held = _row[(i)];                       \
     _row[(i)] = _row[(j)];                                   \
     _row[(j)] = _held;                                       \
  }                                                           \
}
583 
/**
 * @brief Scale row i of a float64 matrix in place from column COL onwards.
 *
 * A[i][c] *= v for COL <= c < numCols.
 *
 * @param A    Pointer to the matrix instance (pData, numCols).
 * @param COL  First column to scale.
 * @param v    Scale factor.
 * @param i    Row index.
 *
 * i and v are parenthesised so that expression arguments are safe.
 */
#define SCALE_ROW_F64(A,COL,v,i)                              \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float64_t *_p = (A)->pData + ((i) * _numCols + (COL));      \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_p++ *= (v);                                            \
     _left--;                                                 \
  }                                                           \
}
598 
/**
 * @brief float64 instantiation of SCALE_COL_T: multiplies column i of A,
 *        from row ROW to the last row, by v (no cast is applied to v).
 */
#define SCALE_COL_F64(A,ROW,v,i) SCALE_COL_T(float64_t,,A,ROW,v,i)
601 
/**
 * @brief Multiply-accumulate a row of B into a row of A, float64.
 *
 * A[i][c] += v * B[j][c] for COL <= c < numCols. Both matrices are
 * indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAC_ROW_F64(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float64_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float64_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_pA++ += (v) * *_pB++;                                  \
     _left--;                                                 \
  }                                                           \
}
618 
/**
 * @brief Multiply-subtract a row of B from a row of A, float64.
 *
 * A[i][c] -= v * B[j][c] for COL <= c < numCols. Both matrices are
 * indexed with A's column count.
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * i, j and v are parenthesised so that expression arguments are safe.
 */
#define MAS_ROW_F64(COL,A,i,v,B,j)                            \
{                                                             \
  const int32_t _numCols = (A)->numCols;                      \
  float64_t *_pA = (A)->pData + ((i) * _numCols + (COL));     \
  float64_t *_pB = (B)->pData + ((j) * _numCols + (COL));     \
  int32_t _left = _numCols - (COL);                           \
                                                              \
  while(_left > 0)                                            \
  {                                                           \
     *_pA++ -= (v) * *_pB++;                                  \
     _left--;                                                 \
  }                                                           \
}
635 
636 #ifdef   __cplusplus
637 }
638 #endif
639 
640 #endif /* ifndef _MATRIX_UTILS_H_ */
641