1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_qr_f16.c
4 * Description: Half floating-point matrix QR decomposition.
5 *
6 * $Date: 15 June 2022
7 * $Revision: V1.11.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/matrix_functions_f16.h"
30 #include "dsp/matrix_utils.h"
31
32
33 #if !defined(ARM_MATH_AUTOVECTORIZE)
34 #if defined(ARM_MATH_MVE_FLOAT16)
35 #include "arm_helium_utils.h"
36 #endif
37 #endif
38
39 /**
40 @ingroup groupMatrix
41 */
42
43
44 /**
45 @addtogroup MatrixQR
46 @{
47 */
48
49 /**
50 @brief QR decomposition of a m x n half floating point matrix with m >= n.
51 @param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
52 @param[in] threshold norm2 threshold.
53 @param[out] pOutR points to output R matrix structure of dimension m x n
54 @param[out] pOutQ points to output Q matrix structure of dimension m x m (can be NULL)
55 @param[out] pOutTau points to Householder scaling factors of dimension n
56 @param[inout] pTmpA points to a temporary vector of dimension m.
57 @param[inout] pTmpB points to a temporary vector of dimension m.
58 @return execution status
59 - \ref ARM_MATH_SUCCESS : Operation successful
60 - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
61
62 @par pOutQ is optional:
63 pOutQ can be a NULL pointer.
64 In this case, the argument will be ignored
65 and the output Q matrix won't be computed.
66
67 @par f16 implementation
68 The f16 implementation is not very accurate.
69
70 @par Norm2 threshold
71 For the meaning of this argument please
72 refer to the \ref MatrixHouseholder documentation
73
74 */
75
76 #if !defined(ARM_MATH_AUTOVECTORIZE)
77 #if defined(ARM_MATH_MVE_FLOAT16)
78
arm_mat_qr_f16(const arm_matrix_instance_f16 * pSrc,const float16_t threshold,arm_matrix_instance_f16 * pOutR,arm_matrix_instance_f16 * pOutQ,float16_t * pOutTau,float16_t * pTmpA,float16_t * pTmpB)79 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f16(
80 const arm_matrix_instance_f16 * pSrc,
81 const float16_t threshold,
82 arm_matrix_instance_f16 * pOutR,
83 arm_matrix_instance_f16 * pOutQ,
84 float16_t * pOutTau,
85 float16_t *pTmpA,
86 float16_t *pTmpB
87 )
88
89 {
90 int32_t col=0;
91 int32_t nb,pos;
92 float16_t *pa,*pc;
93 float16_t beta;
94 float16_t *pv;
95 float16_t *pdst;
96 float16_t *p;
97
98 if (pSrc->numRows < pSrc->numCols)
99 {
100 return(ARM_MATH_SIZE_MISMATCH);
101 }
102
103 memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float16_t));
104 pOutR->numCols = pSrc->numCols;
105 pOutR->numRows = pSrc->numRows;
106
107 p = pOutR->pData;
108
109 pc = pOutTau;
110 for(col=0 ; col < pSrc->numCols; col++)
111 {
112 int32_t j,k,blkCnt,blkCnt2;
113 float16_t *pa0,*pa1,*pa2,*pa3,*ptemp;
114 float16_t temp;
115 float16x8_t v1,v2,vtemp;
116
117 COPY_COL_F16(pOutR,col,col,pTmpA);
118
119 beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA);
120 *pc++ = beta;
121
122 pdst = pTmpB;
123
124 /* v.T A(col:,col:) -> tmpb */
125 pv = pTmpA;
126 pa = p;
127
128 temp = *pv;
129 blkCnt = (pSrc->numCols-col) >> 3;
130 while (blkCnt > 0)
131 {
132 v1 = vld1q_f16(pa);
133 v2 = vmulq_n_f16(v1,temp);
134 vst1q_f16(pdst,v2);
135
136 pa += 8;
137 pdst += 8;
138 blkCnt--;
139 }
140 blkCnt = (pSrc->numCols-col) & 7;
141 if (blkCnt > 0)
142 {
143 mve_pred16_t p0 = vctp16q(blkCnt);
144 v1 = vld1q_f16(pa);
145 v2 = vmulq_n_f16(v1,temp);
146 vst1q_p_f16(pdst,v2,p0);
147
148 pa += blkCnt;
149 }
150
151 pa += col;
152 pv++;
153 pdst = pTmpB;
154
155 pa0 = pa;
156 pa1 = pa0 + pSrc->numCols;
157 pa2 = pa1 + pSrc->numCols;
158 pa3 = pa2 + pSrc->numCols;
159
160 /* Unrolled loop */
161 blkCnt = (pSrc->numRows-col - 1) >> 2;
162 k=1;
163 while(blkCnt > 0)
164 {
165 vtemp=vld1q_f16(pv);
166
167 blkCnt2 = (pSrc->numCols-col) >> 3;
168 while (blkCnt2 > 0)
169 {
170 v1 = vld1q_f16(pdst);
171
172 v2 = vld1q_f16(pa0);
173 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0));
174
175 v2 = vld1q_f16(pa1);
176 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1));
177
178 v2 = vld1q_f16(pa2);
179 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2));
180
181 v2 = vld1q_f16(pa3);
182 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3));
183
184 vst1q_f16(pdst,v1);
185
186 pdst += 8;
187 pa0 += 8;
188 pa1 += 8;
189 pa2 += 8;
190 pa3 += 8;
191 blkCnt2--;
192 }
193 blkCnt2 = (pSrc->numCols-col) & 7;
194 if (blkCnt2 > 0)
195 {
196 mve_pred16_t p0 = vctp16q(blkCnt2);
197
198 v1 = vld1q_f16(pdst);
199
200 v2 = vld1q_f16(pa0);
201 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0));
202
203 v2 = vld1q_f16(pa1);
204 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1));
205
206 v2 = vld1q_f16(pa2);
207 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2));
208
209 v2 = vld1q_f16(pa3);
210 v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3));
211
212 vst1q_p_f16(pdst,v1,p0);
213
214 pa0 += blkCnt2;
215 pa1 += blkCnt2;
216 pa2 += blkCnt2;
217 pa3 += blkCnt2;
218 }
219
220 pa0 += col + 3*pSrc->numCols;
221 pa1 += col + 3*pSrc->numCols;
222 pa2 += col + 3*pSrc->numCols;
223 pa3 += col + 3*pSrc->numCols;
224 pv += 4;
225 pdst = pTmpB;
226 k += 4;
227 blkCnt--;
228 }
229
230 pa = pa0;
231 for(;k<pSrc->numRows-col; k++)
232 {
233 temp = *pv;
234 blkCnt2 = (pSrc->numCols-col) >> 3;
235 while (blkCnt2 > 0)
236 {
237 v1 = vld1q_f16(pa);
238 v2 = vld1q_f16(pdst);
239 v2 = vfmaq_n_f16(v2,v1,temp);
240 vst1q_f16(pdst,v2);
241
242 pa += 8;
243 pdst += 8;
244 blkCnt2--;
245 }
246 blkCnt2 = (pSrc->numCols-col) & 7;
247 if (blkCnt2 > 0)
248 {
249 mve_pred16_t p0 = vctp16q(blkCnt2);
250 v1 = vld1q_f16(pa);
251 v2 = vld1q_f16(pdst);
252 v2 = vfmaq_n_f16(v2,v1,temp);
253 vst1q_p_f16(pdst,v2,p0);
254
255 pa += blkCnt2;
256 }
257
258 pa += col;
259 pv++;
260 pdst = pTmpB;
261 }
262
263 /* A(col:,col:) - beta v tmpb */
264 pa = p;
265 for(j=0;j<pSrc->numRows-col; j++)
266 {
267 float16_t f = -(_Float16)beta * (_Float16)pTmpA[j];
268 ptemp = pTmpB;
269
270 blkCnt2 = (pSrc->numCols-col) >> 3;
271 while (blkCnt2 > 0)
272 {
273 v1 = vld1q_f16(pa);
274 v2 = vld1q_f16(ptemp);
275 v1 = vfmaq_n_f16(v1,v2,f);
276 vst1q_f16(pa,v1);
277
278 pa += 8;
279 ptemp += 8;
280
281 blkCnt2--;
282 }
283 blkCnt2 = (pSrc->numCols-col) & 7;
284 if (blkCnt2 > 0)
285 {
286 mve_pred16_t p0 = vctp16q(blkCnt2);
287
288 v1 = vld1q_f16(pa);
289 v2 = vld1q_f16(ptemp);
290 v1 = vfmaq_n_f16(v1,v2,f);
291 vst1q_p_f16(pa,v1,p0);
292
293 pa += blkCnt2;
294 }
295
296 pa += col;
297 }
298
299 /* Copy Householder reflectors into R matrix */
300 pa = p + pOutR->numCols;
301 for(k=0;k<pSrc->numRows-col-1; k++)
302 {
303 *pa = pTmpA[k+1];
304 pa += pOutR->numCols;
305 }
306
307 p += 1 + pOutR->numCols;
308 }
309
310 /* Generate Q if requested by user matrix */
311
312 if (pOutQ != NULL)
313 {
314 /* Initialize Q matrix to identity */
315 memset(pOutQ->pData,0,sizeof(float16_t)*pOutQ->numRows*pOutQ->numRows);
316
317 pa = pOutQ->pData;
318 for(col=0 ; col < pOutQ->numCols; col++)
319 {
320 *pa = 1.0f16;
321 pa += pOutQ->numCols+1;
322 }
323
324 nb = pOutQ->numRows - pOutQ->numCols + 1;
325
326 pc = pOutTau + pOutQ->numCols - 1;
327 for(col=0 ; col < pOutQ->numCols; col++)
328 {
329 int32_t j,k, blkCnt, blkCnt2;
330 float16_t *pa0,*pa1,*pa2,*pa3,*ptemp;
331 float16_t temp;
332 float16x8_t v1,v2,vtemp;
333
334 pos = pSrc->numRows - nb;
335 p = pOutQ->pData + pos + pOutQ->numCols*pos ;
336
337
338 COPY_COL_F16(pOutR,pos,pos,pTmpA);
339 pTmpA[0] = 1.0f16;
340 pdst = pTmpB;
341
342 /* v.T A(col:,col:) -> tmpb */
343
344 pv = pTmpA;
345 pa = p;
346
347 temp = *pv;
348 blkCnt2 = (pOutQ->numRows-pos) >> 3;
349 while (blkCnt2 > 0)
350 {
351 v1 = vld1q_f16(pa);
352 v1 = vmulq_n_f16(v1, temp);
353 vst1q_f16(pdst,v1);
354
355 pa += 8;
356 pdst += 8;
357
358 blkCnt2--;
359 }
360 blkCnt2 = (pOutQ->numRows-pos) & 7;
361 if (blkCnt2 > 0)
362 {
363 mve_pred16_t p0 = vctp16q(blkCnt2);
364
365 v1 = vld1q_f16(pa);
366 v1 = vmulq_n_f16(v1, temp);
367 vst1q_p_f16(pdst,v1,p0);
368
369 pa += blkCnt2;
370 }
371
372 pa += pos;
373 pv++;
374 pdst = pTmpB;
375 pa0 = pa;
376 pa1 = pa0 + pOutQ->numRows;
377 pa2 = pa1 + pOutQ->numRows;
378 pa3 = pa2 + pOutQ->numRows;
379
380 /* Unrolled loop */
381 blkCnt = (pOutQ->numRows-pos - 1) >> 2;
382 k=1;
383 while(blkCnt > 0)
384 {
385
386 vtemp = vld1q_f16(pv);
387 blkCnt2 = (pOutQ->numRows-pos) >> 3;
388 while (blkCnt2 > 0)
389 {
390 v1 = vld1q_f16(pdst);
391
392 v2 = vld1q_f16(pa0);
393 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0));
394
395 v2 = vld1q_f16(pa1);
396 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1));
397
398 v2 = vld1q_f16(pa2);
399 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2));
400
401 v2 = vld1q_f16(pa3);
402 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3));
403
404 vst1q_f16(pdst,v1);
405
406 pa0 += 8;
407 pa1 += 8;
408 pa2 += 8;
409 pa3 += 8;
410 pdst += 8;
411
412 blkCnt2--;
413 }
414 blkCnt2 = (pOutQ->numRows-pos) & 7;
415 if (blkCnt2 > 0)
416 {
417 mve_pred16_t p0 = vctp16q(blkCnt2);
418
419 v1 = vld1q_f16(pdst);
420
421 v2 = vld1q_f16(pa0);
422 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0));
423
424 v2 = vld1q_f16(pa1);
425 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1));
426
427 v2 = vld1q_f16(pa2);
428 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2));
429
430 v2 = vld1q_f16(pa3);
431 v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3));
432
433 vst1q_p_f16(pdst,v1,p0);
434
435 pa0 += blkCnt2;
436 pa1 += blkCnt2;
437 pa2 += blkCnt2;
438 pa3 += blkCnt2;
439
440 }
441
442 pa0 += pos + 3*pOutQ->numRows;
443 pa1 += pos + 3*pOutQ->numRows;
444 pa2 += pos + 3*pOutQ->numRows;
445 pa3 += pos + 3*pOutQ->numRows;
446 pv += 4;
447 pdst = pTmpB;
448 k += 4;
449 blkCnt--;
450 }
451
452 pa = pa0;
453 for(;k<pOutQ->numRows-pos; k++)
454 {
455 temp = *pv;
456 blkCnt2 = (pOutQ->numRows-pos) >> 3;
457 while (blkCnt2 > 0)
458 {
459 v1 = vld1q_f16(pdst);
460 v2 = vld1q_f16(pa);
461 v1 = vfmaq_n_f16(v1, v2, temp);
462 vst1q_f16(pdst,v1);
463
464 pdst += 8;
465 pa += 8;
466
467 blkCnt2--;
468 }
469 blkCnt2 = (pOutQ->numRows-pos) & 7;
470 if (blkCnt2 > 0)
471 {
472 mve_pred16_t p0 = vctp16q(blkCnt2);
473 v1 = vld1q_f16(pdst);
474 v2 = vld1q_f16(pa);
475 v1 = vfmaq_n_f16(v1, v2, temp);
476 vst1q_p_f16(pdst,v1,p0);
477
478 pa += blkCnt2;
479 }
480
481 pa += pos;
482 pv++;
483 pdst = pTmpB;
484 }
485
486 pa = p;
487 beta = *pc--;
488 for(j=0;j<pOutQ->numRows-pos; j++)
489 {
490 float16_t f = -(_Float16)beta * (_Float16)pTmpA[j];
491 ptemp = pTmpB;
492
493 blkCnt2 = (pOutQ->numCols-pos) >> 3;
494 while (blkCnt2 > 0)
495 {
496 v1 = vld1q_f16(pa);
497 v2 = vld1q_f16(ptemp);
498 v1 = vfmaq_n_f16(v1,v2,f);
499 vst1q_f16(pa,v1);
500
501 pa += 8;
502 ptemp += 8;
503
504 blkCnt2--;
505 }
506 blkCnt2 = (pOutQ->numCols-pos) & 7;
507 if (blkCnt2 > 0)
508 {
509 mve_pred16_t p0 = vctp16q(blkCnt2);
510
511 v1 = vld1q_f16(pa);
512 v2 = vld1q_f16(ptemp);
513 v1 = vfmaq_n_f16(v1,v2,f);
514 vst1q_p_f16(pa,v1,p0);
515
516 pa += blkCnt2;
517 }
518
519 pa += pos;
520 }
521
522
523 nb++;
524 }
525 }
526
527 arm_status status = ARM_MATH_SUCCESS;
528 /* Return to application */
529 return (status);
530 }
531
532 #endif /*#if !defined(ARM_MATH_MVEF)*/
533
534
535 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
536
537
538 #if defined(ARM_FLOAT16_SUPPORTED)
539
540 #if (!defined(ARM_MATH_MVE_FLOAT16)) || defined(ARM_MATH_AUTOVECTORIZE)
541
542
arm_mat_qr_f16(const arm_matrix_instance_f16 * pSrc,const float16_t threshold,arm_matrix_instance_f16 * pOutR,arm_matrix_instance_f16 * pOutQ,float16_t * pOutTau,float16_t * pTmpA,float16_t * pTmpB)543 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f16(
544 const arm_matrix_instance_f16 * pSrc,
545 const float16_t threshold,
546 arm_matrix_instance_f16 * pOutR,
547 arm_matrix_instance_f16 * pOutQ,
548 float16_t * pOutTau,
549 float16_t *pTmpA,
550 float16_t *pTmpB
551 )
552
553 {
554 int32_t col=0;
555 int32_t nb,pos;
556 float16_t *pa,*pc;
557 float16_t beta;
558 float16_t *pv;
559 float16_t *pdst;
560 float16_t *p;
561
562 if (pSrc->numRows < pSrc->numCols)
563 {
564 return(ARM_MATH_SIZE_MISMATCH);
565 }
566
567 memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float16_t));
568 pOutR->numCols = pSrc->numCols;
569 pOutR->numRows = pSrc->numRows;
570
571 p = pOutR->pData;
572
573 pc = pOutTau;
574 for(col=0 ; col < pSrc->numCols; col++)
575 {
576 int32_t i,j,k,blkCnt;
577 float16_t *pa0,*pa1,*pa2,*pa3;
578 COPY_COL_F16(pOutR,col,col,pTmpA);
579
580 beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA);
581 *pc++ = beta;
582
583 pdst = pTmpB;
584
585 /* v.T A(col:,col:) -> tmpb */
586 pv = pTmpA;
587 pa = p;
588 for(j=0;j<pSrc->numCols-col; j++)
589 {
590 *pdst++ = (_Float16)*pv * (_Float16)*pa++;
591 }
592 pa += col;
593 pv++;
594 pdst = pTmpB;
595
596 pa0 = pa;
597 pa1 = pa0 + pSrc->numCols;
598 pa2 = pa1 + pSrc->numCols;
599 pa3 = pa2 + pSrc->numCols;
600
601 /* Unrolled loop */
602 blkCnt = (pSrc->numRows-col - 1) >> 2;
603 k=1;
604 while(blkCnt > 0)
605 {
606 float16_t sum;
607
608 for(j=0;j<pSrc->numCols-col; j++)
609 {
610 sum = *pdst;
611
612 sum += (_Float16)pv[0] * (_Float16)*pa0++;
613 sum += (_Float16)pv[1] * (_Float16)*pa1++;
614 sum += (_Float16)pv[2] * (_Float16)*pa2++;
615 sum += (_Float16)pv[3] * (_Float16)*pa3++;
616
617 *pdst++ = sum;
618 }
619 pa0 += col + 3*pSrc->numCols;
620 pa1 += col + 3*pSrc->numCols;
621 pa2 += col + 3*pSrc->numCols;
622 pa3 += col + 3*pSrc->numCols;
623 pv += 4;
624 pdst = pTmpB;
625 k += 4;
626 blkCnt--;
627 }
628
629 pa = pa0;
630 for(;k<pSrc->numRows-col; k++)
631 {
632 for(j=0;j<pSrc->numCols-col; j++)
633 {
634 *pdst++ += (_Float16)*pv * (_Float16)*pa++;
635 }
636 pa += col;
637 pv++;
638 pdst = pTmpB;
639 }
640
641 /* A(col:,col:) - beta v tmpb */
642 pa = p;
643 for(j=0;j<pSrc->numRows-col; j++)
644 {
645 float16_t f = (_Float16)beta * (_Float16)pTmpA[j];
646
647 for(i=0;i<pSrc->numCols-col; i++)
648 {
649 *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ;
650 pa++;
651 }
652 pa += col;
653 }
654
655 /* Copy Householder reflectors into R matrix */
656 pa = p + pOutR->numCols;
657 for(k=0;k<pSrc->numRows-col-1; k++)
658 {
659 *pa = pTmpA[k+1];
660 pa += pOutR->numCols;
661 }
662
663 p += 1 + pOutR->numCols;
664 }
665
666 /* Generate Q if requested by user matrix */
667
668 if (pOutQ != NULL)
669 {
670 /* Initialize Q matrix to identity */
671 memset(pOutQ->pData,0,sizeof(float16_t)*pOutQ->numRows*pOutQ->numRows);
672
673 pa = pOutQ->pData;
674 for(col=0 ; col < pOutQ->numCols; col++)
675 {
676 *pa = 1.0f16;
677 pa += pOutQ->numCols+1;
678 }
679
680 nb = pOutQ->numRows - pOutQ->numCols + 1;
681
682 pc = pOutTau + pOutQ->numCols - 1;
683 for(col=0 ; col < pOutQ->numCols; col++)
684 {
685 int32_t i,j,k, blkCnt;
686 float16_t *pa0,*pa1,*pa2,*pa3;
687 pos = pSrc->numRows - nb;
688 p = pOutQ->pData + pos + pOutQ->numCols*pos ;
689
690
691 COPY_COL_F16(pOutR,pos,pos,pTmpA);
692 pTmpA[0] = 1.0f16;
693 pdst = pTmpB;
694
695 /* v.T A(col:,col:) -> tmpb */
696
697 pv = pTmpA;
698 pa = p;
699 for(j=0;j<pOutQ->numRows-pos; j++)
700 {
701 *pdst++ = (_Float16)*pv * (_Float16)*pa++;
702 }
703 pa += pos;
704 pv++;
705 pdst = pTmpB;
706 pa0 = pa;
707 pa1 = pa0 + pOutQ->numRows;
708 pa2 = pa1 + pOutQ->numRows;
709 pa3 = pa2 + pOutQ->numRows;
710
711 /* Unrolled loop */
712 blkCnt = (pOutQ->numRows-pos - 1) >> 2;
713 k=1;
714 while(blkCnt > 0)
715 {
716 float16_t sum;
717
718 for(j=0;j<pOutQ->numRows-pos; j++)
719 {
720 sum = *pdst;
721
722 sum += (_Float16)pv[0] * (_Float16)*pa0++;
723 sum += (_Float16)pv[1] * (_Float16)*pa1++;
724 sum += (_Float16)pv[2] * (_Float16)*pa2++;
725 sum += (_Float16)pv[3] * (_Float16)*pa3++;
726
727 *pdst++ = sum;
728 }
729 pa0 += pos + 3*pOutQ->numRows;
730 pa1 += pos + 3*pOutQ->numRows;
731 pa2 += pos + 3*pOutQ->numRows;
732 pa3 += pos + 3*pOutQ->numRows;
733 pv += 4;
734 pdst = pTmpB;
735 k += 4;
736 blkCnt--;
737 }
738
739 pa = pa0;
740 for(;k<pOutQ->numRows-pos; k++)
741 {
742 for(j=0;j<pOutQ->numRows-pos; j++)
743 {
744 *pdst++ += (_Float16)*pv * (_Float16)*pa++;
745 }
746 pa += pos;
747 pv++;
748 pdst = pTmpB;
749 }
750
751 pa = p;
752 beta = *pc--;
753 for(j=0;j<pOutQ->numRows-pos; j++)
754 {
755 float16_t f = (_Float16)beta * (_Float16)pTmpA[j];
756
757 for(i=0;i<pOutQ->numCols-pos; i++)
758 {
759 *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ;
760 pa++;
761 }
762 pa += pos;
763 }
764
765
766 nb++;
767 }
768 }
769
770 arm_status status = ARM_MATH_SUCCESS;
771 /* Return to application */
772 return (status);
773 }
774
775 #endif /* end of test for Helium or Neon availability */
776
777 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
778 /**
779 @} end of MatrixQR group
780 */
781