1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_qr_f16.c
4  * Description:  Half floating-point matrix QR decomposition.
5  *
6  * $Date:        15 June 2022
7  * $Revision:    V1.11.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions_f16.h"
30 #include "dsp/matrix_utils.h"
31 
32 
33 #if !defined(ARM_MATH_AUTOVECTORIZE)
34 #if defined(ARM_MATH_MVE_FLOAT16)
35 #include "arm_helium_utils.h"
36 #endif
37 #endif
38 
39 /**
40   @ingroup groupMatrix
41  */
42 
43 
44 /**
45   @addtogroup MatrixQR
46   @{
47  */
48 
49 /**
50   @brief         QR decomposition of a m x n half floating point matrix with m >= n.
51   @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
52   @param[in]     threshold norm2 threshold.
53   @param[out]    pOutR     points to output R matrix structure of dimension m x n
54   @param[out]    pOutQ     points to output Q matrix structure of dimension m x m (can be NULL)
55   @param[out]    pOutTau   points to Householder scaling factors of dimension n
56   @param[inout]  pTmpA     points to a temporary vector of dimension m.
57   @param[inout]  pTmpB     points to a temporary vector of dimension m.
58   @return        execution status
59                    - \ref ARM_MATH_SUCCESS       : Operation successful
60                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
61 
62   @par           pOutQ is optional:
63                  pOutQ can be a NULL pointer.
64                  In this case, the argument will be ignored
65                  and the output Q matrix won't be computed.
66 
67   @par           f16 implementation
68                  The f16 implementation is not very accurate.
69 
70   @par           Norm2 threshold
71                  For the meaning of this argument please
72                  refer to the \ref MatrixHouseholder documentation
73 
74  */
75 
76 #if !defined(ARM_MATH_AUTOVECTORIZE)
77 #if defined(ARM_MATH_MVE_FLOAT16)
78 
arm_mat_qr_f16(const arm_matrix_instance_f16 * pSrc,const float16_t threshold,arm_matrix_instance_f16 * pOutR,arm_matrix_instance_f16 * pOutQ,float16_t * pOutTau,float16_t * pTmpA,float16_t * pTmpB)79 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f16(
80     const arm_matrix_instance_f16 * pSrc,
81     const float16_t threshold,
82     arm_matrix_instance_f16 * pOutR,
83     arm_matrix_instance_f16 * pOutQ,
84     float16_t * pOutTau,
85     float16_t *pTmpA,
86     float16_t *pTmpB
87     )
88 
89 {
90   int32_t col=0;
91   int32_t nb,pos;
92   float16_t *pa,*pc;
93   float16_t beta;
94   float16_t *pv;
95   float16_t *pdst;
96   float16_t *p;
97 
98   if (pSrc->numRows < pSrc->numCols)
99   {
100     return(ARM_MATH_SIZE_MISMATCH);
101   }
102 
103   memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float16_t));
104   pOutR->numCols = pSrc->numCols;
105   pOutR->numRows = pSrc->numRows;
106 
107   p = pOutR->pData;
108 
109   pc = pOutTau;
110   for(col=0 ; col < pSrc->numCols; col++)
111   {
112       int32_t j,k,blkCnt,blkCnt2;
113       float16_t *pa0,*pa1,*pa2,*pa3,*ptemp;
114       float16_t temp;
115       float16x8_t v1,v2,vtemp;
116 
117       COPY_COL_F16(pOutR,col,col,pTmpA);
118 
119       beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA);
120       *pc++ = beta;
121 
122       pdst = pTmpB;
123 
124       /* v.T A(col:,col:) -> tmpb */
125       pv = pTmpA;
126       pa = p;
127 
128       temp = *pv;
129       blkCnt = (pSrc->numCols-col) >> 3;
130       while (blkCnt > 0)
131       {
132           v1 = vld1q_f16(pa);
133           v2 = vmulq_n_f16(v1,temp);
134           vst1q_f16(pdst,v2);
135 
136           pa += 8;
137           pdst += 8;
138           blkCnt--;
139       }
140       blkCnt = (pSrc->numCols-col) & 7;
141       if (blkCnt > 0)
142       {
143           mve_pred16_t p0 = vctp16q(blkCnt);
144           v1 = vld1q_f16(pa);
145           v2 = vmulq_n_f16(v1,temp);
146           vst1q_p_f16(pdst,v2,p0);
147 
148           pa += blkCnt;
149       }
150 
151       pa += col;
152       pv++;
153       pdst = pTmpB;
154 
155       pa0 = pa;
156       pa1 = pa0 + pSrc->numCols;
157       pa2 = pa1 + pSrc->numCols;
158       pa3 = pa2 + pSrc->numCols;
159 
160       /* Unrolled loop */
161       blkCnt = (pSrc->numRows-col - 1) >> 2;
162       k=1;
163       while(blkCnt > 0)
164       {
165           vtemp=vld1q_f16(pv);
166 
167           blkCnt2 = (pSrc->numCols-col) >> 3;
168           while (blkCnt2 > 0)
169           {
170               v1 = vld1q_f16(pdst);
171 
172               v2 = vld1q_f16(pa0);
173               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0));
174 
175               v2 = vld1q_f16(pa1);
176               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1));
177 
178               v2 = vld1q_f16(pa2);
179               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2));
180 
181               v2 = vld1q_f16(pa3);
182               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3));
183 
184               vst1q_f16(pdst,v1);
185 
186               pdst += 8;
187               pa0 += 8;
188               pa1 += 8;
189               pa2 += 8;
190               pa3 += 8;
191               blkCnt2--;
192           }
193           blkCnt2 = (pSrc->numCols-col) & 7;
194           if (blkCnt2 > 0)
195           {
196               mve_pred16_t p0 = vctp16q(blkCnt2);
197 
198               v1 = vld1q_f16(pdst);
199 
200               v2 = vld1q_f16(pa0);
201               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0));
202 
203               v2 = vld1q_f16(pa1);
204               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1));
205 
206               v2 = vld1q_f16(pa2);
207               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2));
208 
209               v2 = vld1q_f16(pa3);
210               v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3));
211 
212               vst1q_p_f16(pdst,v1,p0);
213 
214               pa0 += blkCnt2;
215               pa1 += blkCnt2;
216               pa2 += blkCnt2;
217               pa3 += blkCnt2;
218           }
219 
220           pa0 += col + 3*pSrc->numCols;
221           pa1 += col + 3*pSrc->numCols;
222           pa2 += col + 3*pSrc->numCols;
223           pa3 += col + 3*pSrc->numCols;
224           pv  += 4;
225           pdst = pTmpB;
226           k += 4;
227           blkCnt--;
228       }
229 
230       pa = pa0;
231       for(;k<pSrc->numRows-col; k++)
232       {
233           temp = *pv;
234           blkCnt2 = (pSrc->numCols-col) >> 3;
235           while (blkCnt2 > 0)
236           {
237               v1 = vld1q_f16(pa);
238               v2 = vld1q_f16(pdst);
239               v2 = vfmaq_n_f16(v2,v1,temp);
240               vst1q_f16(pdst,v2);
241 
242               pa += 8;
243               pdst += 8;
244               blkCnt2--;
245           }
246           blkCnt2 = (pSrc->numCols-col) & 7;
247           if (blkCnt2 > 0)
248           {
249               mve_pred16_t p0 = vctp16q(blkCnt2);
250               v1 = vld1q_f16(pa);
251               v2 = vld1q_f16(pdst);
252               v2 = vfmaq_n_f16(v2,v1,temp);
253               vst1q_p_f16(pdst,v2,p0);
254 
255               pa += blkCnt2;
256           }
257 
258           pa += col;
259           pv++;
260           pdst = pTmpB;
261       }
262 
263       /* A(col:,col:) - beta v tmpb */
264       pa = p;
265       for(j=0;j<pSrc->numRows-col; j++)
266       {
267         float16_t f = -(_Float16)beta * (_Float16)pTmpA[j];
268         ptemp = pTmpB;
269 
270         blkCnt2 = (pSrc->numCols-col) >> 3;
271         while (blkCnt2 > 0)
272         {
273             v1 = vld1q_f16(pa);
274             v2 = vld1q_f16(ptemp);
275             v1 = vfmaq_n_f16(v1,v2,f);
276             vst1q_f16(pa,v1);
277 
278             pa += 8;
279             ptemp += 8;
280 
281             blkCnt2--;
282         }
283         blkCnt2 = (pSrc->numCols-col) & 7;
284         if (blkCnt2 > 0)
285         {
286             mve_pred16_t p0 = vctp16q(blkCnt2);
287 
288             v1 = vld1q_f16(pa);
289             v2 = vld1q_f16(ptemp);
290             v1 = vfmaq_n_f16(v1,v2,f);
291             vst1q_p_f16(pa,v1,p0);
292 
293             pa += blkCnt2;
294         }
295 
296         pa += col;
297       }
298 
299       /* Copy Householder reflectors into R matrix */
300       pa = p + pOutR->numCols;
301       for(k=0;k<pSrc->numRows-col-1; k++)
302       {
303          *pa = pTmpA[k+1];
304          pa += pOutR->numCols;
305       }
306 
307       p += 1 + pOutR->numCols;
308   }
309 
310   /* Generate Q if requested by user matrix */
311 
312   if (pOutQ != NULL)
313   {
314      /* Initialize Q matrix to identity */
315      memset(pOutQ->pData,0,sizeof(float16_t)*pOutQ->numRows*pOutQ->numRows);
316 
317      pa = pOutQ->pData;
318      for(col=0 ; col < pOutQ->numCols; col++)
319      {
320         *pa = 1.0f16;
321         pa += pOutQ->numCols+1;
322      }
323 
324      nb = pOutQ->numRows - pOutQ->numCols + 1;
325 
326      pc = pOutTau + pOutQ->numCols - 1;
327      for(col=0 ; col < pOutQ->numCols; col++)
328      {
329        int32_t j,k, blkCnt, blkCnt2;
330        float16_t *pa0,*pa1,*pa2,*pa3,*ptemp;
331        float16_t temp;
332        float16x8_t v1,v2,vtemp;
333 
334        pos = pSrc->numRows - nb;
335        p = pOutQ->pData + pos + pOutQ->numCols*pos ;
336 
337 
338        COPY_COL_F16(pOutR,pos,pos,pTmpA);
339        pTmpA[0] = 1.0f16;
340        pdst = pTmpB;
341 
342        /* v.T A(col:,col:) -> tmpb */
343 
344        pv = pTmpA;
345        pa = p;
346 
347        temp = *pv;
348        blkCnt2 = (pOutQ->numRows-pos) >> 3;
349        while (blkCnt2 > 0)
350        {
351            v1 = vld1q_f16(pa);
352            v1 = vmulq_n_f16(v1, temp);
353            vst1q_f16(pdst,v1);
354 
355            pa += 8;
356            pdst += 8;
357 
358            blkCnt2--;
359        }
360        blkCnt2 = (pOutQ->numRows-pos) & 7;
361        if (blkCnt2 > 0)
362        {
363            mve_pred16_t p0 = vctp16q(blkCnt2);
364 
365            v1 = vld1q_f16(pa);
366            v1 = vmulq_n_f16(v1, temp);
367            vst1q_p_f16(pdst,v1,p0);
368 
369            pa += blkCnt2;
370        }
371 
372        pa += pos;
373        pv++;
374        pdst = pTmpB;
375        pa0 = pa;
376        pa1 = pa0 + pOutQ->numRows;
377        pa2 = pa1 + pOutQ->numRows;
378        pa3 = pa2 + pOutQ->numRows;
379 
380        /* Unrolled loop */
381        blkCnt = (pOutQ->numRows-pos - 1) >> 2;
382        k=1;
383        while(blkCnt > 0)
384        {
385 
386            vtemp = vld1q_f16(pv);
387            blkCnt2 = (pOutQ->numRows-pos) >> 3;
388            while (blkCnt2 > 0)
389            {
390                v1 = vld1q_f16(pdst);
391 
392                v2 = vld1q_f16(pa0);
393                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0));
394 
395                v2 = vld1q_f16(pa1);
396                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1));
397 
398                v2 = vld1q_f16(pa2);
399                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2));
400 
401                v2 = vld1q_f16(pa3);
402                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3));
403 
404                vst1q_f16(pdst,v1);
405 
406                pa0 += 8;
407                pa1 += 8;
408                pa2 += 8;
409                pa3 += 8;
410                pdst += 8;
411 
412                blkCnt2--;
413            }
414            blkCnt2 = (pOutQ->numRows-pos) & 7;
415            if (blkCnt2 > 0)
416            {
417                mve_pred16_t p0 = vctp16q(blkCnt2);
418 
419                v1 = vld1q_f16(pdst);
420 
421                v2 = vld1q_f16(pa0);
422                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0));
423 
424                v2 = vld1q_f16(pa1);
425                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1));
426 
427                v2 = vld1q_f16(pa2);
428                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2));
429 
430                v2 = vld1q_f16(pa3);
431                v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3));
432 
433                vst1q_p_f16(pdst,v1,p0);
434 
435                pa0 += blkCnt2;
436                pa1 += blkCnt2;
437                pa2 += blkCnt2;
438                pa3 += blkCnt2;
439 
440            }
441 
442            pa0 += pos + 3*pOutQ->numRows;
443            pa1 += pos + 3*pOutQ->numRows;
444            pa2 += pos + 3*pOutQ->numRows;
445            pa3 += pos + 3*pOutQ->numRows;
446            pv  += 4;
447            pdst = pTmpB;
448            k += 4;
449            blkCnt--;
450        }
451 
452        pa = pa0;
453        for(;k<pOutQ->numRows-pos; k++)
454        {
455            temp = *pv;
456            blkCnt2 = (pOutQ->numRows-pos) >> 3;
457            while (blkCnt2 > 0)
458            {
459                v1 = vld1q_f16(pdst);
460                v2 = vld1q_f16(pa);
461                v1 = vfmaq_n_f16(v1, v2, temp);
462                vst1q_f16(pdst,v1);
463 
464                pdst += 8;
465                pa += 8;
466 
467                blkCnt2--;
468            }
469            blkCnt2 = (pOutQ->numRows-pos) & 7;
470            if (blkCnt2 > 0)
471            {
472                mve_pred16_t p0 = vctp16q(blkCnt2);
473                v1 = vld1q_f16(pdst);
474                v2 = vld1q_f16(pa);
475                v1 = vfmaq_n_f16(v1, v2, temp);
476                vst1q_p_f16(pdst,v1,p0);
477 
478                pa += blkCnt2;
479            }
480 
481            pa += pos;
482            pv++;
483            pdst = pTmpB;
484        }
485 
486        pa = p;
487        beta = *pc--;
488        for(j=0;j<pOutQ->numRows-pos; j++)
489        {
490            float16_t f = -(_Float16)beta * (_Float16)pTmpA[j];
491            ptemp = pTmpB;
492 
493            blkCnt2 = (pOutQ->numCols-pos) >> 3;
494            while (blkCnt2 > 0)
495            {
496                v1 = vld1q_f16(pa);
497                v2 = vld1q_f16(ptemp);
498                v1 = vfmaq_n_f16(v1,v2,f);
499                vst1q_f16(pa,v1);
500 
501                pa += 8;
502                ptemp += 8;
503 
504                blkCnt2--;
505            }
506            blkCnt2 = (pOutQ->numCols-pos) & 7;
507            if (blkCnt2 > 0)
508            {
509                mve_pred16_t p0 = vctp16q(blkCnt2);
510 
511                v1 = vld1q_f16(pa);
512                v2 = vld1q_f16(ptemp);
513                v1 = vfmaq_n_f16(v1,v2,f);
514                vst1q_p_f16(pa,v1,p0);
515 
516                pa += blkCnt2;
517            }
518 
519            pa += pos;
520        }
521 
522 
523        nb++;
524      }
525   }
526 
527   arm_status status = ARM_MATH_SUCCESS;
528   /* Return to application */
529   return (status);
530 }
531 
532 #endif /*#if !defined(ARM_MATH_MVEF)*/
533 
534 
535 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
536 
537 
538 #if defined(ARM_FLOAT16_SUPPORTED)
539 
540 #if (!defined(ARM_MATH_MVE_FLOAT16)) || defined(ARM_MATH_AUTOVECTORIZE)
541 
542 
arm_mat_qr_f16(const arm_matrix_instance_f16 * pSrc,const float16_t threshold,arm_matrix_instance_f16 * pOutR,arm_matrix_instance_f16 * pOutQ,float16_t * pOutTau,float16_t * pTmpA,float16_t * pTmpB)543 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f16(
544     const arm_matrix_instance_f16 * pSrc,
545     const float16_t threshold,
546     arm_matrix_instance_f16 * pOutR,
547     arm_matrix_instance_f16 * pOutQ,
548     float16_t * pOutTau,
549     float16_t *pTmpA,
550     float16_t *pTmpB
551     )
552 
553 {
554   int32_t col=0;
555   int32_t nb,pos;
556   float16_t *pa,*pc;
557   float16_t beta;
558   float16_t *pv;
559   float16_t *pdst;
560   float16_t *p;
561 
562   if (pSrc->numRows < pSrc->numCols)
563   {
564     return(ARM_MATH_SIZE_MISMATCH);
565   }
566 
567   memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float16_t));
568   pOutR->numCols = pSrc->numCols;
569   pOutR->numRows = pSrc->numRows;
570 
571   p = pOutR->pData;
572 
573   pc = pOutTau;
574   for(col=0 ; col < pSrc->numCols; col++)
575   {
576       int32_t i,j,k,blkCnt;
577       float16_t *pa0,*pa1,*pa2,*pa3;
578       COPY_COL_F16(pOutR,col,col,pTmpA);
579 
580       beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA);
581       *pc++ = beta;
582 
583       pdst = pTmpB;
584 
585       /* v.T A(col:,col:) -> tmpb */
586       pv = pTmpA;
587       pa = p;
588       for(j=0;j<pSrc->numCols-col; j++)
589       {
590               *pdst++ = (_Float16)*pv * (_Float16)*pa++;
591       }
592       pa += col;
593       pv++;
594       pdst = pTmpB;
595 
596       pa0 = pa;
597       pa1 = pa0 + pSrc->numCols;
598       pa2 = pa1 + pSrc->numCols;
599       pa3 = pa2 + pSrc->numCols;
600 
601       /* Unrolled loop */
602       blkCnt = (pSrc->numRows-col - 1) >> 2;
603       k=1;
604       while(blkCnt > 0)
605       {
606           float16_t sum;
607 
608           for(j=0;j<pSrc->numCols-col; j++)
609           {
610               sum = *pdst;
611 
612               sum += (_Float16)pv[0] * (_Float16)*pa0++;
613               sum += (_Float16)pv[1] * (_Float16)*pa1++;
614               sum += (_Float16)pv[2] * (_Float16)*pa2++;
615               sum += (_Float16)pv[3] * (_Float16)*pa3++;
616 
617               *pdst++ = sum;
618           }
619           pa0 += col + 3*pSrc->numCols;
620           pa1 += col + 3*pSrc->numCols;
621           pa2 += col + 3*pSrc->numCols;
622           pa3 += col + 3*pSrc->numCols;
623           pv  += 4;
624           pdst = pTmpB;
625           k += 4;
626           blkCnt--;
627       }
628 
629       pa = pa0;
630       for(;k<pSrc->numRows-col; k++)
631       {
632           for(j=0;j<pSrc->numCols-col; j++)
633           {
634               *pdst++ += (_Float16)*pv * (_Float16)*pa++;
635           }
636           pa += col;
637           pv++;
638           pdst = pTmpB;
639       }
640 
641       /* A(col:,col:) - beta v tmpb */
642       pa = p;
643       for(j=0;j<pSrc->numRows-col; j++)
644       {
645         float16_t f = (_Float16)beta * (_Float16)pTmpA[j];
646 
647         for(i=0;i<pSrc->numCols-col; i++)
648         {
649           *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ;
650           pa++;
651         }
652         pa += col;
653       }
654 
655       /* Copy Householder reflectors into R matrix */
656       pa = p + pOutR->numCols;
657       for(k=0;k<pSrc->numRows-col-1; k++)
658       {
659          *pa = pTmpA[k+1];
660          pa += pOutR->numCols;
661       }
662 
663       p += 1 + pOutR->numCols;
664   }
665 
666   /* Generate Q if requested by user matrix */
667 
668   if (pOutQ != NULL)
669   {
670      /* Initialize Q matrix to identity */
671      memset(pOutQ->pData,0,sizeof(float16_t)*pOutQ->numRows*pOutQ->numRows);
672 
673      pa = pOutQ->pData;
674      for(col=0 ; col < pOutQ->numCols; col++)
675      {
676         *pa = 1.0f16;
677         pa += pOutQ->numCols+1;
678      }
679 
680      nb = pOutQ->numRows - pOutQ->numCols + 1;
681 
682      pc = pOutTau + pOutQ->numCols - 1;
683      for(col=0 ; col < pOutQ->numCols; col++)
684      {
685        int32_t i,j,k, blkCnt;
686        float16_t *pa0,*pa1,*pa2,*pa3;
687        pos = pSrc->numRows - nb;
688        p = pOutQ->pData + pos + pOutQ->numCols*pos ;
689 
690 
691        COPY_COL_F16(pOutR,pos,pos,pTmpA);
692        pTmpA[0] = 1.0f16;
693        pdst = pTmpB;
694 
695        /* v.T A(col:,col:) -> tmpb */
696 
697        pv = pTmpA;
698        pa = p;
699        for(j=0;j<pOutQ->numRows-pos; j++)
700        {
701                *pdst++ = (_Float16)*pv * (_Float16)*pa++;
702        }
703        pa += pos;
704        pv++;
705        pdst = pTmpB;
706        pa0 = pa;
707        pa1 = pa0 + pOutQ->numRows;
708        pa2 = pa1 + pOutQ->numRows;
709        pa3 = pa2 + pOutQ->numRows;
710 
711        /* Unrolled loop */
712        blkCnt = (pOutQ->numRows-pos - 1) >> 2;
713        k=1;
714        while(blkCnt > 0)
715        {
716            float16_t sum;
717 
718            for(j=0;j<pOutQ->numRows-pos; j++)
719            {
720               sum = *pdst;
721 
722               sum += (_Float16)pv[0] * (_Float16)*pa0++;
723               sum += (_Float16)pv[1] * (_Float16)*pa1++;
724               sum += (_Float16)pv[2] * (_Float16)*pa2++;
725               sum += (_Float16)pv[3] * (_Float16)*pa3++;
726 
727               *pdst++ = sum;
728            }
729            pa0 += pos + 3*pOutQ->numRows;
730            pa1 += pos + 3*pOutQ->numRows;
731            pa2 += pos + 3*pOutQ->numRows;
732            pa3 += pos + 3*pOutQ->numRows;
733            pv  += 4;
734            pdst = pTmpB;
735            k += 4;
736            blkCnt--;
737        }
738 
739        pa = pa0;
740        for(;k<pOutQ->numRows-pos; k++)
741        {
742            for(j=0;j<pOutQ->numRows-pos; j++)
743            {
744                *pdst++ += (_Float16)*pv * (_Float16)*pa++;
745            }
746            pa += pos;
747            pv++;
748            pdst = pTmpB;
749        }
750 
751        pa = p;
752        beta = *pc--;
753        for(j=0;j<pOutQ->numRows-pos; j++)
754        {
755            float16_t f = (_Float16)beta * (_Float16)pTmpA[j];
756 
757            for(i=0;i<pOutQ->numCols-pos; i++)
758            {
759              *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ;
760              pa++;
761            }
762            pa += pos;
763        }
764 
765 
766        nb++;
767      }
768   }
769 
770   arm_status status = ARM_MATH_SUCCESS;
771   /* Return to application */
772   return (status);
773 }
774 
775 #endif /* end of test for Helium or Neon availability */
776 
777 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
778 /**
779   @} end of MatrixQR group
780  */
781