1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_qr_f32.c
4  * Description:  Floating-point matrix QR decomposition.
5  *
6  * $Date:        15 June 2022
7  * $Revision:    V1.11.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 #include "dsp/matrix_utils.h"
31 
32 
33 #if !defined(ARM_MATH_AUTOVECTORIZE)
34 #if defined(ARM_MATH_MVEF)
35 #include "arm_helium_utils.h"
36 #endif
37 #endif
38 
39 /**
40   @ingroup groupMatrix
41  */
42 
43 /**
44   @defgroup MatrixQR QR decomposition of a Matrix
45 
46     Computes the QR decomposition of a matrix M using Householder algorithm.
47 
48     \f[
49         M = Q R
50     \f]
51 
52     where Q is an orthogonal matrix and R is upper triangular.
53     No pivoting strategy is used.
54 
55     The returned value for R is using a format a bit similar
56     to LAPACK : it is not just containing the matrix R but
57     also the Householder reflectors.
58 
59     The function is also returning a vector \f$\tau\f$
60     that is containing the scaling factor for the reflectors.
61 
62     Returned value R has the structure:
63 
64     \f[
65     \begin{pmatrix}
66     r_{11} & r_{12}     & \dots  & r_{1n} \\
67     v_{12} & r_{22}     & \dots  & r_{2n} \\
68     v_{13} & v_{22}     & \dots  & r_{3n} \\
69     \vdots & \vdots     & \ddots & \vdots   \\
70     v_{1m} & v_{2(m-1)} & \dots  & r_{mn} \\
71     \end{pmatrix}
72     \f]
73 
74     where
75 
76     \f[
77     v_1 =
78     \begin{pmatrix}
79     1       \\
80     v_{12}  \\
81     \vdots  \\
82     v_{1m}  \\
83     \end{pmatrix}
84     \f]
85 
86     is the first householder reflector.
87 
88     The Householder Matrix is given by \f$H_1\f$
89 
90     \f[
91     H_1 = I - \tau_1 v_1 v_1^T
92     \f]
93 
94     The Matrix Q is the product of the Householder matrices:
95 
96     \f[
97     Q = H_1 H_2 \dots H_n
98     \f]
99 
100     The computation of the matrix Q by this function is
101     optional.
102 
103     And the matrix R, would be the returned value R without the
104     householder reflectors:
105 
106     \f[
107     \begin{pmatrix}
108     r_{11} & r_{12} & \dots  & r_{1n} \\
109     0      & r_{22} & \dots  & r_{2n} \\
110     0      & 0      & \dots  & r_{3n} \\
111     \vdots & \vdots & \ddots & \vdots   \\
112     0      & 0      & \dots  & r_{mn} \\
113     \end{pmatrix}
114     \f]
115 
116 
117  */
118 
119 /**
120   @addtogroup MatrixQR
121   @{
122  */
123 
124 /**
125   @brief         QR decomposition of a m x n floating point matrix with m >= n.
126   @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
127   @param[in]     threshold norm2 threshold.
128   @param[out]    pOutR     points to output R matrix structure of dimension m x n
129   @param[out]    pOutQ     points to output Q matrix structure of dimension m x m (can be NULL)
130   @param[out]    pOutTau   points to Householder scaling factors of dimension n
131   @param[inout]  pTmpA     points to a temporary vector of dimension m.
132   @param[inout]  pTmpB     points to a temporary vector of dimension m.
133   @return        execution status
134                    - \ref ARM_MATH_SUCCESS       : Operation successful
135                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
136 
137   @par           pOutQ is optional:
138                  pOutQ can be a NULL pointer.
139                  In this case, the argument will be ignored
140                  and the output Q matrix won't be computed.
141 
142 
143   @par           Norm2 threshold
144                  For the meaning of this argument please
145                  refer to the \ref MatrixHouseholder documentation
146 
147  */
148 
149 #if !defined(ARM_MATH_AUTOVECTORIZE)
150 #if defined(ARM_MATH_MVEF)
151 
arm_mat_qr_f32(const arm_matrix_instance_f32 * pSrc,const float32_t threshold,arm_matrix_instance_f32 * pOutR,arm_matrix_instance_f32 * pOutQ,float32_t * pOutTau,float32_t * pTmpA,float32_t * pTmpB)152 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f32(
153     const arm_matrix_instance_f32 * pSrc,
154     const float32_t threshold,
155     arm_matrix_instance_f32 * pOutR,
156     arm_matrix_instance_f32 * pOutQ,
157     float32_t * pOutTau,
158     float32_t *pTmpA,
159     float32_t *pTmpB
160     )
161 
162 {
163   int32_t col=0;
164   int32_t nb,pos;
165   float32_t *pa,*pc;
166   float32_t beta;
167   float32_t *pv;
168   float32_t *pdst;
169   float32_t *p;
170 
171   if (pSrc->numRows < pSrc->numCols)
172   {
173     return(ARM_MATH_SIZE_MISMATCH);
174   }
175 
176   memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
177   pOutR->numCols = pSrc->numCols;
178   pOutR->numRows = pSrc->numRows;
179 
180   p = pOutR->pData;
181 
182   pc = pOutTau;
183   for(col=0 ; col < pSrc->numCols; col++)
184   {
185       int32_t j,k,blkCnt,blkCnt2;
186       float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
187       float32_t temp;
188       float32x4_t v1,v2,vtemp;
189 
190       COPY_COL_F32(pOutR,col,col,pTmpA);
191 
192       beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
193       *pc++ = beta;
194 
195       pdst = pTmpB;
196 
197       /* v.T A(col:,col:) -> tmpb */
198       pv = pTmpA;
199       pa = p;
200 
201       temp = *pv;
202       blkCnt = (pSrc->numCols-col) >> 2;
203       while (blkCnt > 0)
204       {
205           v1 = vld1q_f32(pa);
206           v2 = vmulq_n_f32(v1,temp);
207           vst1q_f32(pdst,v2);
208 
209           pa += 4;
210           pdst += 4;
211           blkCnt--;
212       }
213       blkCnt = (pSrc->numCols-col) & 3;
214       if (blkCnt > 0)
215       {
216           mve_pred16_t p0 = vctp32q(blkCnt);
217           v1 = vld1q_f32(pa);
218           v2 = vmulq_n_f32(v1,temp);
219           vst1q_p_f32(pdst,v2,p0);
220 
221           pa += blkCnt;
222       }
223 
224       pa += col;
225       pv++;
226       pdst = pTmpB;
227 
228       pa0 = pa;
229       pa1 = pa0 + pSrc->numCols;
230       pa2 = pa1 + pSrc->numCols;
231       pa3 = pa2 + pSrc->numCols;
232 
233       /* Unrolled loop */
234       blkCnt = (pSrc->numRows-col - 1) >> 2;
235       k=1;
236       while(blkCnt > 0)
237       {
238           vtemp=vld1q_f32(pv);
239 
240           blkCnt2 = (pSrc->numCols-col) >> 2;
241           while (blkCnt2 > 0)
242           {
243               v1 = vld1q_f32(pdst);
244 
245               v2 = vld1q_f32(pa0);
246               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
247 
248               v2 = vld1q_f32(pa1);
249               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
250 
251               v2 = vld1q_f32(pa2);
252               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
253 
254               v2 = vld1q_f32(pa3);
255               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
256 
257               vst1q_f32(pdst,v1);
258 
259               pdst += 4;
260               pa0 += 4;
261               pa1 += 4;
262               pa2 += 4;
263               pa3 += 4;
264               blkCnt2--;
265           }
266           blkCnt2 = (pSrc->numCols-col) & 3;
267           if (blkCnt2 > 0)
268           {
269               mve_pred16_t p0 = vctp32q(blkCnt2);
270 
271               v1 = vld1q_f32(pdst);
272 
273               v2 = vld1q_f32(pa0);
274               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
275 
276               v2 = vld1q_f32(pa1);
277               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
278 
279               v2 = vld1q_f32(pa2);
280               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
281 
282               v2 = vld1q_f32(pa3);
283               v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
284 
285               vst1q_p_f32(pdst,v1,p0);
286 
287               pa0 += blkCnt2;
288               pa1 += blkCnt2;
289               pa2 += blkCnt2;
290               pa3 += blkCnt2;
291           }
292 
293           pa0 += col + 3*pSrc->numCols;
294           pa1 += col + 3*pSrc->numCols;
295           pa2 += col + 3*pSrc->numCols;
296           pa3 += col + 3*pSrc->numCols;
297           pv  += 4;
298           pdst = pTmpB;
299           k += 4;
300           blkCnt--;
301       }
302 
303       pa = pa0;
304       for(;k<pSrc->numRows-col; k++)
305       {
306           temp = *pv;
307           blkCnt2 = (pSrc->numCols-col) >> 2;
308           while (blkCnt2 > 0)
309           {
310               v1 = vld1q_f32(pa);
311               v2 = vld1q_f32(pdst);
312               v2 = vfmaq_n_f32(v2,v1,temp);
313               vst1q_f32(pdst,v2);
314 
315               pa += 4;
316               pdst += 4;
317               blkCnt2--;
318           }
319           blkCnt2 = (pSrc->numCols-col) & 3;
320           if (blkCnt2 > 0)
321           {
322               mve_pred16_t p0 = vctp32q(blkCnt2);
323               v1 = vld1q_f32(pa);
324               v2 = vld1q_f32(pdst);
325               v2 = vfmaq_n_f32(v2,v1,temp);
326               vst1q_p_f32(pdst,v2,p0);
327 
328               pa += blkCnt2;
329           }
330 
331           pa += col;
332           pv++;
333           pdst = pTmpB;
334       }
335 
336       /* A(col:,col:) - beta v tmpb */
337       pa = p;
338       for(j=0;j<pSrc->numRows-col; j++)
339       {
340         float32_t f = -beta * pTmpA[j];
341         ptemp = pTmpB;
342 
343         blkCnt2 = (pSrc->numCols-col) >> 2;
344         while (blkCnt2 > 0)
345         {
346             v1 = vld1q_f32(pa);
347             v2 = vld1q_f32(ptemp);
348             v1 = vfmaq_n_f32(v1,v2,f);
349             vst1q_f32(pa,v1);
350 
351             pa += 4;
352             ptemp += 4;
353 
354             blkCnt2--;
355         }
356         blkCnt2 = (pSrc->numCols-col) & 3;
357         if (blkCnt2 > 0)
358         {
359             mve_pred16_t p0 = vctp32q(blkCnt2);
360 
361             v1 = vld1q_f32(pa);
362             v2 = vld1q_f32(ptemp);
363             v1 = vfmaq_n_f32(v1,v2,f);
364             vst1q_p_f32(pa,v1,p0);
365 
366             pa += blkCnt2;
367         }
368 
369         pa += col;
370       }
371 
372       /* Copy Householder reflectors into R matrix */
373       pa = p + pOutR->numCols;
374       for(k=0;k<pSrc->numRows-col-1; k++)
375       {
376          *pa = pTmpA[k+1];
377          pa += pOutR->numCols;
378       }
379 
380       p += 1 + pOutR->numCols;
381   }
382 
383   /* Generate Q if requested by user matrix */
384 
385   if (pOutQ != NULL)
386   {
387      /* Initialize Q matrix to identity */
388      memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
389 
390      pa = pOutQ->pData;
391      for(col=0 ; col < pOutQ->numCols; col++)
392      {
393         *pa = 1.0f;
394         pa += pOutQ->numCols+1;
395      }
396 
397      nb = pOutQ->numRows - pOutQ->numCols + 1;
398 
399      pc = pOutTau + pOutQ->numCols - 1;
400      for(col=0 ; col < pOutQ->numCols; col++)
401      {
402        int32_t j,k, blkCnt, blkCnt2;
403        float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
404        float32_t temp;
405        float32x4_t v1,v2,vtemp;
406 
407        pos = pSrc->numRows - nb;
408        p = pOutQ->pData + pos + pOutQ->numCols*pos ;
409 
410 
411        COPY_COL_F32(pOutR,pos,pos,pTmpA);
412        pTmpA[0] = 1.0f;
413        pdst = pTmpB;
414 
415        /* v.T A(col:,col:) -> tmpb */
416 
417        pv = pTmpA;
418        pa = p;
419 
420        temp = *pv;
421        blkCnt2 = (pOutQ->numRows-pos) >> 2;
422        while (blkCnt2 > 0)
423        {
424            v1 = vld1q_f32(pa);
425            v1 = vmulq_n_f32(v1, temp);
426            vst1q_f32(pdst,v1);
427 
428            pa += 4;
429            pdst += 4;
430 
431            blkCnt2--;
432        }
433        blkCnt2 = (pOutQ->numRows-pos) & 3;
434        if (blkCnt2 > 0)
435        {
436            mve_pred16_t p0 = vctp32q(blkCnt2);
437 
438            v1 = vld1q_f32(pa);
439            v1 = vmulq_n_f32(v1, temp);
440            vst1q_p_f32(pdst,v1,p0);
441 
442            pa += blkCnt2;
443        }
444 
445        pa += pos;
446        pv++;
447        pdst = pTmpB;
448        pa0 = pa;
449        pa1 = pa0 + pOutQ->numRows;
450        pa2 = pa1 + pOutQ->numRows;
451        pa3 = pa2 + pOutQ->numRows;
452 
453        /* Unrolled loop */
454        blkCnt = (pOutQ->numRows-pos - 1) >> 2;
455        k=1;
456        while(blkCnt > 0)
457        {
458 
459            vtemp = vld1q_f32(pv);
460            blkCnt2 = (pOutQ->numRows-pos) >> 2;
461            while (blkCnt2 > 0)
462            {
463                v1 = vld1q_f32(pdst);
464 
465                v2 = vld1q_f32(pa0);
466                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
467 
468                v2 = vld1q_f32(pa1);
469                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
470 
471                v2 = vld1q_f32(pa2);
472                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
473 
474                v2 = vld1q_f32(pa3);
475                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
476 
477                vst1q_f32(pdst,v1);
478 
479                pa0 += 4;
480                pa1 += 4;
481                pa2 += 4;
482                pa3 += 4;
483                pdst += 4;
484 
485                blkCnt2--;
486            }
487            blkCnt2 = (pOutQ->numRows-pos) & 3;
488            if (blkCnt2 > 0)
489            {
490                mve_pred16_t p0 = vctp32q(blkCnt2);
491 
492                v1 = vld1q_f32(pdst);
493 
494                v2 = vld1q_f32(pa0);
495                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
496 
497                v2 = vld1q_f32(pa1);
498                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
499 
500                v2 = vld1q_f32(pa2);
501                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
502 
503                v2 = vld1q_f32(pa3);
504                v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
505 
506                vst1q_p_f32(pdst,v1,p0);
507 
508                pa0 += blkCnt2;
509                pa1 += blkCnt2;
510                pa2 += blkCnt2;
511                pa3 += blkCnt2;
512 
513            }
514 
515            pa0 += pos + 3*pOutQ->numRows;
516            pa1 += pos + 3*pOutQ->numRows;
517            pa2 += pos + 3*pOutQ->numRows;
518            pa3 += pos + 3*pOutQ->numRows;
519            pv  += 4;
520            pdst = pTmpB;
521            k += 4;
522            blkCnt--;
523        }
524 
525        pa = pa0;
526        for(;k<pOutQ->numRows-pos; k++)
527        {
528            temp = *pv;
529            blkCnt2 = (pOutQ->numRows-pos) >> 2;
530            while (blkCnt2 > 0)
531            {
532                v1 = vld1q_f32(pdst);
533                v2 = vld1q_f32(pa);
534                v1 = vfmaq_n_f32(v1, v2, temp);
535                vst1q_f32(pdst,v1);
536 
537                pdst += 4;
538                pa += 4;
539 
540                blkCnt2--;
541            }
542            blkCnt2 = (pOutQ->numRows-pos) & 3;
543            if (blkCnt2 > 0)
544            {
545                mve_pred16_t p0 = vctp32q(blkCnt2);
546                v1 = vld1q_f32(pdst);
547                v2 = vld1q_f32(pa);
548                v1 = vfmaq_n_f32(v1, v2, temp);
549                vst1q_p_f32(pdst,v1,p0);
550 
551                pa += blkCnt2;
552            }
553 
554            pa += pos;
555            pv++;
556            pdst = pTmpB;
557        }
558 
559        pa = p;
560        beta = *pc--;
561        for(j=0;j<pOutQ->numRows-pos; j++)
562        {
563            float32_t f = -beta * pTmpA[j];
564            ptemp = pTmpB;
565 
566            blkCnt2 = (pOutQ->numCols-pos) >> 2;
567            while (blkCnt2 > 0)
568            {
569                v1 = vld1q_f32(pa);
570                v2 = vld1q_f32(ptemp);
571                v1 = vfmaq_n_f32(v1,v2,f);
572                vst1q_f32(pa,v1);
573 
574                pa += 4;
575                ptemp += 4;
576 
577                blkCnt2--;
578            }
579            blkCnt2 = (pOutQ->numCols-pos) & 3;
580            if (blkCnt2 > 0)
581            {
582                mve_pred16_t p0 = vctp32q(blkCnt2);
583 
584                v1 = vld1q_f32(pa);
585                v2 = vld1q_f32(ptemp);
586                v1 = vfmaq_n_f32(v1,v2,f);
587                vst1q_p_f32(pa,v1,p0);
588 
589                pa += blkCnt2;
590            }
591 
592            pa += pos;
593        }
594 
595 
596        nb++;
597      }
598   }
599 
600   arm_status status = ARM_MATH_SUCCESS;
601   /* Return to application */
602   return (status);
603 }
604 
605 #endif /*#if !defined(ARM_MATH_MVEF)*/
606 
607 
608 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
609 
610 
611 
612 #if (!defined(ARM_MATH_MVEF)) || defined(ARM_MATH_AUTOVECTORIZE)
613 
arm_mat_qr_f32(const arm_matrix_instance_f32 * pSrc,const float32_t threshold,arm_matrix_instance_f32 * pOutR,arm_matrix_instance_f32 * pOutQ,float32_t * pOutTau,float32_t * pTmpA,float32_t * pTmpB)614 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f32(
615     const arm_matrix_instance_f32 * pSrc,
616     const float32_t threshold,
617     arm_matrix_instance_f32 * pOutR,
618     arm_matrix_instance_f32 * pOutQ,
619     float32_t * pOutTau,
620     float32_t *pTmpA,
621     float32_t *pTmpB
622     )
623 
624 {
625   int32_t col=0;
626   int32_t nb,pos;
627   float32_t *pa,*pc;
628   float32_t beta;
629   float32_t *pv;
630   float32_t *pdst;
631   float32_t *p;
632 
633   if (pSrc->numRows < pSrc->numCols)
634   {
635     return(ARM_MATH_SIZE_MISMATCH);
636   }
637 
638   memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
639   pOutR->numCols = pSrc->numCols;
640   pOutR->numRows = pSrc->numRows;
641 
642   p = pOutR->pData;
643 
644   pc = pOutTau;
645   for(col=0 ; col < pSrc->numCols; col++)
646   {
647       int32_t i,j,k,blkCnt;
648       float32_t *pa0,*pa1,*pa2,*pa3;
649       COPY_COL_F32(pOutR,col,col,pTmpA);
650 
651       beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
652       *pc++ = beta;
653 
654       pdst = pTmpB;
655 
656       /* v.T A(col:,col:) -> tmpb */
657       pv = pTmpA;
658       pa = p;
659       for(j=0;j<pSrc->numCols-col; j++)
660       {
661               *pdst++ = *pv * *pa++;
662       }
663       pa += col;
664       pv++;
665       pdst = pTmpB;
666 
667       pa0 = pa;
668       pa1 = pa0 + pSrc->numCols;
669       pa2 = pa1 + pSrc->numCols;
670       pa3 = pa2 + pSrc->numCols;
671 
672       /* Unrolled loop */
673       blkCnt = (pSrc->numRows-col - 1) >> 2;
674       k=1;
675       while(blkCnt > 0)
676       {
677           float32_t sum;
678 
679           for(j=0;j<pSrc->numCols-col; j++)
680           {
681               sum = *pdst;
682 
683               sum += pv[0] * *pa0++;
684               sum += pv[1] * *pa1++;
685               sum += pv[2] * *pa2++;
686               sum += pv[3] * *pa3++;
687 
688               *pdst++ = sum;
689           }
690           pa0 += col + 3*pSrc->numCols;
691           pa1 += col + 3*pSrc->numCols;
692           pa2 += col + 3*pSrc->numCols;
693           pa3 += col + 3*pSrc->numCols;
694           pv  += 4;
695           pdst = pTmpB;
696           k += 4;
697           blkCnt--;
698       }
699 
700       pa = pa0;
701       for(;k<pSrc->numRows-col; k++)
702       {
703           for(j=0;j<pSrc->numCols-col; j++)
704           {
705               *pdst++ += *pv * *pa++;
706           }
707           pa += col;
708           pv++;
709           pdst = pTmpB;
710       }
711 
712       /* A(col:,col:) - beta v tmpb */
713       pa = p;
714       for(j=0;j<pSrc->numRows-col; j++)
715       {
716         float32_t f = beta * pTmpA[j];
717 
718         for(i=0;i<pSrc->numCols-col; i++)
719         {
720           *pa = *pa - f * pTmpB[i] ;
721           pa++;
722         }
723         pa += col;
724       }
725 
726       /* Copy Householder reflectors into R matrix */
727       pa = p + pOutR->numCols;
728       for(k=0;k<pSrc->numRows-col-1; k++)
729       {
730          *pa = pTmpA[k+1];
731          pa += pOutR->numCols;
732       }
733 
734       p += 1 + pOutR->numCols;
735   }
736 
737   /* Generate Q if requested by user matrix */
738 
739   if (pOutQ != NULL)
740   {
741      /* Initialize Q matrix to identity */
742      memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
743 
744      pa = pOutQ->pData;
745      for(col=0 ; col < pOutQ->numCols; col++)
746      {
747         *pa = 1.0f;
748         pa += pOutQ->numCols+1;
749      }
750 
751      nb = pOutQ->numRows - pOutQ->numCols + 1;
752 
753      pc = pOutTau + pOutQ->numCols - 1;
754      for(col=0 ; col < pOutQ->numCols; col++)
755      {
756        int32_t i,j,k, blkCnt;
757        float32_t *pa0,*pa1,*pa2,*pa3;
758        pos = pSrc->numRows - nb;
759        p = pOutQ->pData + pos + pOutQ->numCols*pos ;
760 
761 
762        COPY_COL_F32(pOutR,pos,pos,pTmpA);
763        pTmpA[0] = 1.0f;
764        pdst = pTmpB;
765 
766        /* v.T A(col:,col:) -> tmpb */
767 
768        pv = pTmpA;
769        pa = p;
770        for(j=0;j<pOutQ->numRows-pos; j++)
771        {
772                *pdst++ = *pv * *pa++;
773        }
774        pa += pos;
775        pv++;
776        pdst = pTmpB;
777        pa0 = pa;
778        pa1 = pa0 + pOutQ->numRows;
779        pa2 = pa1 + pOutQ->numRows;
780        pa3 = pa2 + pOutQ->numRows;
781 
782        /* Unrolled loop */
783        blkCnt = (pOutQ->numRows-pos - 1) >> 2;
784        k=1;
785        while(blkCnt > 0)
786        {
787            float32_t sum;
788 
789            for(j=0;j<pOutQ->numRows-pos; j++)
790            {
791               sum = *pdst;
792 
793               sum += pv[0] * *pa0++;
794               sum += pv[1] * *pa1++;
795               sum += pv[2] * *pa2++;
796               sum += pv[3] * *pa3++;
797 
798               *pdst++ = sum;
799            }
800            pa0 += pos + 3*pOutQ->numRows;
801            pa1 += pos + 3*pOutQ->numRows;
802            pa2 += pos + 3*pOutQ->numRows;
803            pa3 += pos + 3*pOutQ->numRows;
804            pv  += 4;
805            pdst = pTmpB;
806            k += 4;
807            blkCnt--;
808        }
809 
810        pa = pa0;
811        for(;k<pOutQ->numRows-pos; k++)
812        {
813            for(j=0;j<pOutQ->numRows-pos; j++)
814            {
815                *pdst++ += *pv * *pa++;
816            }
817            pa += pos;
818            pv++;
819            pdst = pTmpB;
820        }
821 
822        pa = p;
823        beta = *pc--;
824        for(j=0;j<pOutQ->numRows-pos; j++)
825        {
826            float32_t f = beta * pTmpA[j];
827 
828            for(i=0;i<pOutQ->numCols-pos; i++)
829            {
830              *pa = *pa - f * pTmpB[i] ;
831              pa++;
832            }
833            pa += pos;
834        }
835 
836 
837        nb++;
838      }
839   }
840 
841   arm_status status = ARM_MATH_SUCCESS;
842   /* Return to application */
843   return (status);
844 }
845 
846 #endif /* end of test for Helium or Neon availability */
847 
848 /**
849   @} end of MatrixQR group
850  */
851