1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_qr_f32.c
4 * Description: Floating-point matrix QR decomposition.
5 *
6 * $Date: 15 June 2022
7 * $Revision: V1.11.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/matrix_functions.h"
30 #include "dsp/matrix_utils.h"
31
32
33 #if !defined(ARM_MATH_AUTOVECTORIZE)
34 #if defined(ARM_MATH_MVEF)
35 #include "arm_helium_utils.h"
36 #endif
37 #endif
38
39 /**
40 @ingroup groupMatrix
41 */
42
43 /**
44 @defgroup MatrixQR QR decomposition of a Matrix
45
46 Computes the QR decomposition of a matrix M using Householder algorithm.
47
48 \f[
49 M = Q R
50 \f]
51
52 where Q is an orthogonal matrix and R is upper triangular.
53 No pivoting strategy is used.
54
55 The returned value for R is using a format a bit similar
56 to LAPACK : it is not just containing the matrix R but
57 also the Householder reflectors.
58
59 The function is also returning a vector \f$\tau\f$
60 that is containing the scaling factor for the reflectors.
61
62 Returned value R has the structure:
63
64 \f[
65 \begin{pmatrix}
66 r_{11} & r_{12} & \dots & r_{1n} \\
67 v_{12} & r_{22} & \dots & r_{2n} \\
68 v_{13} & v_{22} & \dots & r_{3n} \\
69 \vdots & \vdots & \ddots & \vdots \\
70 v_{1m} & v_{2(m-1)} & \dots & r_{mn} \\
71 \end{pmatrix}
72 \f]
73
74 where
75
76 \f[
77 v_1 =
78 \begin{pmatrix}
79 1 \\
80 v_{12} \\
81 \vdots \\
82 v_{1m} \\
83 \end{pmatrix}
84 \f]
85
86 is the first householder reflector.
87
88 The Householder Matrix is given by \f$H_1\f$
89
90 \f[
91 H_1 = I - \tau_1 v_1 v_1^T
92 \f]
93
94 The Matrix Q is the product of the Householder matrices:
95
96 \f[
97 Q = H_1 H_2 \dots H_n
98 \f]
99
100 The computation of the matrix Q by this function is
101 optional.
102
103 And the matrix R, would be the returned value R without the
104 householder reflectors:
105
106 \f[
107 \begin{pmatrix}
108 r_{11} & r_{12} & \dots & r_{1n} \\
109 0 & r_{22} & \dots & r_{2n} \\
110 0 & 0 & \dots & r_{3n} \\
111 \vdots & \vdots & \ddots & \vdots \\
112 0 & 0 & \dots & r_{mn} \\
113 \end{pmatrix}
114 \f]
115
116
117 */
118
119 /**
120 @addtogroup MatrixQR
121 @{
122 */
123
124 /**
125 @brief QR decomposition of a m x n floating point matrix with m >= n.
126 @param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
127 @param[in] threshold norm2 threshold.
128 @param[out] pOutR points to output R matrix structure of dimension m x n
129 @param[out] pOutQ points to output Q matrix structure of dimension m x m (can be NULL)
130 @param[out] pOutTau points to Householder scaling factors of dimension n
131 @param[inout] pTmpA points to a temporary vector of dimension m.
132 @param[inout] pTmpB points to a temporary vector of dimension m.
133 @return execution status
134 - \ref ARM_MATH_SUCCESS : Operation successful
135 - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
136
137 @par pOutQ is optional:
138 pOutQ can be a NULL pointer.
139 In this case, the argument will be ignored
140 and the output Q matrix won't be computed.
141
142
143 @par Norm2 threshold
144 For the meaning of this argument please
145 refer to the \ref MatrixHouseholder documentation
146
147 */
148
149 #if !defined(ARM_MATH_AUTOVECTORIZE)
150 #if defined(ARM_MATH_MVEF)
151
arm_mat_qr_f32(const arm_matrix_instance_f32 * pSrc,const float32_t threshold,arm_matrix_instance_f32 * pOutR,arm_matrix_instance_f32 * pOutQ,float32_t * pOutTau,float32_t * pTmpA,float32_t * pTmpB)152 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f32(
153 const arm_matrix_instance_f32 * pSrc,
154 const float32_t threshold,
155 arm_matrix_instance_f32 * pOutR,
156 arm_matrix_instance_f32 * pOutQ,
157 float32_t * pOutTau,
158 float32_t *pTmpA,
159 float32_t *pTmpB
160 )
161
162 {
163 int32_t col=0;
164 int32_t nb,pos;
165 float32_t *pa,*pc;
166 float32_t beta;
167 float32_t *pv;
168 float32_t *pdst;
169 float32_t *p;
170
171 if (pSrc->numRows < pSrc->numCols)
172 {
173 return(ARM_MATH_SIZE_MISMATCH);
174 }
175
176 memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
177 pOutR->numCols = pSrc->numCols;
178 pOutR->numRows = pSrc->numRows;
179
180 p = pOutR->pData;
181
182 pc = pOutTau;
183 for(col=0 ; col < pSrc->numCols; col++)
184 {
185 int32_t j,k,blkCnt,blkCnt2;
186 float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
187 float32_t temp;
188 float32x4_t v1,v2,vtemp;
189
190 COPY_COL_F32(pOutR,col,col,pTmpA);
191
192 beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
193 *pc++ = beta;
194
195 pdst = pTmpB;
196
197 /* v.T A(col:,col:) -> tmpb */
198 pv = pTmpA;
199 pa = p;
200
201 temp = *pv;
202 blkCnt = (pSrc->numCols-col) >> 2;
203 while (blkCnt > 0)
204 {
205 v1 = vld1q_f32(pa);
206 v2 = vmulq_n_f32(v1,temp);
207 vst1q_f32(pdst,v2);
208
209 pa += 4;
210 pdst += 4;
211 blkCnt--;
212 }
213 blkCnt = (pSrc->numCols-col) & 3;
214 if (blkCnt > 0)
215 {
216 mve_pred16_t p0 = vctp32q(blkCnt);
217 v1 = vld1q_f32(pa);
218 v2 = vmulq_n_f32(v1,temp);
219 vst1q_p_f32(pdst,v2,p0);
220
221 pa += blkCnt;
222 }
223
224 pa += col;
225 pv++;
226 pdst = pTmpB;
227
228 pa0 = pa;
229 pa1 = pa0 + pSrc->numCols;
230 pa2 = pa1 + pSrc->numCols;
231 pa3 = pa2 + pSrc->numCols;
232
233 /* Unrolled loop */
234 blkCnt = (pSrc->numRows-col - 1) >> 2;
235 k=1;
236 while(blkCnt > 0)
237 {
238 vtemp=vld1q_f32(pv);
239
240 blkCnt2 = (pSrc->numCols-col) >> 2;
241 while (blkCnt2 > 0)
242 {
243 v1 = vld1q_f32(pdst);
244
245 v2 = vld1q_f32(pa0);
246 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
247
248 v2 = vld1q_f32(pa1);
249 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
250
251 v2 = vld1q_f32(pa2);
252 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
253
254 v2 = vld1q_f32(pa3);
255 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
256
257 vst1q_f32(pdst,v1);
258
259 pdst += 4;
260 pa0 += 4;
261 pa1 += 4;
262 pa2 += 4;
263 pa3 += 4;
264 blkCnt2--;
265 }
266 blkCnt2 = (pSrc->numCols-col) & 3;
267 if (blkCnt2 > 0)
268 {
269 mve_pred16_t p0 = vctp32q(blkCnt2);
270
271 v1 = vld1q_f32(pdst);
272
273 v2 = vld1q_f32(pa0);
274 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
275
276 v2 = vld1q_f32(pa1);
277 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
278
279 v2 = vld1q_f32(pa2);
280 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
281
282 v2 = vld1q_f32(pa3);
283 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
284
285 vst1q_p_f32(pdst,v1,p0);
286
287 pa0 += blkCnt2;
288 pa1 += blkCnt2;
289 pa2 += blkCnt2;
290 pa3 += blkCnt2;
291 }
292
293 pa0 += col + 3*pSrc->numCols;
294 pa1 += col + 3*pSrc->numCols;
295 pa2 += col + 3*pSrc->numCols;
296 pa3 += col + 3*pSrc->numCols;
297 pv += 4;
298 pdst = pTmpB;
299 k += 4;
300 blkCnt--;
301 }
302
303 pa = pa0;
304 for(;k<pSrc->numRows-col; k++)
305 {
306 temp = *pv;
307 blkCnt2 = (pSrc->numCols-col) >> 2;
308 while (blkCnt2 > 0)
309 {
310 v1 = vld1q_f32(pa);
311 v2 = vld1q_f32(pdst);
312 v2 = vfmaq_n_f32(v2,v1,temp);
313 vst1q_f32(pdst,v2);
314
315 pa += 4;
316 pdst += 4;
317 blkCnt2--;
318 }
319 blkCnt2 = (pSrc->numCols-col) & 3;
320 if (blkCnt2 > 0)
321 {
322 mve_pred16_t p0 = vctp32q(blkCnt2);
323 v1 = vld1q_f32(pa);
324 v2 = vld1q_f32(pdst);
325 v2 = vfmaq_n_f32(v2,v1,temp);
326 vst1q_p_f32(pdst,v2,p0);
327
328 pa += blkCnt2;
329 }
330
331 pa += col;
332 pv++;
333 pdst = pTmpB;
334 }
335
336 /* A(col:,col:) - beta v tmpb */
337 pa = p;
338 for(j=0;j<pSrc->numRows-col; j++)
339 {
340 float32_t f = -beta * pTmpA[j];
341 ptemp = pTmpB;
342
343 blkCnt2 = (pSrc->numCols-col) >> 2;
344 while (blkCnt2 > 0)
345 {
346 v1 = vld1q_f32(pa);
347 v2 = vld1q_f32(ptemp);
348 v1 = vfmaq_n_f32(v1,v2,f);
349 vst1q_f32(pa,v1);
350
351 pa += 4;
352 ptemp += 4;
353
354 blkCnt2--;
355 }
356 blkCnt2 = (pSrc->numCols-col) & 3;
357 if (blkCnt2 > 0)
358 {
359 mve_pred16_t p0 = vctp32q(blkCnt2);
360
361 v1 = vld1q_f32(pa);
362 v2 = vld1q_f32(ptemp);
363 v1 = vfmaq_n_f32(v1,v2,f);
364 vst1q_p_f32(pa,v1,p0);
365
366 pa += blkCnt2;
367 }
368
369 pa += col;
370 }
371
372 /* Copy Householder reflectors into R matrix */
373 pa = p + pOutR->numCols;
374 for(k=0;k<pSrc->numRows-col-1; k++)
375 {
376 *pa = pTmpA[k+1];
377 pa += pOutR->numCols;
378 }
379
380 p += 1 + pOutR->numCols;
381 }
382
383 /* Generate Q if requested by user matrix */
384
385 if (pOutQ != NULL)
386 {
387 /* Initialize Q matrix to identity */
388 memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
389
390 pa = pOutQ->pData;
391 for(col=0 ; col < pOutQ->numCols; col++)
392 {
393 *pa = 1.0f;
394 pa += pOutQ->numCols+1;
395 }
396
397 nb = pOutQ->numRows - pOutQ->numCols + 1;
398
399 pc = pOutTau + pOutQ->numCols - 1;
400 for(col=0 ; col < pOutQ->numCols; col++)
401 {
402 int32_t j,k, blkCnt, blkCnt2;
403 float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
404 float32_t temp;
405 float32x4_t v1,v2,vtemp;
406
407 pos = pSrc->numRows - nb;
408 p = pOutQ->pData + pos + pOutQ->numCols*pos ;
409
410
411 COPY_COL_F32(pOutR,pos,pos,pTmpA);
412 pTmpA[0] = 1.0f;
413 pdst = pTmpB;
414
415 /* v.T A(col:,col:) -> tmpb */
416
417 pv = pTmpA;
418 pa = p;
419
420 temp = *pv;
421 blkCnt2 = (pOutQ->numRows-pos) >> 2;
422 while (blkCnt2 > 0)
423 {
424 v1 = vld1q_f32(pa);
425 v1 = vmulq_n_f32(v1, temp);
426 vst1q_f32(pdst,v1);
427
428 pa += 4;
429 pdst += 4;
430
431 blkCnt2--;
432 }
433 blkCnt2 = (pOutQ->numRows-pos) & 3;
434 if (blkCnt2 > 0)
435 {
436 mve_pred16_t p0 = vctp32q(blkCnt2);
437
438 v1 = vld1q_f32(pa);
439 v1 = vmulq_n_f32(v1, temp);
440 vst1q_p_f32(pdst,v1,p0);
441
442 pa += blkCnt2;
443 }
444
445 pa += pos;
446 pv++;
447 pdst = pTmpB;
448 pa0 = pa;
449 pa1 = pa0 + pOutQ->numRows;
450 pa2 = pa1 + pOutQ->numRows;
451 pa3 = pa2 + pOutQ->numRows;
452
453 /* Unrolled loop */
454 blkCnt = (pOutQ->numRows-pos - 1) >> 2;
455 k=1;
456 while(blkCnt > 0)
457 {
458
459 vtemp = vld1q_f32(pv);
460 blkCnt2 = (pOutQ->numRows-pos) >> 2;
461 while (blkCnt2 > 0)
462 {
463 v1 = vld1q_f32(pdst);
464
465 v2 = vld1q_f32(pa0);
466 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
467
468 v2 = vld1q_f32(pa1);
469 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
470
471 v2 = vld1q_f32(pa2);
472 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
473
474 v2 = vld1q_f32(pa3);
475 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
476
477 vst1q_f32(pdst,v1);
478
479 pa0 += 4;
480 pa1 += 4;
481 pa2 += 4;
482 pa3 += 4;
483 pdst += 4;
484
485 blkCnt2--;
486 }
487 blkCnt2 = (pOutQ->numRows-pos) & 3;
488 if (blkCnt2 > 0)
489 {
490 mve_pred16_t p0 = vctp32q(blkCnt2);
491
492 v1 = vld1q_f32(pdst);
493
494 v2 = vld1q_f32(pa0);
495 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
496
497 v2 = vld1q_f32(pa1);
498 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
499
500 v2 = vld1q_f32(pa2);
501 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
502
503 v2 = vld1q_f32(pa3);
504 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
505
506 vst1q_p_f32(pdst,v1,p0);
507
508 pa0 += blkCnt2;
509 pa1 += blkCnt2;
510 pa2 += blkCnt2;
511 pa3 += blkCnt2;
512
513 }
514
515 pa0 += pos + 3*pOutQ->numRows;
516 pa1 += pos + 3*pOutQ->numRows;
517 pa2 += pos + 3*pOutQ->numRows;
518 pa3 += pos + 3*pOutQ->numRows;
519 pv += 4;
520 pdst = pTmpB;
521 k += 4;
522 blkCnt--;
523 }
524
525 pa = pa0;
526 for(;k<pOutQ->numRows-pos; k++)
527 {
528 temp = *pv;
529 blkCnt2 = (pOutQ->numRows-pos) >> 2;
530 while (blkCnt2 > 0)
531 {
532 v1 = vld1q_f32(pdst);
533 v2 = vld1q_f32(pa);
534 v1 = vfmaq_n_f32(v1, v2, temp);
535 vst1q_f32(pdst,v1);
536
537 pdst += 4;
538 pa += 4;
539
540 blkCnt2--;
541 }
542 blkCnt2 = (pOutQ->numRows-pos) & 3;
543 if (blkCnt2 > 0)
544 {
545 mve_pred16_t p0 = vctp32q(blkCnt2);
546 v1 = vld1q_f32(pdst);
547 v2 = vld1q_f32(pa);
548 v1 = vfmaq_n_f32(v1, v2, temp);
549 vst1q_p_f32(pdst,v1,p0);
550
551 pa += blkCnt2;
552 }
553
554 pa += pos;
555 pv++;
556 pdst = pTmpB;
557 }
558
559 pa = p;
560 beta = *pc--;
561 for(j=0;j<pOutQ->numRows-pos; j++)
562 {
563 float32_t f = -beta * pTmpA[j];
564 ptemp = pTmpB;
565
566 blkCnt2 = (pOutQ->numCols-pos) >> 2;
567 while (blkCnt2 > 0)
568 {
569 v1 = vld1q_f32(pa);
570 v2 = vld1q_f32(ptemp);
571 v1 = vfmaq_n_f32(v1,v2,f);
572 vst1q_f32(pa,v1);
573
574 pa += 4;
575 ptemp += 4;
576
577 blkCnt2--;
578 }
579 blkCnt2 = (pOutQ->numCols-pos) & 3;
580 if (blkCnt2 > 0)
581 {
582 mve_pred16_t p0 = vctp32q(blkCnt2);
583
584 v1 = vld1q_f32(pa);
585 v2 = vld1q_f32(ptemp);
586 v1 = vfmaq_n_f32(v1,v2,f);
587 vst1q_p_f32(pa,v1,p0);
588
589 pa += blkCnt2;
590 }
591
592 pa += pos;
593 }
594
595
596 nb++;
597 }
598 }
599
600 arm_status status = ARM_MATH_SUCCESS;
601 /* Return to application */
602 return (status);
603 }
604
605 #endif /*#if !defined(ARM_MATH_MVEF)*/
606
607
608 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
609
610
611
612 #if (!defined(ARM_MATH_MVEF)) || defined(ARM_MATH_AUTOVECTORIZE)
613
arm_mat_qr_f32(const arm_matrix_instance_f32 * pSrc,const float32_t threshold,arm_matrix_instance_f32 * pOutR,arm_matrix_instance_f32 * pOutQ,float32_t * pOutTau,float32_t * pTmpA,float32_t * pTmpB)614 ARM_DSP_ATTRIBUTE arm_status arm_mat_qr_f32(
615 const arm_matrix_instance_f32 * pSrc,
616 const float32_t threshold,
617 arm_matrix_instance_f32 * pOutR,
618 arm_matrix_instance_f32 * pOutQ,
619 float32_t * pOutTau,
620 float32_t *pTmpA,
621 float32_t *pTmpB
622 )
623
624 {
625 int32_t col=0;
626 int32_t nb,pos;
627 float32_t *pa,*pc;
628 float32_t beta;
629 float32_t *pv;
630 float32_t *pdst;
631 float32_t *p;
632
633 if (pSrc->numRows < pSrc->numCols)
634 {
635 return(ARM_MATH_SIZE_MISMATCH);
636 }
637
638 memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
639 pOutR->numCols = pSrc->numCols;
640 pOutR->numRows = pSrc->numRows;
641
642 p = pOutR->pData;
643
644 pc = pOutTau;
645 for(col=0 ; col < pSrc->numCols; col++)
646 {
647 int32_t i,j,k,blkCnt;
648 float32_t *pa0,*pa1,*pa2,*pa3;
649 COPY_COL_F32(pOutR,col,col,pTmpA);
650
651 beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
652 *pc++ = beta;
653
654 pdst = pTmpB;
655
656 /* v.T A(col:,col:) -> tmpb */
657 pv = pTmpA;
658 pa = p;
659 for(j=0;j<pSrc->numCols-col; j++)
660 {
661 *pdst++ = *pv * *pa++;
662 }
663 pa += col;
664 pv++;
665 pdst = pTmpB;
666
667 pa0 = pa;
668 pa1 = pa0 + pSrc->numCols;
669 pa2 = pa1 + pSrc->numCols;
670 pa3 = pa2 + pSrc->numCols;
671
672 /* Unrolled loop */
673 blkCnt = (pSrc->numRows-col - 1) >> 2;
674 k=1;
675 while(blkCnt > 0)
676 {
677 float32_t sum;
678
679 for(j=0;j<pSrc->numCols-col; j++)
680 {
681 sum = *pdst;
682
683 sum += pv[0] * *pa0++;
684 sum += pv[1] * *pa1++;
685 sum += pv[2] * *pa2++;
686 sum += pv[3] * *pa3++;
687
688 *pdst++ = sum;
689 }
690 pa0 += col + 3*pSrc->numCols;
691 pa1 += col + 3*pSrc->numCols;
692 pa2 += col + 3*pSrc->numCols;
693 pa3 += col + 3*pSrc->numCols;
694 pv += 4;
695 pdst = pTmpB;
696 k += 4;
697 blkCnt--;
698 }
699
700 pa = pa0;
701 for(;k<pSrc->numRows-col; k++)
702 {
703 for(j=0;j<pSrc->numCols-col; j++)
704 {
705 *pdst++ += *pv * *pa++;
706 }
707 pa += col;
708 pv++;
709 pdst = pTmpB;
710 }
711
712 /* A(col:,col:) - beta v tmpb */
713 pa = p;
714 for(j=0;j<pSrc->numRows-col; j++)
715 {
716 float32_t f = beta * pTmpA[j];
717
718 for(i=0;i<pSrc->numCols-col; i++)
719 {
720 *pa = *pa - f * pTmpB[i] ;
721 pa++;
722 }
723 pa += col;
724 }
725
726 /* Copy Householder reflectors into R matrix */
727 pa = p + pOutR->numCols;
728 for(k=0;k<pSrc->numRows-col-1; k++)
729 {
730 *pa = pTmpA[k+1];
731 pa += pOutR->numCols;
732 }
733
734 p += 1 + pOutR->numCols;
735 }
736
737 /* Generate Q if requested by user matrix */
738
739 if (pOutQ != NULL)
740 {
741 /* Initialize Q matrix to identity */
742 memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
743
744 pa = pOutQ->pData;
745 for(col=0 ; col < pOutQ->numCols; col++)
746 {
747 *pa = 1.0f;
748 pa += pOutQ->numCols+1;
749 }
750
751 nb = pOutQ->numRows - pOutQ->numCols + 1;
752
753 pc = pOutTau + pOutQ->numCols - 1;
754 for(col=0 ; col < pOutQ->numCols; col++)
755 {
756 int32_t i,j,k, blkCnt;
757 float32_t *pa0,*pa1,*pa2,*pa3;
758 pos = pSrc->numRows - nb;
759 p = pOutQ->pData + pos + pOutQ->numCols*pos ;
760
761
762 COPY_COL_F32(pOutR,pos,pos,pTmpA);
763 pTmpA[0] = 1.0f;
764 pdst = pTmpB;
765
766 /* v.T A(col:,col:) -> tmpb */
767
768 pv = pTmpA;
769 pa = p;
770 for(j=0;j<pOutQ->numRows-pos; j++)
771 {
772 *pdst++ = *pv * *pa++;
773 }
774 pa += pos;
775 pv++;
776 pdst = pTmpB;
777 pa0 = pa;
778 pa1 = pa0 + pOutQ->numRows;
779 pa2 = pa1 + pOutQ->numRows;
780 pa3 = pa2 + pOutQ->numRows;
781
782 /* Unrolled loop */
783 blkCnt = (pOutQ->numRows-pos - 1) >> 2;
784 k=1;
785 while(blkCnt > 0)
786 {
787 float32_t sum;
788
789 for(j=0;j<pOutQ->numRows-pos; j++)
790 {
791 sum = *pdst;
792
793 sum += pv[0] * *pa0++;
794 sum += pv[1] * *pa1++;
795 sum += pv[2] * *pa2++;
796 sum += pv[3] * *pa3++;
797
798 *pdst++ = sum;
799 }
800 pa0 += pos + 3*pOutQ->numRows;
801 pa1 += pos + 3*pOutQ->numRows;
802 pa2 += pos + 3*pOutQ->numRows;
803 pa3 += pos + 3*pOutQ->numRows;
804 pv += 4;
805 pdst = pTmpB;
806 k += 4;
807 blkCnt--;
808 }
809
810 pa = pa0;
811 for(;k<pOutQ->numRows-pos; k++)
812 {
813 for(j=0;j<pOutQ->numRows-pos; j++)
814 {
815 *pdst++ += *pv * *pa++;
816 }
817 pa += pos;
818 pv++;
819 pdst = pTmpB;
820 }
821
822 pa = p;
823 beta = *pc--;
824 for(j=0;j<pOutQ->numRows-pos; j++)
825 {
826 float32_t f = beta * pTmpA[j];
827
828 for(i=0;i<pOutQ->numCols-pos; i++)
829 {
830 *pa = *pa - f * pTmpB[i] ;
831 pa++;
832 }
833 pa += pos;
834 }
835
836
837 nb++;
838 }
839 }
840
841 arm_status status = ARM_MATH_SUCCESS;
842 /* Return to application */
843 return (status);
844 }
845
846 #endif /* end of test for Helium or Neon availability */
847
848 /**
849 @} end of MatrixQR group
850 */
851