pmat(float32_t * p,int nbrows,int nbcols)1 void pmat(float32_t *p,int nbrows,int nbcols)
2 {
3 for(int r=0;r<nbrows;r++)
4 {
5 for(int c=0;c<nbcols;c++)
6 {
7 printf("%f ",(double)p[c+r*nbcols]);
8 }
9 printf("\r\n");
10 }
11 printf("\r\n");
12 }
13
pvec(float32_t * p,int nb)14 void pvec(float32_t *p,int nb)
15 {
16 for(int c=0;c<nb;c++)
17 {
18 printf("%f ",(double)p[c]);
19 }
20 printf("\r\n");
21 }
22
pvec(Q7 * p,int nb)23 void pvec(Q7 *p,int nb)
24 {
25 for(int c=0;c<nb;c++)
26 {
27 printf("%f ",(double)(1.0f*p[c].v/128.0f));
28 }
29 printf("\r\n");
30 }
31
32 #if !defined(ARM_MATH_AUTOVECTORIZE)
33 #if defined(ARM_MATH_MVEF)
34
_arm_mat_qr_f32(const arm_matrix_instance_f32 * pSrc,const float32_t threshold,arm_matrix_instance_f32 * pOutR,arm_matrix_instance_f32 * pOutQ,float32_t * pOutTau,float32_t * pTmpA,float32_t * pTmpB)35 arm_status _arm_mat_qr_f32(
36 const arm_matrix_instance_f32 * pSrc,
37 const float32_t threshold,
38 arm_matrix_instance_f32 * pOutR,
39 arm_matrix_instance_f32 * pOutQ,
40 float32_t * pOutTau,
41 float32_t *pTmpA,
42 float32_t *pTmpB
43 )
44
45 {
46 int32_t col=0;
47 int32_t nb,pos;
48 float32_t *pa,*pc;
49 float32_t beta;
50 float32_t *pv;
51 float32_t *pdst;
52 float32_t *p;
53
54 if (pSrc->numRows < pSrc->numCols)
55 {
56 return(ARM_MATH_SIZE_MISMATCH);
57 }
58
59 memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
60 pOutR->numCols = pSrc->numCols;
61 pOutR->numRows = pSrc->numRows;
62
63 p = pOutR->pData;
64
65 pc = pOutTau;
66 for(col=0 ; col < pSrc->numCols; col++)
67 {
68 int32_t j,k,blkCnt,blkCnt2;
69 float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
70 float32_t temp;
71 float32x4_t v1,v2,vtemp;
72
73
74 COPY_COL_F32(pOutR,col,col,pTmpA);
75
76 beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
77 *pc++ = beta;
78
79 //pvec(pTmpA,pSrc->numRows-col);
80 //pmat(p,pSrc->numRows-col,pSrc->numCols-col);
81
82 pdst = pTmpB;
83
84 /* v.T A(col:,col:) -> tmpb */
85 pv = pTmpA;
86 pa = p;
87
88 temp = *pv;
89 blkCnt = (pSrc->numCols-col) >> 2;
90 while (blkCnt > 0)
91 {
92 v1 = vld1q_f32(pa);
93 v2 = vmulq_n_f32(v1,temp);
94 vst1q_f32(pdst,v2);
95
96 pa += 4;
97 pdst += 4;
98 blkCnt--;
99 }
100 blkCnt = (pSrc->numCols-col) & 3;
101 if (blkCnt > 0)
102 {
103 mve_pred16_t p0 = vctp32q(blkCnt);
104 v1 = vld1q_f32(pa);
105 v2 = vmulq_n_f32(v1,temp);
106 vst1q_p_f32(pdst,v2,p0);
107
108 pa += blkCnt;
109 }
110
111
112
113 pa += col;
114 pv++;
115 pdst = pTmpB;
116
117 pa0 = pa;
118 pa1 = pa0 + pSrc->numCols;
119 pa2 = pa1 + pSrc->numCols;
120 pa3 = pa2 + pSrc->numCols;
121
122 /* Unrolled loop */
123 blkCnt = (pSrc->numRows-col - 1) >> 2;
124 k=1;
125 while(blkCnt > 0)
126 {
127 vtemp=vld1q_f32(pv);
128
129 blkCnt2 = (pSrc->numCols-col) >> 2;
130 while (blkCnt2 > 0)
131 {
132 v1 = vld1q_f32(pdst);
133
134 v2 = vld1q_f32(pa0);
135 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
136
137 v2 = vld1q_f32(pa1);
138 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
139
140 v2 = vld1q_f32(pa2);
141 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
142
143 v2 = vld1q_f32(pa3);
144 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
145
146 vst1q_f32(pdst,v1);
147
148 pdst += 4;
149 pa0 += 4;
150 pa1 += 4;
151 pa2 += 4;
152 pa3 += 4;
153 blkCnt2--;
154 }
155 blkCnt2 = (pSrc->numCols-col) & 3;
156 if (blkCnt2 > 0)
157 {
158 mve_pred16_t p0 = vctp32q(blkCnt2);
159
160 v1 = vld1q_f32(pdst);
161
162 v2 = vld1q_f32(pa0);
163 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
164
165 v2 = vld1q_f32(pa1);
166 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
167
168 v2 = vld1q_f32(pa2);
169 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
170
171 v2 = vld1q_f32(pa3);
172 v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
173
174 vst1q_p_f32(pdst,v1,p0);
175
176 pa0 += blkCnt2;
177 pa1 += blkCnt2;
178 pa2 += blkCnt2;
179 pa3 += blkCnt2;
180 }
181
182 pa0 += col + 3*pSrc->numCols;
183 pa1 += col + 3*pSrc->numCols;
184 pa2 += col + 3*pSrc->numCols;
185 pa3 += col + 3*pSrc->numCols;
186 pv += 4;
187 pdst = pTmpB;
188 k += 4;
189 blkCnt--;
190 }
191
192 pa = pa0;
193 for(;k<pSrc->numRows-col; k++)
194 {
195 temp = *pv;
196 blkCnt2 = (pSrc->numCols-col) >> 2;
197 while (blkCnt2 > 0)
198 {
199 v1 = vld1q_f32(pa);
200 v2 = vld1q_f32(pdst);
201 v2 = vfmaq_n_f32(v2,v1,temp);
202 vst1q_f32(pdst,v2);
203
204 pa += 4;
205 pdst += 4;
206 blkCnt2--;
207 }
208 blkCnt2 = (pSrc->numCols-col) & 3;
209 if (blkCnt2 > 0)
210 {
211 mve_pred16_t p0 = vctp32q(blkCnt2);
212 v1 = vld1q_f32(pa);
213 v2 = vld1q_f32(pdst);
214 v2 = vfmaq_n_f32(v2,v1,temp);
215 vst1q_p_f32(pdst,v2,p0);
216
217 pa += blkCnt2;
218 }
219
220 pa += col;
221 pv++;
222 pdst = pTmpB;
223 }
224
225 //pvec(pTmpB,pSrc->numCols-col);
226 //printf("--\r\n");
227
228 /* A(col:,col:) - beta v tmpb */
229 pa = p;
230 for(j=0;j<pSrc->numRows-col; j++)
231 {
232 float32_t f = -beta * pTmpA[j];
233 ptemp = pTmpB;
234
235 blkCnt2 = (pSrc->numCols-col) >> 2;
236 while (blkCnt2 > 0)
237 {
238 v1 = vld1q_f32(pa);
239 v2 = vld1q_f32(ptemp);
240 v1 = vfmaq_n_f32(v1,v2,f);
241 vst1q_f32(pa,v1);
242
243 pa += 4;
244 ptemp += 4;
245
246 blkCnt2--;
247 }
248 blkCnt2 = (pSrc->numCols-col) & 3;
249 if (blkCnt2 > 0)
250 {
251 mve_pred16_t p0 = vctp32q(blkCnt2);
252
253 v1 = vld1q_f32(pa);
254 v2 = vld1q_f32(ptemp);
255 v1 = vfmaq_n_f32(v1,v2,f);
256 vst1q_p_f32(pa,v1,p0);
257
258 pa += blkCnt2;
259 }
260
261 pa += col;
262 }
263
264 /* Copy Householder reflectors into R matrix */
265 pa = p + pOutR->numCols;
266 for(k=0;k<pSrc->numRows-col-1; k++)
267 {
268 *pa = pTmpA[k+1];
269 pa += pOutR->numCols;
270 }
271
272 p += 1 + pOutR->numCols;
273 }
274
275 /* Generate Q if requested by user matrix */
276
277 if (pOutQ != NULL)
278 {
279 /* Initialize Q matrix to identity */
280 memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
281
282 pa = pOutQ->pData;
283 for(col=0 ; col < pOutQ->numCols; col++)
284 {
285 *pa = 1.0f;
286 pa += pOutQ->numCols+1;
287 }
288
289 nb = pOutQ->numRows - pOutQ->numCols + 1;
290
291 pc = pOutTau + pOutQ->numCols - 1;
292 for(col=0 ; col < pOutQ->numCols; col++)
293 {
294 int32_t j,k, blkCnt, blkCnt2;
295 float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
296 float32_t temp;
297 float32x4_t v1,v2,vtemp;
298
299 pos = pSrc->numRows - nb;
300 p = pOutQ->pData + pos + pOutQ->numCols*pos ;
301
302
303 COPY_COL_F32(pOutR,pos,pos,pTmpA);
304 pTmpA[0] = 1.0f;
305 pdst = pTmpB;
306
307 /* v.T A(col:,col:) -> tmpb */
308
309 pv = pTmpA;
310 pa = p;
311
312 temp = *pv;
313 blkCnt2 = (pOutQ->numRows-pos) >> 2;
314 while (blkCnt2 > 0)
315 {
316 v1 = vld1q_f32(pa);
317 v1 = vmulq_n_f32(v1, temp);
318 vst1q_f32(pdst,v1);
319
320 pa += 4;
321 pdst += 4;
322
323 blkCnt2--;
324 }
325 blkCnt2 = (pOutQ->numRows-pos) & 3;
326 if (blkCnt2 > 0)
327 {
328 mve_pred16_t p0 = vctp32q(blkCnt2);
329
330 v1 = vld1q_f32(pa);
331 v1 = vmulq_n_f32(v1, temp);
332 vst1q_p_f32(pdst,v1,p0);
333
334 pa += blkCnt2;
335 }
336
337 pa += pos;
338 pv++;
339 pdst = pTmpB;
340 pa0 = pa;
341 pa1 = pa0 + pOutQ->numRows;
342 pa2 = pa1 + pOutQ->numRows;
343 pa3 = pa2 + pOutQ->numRows;
344
345 /* Unrolled loop */
346 blkCnt = (pOutQ->numRows-pos - 1) >> 2;
347 k=1;
348 while(blkCnt > 0)
349 {
350
351 vtemp = vld1q_f32(pv);
352 blkCnt2 = (pOutQ->numRows-pos) >> 2;
353 while (blkCnt2 > 0)
354 {
355 v1 = vld1q_f32(pdst);
356
357 v2 = vld1q_f32(pa0);
358 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
359
360 v2 = vld1q_f32(pa1);
361 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
362
363 v2 = vld1q_f32(pa2);
364 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
365
366 v2 = vld1q_f32(pa3);
367 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
368
369 vst1q_f32(pdst,v1);
370
371 pa0 += 4;
372 pa1 += 4;
373 pa2 += 4;
374 pa3 += 4;
375 pdst += 4;
376
377 blkCnt2--;
378 }
379 blkCnt2 = (pOutQ->numRows-pos) & 3;
380 if (blkCnt2 > 0)
381 {
382 mve_pred16_t p0 = vctp32q(blkCnt2);
383
384 v1 = vld1q_f32(pdst);
385
386 v2 = vld1q_f32(pa0);
387 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
388
389 v2 = vld1q_f32(pa1);
390 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
391
392 v2 = vld1q_f32(pa2);
393 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
394
395 v2 = vld1q_f32(pa3);
396 v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
397
398 vst1q_p_f32(pdst,v1,p0);
399
400 pa0 += blkCnt2;
401 pa1 += blkCnt2;
402 pa2 += blkCnt2;
403 pa3 += blkCnt2;
404
405 }
406
407 pa0 += pos + 3*pOutQ->numRows;
408 pa1 += pos + 3*pOutQ->numRows;
409 pa2 += pos + 3*pOutQ->numRows;
410 pa3 += pos + 3*pOutQ->numRows;
411 pv += 4;
412 pdst = pTmpB;
413 k += 4;
414 blkCnt--;
415 }
416
417 pa = pa0;
418 for(;k<pOutQ->numRows-pos; k++)
419 {
420 temp = *pv;
421 blkCnt2 = (pOutQ->numRows-pos) >> 2;
422 while (blkCnt2 > 0)
423 {
424 v1 = vld1q_f32(pdst);
425 v2 = vld1q_f32(pa);
426 v1 = vfmaq_n_f32(v1, v2, temp);
427 vst1q_f32(pdst,v1);
428
429 pdst += 4;
430 pa += 4;
431
432 blkCnt2--;
433 }
434 blkCnt2 = (pOutQ->numRows-pos) & 3;
435 if (blkCnt2 > 0)
436 {
437 mve_pred16_t p0 = vctp32q(blkCnt2);
438 v1 = vld1q_f32(pdst);
439 v2 = vld1q_f32(pa);
440 v1 = vfmaq_n_f32(v1, v2, temp);
441 vst1q_p_f32(pdst,v1,p0);
442
443 pa += blkCnt2;
444 }
445
446 pa += pos;
447 pv++;
448 pdst = pTmpB;
449 }
450
451 pa = p;
452 beta = *pc--;
453 for(j=0;j<pOutQ->numRows-pos; j++)
454 {
455 float32_t f = -beta * pTmpA[j];
456 ptemp = pTmpB;
457
458 blkCnt2 = (pOutQ->numCols-pos) >> 2;
459 while (blkCnt2 > 0)
460 {
461 v1 = vld1q_f32(pa);
462 v2 = vld1q_f32(ptemp);
463 v1 = vfmaq_n_f32(v1,v2,f);
464 vst1q_f32(pa,v1);
465
466 pa += 4;
467 ptemp += 4;
468
469 blkCnt2--;
470 }
471 blkCnt2 = (pOutQ->numCols-pos) & 3;
472 if (blkCnt2 > 0)
473 {
474 mve_pred16_t p0 = vctp32q(blkCnt2);
475
476 v1 = vld1q_f32(pa);
477 v2 = vld1q_f32(ptemp);
478 v1 = vfmaq_n_f32(v1,v2,f);
479 vst1q_p_f32(pa,v1,p0);
480
481 pa += blkCnt2;
482 }
483
484 pa += pos;
485 }
486
487
488 nb++;
489 }
490 }
491
492 arm_status status = ARM_MATH_SUCCESS;
493 /* Return to application */
494 return (status);
495 }
496
497 #endif /*#if !defined(ARM_MATH_MVEF)*/
498
499
500 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
501
502
503
504 #if (!defined(ARM_MATH_MVEF)) || defined(ARM_MATH_AUTOVECTORIZE)
505
_arm_mat_qr_f32(const arm_matrix_instance_f32 * pSrc,const float32_t threshold,arm_matrix_instance_f32 * pOutR,arm_matrix_instance_f32 * pOutQ,float32_t * pOutTau,float32_t * pTmpA,float32_t * pTmpB)506 arm_status _arm_mat_qr_f32(
507 const arm_matrix_instance_f32 * pSrc,
508 const float32_t threshold,
509 arm_matrix_instance_f32 * pOutR,
510 arm_matrix_instance_f32 * pOutQ,
511 float32_t * pOutTau,
512 float32_t *pTmpA,
513 float32_t *pTmpB
514 )
515
516 {
517 int32_t col=0;
518 int32_t nb,pos;
519 float32_t *pa,*pc;
520 float32_t beta;
521 float32_t *pv;
522 float32_t *pdst;
523 float32_t *p;
524
525 if (pSrc->numRows < pSrc->numCols)
526 {
527 return(ARM_MATH_SIZE_MISMATCH);
528 }
529
530 memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
531 pOutR->numCols = pSrc->numCols;
532 pOutR->numRows = pSrc->numRows;
533
534 p = pOutR->pData;
535
536 pc = pOutTau;
537 for(col=0 ; col < pSrc->numCols; col++)
538 {
539 int32_t i,j,k,blkCnt;
540 float32_t *pa0,*pa1,*pa2,*pa3;
541 COPY_COL_F32(pOutR,col,col,pTmpA);
542
543 beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
544 *pc++ = beta;
545
546 pdst = pTmpB;
547
548 /* v.T A(col:,col:) -> tmpb */
549 pv = pTmpA;
550 pa = p;
551 for(j=0;j<pSrc->numCols-col; j++)
552 {
553 *pdst++ = *pv * *pa++;
554 }
555 pa += col;
556 pv++;
557 pdst = pTmpB;
558
559 pa0 = pa;
560 pa1 = pa0 + pSrc->numCols;
561 pa2 = pa1 + pSrc->numCols;
562 pa3 = pa2 + pSrc->numCols;
563
564 /* Unrolled loop */
565 blkCnt = (pSrc->numRows-col - 1) >> 2;
566 k=1;
567 while(blkCnt > 0)
568 {
569 float32_t sum;
570
571 for(j=0;j<pSrc->numCols-col; j++)
572 {
573 sum = *pdst;
574
575 sum += pv[0] * *pa0++;
576 sum += pv[1] * *pa1++;
577 sum += pv[2] * *pa2++;
578 sum += pv[3] * *pa3++;
579
580 *pdst++ = sum;
581 }
582 pa0 += col + 3*pSrc->numCols;
583 pa1 += col + 3*pSrc->numCols;
584 pa2 += col + 3*pSrc->numCols;
585 pa3 += col + 3*pSrc->numCols;
586 pv += 4;
587 pdst = pTmpB;
588 k += 4;
589 blkCnt--;
590 }
591
592 pa = pa0;
593 for(;k<pSrc->numRows-col; k++)
594 {
595 for(j=0;j<pSrc->numCols-col; j++)
596 {
597 *pdst++ += *pv * *pa++;
598 }
599 pa += col;
600 pv++;
601 pdst = pTmpB;
602 }
603
604 /* A(col:,col:) - beta v tmpb */
605 pa = p;
606 for(j=0;j<pSrc->numRows-col; j++)
607 {
608 float32_t f = beta * pTmpA[j];
609
610 for(i=0;i<pSrc->numCols-col; i++)
611 {
612 *pa = *pa - f * pTmpB[i] ;
613 pa++;
614 }
615 pa += col;
616 }
617
618 /* Copy Householder reflectors into R matrix */
619 pa = p + pOutR->numCols;
620 for(k=0;k<pSrc->numRows-col-1; k++)
621 {
622 *pa = pTmpA[k+1];
623 pa += pOutR->numCols;
624 }
625
626 p += 1 + pOutR->numCols;
627 }
628
629 /* Generate Q if requested by user matrix */
630
631 if (pOutQ != NULL)
632 {
633 /* Initialize Q matrix to identity */
634 memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
635
636 pa = pOutQ->pData;
637 for(col=0 ; col < pOutQ->numCols; col++)
638 {
639 *pa = 1.0f;
640 pa += pOutQ->numCols+1;
641 }
642
643 nb = pOutQ->numRows - pOutQ->numCols + 1;
644
645 pc = pOutTau + pOutQ->numCols - 1;
646 for(col=0 ; col < pOutQ->numCols; col++)
647 {
648 int32_t i,j,k, blkCnt;
649 float32_t *pa0,*pa1,*pa2,*pa3;
650 pos = pSrc->numRows - nb;
651 p = pOutQ->pData + pos + pOutQ->numCols*pos ;
652
653
654 COPY_COL_F32(pOutR,pos,pos,pTmpA);
655 pTmpA[0] = 1.0f;
656 pdst = pTmpB;
657
658 /* v.T A(col:,col:) -> tmpb */
659
660 pv = pTmpA;
661 pa = p;
662 for(j=0;j<pOutQ->numRows-pos; j++)
663 {
664 *pdst++ = *pv * *pa++;
665 }
666 pa += pos;
667 pv++;
668 pdst = pTmpB;
669 pa0 = pa;
670 pa1 = pa0 + pOutQ->numRows;
671 pa2 = pa1 + pOutQ->numRows;
672 pa3 = pa2 + pOutQ->numRows;
673
674 /* Unrolled loop */
675 blkCnt = (pOutQ->numRows-pos - 1) >> 2;
676 k=1;
677 while(blkCnt > 0)
678 {
679 float32_t sum;
680
681 for(j=0;j<pOutQ->numRows-pos; j++)
682 {
683 sum = *pdst;
684
685 sum += pv[0] * *pa0++;
686 sum += pv[1] * *pa1++;
687 sum += pv[2] * *pa2++;
688 sum += pv[3] * *pa3++;
689
690 *pdst++ = sum;
691 }
692 pa0 += pos + 3*pOutQ->numRows;
693 pa1 += pos + 3*pOutQ->numRows;
694 pa2 += pos + 3*pOutQ->numRows;
695 pa3 += pos + 3*pOutQ->numRows;
696 pv += 4;
697 pdst = pTmpB;
698 k += 4;
699 blkCnt--;
700 }
701
702 pa = pa0;
703 for(;k<pOutQ->numRows-pos; k++)
704 {
705 for(j=0;j<pOutQ->numRows-pos; j++)
706 {
707 *pdst++ += *pv * *pa++;
708 }
709 pa += pos;
710 pv++;
711 pdst = pTmpB;
712 }
713
714 pa = p;
715 beta = *pc--;
716 for(j=0;j<pOutQ->numRows-pos; j++)
717 {
718 float32_t f = beta * pTmpA[j];
719
720 for(i=0;i<pOutQ->numCols-pos; i++)
721 {
722 *pa = *pa - f * pTmpB[i] ;
723 pa++;
724 }
725 pa += pos;
726 }
727
728
729 nb++;
730 }
731 }
732
733 arm_status status = ARM_MATH_SUCCESS;
734 /* Return to application */
735 return (status);
736 }
737
738 #endif /* end of test for Helium or Neon availability */
739