/** @file wls_QR_algorithm.c
 *
 * @brief This file contains the QR algorithm processing code.
 *
 * Copyright 2023-2024 NXP
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 */

/************************************************************************
 * DFW code that defines the QR algorithm processing.
 ************************************************************************/

#include <osa.h>
#if CONFIG_WLS_CSI_PROC

// Standard includes.
#include <stdio.h>
// #include <string.h>
// #include <stdlib.h>
#ifdef ARM_DS5
#include <arm_neon.h>
#else
#include <math.h>
// 5 sqrtf, 10 fabsf -> use hardware
#endif

// Specific includes.
#include "wls_QR_algorithm.h"

#define TOL_HH_SIG 1e-8f
#define TOL_QR_STEP 1e-5f
#define TOL_QR_STEP_LOW 1e-3f
#define WILK_IT_MAX 15

// #define DEBUG_OUTPUT

// Householder vector: computes v (v[0] = 1) and beta for the reflector
// P = I - beta*v*v' that maps x onto mu*e1; returns 1 when no reflection
// is needed (norm of x[1..len-1] below TOL_HH_SIG).
int house_holder_vec(float *x, float *v, float *betaPtr, float *muPtr, int len)
{
    int ii;
    float sigma = 0;
    float mu, xOne, vOne, vOneSqr;
#ifdef ARM_DS5
    float32x2_t sigmaVec, xVecf32;
    sigmaVec = vdup_n_f32(0.0f);
#endif
    xOne = x[0];
    v[0] = 1.0f;
    for (ii = 1; ii < len - 1; ii += 2)
    {
#ifdef ARM_DS5
        xVecf32 = vld1_f32((float32_t const *)(x + ii));
        sigmaVec = vmla_f32(sigmaVec, xVecf32, xVecf32);
        vst1_f32((float32_t *)(v + ii), xVecf32);
#else
        sigma += x[ii] * x[ii];
        sigma += x[ii + 1] * x[ii + 1];
        v[ii] = x[ii];
        v[ii + 1] = x[ii + 1];
#endif
    }
#ifdef ARM_DS5
    sigmaVec = vpadd_f32(sigmaVec, sigmaVec);
    sigma = vget_lane_f32(sigmaVec, 0);
#endif
    if (ii < len)
    {
        sigma += x[ii] * x[ii];
        v[ii] = x[ii];
    }
    if (sigma < TOL_HH_SIG)
    {
        *muPtr = xOne;
        *betaPtr = 0;
        return 1;
    }

    mu = SQRTF(xOne * xOne + sigma);
    if (xOne <= 0)
    {
        vOne = xOne - mu;
    }
    else
    {
        vOne = -sigma / (xOne + mu);
    }
    vOneSqr = vOne * vOne;
    *betaPtr = 2 * vOneSqr / (sigma + vOneSqr);
    *muPtr = mu;
    for (ii = 1; ii < len; ii++)
    {
        v[ii] = v[ii] / vOne;
    }

    return 0;
}

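/*
 * Usage sketch (illustrative only; the caller and values below are made up,
 * not part of the driver): for a length-len input x, house_holder_vec()
 * returns v (with v[0] == 1) and beta such that the reflector
 * P = I - beta*v*v' maps x onto [mu, 0, ..., 0]' with mu = ||x||.
 * A return of 1 means x is already numerically in that form (sigma below
 * TOL_HH_SIG) and beta is set to 0.
 *
 *     float x[4] = {3.0f, 1.0f, 5.0f, 1.0f};
 *     float v[4], beta, mu;
 *     if (house_holder_vec(x, v, &beta, &mu, 4) == 0)
 *     {
 *         // applying P = I - beta*v*v' to x zeroes x[1..3] and leaves mu = 6.
 *     }
 */
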
// Givens rotation: fills cosPtr[0] = c and cosPtr[1] = s such that
// [c -s; s c]*[a; b] = [r; 0]; r is written to *lenPtr (only when b != 0).
int givens_rot(float a, float b, float *cosPtr, float *lenPtr)
{
    float r, temp, tempInv;

    if (b == 0)
    {
        // already rotated: identity rotation; *lenPtr is left unchanged
        cosPtr[0] = 1;
        cosPtr[1] = 0;
        return 1;
    }
    if (FABSF(b) > FABSF(a))
    {
        r = -a / b;
        temp = SQRTF(1 + r * r);
        tempInv = 1 / temp;
        cosPtr[1] = tempInv;
        cosPtr[0] = tempInv * r;
        *lenPtr = -b * temp;
    }
    else
    {
        r = -b / a;
        temp = SQRTF(1 + r * r);
        tempInv = 1 / temp;

        cosPtr[0] = tempInv;
        cosPtr[1] = tempInv * r;
        *lenPtr = a * temp;
    }

    return 0;
}

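/*
 * Worked example (illustrative): givens_rot(a, b, cs, &r) fills c = cs[0],
 * s = cs[1] so that [c -s; s c] * [a; b] = [r; 0]. For a = 3, b = 4:
 * cs[0] = -0.6, cs[1] = 0.8, r = -5, and indeed
 * (-0.6)*3 - 0.8*4 = -5 and 0.8*3 + (-0.6)*4 = 0.
 * Note the b == 0 early-out returns the identity rotation and leaves
 * *lenPtr unchanged, so callers must not consume a stale length there.
 */
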
// One implicit symmetric QR step with Wilkinson shift on the active
// tridiagonal block diagA[0..kk] / offDiagA[1..kk]; the applied Givens
// rotations are accumulated into the corresponding rows of Qmat.
int QR_step_Wilkinson(float *diagA, float *offDiagA, float *Qmat, int matSizeN, int kk)
{
#ifdef ARM_DS5
    int ii, jj;
    float a, b, c, d;
    float bSqr, dSqr;
    float mu, temp;
    float x, z;
    float lenVal, cosArray[2];
    float offDiagA2 = 0;
    float *ldPtr;

    float32x2_t cosVec, cosVec2, diagAVec, offDiagAVec;
    float32x2_t tempVec1, tempVec2, tempVec3;
    float32x2_t colVec0, colVec1, resVec0, resVec1;
    float32x2x2_t trnVec;

    a = diagA[kk - 1]; // second last diagonal
    b = offDiagA[kk];
    c = diagA[kk]; // last diagonal
    d = (a - c) / 2;

    bSqr = b * b;
    dSqr = d * d;
    temp = SQRTF(dSqr + bSqr);
    temp = (d > 0) ? (d + temp) : (d - temp);
    mu = c - bSqr / temp;

    // first iteration
    x = diagA[0] - mu;
    z = offDiagA[1];

    givens_rot(x, z, &cosArray[0], &lenVal);

    cosVec = vld1_f32((float32_t const *)cosArray); // VLD1.32 {d0}, [r0]

    tempVec3 = vrev64_f32(cosVec); // VREV64.32 d0,d0
    tempVec3 = vneg_f32(tempVec3);
    trnVec = vtrn_f32(tempVec3, cosVec); // VTRN.32 d0,d0
    cosVec2 = trnVec.val[0];

    diagAVec = vld1_f32((float32_t const *)diagA);
    offDiagAVec = vld1_f32((float32_t const *)(offDiagA + 1));

    tempVec1 = vmul_lane_f32(cosVec, diagAVec, 0);
    tempVec1 = vmla_lane_f32(tempVec1, cosVec2, offDiagAVec, 0);

    tempVec2 = vmul_lane_f32(cosVec, offDiagAVec, 0);
    tempVec2 = vmla_lane_f32(tempVec2, cosVec2, diagAVec, 1);

    resVec0 = vmul_f32(tempVec1, cosVec);
    resVec0 = vmla_f32(resVec0, tempVec2, cosVec2);

    tempVec3 = vmul_lane_f32(cosVec2, offDiagAVec, 1);

    vst1_f32((float32_t *)diagA, resVec0);

    resVec1 = vmul_lane_f32(tempVec1, cosVec, 1);
    resVec1 = vmla_lane_f32(resVec1, tempVec2, cosVec, 0);

    offDiagA[1] = vget_lane_f32(resVec1, 0);

    // update first two rows of Q
    ldPtr = Qmat;
    for (jj = 0; jj < matSizeN; jj += 2)
    {
        colVec0 = vld1_f32((float32_t const *)ldPtr);
        colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE));

        resVec0 = vmul_lane_f32(colVec0, cosVec, 0);
        resVec1 = vmul_lane_f32(colVec1, cosVec, 0);

        resVec0 = vmls_lane_f32(resVec0, colVec1, cosVec, 1);
        resVec1 = vmla_lane_f32(resVec1, colVec0, cosVec, 1);

        vst1_f32((float32_t *)ldPtr, resVec0); // VST1.32 {d0}, [r0]
        vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), resVec1);
        ldPtr += 2;
    }

    if (kk > 1)
    {
        offDiagA[2] = vget_lane_f32(tempVec3, 1); // VMOV.32 r0, d0[0]
        offDiagA2 = vget_lane_f32(tempVec3, 0);

        for (ii = 2; ii < kk; ii++)
        {
            x = offDiagA[ii - 1];
            z = offDiagA2;
            givens_rot(x, z, &cosArray[0], offDiagA + (ii - 1));
            // offDiagA[ii-1] = lenVal;
            cosVec = vld1_f32((float32_t const *)cosArray); // VLD1.32 {d0}, [r0]

            tempVec3 = vrev64_f32(cosVec); // VREV64.32 d0,d0
            tempVec3 = vneg_f32(tempVec3);
            trnVec = vtrn_f32(tempVec3, cosVec); // VTRN.32 d0,d0
            cosVec2 = trnVec.val[0];

            diagAVec = vld1_f32((float32_t const *)(diagA + ii - 1));
            offDiagAVec = vld1_f32((float32_t const *)(offDiagA + ii));

            tempVec1 = vmul_lane_f32(cosVec, diagAVec, 0);
            tempVec1 = vmla_lane_f32(tempVec1, cosVec2, offDiagAVec, 0);

            tempVec2 = vmul_lane_f32(cosVec, offDiagAVec, 0);
            tempVec2 = vmla_lane_f32(tempVec2, cosVec2, diagAVec, 1);

            resVec0 = vmul_f32(tempVec1, cosVec);
            resVec0 = vmla_f32(resVec0, tempVec2, cosVec2);

            tempVec3 = vmul_lane_f32(cosVec2, offDiagAVec, 1);

            vst1_f32((float32_t *)(diagA + ii - 1), resVec0);

            resVec1 = vmul_lane_f32(tempVec1, cosVec, 1);
            resVec1 = vmla_lane_f32(resVec1, tempVec2, cosVec, 0);

            offDiagA[ii] = vget_lane_f32(resVec1, 0);

            offDiagA[ii + 1] = vget_lane_f32(tempVec3, 1);
            offDiagA2 = vget_lane_f32(tempVec3, 0);

            // update two rows of Q
            ldPtr = Qmat + (ii - 1) * MAX_MAT_SIZE;
            for (jj = 0; jj < matSizeN; jj += 2)
            {
                colVec0 = vld1_f32((float32_t const *)ldPtr);
                colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE));

                resVec0 = vmul_lane_f32(colVec0, cosVec, 0);
                resVec1 = vmul_lane_f32(colVec1, cosVec, 0);

                resVec0 = vmls_lane_f32(resVec0, colVec1, cosVec, 1);
                resVec1 = vmla_lane_f32(resVec1, colVec0, cosVec, 1);

                vst1_f32((float32_t *)ldPtr, resVec0); // VST1.32 {d0}, [r0]
                vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), resVec1);
                ldPtr += 2;
            }
        }

        // last iteration, ii = kk
        x = offDiagA[kk - 1];
        z = offDiagA2;
        givens_rot(x, z, &cosArray[0], offDiagA + (kk - 1));
        // offDiagA[kk-1] = lenVal;
        cosVec = vld1_f32((float32_t const *)cosArray); // VLD1.32 {d0}, [r0]

        tempVec3 = vrev64_f32(cosVec); // VREV64.32 d0,d0
        tempVec3 = vneg_f32(tempVec3);
        trnVec = vtrn_f32(tempVec3, cosVec); // VTRN.32 d0,d0
        cosVec2 = trnVec.val[0];

        diagAVec = vld1_f32((float32_t const *)(diagA + kk - 1));
        offDiagAVec = vld1_f32((float32_t const *)(offDiagA + kk));

        tempVec1 = vmul_lane_f32(cosVec, diagAVec, 0);
        tempVec1 = vmla_lane_f32(tempVec1, cosVec2, offDiagAVec, 0);

        tempVec2 = vmul_lane_f32(cosVec, offDiagAVec, 0);
        tempVec2 = vmla_lane_f32(tempVec2, cosVec2, diagAVec, 1);

        resVec0 = vmul_f32(tempVec1, cosVec);
        resVec0 = vmla_f32(resVec0, tempVec2, cosVec2);

        vst1_f32((float32_t *)(diagA + kk - 1), resVec0);

        resVec1 = vmul_lane_f32(tempVec1, cosVec, 1);
        resVec1 = vmla_lane_f32(resVec1, tempVec2, cosVec, 0);

        offDiagA[kk] = vget_lane_f32(resVec1, 0);

        // update two rows of Q
        ldPtr = Qmat + (kk - 1) * MAX_MAT_SIZE;
        for (jj = 0; jj < matSizeN; jj += 2)
        {
            colVec0 = vld1_f32((float32_t const *)ldPtr);
            colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE));

            resVec0 = vmul_lane_f32(colVec0, cosVec, 0);
            resVec1 = vmul_lane_f32(colVec1, cosVec, 0);

            resVec0 = vmls_lane_f32(resVec0, colVec1, cosVec, 1);
            resVec1 = vmla_lane_f32(resVec1, colVec0, cosVec, 1);

            vst1_f32((float32_t *)ldPtr, resVec0); // VST1.32 {d0}, [r0]
            vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), resVec1);
            ldPtr += 2;
        }
    }

    return 0;
#else
    int ii, jj;
    float a, b, c, d;
    float bSqr, dSqr;
    float mu, temp;
    float x, z;
    float cosVal, sinVal, lenVal;
    float cosArray[2];
    float offDiagA2 = 0;
    float temp11, temp21, temp12, temp22, temp13, temp23;

    a = diagA[kk - 1]; // second last diagonal
    b = offDiagA[kk];
    c = diagA[kk]; // last diagonal
    d = (a - c) / 2;

    bSqr = b * b;
    dSqr = d * d;
    temp = sqrtf(dSqr + bSqr);
    temp = (d > 0) ? (d + temp) : (d - temp);
    mu = c - bSqr / temp;

    // first iteration
    x = diagA[0] - mu;
    z = offDiagA[1];

    givens_rot(x, z, &cosArray[0], &lenVal);
    cosVal = cosArray[0];
    sinVal = cosArray[1];

    temp11 = cosVal * diagA[0] - sinVal * offDiagA[1];
    temp21 = sinVal * diagA[0] + cosVal * offDiagA[1];

    temp12 = cosVal * offDiagA[1] - sinVal * diagA[1];
    temp22 = sinVal * offDiagA[1] + cosVal * diagA[1];

    temp13 = -sinVal * offDiagA[2];
    temp23 = cosVal * offDiagA[2];

    diagA[0] = cosVal * temp11 - sinVal * temp12;
    offDiagA[1] = sinVal * temp11 + cosVal * temp12;
    diagA[1] = sinVal * temp21 + cosVal * temp22;

    // update first two rows of Q
    for (jj = 0; jj < matSizeN; jj++)
    {
        temp11 = Qmat[jj];
        temp21 = Qmat[jj + MAX_MAT_SIZE];

        Qmat[jj] = cosVal * temp11 - sinVal * temp21;
        Qmat[jj + MAX_MAT_SIZE] = cosVal * temp21 + sinVal * temp11;
    }

    if (kk > 1)
    {
        offDiagA[2] = temp23;
        offDiagA2 = temp13;

        for (ii = 2; ii < kk; ii++)
        {
            x = offDiagA[ii - 1];
            z = offDiagA2;
            givens_rot(x, z, &cosArray[0], &lenVal);
            cosVal = cosArray[0];
            sinVal = cosArray[1];

            temp11 = cosVal * diagA[ii - 1] - sinVal * offDiagA[ii];
            temp21 = sinVal * diagA[ii - 1] + cosVal * offDiagA[ii];

            temp12 = cosVal * offDiagA[ii] - sinVal * diagA[ii];
            temp22 = sinVal * offDiagA[ii] + cosVal * diagA[ii];

            temp13 = -sinVal * offDiagA[ii + 1];
            temp23 = cosVal * offDiagA[ii + 1];

            offDiagA[ii - 1] = lenVal;

            diagA[ii - 1] = cosVal * temp11 - sinVal * temp12;
            offDiagA[ii] = sinVal * temp11 + cosVal * temp12;
            diagA[ii] = sinVal * temp21 + cosVal * temp22;

            offDiagA[ii + 1] = temp23;
            offDiagA2 = temp13;
            // update two rows of Q
            for (jj = 0; jj < matSizeN; jj++)
            {
                temp11 = Qmat[jj + (ii - 1) * MAX_MAT_SIZE];
                temp21 = Qmat[jj + ii * MAX_MAT_SIZE];

                Qmat[jj + (ii - 1) * MAX_MAT_SIZE] = cosVal * temp11 - sinVal * temp21;
                Qmat[jj + ii * MAX_MAT_SIZE] = cosVal * temp21 + sinVal * temp11;
            }
        }

        // last iteration, ii = kk
        x = offDiagA[kk - 1];
        z = offDiagA2;
        givens_rot(x, z, &cosArray[0], &lenVal);
        cosVal = cosArray[0];
        sinVal = cosArray[1];

        temp11 = cosVal * diagA[kk - 1] - sinVal * offDiagA[kk];
        temp21 = sinVal * diagA[kk - 1] + cosVal * offDiagA[kk];

        temp12 = cosVal * offDiagA[kk] - sinVal * diagA[kk];
        temp22 = sinVal * offDiagA[kk] + cosVal * diagA[kk];

        offDiagA[kk - 1] = lenVal;
        diagA[kk - 1] = cosVal * temp11 - sinVal * temp12;
        offDiagA[kk] = sinVal * temp11 + cosVal * temp12;
        diagA[kk] = sinVal * temp21 + cosVal * temp22;
        // update two rows of Q
        for (jj = 0; jj < matSizeN; jj++)
        {
            temp11 = Qmat[jj + (kk - 1) * MAX_MAT_SIZE];
            temp21 = Qmat[jj + kk * MAX_MAT_SIZE];

            Qmat[jj + (kk - 1) * MAX_MAT_SIZE] = cosVal * temp11 - sinVal * temp21;
            Qmat[jj + kk * MAX_MAT_SIZE] = cosVal * temp21 + sinVal * temp11;
        }
    }

    return 0;
#endif
}

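/*
 * The shift computed above is the standard Wilkinson shift: for the trailing
 * 2x2 tridiagonal block
 *     [ a  b ]
 *     [ b  c ],  d = (a - c)/2,
 * the eigenvalue of the block closest to c is
 *     mu = c - sign(d) * b^2 / (|d| + sqrt(d^2 + b^2)),
 * which the code folds into temp = (d > 0) ? d + sqrt(...) : d - sqrt(...),
 * mu = c - b^2/temp. Shifting by mu before the first Givens rotation gives
 * the implicit QR step its fast (essentially cubic) convergence of
 * offDiagA[kk] to zero.
 */
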
// One implicit QR step with Wilkinson shift on the active Hessenberg block
// matA(offset..kk, offset..kk); returns 1 when a 2x2 block with a complex
// eigenvalue pair is detected and decoupled.
int unsym_QR_step_Wilkinson(float *matA, int matSizeN, int kk, int offset)
{
#ifdef ARM_DS5
    int ii, jj;
    float a, b, c, d;
    float bSqr, dSqr;
    float mu, temp;
    float x, z;
    float lenVal, cosArray[2];
    float *ldPtr;

    float32x2_t cosVec, cosVec2, tempVec3;
    float32x2_t colVec0, colVec1, resVec0, resVec1;
    float32x2x2_t trnVec;

    a = matA[(kk - 1) * (matSizeN + 1)];     // diagA[kk-1]; second last diagonal
    b = matA[(kk - 1) * (matSizeN + 1) + 1]; // offDiagA[kk];
    c = matA[kk * (matSizeN + 1)];           // diagA[kk]; last diagonal
    if (kk == 1 + offset)
    {
        float p, q;
        d = matA[kk * (matSizeN + 1) - 1];

        p = -a - c;
        q = a * c - b * d;
        if (p * p < 4 * q)
        {
            // complex eigenvalue pair: store the common real part -p/2 and decouple
            matA[offset * (matSizeN + 1)] = -p / 2;
            matA[offset * (matSizeN + 1) + 1] = 0;
            matA[(offset + 1) * (matSizeN + 1)] = -p / 2;

            return 1;
        }
    }
    d = (a - c) / 2;

    bSqr = b * b;
    dSqr = d * d;
    temp = SQRTF(dSqr + bSqr);
    temp = (d > 0) ? (d + temp) : (d - temp);
    mu = c - bSqr / temp;

    // first iteration
    x = matA[offset * (matSizeN + 1)] - mu; // diagA[0]-mu;
    z = matA[offset * (matSizeN + 1) + 1];  // offDiagA[1];

    givens_rot(x, z, &cosArray[0], &lenVal);
    cosVec = vld1_f32((float32_t const *)cosArray); // VLD1.32 {d0}, [r0]
    tempVec3 = vrev64_f32(cosVec);                  // VREV64.32 d0,d0
    tempVec3 = vneg_f32(tempVec3);
    trnVec = vtrn_f32(tempVec3, cosVec); // VTRN.32 d0,d0
    cosVec2 = trnVec.val[0];             //[-sin cos]

    ldPtr = matA + offset;
    for (jj = offset; jj < matSizeN; jj++)
    { // apply Givens rotation on all columns
        colVec0 = vld1_f32((float32_t const *)ldPtr);
        resVec0 = vmul_lane_f32(cosVec, colVec0, 0);
        resVec0 = vmla_lane_f32(resVec0, cosVec2, colVec0, 1);
        vst1_f32((float32_t *)ldPtr, resVec0);
        ldPtr += matSizeN;
    }

    ldPtr = matA + offset * matSizeN;
    for (jj = offset; jj < kk; jj += 2)
    { // apply Givens rotation on all active rows
        colVec0 = vld1_f32((float32_t const *)ldPtr);
        colVec1 = vld1_f32((float32_t const *)(ldPtr + matSizeN));

        resVec0 = vmul_lane_f32(colVec0, cosVec, 0);
        resVec1 = vmul_lane_f32(colVec1, cosVec, 0);

        resVec0 = vmls_lane_f32(resVec0, colVec1, cosVec, 1);
        resVec1 = vmla_lane_f32(resVec1, colVec0, cosVec, 1);

        vst1_f32((float32_t *)ldPtr, resVec0);
        vst1_f32((float32_t *)(ldPtr + matSizeN), resVec1);
        ldPtr += 2;
    }
    if (jj == kk)
    { // one left
        colVec0 = vld1_f32((float32_t const *)ldPtr);
        colVec1 = vld1_f32((float32_t const *)(ldPtr + matSizeN));

        trnVec = vtrn_f32(colVec0, colVec1); // VTRN.32 d0,d0
        colVec0 = trnVec.val[0];
        resVec0 = vmul_lane_f32(cosVec, colVec0, 0);
        resVec0 = vmla_lane_f32(resVec0, cosVec2, colVec0, 1);
        ldPtr[0] = vget_lane_f32(resVec0, 0);
        ldPtr[matSizeN] = vget_lane_f32(resVec0, 1);
    }
    if (kk > 1)
    {
        for (ii = offset + 1; ii < kk; ii++)
        {
            x = matA[(ii - 1) * (matSizeN + 1) + 1];
            z = matA[(ii - 1) * (matSizeN + 1) + 2];
            givens_rot(x, z, &cosArray[0], &lenVal);

            cosVec = vld1_f32((float32_t const *)cosArray); // VLD1.32 {d0}, [r0]
            tempVec3 = vrev64_f32(cosVec);                  // VREV64.32 d0,d0
            tempVec3 = vneg_f32(tempVec3);
            trnVec = vtrn_f32(tempVec3, cosVec); // VTRN.32 d0,d0
            cosVec2 = trnVec.val[0];             //[-sin cos]

            ldPtr = matA + ii + offset * matSizeN;
            for (jj = offset; jj < matSizeN; jj++)
            { // apply Givens rotation on all columns
                colVec0 = vld1_f32((float32_t const *)ldPtr);
                resVec0 = vmul_lane_f32(cosVec, colVec0, 0);
                resVec0 = vmla_lane_f32(resVec0, cosVec2, colVec0, 1);
                vst1_f32((float32_t *)ldPtr, resVec0);
                ldPtr += matSizeN;
            }
            ldPtr = matA + offset + ii * matSizeN;
            for (jj = offset; jj < kk; jj += 2)
            { // apply Givens rotation on all active rows
                colVec0 = vld1_f32((float32_t const *)ldPtr);
                colVec1 = vld1_f32((float32_t const *)(ldPtr + matSizeN));

                resVec0 = vmul_lane_f32(colVec0, cosVec, 0);
                resVec1 = vmul_lane_f32(colVec1, cosVec, 0);

                resVec0 = vmls_lane_f32(resVec0, colVec1, cosVec, 1);
                resVec1 = vmla_lane_f32(resVec1, colVec0, cosVec, 1);

                vst1_f32((float32_t *)ldPtr, resVec0);
                vst1_f32((float32_t *)(ldPtr + matSizeN), resVec1);
                ldPtr += 2;
            }
            if (jj == kk)
            { // one left
                colVec0 = vld1_f32((float32_t const *)ldPtr);
                colVec1 = vld1_f32((float32_t const *)(ldPtr + matSizeN));

                trnVec = vtrn_f32(colVec0, colVec1); // VTRN.32 d0,d0
                colVec0 = trnVec.val[0];
                resVec0 = vmul_lane_f32(cosVec, colVec0, 0);
                resVec0 = vmla_lane_f32(resVec0, cosVec2, colVec0, 1);
                ldPtr[0] = vget_lane_f32(resVec0, 0);
                ldPtr[matSizeN] = vget_lane_f32(resVec0, 1);
            }
        }
    }

    return 0;
#else
    int ii, jj;
    float a, b, c, d;
    float bSqr, dSqr;
    float mu, temp;
    float x, z;
    float cosVal, sinVal, lenVal;
    float cosArray[2];
    float temp11, temp21;

    a = matA[(kk - 1) * (matSizeN + 1)];     // diagA[kk-1]; second last diagonal
    b = matA[(kk - 1) * (matSizeN + 1) + 1]; // offDiagA[kk];
    c = matA[kk * (matSizeN + 1)];           // diagA[kk]; last diagonal
    if (kk == 1 + offset)
    {
        float p, q;
        d = matA[kk * (matSizeN + 1) - 1];

        p = -a - c;
        q = a * c - b * d;
        if (p * p < 4 * q)
        {
            // complex eigenvalue pair: store the common real part -p/2 and decouple
            matA[offset * (matSizeN + 1)] = -p / 2;
            matA[offset * (matSizeN + 1) + 1] = 0;
            matA[(offset + 1) * (matSizeN + 1)] = -p / 2;

            return 1;
        }
    }
    d = (a - c) / 2;

    bSqr = b * b;
    dSqr = d * d;
    temp = sqrtf(dSqr + bSqr);
    temp = (d > 0) ? (d + temp) : (d - temp);
    mu = c - bSqr / temp;

    // first iteration
    x = matA[offset * (matSizeN + 1)] - mu; // diagA[0]-mu;
    z = matA[offset * (matSizeN + 1) + 1];  // offDiagA[1];

    givens_rot(x, z, &cosArray[0], &lenVal);
    cosVal = cosArray[0];
    sinVal = cosArray[1];

    for (jj = offset; jj < matSizeN; jj++)
    { // apply Givens rotation on all columns
        temp11 = matA[offset + jj * matSizeN];
        temp21 = matA[1 + offset + jj * matSizeN];

        matA[offset + jj * matSizeN] = cosVal * temp11 - sinVal * temp21;
        matA[1 + offset + jj * matSizeN] = cosVal * temp21 + sinVal * temp11;
    }

    for (jj = offset; jj <= kk; jj++)
    { // apply Givens rotation on all active rows
        temp11 = matA[jj + offset * matSizeN];
        temp21 = matA[jj + (1 + offset) * matSizeN];

        matA[jj + offset * matSizeN] = cosVal * temp11 - sinVal * temp21;
        matA[jj + (1 + offset) * matSizeN] = cosVal * temp21 + sinVal * temp11;
    }
    if (kk > 1)
    {
        for (ii = offset + 1; ii < kk; ii++)
        {
            x = matA[(ii - 1) * (matSizeN + 1) + 1];
            z = matA[(ii - 1) * (matSizeN + 1) + 2];
            givens_rot(x, z, &cosArray[0], &lenVal);
            cosVal = cosArray[0];
            sinVal = cosArray[1];

            for (jj = offset; jj < matSizeN; jj++)
            { // apply Givens rotation on all columns
                temp11 = matA[ii + jj * matSizeN];
                temp21 = matA[(ii + 1) + jj * matSizeN];

                matA[ii + jj * matSizeN] = cosVal * temp11 - sinVal * temp21;
                matA[(ii + 1) + jj * matSizeN] = cosVal * temp21 + sinVal * temp11;
            }

            for (jj = offset; jj <= kk; jj++)
            { // apply Givens rotation on all active rows
                temp11 = matA[jj + ii * matSizeN];
                temp21 = matA[jj + (ii + 1) * matSizeN];

                matA[jj + ii * matSizeN] = cosVal * temp11 - sinVal * temp21;
                matA[jj + (ii + 1) * matSizeN] = cosVal * temp21 + sinVal * temp11;
            }
        }
    }

    return 0;
#endif
}

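/*
 * Note on the 2x2 early exit above (kk == 1 + offset): with the block's
 * characteristic polynomial lambda^2 + p*lambda + q, where p = -(a + c) is
 * the negated trace and q = a*c - b*d the determinant, p*p < 4*q means a
 * complex-conjugate eigenvalue pair lambda = -p/2 +/- i*sqrt(q - p*p/4).
 * Since only real storage is available, the common real part -p/2 is written
 * to both diagonal entries and the block is decoupled by zeroing the
 * subdiagonal.
 */
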
// back substitution for LS via QR, solves R*X = Q'*B = C for X, R is upper triangular
// R is NxM, X is MxM, C is NxM
void myBackSub(float *C_MATR, float *R_MATR, float *X_MAT, int matSizeN, int matSizeM)
{
#ifdef DEBUG_OUTPUT
    FILE *fpOut;
#endif
    int ii, jj, kk, row_idx;
    float tempVal;

    // note: matSizeN is unused here; storage strides use MAX_MAT_SIZE
    for (kk = 0; kk < matSizeM; kk++)
    {
        for (ii = 0; ii < matSizeM; ii++)
        {
            row_idx = matSizeM - 1 - ii;

            tempVal = C_MATR[row_idx + kk * MAX_MAT_SIZE];
            for (jj = matSizeM - ii; jj < matSizeM; jj++)
            {
                tempVal -= R_MATR[row_idx + jj * MAX_MAT_SIZE] * X_MAT[jj + kk * matSizeM];
            }
            X_MAT[row_idx + kk * matSizeM] = tempVal / R_MATR[(MAX_MAT_SIZE + 1) * row_idx];
        }
    }
#ifdef DEBUG_OUTPUT
    fpOut = fopen("Psi_hat_C.txt", "w");
    if (fpOut == NULL)
    {
        fprintf(stderr, "Cannot open output file %s for writing\n", "Psi_hat_C.txt");
        return;
    }

    for (ii = 0; ii < matSizeM * matSizeM; ii++)
    {
        fprintf(fpOut, "%f\n", X_MAT[ii]);
    }
    fclose(fpOut);
#endif
}

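/*
 * Worked 2x2 example (made-up numbers; R_MATR and C_MATR are column-major
 * with leading dimension MAX_MAT_SIZE, X_MAT with leading dimension
 * matSizeM): for R = [2 1; 0 3] and a processed right-hand-side column
 * c = [5; 6], back substitution gives
 *     x[1] = 6 / 3         = 2
 *     x[0] = (5 - 1*2) / 2 = 1.5
 * i.e. rows are solved bottom-up, subtracting the already-solved unknowns.
 */
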
// regular QR decomposition: inMatArr (NxM, MAX_MAT_SIZE stride) is
// overwritten with R; Q and the work vectors are written into resD.
void QR_decomposition(float *inMatArr, float *resD, int matSizeN, int matSizeM)
{
#ifdef ARM_DS5
    int ii, jj, kk, hh_len;
    float *xVec;
    float beta, xNorm;

    float *R_MATR = inMatArr;
    float *Qmat = resD;                           // Qmat[matSizeN*matSizeN]
    float *vVec = Qmat + MAX_MAT_SIZE * matSizeN; // vVec[matSizeN]
    float *qVec = vVec + MAX_MAT_SIZE;            // qVec[matSizeN]
    float *pVec = qVec + MAX_MAT_SIZE;            // pVec[matSizeM]

    float *ldPtr, *strPtr;

    float32x2_t storeVec, resVec0, resVec1;
    float32x2_t colVec0, colVec1, pVecf32, qVecf32, vVecf32, sumVec;

    storeVec = vdup_n_f32(0.0f); // VDUP.32 d0,r0

    // initialize Q
    strPtr = Qmat;
    for (ii = 0; ii < matSizeN; ii++)
    {
        for (jj = 0; jj < MAX_MAT_SIZE; jj += 2)
        {
            vst1_f32((float32_t *)strPtr, storeVec); // VST1.32 {d0}, [r0]
            strPtr += 2;
        }
        strPtr[ii - MAX_MAT_SIZE] = 1.0f;
    }

    // Householder QR factorization (column-by-column reflectors)
    hh_len = matSizeN;
    xVec = R_MATR;
    for (kk = 0; kk < matSizeM; kk++)
    { // A=V
        hh_len = matSizeN - kk;
        xVec = R_MATR + kk * (MAX_MAT_SIZE + 1);

        if (!house_holder_vec(xVec, vVec, &beta, &xNorm, hh_len))
        {
            for (ii = hh_len; ii < MAX_MAT_SIZE; ii++)
            {
                vVec[ii] = 0;
            }
            // apply Householder iteration
            pVec[0] = 0;
            for (ii = 1; ii < matSizeM - kk; ii += 2)
            { // calc. p = beta*A'*v
                resVec0 = vdup_n_f32(0.0f); // VDUP.32 d0,r0
                resVec1 = vdup_n_f32(0.0f); // VDUP.32 d0,r0
                ldPtr = R_MATR + (ii + kk) * MAX_MAT_SIZE + kk;
                vVecf32 = vld1_f32((float32_t const *)vVec); // VLD1.32 {d0}, [r0]
                for (jj = 0; jj < hh_len; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    resVec0 = vmla_f32(resVec0, colVec0, vVecf32);
                    colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE)); // VLD1.32 {d0}, [r0]
                    ldPtr += 2;
                    resVec1 = vmla_f32(resVec1, colVec1, vVecf32);
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2)); // VLD1.32 {d0}, [r0]
                }
                sumVec = vpadd_f32(resVec0, resVec1);
                sumVec = vmul_n_f32(sumVec, beta);          // VMUL.F32 d0,d0,d0[0]
                vst1_f32((float32_t *)(pVec + ii), sumVec); // VST1.32 {d0}, [r0]
            }
            pVec[matSizeM - kk] = 0;

            strPtr = R_MATR + kk * (MAX_MAT_SIZE + 1);
            *strPtr++ = xNorm;
            for (ii = 1; ii < hh_len; ii++)
            {
                *strPtr++ = 0;
            }
            // R(index_col,index_row) = R(index_col,index_row)-v*p';
            for (ii = 1; ii < matSizeM - kk; ii += 2)
            {
                pVecf32 = vld1_f32((float32_t const *)(pVec + ii)); // VLD1.32 {d0}, [r0]
                ldPtr = R_MATR + (ii + kk) * MAX_MAT_SIZE + kk;
                vVecf32 = vld1_f32((float32_t const *)vVec); // VLD1.32 {d0}, [r0]
                for (jj = 0; jj < hh_len; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    colVec0 = vmls_lane_f32(colVec0, vVecf32, pVecf32, 0);
                    colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE)); // VLD1.32 {d0}, [r0]
                    vst1_f32((float32_t *)ldPtr, colVec0);
                    colVec1 = vmls_lane_f32(colVec1, vVecf32, pVecf32, 1);
                    vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), colVec1);
                    ldPtr += 2;
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2)); // VLD1.32 {d0}, [r0]
                }
            }

            // update Q
            for (ii = 0; ii < matSizeN; ii += 2)
            { // calc. q = beta*Q(:,index_col)*v;
                sumVec = vdup_n_f32(0.0f); // VDUP.32 d0,r0
                ldPtr = Qmat + ii + kk * MAX_MAT_SIZE;
                vVecf32 = vld1_f32((float32_t const *)vVec); // VLD1.32 {d0}, [r0]
                for (jj = 0; jj < hh_len; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    ldPtr += MAX_MAT_SIZE;
                    sumVec = vmla_lane_f32(sumVec, colVec0, vVecf32, 0);
                    colVec1 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    ldPtr += MAX_MAT_SIZE;
                    sumVec = vmla_lane_f32(sumVec, colVec1, vVecf32, 1);
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2)); // VLD1.32 {d0}, [r0]
                }
                sumVec = vmul_n_f32(sumVec, beta);          // VMUL.F32 d0,d0,d0[0]
                vst1_f32((float32_t *)(qVec + ii), sumVec); // VST1.32 {d0}, [r0]
            }
            qVec[matSizeN] = 0;
            // Q(:,index_col) = Q(:,index_col)-q*v';
            for (ii = 0; ii < matSizeN; ii += 2)
            {
                qVecf32 = vld1_f32((float32_t const *)(qVec + ii));
                ldPtr = Qmat + ii + kk * MAX_MAT_SIZE;
                vVecf32 = vld1_f32((float32_t const *)vVec); // VLD1.32 {d0}, [r0]
                for (jj = 0; jj < hh_len; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr);                  // VLD1.32 {d0}, [r0]
                    colVec0 = vmls_lane_f32(colVec0, qVecf32, vVecf32, 0);         // VMLS.F32 d0,
                    colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE)); // VLD1.32 {d0}, [r0]
                    colVec1 = vmls_lane_f32(colVec1, qVecf32, vVecf32, 1);         // VMLS.F32 d0,
                    vst1_f32((float32_t *)ldPtr, colVec0);                         // VST1.32 {d0}, [r0]
                    vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), colVec1);        // VST1.32 {d0}, [r0]
                    ldPtr += 2 * MAX_MAT_SIZE;
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2)); // VLD1.32 {d0}, [r0]
                }
            }
        }
        hh_len--;
        xVec += MAX_MAT_SIZE + 1;
    }
#else
    int ii, jj, kk, hh_len;
    float *xVec;
    float beta, xNorm, sum;
    float tempVal;

    float *R_MATR = inMatArr;
    float *Qmat = resD;                           // Qmat[matSizeN*matSizeN]
    float *vVec = Qmat + MAX_MAT_SIZE * matSizeN; // vVec[matSizeN]
    float *qVec = vVec + matSizeN;                // qVec[matSizeN]
    float *pVec = qVec + matSizeN;                // pVec[matSizeM]

    // initialize Q
    for (ii = 0; ii < matSizeN; ii++)
    {
        for (jj = 0; jj < matSizeN; jj++)
        {
            Qmat[jj + MAX_MAT_SIZE * ii] = 0.0f;
        }
        Qmat[ii + MAX_MAT_SIZE * ii] = 1.0f;
    }

    // Householder QR factorization (column-by-column reflectors)
    for (kk = 0; kk < matSizeM; kk++)
    { // A=V
        hh_len = matSizeN - kk;
        xVec = R_MATR + kk * (MAX_MAT_SIZE + 1);

        if (!house_holder_vec(xVec, vVec, &beta, &xNorm, hh_len))
        {
            // apply Householder iteration
            pVec[0] = 0;
            for (ii = 1; ii < matSizeM - kk; ii++)
            { // calc. p = beta*A'*v
                sum = 0;
                for (jj = 0; jj < hh_len; jj++)
                {
                    sum += R_MATR[(ii + kk) * MAX_MAT_SIZE + (jj + kk)] * vVec[jj];
                }
                pVec[ii] = beta * sum;
            }
            R_MATR[kk * (MAX_MAT_SIZE + 1)] = xNorm;
            for (ii = 1; ii < hh_len; ii++)
            {
                R_MATR[(ii + kk) + kk * MAX_MAT_SIZE] = 0;
            }
            // R(index_col,index_row) = R(index_col,index_row)-v*p';
            for (ii = 1; ii < matSizeM - kk; ii++)
            {
                tempVal = pVec[ii];
                for (jj = 0; jj < hh_len; jj++)
                {
                    R_MATR[(ii + kk) * MAX_MAT_SIZE + (jj + kk)] -= vVec[jj] * tempVal;
                }
            }

            // update Q
            for (ii = 0; ii < matSizeN; ii++)
            { // calc. q = beta*Q(:,index_col)*v;
                sum = 0;
                for (jj = 0; jj < hh_len; jj++)
                {
                    sum += Qmat[ii + (jj + kk) * MAX_MAT_SIZE] * vVec[jj];
                }
                qVec[ii] = beta * sum;
            }
            // Q(:,index_col) = Q(:,index_col)-q*v';
            for (ii = 0; ii < matSizeN; ii++)
            {
                tempVal = qVec[ii];
                for (jj = 0; jj < hh_len; jj++)
                {
                    Qmat[ii + (jj + kk) * MAX_MAT_SIZE] -= vVec[jj] * tempVal;
                }
            }
        }
    }
#endif
}

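/*
 * Least-squares usage sketch (illustrative; buffer names and sizes are
 * assumptions, column-major storage with MAX_MAT_SIZE stride as used above):
 * QR_decomposition() overwrites the N x M input with R and writes Q into
 * resD, which must hold at least MAX_MAT_SIZE*matSizeN + 3*MAX_MAT_SIZE
 * floats (Q plus the v/q/p work vectors). A caller solving A*X ~= B would
 * then form C = Q'*B with its own matrix multiply and finish with
 * myBackSub(C, A, X, matSizeN, matSizeM) to solve R*X = C.
 */
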
// symmetric eigen (Schur) decomposition
int QR_algorithm(float *inMatArr, float *resD, int matSizeN, int low_accuracy)
{
    int kk, ii, jj, count, delta_count;
    int N_hh_it;

    float *xVec;
    float beta, sum, prod, xNorm;
    int startIdx, zeroIdx = 0;
    unsigned char indexArr[MAX_MAT_SIZE];
    float step;

    float *eigVals = resD;
    float *Qmat = eigVals + MAX_MAT_SIZE; // Qmat[matSizeN*matSizeN]

    float *vVec = Qmat + MAX_MAT_SIZE * MAX_MAT_SIZE; // vVec[matSizeN]
    float *wVec = vVec + MAX_MAT_SIZE;                // wVec[matSizeN]
    float *pVec = wVec + MAX_MAT_SIZE;                // pVec[matSizeN]

    float *diagA = vVec;    // diagA[matSizeN]
    float *offDiagA = wVec; // offDiagA[matSizeN]

#ifdef ARM_DS5
    float *ldPtr, *strPtr, *strPtrTr;
    float32x2_t storeVec;
    float32x2x2_t storeVec2;
    float32x2_t colVec0, colVec1, pVecf32, vVecf32, wVecf32, vVecfTr, wVecfTr, sumVec;

    step = (low_accuracy == 1) ? TOL_QR_STEP_LOW : TOL_QR_STEP;
    storeVec = vdup_n_f32(0.0f); // VDUP.32 d0,r0

    // initialize Q
    strPtr = Qmat;
    for (ii = 0; ii < MAX_MAT_SIZE; ii++)
    {
        for (jj = 0; jj < MAX_MAT_SIZE; jj += 2)
        {
            vst1_f32((float32_t *)strPtr, storeVec); // VST1.32 {d0}, [r0]
            strPtr += 2;
        }
        strPtr[ii - MAX_MAT_SIZE] = 1.0f;
    }

    // Householder tridiagonalization
    xVec = inMatArr + 1;
    N_hh_it = matSizeN - 1;
    for (kk = 1; kk < matSizeN - 1; kk++)
    {
        if (!house_holder_vec(xVec, vVec, &beta, &xNorm, N_hh_it))
        { // calculate Householder vector
            for (ii = N_hh_it; ii < MAX_MAT_SIZE; ii++)
            {
                vVec[ii] = 0;
            }
            // apply Householder iteration
            for (ii = 0; ii < N_hh_it; ii += 2)
            { // calc. p = beta*A*v and q = beta*Q'*v;
                sumVec = vdup_n_f32(0.0f); // VDUP.32 d0,r0
                ldPtr = inMatArr + (kk * MAX_MAT_SIZE + (ii + kk));
                for (jj = 0; jj < N_hh_it; jj += 2)
                {
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj)); // VLD1.32 {d0}, [r0]
                    colVec0 = vld1_f32((float32_t const *)ldPtr);       // VLD1.32 {d0}, [r0]
                    ldPtr += MAX_MAT_SIZE;
                    sumVec = vmla_lane_f32(sumVec, colVec0, vVecf32, 0); // VMLA.F32 d0, d0, d0[0]

                    colVec1 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    ldPtr += MAX_MAT_SIZE;
                    sumVec = vmla_lane_f32(sumVec, colVec1, vVecf32, 1); // VMLA.F32 d0, d0, d0[0]
                }
                sumVec = vmul_n_f32(sumVec, beta);          // VMUL.F32 d0,d0,d0[0]
                vst1_f32((float32_t *)(pVec + ii), sumVec); // VST1.32 {d0}, [r0]
            }
            pVec[N_hh_it] = 0;

            sumVec = vdup_n_f32(0.0f); // VDUP.32 d0,r0
            for (jj = 0; jj < N_hh_it; jj += 2)
            { // calc. (beta*p'*v/2)
                pVecf32 = vld1_f32((float32_t const *)(pVec + jj)); // VLD1.32 {d0}, [r0]
                vVecf32 = vld1_f32((float32_t const *)(vVec + jj)); // VLD1.32 {d0}, [r0]
                sumVec = vmla_f32(sumVec, pVecf32, vVecf32);        // VMLA.F32 d0,d0,d0
            }
            sumVec = vpadd_f32(sumVec, sumVec); // VPADD.F32 d0,d0,d0
            sum = vget_lane_f32(sumVec, 0);     // VMOV.32 r0, d0[0]
            prod = beta * sum / 2;

            for (ii = 0; ii < N_hh_it; ii += 2)
            { // calc. w = p-(beta*p'*v/2)*v;
                vVecf32 = vld1_f32((float32_t const *)(vVec + ii)); // VLD1.32 {d0}, [r0]
                pVecf32 = vld1_f32((float32_t const *)(pVec + ii)); // VLD1.32 {d0}, [r0]
                pVecf32 = vmls_n_f32(pVecf32, vVecf32, prod);       // VMLS.F32 d0, d0, d0[0]
                vst1_f32((float32_t *)(wVec + ii), pVecf32);        // VST1.32 {d0}, [r0]
            }

            strPtr = inMatArr + ((kk)*MAX_MAT_SIZE + (kk - 1));
            strPtrTr = strPtr - (MAX_MAT_SIZE - 1); // inMatArr + ((kk-1)*MAX_MAT_SIZE + (kk));
            *strPtr = xNorm;
            strPtr += MAX_MAT_SIZE;
            *strPtrTr++ = xNorm;
            for (ii = 1; ii < N_hh_it; ii++)
            {
                *strPtr = 0;
                strPtr += MAX_MAT_SIZE;
                *strPtrTr++ = 0;
            }

            // calc A(index_kk,index_kk)-v*w'-w*v';
            for (ii = 0; ii < N_hh_it; ii += 2)
            {
                ldPtr = inMatArr + (ii + kk) * MAX_MAT_SIZE + (ii + kk);
                vVecf32 = vld1_f32((float32_t const *)(vVec + ii));            // VLD1.32 {d0}, [r0]
                wVecf32 = vld1_f32((float32_t const *)(wVec + ii));            // VLD1.32 {d0}, [r0]
                colVec0 = vld1_f32((float32_t const *)ldPtr);                  // VLD1.32 {d0}, [r0]
                colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE)); // VLD1.32 {d0}, [r0]
                colVec0 = vmls_lane_f32(colVec0, vVecf32, wVecf32, 0);         // VMLS.F32 d0,
                colVec1 = vmls_lane_f32(colVec1, vVecf32, wVecf32, 1);         // VMLS.F32 d0,
                colVec0 = vmls_lane_f32(colVec0, wVecf32, vVecf32, 0);         // VMLS.F32 d0,
                colVec1 = vmls_lane_f32(colVec1, wVecf32, vVecf32, 1);         // VMLS.F32 d0,
                vst1_f32((float32_t *)ldPtr, colVec0);                         // VST1.32 {d0}, [r0]
                vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), colVec1);        // VST1.32 {d0}, [r0]

                ldPtr -= ii; // ldPtr = inMatArr + (ii+kk)*MAX_MAT_SIZE + kk;
                strPtr = inMatArr + kk * MAX_MAT_SIZE + (ii + kk);
                for (jj = 0; jj < ii; jj += 2)
                {
                    wVecfTr = vld1_f32((float32_t const *)(wVec + jj));            // VLD1.32 {d0}, [r0]
                    colVec0 = vld1_f32((float32_t const *)ldPtr);                  // VLD1.32 {d0}, [r0]
                    colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE)); // VLD1.32 {d0}, [r0]
                    colVec0 = vmls_lane_f32(colVec0, wVecfTr, vVecf32, 0);         // VMLS.F32 d0,
                    colVec1 = vmls_lane_f32(colVec1, wVecfTr, vVecf32, 1);         // VMLS.F32 d0,
                    vVecfTr = vld1_f32((float32_t const *)(vVec + jj));            // VLD1.32 {d0}, [r0]
                    colVec0 = vmls_lane_f32(colVec0, vVecfTr, wVecf32, 0);         // VMLS.F32 d0,
                    colVec1 = vmls_lane_f32(colVec1, vVecfTr, wVecf32, 1);         // VMLS.F32 d0,
                    vst1_f32((float32_t *)ldPtr, colVec0);                         // VST1.32 {d0}, [r0]
                    vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), colVec1);        // VST1.32 {d0}, [r0]
                    ldPtr += 2;
                    storeVec2 = vtrn_f32(colVec0, colVec1);          // VTRN.32 d0,d0
                    vst1_f32((float32_t *)strPtr, storeVec2.val[0]); // VST1.32 {d0}, [r0]
                    strPtr += MAX_MAT_SIZE;
                    vst1_f32((float32_t *)strPtr, storeVec2.val[1]); // VST1.32 {d0}, [r0]
                    strPtr += MAX_MAT_SIZE;
                }
            }

            // update Q
            for (ii = 0; ii < matSizeN; ii += 2)
            { // calc. p = beta*QQ0(index_kk,:)'*v;
                sumVec = vdup_n_f32(0.0f); // VDUP.32 d0,r0
                ldPtr = Qmat + kk * MAX_MAT_SIZE + ii;
                for (jj = 0; jj < N_hh_it; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    ldPtr += MAX_MAT_SIZE;
                    colVec1 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    ldPtr += MAX_MAT_SIZE;
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj)); // VLD1.32 {d0}, [r0]
                    sumVec = vmla_lane_f32(sumVec, colVec0, vVecf32, 0);
                    sumVec = vmla_lane_f32(sumVec, colVec1, vVecf32, 1);
                }
                sumVec = vmul_n_f32(sumVec, beta);          // VMUL.F32 d0,d0,d0[0]
                vst1_f32((float32_t *)(pVec + ii), sumVec); // VST1.32 {d0}, [r0]
            }
            // Q0(index_kk,:)-v*p';
            for (ii = 0; ii < N_hh_it; ii += 2)
            {
                vVecf32 = vld1_f32((float32_t const *)(vVec + ii)); // VLD1.32 {d0}, [r0]
                ldPtr = Qmat + (ii + kk) * MAX_MAT_SIZE;
                for (jj = 0; jj < matSizeN; jj += 2)
                {
                    pVecf32 = vld1_f32((float32_t const *)(pVec + jj));            // VLD1.32 {d0}, [r0]
                    colVec0 = vld1_f32((float32_t const *)ldPtr);                  // VLD1.32 {d0}, [r0]
                    colVec1 = vld1_f32((float32_t const *)(ldPtr + MAX_MAT_SIZE)); // VLD1.32 {d0}, [r0]
                    colVec0 = vmls_lane_f32(colVec0, pVecf32, vVecf32, 0);         // VMLS.F32 d0,
                    colVec1 = vmls_lane_f32(colVec1, pVecf32, vVecf32, 1);         // VMLS.F32 d0,

                    vst1_f32((float32_t *)ldPtr, colVec0);                  // VST1.32 {d0}, [r0]
                    vst1_f32((float32_t *)(ldPtr + MAX_MAT_SIZE), colVec1); // VST1.32 {d0}, [r0]
                    ldPtr += 2;
                }
            }
        }
        xVec += MAX_MAT_SIZE + 1;
        N_hh_it--;
    }
#else
    float temp, sum1, vTemp;

    if (low_accuracy == 1)
        step = TOL_QR_STEP_LOW;
    else
        step = TOL_QR_STEP;

    // initialize Q
    for (ii = 0; ii < MAX_MAT_SIZE; ii++)
    {
        for (jj = 0; jj < MAX_MAT_SIZE; jj++)
        {
            Qmat[jj + MAX_MAT_SIZE * ii] = 0.0f;
        }
        Qmat[ii + MAX_MAT_SIZE * ii] = 1.0f;
        vVec[ii] = 0;
    }

    // Householder tridiagonalization
    for (kk = 1; kk < matSizeN - 1; kk++)
    {
        N_hh_it = matSizeN - kk;
        xVec = inMatArr + kk + (kk - 1) * MAX_MAT_SIZE;
        // calculate Householder vector
        if (!house_holder_vec(xVec, vVec, &beta, &xNorm, N_hh_it))
        {
            // apply Householder iteration
            for (ii = 0; ii < N_hh_it; ii += 2)
            { // calc. p = beta*A*v and q = beta*Q'*v;
                sum = 0;
                sum1 = 0;
                for (jj = 0; jj < N_hh_it; jj++)
                {
                    vTemp = vVec[jj];
                    temp = inMatArr[(jj + kk) * MAX_MAT_SIZE + (ii + kk)];
                    sum += temp * vTemp;
                    temp = inMatArr[(jj + kk) * MAX_MAT_SIZE + (ii + 1 + kk)];
                    sum1 += temp * vTemp;
                }
                pVec[ii] = beta * sum;
                pVec[ii + 1] = beta * sum1;
            }
            sum = 0; // calc. (beta*p'*v/2)
            for (jj = 0; jj < N_hh_it; jj++)
            {
                sum += pVec[jj] * vVec[jj];
            }
            prod = beta * sum / 2;
            for (ii = 0; ii < N_hh_it; ii++)
            { // calc. w = p-(beta*p'*v/2)*v;
                wVec[ii] = pVec[ii] - prod * vVec[ii];
            }
            inMatArr[(kk)*MAX_MAT_SIZE + (kk - 1)] = xNorm;
            inMatArr[(kk - 1) * MAX_MAT_SIZE + (kk)] = xNorm;
            for (ii = 1; ii < N_hh_it; ii++)
            {
                inMatArr[(ii + kk) * MAX_MAT_SIZE + (kk - 1)] = 0;
                inMatArr[(kk - 1) * MAX_MAT_SIZE + (ii + kk)] = 0;
            }
            // calc A(index_kk,index_kk)-v*w'-w*v';
            for (ii = 0; ii < N_hh_it; ii++)
            {
                sum = vVec[ii] * wVec[ii] + wVec[ii] * vVec[ii];
                inMatArr[(ii + kk) * MAX_MAT_SIZE + (ii + kk)] -= sum;
                for (jj = 0; jj < ii; jj++)
                {
                    sum = vVec[ii] * wVec[jj] + wVec[ii] * vVec[jj];
                    temp = inMatArr[(ii + kk) * MAX_MAT_SIZE + (jj + kk)];
                    temp -= sum;
                    inMatArr[(ii + kk) * MAX_MAT_SIZE + (jj + kk)] = temp;
                    inMatArr[(jj + kk) * MAX_MAT_SIZE + (ii + kk)] = temp;
                }
            }
            // update Q
            for (ii = N_hh_it; ii < matSizeN; ii++)
            {
                vVec[ii] = 0;
            }
            for (ii = 0; ii < matSizeN; ii += 2)
            { // calc. p = beta*QQ0(index_kk,:)'*v;
                sum = 0;
                sum1 = 0;
                for (jj = 0; jj < N_hh_it; jj++)
                {
                    sum += Qmat[(jj + kk) * MAX_MAT_SIZE + ii] * vVec[jj];
                    sum1 += Qmat[(jj + kk) * MAX_MAT_SIZE + ii + 1] * vVec[jj];
                }
                pVec[ii] = beta * sum;
                pVec[ii + 1] = beta * sum1;
            }
            // Q0(index_kk,:)-v*p';
            for (ii = 0; ii < N_hh_it; ii++)
            {
                for (jj = 0; jj < matSizeN; jj++)
                {
                    Qmat[(ii + kk) * MAX_MAT_SIZE + jj] -= vVec[ii] * pVec[jj];
                }
            }
        }
    }
#endif
    // Implicit Symmetric QR Step with Wilkinson Shift
    diagA[0] = inMatArr[0]; // copy tridiagonal D
    offDiagA[0] = 0;
    indexArr[0] = 0;
    for (ii = 1; ii < matSizeN; ii++)
    {
        diagA[ii] = inMatArr[ii * (MAX_MAT_SIZE + 1)];
        offDiagA[ii] = inMatArr[ii * (MAX_MAT_SIZE + 1) - 1];
        if (FABSF(offDiagA[ii]) < step)
        { // find decoupled blocks
            zeroIdx++;
            indexArr[zeroIdx] = ii;
        }
    }
    count = 0;

    startIdx = indexArr[zeroIdx--];
    for (kk = matSizeN - 1; kk > 0; kk--)
    {
        if (kk > startIdx)
        {
            delta_count = 0;
            while ((FABSF(offDiagA[kk]) > step * (FABSF(diagA[kk - 1]) + FABSF(diagA[kk]))) &&
                   (delta_count < WILK_IT_MAX))
            {
                QR_step_Wilkinson(diagA + startIdx, offDiagA + startIdx, Qmat + startIdx * MAX_MAT_SIZE, matSizeN,
                                  kk - startIdx);
                delta_count++;
            }
            count += delta_count;
            if (kk - startIdx > 2)
            {
                zeroIdx++;
                // check for newly diagonalized blocks
                for (ii = startIdx + 1; ii < kk; ii++)
                {
                    if (FABSF(offDiagA[ii]) < step)
                    { // find decoupled blocks
                        zeroIdx++;
                        indexArr[zeroIdx] = ii;
                    }
                }
                startIdx = indexArr[zeroIdx--];
            }
        }
        else
        {
            if (startIdx > 0)
            {
                startIdx = indexArr[zeroIdx--];
            }
        }
    }
    for (ii = 0; ii < matSizeN; ii++)
    {
        eigVals[ii] = diagA[ii];
    }

    return 0;
}

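/*
 * Usage sketch (illustrative; names are assumptions): inMatArr holds a
 * symmetric matSizeN x matSizeN matrix with MAX_MAT_SIZE stride and is
 * destroyed in place. resD must hold MAX_MAT_SIZE*MAX_MAT_SIZE +
 * 4*MAX_MAT_SIZE floats; on return resD[0..matSizeN-1] are the eigenvalues
 * and resD + MAX_MAT_SIZE holds the accumulated orthogonal transform.
 *
 *     float work[MAX_MAT_SIZE * MAX_MAT_SIZE + 4 * MAX_MAT_SIZE];
 *     QR_algorithm(A, work, n, 0); // 0 -> full TOL_QR_STEP accuracy
 */
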
// unsymmetric eigen (Schur) decomposition, only solves for eigenvalues
int unsym_QR_algorithm(float *inMatArr, float *resD, int matSizeN)
{
    int kk, ii, jj, count, delta_count;
    int N_hh_it;
    float *xVec;
    float beta, sum, xNorm, tempVal;
    int startIdx, zeroIdx = 0;
    unsigned char indexArr[MAX_MAT_SIZE];

    float *eigVals = resD;
    float *vVec = eigVals + MAX_MAT_SIZE; // vVec[matSizeN]
    float *pVec0 = vVec + MAX_MAT_SIZE;   // pVec0[matSizeN]
    float *pVec1 = pVec0 + MAX_MAT_SIZE;  // pVec1[matSizeN]

#ifdef ARM_DS5
    float *ldPtr, *strPtr;
    float32x2_t resVec0, resVec1;
    float32x2_t colVec0, colVec1, pVecf32, vVecf32, sumVec;

    // Householder reduction to upper Hessenberg form
    N_hh_it = matSizeN - 1;
    xVec = inMatArr + 1;
    for (kk = 1; kk < matSizeN - 1; kk++)
    {
        // calculate Householder vector
        if (!house_holder_vec(xVec, vVec, &beta, &xNorm, N_hh_it))
        {
            vVec[N_hh_it] = 0;

            // apply Householder iteration
            for (ii = 0; ii < N_hh_it; ii += 2)
            { // calc. p0 = beta*A(index_kk,index_kk)'*v
                resVec0 = vdup_n_f32(0.0f); // VDUP.32 d0,r0
                resVec1 = vdup_n_f32(0.0f); // VDUP.32 d0,r0
                ldPtr = inMatArr + kk + (ii + kk) * matSizeN;
                vVecf32 = vld1_f32((float32_t const *)vVec); // VLD1.32 {d0}, [r0]
                for (jj = 0; jj < N_hh_it; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    resVec0 = vmla_f32(resVec0, colVec0, vVecf32);
                    colVec1 = vld1_f32((float32_t const *)(ldPtr + matSizeN)); // VLD1.32 {d0}, [r0]
                    resVec1 = vmla_f32(resVec1, colVec1, vVecf32);
                    ldPtr += 2;
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2)); // VLD1.32 {d0}, [r0]
                }
                sumVec = vpadd_f32(resVec0, resVec1);
                sumVec = vmul_n_f32(sumVec, beta);           // VMUL.F32 d0,d0,d0[0]
                vst1_f32((float32_t *)(pVec0 + ii), sumVec); // VST1.32 {d0}, [r0]
            }
            pVec0[N_hh_it] = 0;
            strPtr = inMatArr + (kk) + (kk - 1) * matSizeN;
            *strPtr++ = xNorm;
            for (ii = 1; ii < N_hh_it; ii++)
            {
                *strPtr++ = 0;
            }
            // calc A(index_kk,index_kk)-v*p0'
            for (ii = 0; ii < N_hh_it; ii += 2)
            {
                pVecf32 = vld1_f32((float32_t const *)(pVec0 + ii)); // VLD1.32 {d0}, [r0]
                vVecf32 = vld1_f32((float32_t const *)vVec);         // VLD1.32 {d0}, [r0]
                for (jj = 0; jj < N_hh_it; jj += 2)
                {
                    colVec0 = vld1_f32(
                        (float32_t const *)(inMatArr + (jj + kk) + (ii + kk) * matSizeN)); // VLD1.32 {d0}, [r0]
                    colVec0 = vmls_lane_f32(colVec0, vVecf32, pVecf32, 0);                 // VMLS.F32 d0,
                    colVec1 = vld1_f32(
                        (float32_t const *)(inMatArr + (jj + kk) + (ii + 1 + kk) * matSizeN)); // VLD1.32 {d0}, [r0]
                    colVec1 = vmls_lane_f32(colVec1, vVecf32, pVecf32, 1);                     // VMLS.F32 d0,
                    vst1_f32((float32_t *)(inMatArr + (jj + kk) + (ii + kk) * matSizeN), colVec0); // VST1.32 {d0}, [r0]
                    vst1_f32((float32_t *)(inMatArr + (jj + kk) + (ii + 1 + kk) * matSizeN),
                             colVec1);                                      // VST1.32 {d0}, [r0]
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2)); // VLD1.32 {d0}, [r0]
                }
            }
            // second update
            for (ii = 0; ii < matSizeN; ii += 2)
            { // calc. p1 = beta*A(:,index_kk)*v
                sumVec = vdup_n_f32(0.0f);
                ldPtr = inMatArr + ii + kk * matSizeN;
                vVecf32 = vld1_f32((float32_t const *)vVec); // VLD1.32 {d0}, [r0]
                for (jj = 0; jj < N_hh_it; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    ldPtr += matSizeN;
                    sumVec = vmla_lane_f32(sumVec, colVec0, vVecf32, 0);
                    colVec1 = vld1_f32((float32_t const *)ldPtr); // VLD1.32 {d0}, [r0]
                    ldPtr += matSizeN;
                    sumVec = vmla_lane_f32(sumVec, colVec1, vVecf32, 1);
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2)); // VLD1.32 {d0}, [r0]
                }
                sumVec = vmul_n_f32(sumVec, beta); // VMUL.F32 d0,d0,d0[0]
                vst1_f32((float32_t *)(pVec1 + ii), sumVec);
            }
            pVec1[matSizeN] = 0;

            // calc A(:,index_kk) -p1*v'
            for (ii = 0; ii < matSizeN; ii += 2)
            {
                pVecf32 = vld1_f32((float32_t const *)(pVec1 + ii));
                ldPtr = inMatArr + ii + kk * matSizeN;
                vVecf32 = vld1_f32((float32_t const *)vVec);
                for (jj = 0; jj < N_hh_it; jj += 2)
                {
                    colVec0 = vld1_f32((float32_t const *)ldPtr);
                    colVec0 = vmls_lane_f32(colVec0, pVecf32, vVecf32, 0); // VMLS.F32 d0,
                    colVec1 = vld1_f32((float32_t const *)(ldPtr + matSizeN));
                    colVec1 = vmls_lane_f32(colVec1, pVecf32, vVecf32, 1); // VMLS.F32 d0,
                    vst1_f32((float32_t *)ldPtr, colVec0);
                    ldPtr += matSizeN;
                    vst1_f32((float32_t *)ldPtr, colVec1);
                    ldPtr += matSizeN;
                    vVecf32 = vld1_f32((float32_t const *)(vVec + jj + 2));
                }
            }
        }
        N_hh_it--;
        xVec += matSizeN + 1;
    }
#else
    // Householder reduction to upper Hessenberg form
    for (kk = 1; kk < matSizeN - 1; kk++)
    {
        N_hh_it = matSizeN - kk;
        xVec = inMatArr + kk + (kk - 1) * matSizeN;
        // calculate Householder vector
        if (!house_holder_vec(xVec, vVec, &beta, &xNorm, N_hh_it))
        {
            // apply Householder iteration
            for (ii = 0; ii < N_hh_it; ii++)
            { // calc. p0 = beta*A(index_kk,index_kk)'*v
                sum = 0;
                for (jj = 0; jj < N_hh_it; jj++)
                {
                    sum += inMatArr[(jj + kk) + (ii + kk) * matSizeN] * vVec[jj];
                }
                pVec0[ii] = beta * sum;
            }
            inMatArr[(kk) + (kk - 1) * matSizeN] = xNorm;
            for (ii = 1; ii < N_hh_it; ii++)
            {
                inMatArr[(ii + kk) + (kk - 1) * matSizeN] = 0;
            }
            // calc A(index_kk,index_kk)-v*p0'
            for (ii = 0; ii < N_hh_it; ii++)
            {
                for (jj = 0; jj < N_hh_it; jj++)
                {
                    sum = vVec[jj] * pVec0[ii];
                    inMatArr[(jj + kk) + (ii + kk) * matSizeN] -= sum;
                }
            }
            // second update
            for (ii = 0; ii < matSizeN; ii++)
            { // calc. p1 = beta*A(:,index_kk)*v
                sum = 0;
                for (jj = 0; jj < N_hh_it; jj++)
                {
                    sum += inMatArr[ii + (jj + kk) * matSizeN] * vVec[jj];
                }
                pVec1[ii] = beta * sum;
            }
            // calc A(:,index_kk) -p1*v'
            for (ii = 0; ii < matSizeN; ii++)
            {
                for (jj = 0; jj < N_hh_it; jj++)
                {
                    sum = pVec1[ii] * vVec[jj];
                    inMatArr[ii + (jj + kk) * matSizeN] -= sum;
                }
            }
        }
    }
#endif

    // Implicit QR steps with Wilkinson shift on the Hessenberg matrix
    indexArr[0] = 0;
    for (ii = 1; ii < matSizeN; ii++)
    {
        tempVal = FABSF(inMatArr[(ii - 1) * (matSizeN + 1) + 1]);
        sum = FABSF(inMatArr[(ii - 1) * (matSizeN + 1)]) + FABSF(inMatArr[ii * (matSizeN + 1)]);
        if (tempVal < TOL_QR_STEP * sum)
        { // find decoupled blocks
            zeroIdx++;
            indexArr[zeroIdx] = ii;
        }
    }
    count = 0;

    startIdx = indexArr[zeroIdx--];
    for (kk = matSizeN - 1; kk > 0; kk--)
    {
        if (kk > startIdx)
        {
            tempVal = FABSF(inMatArr[(kk - 1) * (matSizeN + 1) + 1]);
            sum = FABSF(inMatArr[(kk - 1) * (matSizeN + 1)]) + FABSF(inMatArr[kk * (matSizeN + 1)]);
            delta_count = 0;
            while ((tempVal > TOL_QR_STEP * sum) && (delta_count < WILK_IT_MAX))
            {
                unsym_QR_step_Wilkinson(inMatArr, matSizeN, kk, startIdx);
                delta_count++;
                tempVal = FABSF(inMatArr[(kk - 1) * (matSizeN + 1) + 1]);
                sum = FABSF(inMatArr[(kk - 1) * (matSizeN + 1)]) + FABSF(inMatArr[kk * (matSizeN + 1)]);
            }
            count += delta_count;

            if (kk > 2)
            {
                // find newly decoupled blocks
                indexArr[0] = 0;
                zeroIdx = 0;
                for (ii = 1; ii < kk; ii++)
                {
                    tempVal = FABSF(inMatArr[(ii - 1) * (matSizeN + 1) + 1]);
                    sum = FABSF(inMatArr[(ii - 1) * (matSizeN + 1)]) + FABSF(inMatArr[ii * (matSizeN + 1)]);
                    if (tempVal < TOL_QR_STEP * sum)
                    { // find decoupled blocks
                        zeroIdx++;
                        indexArr[zeroIdx] = ii;
                    }
                }
                startIdx = indexArr[zeroIdx--];
            }
        }
        else
        {
            if (startIdx > 0)
            {
                startIdx = indexArr[zeroIdx--];
            }
        }
    }
    for (ii = 0; ii < matSizeN; ii++)
    {
        eigVals[ii] = inMatArr[ii * (matSizeN + 1)];
    }

    return 0;
}

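/*
 * Usage sketch (illustrative; names are assumptions): unlike the symmetric
 * path, inMatArr here is a general matSizeN x matSizeN matrix stored
 * column-major with stride matSizeN (not MAX_MAT_SIZE); it is reduced to
 * Hessenberg/quasi-triangular form in place. resD needs 4*MAX_MAT_SIZE
 * floats and receives the eigenvalues in resD[0..matSizeN-1]; for a complex
 * pair only the common real part is reported (see unsym_QR_step_Wilkinson).
 *
 *     float work[4 * MAX_MAT_SIZE];
 *     unsym_QR_algorithm(A, work, n);
 */
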
#endif /* CONFIG_WLS_CSI_PROC */