1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_cholesky_f64.c
4  * Description:  Floating-point Cholesky decomposition
5  *
6  * $Date:        10 August 2022
7  * $Revision:    V1.9.1
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 #include "dsp/matrix_utils.h"
31 
32 /**
33   @ingroup groupMatrix
34  */
35 
36 /**
37   @addtogroup MatrixChol
38   @{
39  */
40 
41 /**
42    * @brief Floating-point Cholesky decomposition of positive-definite matrix.
43    * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
44    * @param[out] pDst   points to the instance of the output floating-point matrix structure.
45    * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
46    * @return        execution status
47                    - \ref ARM_MATH_SUCCESS       : Operation successful
48                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
49                    - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed
50    * @par
51    * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
52    * The decomposition of A is returning a lower triangular matrix L such that A = L L^t
53    *
54    * @par
55    * The destination matrix should be set to 0 before calling the functions because
56    * the function may not overwrite all output elements.
57    */
58 
59 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
60 
arm_mat_cholesky_f64(const arm_matrix_instance_f64 * pSrc,arm_matrix_instance_f64 * pDst)61 ARM_DSP_ATTRIBUTE arm_status arm_mat_cholesky_f64(
62     const arm_matrix_instance_f64 * pSrc,
63     arm_matrix_instance_f64 * pDst)
64 {
65 
66     arm_status status;                             /* status of matrix inverse */
67 
68 
69 #ifdef ARM_MATH_MATRIX_CHECK
70 
71     /* Check for matrix mismatch condition */
72     if ((pSrc->numRows != pSrc->numCols) ||
73         (pDst->numRows != pDst->numCols) ||
74         (pSrc->numRows != pDst->numRows)   )
75     {
76         /* Set status as ARM_MATH_SIZE_MISMATCH */
77         status = ARM_MATH_SIZE_MISMATCH;
78     }
79     else
80 
81 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
82 
83     {
84         int i,j,k;
85         int n = pSrc->numRows;
86         float64_t invSqrtVj;
87         float64_t *pA,*pG;
88         int kCnt;
89 
90 
91         float64x2_t acc, acc0, acc1, acc2, acc3;
92         float64x2_t vecGi;
93         float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
94 
95 
96         float64_t sum=0.0;
97         float64_t sum0=0.0,sum1=0.0,sum2=0.0,sum3=0.0;
98 
99 
100         pA = pSrc->pData;
101         pG = pDst->pData;
102 
103         for(i=0 ;i < n ; i++)
104         {
105             for(j=i ; j+3 < n ; j+=4)
106             {
107                 pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
108                 pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
109                 pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
110                 pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
111 
112                 acc0 = vdupq_n_f64(0.0);
113                 acc1 = vdupq_n_f64(0.0);
114                 acc2 = vdupq_n_f64(0.0);
115                 acc3 = vdupq_n_f64(0.0);
116 
117                 kCnt = i >> 1U;
118                 k=0;
119                 while(kCnt > 0)
120                 {
121 
122                     vecGi=vld1q_f64(&pG[i * n + k]);
123 
124                     vecGj0=vld1q_f64(&pG[(j + 0) * n + k]);
125                     vecGj1=vld1q_f64(&pG[(j + 1) * n + k]);
126                     vecGj2=vld1q_f64(&pG[(j + 2) * n + k]);
127                     vecGj3=vld1q_f64(&pG[(j + 3) * n + k]);
128 
129                     acc0 = vfmaq_f64(acc0, vecGi, vecGj0);
130                     acc1 = vfmaq_f64(acc1, vecGi, vecGj1);
131                     acc2 = vfmaq_f64(acc2, vecGi, vecGj2);
132                     acc3 = vfmaq_f64(acc3, vecGi, vecGj3);
133 
134                     kCnt--;
135                     k+=2;
136                 }
137 
138 
139                 sum0 = vaddvq_f64(acc0);
140                 sum1 = vaddvq_f64(acc1);
141                 sum2 = vaddvq_f64(acc2);
142                 sum3 = vaddvq_f64(acc3);
143 
144 
145                 kCnt = i & 1;
146                 while(kCnt > 0)
147                 {
148 
149                     sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
150                     sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
151                     sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
152                     sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
153                     kCnt--;
154                     k++;
155                 }
156 
157                 pG[(j + 0) * n + i] -= sum0;
158                 pG[(j + 1) * n + i] -= sum1;
159                 pG[(j + 2) * n + i] -= sum2;
160                 pG[(j + 3) * n + i] -= sum3;
161             }
162 
163             for(; j < n ; j++)
164             {
165                 pG[j * n + i] = pA[j * n + i];
166 
167                 acc = vdupq_n_f64(0.0);
168 
169                 kCnt = i >> 1U;
170                 k=0;
171                 while(kCnt > 0)
172                 {
173 
174                     vecGi=vld1q_f64(&pG[i * n + k]);
175                     vecGj=vld1q_f64(&pG[j * n + k]);
176 
177                     acc = vfmaq_f64(acc, vecGi, vecGj);
178 
179                     kCnt--;
180                     k+=2;
181                 }
182 
183 
184                 sum = vaddvq_f64(acc);
185 
186                 kCnt = i & 1;
187                 while(kCnt > 0)
188                 {
189                     sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
190 
191 
192                     kCnt--;
193                     k++;
194                 }
195 
196                 pG[j * n + i] -= sum;
197             }
198 
199             if (pG[i * n + i] <= 0.0)
200             {
201                 return(ARM_MATH_DECOMPOSITION_FAILURE);
202             }
203 
204             invSqrtVj = 1.0/sqrt(pG[i * n + i]);
205             SCALE_COL_F64(pDst,i,invSqrtVj,i);
206         }
207 
208         status = ARM_MATH_SUCCESS;
209 
210     }
211 
212 
213     /* Return to application */
214     return (status);
215 }
216 #else
arm_mat_cholesky_f64(const arm_matrix_instance_f64 * pSrc,arm_matrix_instance_f64 * pDst)217 ARM_DSP_ATTRIBUTE arm_status arm_mat_cholesky_f64(
218                                 const arm_matrix_instance_f64 * pSrc,
219                                 arm_matrix_instance_f64 * pDst)
220 {
221 
222     arm_status status;                             /* status of matrix inverse */
223 
224 
225 #ifdef ARM_MATH_MATRIX_CHECK
226 
227     /* Check for matrix mismatch condition */
228     if ((pSrc->numRows != pSrc->numCols) ||
229         (pDst->numRows != pDst->numCols) ||
230         (pSrc->numRows != pDst->numRows)   )
231     {
232         /* Set status as ARM_MATH_SIZE_MISMATCH */
233         status = ARM_MATH_SIZE_MISMATCH;
234     }
235     else
236 
237 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
238 
239     {
240         int i,j,k;
241         int n = pSrc->numRows;
242         float64_t invSqrtVj;
243         float64_t *pA,*pG;
244 
245         pA = pSrc->pData;
246         pG = pDst->pData;
247 
248 
249         for(i=0 ; i < n ; i++)
250         {
251             for(j=i ; j < n ; j++)
252             {
253                 pG[j * n + i] = pA[j * n + i];
254 
255                 for(k=0; k < i ; k++)
256                 {
257                     pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
258                 }
259             }
260 
261             if (pG[i * n + i] <= 0.0)
262             {
263                 return(ARM_MATH_DECOMPOSITION_FAILURE);
264             }
265 
266             invSqrtVj = 1.0/sqrt(pG[i * n + i]);
267             SCALE_COL_F64(pDst,i,invSqrtVj,i);
268 
269         }
270 
271         status = ARM_MATH_SUCCESS;
272 
273     }
274 
275 
276     /* Return to application */
277     return (status);
278 }
279 #endif
280 
281 /**
282   @} end of MatrixChol group
283  */
284