1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_inverse_f32.c
4  * Description:  Floating-point matrix inverse
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 
31 
32 /**
33   @ingroup groupMatrix
34  */
35 
36 /**
37   @defgroup MatrixInv Matrix Inverse
38 
39   Computes the inverse of a matrix.
40 
41   The inverse is defined only if the input matrix is square and non-singular (the determinant is non-zero).
42   The function checks that the input and output matrices are square and of the same size.
43 
44   Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
45   inversion of floating-point matrices.
46 
47   @par Algorithm
48   The Gauss-Jordan method is used to find the inverse.
49   The algorithm performs a sequence of elementary row-operations until it
50   reduces the input matrix to an identity matrix. Applying the same sequence
51   of elementary row-operations to an identity matrix yields the inverse matrix.
52   If the input matrix is singular, then the algorithm terminates and returns error status
53   <code>ARM_MATH_SINGULAR</code>.
54   \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
55  */
56 
57 /**
58   @addtogroup MatrixInv
59   @{
60  */
61 
62 /**
63   @brief         Floating-point matrix inverse.
64   @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
65   @param[out]    pDst      points to output matrix structure
66   @return        execution status
67                    - \ref ARM_MATH_SUCCESS       : Operation successful
68                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
69                    - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
70  */
71 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
72 
arm_mat_inverse_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)73 arm_status arm_mat_inverse_f32(
74   const arm_matrix_instance_f32 * pSrc,
75   arm_matrix_instance_f32 * pDst)
76 {
77     float32_t *pIn = pSrc->pData;   /* input data matrix pointer */
78     float32_t *pOut = pDst->pData;  /* output data matrix pointer */
79     float32_t *pInT1, *pInT2;   /* Temporary input data matrix pointer */
80     float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
81     float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;    /* Temporary input and output data matrix pointer */
82 
83     uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix  */
84     uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
85     float32_t *pTmpA, *pTmpB;
86 
87     float32_t in = 0.0f;        /* Temporary input values  */
88     uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
89     arm_status status;          /* status of matrix inverse */
90     uint32_t  blkCnt;
91 
92 #ifdef ARM_MATH_MATRIX_CHECK
93    /* Check for matrix mismatch condition */
94   if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
95      || (pSrc->numRows != pDst->numRows))
96   {
97     /* Set status as ARM_MATH_SIZE_MISMATCH */
98     status = ARM_MATH_SIZE_MISMATCH;
99   }
100   else
101 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
102   {
103 
104     /*--------------------------------------------------------------------------------------------------------------
105      * Matrix Inverse can be solved using elementary row operations.
106      *
107      *  Gauss-Jordan Method:
108      *
109      *     1. First combine the identity matrix and the input matrix separated by a bar to form an
110      *        augmented matrix as follows:
111      *                      _  _          _     _      _   _         _         _
112      *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
113      *                     |  |            | | |        |   |   =   |           |
114      *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
115      *
116      *      2. In our implementation, pDst Matrix is used as identity matrix.
117      *
118      *      3. Begin with the first row. Let i = 1.
119      *
120      *      4. Check to see if the pivot for row i is zero.
121      *         The pivot is the element of the main diagonal that is on the current row.
122      *         For instance, if working with row i, then the pivot element is aii.
123      *         If the pivot is zero, exchange that row with a row below it that does not
124      *         contain a zero in column i. If this is not possible, then an inverse
125      *         to that matrix does not exist.
126      *
127      *      5. Divide every element of row i by the pivot.
128      *
129      *      6. For every row below and  row i, replace that row with the sum of that row and
130      *         a multiple of row i so that each new element in column i below row i is zero.
131      *
132      *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
133      *         for every element below and above the main diagonal.
134      *
135      *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
136      *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
137      *----------------------------------------------------------------------------------------------------------------*/
138 
139         /*
140          * Working pointer for destination matrix
141          */
142         pOutT1 = pOut;
143         /*
144          * Loop over the number of rows
145          */
146         rowCnt = numRows;
147         /*
148          * Making the destination matrix as identity matrix
149          */
150         while (rowCnt > 0U)
151         {
152             /*
153              * Writing all zeroes in lower triangle of the destination matrix
154              */
155             j = numRows - rowCnt;
156             while (j > 0U)
157             {
158                 *pOutT1++ = 0.0f;
159                 j--;
160             }
161             /*
162              * Writing all ones in the diagonal of the destination matrix
163              */
164             *pOutT1++ = 1.0f;
165             /*
166              * Writing all zeroes in upper triangle of the destination matrix
167              */
168             j = rowCnt - 1U;
169             while (j > 0U)
170             {
171                 *pOutT1++ = 0.0f;
172                 j--;
173             }
174             /*
175              * Decrement the loop counter
176              */
177             rowCnt--;
178         }
179 
180         /*
181          * Loop over the number of columns of the input matrix.
182          * All the elements in each column are processed by the row operations
183          */
184         loopCnt = numCols;
185         /*
186          * Index modifier to navigate through the columns
187          */
188         l = 0U;
189         while (loopCnt > 0U)
190         {
191             /*
192              * Check if the pivot element is zero..
193              * If it is zero then interchange the row with non zero row below.
194              * If there is no non zero element to replace in the rows below,
195              * then the matrix is Singular.
196              */
197 
198             /*
199              * Working pointer for the input matrix that points
200              * * to the pivot element of the particular row
201              */
202             pInT1 = pIn + (l * numCols);
203             /*
204              * Working pointer for the destination matrix that points
205              * * to the pivot element of the particular row
206              */
207             pOutT1 = pOut + (l * numCols);
208             /*
209              * Temporary variable to hold the pivot value
210              */
211             in = *pInT1;
212 
213 
214             /*
215              * Check if the pivot element is zero
216              */
217             if (*pInT1 == 0.0f)
218             {
219                 /*
220                  * Loop over the number rows present below
221                  */
222                 for (i = 1U; i < numRows-l; i++)
223                 {
224                     /*
225                      * Update the input and destination pointers
226                      */
227                     pInT2 = pInT1 + (numCols * i);
228                     pOutT2 = pOutT1 + (numCols * i);
229                     /*
230                      * Check if there is a non zero pivot element to
231                      * * replace in the rows below
232                      */
233                     if (*pInT2 != 0.0f)
234                     {
235                         f32x4_t vecA, vecB;
236                         /*
237                          * Loop over number of columns
238                          * * to the right of the pilot element
239                          */
240                         pTmpA = pInT1;
241                         pTmpB = pInT2;
242                         blkCnt = (numCols - l) >> 2;
243                         while (blkCnt > 0U)
244                         {
245 
246                             vecA = vldrwq_f32(pTmpA);
247                             vecB = vldrwq_f32(pTmpB);
248                             vstrwq_f32(pTmpB, vecA);
249                             vstrwq_f32(pTmpA, vecB);
250 
251                             pTmpA += 4;
252                             pTmpB += 4;
253                             /*
254                              * Decrement the blockSize loop counter
255                              */
256                             blkCnt--;
257                         }
258                         /*
259                          * tail
260                          * (will be merged thru tail predication)
261                          */
262                         blkCnt = (numCols - l) & 3;
263                         if (blkCnt > 0U)
264                         {
265                             mve_pred16_t p0 = vctp32q(blkCnt);
266 
267                             vecA = vldrwq_f32(pTmpA);
268                             vecB = vldrwq_f32(pTmpB);
269                             vstrwq_p_f32(pTmpB, vecA, p0);
270                             vstrwq_p_f32(pTmpA, vecB, p0);
271                         }
272 
273                         pInT1 += numCols - l;
274                         pInT2 += numCols - l;
275                         pTmpA = pOutT1;
276                         pTmpB = pOutT2;
277                         blkCnt = numCols >> 2;
278                         while (blkCnt > 0U)
279                         {
280 
281                             vecA = vldrwq_f32(pTmpA);
282                             vecB = vldrwq_f32(pTmpB);
283                             vstrwq_f32(pTmpB, vecA);
284                             vstrwq_f32(pTmpA, vecB);
285                             pTmpA += 4;
286                             pTmpB += 4;
287                             /*
288                              * Decrement the blockSize loop counter
289                              */
290                             blkCnt--;
291                         }
292                         /*
293                          * tail
294                          */
295                         blkCnt = numCols & 3;
296                         if (blkCnt > 0U)
297                         {
298                             mve_pred16_t p0 = vctp32q(blkCnt);
299 
300                             vecA = vldrwq_f32(pTmpA);
301                             vecB = vldrwq_f32(pTmpB);
302                             vstrwq_p_f32(pTmpB, vecA, p0);
303                             vstrwq_p_f32(pTmpA, vecB, p0);
304                         }
305 
306                         pOutT1 += numCols;
307                         pOutT2 += numCols;
308                         /*
309                          * Flag to indicate whether exchange is done or not
310                          */
311                         flag = 1U;
312 
313                         /*
314                          * Break after exchange is done
315                          */
316                         break;
317                     }
318 
319                 }
320             }
321 
322             /*
323              * Update the status if the matrix is singular
324              */
325             if ((flag != 1U) && (in == 0.0f))
326             {
327                 return ARM_MATH_SINGULAR;
328             }
329 
330             /*
331              * Points to the pivot row of input and destination matrices
332              */
333             pPivotRowIn = pIn + (l * numCols);
334             pPivotRowDst = pOut + (l * numCols);
335 
336             /*
337              * Temporary pointers to the pivot row pointers
338              */
339             pInT1 = pPivotRowIn;
340             pOutT1 = pPivotRowDst;
341 
342             /*
343              * Pivot element of the row
344              */
345             in = *(pIn + (l * numCols));
346 
347             pTmpA = pInT1;
348 
349             f32x4_t invIn = vdupq_n_f32(1.0f / in);
350 
351             blkCnt = (numCols - l) >> 2;
352             f32x4_t vecA;
353             while (blkCnt > 0U)
354             {
355                 *(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA * invIn;
356                 pTmpA += 4;
357                 /*
358                  * Decrement the blockSize loop counter
359                  */
360                 blkCnt--;
361             }
362             /*
363              * tail
364              */
365             blkCnt = (numCols - l) & 3;
366             if (blkCnt > 0U)
367             {
368                 mve_pred16_t p0 = vctp32q(blkCnt);
369 
370 
371                 vecA = vldrwq_f32(pTmpA);
372                 vecA = vecA * invIn;
373                 vstrwq_p_f32(pTmpA, vecA, p0);
374             }
375 
376             pInT1 += numCols - l;
377             /*
378              * Loop over number of columns
379              * * to the right of the pilot element
380              */
381 
382             pTmpA = pOutT1;
383             blkCnt = numCols >> 2;
384             while (blkCnt > 0U)
385             {
386                 *(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA *invIn;
387                 pTmpA += 4;
388                 /*
389                  * Decrement the blockSize loop counter
390                  */
391                 blkCnt--;
392             }
393             /*
394              * tail
395              * (will be merged thru tail predication)
396              */
397             blkCnt = numCols & 3;
398             if (blkCnt > 0U)
399             {
400                 mve_pred16_t p0 = vctp32q(blkCnt);
401 
402                 vecA = vldrwq_f32(pTmpA);
403                 vecA = vecA * invIn;
404                 vstrwq_p_f32(pTmpA, vecA, p0);
405             }
406 
407             pOutT1 += numCols;
408 
409             /*
410              * Replace the rows with the sum of that row and a multiple of row i
411              * * so that each new element in column i above row i is zero.
412              */
413 
414             /*
415              * Temporary pointers for input and destination matrices
416              */
417             pInT1 = pIn;
418             pOutT1 = pOut;
419 
420             for (i = 0U; i < numRows; i++)
421             {
422                 /*
423                  * Check for the pivot element
424                  */
425                 if (i == l)
426                 {
427                     /*
428                      * If the processing element is the pivot element,
429                      * only the columns to the right are to be processed
430                      */
431                     pInT1 += numCols - l;
432                     pOutT1 += numCols;
433                 }
434                 else
435                 {
436                     /*
437                      * Element of the reference row
438                      */
439 
440                     /*
441                      * Working pointers for input and destination pivot rows
442                      */
443                     pPRT_in = pPivotRowIn;
444                     pPRT_pDst = pPivotRowDst;
445                     /*
446                      * Loop over the number of columns to the right of the pivot element,
447                      * to replace the elements in the input matrix
448                      */
449 
450                     in = *pInT1;
451                     f32x4_t tmpV = vdupq_n_f32(in);
452 
453                     blkCnt = (numCols - l) >> 2;
454                     while (blkCnt > 0U)
455                     {
456                         f32x4_t vec1, vec2;
457                         /*
458                          * Replace the element by the sum of that row
459                          * and a multiple of the reference row
460                          */
461                         vec1 = vldrwq_f32(pInT1);
462                         vec2 = vldrwq_f32(pPRT_in);
463                         vec1 = vfmsq_f32(vec1, tmpV, vec2);
464                         vstrwq_f32(pInT1, vec1);
465                         pPRT_in += 4;
466                         pInT1 += 4;
467                         /*
468                          * Decrement the blockSize loop counter
469                          */
470                         blkCnt--;
471                     }
472                     /*
473                      * tail
474                      * (will be merged thru tail predication)
475                      */
476                     blkCnt = (numCols - l) & 3;
477                     if (blkCnt > 0U)
478                     {
479                         f32x4_t vec1, vec2;
480                         mve_pred16_t p0 = vctp32q(blkCnt);
481 
482                         vec1 = vldrwq_f32(pInT1);
483                         vec2 = vldrwq_f32(pPRT_in);
484                         vec1 = vfmsq_f32(vec1, tmpV, vec2);
485                         vstrwq_p_f32(pInT1, vec1, p0);
486                         pInT1 += blkCnt;
487                     }
488 
489                     blkCnt = numCols >> 2;
490                     while (blkCnt > 0U)
491                     {
492                         f32x4_t vec1, vec2;
493 
494                         /*
495                          * Replace the element by the sum of that row
496                          * and a multiple of the reference row
497                          */
498                         vec1 = vldrwq_f32(pOutT1);
499                         vec2 = vldrwq_f32(pPRT_pDst);
500                         vec1 = vfmsq_f32(vec1, tmpV, vec2);
501                         vstrwq_f32(pOutT1, vec1);
502                         pPRT_pDst += 4;
503                         pOutT1 += 4;
504                         /*
505                          * Decrement the blockSize loop counter
506                          */
507                         blkCnt--;
508                     }
509                     /*
510                      * tail
511                      * (will be merged thru tail predication)
512                      */
513                     blkCnt = numCols & 3;
514                     if (blkCnt > 0U)
515                     {
516                         f32x4_t vec1, vec2;
517                         mve_pred16_t p0 = vctp32q(blkCnt);
518 
519                         vec1 = vldrwq_f32(pOutT1);
520                         vec2 = vldrwq_f32(pPRT_pDst);
521                         vec1 = vfmsq_f32(vec1, tmpV, vec2);
522                         vstrwq_p_f32(pOutT1, vec1, p0);
523 
524                         pInT2 += blkCnt;
525                         pOutT1 += blkCnt;
526                     }
527                 }
528                 /*
529                  * Increment the temporary input pointer
530                  */
531                 pInT1 = pInT1 + l;
532             }
533             /*
534              * Increment the input pointer
535              */
536             pIn++;
537             /*
538              * Decrement the loop counter
539              */
540             loopCnt--;
541             /*
542              * Increment the index modifier
543              */
544             l++;
545         }
546 
547         /*
548          * Set status as ARM_MATH_SUCCESS
549          */
550         status = ARM_MATH_SUCCESS;
551 
552         if ((flag != 1U) && (in == 0.0f))
553         {
554             pIn = pSrc->pData;
555             for (i = 0; i < numRows * numCols; i++)
556             {
557                 if (pIn[i] != 0.0f)
558                     break;
559             }
560 
561             if (i == numRows * numCols)
562                 status = ARM_MATH_SINGULAR;
563         }
564   }
565   /* Return to application */
566   return (status);
567 }
568 
569 #else
570 #if defined(ARM_MATH_NEON)
arm_mat_inverse_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)571 arm_status arm_mat_inverse_f32(
572   const arm_matrix_instance_f32 * pSrc,
573   arm_matrix_instance_f32 * pDst)
574 {
575   float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
576   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
577   float32_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
578   float32_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
579   float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
580   uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
581   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
582 
583 
584   float32_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
585   uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
586   arm_status status;                             /* status of matrix inverse */
587   float32x4_t vec1;
588   float32x4_t vec2;
589   float32x4_t tmpV;
590 
591 #ifdef ARM_MATH_MATRIX_CHECK
592 
593   /* Check for matrix mismatch condition */
594   if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
595      || (pSrc->numRows != pDst->numRows))
596   {
597     /* Set status as ARM_MATH_SIZE_MISMATCH */
598     status = ARM_MATH_SIZE_MISMATCH;
599   }
600   else
601 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
602 
603   {
604    /*--------------------------------------------------------------------------------------------------------------
605    * Matrix Inverse can be solved using elementary row operations.
606    *
607    *  Gauss-Jordan Method:
608    *
609    *     1. First combine the identity matrix and the input matrix separated by a bar to form an
610    *        augmented matrix as follows:
611    *              _                  _         _         _
612    *             |  a11  a12 | 1   0  |       |  X11 X12  |
613    *             |           |        |   =   |           |
614    *             |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
615    *
616    *    2. In our implementation, pDst Matrix is used as identity matrix.
617    *
618    *    3. Begin with the first row. Let i = 1.
619    *
620    *    4. Check to see if the pivot for row i is zero.
621    *       The pivot is the element of the main diagonal that is on the current row.
622    *       For instance, if working with row i, then the pivot element is aii.
623    *       If the pivot is zero, exchange that row with a row below it that does not
624    *       contain a zero in column i. If this is not possible, then an inverse
625    *       to that matrix does not exist.
626    *
627    *      5. Divide every element of row i by the pivot.
628    *
629    *      6. For every row below and  row i, replace that row with the sum of that row and
630    *       a multiple of row i so that each new element in column i below row i is zero.
631    *
632    *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
633    *       for every element below and above the main diagonal.
634    *
635    *    8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
636    *       Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
637    *----------------------------------------------------------------------------------------------------------------*/
638 
639     /* Working pointer for destination matrix */
640     pOutT1 = pOut;
641 
642     /* Loop over the number of rows */
643     rowCnt = numRows;
644 
645     /* Making the destination matrix as identity matrix */
646     while (rowCnt > 0U)
647     {
648       /* Writing all zeroes in lower triangle of the destination matrix */
649       j = numRows - rowCnt;
650       while (j > 0U)
651       {
652         *pOutT1++ = 0.0f;
653         j--;
654       }
655 
656       /* Writing all ones in the diagonal of the destination matrix */
657       *pOutT1++ = 1.0f;
658 
659       /* Writing all zeroes in upper triangle of the destination matrix */
660       j = rowCnt - 1U;
661 
662       while (j > 0U)
663       {
664         *pOutT1++ = 0.0f;
665         j--;
666       }
667 
668       /* Decrement the loop counter */
669       rowCnt--;
670     }
671 
672     /* Loop over the number of columns of the input matrix.
673        All the elements in each column are processed by the row operations */
674     loopCnt = numCols;
675 
676     /* Index modifier to navigate through the columns */
677     l = 0U;
678 
679     while (loopCnt > 0U)
680     {
681       /* Check if the pivot element is zero..
682        * If it is zero then interchange the row with non zero row below.
683        * If there is no non zero element to replace in the rows below,
684        * then the matrix is Singular. */
685 
686       /* Working pointer for the input matrix that points
687        * to the pivot element of the particular row  */
688       pInT1 = pIn + (l * numCols);
689 
690       /* Working pointer for the destination matrix that points
691        * to the pivot element of the particular row  */
692       pOutT1 = pOut + (l * numCols);
693 
694       /* Temporary variable to hold the pivot value */
695       in = *pInT1;
696 
697       /* Check if the pivot element is zero */
698       if (*pInT1 == 0.0f)
699       {
700         /* Loop over the number rows present below */
701         for (i = 1U; i < numRows - l; i++)
702         {
703           /* Update the input and destination pointers */
704           pInT2 = pInT1 + (numCols * i);
705           pOutT2 = pOutT1 + (numCols * i);
706 
707           /* Check if there is a non zero pivot element to
708            * replace in the rows below */
709           if (*pInT2 != 0.0f)
710           {
711             /* Loop over number of columns
712              * to the right of the pilot element */
713             j = numCols - l;
714 
715             while (j > 0U)
716             {
717               /* Exchange the row elements of the input matrix */
718               Xchg = *pInT2;
719               *pInT2++ = *pInT1;
720               *pInT1++ = Xchg;
721 
722               /* Decrement the loop counter */
723               j--;
724             }
725 
726             /* Loop over number of columns of the destination matrix */
727             j = numCols;
728 
729             while (j > 0U)
730             {
731               /* Exchange the row elements of the destination matrix */
732               Xchg = *pOutT2;
733               *pOutT2++ = *pOutT1;
734               *pOutT1++ = Xchg;
735 
736               /* Decrement the loop counter */
737               j--;
738             }
739 
740             /* Flag to indicate whether exchange is done or not */
741             flag = 1U;
742 
743             /* Break after exchange is done */
744             break;
745           }
746 
747 
748         }
749       }
750 
751       /* Update the status if the matrix is singular */
752       if ((flag != 1U) && (in == 0.0f))
753       {
754         return ARM_MATH_SINGULAR;
755       }
756 
757       /* Points to the pivot row of input and destination matrices */
758       pPivotRowIn = pIn + (l * numCols);
759       pPivotRowDst = pOut + (l * numCols);
760 
761       /* Temporary pointers to the pivot row pointers */
762       pInT1 = pPivotRowIn;
763       pInT2 = pPivotRowDst;
764 
765       /* Pivot element of the row */
766       in = *pPivotRowIn;
767       tmpV = vdupq_n_f32(1.0f/in);
768 
769       /* Loop over number of columns
770        * to the right of the pilot element */
771       j = (numCols - l) >> 2;
772 
773       while (j > 0U)
774       {
775         /* Divide each element of the row of the input matrix
776          * by the pivot element */
777         vec1 = vld1q_f32(pInT1);
778 
779         vec1 = vmulq_f32(vec1, tmpV);
780         vst1q_f32(pInT1, vec1);
781         pInT1 += 4;
782 
783         /* Decrement the loop counter */
784         j--;
785       }
786 
787       /* Tail */
788       j = (numCols - l) & 3;
789 
790       while (j > 0U)
791       {
792         /* Divide each element of the row of the input matrix
793          * by the pivot element */
794         in1 = *pInT1;
795         *pInT1++ = in1 / in;
796 
797         /* Decrement the loop counter */
798         j--;
799       }
800 
801       /* Loop over number of columns of the destination matrix */
802       j = numCols >> 2;
803 
804       while (j > 0U)
805       {
806         /* Divide each element of the row of the destination matrix
807          * by the pivot element */
808         vec1 = vld1q_f32(pInT2);
809 
810         vec1 = vmulq_f32(vec1, tmpV);
811         vst1q_f32(pInT2, vec1);
812         pInT2 += 4;
813 
814         /* Decrement the loop counter */
815         j--;
816       }
817 
818       /* Tail */
819       j = numCols & 3;
820 
821       while (j > 0U)
822       {
823         /* Divide each element of the row of the destination matrix
824          * by the pivot element */
825         in1 = *pInT2;
826         *pInT2++ = in1 / in;
827 
828         /* Decrement the loop counter */
829         j--;
830       }
831 
832       /* Replace the rows with the sum of that row and a multiple of row i
833        * so that each new element in column i above row i is zero.*/
834 
835       /* Temporary pointers for input and destination matrices */
836       pInT1 = pIn;
837       pInT2 = pOut;
838 
839       /* index used to check for pivot element */
840       i = 0U;
841 
842       /* Loop over number of rows */
843       /*  to be replaced by the sum of that row and a multiple of row i */
844       k = numRows;
845 
846       while (k > 0U)
847       {
848         /* Check for the pivot element */
849         if (i == l)
850         {
851           /* If the processing element is the pivot element,
852              only the columns to the right are to be processed */
853           pInT1 += numCols - l;
854 
855           pInT2 += numCols;
856         }
857         else
858         {
859           /* Element of the reference row */
860           in = *pInT1;
861           tmpV = vdupq_n_f32(in);
862 
863           /* Working pointers for input and destination pivot rows */
864           pPRT_in = pPivotRowIn;
865           pPRT_pDst = pPivotRowDst;
866 
867           /* Loop over the number of columns to the right of the pivot element,
868              to replace the elements in the input matrix */
869           j = (numCols - l) >> 2;
870 
871           while (j > 0U)
872           {
873             /* Replace the element by the sum of that row
874                and a multiple of the reference row  */
875             vec1 = vld1q_f32(pInT1);
876             vec2 = vld1q_f32(pPRT_in);
877             vec1 = vmlsq_f32(vec1, tmpV, vec2);
878             vst1q_f32(pInT1, vec1);
879             pPRT_in += 4;
880             pInT1 += 4;
881 
882             /* Decrement the loop counter */
883             j--;
884           }
885 
886 	  /* Tail */
887           j = (numCols - l) & 3;
888 
889           while (j > 0U)
890           {
891             /* Replace the element by the sum of that row
892                and a multiple of the reference row  */
893             in1 = *pInT1;
894             *pInT1++ = in1 - (in * *pPRT_in++);
895 
896             /* Decrement the loop counter */
897             j--;
898           }
899 
900           /* Loop over the number of columns to
901              replace the elements in the destination matrix */
902           j = numCols >> 2;
903 
904           while (j > 0U)
905           {
906             /* Replace the element by the sum of that row
907                and a multiple of the reference row  */
908             vec1 = vld1q_f32(pInT2);
909             vec2 = vld1q_f32(pPRT_pDst);
910             vec1 = vmlsq_f32(vec1, tmpV, vec2);
911             vst1q_f32(pInT2, vec1);
912             pPRT_pDst += 4;
913             pInT2 += 4;
914 
915             /* Decrement the loop counter */
916             j--;
917           }
918 
919 	  /* Tail */
920           j = numCols & 3;
921 
922           while (j > 0U)
923           {
924             /* Replace the element by the sum of that row
925                and a multiple of the reference row  */
926             in1 = *pInT2;
927             *pInT2++ = in1 - (in * *pPRT_pDst++);
928 
929             /* Decrement the loop counter */
930             j--;
931           }
932 
933         }
934 
935         /* Increment the temporary input pointer */
936         pInT1 = pInT1 + l;
937 
938         /* Decrement the loop counter */
939         k--;
940 
941         /* Increment the pivot index */
942         i++;
943       }
944 
945       /* Increment the input pointer */
946       pIn++;
947 
948       /* Decrement the loop counter */
949       loopCnt--;
950 
951       /* Increment the index modifier */
952       l++;
953     }
954 
955     /* Set status as ARM_MATH_SUCCESS */
956     status = ARM_MATH_SUCCESS;
957 
958     if ((flag != 1U) && (in == 0.0f))
959     {
960       pIn = pSrc->pData;
961       for (i = 0; i < numRows * numCols; i++)
962       {
963         if (pIn[i] != 0.0f)
964             break;
965       }
966 
967       if (i == numRows * numCols)
968         status = ARM_MATH_SINGULAR;
969     }
970   }
971   /* Return to application */
972   return (status);
973 }
974 #else
arm_mat_inverse_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)975 arm_status arm_mat_inverse_f32(
976   const arm_matrix_instance_f32 * pSrc,
977         arm_matrix_instance_f32 * pDst)
978 {
979   float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
980   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
981   float32_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
982   float32_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
983   float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
984   uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
985   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
986 
987 #if defined (ARM_MATH_DSP)
988 
989   float32_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
990   uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
991   arm_status status;                             /* status of matrix inverse */
992 
993 #ifdef ARM_MATH_MATRIX_CHECK
994 
995   /* Check for matrix mismatch condition */
996   if ((pSrc->numRows != pSrc->numCols) ||
997       (pDst->numRows != pDst->numCols) ||
998       (pSrc->numRows != pDst->numRows)   )
999   {
1000     /* Set status as ARM_MATH_SIZE_MISMATCH */
1001     status = ARM_MATH_SIZE_MISMATCH;
1002   }
1003   else
1004 
1005 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
1006 
1007   {
1008 
1009     /*--------------------------------------------------------------------------------------------------------------
1010      * Matrix Inverse can be solved using elementary row operations.
1011      *
1012      *  Gauss-Jordan Method:
1013      *
1014      *      1. First combine the identity matrix and the input matrix separated by a bar to form an
1015      *        augmented matrix as follows:
1016      *                      _                  _         _         _
1017      *                     |  a11  a12 | 1   0  |       |  X11 X12  |
1018      *                     |           |        |   =   |           |
1019      *                     |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
1020      *
1021      *      2. In our implementation, pDst Matrix is used as identity matrix.
1022      *
1023      *      3. Begin with the first row. Let i = 1.
1024      *
1025      *      4. Check to see if the pivot for row i is zero.
1026      *         The pivot is the element of the main diagonal that is on the current row.
1027      *         For instance, if working with row i, then the pivot element is aii.
1028      *         If the pivot is zero, exchange that row with a row below it that does not
1029      *         contain a zero in column i. If this is not possible, then an inverse
1030      *         to that matrix does not exist.
1031      *
1032      *      5. Divide every element of row i by the pivot.
1033      *
1034      *      6. For every row below and  row i, replace that row with the sum of that row and
1035      *         a multiple of row i so that each new element in column i below row i is zero.
1036      *
1037      *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
1038      *         for every element below and above the main diagonal.
1039      *
1040      *      8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
1041      *         Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
1042      *----------------------------------------------------------------------------------------------------------------*/
1043 
1044     /* Working pointer for destination matrix */
1045     pOutT1 = pOut;
1046 
1047     /* Loop over the number of rows */
1048     rowCnt = numRows;
1049 
1050     /* Making the destination matrix as identity matrix */
1051     while (rowCnt > 0U)
1052     {
1053       /* Writing all zeroes in lower triangle of the destination matrix */
1054       j = numRows - rowCnt;
1055       while (j > 0U)
1056       {
1057         *pOutT1++ = 0.0f;
1058         j--;
1059       }
1060 
1061       /* Writing all ones in the diagonal of the destination matrix */
1062       *pOutT1++ = 1.0f;
1063 
1064       /* Writing all zeroes in upper triangle of the destination matrix */
1065       j = rowCnt - 1U;
1066       while (j > 0U)
1067       {
1068         *pOutT1++ = 0.0f;
1069         j--;
1070       }
1071 
1072       /* Decrement loop counter */
1073       rowCnt--;
1074     }
1075 
1076     /* Loop over the number of columns of the input matrix.
1077        All the elements in each column are processed by the row operations */
1078     loopCnt = numCols;
1079 
1080     /* Index modifier to navigate through the columns */
1081     l = 0U;
1082 
1083     while (loopCnt > 0U)
1084     {
1085       /* Check if the pivot element is zero..
1086        * If it is zero then interchange the row with non zero row below.
1087        * If there is no non zero element to replace in the rows below,
1088        * then the matrix is Singular. */
1089 
1090       /* Working pointer for the input matrix that points
1091        * to the pivot element of the particular row  */
1092       pInT1 = pIn + (l * numCols);
1093 
1094       /* Working pointer for the destination matrix that points
1095        * to the pivot element of the particular row  */
1096       pOutT1 = pOut + (l * numCols);
1097 
1098       /* Temporary variable to hold the pivot value */
1099       in = *pInT1;
1100 
1101 
1102 
1103       /* Check if the pivot element is zero */
1104       if (*pInT1 == 0.0f)
1105       {
1106         /* Loop over the number rows present below */
1107 
1108         for (i = 1U; i < numRows - l; i++)
1109         {
1110           /* Update the input and destination pointers */
1111           pInT2 = pInT1 + (numCols * i);
1112           pOutT2 = pOutT1 + (numCols * i);
1113 
1114           /* Check if there is a non zero pivot element to
1115            * replace in the rows below */
1116           if (*pInT2 != 0.0f)
1117           {
1118             /* Loop over number of columns
1119              * to the right of the pilot element */
1120             j = numCols - l;
1121 
1122             while (j > 0U)
1123             {
1124               /* Exchange the row elements of the input matrix */
1125               Xchg = *pInT2;
1126               *pInT2++ = *pInT1;
1127               *pInT1++ = Xchg;
1128 
1129               /* Decrement the loop counter */
1130               j--;
1131             }
1132 
1133             /* Loop over number of columns of the destination matrix */
1134             j = numCols;
1135 
1136             while (j > 0U)
1137             {
1138               /* Exchange the row elements of the destination matrix */
1139               Xchg = *pOutT2;
1140               *pOutT2++ = *pOutT1;
1141               *pOutT1++ = Xchg;
1142 
1143               /* Decrement loop counter */
1144               j--;
1145             }
1146 
1147             /* Flag to indicate whether exchange is done or not */
1148             flag = 1U;
1149 
1150             /* Break after exchange is done */
1151             break;
1152           }
1153 
1154 
1155           /* Decrement loop counter */
1156         }
1157       }
1158 
1159       /* Update the status if the matrix is singular */
1160       if ((flag != 1U) && (in == 0.0f))
1161       {
1162         return ARM_MATH_SINGULAR;
1163       }
1164 
1165       /* Points to the pivot row of input and destination matrices */
1166       pPivotRowIn = pIn + (l * numCols);
1167       pPivotRowDst = pOut + (l * numCols);
1168 
1169       /* Temporary pointers to the pivot row pointers */
1170       pInT1 = pPivotRowIn;
1171       pInT2 = pPivotRowDst;
1172 
1173       /* Pivot element of the row */
1174       in = *pPivotRowIn;
1175 
1176       /* Loop over number of columns
1177        * to the right of the pilot element */
1178       j = (numCols - l);
1179 
1180       while (j > 0U)
1181       {
1182         /* Divide each element of the row of the input matrix
1183          * by the pivot element */
1184         in1 = *pInT1;
1185         *pInT1++ = in1 / in;
1186 
1187         /* Decrement the loop counter */
1188         j--;
1189       }
1190 
1191       /* Loop over number of columns of the destination matrix */
1192       j = numCols;
1193 
1194       while (j > 0U)
1195       {
1196         /* Divide each element of the row of the destination matrix
1197          * by the pivot element */
1198         in1 = *pInT2;
1199         *pInT2++ = in1 / in;
1200 
1201         /* Decrement the loop counter */
1202         j--;
1203       }
1204 
1205       /* Replace the rows with the sum of that row and a multiple of row i
1206        * so that each new element in column i above row i is zero.*/
1207 
1208       /* Temporary pointers for input and destination matrices */
1209       pInT1 = pIn;
1210       pInT2 = pOut;
1211 
1212       /* index used to check for pivot element */
1213       i = 0U;
1214 
1215       /* Loop over number of rows */
1216       /*  to be replaced by the sum of that row and a multiple of row i */
1217       k = numRows;
1218 
1219       while (k > 0U)
1220       {
1221         /* Check for the pivot element */
1222         if (i == l)
1223         {
1224           /* If the processing element is the pivot element,
1225              only the columns to the right are to be processed */
1226           pInT1 += numCols - l;
1227 
1228           pInT2 += numCols;
1229         }
1230         else
1231         {
1232           /* Element of the reference row */
1233           in = *pInT1;
1234 
1235           /* Working pointers for input and destination pivot rows */
1236           pPRT_in = pPivotRowIn;
1237           pPRT_pDst = pPivotRowDst;
1238 
1239           /* Loop over the number of columns to the right of the pivot element,
1240              to replace the elements in the input matrix */
1241           j = (numCols - l);
1242 
1243           while (j > 0U)
1244           {
1245             /* Replace the element by the sum of that row
1246                and a multiple of the reference row  */
1247             in1 = *pInT1;
1248             *pInT1++ = in1 - (in * *pPRT_in++);
1249 
1250             /* Decrement the loop counter */
1251             j--;
1252           }
1253 
1254           /* Loop over the number of columns to
1255              replace the elements in the destination matrix */
1256           j = numCols;
1257 
1258           while (j > 0U)
1259           {
1260             /* Replace the element by the sum of that row
1261                and a multiple of the reference row  */
1262             in1 = *pInT2;
1263             *pInT2++ = in1 - (in * *pPRT_pDst++);
1264 
1265             /* Decrement loop counter */
1266             j--;
1267           }
1268 
1269         }
1270 
1271         /* Increment temporary input pointer */
1272         pInT1 = pInT1 + l;
1273 
1274         /* Decrement loop counter */
1275         k--;
1276 
1277         /* Increment pivot index */
1278         i++;
1279       }
1280 
1281       /* Increment the input pointer */
1282       pIn++;
1283 
1284       /* Decrement the loop counter */
1285       loopCnt--;
1286 
1287       /* Increment the index modifier */
1288       l++;
1289     }
1290 
1291 
1292 #else
1293 
1294   float32_t Xchg, in = 0.0f;                     /* Temporary input values  */
1295   uint32_t i, rowCnt, flag = 0U, j, loopCnt, l;      /* loop counters */
1296   arm_status status;                             /* status of matrix inverse */
1297 
1298 #ifdef ARM_MATH_MATRIX_CHECK
1299 
1300   /* Check for matrix mismatch condition */
1301   if ((pSrc->numRows != pSrc->numCols) ||
1302       (pDst->numRows != pDst->numCols) ||
1303       (pSrc->numRows != pDst->numRows)   )
1304   {
1305     /* Set status as ARM_MATH_SIZE_MISMATCH */
1306     status = ARM_MATH_SIZE_MISMATCH;
1307   }
1308   else
1309 
1310 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
1311 
1312   {
1313 
1314     /*--------------------------------------------------------------------------------------------------------------
1315      * Matrix Inverse can be solved using elementary row operations.
1316      *
1317      *  Gauss-Jordan Method:
1318      *
1319      *      1. First combine the identity matrix and the input matrix separated by a bar to form an
1320      *        augmented matrix as follows:
1321      *                      _  _          _     _      _   _         _         _
1322      *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
1323      *                     |  |            | | |        |   |   =   |           |
1324      *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
1325      *
1326      *      2. In our implementation, pDst Matrix is used as identity matrix.
1327      *
1328      *      3. Begin with the first row. Let i = 1.
1329      *
1330      *      4. Check to see if the pivot for row i is zero.
1331      *         The pivot is the element of the main diagonal that is on the current row.
1332      *         For instance, if working with row i, then the pivot element is aii.
1333      *         If the pivot is zero, exchange that row with a row below it that does not
1334      *         contain a zero in column i. If this is not possible, then an inverse
1335      *         to that matrix does not exist.
1336      *
1337      *      5. Divide every element of row i by the pivot.
1338      *
1339      *      6. For every row below and  row i, replace that row with the sum of that row and
1340      *         a multiple of row i so that each new element in column i below row i is zero.
1341      *
1342      *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
1343      *         for every element below and above the main diagonal.
1344      *
1345      *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
1346      *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
1347      *----------------------------------------------------------------------------------------------------------------*/
1348 
1349     /* Working pointer for destination matrix */
1350     pOutT1 = pOut;
1351 
1352     /* Loop over the number of rows */
1353     rowCnt = numRows;
1354 
1355     /* Making the destination matrix as identity matrix */
1356     while (rowCnt > 0U)
1357     {
1358       /* Writing all zeroes in lower triangle of the destination matrix */
1359       j = numRows - rowCnt;
1360       while (j > 0U)
1361       {
1362         *pOutT1++ = 0.0f;
1363         j--;
1364       }
1365 
1366       /* Writing all ones in the diagonal of the destination matrix */
1367       *pOutT1++ = 1.0f;
1368 
1369       /* Writing all zeroes in upper triangle of the destination matrix */
1370       j = rowCnt - 1U;
1371       while (j > 0U)
1372       {
1373         *pOutT1++ = 0.0f;
1374         j--;
1375       }
1376 
1377       /* Decrement loop counter */
1378       rowCnt--;
1379     }
1380 
1381     /* Loop over the number of columns of the input matrix.
1382        All the elements in each column are processed by the row operations */
1383     loopCnt = numCols;
1384 
1385     /* Index modifier to navigate through the columns */
1386     l = 0U;
1387 
1388     while (loopCnt > 0U)
1389     {
1390       /* Check if the pivot element is zero..
1391        * If it is zero then interchange the row with non zero row below.
1392        * If there is no non zero element to replace in the rows below,
1393        * then the matrix is Singular. */
1394 
1395       /* Working pointer for the input matrix that points
1396        * to the pivot element of the particular row  */
1397       pInT1 = pIn + (l * numCols);
1398 
1399       /* Working pointer for the destination matrix that points
1400        * to the pivot element of the particular row  */
1401       pOutT1 = pOut + (l * numCols);
1402 
1403       /* Temporary variable to hold the pivot value */
1404       in = *pInT1;
1405 
1406       /* Check if the pivot element is zero */
1407       if (*pInT1 == 0.0f)
1408       {
1409         /* Loop over the number rows present below */
1410         for (i = 1U; i < numRows-l; i++)
1411         {
1412           /* Update the input and destination pointers */
1413           pInT2 = pInT1 + (numCols * i);
1414           pOutT2 = pOutT1 + (numCols * i);
1415 
1416           /* Check if there is a non zero pivot element to
1417            * replace in the rows below */
1418           if (*pInT2 != 0.0f)
1419           {
1420             /* Loop over number of columns
1421              * to the right of the pilot element */
1422             for (j = 0U; j < (numCols - l); j++)
1423             {
1424               /* Exchange the row elements of the input matrix */
1425               Xchg = *pInT2;
1426               *pInT2++ = *pInT1;
1427               *pInT1++ = Xchg;
1428             }
1429 
1430             for (j = 0U; j < numCols; j++)
1431             {
1432               Xchg = *pOutT2;
1433               *pOutT2++ = *pOutT1;
1434               *pOutT1++ = Xchg;
1435             }
1436 
1437             /* Flag to indicate whether exchange is done or not */
1438             flag = 1U;
1439 
1440             /* Break after exchange is done */
1441             break;
1442           }
1443         }
1444       }
1445 
1446 
1447       /* Update the status if the matrix is singular */
1448       if ((flag != 1U) && (in == 0.0f))
1449       {
1450         return ARM_MATH_SINGULAR;
1451       }
1452 
1453       /* Points to the pivot row of input and destination matrices */
1454       pPivotRowIn = pIn + (l * numCols);
1455       pPivotRowDst = pOut + (l * numCols);
1456 
1457       /* Temporary pointers to the pivot row pointers */
1458       pInT1 = pPivotRowIn;
1459       pOutT1 = pPivotRowDst;
1460 
1461       /* Pivot element of the row */
1462       in = *(pIn + (l * numCols));
1463 
1464       /* Loop over number of columns
1465        * to the right of the pilot element */
1466       for (j = 0U; j < (numCols - l); j++)
1467       {
1468         /* Divide each element of the row of the input matrix
1469          * by the pivot element */
1470         *pInT1 = *pInT1 / in;
1471         pInT1++;
1472       }
1473       for (j = 0U; j < numCols; j++)
1474       {
1475         /* Divide each element of the row of the destination matrix
1476          * by the pivot element */
1477         *pOutT1 = *pOutT1 / in;
1478         pOutT1++;
1479       }
1480 
1481       /* Replace the rows with the sum of that row and a multiple of row i
1482        * so that each new element in column i above row i is zero.*/
1483 
1484       /* Temporary pointers for input and destination matrices */
1485       pInT1 = pIn;
1486       pOutT1 = pOut;
1487 
1488       for (i = 0U; i < numRows; i++)
1489       {
1490         /* Check for the pivot element */
1491         if (i == l)
1492         {
1493           /* If the processing element is the pivot element,
1494              only the columns to the right are to be processed */
1495           pInT1 += numCols - l;
1496           pOutT1 += numCols;
1497         }
1498         else
1499         {
1500           /* Element of the reference row */
1501           in = *pInT1;
1502 
1503           /* Working pointers for input and destination pivot rows */
1504           pPRT_in = pPivotRowIn;
1505           pPRT_pDst = pPivotRowDst;
1506 
1507           /* Loop over the number of columns to the right of the pivot element,
1508              to replace the elements in the input matrix */
1509           for (j = 0U; j < (numCols - l); j++)
1510           {
1511             /* Replace the element by the sum of that row
1512                and a multiple of the reference row  */
1513             *pInT1 = *pInT1 - (in * *pPRT_in++);
1514             pInT1++;
1515           }
1516 
1517           /* Loop over the number of columns to
1518              replace the elements in the destination matrix */
1519           for (j = 0U; j < numCols; j++)
1520           {
1521             /* Replace the element by the sum of that row
1522                and a multiple of the reference row  */
1523             *pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
1524             pOutT1++;
1525           }
1526 
1527         }
1528 
1529         /* Increment temporary input pointer */
1530         pInT1 = pInT1 + l;
1531       }
1532 
1533       /* Increment the input pointer */
1534       pIn++;
1535 
1536       /* Decrement the loop counter */
1537       loopCnt--;
1538 
1539       /* Increment the index modifier */
1540       l++;
1541     }
1542 
1543 #endif /* #if defined (ARM_MATH_DSP) */
1544 
1545     /* Set status as ARM_MATH_SUCCESS */
1546     status = ARM_MATH_SUCCESS;
1547 
1548     if ((flag != 1U) && (in == 0.0f))
1549     {
1550       pIn = pSrc->pData;
1551       for (i = 0; i < numRows * numCols; i++)
1552       {
1553         if (pIn[i] != 0.0f)
1554             break;
1555       }
1556 
1557       if (i == numRows * numCols)
1558         status = ARM_MATH_SINGULAR;
1559     }
1560   }
1561 
1562   /* Return to application */
1563   return (status);
1564 }
1565 #endif /* #if defined(ARM_MATH_NEON) */
1566 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1567 
1568 /**
1569   @} end of MatrixInv group
1570  */
1571