1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_inverse_f16.c
4  * Description:  Floating-point matrix inverse
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 
34 /**
35   @ingroup groupMatrix
36  */
37 
38 
39 /**
40   @addtogroup MatrixInv
41   @{
42  */
43 
44 /**
45   @brief         Floating-point matrix inverse.
46   @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
47   @param[out]    pDst      points to output matrix structure
48   @return        execution status
49                    - \ref ARM_MATH_SUCCESS       : Operation successful
50                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
51                    - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
52  */
53 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
54 
arm_mat_inverse_f16(const arm_matrix_instance_f16 * pSrc,arm_matrix_instance_f16 * pDst)55 arm_status arm_mat_inverse_f16(
56   const arm_matrix_instance_f16 * pSrc,
57   arm_matrix_instance_f16 * pDst)
58 {
59     float16_t *pIn = pSrc->pData;   /* input data matrix pointer */
60     float16_t *pOut = pDst->pData;  /* output data matrix pointer */
61     float16_t *pInT1, *pInT2;   /* Temporary input data matrix pointer */
62     float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
63     float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;    /* Temporary input and output data matrix pointer */
64 
65     uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix  */
66     uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
67     float16_t *pTmpA, *pTmpB;
68 
69     _Float16 in = 0.0f16;        /* Temporary input values  */
70     uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
71     arm_status status;          /* status of matrix inverse */
72     uint32_t  blkCnt;
73 
74 #ifdef ARM_MATH_MATRIX_CHECK
75    /* Check for matrix mismatch condition */
76   if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
77      || (pSrc->numRows != pDst->numRows))
78   {
79     /* Set status as ARM_MATH_SIZE_MISMATCH */
80     status = ARM_MATH_SIZE_MISMATCH;
81   }
82   else
83 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
84   {
85 
86     /*--------------------------------------------------------------------------------------------------------------
87      * Matrix Inverse can be solved using elementary row operations.
88      *
89      *  Gauss-Jordan Method:
90      *
91      *     1. First combine the identity matrix and the input matrix separated by a bar to form an
92      *        augmented matrix as follows:
93      *                      _  _          _     _      _   _         _         _
94      *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
95      *                     |  |            | | |        |   |   =   |           |
96      *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
97      *
98      *      2. In our implementation, pDst Matrix is used as identity matrix.
99      *
100      *      3. Begin with the first row. Let i = 1.
101      *
102      *      4. Check to see if the pivot for row i is zero.
103      *         The pivot is the element of the main diagonal that is on the current row.
104      *         For instance, if working with row i, then the pivot element is aii.
105      *         If the pivot is zero, exchange that row with a row below it that does not
106      *         contain a zero in column i. If this is not possible, then an inverse
107      *         to that matrix does not exist.
108      *
109      *      5. Divide every element of row i by the pivot.
110      *
111      *      6. For every row below and  row i, replace that row with the sum of that row and
112      *         a multiple of row i so that each new element in column i below row i is zero.
113      *
114      *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
115      *         for every element below and above the main diagonal.
116      *
117      *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
118      *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
119      *----------------------------------------------------------------------------------------------------------------*/
120 
121         /*
122          * Working pointer for destination matrix
123          */
124         pOutT1 = pOut;
125         /*
126          * Loop over the number of rows
127          */
128         rowCnt = numRows;
129         /*
130          * Making the destination matrix as identity matrix
131          */
132         while (rowCnt > 0U)
133         {
134             /*
135              * Writing all zeroes in lower triangle of the destination matrix
136              */
137             j = numRows - rowCnt;
138             while (j > 0U)
139             {
140                 *pOutT1++ = 0.0f16;
141                 j--;
142             }
143             /*
144              * Writing all ones in the diagonal of the destination matrix
145              */
146             *pOutT1++ = 1.0f16;
147             /*
148              * Writing all zeroes in upper triangle of the destination matrix
149              */
150             j = rowCnt - 1U;
151             while (j > 0U)
152             {
153                 *pOutT1++ = 0.0f16;
154                 j--;
155             }
156             /*
157              * Decrement the loop counter
158              */
159             rowCnt--;
160         }
161 
162         /*
163          * Loop over the number of columns of the input matrix.
164          * All the elements in each column are processed by the row operations
165          */
166         loopCnt = numCols;
167         /*
168          * Index modifier to navigate through the columns
169          */
170         l = 0U;
171         while (loopCnt > 0U)
172         {
173             /*
174              * Check if the pivot element is zero..
175              * If it is zero then interchange the row with non zero row below.
176              * If there is no non zero element to replace in the rows below,
177              * then the matrix is Singular.
178              */
179 
180             /*
181              * Working pointer for the input matrix that points
182              * * to the pivot element of the particular row
183              */
184             pInT1 = pIn + (l * numCols);
185             /*
186              * Working pointer for the destination matrix that points
187              * * to the pivot element of the particular row
188              */
189             pOutT1 = pOut + (l * numCols);
190             /*
191              * Temporary variable to hold the pivot value
192              */
193             in = *pInT1;
194 
195 
196             /*
197              * Check if the pivot element is zero
198              */
199             if (*pInT1 == 0.0f16)
200             {
201                 /*
202                  * Loop over the number rows present below
203                  */
204                 for (i = 1U; i < numRows-l; i++)
205                 {
206                     /*
207                      * Update the input and destination pointers
208                      */
209                     pInT2 = pInT1 + (numCols * i);
210                     pOutT2 = pOutT1 + (numCols * i);
211                     /*
212                      * Check if there is a non zero pivot element to
213                      * * replace in the rows below
214                      */
215                     if (*pInT2 != 0.0f16)
216                     {
217                         f16x8_t vecA, vecB;
218                         /*
219                          * Loop over number of columns
220                          * * to the right of the pilot element
221                          */
222                         pTmpA = pInT1;
223                         pTmpB = pInT2;
224                         blkCnt = (numCols - l) >> 3;
225                         while (blkCnt > 0U)
226                         {
227 
228                             vecA = vldrhq_f16(pTmpA);
229                             vecB = vldrhq_f16(pTmpB);
230                             vstrhq_f16(pTmpB, vecA);
231                             vstrhq_f16(pTmpA, vecB);
232 
233                             pTmpA += 8;
234                             pTmpB += 8;
235                             /*
236                              * Decrement the blockSize loop counter
237                              */
238                             blkCnt--;
239                         }
240                         /*
241                          * tail
242                          * (will be merged thru tail predication)
243                          */
244                         blkCnt = (numCols - l) & 7;
245                         if (blkCnt > 0U)
246                         {
247                             mve_pred16_t p0 = vctp16q(blkCnt);
248 
249                             vecA = vldrhq_f16(pTmpA);
250                             vecB = vldrhq_f16(pTmpB);
251                             vstrhq_p_f16(pTmpB, vecA, p0);
252                             vstrhq_p_f16(pTmpA, vecB, p0);
253                         }
254 
255                         pInT1 += numCols - l;
256                         pInT2 += numCols - l;
257                         pTmpA = pOutT1;
258                         pTmpB = pOutT2;
259                         blkCnt = numCols >> 3;
260                         while (blkCnt > 0U)
261                         {
262 
263                             vecA = vldrhq_f16(pTmpA);
264                             vecB = vldrhq_f16(pTmpB);
265                             vstrhq_f16(pTmpB, vecA);
266                             vstrhq_f16(pTmpA, vecB);
267                             pTmpA += 8;
268                             pTmpB += 8;
269                             /*
270                              * Decrement the blockSize loop counter
271                              */
272                             blkCnt--;
273                         }
274                         /*
275                          * tail
276                          */
277                         blkCnt = numCols & 7;
278                         if (blkCnt > 0U)
279                         {
280                             mve_pred16_t p0 = vctp16q(blkCnt);
281 
282                             vecA = vldrhq_f16(pTmpA);
283                             vecB = vldrhq_f16(pTmpB);
284                             vstrhq_p_f16(pTmpB, vecA, p0);
285                             vstrhq_p_f16(pTmpA, vecB, p0);
286                         }
287 
288                         pOutT1 += numCols;
289                         pOutT2 += numCols;
290                         /*
291                          * Flag to indicate whether exchange is done or not
292                          */
293                         flag = 1U;
294 
295                         /*
296                          * Break after exchange is done
297                          */
298                         break;
299                     }
300 
301                 }
302             }
303 
304             /*
305              * Update the status if the matrix is singular
306              */
307             if ((flag != 1U) && (in == 0.0f16))
308             {
309                 return ARM_MATH_SINGULAR;
310             }
311 
312             /*
313              * Points to the pivot row of input and destination matrices
314              */
315             pPivotRowIn = pIn + (l * numCols);
316             pPivotRowDst = pOut + (l * numCols);
317 
318             /*
319              * Temporary pointers to the pivot row pointers
320              */
321             pInT1 = pPivotRowIn;
322             pOutT1 = pPivotRowDst;
323 
324             /*
325              * Pivot element of the row
326              */
327             in = *(pIn + (l * numCols));
328 
329             pTmpA = pInT1;
330 
331             f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
332 
333             blkCnt = (numCols - l) >> 3;
334             f16x8_t vecA;
335             while (blkCnt > 0U)
336             {
337                 *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
338                 pTmpA += 8;
339                 /*
340                  * Decrement the blockSize loop counter
341                  */
342                 blkCnt--;
343             }
344             /*
345              * tail
346              */
347             blkCnt = (numCols - l) & 7;
348             if (blkCnt > 0U)
349             {
350                 mve_pred16_t p0 = vctp16q(blkCnt);
351 
352 
353                 vecA = vldrhq_f16(pTmpA);
354                 vecA = vecA * invIn;
355                 vstrhq_p_f16(pTmpA, vecA, p0);
356             }
357 
358             pInT1 += numCols - l;
359             /*
360              * Loop over number of columns
361              * * to the right of the pilot element
362              */
363 
364             pTmpA = pOutT1;
365             blkCnt = numCols >> 3;
366             while (blkCnt > 0U)
367             {
368                 *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA *invIn;
369                 pTmpA += 8;
370                 /*
371                  * Decrement the blockSize loop counter
372                  */
373                 blkCnt--;
374             }
375             /*
376              * tail
377              * (will be merged thru tail predication)
378              */
379             blkCnt = numCols & 7;
380             if (blkCnt > 0U)
381             {
382                 mve_pred16_t p0 = vctp16q(blkCnt);
383 
384                 vecA = vldrhq_f16(pTmpA);
385                 vecA = vecA * invIn;
386                 vstrhq_p_f16(pTmpA, vecA, p0);
387             }
388 
389             pOutT1 += numCols;
390 
391             /*
392              * Replace the rows with the sum of that row and a multiple of row i
393              * * so that each new element in column i above row i is zero.
394              */
395 
396             /*
397              * Temporary pointers for input and destination matrices
398              */
399             pInT1 = pIn;
400             pOutT1 = pOut;
401 
402             for (i = 0U; i < numRows; i++)
403             {
404                 /*
405                  * Check for the pivot element
406                  */
407                 if (i == l)
408                 {
409                     /*
410                      * If the processing element is the pivot element,
411                      * only the columns to the right are to be processed
412                      */
413                     pInT1 += numCols - l;
414                     pOutT1 += numCols;
415                 }
416                 else
417                 {
418                     /*
419                      * Element of the reference row
420                      */
421 
422                     /*
423                      * Working pointers for input and destination pivot rows
424                      */
425                     pPRT_in = pPivotRowIn;
426                     pPRT_pDst = pPivotRowDst;
427                     /*
428                      * Loop over the number of columns to the right of the pivot element,
429                      * to replace the elements in the input matrix
430                      */
431 
432                     in = *pInT1;
433                     f16x8_t tmpV = vdupq_n_f16(in);
434 
435                     blkCnt = (numCols - l) >> 3;
436                     while (blkCnt > 0U)
437                     {
438                         f16x8_t vec1, vec2;
439                         /*
440                          * Replace the element by the sum of that row
441                          * and a multiple of the reference row
442                          */
443                         vec1 = vldrhq_f16(pInT1);
444                         vec2 = vldrhq_f16(pPRT_in);
445                         vec1 = vfmsq_f16(vec1, tmpV, vec2);
446                         vstrhq_f16(pInT1, vec1);
447                         pPRT_in += 8;
448                         pInT1 += 8;
449                         /*
450                          * Decrement the blockSize loop counter
451                          */
452                         blkCnt--;
453                     }
454                     /*
455                      * tail
456                      * (will be merged thru tail predication)
457                      */
458                     blkCnt = (numCols - l) & 7;
459                     if (blkCnt > 0U)
460                     {
461                         f16x8_t vec1, vec2;
462                         mve_pred16_t p0 = vctp16q(blkCnt);
463 
464                         vec1 = vldrhq_f16(pInT1);
465                         vec2 = vldrhq_f16(pPRT_in);
466                         vec1 = vfmsq_f16(vec1, tmpV, vec2);
467                         vstrhq_p_f16(pInT1, vec1, p0);
468                         pInT1 += blkCnt;
469                     }
470 
471                     blkCnt = numCols >> 3;
472                     while (blkCnt > 0U)
473                     {
474                         f16x8_t vec1, vec2;
475 
476                         /*
477                          * Replace the element by the sum of that row
478                          * and a multiple of the reference row
479                          */
480                         vec1 = vldrhq_f16(pOutT1);
481                         vec2 = vldrhq_f16(pPRT_pDst);
482                         vec1 = vfmsq_f16(vec1, tmpV, vec2);
483                         vstrhq_f16(pOutT1, vec1);
484                         pPRT_pDst += 8;
485                         pOutT1 += 8;
486                         /*
487                          * Decrement the blockSize loop counter
488                          */
489                         blkCnt--;
490                     }
491                     /*
492                      * tail
493                      * (will be merged thru tail predication)
494                      */
495                     blkCnt = numCols & 7;
496                     if (blkCnt > 0U)
497                     {
498                         f16x8_t vec1, vec2;
499                         mve_pred16_t p0 = vctp16q(blkCnt);
500 
501                         vec1 = vldrhq_f16(pOutT1);
502                         vec2 = vldrhq_f16(pPRT_pDst);
503                         vec1 = vfmsq_f16(vec1, tmpV, vec2);
504                         vstrhq_p_f16(pOutT1, vec1, p0);
505 
506                         pInT2 += blkCnt;
507                         pOutT1 += blkCnt;
508                     }
509                 }
510                 /*
511                  * Increment the temporary input pointer
512                  */
513                 pInT1 = pInT1 + l;
514             }
515             /*
516              * Increment the input pointer
517              */
518             pIn++;
519             /*
520              * Decrement the loop counter
521              */
522             loopCnt--;
523             /*
524              * Increment the index modifier
525              */
526             l++;
527         }
528 
529         /*
530          * Set status as ARM_MATH_SUCCESS
531          */
532         status = ARM_MATH_SUCCESS;
533 
534         if ((flag != 1U) && (in == 0.0f16))
535         {
536             pIn = pSrc->pData;
537             for (i = 0; i < numRows * numCols; i++)
538             {
539                 if (pIn[i] != 0.0f16)
540                     break;
541             }
542 
543             if (i == numRows * numCols)
544                 status = ARM_MATH_SINGULAR;
545         }
546   }
547   /* Return to application */
548   return (status);
549 }
550 
551 #else
552 
arm_mat_inverse_f16(const arm_matrix_instance_f16 * pSrc,arm_matrix_instance_f16 * pDst)553 arm_status arm_mat_inverse_f16(
554   const arm_matrix_instance_f16 * pSrc,
555         arm_matrix_instance_f16 * pDst)
556 {
557   float16_t *pIn = pSrc->pData;                  /* input data matrix pointer */
558   float16_t *pOut = pDst->pData;                 /* output data matrix pointer */
559   float16_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
560   float16_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
561   float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
562   uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
563   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
564 
565   _Float16 Xchg, in = 0.0f16, in1;                /* Temporary input values  */
566   uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
567   arm_status status;                             /* status of matrix inverse */
568 
569 #ifdef ARM_MATH_MATRIX_CHECK
570 
571   /* Check for matrix mismatch condition */
572   if ((pSrc->numRows != pSrc->numCols) ||
573       (pDst->numRows != pDst->numCols) ||
574       (pSrc->numRows != pDst->numRows)   )
575   {
576     /* Set status as ARM_MATH_SIZE_MISMATCH */
577     status = ARM_MATH_SIZE_MISMATCH;
578   }
579   else
580 
581 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
582 
583   {
584 
585     /*--------------------------------------------------------------------------------------------------------------
586      * Matrix Inverse can be solved using elementary row operations.
587      *
588      *  Gauss-Jordan Method:
589      *
590      *      1. First combine the identity matrix and the input matrix separated by a bar to form an
591      *        augmented matrix as follows:
592      *                      _                  _         _         _
593      *                     |  a11  a12 | 1   0  |       |  X11 X12  |
594      *                     |           |        |   =   |           |
595      *                     |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
596      *
597      *      2. In our implementation, pDst Matrix is used as identity matrix.
598      *
599      *      3. Begin with the first row. Let i = 1.
600      *
601      *      4. Check to see if the pivot for row i is zero.
602      *         The pivot is the element of the main diagonal that is on the current row.
603      *         For instance, if working with row i, then the pivot element is aii.
604      *         If the pivot is zero, exchange that row with a row below it that does not
605      *         contain a zero in column i. If this is not possible, then an inverse
606      *         to that matrix does not exist.
607      *
608      *      5. Divide every element of row i by the pivot.
609      *
610      *      6. For every row below and  row i, replace that row with the sum of that row and
611      *         a multiple of row i so that each new element in column i below row i is zero.
612      *
613      *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
614      *         for every element below and above the main diagonal.
615      *
616      *      8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
617      *         Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
618      *----------------------------------------------------------------------------------------------------------------*/
619 
620     /* Working pointer for destination matrix */
621     pOutT1 = pOut;
622 
623     /* Loop over the number of rows */
624     rowCnt = numRows;
625 
626     /* Making the destination matrix as identity matrix */
627     while (rowCnt > 0U)
628     {
629       /* Writing all zeroes in lower triangle of the destination matrix */
630       j = numRows - rowCnt;
631       while (j > 0U)
632       {
633         *pOutT1++ = 0.0f16;
634         j--;
635       }
636 
637       /* Writing all ones in the diagonal of the destination matrix */
638       *pOutT1++ = 1.0f16;
639 
640       /* Writing all zeroes in upper triangle of the destination matrix */
641       j = rowCnt - 1U;
642       while (j > 0U)
643       {
644         *pOutT1++ = 0.0f16;
645         j--;
646       }
647 
648       /* Decrement loop counter */
649       rowCnt--;
650     }
651 
652     /* Loop over the number of columns of the input matrix.
653        All the elements in each column are processed by the row operations */
654     loopCnt = numCols;
655 
656     /* Index modifier to navigate through the columns */
657     l = 0U;
658 
659     while (loopCnt > 0U)
660     {
661       /* Check if the pivot element is zero..
662        * If it is zero then interchange the row with non zero row below.
663        * If there is no non zero element to replace in the rows below,
664        * then the matrix is Singular. */
665 
666       /* Working pointer for the input matrix that points
667        * to the pivot element of the particular row  */
668       pInT1 = pIn + (l * numCols);
669 
670       /* Working pointer for the destination matrix that points
671        * to the pivot element of the particular row  */
672       pOutT1 = pOut + (l * numCols);
673 
674       /* Temporary variable to hold the pivot value */
675       in = *pInT1;
676 
677 
678       /* Check if the pivot element is zero */
679       if (*pInT1 == 0.0f16)
680       {
681         /* Loop over the number rows present below */
682 
683         for (i = 1U; i < numRows-l; i++)
684         {
685           /* Update the input and destination pointers */
686           pInT2 = pInT1 + (numCols * i);
687           pOutT2 = pOutT1 + (numCols * i);
688 
689           /* Check if there is a non zero pivot element to
690            * replace in the rows below */
691           if (*pInT2 != 0.0f16)
692           {
693             /* Loop over number of columns
694              * to the right of the pilot element */
695             j = numCols - l;
696 
697             while (j > 0U)
698             {
699               /* Exchange the row elements of the input matrix */
700               Xchg = *pInT2;
701               *pInT2++ = *pInT1;
702               *pInT1++ = Xchg;
703 
704               /* Decrement the loop counter */
705               j--;
706             }
707 
708             /* Loop over number of columns of the destination matrix */
709             j = numCols;
710 
711             while (j > 0U)
712             {
713               /* Exchange the row elements of the destination matrix */
714               Xchg = *pOutT2;
715               *pOutT2++ = *pOutT1;
716               *pOutT1++ = Xchg;
717 
718               /* Decrement loop counter */
719               j--;
720             }
721 
722             /* Flag to indicate whether exchange is done or not */
723             flag = 1U;
724 
725             /* Break after exchange is done */
726             break;
727           }
728 
729         }
730       }
731 
732       /* Update the status if the matrix is singular */
733       if ((flag != 1U) && (in == 0.0f16))
734       {
735         return ARM_MATH_SINGULAR;
736       }
737 
738       /* Points to the pivot row of input and destination matrices */
739       pPivotRowIn = pIn + (l * numCols);
740       pPivotRowDst = pOut + (l * numCols);
741 
742       /* Temporary pointers to the pivot row pointers */
743       pInT1 = pPivotRowIn;
744       pInT2 = pPivotRowDst;
745 
746       /* Pivot element of the row */
747       in = *pPivotRowIn;
748 
749       /* Loop over number of columns
750        * to the right of the pilot element */
751       j = (numCols - l);
752 
753       while (j > 0U)
754       {
755         /* Divide each element of the row of the input matrix
756          * by the pivot element */
757         in1 = *pInT1;
758         *pInT1++ = in1 / in;
759 
760         /* Decrement the loop counter */
761         j--;
762       }
763 
764       /* Loop over number of columns of the destination matrix */
765       j = numCols;
766 
767       while (j > 0U)
768       {
769         /* Divide each element of the row of the destination matrix
770          * by the pivot element */
771         in1 = *pInT2;
772         *pInT2++ = in1 / in;
773 
774         /* Decrement the loop counter */
775         j--;
776       }
777 
778       /* Replace the rows with the sum of that row and a multiple of row i
779        * so that each new element in column i above row i is zero.*/
780 
781       /* Temporary pointers for input and destination matrices */
782       pInT1 = pIn;
783       pInT2 = pOut;
784 
785       /* index used to check for pivot element */
786       i = 0U;
787 
788       /* Loop over number of rows */
789       /*  to be replaced by the sum of that row and a multiple of row i */
790       k = numRows;
791 
792       while (k > 0U)
793       {
794         /* Check for the pivot element */
795         if (i == l)
796         {
797           /* If the processing element is the pivot element,
798              only the columns to the right are to be processed */
799           pInT1 += numCols - l;
800 
801           pInT2 += numCols;
802         }
803         else
804         {
805           /* Element of the reference row */
806           in = *pInT1;
807 
808           /* Working pointers for input and destination pivot rows */
809           pPRT_in = pPivotRowIn;
810           pPRT_pDst = pPivotRowDst;
811 
812           /* Loop over the number of columns to the right of the pivot element,
813              to replace the elements in the input matrix */
814           j = (numCols - l);
815 
816           while (j > 0U)
817           {
818             /* Replace the element by the sum of that row
819                and a multiple of the reference row  */
820             in1 = *pInT1;
821             *pInT1++ = in1 - (in * *pPRT_in++);
822 
823             /* Decrement the loop counter */
824             j--;
825           }
826 
827           /* Loop over the number of columns to
828              replace the elements in the destination matrix */
829           j = numCols;
830 
831           while (j > 0U)
832           {
833             /* Replace the element by the sum of that row
834                and a multiple of the reference row  */
835             in1 = *pInT2;
836             *pInT2++ = in1 - (in * *pPRT_pDst++);
837 
838             /* Decrement loop counter */
839             j--;
840           }
841 
842         }
843 
844         /* Increment temporary input pointer */
845         pInT1 = pInT1 + l;
846 
847         /* Decrement loop counter */
848         k--;
849 
850         /* Increment pivot index */
851         i++;
852       }
853 
854       /* Increment the input pointer */
855       pIn++;
856 
857       /* Decrement the loop counter */
858       loopCnt--;
859 
860       /* Increment the index modifier */
861       l++;
862     }
863 
864     /* Set status as ARM_MATH_SUCCESS */
865     status = ARM_MATH_SUCCESS;
866 
867     if ((flag != 1U) && (in == 0.0f16))
868     {
869       pIn = pSrc->pData;
870       for (i = 0; i < numRows * numCols; i++)
871       {
872         if (pIn[i] != 0.0f16)
873             break;
874       }
875 
876       if (i == numRows * numCols)
877         status = ARM_MATH_SINGULAR;
878     }
879   }
880 
881   /* Return to application */
882   return (status);
883 }
884 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
885 
886 /**
887   @} end of MatrixInv group
888  */
889 
890 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
891 
892