1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_trans_f32.c
4  * Description:  Floating-point matrix transpose
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 
31 /**
32   @ingroup groupMatrix
33  */
34 
35 /**
36   @defgroup MatrixTrans Matrix Transpose
37 
  Transposes a matrix.
39 
40   Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
41   \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
42  */
43 
44 /**
45   @addtogroup MatrixTrans
46   @{
47  */
48 
49 /**
50   @brief         Floating-point matrix transpose.
51   @param[in]     pSrc      points to input matrix
52   @param[out]    pDst      points to output matrix
53   @return        execution status
54                    - \ref ARM_MATH_SUCCESS       : Operation successful
55                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
56  */
57 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
58 
59 #include "arm_helium_utils.h"
60 
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)61 arm_status arm_mat_trans_f32(
62   const arm_matrix_instance_f32 * pSrc,
63   arm_matrix_instance_f32 * pDst)
64 {
65   arm_status status;                             /* status of matrix transpose  */
66 
67 #ifdef ARM_MATH_MATRIX_CHECK
68 
69   /* Check for matrix mismatch condition */
70   if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
71   {
72     /* Set status as ARM_MATH_SIZE_MISMATCH */
73     status = ARM_MATH_SIZE_MISMATCH;
74   }
75   else
76 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
77   {
78     if (pDst->numRows == pDst->numCols)
79     {
80         if (pDst->numCols == 2)
81             return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
82         if (pDst->numCols == 3)
83             return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
84         if (pDst->numCols == 4)
85             return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
86     }
87 
88     arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
89     /* Set status as ARM_MATH_SUCCESS */
90     status = ARM_MATH_SUCCESS;
91   }
92 
93   /* Return to application */
94   return (status);
95 }
96 
97 #else
98 #if defined(ARM_MATH_NEON)
99 
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)100 arm_status arm_mat_trans_f32(
101   const arm_matrix_instance_f32 * pSrc,
102   arm_matrix_instance_f32 * pDst)
103 {
104   float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
105   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
106   float32_t *px;                                 /* Temporary output data matrix pointer */
107   uint16_t nRows = pSrc->numRows;                /* number of rows */
108   uint16_t nColumns = pSrc->numCols;             /* number of columns */
109 
110   uint16_t blkCnt, rowCnt, i = 0U, row = nRows;          /* loop counters */
111   arm_status status;                             /* status of matrix transpose  */
112 
113 #ifdef ARM_MATH_MATRIX_CHECK
114 
115   /* Check for matrix mismatch condition */
116   if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
117   {
118     /* Set status as ARM_MATH_SIZE_MISMATCH */
119     status = ARM_MATH_SIZE_MISMATCH;
120   }
121   else
122 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
123 
124   {
125     /* Matrix transpose by exchanging the rows with columns */
126     /* Row loop */
127     rowCnt = row >> 2;
128     while (rowCnt > 0U)
129     {
130       float32x4_t row0V,row1V,row2V,row3V;
131       float32x4x2_t ra0,ra1,rb0,rb1;
132 
133       blkCnt = nColumns >> 2;
134 
135       /* The pointer px is set to starting address of the column being processed */
136       px = pOut + i;
137 
138       /* Compute 4 outputs at a time.
139        ** a second loop below computes the remaining 1 to 3 samples. */
140       while (blkCnt > 0U)        /* Column loop */
141       {
142         row0V = vld1q_f32(pIn);
143         row1V = vld1q_f32(pIn + 1 * nColumns);
144         row2V = vld1q_f32(pIn + 2 * nColumns);
145         row3V = vld1q_f32(pIn + 3 * nColumns);
146         pIn += 4;
147 
148         ra0 = vzipq_f32(row0V,row2V);
149         ra1 = vzipq_f32(row1V,row3V);
150 
151         rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
152         rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
153 
154         vst1q_f32(px,rb0.val[0]);
155         px += nRows;
156 
157         vst1q_f32(px,rb0.val[1]);
158         px += nRows;
159 
160         vst1q_f32(px,rb1.val[0]);
161         px += nRows;
162 
163         vst1q_f32(px,rb1.val[1]);
164         px += nRows;
165 
166         /* Decrement the column loop counter */
167         blkCnt--;
168       }
169 
170       /* Perform matrix transpose for last 3 samples here. */
171       blkCnt = nColumns % 0x4U;
172 
173       while (blkCnt > 0U)
174       {
175         /* Read and store the input element in the destination */
176         *px++ = *pIn;
177         *px++ = *(pIn + 1 * nColumns);
178         *px++ = *(pIn + 2 * nColumns);
179         *px++ = *(pIn + 3 * nColumns);
180 
181         px += (nRows - 4);
182         pIn++;
183 
184         /* Decrement the column loop counter */
185         blkCnt--;
186       }
187 
188       i += 4;
189       pIn += 3 * nColumns;
190 
191       /* Decrement the row loop counter */
192       rowCnt--;
193 
194     }         /* Row loop end  */
195 
196     rowCnt = row & 3;
197     while (rowCnt > 0U)
198     {
199       blkCnt = nColumns ;
200       /* The pointer px is set to starting address of the column being processed */
201       px = pOut + i;
202 
203       while (blkCnt > 0U)
204       {
205         /* Read and store the input element in the destination */
206         *px = *pIn++;
207 
208         /* Update the pointer px to point to the next row of the transposed matrix */
209         px += nRows;
210 
211         /* Decrement the column loop counter */
212         blkCnt--;
213       }
214       i++;
215       rowCnt -- ;
216     }
217 
218     /* Set status as ARM_MATH_SUCCESS */
219     status = ARM_MATH_SUCCESS;
220   }
221 
222   /* Return to application */
223   return (status);
224 }
225 #else
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)226 arm_status arm_mat_trans_f32(
227   const arm_matrix_instance_f32 * pSrc,
228         arm_matrix_instance_f32 * pDst)
229 {
230   float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
231   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
232   float32_t *px;                                 /* Temporary output data matrix pointer */
233   uint16_t nRows = pSrc->numRows;                /* number of rows */
234   uint16_t nCols = pSrc->numCols;                /* number of columns */
235   uint32_t col, row = nRows, i = 0U;             /* Loop counters */
236   arm_status status;                             /* status of matrix transpose */
237 
238 #ifdef ARM_MATH_MATRIX_CHECK
239 
240   /* Check for matrix mismatch condition */
241   if ((pSrc->numRows != pDst->numCols) ||
242       (pSrc->numCols != pDst->numRows)   )
243   {
244     /* Set status as ARM_MATH_SIZE_MISMATCH */
245     status = ARM_MATH_SIZE_MISMATCH;
246   }
247   else
248 
249 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
250 
251   {
252     /* Matrix transpose by exchanging the rows with columns */
253     /* row loop */
254     do
255     {
256       /* Pointer px is set to starting address of column being processed */
257       px = pOut + i;
258 
259 #if defined (ARM_MATH_LOOPUNROLL)
260 
261       /* Loop unrolling: Compute 4 outputs at a time */
262       col = nCols >> 2U;
263 
264       while (col > 0U)        /* column loop */
265       {
266         /* Read and store input element in destination */
267         *px = *pIn++;
268         /* Update pointer px to point to next row of transposed matrix */
269         px += nRows;
270 
271         *px = *pIn++;
272         px += nRows;
273 
274         *px = *pIn++;
275         px += nRows;
276 
277         *px = *pIn++;
278         px += nRows;
279 
280         /* Decrement column loop counter */
281         col--;
282       }
283 
284       /* Loop unrolling: Compute remaining outputs */
285       col = nCols % 0x4U;
286 
287 #else
288 
289       /* Initialize col with number of samples */
290       col = nCols;
291 
292 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
293 
294       while (col > 0U)
295       {
296         /* Read and store input element in destination */
297         *px = *pIn++;
298 
299         /* Update pointer px to point to next row of transposed matrix */
300         px += nRows;
301 
302         /* Decrement column loop counter */
303         col--;
304       }
305 
306       i++;
307 
308       /* Decrement row loop counter */
309       row--;
310 
311     } while (row > 0U);          /* row loop end */
312 
313     /* Set status as ARM_MATH_SUCCESS */
314     status = ARM_MATH_SUCCESS;
315   }
316 
317   /* Return to application */
318   return (status);
319 }
320 #endif /* #if defined(ARM_MATH_NEON) */
321 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
322 
323 /**
324  * @} end of MatrixTrans group
325  */
326