1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_trans_f32.c
4 * Description: Floating-point matrix transpose
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/matrix_functions.h"
30
31 /**
32 @ingroup groupMatrix
33 */
34
35 /**
36 @defgroup MatrixTrans Matrix Transpose
37
38 Transposes a matrix.
39
40 Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
41 \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
42 */
43
44 /**
45 @addtogroup MatrixTrans
46 @{
47 */
48
49 /**
50 @brief Floating-point matrix transpose.
51 @param[in] pSrc points to input matrix
52 @param[out] pDst points to output matrix
53 @return execution status
54 - \ref ARM_MATH_SUCCESS : Operation successful
55 - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
56 */
57 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
58
59 #include "arm_helium_utils.h"
60
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)61 arm_status arm_mat_trans_f32(
62 const arm_matrix_instance_f32 * pSrc,
63 arm_matrix_instance_f32 * pDst)
64 {
65 arm_status status; /* status of matrix transpose */
66
67 #ifdef ARM_MATH_MATRIX_CHECK
68
69 /* Check for matrix mismatch condition */
70 if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
71 {
72 /* Set status as ARM_MATH_SIZE_MISMATCH */
73 status = ARM_MATH_SIZE_MISMATCH;
74 }
75 else
76 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
77 {
78 if (pDst->numRows == pDst->numCols)
79 {
80 if (pDst->numCols == 2)
81 return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
82 if (pDst->numCols == 3)
83 return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
84 if (pDst->numCols == 4)
85 return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
86 }
87
88 arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
89 /* Set status as ARM_MATH_SUCCESS */
90 status = ARM_MATH_SUCCESS;
91 }
92
93 /* Return to application */
94 return (status);
95 }
96
97 #else
98 #if defined(ARM_MATH_NEON)
99
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)100 arm_status arm_mat_trans_f32(
101 const arm_matrix_instance_f32 * pSrc,
102 arm_matrix_instance_f32 * pDst)
103 {
104 float32_t *pIn = pSrc->pData; /* input data matrix pointer */
105 float32_t *pOut = pDst->pData; /* output data matrix pointer */
106 float32_t *px; /* Temporary output data matrix pointer */
107 uint16_t nRows = pSrc->numRows; /* number of rows */
108 uint16_t nColumns = pSrc->numCols; /* number of columns */
109
110 uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
111 arm_status status; /* status of matrix transpose */
112
113 #ifdef ARM_MATH_MATRIX_CHECK
114
115 /* Check for matrix mismatch condition */
116 if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
117 {
118 /* Set status as ARM_MATH_SIZE_MISMATCH */
119 status = ARM_MATH_SIZE_MISMATCH;
120 }
121 else
122 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
123
124 {
125 /* Matrix transpose by exchanging the rows with columns */
126 /* Row loop */
127 rowCnt = row >> 2;
128 while (rowCnt > 0U)
129 {
130 float32x4_t row0V,row1V,row2V,row3V;
131 float32x4x2_t ra0,ra1,rb0,rb1;
132
133 blkCnt = nColumns >> 2;
134
135 /* The pointer px is set to starting address of the column being processed */
136 px = pOut + i;
137
138 /* Compute 4 outputs at a time.
139 ** a second loop below computes the remaining 1 to 3 samples. */
140 while (blkCnt > 0U) /* Column loop */
141 {
142 row0V = vld1q_f32(pIn);
143 row1V = vld1q_f32(pIn + 1 * nColumns);
144 row2V = vld1q_f32(pIn + 2 * nColumns);
145 row3V = vld1q_f32(pIn + 3 * nColumns);
146 pIn += 4;
147
148 ra0 = vzipq_f32(row0V,row2V);
149 ra1 = vzipq_f32(row1V,row3V);
150
151 rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
152 rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
153
154 vst1q_f32(px,rb0.val[0]);
155 px += nRows;
156
157 vst1q_f32(px,rb0.val[1]);
158 px += nRows;
159
160 vst1q_f32(px,rb1.val[0]);
161 px += nRows;
162
163 vst1q_f32(px,rb1.val[1]);
164 px += nRows;
165
166 /* Decrement the column loop counter */
167 blkCnt--;
168 }
169
170 /* Perform matrix transpose for last 3 samples here. */
171 blkCnt = nColumns % 0x4U;
172
173 while (blkCnt > 0U)
174 {
175 /* Read and store the input element in the destination */
176 *px++ = *pIn;
177 *px++ = *(pIn + 1 * nColumns);
178 *px++ = *(pIn + 2 * nColumns);
179 *px++ = *(pIn + 3 * nColumns);
180
181 px += (nRows - 4);
182 pIn++;
183
184 /* Decrement the column loop counter */
185 blkCnt--;
186 }
187
188 i += 4;
189 pIn += 3 * nColumns;
190
191 /* Decrement the row loop counter */
192 rowCnt--;
193
194 } /* Row loop end */
195
196 rowCnt = row & 3;
197 while (rowCnt > 0U)
198 {
199 blkCnt = nColumns ;
200 /* The pointer px is set to starting address of the column being processed */
201 px = pOut + i;
202
203 while (blkCnt > 0U)
204 {
205 /* Read and store the input element in the destination */
206 *px = *pIn++;
207
208 /* Update the pointer px to point to the next row of the transposed matrix */
209 px += nRows;
210
211 /* Decrement the column loop counter */
212 blkCnt--;
213 }
214 i++;
215 rowCnt -- ;
216 }
217
218 /* Set status as ARM_MATH_SUCCESS */
219 status = ARM_MATH_SUCCESS;
220 }
221
222 /* Return to application */
223 return (status);
224 }
225 #else
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)226 arm_status arm_mat_trans_f32(
227 const arm_matrix_instance_f32 * pSrc,
228 arm_matrix_instance_f32 * pDst)
229 {
230 float32_t *pIn = pSrc->pData; /* input data matrix pointer */
231 float32_t *pOut = pDst->pData; /* output data matrix pointer */
232 float32_t *px; /* Temporary output data matrix pointer */
233 uint16_t nRows = pSrc->numRows; /* number of rows */
234 uint16_t nCols = pSrc->numCols; /* number of columns */
235 uint32_t col, row = nRows, i = 0U; /* Loop counters */
236 arm_status status; /* status of matrix transpose */
237
238 #ifdef ARM_MATH_MATRIX_CHECK
239
240 /* Check for matrix mismatch condition */
241 if ((pSrc->numRows != pDst->numCols) ||
242 (pSrc->numCols != pDst->numRows) )
243 {
244 /* Set status as ARM_MATH_SIZE_MISMATCH */
245 status = ARM_MATH_SIZE_MISMATCH;
246 }
247 else
248
249 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
250
251 {
252 /* Matrix transpose by exchanging the rows with columns */
253 /* row loop */
254 do
255 {
256 /* Pointer px is set to starting address of column being processed */
257 px = pOut + i;
258
259 #if defined (ARM_MATH_LOOPUNROLL)
260
261 /* Loop unrolling: Compute 4 outputs at a time */
262 col = nCols >> 2U;
263
264 while (col > 0U) /* column loop */
265 {
266 /* Read and store input element in destination */
267 *px = *pIn++;
268 /* Update pointer px to point to next row of transposed matrix */
269 px += nRows;
270
271 *px = *pIn++;
272 px += nRows;
273
274 *px = *pIn++;
275 px += nRows;
276
277 *px = *pIn++;
278 px += nRows;
279
280 /* Decrement column loop counter */
281 col--;
282 }
283
284 /* Loop unrolling: Compute remaining outputs */
285 col = nCols % 0x4U;
286
287 #else
288
289 /* Initialize col with number of samples */
290 col = nCols;
291
292 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
293
294 while (col > 0U)
295 {
296 /* Read and store input element in destination */
297 *px = *pIn++;
298
299 /* Update pointer px to point to next row of transposed matrix */
300 px += nRows;
301
302 /* Decrement column loop counter */
303 col--;
304 }
305
306 i++;
307
308 /* Decrement row loop counter */
309 row--;
310
311 } while (row > 0U); /* row loop end */
312
313 /* Set status as ARM_MATH_SUCCESS */
314 status = ARM_MATH_SUCCESS;
315 }
316
317 /* Return to application */
318 return (status);
319 }
320 #endif /* #if defined(ARM_MATH_NEON) */
321 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
322
323 /**
324 * @} end of MatrixTrans group
325 */
326