1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_trans_f32.c
4 * Description: Floating-point matrix transpose
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/matrix_functions.h"
30
31 /**
32 @ingroup groupMatrix
33 */
34
35 /**
36 @defgroup MatrixTrans Matrix Transpose
37
Transposes a matrix.
39
40 Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
41
42 @par Transpose of a 3 x 3 matrix
43
44 \f[
45 \begin{pmatrix}
46 a_{1,1} & a_{1,2} & a_{1,3} \\
47 a_{2,1} & a_{2,2} & a_{2,3} \\
48 a_{3,1} & a_{3,2} & a_{3,3} \\
49 \end{pmatrix}^T
50 =
51 \begin{pmatrix}
52 a_{1,1} & a_{2,1} & a_{3,1} \\
53 a_{1,2} & a_{2,2} & a_{3,2} \\
54 a_{1,3} & a_{2,3} & a_{3,3} \\
55 \end{pmatrix}
56 \f]
57
58 */
59
60 /**
61 @addtogroup MatrixTrans
62 @{
63 */
64
65 /**
66 @brief Floating-point matrix transpose.
67 @param[in] pSrc points to input matrix
68 @param[out] pDst points to output matrix
69 @return execution status
70 - \ref ARM_MATH_SUCCESS : Operation successful
71 - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
72 */
73 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
74
75 #include "arm_helium_utils.h"
76
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)77 ARM_DSP_ATTRIBUTE arm_status arm_mat_trans_f32(
78 const arm_matrix_instance_f32 * pSrc,
79 arm_matrix_instance_f32 * pDst)
80 {
81 arm_status status; /* status of matrix transpose */
82
83 #ifdef ARM_MATH_MATRIX_CHECK
84
85 /* Check for matrix mismatch condition */
86 if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
87 {
88 /* Set status as ARM_MATH_SIZE_MISMATCH */
89 status = ARM_MATH_SIZE_MISMATCH;
90 }
91 else
92 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
93 {
94 if (pDst->numRows == pDst->numCols)
95 {
96 if (pDst->numCols == 2)
97 return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
98 if (pDst->numCols == 3)
99 return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
100 if (pDst->numCols == 4)
101 return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
102 }
103
104 arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
105 /* Set status as ARM_MATH_SUCCESS */
106 status = ARM_MATH_SUCCESS;
107 }
108
109 /* Return to application */
110 return (status);
111 }
112
113 #else
114 #if defined(ARM_MATH_NEON)
115
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)116 ARM_DSP_ATTRIBUTE arm_status arm_mat_trans_f32(
117 const arm_matrix_instance_f32 * pSrc,
118 arm_matrix_instance_f32 * pDst)
119 {
120 float32_t *pIn = pSrc->pData; /* input data matrix pointer */
121 float32_t *pOut = pDst->pData; /* output data matrix pointer */
122 float32_t *px; /* Temporary output data matrix pointer */
123 uint16_t nRows = pSrc->numRows; /* number of rows */
124 uint16_t nColumns = pSrc->numCols; /* number of columns */
125
126 uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
127 arm_status status; /* status of matrix transpose */
128
129 #ifdef ARM_MATH_MATRIX_CHECK
130
131 /* Check for matrix mismatch condition */
132 if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
133 {
134 /* Set status as ARM_MATH_SIZE_MISMATCH */
135 status = ARM_MATH_SIZE_MISMATCH;
136 }
137 else
138 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
139
140 {
141 /* Matrix transpose by exchanging the rows with columns */
142 /* Row loop */
143 rowCnt = row >> 2;
144 while (rowCnt > 0U)
145 {
146 float32x4_t row0V,row1V,row2V,row3V;
147 float32x4x2_t ra0,ra1,rb0,rb1;
148
149 blkCnt = nColumns >> 2;
150
151 /* The pointer px is set to starting address of the column being processed */
152 px = pOut + i;
153
154 /* Compute 4 outputs at a time.
155 ** a second loop below computes the remaining 1 to 3 samples. */
156 while (blkCnt > 0U) /* Column loop */
157 {
158 row0V = vld1q_f32(pIn);
159 row1V = vld1q_f32(pIn + 1 * nColumns);
160 row2V = vld1q_f32(pIn + 2 * nColumns);
161 row3V = vld1q_f32(pIn + 3 * nColumns);
162 pIn += 4;
163
164 ra0 = vzipq_f32(row0V,row2V);
165 ra1 = vzipq_f32(row1V,row3V);
166
167 rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
168 rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
169
170 vst1q_f32(px,rb0.val[0]);
171 px += nRows;
172
173 vst1q_f32(px,rb0.val[1]);
174 px += nRows;
175
176 vst1q_f32(px,rb1.val[0]);
177 px += nRows;
178
179 vst1q_f32(px,rb1.val[1]);
180 px += nRows;
181
182 /* Decrement the column loop counter */
183 blkCnt--;
184 }
185
186 /* Perform matrix transpose for last 3 samples here. */
187 blkCnt = nColumns % 0x4U;
188
189 while (blkCnt > 0U)
190 {
191 /* Read and store the input element in the destination */
192 *px++ = *pIn;
193 *px++ = *(pIn + 1 * nColumns);
194 *px++ = *(pIn + 2 * nColumns);
195 *px++ = *(pIn + 3 * nColumns);
196
197 px += (nRows - 4);
198 pIn++;
199
200 /* Decrement the column loop counter */
201 blkCnt--;
202 }
203
204 i += 4;
205 pIn += 3 * nColumns;
206
207 /* Decrement the row loop counter */
208 rowCnt--;
209
210 } /* Row loop end */
211
212 rowCnt = row & 3;
213 while (rowCnt > 0U)
214 {
215 blkCnt = nColumns ;
216 /* The pointer px is set to starting address of the column being processed */
217 px = pOut + i;
218
219 while (blkCnt > 0U)
220 {
221 /* Read and store the input element in the destination */
222 *px = *pIn++;
223
224 /* Update the pointer px to point to the next row of the transposed matrix */
225 px += nRows;
226
227 /* Decrement the column loop counter */
228 blkCnt--;
229 }
230 i++;
231 rowCnt -- ;
232 }
233
234 /* Set status as ARM_MATH_SUCCESS */
235 status = ARM_MATH_SUCCESS;
236 }
237
238 /* Return to application */
239 return (status);
240 }
241 #else
arm_mat_trans_f32(const arm_matrix_instance_f32 * pSrc,arm_matrix_instance_f32 * pDst)242 ARM_DSP_ATTRIBUTE arm_status arm_mat_trans_f32(
243 const arm_matrix_instance_f32 * pSrc,
244 arm_matrix_instance_f32 * pDst)
245 {
246 float32_t *pIn = pSrc->pData; /* input data matrix pointer */
247 float32_t *pOut = pDst->pData; /* output data matrix pointer */
248 float32_t *px; /* Temporary output data matrix pointer */
249 uint16_t nRows = pSrc->numRows; /* number of rows */
250 uint16_t nCols = pSrc->numCols; /* number of columns */
251 uint32_t col, row = nRows, i = 0U; /* Loop counters */
252 arm_status status; /* status of matrix transpose */
253
254 #ifdef ARM_MATH_MATRIX_CHECK
255
256 /* Check for matrix mismatch condition */
257 if ((pSrc->numRows != pDst->numCols) ||
258 (pSrc->numCols != pDst->numRows) )
259 {
260 /* Set status as ARM_MATH_SIZE_MISMATCH */
261 status = ARM_MATH_SIZE_MISMATCH;
262 }
263 else
264
265 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
266
267 {
268 /* Matrix transpose by exchanging the rows with columns */
269 /* row loop */
270 do
271 {
272 /* Pointer px is set to starting address of column being processed */
273 px = pOut + i;
274
275 #if defined (ARM_MATH_LOOPUNROLL)
276
277 /* Loop unrolling: Compute 4 outputs at a time */
278 col = nCols >> 2U;
279
280 while (col > 0U) /* column loop */
281 {
282 /* Read and store input element in destination */
283 *px = *pIn++;
284 /* Update pointer px to point to next row of transposed matrix */
285 px += nRows;
286
287 *px = *pIn++;
288 px += nRows;
289
290 *px = *pIn++;
291 px += nRows;
292
293 *px = *pIn++;
294 px += nRows;
295
296 /* Decrement column loop counter */
297 col--;
298 }
299
300 /* Loop unrolling: Compute remaining outputs */
301 col = nCols % 0x4U;
302
303 #else
304
305 /* Initialize col with number of samples */
306 col = nCols;
307
308 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
309
310 while (col > 0U)
311 {
312 /* Read and store input element in destination */
313 *px = *pIn++;
314
315 /* Update pointer px to point to next row of transposed matrix */
316 px += nRows;
317
318 /* Decrement column loop counter */
319 col--;
320 }
321
322 i++;
323
324 /* Decrement row loop counter */
325 row--;
326
327 } while (row > 0U); /* row loop end */
328
329 /* Set status as ARM_MATH_SUCCESS */
330 status = ARM_MATH_SUCCESS;
331 }
332
333 /* Return to application */
334 return (status);
335 }
336 #endif /* #if defined(ARM_MATH_NEON) */
337 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
338
339 /**
340 * @} end of MatrixTrans group
341 */
342