1 // -*- C++ -*-
2 /** @file */
3 #pragma once
4
5 #ifdef DOXYGEN
6 #define ARM_MATH_DSP
7 #undef ARM_MATH_MVEI
8 #undef ARM_MATH_MVEF
9 #undef ARM_MATH_NEON
10 #endif
11
12 /** \addtogroup DSPALG
13 * @{
14 */
15
16 #if defined(ARM_MATH_DSP)
17 #if !defined(ARM_MATH_MVEI) && !defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON)
18
19 template<typename MA,
20 typename MB,
21 typename std::enable_if<
22 std::is_same<typename traits<MA>::Scalar,Q15>::value &&
23 number_traits<typename traits<MA>::Scalar>::is_fixed,bool>::type = true>
_arm_mat_trans(const MA & src,MB & dst,const DSP * =nullptr)24 __STATIC_INLINE void _arm_mat_trans(
25 const MA &src,
26 MB &dst,
27 const DSP* = nullptr)
28 {
29 using T = typename traits<MA>::Scalar;
30 using VEC = typename vector_traits<T>::vector;
31 constexpr int nb_lanes = vector_traits<T>::nb_lanes;
32
33 T *pIn = src.ptr(); /* input data matrix pointer */
34 T *pOut = dst.ptr(); /* output data matrix pointer */
35 uint16_t nRows = src.rows(); /* number of rows */
36 uint16_t nCols = src.columns(); /* number of columns */
37 uint32_t col, row = nRows, i = 0U; /* Loop counters */
38
39 VEC in; /* variable to hold temporary output */
40
41 /* Matrix transpose by exchanging the rows with columns */
42 /* row loop */
43 do
44 {
45 /* Pointer pOut is set to starting address of column being processed */
46 pOut = dst.ptr() + i;
47
48
49 /* Loop unrolling: Compute 4 outputs at a time */
50 col = nCols / (2*nb_lanes);
51
52 while (col > 0U) /* column loop */
53 {
54 /* Read two elements from row */
55 in = inner::vload1<1>(pIn);
56 pIn += nb_lanes;
57
58 /* Unpack and store one element in destination */
59 *pOut = Q15(in.v);
60 /* Update pointer pOut to point to next row of transposed matrix */
61 pOut += dst.stride();
62
63 /* Unpack and store second element in destination */
64 *pOut = Q15((in.v & (q31_t) 0xffff0000) >> 16);
65 /* Update pointer pOut to point to next row of transposed matrix */
66 pOut += dst.stride();
67
68 /* Read two elements from row */
69 in = inner::vload1<1>(pIn);
70 pIn += nb_lanes;
71
72 /* Unpack and store one element in destination */
73 *pOut = Q15(in.v);
74 /* Update pointer pOut to point to next row of transposed matrix */
75 pOut += dst.stride();
76
77 /* Unpack and store second element in destination */
78 *pOut = Q15((in & (q31_t) 0xffff0000) >> 16);
79 /* Update pointer pOut to point to next row of transposed matrix */
80 pOut += dst.stride();
81
82 /* Decrement column loop counter */
83 col--;
84 }
85
86 /* Loop unrolling: Compute remaining outputs */
87 col = nCols & (2*nb_lanes-1);
88 while (col > 0U)
89 {
90 /* Read and store input element in destination */
91 *pOut = *pIn++;
92
93 /* Update pointer pOut to point to next row of transposed matrix */
94 pOut += dst.stride();
95
96 /* Decrement column loop counter */
97 col--;
98 }
99
100 pIn += (src.stride()-nCols);
101
102 i ++;
103
104 /* Decrement row loop counter */
105 row--;
106
107 } while (row > 0U); /* row loop end */
108
109
110
111 }
112
113
114 template<typename M,
115 typename V,
116 typename RES,
117 typename std::enable_if<
118 !std::is_same<typename traits<M>::Scalar,Q31>::value &&
119 number_traits<typename traits<M>::Scalar>::is_fixed,bool>::type = true>
_dot_m_v(RES & res,const M & m,const V & v,const DSP * =nullptr)120 inline void _dot_m_v(RES &res,
121 const M&m,const V&v,
122 const DSP* = nullptr)
123 {
124 using T = typename traits<M>::Scalar;
125 using ACC = typename vector_traits<T>::temp_accumulator;
126 using VEC = typename vector_traits<T>::vector;
127 constexpr int nb_lanes = vector_traits<T>::nb_lanes;
128
129 uint32_t numRows = m.rows();
130 uint32_t numCols = m.columns();
131 const T *pSrcA = m.ptr();
132 const T *pInA1; /* input data matrix pointer A of Q15 type */
133 const T *pInA2; /* input data matrix pointer A of Q15 type */
134 const T *pInA3; /* input data matrix pointer A of Q15 type */
135 const T *pInA4; /* input data matrix pointer A of Q15 type */
136 T *px; /* Temporary output data matrix pointer */
137 uint16_t i, row; /* loop counters */
138 int16_t colCnt;
139 VEC matData, matData2, vecData, vecData2;
140 T tmpData;
141
142
143 /* Process 4 rows at a time */
144 row = numRows >> 2;
145 i = 0u;
146 px = res.ptr();
147
148 /* The following loop performs the dot-product of each row in pSrcA with the vector */
149 /* row loop */
150 while (row > 0) {
151 /* Initialize accumulators */
152 ACC sum1 = ACC{};
153 ACC sum2 = ACC{};
154 ACC sum3 = ACC{};
155 ACC sum4 = ACC{};
156
157 /* For every row wise process, the pInVec pointer is set
158 ** to the starting address of the vector */
159
160 /* Loop unrolling: process 2 columns per iteration */
161
162 /* Initialize pointers to the starting address of the column being processed */
163 pInA1 = pSrcA + i;
164 pInA2 = pInA1 + m.stride();
165 pInA3 = pInA2 + m.stride();
166 pInA4 = pInA3 + m.stride();
167
168 // Main loop: matrix-vector multiplication
169 for(colCnt = 0 ; colCnt <= (int16_t)numCols - nb_lanes; colCnt += nb_lanes)
170 {
171 // Read 2 values from vector
172 vecData = v.vector_op(colCnt);
173
174 // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
175 matData = inner::vload1<1> (pInA1);
176 pInA1 += nb_lanes;
177 sum1 = inner::vmacc(sum1, matData, vecData);
178
179 matData = inner::vload1<1> (pInA2);
180 pInA2 += nb_lanes;
181 sum2 = inner::vmacc(sum2, matData, vecData);
182
183 matData = inner::vload1<1> (pInA3);
184 pInA3 += nb_lanes;
185 sum3 = inner::vmacc(sum3, matData, vecData);
186
187 matData = inner::vload1<1> (pInA4);
188 pInA4 += nb_lanes;
189 sum4 = inner::vmacc(sum4, matData, vecData);
190
191 // Decrement the loop counter
192 }
193
194 /* process any remaining columns */
195
196 for(; colCnt < (int16_t)numCols; colCnt ++)
197 {
198 tmpData = v[colCnt];
199 sum1 = inner::mac(sum1,*pInA1++ , tmpData);
200 sum2 = inner::mac(sum2,*pInA2++ , tmpData);
201 sum3 = inner::mac(sum3,*pInA3++ , tmpData);
202 sum4 = inner::mac(sum4,*pInA4++ , tmpData);
203 }
204
205 /* Saturate and store the result in the destination buffer */
206 *px++ = inner::from_accumulator(sum1);
207 *px++ = inner::from_accumulator(sum2);
208 *px++ = inner::from_accumulator(sum3);
209 *px++ = inner::from_accumulator(sum4);
210
211 i = i + m.stride() * 4;
212
213 /* Decrement the row loop counter */
214 row--;
215 }
216
217 /* process any remaining rows */
218 row = numRows & 3u;
219 while (row > 0) {
220
221 ACC sum = ACC{};
222 pInA1 = pSrcA + i;
223
224 // loop unrolling - process 4 elements at a time
225
226 for(colCnt = 0 ; colCnt <= (int16_t)numCols - 2*nb_lanes; colCnt += 2*nb_lanes)
227 {
228 vecData = v.vector_op(colCnt);
229 vecData2 = v.vector_op(colCnt+nb_lanes);
230
231 matData = inner::vload1<1>(pInA1);
232 pInA1 += nb_lanes;
233 matData2 = inner::vload1<1>(pInA1);
234 pInA1 += nb_lanes;
235 sum = inner::vmacc(sum, matData, vecData);
236 sum = inner::vmacc(sum, matData2, vecData2);
237 }
238
239 // process remainder of row
240 for(; colCnt < (int16_t)numCols; colCnt ++)
241 {
242
243 sum = inner::mac(sum, *pInA1++ , v[colCnt]);
244 }
245 *px++ = inner::from_accumulator(sum);
246 i = i + m.stride();
247 row--;
248 }
249 }
250
251 template<typename MA,
252 typename MB,
253 typename RES,
254 typename TMP,
255 typename std::enable_if<
256 !std::is_same<typename traits<MA>::Scalar,Q31>::value &&
257 number_traits<typename traits<MA>::Scalar>::is_fixed,bool>::type = true>
_dot_m_m(const MA & pSrcA,const MB & pSrcB,RES && pDst,const TMP & BT,const DSP * =nullptr)258 __STATIC_INLINE void _dot_m_m(const MA&pSrcA,const MB&pSrcB,
259 RES &&pDst,
260 const TMP &BT,
261 const DSP* = nullptr)
262 {
263 using T = typename traits<MA>::Scalar;
264 using ACC = typename vector_traits<T>::temp_accumulator;
265 using VEC = typename vector_traits<T>::vector;
266 constexpr int nb_lanes = vector_traits<T>::nb_lanes;
267
268 ACC sum; /* Accumulator */
269
270
271 T *pSrcBT = BT.ptr(); /* Input data matrix pointer for transpose */
272 T *pInA = pSrcA.ptr(); /* Input data matrix pointer A of Q15 type */
273 T *pInB = pSrcB.ptr(); /* Input data matrix pointer B of Q15 type */
274 T *px; /* Temporary output data matrix pointer */
275 uint16_t numRowsA = pSrcA.rows(); /* Number of rows of input matrix A */
276 uint16_t numColsB = pSrcB.columns(); /* Number of columns of input matrix B */
277 uint16_t numColsA = pSrcA.columns(); /* Number of columns of input matrix A */
278 uint16_t numRowsB = pSrcB.rows(); /* Number of rows of input matrix B */
279 uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
280
281 VEC inA1, inB1, inA2, inB2;
282
283
284 /* Reset variables for usage in following multiplication process */
285 row = numRowsA;
286 i = 0U;
287 px = pDst.ptr();
288
289 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
290 /* row loop */
291 do
292 {
293 /* For every row wise process, column loop counter is to be initiated */
294 col = numColsB;
295
296 /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
297 pInB = pSrcBT;
298
299 /* column loop */
300 do
301 {
302 /* Set variable sum, that acts as accumulator, to zero */
303 sum = ACC{};
304
305 /* Initiate pointer pInA to point to starting address of column being processed */
306 pInA = pSrcA.ptr() + i;
307
308 /* Apply loop unrolling and compute 2 MACs simultaneously. */
309 colCnt = numColsA / (2*nb_lanes);
310
311 /* matrix multiplication */
312 while (colCnt > 0U)
313 {
314 /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
315
316 /* read real and imag values from pSrcA and pSrcB buffer */
317 inA1 = inner::vload1<1> (pInA);
318 pInA += nb_lanes;
319 inB1 = inner::vload1<1> (pInB);
320 pInB += nb_lanes;
321
322 inA2 = inner::vload1<1> (pInA);
323 pInA += nb_lanes;
324 inB2 = inner::vload1<1> (pInB);
325 pInB += nb_lanes;
326
327 /* Multiply and Accumulates */
328 sum = inner::vmacc(sum, inA1, inB1);
329 sum = inner::vmacc(sum, inA2, inB2);
330
331 /* Decrement loop counter */
332 colCnt--;
333 }
334
335 /* process remaining column samples */
336 colCnt = numColsA & (2*nb_lanes-1);
337
338 while (colCnt > 0U)
339 {
340 /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
341 sum = inner::mac(sum ,*pInA++ , *pInB++);
342
343 /* Decrement loop counter */
344 colCnt--;
345 }
346
347 /* Saturate and store result in destination buffer */
348 *px = inner::from_accumulator(sum);
349 px++;
350
351 /* Decrement column loop counter */
352 col--;
353
354 } while (col > 0U);
355
356 i = i + pSrcA.stride();
357
358 /* Decrement row loop counter */
359 row--;
360
361 } while (row > 0U);
362
363 }
364 #endif
365 #endif
366
367 /*! @} */