1 // -*- C++ -*-
2 /** @file */
3 #pragma once
4 
5 #ifdef DOXYGEN
6 #define ARM_MATH_DSP
7 #undef ARM_MATH_MVEI
8 #undef ARM_MATH_MVEF
9 #undef ARM_MATH_NEON
10 #endif
11 
12 /** \addtogroup DSPALG
13  *  @{
14  */
15 
16 #if defined(ARM_MATH_DSP)
17 #if !defined(ARM_MATH_MVEI) && !defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON)
18 
19 template<typename MA,
20          typename MB,
21          typename std::enable_if<
22          std::is_same<typename traits<MA>::Scalar,Q15>::value &&
23          number_traits<typename traits<MA>::Scalar>::is_fixed,bool>::type = true>
_arm_mat_trans(const MA & src,MB & dst,const DSP * =nullptr)24 __STATIC_INLINE void _arm_mat_trans(
25     const MA    &src,
26     MB          &dst,
27     const DSP* = nullptr)
28 {
29     using T = typename traits<MA>::Scalar;
30     using VEC = typename vector_traits<T>::vector;
31     constexpr int nb_lanes = vector_traits<T>::nb_lanes;
32 
33     T *pIn = src.ptr();                      /* input data matrix pointer */
34     T *pOut = dst.ptr();                     /* output data matrix pointer */
35     uint16_t nRows = src.rows();                /* number of rows */
36     uint16_t nCols = src.columns();                /* number of columns */
37     uint32_t col, row = nRows, i = 0U;             /* Loop counters */
38 
39     VEC in;                                      /* variable to hold temporary output  */
40 
41     /* Matrix transpose by exchanging the rows with columns */
42     /* row loop */
43     do
44     {
45       /* Pointer pOut is set to starting address of column being processed */
46       pOut = dst.ptr() + i;
47 
48 
49       /* Loop unrolling: Compute 4 outputs at a time */
50       col = nCols / (2*nb_lanes);
51 
52       while (col > 0U)        /* column loop */
53       {
54         /* Read two elements from row */
55         in = inner::vload1<1>(pIn);
56         pIn += nb_lanes;
57 
58         /* Unpack and store one element in  destination */
59         *pOut = Q15(in.v);
60         /* Update pointer pOut to point to next row of transposed matrix */
61         pOut += dst.stride();
62 
63         /* Unpack and store second element in destination */
64         *pOut = Q15((in.v & (q31_t) 0xffff0000) >> 16);
65         /* Update  pointer pOut to point to next row of transposed matrix */
66         pOut += dst.stride();
67 
68         /* Read two elements from row */
69         in = inner::vload1<1>(pIn);
70         pIn += nb_lanes;
71 
72         /* Unpack and store one element in destination */
73         *pOut = Q15(in.v);
74         /* Update pointer pOut to point to next row of transposed matrix */
75         pOut += dst.stride();
76 
77         /* Unpack and store second element in destination */
78         *pOut = Q15((in & (q31_t) 0xffff0000) >> 16);
79         /* Update pointer pOut to point to next row of transposed matrix */
80         pOut += dst.stride();
81 
82         /* Decrement column loop counter */
83         col--;
84       }
85 
86       /* Loop unrolling: Compute remaining outputs */
87       col = nCols & (2*nb_lanes-1);
88       while (col > 0U)
89       {
90         /* Read and store input element in destination */
91         *pOut = *pIn++;
92 
93         /* Update pointer pOut to point to next row of transposed matrix */
94         pOut += dst.stride();
95 
96         /* Decrement column loop counter */
97         col--;
98       }
99 
100       pIn += (src.stride()-nCols);
101 
102       i ++;
103 
104       /* Decrement row loop counter */
105       row--;
106 
107     } while (row > 0U);          /* row loop end */
108 
109 
110 
111 }
112 
113 
114 template<typename M,
115          typename V,
116          typename RES,
117          typename std::enable_if<
118          !std::is_same<typename traits<M>::Scalar,Q31>::value &&
119          number_traits<typename traits<M>::Scalar>::is_fixed,bool>::type = true>
_dot_m_v(RES & res,const M & m,const V & v,const DSP * =nullptr)120 inline void _dot_m_v(RES &res,
121                      const M&m,const V&v,
122                      const DSP* = nullptr)
123 {
124     using T = typename traits<M>::Scalar;
125     using ACC = typename vector_traits<T>::temp_accumulator;
126     using VEC = typename vector_traits<T>::vector;
127     constexpr int nb_lanes = vector_traits<T>::nb_lanes;
128 
129     uint32_t numRows = m.rows();
130     uint32_t numCols = m.columns();
131     const T *pSrcA = m.ptr();
132     const T *pInA1;      /* input data matrix pointer A of Q15 type */
133     const T *pInA2;      /* input data matrix pointer A of Q15 type */
134     const T *pInA3;      /* input data matrix pointer A of Q15 type */
135     const T *pInA4;      /* input data matrix pointer A of Q15 type */
136     T *px;               /* Temporary output data matrix pointer */
137     uint16_t i, row; /* loop counters */
138     int16_t colCnt;
139     VEC matData, matData2, vecData, vecData2;
140     T tmpData;
141 
142 
143     /* Process 4 rows at a time */
144     row = numRows >> 2;
145     i = 0u;
146     px = res.ptr();
147 
148     /* The following loop performs the dot-product of each row in pSrcA with the vector */
149     /* row loop */
150     while (row > 0) {
151         /* Initialize accumulators */
152         ACC sum1 = ACC{};
153         ACC sum2 = ACC{};
154         ACC sum3 = ACC{};
155         ACC sum4 = ACC{};
156 
157         /* For every row wise process, the pInVec pointer is set
158          ** to the starting address of the vector */
159 
160         /* Loop unrolling: process 2 columns per iteration */
161 
162         /* Initialize pointers to the starting address of the column being processed */
163         pInA1 = pSrcA + i;
164         pInA2 = pInA1 + m.stride();
165         pInA3 = pInA2 + m.stride();
166         pInA4 = pInA3 + m.stride();
167 
168         // Main loop: matrix-vector multiplication
169         for(colCnt = 0 ; colCnt <= (int16_t)numCols - nb_lanes; colCnt += nb_lanes)
170         {
171             // Read 2 values from vector
172             vecData = v.vector_op(colCnt);
173 
174             // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
175             matData =  inner::vload1<1>  (pInA1);
176             pInA1 += nb_lanes;
177             sum1 = inner::vmacc(sum1, matData, vecData);
178 
179             matData = inner::vload1<1>  (pInA2);
180             pInA2 += nb_lanes;
181             sum2 = inner::vmacc(sum2, matData, vecData);
182 
183             matData = inner::vload1<1>  (pInA3);
184             pInA3 += nb_lanes;
185             sum3 = inner::vmacc(sum3, matData, vecData);
186 
187             matData = inner::vload1<1>  (pInA4);
188             pInA4 += nb_lanes;
189             sum4 = inner::vmacc(sum4, matData, vecData);
190 
191             // Decrement the loop counter
192         }
193 
194         /* process any remaining columns */
195 
196         for(; colCnt < (int16_t)numCols; colCnt ++)
197         {
198             tmpData = v[colCnt];
199             sum1 = inner::mac(sum1,*pInA1++ , tmpData);
200             sum2 = inner::mac(sum2,*pInA2++ , tmpData);
201             sum3 = inner::mac(sum3,*pInA3++ , tmpData);
202             sum4 = inner::mac(sum4,*pInA4++ , tmpData);
203         }
204 
205         /* Saturate and store the result in the destination buffer */
206         *px++ = inner::from_accumulator(sum1);
207         *px++ = inner::from_accumulator(sum2);
208         *px++ = inner::from_accumulator(sum3);
209         *px++ = inner::from_accumulator(sum4);
210 
211         i = i + m.stride() * 4;
212 
213         /* Decrement the row loop counter */
214         row--;
215     }
216 
217     /* process any remaining rows */
218     row = numRows & 3u;
219     while (row > 0) {
220 
221         ACC sum = ACC{};
222         pInA1 = pSrcA + i;
223 
224         // loop unrolling - process 4 elements at a time
225 
226         for(colCnt = 0 ; colCnt <= (int16_t)numCols - 2*nb_lanes; colCnt += 2*nb_lanes)
227         {
228             vecData = v.vector_op(colCnt);
229             vecData2 = v.vector_op(colCnt+nb_lanes);
230 
231             matData = inner::vload1<1>(pInA1);
232             pInA1 += nb_lanes;
233             matData2 = inner::vload1<1>(pInA1);
234             pInA1 += nb_lanes;
235             sum = inner::vmacc(sum, matData, vecData);
236             sum = inner::vmacc(sum, matData2, vecData2);
237         }
238 
239         // process remainder of row
240         for(; colCnt < (int16_t)numCols; colCnt ++)
241         {
242 
243             sum = inner::mac(sum, *pInA1++ , v[colCnt]);
244         }
245         *px++ = inner::from_accumulator(sum);
246         i = i + m.stride();
247         row--;
248     }
249 }
250 
251 template<typename MA,
252          typename MB,
253          typename RES,
254          typename TMP,
255          typename std::enable_if<
256          !std::is_same<typename traits<MA>::Scalar,Q31>::value &&
257          number_traits<typename traits<MA>::Scalar>::is_fixed,bool>::type = true>
_dot_m_m(const MA & pSrcA,const MB & pSrcB,RES && pDst,const TMP & BT,const DSP * =nullptr)258 __STATIC_INLINE void _dot_m_m(const MA&pSrcA,const MB&pSrcB,
259                      RES &&pDst,
260                      const TMP &BT,
261                      const DSP* = nullptr)
262 {
263   using T = typename traits<MA>::Scalar;
264   using ACC = typename vector_traits<T>::temp_accumulator;
265   using VEC = typename vector_traits<T>::vector;
266   constexpr int nb_lanes = vector_traits<T>::nb_lanes;
267 
268           ACC sum;                                     /* Accumulator */
269 
270 
271         T *pSrcBT = BT.ptr();                        /* Input data matrix pointer for transpose */
272         T *pInA = pSrcA.ptr();                    /* Input data matrix pointer A of Q15 type */
273         T *pInB = pSrcB.ptr();                    /* Input data matrix pointer B of Q15 type */
274         T *px;                                     /* Temporary output data matrix pointer */
275         uint16_t numRowsA = pSrcA.rows();            /* Number of rows of input matrix A */
276         uint16_t numColsB = pSrcB.columns();            /* Number of columns of input matrix B */
277         uint16_t numColsA = pSrcA.columns();            /* Number of columns of input matrix A */
278         uint16_t numRowsB = pSrcB.rows();            /* Number of rows of input matrix B */
279         uint32_t col, i = 0U, row = numRowsB, colCnt;  /* Loop counters */
280 
281         VEC inA1, inB1, inA2, inB2;
282 
283 
284     /* Reset variables for usage in following multiplication process */
285     row = numRowsA;
286     i = 0U;
287     px = pDst.ptr();
288 
289     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
290     /* row loop */
291     do
292     {
293       /* For every row wise process, column loop counter is to be initiated */
294       col = numColsB;
295 
296       /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
297       pInB = pSrcBT;
298 
299       /* column loop */
300       do
301       {
302         /* Set variable sum, that acts as accumulator, to zero */
303         sum = ACC{};
304 
305         /* Initiate pointer pInA to point to starting address of column being processed */
306         pInA = pSrcA.ptr() + i;
307 
308         /* Apply loop unrolling and compute 2 MACs simultaneously. */
309         colCnt = numColsA / (2*nb_lanes);
310 
311         /* matrix multiplication */
312         while (colCnt > 0U)
313         {
314           /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
315 
316           /* read real and imag values from pSrcA and pSrcB buffer */
317           inA1 = inner::vload1<1> (pInA);
318           pInA += nb_lanes;
319           inB1 = inner::vload1<1> (pInB);
320           pInB += nb_lanes;
321 
322           inA2 = inner::vload1<1> (pInA);
323           pInA += nb_lanes;
324           inB2 = inner::vload1<1> (pInB);
325           pInB += nb_lanes;
326 
327           /* Multiply and Accumulates */
328           sum = inner::vmacc(sum, inA1, inB1);
329           sum = inner::vmacc(sum, inA2, inB2);
330 
331           /* Decrement loop counter */
332           colCnt--;
333         }
334 
335         /* process remaining column samples */
336         colCnt = numColsA & (2*nb_lanes-1);
337 
338         while (colCnt > 0U)
339         {
340           /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
341           sum = inner::mac(sum ,*pInA++ , *pInB++);
342 
343           /* Decrement loop counter */
344           colCnt--;
345         }
346 
347         /* Saturate and store result in destination buffer */
348         *px = inner::from_accumulator(sum);
349         px++;
350 
351         /* Decrement column loop counter */
352         col--;
353 
354       } while (col > 0U);
355 
356       i = i + pSrcA.stride();
357 
358       /* Decrement row loop counter */
359       row--;
360 
361     } while (row > 0U);
362 
363 }
364 #endif
365 #endif
366 
367 /*! @} */