1 // -*- C++ -*-
2 /** @file */
3 #pragma once
4
5
6 /** \addtogroup SCALARALG
7 * @{
8 */
9
10 /**
11 * @brief Transposition for scalar architecture
12 *
13 * @param[in] src The source
14 * @param dst The destination
15 *
16 * @tparam MA Source datatype
17 * @tparam MB Destination datatype
18 */
19 template<typename MA,
20 typename MB>
_arm_mat_trans(const MA & src,MB & dst,const Scalar * =nullptr)21 __STATIC_INLINE void _arm_mat_trans(
22 const MA &src,
23 MB &dst,
24 const Scalar* = nullptr)
25 {
26 DISABLE_LOOP_UNROLL
27 for(index_t r=0;r < dst.rows() ; r++)
28 {
29 dst.row(r) = copy(src.col(r));
30 }
31 }
32
33 /**
34 * @brief Matrix times vector for scalar architecture
35 *
36 * @param res Destination
37 * @param[in] m Matrix
38 * @param[in] v Vector (my be expression)
39 *
40 * @tparam M Matrix datatype
41 * @tparam V Vector datatype
42 * @tparam RES Result datatype
43 */
44 template<typename M,
45 typename V,
46 typename RES>
_dot_m_v(RES & res,const M & m,const V & v,const Scalar * =nullptr)47 inline void _dot_m_v(RES &res,
48 const M&m,const V&v,
49 const Scalar* = nullptr)
50 {
51 using T = typename traits<M>::Scalar;
52 using Acc = typename number_traits<T>::accumulator;
53 uint32_t numRows = m.rows();
54 uint32_t numCols = m.columns();
55 const T *pSrcA = m.ptr();
56 const T *pInA1; /* input data matrix pointer A of Q31 type */
57 const T *pInA2; /* input data matrix pointer A of Q31 type */
58 const T *pInA3; /* input data matrix pointer A of Q31 type */
59 const T *pInA4; /* input data matrix pointer A of Q31 type */
60 T *px; /* Temporary output data matrix pointer */
61 uint32_t i;
62 uint16_t row, colCnt; /* loop counters */
63 T matData, matData2, vecData, vecData2;
64
65
66 /* Process 4 rows at a time */
67 row = numRows >> 2;
68 i = 0u;
69 px = res.ptr();
70
71 /* The following loop performs the dot-product of each row in pSrcA with the vector */
72 /* row loop */
73 while (row > 0) {
74 /* Initialize accumulators */
75 Acc sum1 = Acc{};
76 Acc sum2 = Acc{};
77 Acc sum3 = Acc{};
78 Acc sum4 = Acc{};
79
80
81 /* Loop unrolling: process 2 columns per iteration */
82 //colCnt = numCols;
83
84 /* Initialize pointers to the starting address of the column being processed */
85 pInA1 = pSrcA + i;
86 pInA2 = pInA1 + m.stride();
87 pInA3 = pInA2 + m.stride();
88 pInA4 = pInA3 + m.stride();
89
90
91 // Main loop: matrix-vector multiplication
92 for(colCnt = 0 ; colCnt < numCols; colCnt ++)
93 {
94 // Read 2 values from vector
95 vecData = v[colCnt];
96 // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
97 matData = *(pInA1)++;
98 sum1 = inner::mac(sum1, matData, vecData);
99 matData = *(pInA2)++;
100 sum2 = inner::mac(sum2, matData, vecData);
101 matData = *(pInA3)++;
102 sum3 = inner::mac(sum3, matData, vecData);
103 matData = *(pInA4)++;
104 sum4 = inner::mac(sum4, matData, vecData);
105 }
106
107 /* Saturate and store the result in the destination buffer */
108 *px++ = inner::from_accumulator(sum1);
109 *px++ = inner::from_accumulator(sum2);
110 *px++ = inner::from_accumulator(sum3);
111 *px++ = inner::from_accumulator(sum4);
112
113 i = i + m.stride() * 4;
114
115 /* Decrement the row loop counter */
116 row--;
117 }
118
119 /* process any remaining rows */
120 row = numRows & 3u;
121 while (row > 0) {
122
123 Acc sum = Acc{};
124 pInA1 = pSrcA + i;
125
126 int32_t k=0;
127 for(k=0; k <= (int)numCols-2; k += 2)
128 {
129 vecData = v[k];
130 vecData2 = v[k+1];
131 matData = *(pInA1)++;
132 matData2 = *(pInA1)++;
133 sum = inner::mac(sum, matData, vecData);
134 sum = inner::mac(sum, matData2, vecData2);
135 }
136 // process remainder of row
137
138
139 for(; k < (int)numCols; k ++)
140 {
141 sum = inner::mac(sum ,*pInA1++, v[k]);
142 }
143
144 *px++ = inner::from_accumulator(sum);
145 i = i + m.stride();
146 row--;
147 }
148 }
149
150 #include "matrix_multiply_fixed.hpp"
151 #include "matrix_multiply_float.hpp"
152
153 /*! @} */