1 // -*- C++ -*-
2 /** @file */
3 #pragma once
4 
5 
6 /** \addtogroup SCALARALG
7  *  @{
8  */
9 
10 /**
11  * @brief      Transposition for scalar architecture
12  *
13  * @param[in]  src        The source
14  * @param      dst        The destination
15  *
16  * @tparam     MA         Source datatype
17  * @tparam     MB         Destination datatype
18  */
19 template<typename MA,
20          typename MB>
_arm_mat_trans(const MA & src,MB & dst,const Scalar * =nullptr)21 __STATIC_INLINE void _arm_mat_trans(
22     const MA    &src,
23     MB          &dst,
24     const Scalar* = nullptr)
25 {
26   DISABLE_LOOP_UNROLL
27   for(index_t r=0;r < dst.rows() ; r++)
28   {
29         dst.row(r) = copy(src.col(r));
30   }
31 }
32 
33 /**
34  * @brief      Matrix times vector for scalar architecture
35  *
36  * @param      res        Destination
37  * @param[in]  m          Matrix
38  * @param[in]  v          Vector (my be expression)
39  *
40  * @tparam     M          Matrix datatype
41  * @tparam     V          Vector datatype
42  * @tparam     RES        Result datatype
43  */
44 template<typename M,
45          typename V,
46          typename RES>
_dot_m_v(RES & res,const M & m,const V & v,const Scalar * =nullptr)47 inline void _dot_m_v(RES &res,
48                      const M&m,const V&v,
49                      const Scalar* = nullptr)
50 {
51     using T = typename traits<M>::Scalar;
52     using Acc = typename number_traits<T>::accumulator;
53     uint32_t numRows = m.rows();
54     uint32_t numCols = m.columns();
55     const T *pSrcA = m.ptr();
56     const T *pInA1;      /* input data matrix pointer A of Q31 type */
57     const T *pInA2;      /* input data matrix pointer A of Q31 type */
58     const T *pInA3;      /* input data matrix pointer A of Q31 type */
59     const T *pInA4;      /* input data matrix pointer A of Q31 type */
60     T *px;               /* Temporary output data matrix pointer */
61     uint32_t i;
62     uint16_t row, colCnt; /* loop counters */
63     T matData, matData2, vecData, vecData2;
64 
65 
66     /* Process 4 rows at a time */
67     row = numRows >> 2;
68     i = 0u;
69     px = res.ptr();
70 
71     /* The following loop performs the dot-product of each row in pSrcA with the vector */
72     /* row loop */
73     while (row > 0) {
74         /* Initialize accumulators */
75         Acc sum1 = Acc{};
76         Acc sum2 = Acc{};
77         Acc sum3 = Acc{};
78         Acc sum4 = Acc{};
79 
80 
81         /* Loop unrolling: process 2 columns per iteration */
82         //colCnt = numCols;
83 
84         /* Initialize pointers to the starting address of the column being processed */
85         pInA1 = pSrcA + i;
86         pInA2 = pInA1 + m.stride();
87         pInA3 = pInA2 + m.stride();
88         pInA4 = pInA3 + m.stride();
89 
90 
91         // Main loop: matrix-vector multiplication
92         for(colCnt = 0 ; colCnt < numCols; colCnt ++)
93         {
94             // Read 2 values from vector
95             vecData = v[colCnt];
96             // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
97             matData = *(pInA1)++;
98             sum1 = inner::mac(sum1, matData, vecData);
99             matData = *(pInA2)++;
100             sum2 = inner::mac(sum2, matData, vecData);
101             matData = *(pInA3)++;
102             sum3 = inner::mac(sum3, matData, vecData);
103             matData = *(pInA4)++;
104             sum4 = inner::mac(sum4, matData, vecData);
105         }
106 
107         /* Saturate and store the result in the destination buffer */
108         *px++ = inner::from_accumulator(sum1);
109         *px++ = inner::from_accumulator(sum2);
110         *px++ = inner::from_accumulator(sum3);
111         *px++ = inner::from_accumulator(sum4);
112 
113         i = i + m.stride() * 4;
114 
115         /* Decrement the row loop counter */
116         row--;
117     }
118 
119     /* process any remaining rows */
120     row = numRows & 3u;
121     while (row > 0) {
122 
123         Acc sum = Acc{};
124         pInA1 = pSrcA + i;
125 
126         int32_t k=0;
127         for(k=0; k <= (int)numCols-2; k += 2)
128         {
129             vecData = v[k];
130             vecData2 = v[k+1];
131             matData = *(pInA1)++;
132             matData2 = *(pInA1)++;
133             sum = inner::mac(sum, matData, vecData);
134             sum = inner::mac(sum, matData2, vecData2);
135         }
136         // process remainder of row
137 
138 
139         for(; k < (int)numCols; k ++)
140         {
141             sum = inner::mac(sum ,*pInA1++, v[k]);
142         }
143 
144         *px++ = inner::from_accumulator(sum);
145         i = i + m.stride();
146         row--;
147     }
148 }
149 
150 #include "matrix_multiply_fixed.hpp"
151 #include "matrix_multiply_float.hpp"
152 
153 /*! @} */