1 // -*- C++ -*- 2 /** @file */ 3 #pragma once 4 5 /** \addtogroup GenericNumber 6 * \ingroup NUMBER 7 * @{ 8 * \addtogroup GenericTUPLENumber Tuple 9 * Tuples of numbers or expressions used for unrolling 10 * \ingroup GenericNumber 11 * @{ 12 */ 13 14 /** 15 * @brief Number description for a tuple of numbers 16 * 17 * @tparam E Datatype for all numbers 18 */ 19 template<typename ... E> 20 struct number_traits<std::tuple<E...>> 21 { 22 //! It is not a float number 23 static constexpr bool is_float = false; 24 25 //! It is not a fixed point number 26 static constexpr bool is_fixed = false; 27 28 //! Tuple of accumulator datatype for the accumulator type 29 typedef std::tuple<typename number_traits<E>::accumulator...> accumulator; 30 31 //! Tuple of compute datatype for the compute type 32 typedef std::tuple<typename number_traits<E>::compute_type...> compute_type; 33 34 /** 35 * @brief Return of tuples of one values 36 * 37 * @return Tuples of one values with different datatypes 38 */ onenumber_traits39 static std::tuple<typename number_traits<E>::accumulator...> one() 40 { 41 return(std::make_tuple(vector_traits<E>::one()...)); 42 } 43 44 }; 45 46 /* 47 48 Assume that all E are using the same scalar type or coherent types 49 like f32 and q13 that have same number of lanes. 50 51 Any other mix will not work and won't be catched at build time. 52 53 */ 54 55 /** 56 * @brief Tuple of compatible vectors 57 * 58 * @tparam arch Current architecture 59 * @tparam E List of vector dataypes 60 * 61 * The vector datatypes must be coherent : have same number of lanes 62 * or same lane datatype 63 */ 64 template<typename arch,typename ... E> 65 struct vector_traits<std::tuple<E...>,arch> { 66 67 //! First element of tuple defines the scalar datatype 68 using RefScalar = typename std::tuple_element<0,std::tuple<E...>>::type; 69 70 71 //! Temporary accumulator datatype 72 typedef std::tuple<typename vector_traits<E,arch>::temp_accumulator...> temp_accumulator; 73 74 //! Vector datatype 75 typedef std::tuple<typename vector_traits<E,arch>::vector...> vector; 76 77 //! Predicate datatype 78 typedef std::tuple<typename vector_traits<E,arch>::predicate_t...> predicate_t; 79 80 //! Number of lanes (from RefScalar) 81 static constexpr int nb_lanes = vector_traits<RefScalar,arch>::nb_lanes; 82 83 //! Has vector instructions 84 static constexpr bool has_vector = vector_traits<RefScalar,arch>::has_vector; 85 86 //! Is a float 87 static constexpr bool is_float = vector_traits<RefScalar,arch>::is_float; 88 89 //! Is fixed point 90 static constexpr bool is_fixed = vector_traits<RefScalar,arch>::is_fixed; 91 92 //! Has predicated loop 93 static constexpr bool has_predicate = vector_traits<RefScalar,arch>::has_predicate; 94 95 /** 96 * @brief Zero represented with temp accumulator datatype 97 * 98 * @return Zero represented with temp accumulator datatype 99 */ temp_acc_zerovector_traits100 static temp_accumulator temp_acc_zero() 101 { 102 return(std::make_tuple(vector_traits<E,arch>::temp_acc_zero()...)); 103 } 104 105 }; 106 107 /** 108 * Inner implementation of generic intrinsics 109 * \ingroup GenericNumber 110 */ 111 namespace inner { 112 113 114 115 /* 116 117 Assume that the vctpq is the same for all tuple elements. 118 If it is not the case, we can't get a predicated loop and 119 the code contains additional VPSTTTT and it is not 120 efficient. 121 122 */ 123 #if defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) 124 /** 125 * @brief Tuple of predicates 126 * 127 * @tparam E Tuple of datatypes 128 * 129 * The datatypes must be coherent (same number of lanes). 130 * The first element is used to infer the vctpq instruction to use 131 */ 132 template<typename ...E> 133 struct vctpq<std::tuple<E...>> 134 { 135 /** 136 * @brief Make a predicate for predicated loop 137 * 138 * @param[in] v Remaining number of iterations 139 * 140 * @return Predicate 141 */ mkinner::vctpq142 static auto mk(const uint32_t v/*, 143 typename std::enable_if<(vector_traits<E>::nb_lanes == ...),bool>::type* = nullptr*/) 144 { 145 return(vctpq<std::tuple_element_t<0, std::tuple<E...>>>::mk(v)); 146 }; 147 }; 148 #endif 149 /* 150 151 Typical configuration is vmacc between tuple and tuple 152 but also very common is vmacc between tuple and vector 153 154 */ 155 156 /** 157 * @brief Vector accumulate for tuples of vectors 158 * 159 * @param[in] acc The accumulator 160 * @param[in] a First operand 161 * @param[in] b Second operand 162 * 163 * @tparam A Accumulator datatype 164 * @tparam V Vector datatype 165 * @tparam Ns Tuple index 166 * 167 * @return tuple of results 168 */ 169 template<typename A,typename V,std::size_t... Ns> vmacc_impl(const A & acc,const V & a,const V & b,std::index_sequence<Ns...>)170 __STATIC_FORCEINLINE A vmacc_impl(const A &acc,const V &a,const V &b, std::index_sequence<Ns...>) 171 { 172 return(std::make_tuple(vmacc(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b))...)); 173 }; 174 175 /** 176 * @brief Vector accumulate for tuples of vectors 177 * 178 * @param[in] acc The accumulator 179 * @param[in] a First operand 180 * @param[in] b Second operand 181 * 182 * @tparam A Accumulator datatype 183 * @tparam E Datatype of tuples elements 184 * 185 * @return Accumulator result 186 */ 187 template<typename A,typename ...E> 188 __STATIC_FORCEINLINE A vmacc(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b)189 vmacc(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b) 190 { 191 return(vmacc_impl(acc,a,b,std::make_index_sequence<sizeof...(E)>())); 192 }; 193 194 /** 195 * @brief Predicated vector accumulate for tuple 196 * 197 * @param[in] acc Accumulator 198 * @param[in] a First operand 199 * @param[in] b Second operand 200 * @param[in] p0 Predicate 201 * 202 * @tparam A Accumulator datatype 203 * @tparam V Vector datatype 204 * @tparam B Predicate datatype 205 * @tparam Ns Tuple indexes 206 * 207 * @return Tuple of accumulated values 208 */ 209 template<typename A,typename V,typename B,std::size_t... Ns> vmacc_impl(const A & acc,const V & a,const V & b,const B p0,std::index_sequence<Ns...>)210 __STATIC_FORCEINLINE A vmacc_impl(const A &acc,const V &a,const V &b, const B p0,std::index_sequence<Ns...>) 211 { 212 return(std::make_tuple(vmacc(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b),p0)...)); 213 }; 214 215 /** 216 * @brief Predicated vector accumulate for tuples 217 * 218 * @param[in] acc Accumulator 219 * @param[in] a First operand 220 * @param[in] b Second operand 221 * @param[in] p0 Predicate 222 * 223 * @tparam A Accumulator datatype 224 * @tparam B Predicate datatype 225 * @tparam E Dadatype of tuples elements 226 * 227 * @return Tuple of accumulated vectors 228 */ 229 template<typename A,typename B,typename ...E> 230 __STATIC_FORCEINLINE A vmacc(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b,const B p0)231 vmacc(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b,const B p0) 232 { 233 return(vmacc_impl(acc,a,b,p0,std::make_index_sequence<sizeof...(E)>())); 234 }; 235 236 237 238 /** 239 * @brief Reduce function for tuple 240 * 241 * @param[in] acc Accumulator 242 * 243 * @tparam A Accumulator datatype 244 * @tparam Ns Tuple indexes 245 * 246 * @return Reduced accumulator values 247 * 248 * Some vector instructions sets cannot accumulate vectors 249 * into a scalar. They accumulate into this vector. 250 * This vector must be reduced to a scalar at the end of 251 * the accumulation loop. 252 */ 253 template<typename A,std::size_t... Ns> vreduce_impl(const A & acc,std::index_sequence<Ns...>)254 __STATIC_FORCEINLINE auto vreduce_impl(const A &acc, std::index_sequence<Ns...>) 255 { 256 return(std::make_tuple(vreduce(std::get<Ns>(acc))...)); 257 }; 258 259 /** 260 * @brief Reduce function for tuples 261 * 262 * @param[in] acc The accumulator 263 * 264 * @tparam E Datatypes for tuples 265 * 266 * @return Tuples of reduced values 267 * 268 * Some vector instructions sets cannot accumulate vectors 269 * into a scalar. They accumulate into this vector. 270 * This vector must be reduced to a scalar at the end of 271 * the accumulation loop. 272 * 273 */ 274 template<typename ...E> vreduce(const std::tuple<E...> & acc)275 __STATIC_FORCEINLINE auto vreduce(const std::tuple<E...> &acc) 276 { 277 return(vreduce_impl(acc,std::make_index_sequence<sizeof...(E)>())); 278 }; 279 280 /** 281 * @brief Convert from accumulator value 282 * 283 * @param[in] acc The accumulator 284 * 285 * @tparam A Accumulator datatype 286 * @tparam Ns Tuples indexes 287 * 288 * @return Tuples of values 289 */ 290 template<typename A,std::size_t... Ns> from_accumulator_impl(const A & acc,std::index_sequence<Ns...>)291 __STATIC_FORCEINLINE auto from_accumulator_impl(const A &acc, std::index_sequence<Ns...>) 292 { 293 return(std::make_tuple(from_accumulator(std::get<Ns>(acc))...)); 294 }; 295 296 /** 297 * @brief Convert from tuple of accumulator values 298 * 299 * @param[in] acc Accumulator 300 * 301 * @tparam E Datatypes for tuple 302 * 303 * @return Tuples of converted accumulator values 304 * 305 * Accumulator may use more bits to avoid saturations. 306 * At the end of the accumulation, the final result must 307 * be converted to the current datatype (it may implies saturation) 308 */ 309 template<typename ...E> from_accumulator(const std::tuple<E...> & acc)310 __STATIC_FORCEINLINE auto from_accumulator(const std::tuple<E...> &acc) 311 { 312 return(from_accumulator_impl(acc,std::make_index_sequence<sizeof...(E)>())); 313 }; 314 315 /** 316 * @brief Multiply accumulate for tuple of scalar 317 * 318 * @param[in] acc Accumulator 319 * @param[in] a First operand 320 * @param[in] b Second operand 321 * 322 * @tparam A Accumulator datatype 323 * @tparam V Scalar datatype 324 * @tparam Ns Tuple indexes 325 * 326 * @return Tuples of accumulated values 327 */ 328 template<typename A,typename V,std::size_t... Ns> mac_impl(const A & acc,const V & a,const V & b,std::index_sequence<Ns...>)329 __STATIC_FORCEINLINE A mac_impl(const A &acc,const V &a,const V &b, std::index_sequence<Ns...>) 330 { 331 return(std::make_tuple(mac(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b))...)); 332 }; 333 334 /** 335 * @brief Multiply accumulate 336 * 337 * @param[in] acc Accumulator 338 * @param[in] a First operand 339 * @param[in] b Second operand 340 * 341 * @tparam A Accumulator datatype 342 * @tparam E Datatypes for tuple 343 * 344 * @return Accumulated values 345 */ 346 template<typename A,typename ...E> 347 __STATIC_FORCEINLINE A mac(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b)348 mac(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b) 349 { 350 return(mac_impl(acc,a,b,std::make_index_sequence<sizeof...(E)>())); 351 }; 352 353 /** 354 * @brief Multiply accumulate for tuple of scalar 355 * 356 * @param[in] acc Accumulator 357 * @param[in] a First operand 358 * @param[in] b Second operand 359 * @param[in] p0 Predicate 360 * 361 * @tparam A Accumulator datatype 362 * @tparam V Scalar datatype 363 * @tparam B Predicate datatype 364 * @tparam Ns Tuple indexes 365 * 366 * @return Tuples of accumulated values 367 */ 368 template<typename A,typename V,typename B,std::size_t... Ns> mac_impl(const A & acc,const V & a,const V & b,const B p0,std::index_sequence<Ns...>)369 __STATIC_FORCEINLINE A mac_impl(const A &acc,const V &a,const V &b, const B p0,std::index_sequence<Ns...>) 370 { 371 return(std::make_tuple(mac(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b),p0)...)); 372 }; 373 374 /** 375 * @brief Multiply accumulate 376 * 377 * @param[in] acc Accumulator 378 * @param[in] a First operand 379 * @param[in] b Second operand 380 * @param[in] p0 Predicate 381 * 382 * @tparam A Accumulator datatype 383 * @tparam B Predicate datatype 384 * @tparam E Datatypes for tuple 385 * 386 * @return Accumulated values 387 */ 388 template<typename A,typename B,typename ...E> 389 __STATIC_FORCEINLINE A mac(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b,const B p0)390 mac(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b,const B p0) 391 { 392 return(mac_impl(acc,a,b,p0,std::make_index_sequence<sizeof...(E)>())); 393 }; 394 395 }; 396 397 398 /*! @} */ 399 /*! @} */