1 // -*- C++ -*-
2 /** @file */
3 #pragma once
4 
5 /** \addtogroup GenericNumber
6  *  \ingroup NUMBER
7  *  @{
8  *  \addtogroup GenericTUPLENumber Tuple
9  *  Tuples of numbers or expressions used for unrolling
10  *  \ingroup GenericNumber
11  *  @{
12  */
13 
14 /**
15  * @brief      Number description for a tuple of numbers
16  *
17  * @tparam     E     Datatype for all numbers
18  */
19 template<typename ... E>
20 struct number_traits<std::tuple<E...>>
21 {
22    //! It is not a float number
23    static constexpr bool is_float = false;
24 
25    //! It is not a fixed point number
26    static constexpr bool is_fixed = false;
27 
28    //! Tuple of accumulator datatype for the accumulator type
29    typedef std::tuple<typename number_traits<E>::accumulator...> accumulator;
30 
31    //! Tuple of compute datatype for the compute type
32    typedef std::tuple<typename number_traits<E>::compute_type...> compute_type;
33 
34    /**
35     * @brief      Return of tuples of one values
36     *
37     * @return     Tuples of one values with different datatypes
38     */
onenumber_traits39    static std::tuple<typename number_traits<E>::accumulator...> one()
40    {
41        return(std::make_tuple(vector_traits<E>::one()...));
42    }
43 
44 };
45 
/*

Assume that all E are using the same scalar type or coherent types
like f32 and q31 that have the same number of lanes.

Any other mix will not work and won't be caught at build time.

*/
54 
55 /**
56  * @brief      Tuple of compatible vectors
57  *
58  * @tparam     arch  Current architecture
59  * @tparam     E     List of vector dataypes
60  *
61  * The vector datatypes must be coherent : have same number of lanes
62  * or same lane datatype
63  */
64 template<typename arch,typename ... E>
65 struct vector_traits<std::tuple<E...>,arch> {
66 
67   //! First element of tuple defines the scalar datatype
68   using RefScalar = typename std::tuple_element<0,std::tuple<E...>>::type;
69 
70 
71   //! Temporary accumulator datatype
72   typedef std::tuple<typename vector_traits<E,arch>::temp_accumulator...> temp_accumulator;
73 
74   //! Vector datatype
75   typedef std::tuple<typename vector_traits<E,arch>::vector...> vector;
76 
77   //! Predicate datatype
78   typedef std::tuple<typename vector_traits<E,arch>::predicate_t...> predicate_t;
79 
80   //! Number of lanes (from RefScalar)
81   static constexpr int nb_lanes = vector_traits<RefScalar,arch>::nb_lanes;
82 
83   //! Has vector instructions
84   static constexpr bool has_vector = vector_traits<RefScalar,arch>::has_vector;
85 
86   //! Is a float
87   static constexpr bool is_float = vector_traits<RefScalar,arch>::is_float;
88 
89   //! Is fixed point
90   static constexpr bool is_fixed = vector_traits<RefScalar,arch>::is_fixed;
91 
92   //! Has predicated loop
93   static constexpr bool has_predicate = vector_traits<RefScalar,arch>::has_predicate;
94 
95   /**
96    * @brief      Zero represented with temp accumulator datatype
97    *
98    * @return     Zero represented with temp accumulator datatype
99    */
temp_acc_zerovector_traits100   static temp_accumulator temp_acc_zero()
101   {
102      return(std::make_tuple(vector_traits<E,arch>::temp_acc_zero()...));
103   }
104 
105 };
106 
107 /**
108  * Inner implementation of generic intrinsics
109  * \ingroup GenericNumber
110  */
111 namespace inner {
112 
113 
114 
115   /*
116 
117   Assume that the vctpq is the same for all tuple elements.
118   If it is not the case, we can't get a predicated loop and
119   the code contains additional VPSTTTT and it is not
120   efficient.
121 
122   */
123 #if defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
124   /**
125    * @brief      Tuple of predicates
126    *
127    * @tparam     E     Tuple of datatypes
128    *
129    * The datatypes must be coherent (same number of lanes).
130    * The first element is used to infer the vctpq instruction to use
131    */
132   template<typename ...E>
133   struct vctpq<std::tuple<E...>>
134   {
135      /**
136       * @brief      Make a predicate for predicated loop
137       *
138       * @param[in]  v     Remaining number of iterations
139       *
140       * @return     Predicate
141       */
mkinner::vctpq142      static auto mk(const uint32_t v/*,
143       typename std::enable_if<(vector_traits<E>::nb_lanes == ...),bool>::type* = nullptr*/)
144      {
145         return(vctpq<std::tuple_element_t<0, std::tuple<E...>>>::mk(v));
146      };
147   };
148 #endif
149   /*
150 
151   Typical configuration is vmacc between tuple and tuple
152   but also very common is vmacc between tuple and vector
153 
154   */
155 
156   /**
157    * @brief      Vector accumulate for tuples of vectors
158    *
159    * @param[in]  acc        The accumulator
160    * @param[in]  a          First operand
161    * @param[in]  b          Second operand
162    *
163    * @tparam     A          Accumulator datatype
164    * @tparam     V          Vector datatype
165    * @tparam     Ns         Tuple index
166    *
167    * @return     tuple of results
168    */
169   template<typename A,typename V,std::size_t... Ns>
vmacc_impl(const A & acc,const V & a,const V & b,std::index_sequence<Ns...>)170   __STATIC_FORCEINLINE A vmacc_impl(const A &acc,const V &a,const V &b, std::index_sequence<Ns...>)
171   {
172      return(std::make_tuple(vmacc(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b))...));
173   };
174 
175   /**
176    * @brief      Vector accumulate for tuples of vectors
177    *
178    * @param[in]  acc   The accumulator
179    * @param[in]  a     First operand
180    * @param[in]  b     Second operand
181    *
182    * @tparam     A     Accumulator datatype
183    * @tparam     E     Datatype of tuples elements
184    *
185    * @return     Accumulator result
186    */
187   template<typename A,typename ...E>
188   __STATIC_FORCEINLINE A
vmacc(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b)189   vmacc(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b)
190   {
191      return(vmacc_impl(acc,a,b,std::make_index_sequence<sizeof...(E)>()));
192   };
193 
194   /**
195    * @brief      Predicated vector accumulate for tuple
196    *
197    * @param[in]  acc        Accumulator
198    * @param[in]  a          First operand
199    * @param[in]  b          Second operand
200    * @param[in]  p0         Predicate
201    *
202    * @tparam     A          Accumulator datatype
203    * @tparam     V          Vector datatype
204    * @tparam     B          Predicate datatype
205    * @tparam     Ns         Tuple indexes
206    *
207    * @return     Tuple of accumulated values
208    */
209   template<typename A,typename V,typename B,std::size_t... Ns>
vmacc_impl(const A & acc,const V & a,const V & b,const B p0,std::index_sequence<Ns...>)210   __STATIC_FORCEINLINE A vmacc_impl(const A &acc,const V &a,const V &b, const B p0,std::index_sequence<Ns...>)
211   {
212      return(std::make_tuple(vmacc(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b),p0)...));
213   };
214 
215   /**
216    * @brief      Predicated vector accumulate for tuples
217    *
218    * @param[in]  acc   Accumulator
219    * @param[in]  a     First operand
220    * @param[in]  b     Second operand
221    * @param[in]  p0    Predicate
222    *
223    * @tparam     A     Accumulator datatype
224    * @tparam     B     Predicate datatype
225    * @tparam     E     Dadatype of tuples elements
226    *
227    * @return     Tuple of accumulated vectors
228    */
229   template<typename A,typename B,typename ...E>
230   __STATIC_FORCEINLINE A
vmacc(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b,const B p0)231   vmacc(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b,const B p0)
232   {
233      return(vmacc_impl(acc,a,b,p0,std::make_index_sequence<sizeof...(E)>()));
234   };
235 
236 
237 
238   /**
239    * @brief      Reduce function for tuple
240    *
241    * @param[in]  acc        Accumulator
242    *
243    * @tparam     A          Accumulator datatype
244    * @tparam     Ns         Tuple indexes
245    *
246    * @return     Reduced accumulator values
247    *
248    * Some vector instructions sets cannot accumulate vectors
249    * into a scalar. They accumulate into this vector.
250    * This vector must be reduced to a scalar at the end of
251    * the accumulation loop.
252    */
253   template<typename A,std::size_t... Ns>
vreduce_impl(const A & acc,std::index_sequence<Ns...>)254   __STATIC_FORCEINLINE auto vreduce_impl(const A &acc, std::index_sequence<Ns...>)
255   {
256      return(std::make_tuple(vreduce(std::get<Ns>(acc))...));
257   };
258 
259 /**
260  * @brief      Reduce function for tuples
261  *
262  * @param[in]  acc   The accumulator
263  *
264  * @tparam     E     Datatypes for tuples
265  *
266  * @return     Tuples of reduced values
267  *
268  * Some vector instructions sets cannot accumulate vectors
269  * into a scalar. They accumulate into this vector.
270  * This vector must be reduced to a scalar at the end of
271  * the accumulation loop.
272  *
273  */
274   template<typename ...E>
vreduce(const std::tuple<E...> & acc)275   __STATIC_FORCEINLINE auto vreduce(const std::tuple<E...> &acc)
276   {
277      return(vreduce_impl(acc,std::make_index_sequence<sizeof...(E)>()));
278   };
279 
280    /**
281     * @brief      Convert from accumulator value
282     *
283     * @param[in]  acc        The accumulator
284     *
285     * @tparam     A          Accumulator datatype
286     * @tparam     Ns         Tuples indexes
287     *
288     * @return     Tuples of values
289     */
290    template<typename A,std::size_t... Ns>
from_accumulator_impl(const A & acc,std::index_sequence<Ns...>)291   __STATIC_FORCEINLINE auto from_accumulator_impl(const A &acc, std::index_sequence<Ns...>)
292   {
293      return(std::make_tuple(from_accumulator(std::get<Ns>(acc))...));
294   };
295 
296   /**
297    * @brief      Convert from tuple of accumulator values
298    *
299    * @param[in]  acc   Accumulator
300    *
301    * @tparam     E     Datatypes for tuple
302    *
303    * @return     Tuples of converted accumulator values
304    *
305    * Accumulator may use more bits to avoid saturations.
306    * At the end of the accumulation, the final result must
307    * be converted to the current datatype (it may implies saturation)
308    */
309   template<typename ...E>
from_accumulator(const std::tuple<E...> & acc)310   __STATIC_FORCEINLINE auto from_accumulator(const std::tuple<E...> &acc)
311   {
312      return(from_accumulator_impl(acc,std::make_index_sequence<sizeof...(E)>()));
313   };
314 
315   /**
316    * @brief      Multiply accumulate for tuple of scalar
317    *
318    * @param[in]  acc        Accumulator
319    * @param[in]  a          First operand
320    * @param[in]  b          Second operand
321    *
322    * @tparam     A          Accumulator datatype
323    * @tparam     V          Scalar datatype
324    * @tparam     Ns         Tuple indexes
325    *
326    * @return     Tuples of accumulated values
327    */
328   template<typename A,typename V,std::size_t... Ns>
mac_impl(const A & acc,const V & a,const V & b,std::index_sequence<Ns...>)329   __STATIC_FORCEINLINE A mac_impl(const A &acc,const V &a,const V &b, std::index_sequence<Ns...>)
330   {
331      return(std::make_tuple(mac(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b))...));
332   };
333 
334 /**
335  * @brief      Multiply accumulate
336  *
337  * @param[in]  acc   Accumulator
338  * @param[in]  a     First operand
339  * @param[in]  b     Second operand
340  *
341  * @tparam     A     Accumulator datatype
342  * @tparam     E     Datatypes for tuple
343  *
344  * @return     Accumulated values
345  */
346   template<typename A,typename ...E>
347   __STATIC_FORCEINLINE A
mac(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b)348   mac(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b)
349   {
350      return(mac_impl(acc,a,b,std::make_index_sequence<sizeof...(E)>()));
351   };
352 
353 /**
354    * @brief      Multiply accumulate for tuple of scalar
355    *
356    * @param[in]  acc        Accumulator
357    * @param[in]  a          First operand
358    * @param[in]  b          Second operand
359    * @param[in]  p0         Predicate
360    *
361    * @tparam     A          Accumulator datatype
362    * @tparam     V          Scalar datatype
363    * @tparam     B          Predicate datatype
364    * @tparam     Ns         Tuple indexes
365    *
366    * @return     Tuples of accumulated values
367    */
368  template<typename A,typename V,typename B,std::size_t... Ns>
mac_impl(const A & acc,const V & a,const V & b,const B p0,std::index_sequence<Ns...>)369   __STATIC_FORCEINLINE A mac_impl(const A &acc,const V &a,const V &b, const B p0,std::index_sequence<Ns...>)
370   {
371      return(std::make_tuple(mac(std::get<Ns>(acc),std::get<Ns>(a),std::get<Ns>(b),p0)...));
372   };
373 
374 /**
375  * @brief      Multiply accumulate
376  *
377  * @param[in]  acc   Accumulator
378  * @param[in]  a     First operand
379  * @param[in]  b     Second operand
380  * @param[in]  p0    Predicate
381  *
382  * @tparam     A     Accumulator datatype
383  * @tparam     B          Predicate datatype
384  * @tparam     E     Datatypes for tuple
385  *
386  * @return     Accumulated values
387  */
388    template<typename A,typename B,typename ...E>
389   __STATIC_FORCEINLINE A
mac(const A & acc,const std::tuple<E...> & a,const std::tuple<E...> & b,const B p0)390   mac(const A &acc,const std::tuple<E...> &a,const std::tuple<E...> &b,const B p0)
391   {
392      return(mac_impl(acc,a,b,p0,std::make_index_sequence<sizeof...(E)>()));
393   };
394 
395 };
396 
397 
398 /*! @} */
399 /*! @} */