1 // -*- C++ -*-
2 
3 #pragma once
4 
5 #include <dsppp/arch.hpp>
6 #include <type_traits>
7 #include <dsppp/number.hpp>
8 #if 0
9 
10 template<typename T>
11 void _Add(const T* pSrcA,
12            const T* pSrcB,
13            T* pDst,
14            const std::size_t l,
15            const Neon* = nullptr,
16            typename std::enable_if<number_traits<T>::is_float &&
17                                 vector_traits<T,Neon>::has_vector,T>::type* = nullptr)
18 {
19     using num = vector_traits<T,Neon>;
20     using VecType = typename num::vector;
21     constexpr int nb_lanes = num::nb_lanes;
22     constexpr int lanes_shift = shiftFromValue(nb_lanes);
23     constexpr int lanes_mask = maskFromShift(lanes_shift);
24 
25     //std::cout << "Neon float\r\n" ;
26 
27     uint32_t blkCnt;                               /* Loop counter */
28 
29     VecType vec1;
30     VecType vec2;
31     VecType res;
32 
33     /* Compute several lanes at a time */
34     blkCnt = l >> lanes_shift;
35 
36     while (blkCnt > 0U)
37     {
38         /* C = A + B */
39 
40         /* Add and then store the results in the destination buffer. */
41         vec1 = vld1q(pSrcA);
42         vec2 = vld1q(pSrcB);
43         res = vaddq(vec1, vec2);
44         vst1q(pDst, res);
45 
46         /* Increment pointers */
47         pSrcA += nb_lanes;
48         pSrcB += nb_lanes;
49         pDst += nb_lanes;
50 
51         /* Decrement the loop counter */
52         blkCnt--;
53     }
54 
55     /* Tail */
56     blkCnt = l & lanes_mask;
57 
58     while (blkCnt > 0U)
59     {
60       /* C = A + B */
61 
62       /* Add and store result in destination buffer. */
63       *pDst++ = (*pSrcA++) + (*pSrcB++);
64 
65       /* Decrement loop counter */
66       blkCnt--;
67     }
68 
69 
70 };
71 
72 
73 
74 
75 template<typename T>
76 void _Add(const T* pSrcA_Q,
77            const T* pSrcB_Q,
78            T* pDst_Q,
79            const std::size_t l,
80            const Neon* = nullptr,
81            typename std::enable_if<number_traits<T>::is_fixed &&
82                                    vector_traits<T,Neon>::has_vector,T>::type* = nullptr)
83 {
84     using num = vector_traits<T,Neon>;
85     using VecType = typename num::vector;
86     using value_type = typename T::value_type;
87     constexpr int nb_lanes = num::nb_lanes;
88     constexpr int lanes_shift = shiftFromValue(nb_lanes);
89     constexpr int lanes_mask = maskFromShift(lanes_shift);
90     const value_type *pSrcA = reinterpret_cast<const value_type*>(pSrcA_Q);
91     const value_type *pSrcB = reinterpret_cast<const value_type*>(pSrcB_Q);
92           value_type *pDst = reinterpret_cast<value_type*>(pDst_Q);
93 
94     uint32_t  blkCnt;           /* loop counters */
95     VecType vecA;
96     VecType vecB;
97 
98     /* Compute 8 outputs at a time */
99     blkCnt = l >> lanes_shift;
100     while (blkCnt > 0U)
101     {
102         /*
103          * C = A + B
104          * Add and then store the results in the destination buffer.
105          */
106         vecA = vld1q(pSrcA);
107         vecB = vld1q(pSrcB);
108         vst1q(pDst, vqaddq(vecA, vecB));
109         /*
110          * Decrement the blockSize loop counter
111          */
112         blkCnt--;
113         /*
114          * advance vector source and destination pointers
115          */
116         pSrcA  += nb_lanes;
117         pSrcB  += nb_lanes;
118         pDst   += nb_lanes;
119     }
120     /*
121      * tail
122      */
123     blkCnt = l & lanes_mask;
124     if (blkCnt > 0U)
125     {
126         mve_pred16_t p0 = num::vctpq(blkCnt);
127         vecA = vld1q(pSrcA);
128         vecB = vld1q(pSrcB);
129         vstrq_p(pDst, vqaddq(vecA, vecB), p0);
130     }
131 }
132 
133 #endif