1 // -*- C++ -*- 2 3 #pragma once 4 5 #include <dsppp/arch.hpp> 6 #include <type_traits> 7 #include <dsppp/number.hpp> 8 #if 0 9 10 template<typename T> 11 void _Add(const T* pSrcA, 12 const T* pSrcB, 13 T* pDst, 14 const std::size_t l, 15 const Neon* = nullptr, 16 typename std::enable_if<number_traits<T>::is_float && 17 vector_traits<T,Neon>::has_vector,T>::type* = nullptr) 18 { 19 using num = vector_traits<T,Neon>; 20 using VecType = typename num::vector; 21 constexpr int nb_lanes = num::nb_lanes; 22 constexpr int lanes_shift = shiftFromValue(nb_lanes); 23 constexpr int lanes_mask = maskFromShift(lanes_shift); 24 25 //std::cout << "Neon float\r\n" ; 26 27 uint32_t blkCnt; /* Loop counter */ 28 29 VecType vec1; 30 VecType vec2; 31 VecType res; 32 33 /* Compute several lanes at a time */ 34 blkCnt = l >> lanes_shift; 35 36 while (blkCnt > 0U) 37 { 38 /* C = A + B */ 39 40 /* Add and then store the results in the destination buffer. */ 41 vec1 = vld1q(pSrcA); 42 vec2 = vld1q(pSrcB); 43 res = vaddq(vec1, vec2); 44 vst1q(pDst, res); 45 46 /* Increment pointers */ 47 pSrcA += nb_lanes; 48 pSrcB += nb_lanes; 49 pDst += nb_lanes; 50 51 /* Decrement the loop counter */ 52 blkCnt--; 53 } 54 55 /* Tail */ 56 blkCnt = l & lanes_mask; 57 58 while (blkCnt > 0U) 59 { 60 /* C = A + B */ 61 62 /* Add and store result in destination buffer. */ 63 *pDst++ = (*pSrcA++) + (*pSrcB++); 64 65 /* Decrement loop counter */ 66 blkCnt--; 67 } 68 69 70 }; 71 72 73 74 75 template<typename T> 76 void _Add(const T* pSrcA_Q, 77 const T* pSrcB_Q, 78 T* pDst_Q, 79 const std::size_t l, 80 const Neon* = nullptr, 81 typename std::enable_if<number_traits<T>::is_fixed && 82 vector_traits<T,Neon>::has_vector,T>::type* = nullptr) 83 { 84 using num = vector_traits<T,Neon>; 85 using VecType = typename num::vector; 86 using value_type = typename T::value_type; 87 constexpr int nb_lanes = num::nb_lanes; 88 constexpr int lanes_shift = shiftFromValue(nb_lanes); 89 constexpr int lanes_mask = maskFromShift(lanes_shift); 90 const value_type *pSrcA = reinterpret_cast<const value_type*>(pSrcA_Q); 91 const value_type *pSrcB = reinterpret_cast<const value_type*>(pSrcB_Q); 92 value_type *pDst = reinterpret_cast<value_type*>(pDst_Q); 93 94 uint32_t blkCnt; /* loop counters */ 95 VecType vecA; 96 VecType vecB; 97 98 /* Compute 8 outputs at a time */ 99 blkCnt = l >> lanes_shift; 100 while (blkCnt > 0U) 101 { 102 /* 103 * C = A + B 104 * Add and then store the results in the destination buffer. 105 */ 106 vecA = vld1q(pSrcA); 107 vecB = vld1q(pSrcB); 108 vst1q(pDst, vqaddq(vecA, vecB)); 109 /* 110 * Decrement the blockSize loop counter 111 */ 112 blkCnt--; 113 /* 114 * advance vector source and destination pointers 115 */ 116 pSrcA += nb_lanes; 117 pSrcB += nb_lanes; 118 pDst += nb_lanes; 119 } 120 /* 121 * tail 122 */ 123 blkCnt = l & lanes_mask; 124 if (blkCnt > 0U) 125 { 126 mve_pred16_t p0 = num::vctpq(blkCnt); 127 vecA = vld1q(pSrcA); 128 vecB = vld1q(pSrcB); 129 vstrq_p(pDst, vqaddq(vecA, vecB), p0); 130 } 131 } 132 133 #endif