1 /*
2 * Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved.
3
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10
11 * The above copyright notice and this permission notice shall be included in all
12 * copies or substantial portions of the Software.
13
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "../../lv_conf_internal.h"
24 #if LV_USE_THORVG_INTERNAL
25
26 #ifdef THORVG_NEON_VECTOR_SUPPORT
27
28 #include <arm_neon.h>
29
//TODO: add support for Windows on ARM
31
//TVG_AARCH64 is set to 1 when either __ARM_64BIT_STATE or _M_ARM64 is defined, and to 0 otherwise.
//The solid fill routines in this file test it to pick between the four-register
//stores and the single-register stores.
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define TVG_AARCH64 1
#else
#define TVG_AARCH64 0
#endif
37
38
ALPHA_BLEND(uint8x8_t c,uint8x8_t a)39 static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a)
40 {
41 uint16x8_t t = vmull_u8(c, a);
42 return vshrn_n_u16(t, 8);
43 }
44
45
neonRasterGrayscale8(uint8_t * dst,uint8_t val,uint32_t offset,int32_t len)46 static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int32_t len)
47 {
48 dst += offset;
49
50 int32_t i = 0;
51 const uint8x16_t valVec = vdupq_n_u8(val);
52 #if TVG_AARCH64
53 uint8x16x4_t valQuad = {valVec, valVec, valVec, valVec};
54 for (; i <= len - 16 * 4; i += 16 * 4) {
55 vst1q_u8_x4(dst + i, valQuad);
56 }
57 #else
58 for (; i <= len - 16; i += 16) {
59 vst1q_u8(dst + i, valVec);
60 }
61 #endif
62 for (; i < len; i++) {
63 dst[i] = val;
64 }
65 }
66
67
neonRasterPixel32(uint32_t * dst,uint32_t val,uint32_t offset,int32_t len)68 static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
69 {
70 dst += offset;
71
72 uint32x4_t vectorVal = vdupq_n_u32(val);
73
74 #if TVG_AARCH64
75 uint32_t iterations = len / 16;
76 uint32_t neonFilled = iterations * 16;
77 uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
78 for (uint32_t i = 0; i < iterations; ++i) {
79 vst4q_u32(dst, valQuad);
80 dst += 16;
81 }
82 #else
83 uint32_t iterations = len / 4;
84 uint32_t neonFilled = iterations * 4;
85 for (uint32_t i = 0; i < iterations; ++i) {
86 vst1q_u32(dst, vectorVal);
87 dst += 4;
88 }
89 #endif
90 int32_t leftovers = len - neonFilled;
91 while (leftovers--) *dst++ = val;
92 }
93
94
neonRasterTranslucentRle(SwSurface * surface,const SwRle * rle,uint8_t r,uint8_t g,uint8_t b,uint8_t a)95 static bool neonRasterTranslucentRle(SwSurface* surface, const SwRle* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
96 {
97 auto span = rle->spans;
98
99 //32bit channels
100 if (surface->channelSize == sizeof(uint32_t)) {
101 auto color = surface->join(r, g, b, a);
102 uint32_t src;
103 uint8x8_t *vDst = nullptr;
104 uint16_t align;
105
106 for (uint32_t i = 0; i < rle->size; ++i) {
107 if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage);
108 else src = color;
109
110 auto dst = &surface->buf32[span->y * surface->stride + span->x];
111 auto ialpha = IA(src);
112
113 if ((((uintptr_t) dst) & 0x7) != 0) {
114 //fill not aligned byte
115 *dst = src + ALPHA_BLEND(*dst, ialpha);
116 vDst = (uint8x8_t*)(dst + 1);
117 align = 1;
118 } else {
119 vDst = (uint8x8_t*) dst;
120 align = 0;
121 }
122
123 uint8x8_t vSrc = (uint8x8_t) vdup_n_u32(src);
124 uint8x8_t vIalpha = vdup_n_u8((uint8_t) ialpha);
125
126 for (uint32_t x = 0; x < (span->len - align) / 2; ++x)
127 vDst[x] = vadd_u8(vSrc, ALPHA_BLEND(vDst[x], vIalpha));
128
129 auto leftovers = (span->len - align) % 2;
130 if (leftovers > 0) dst[span->len - 1] = src + ALPHA_BLEND(dst[span->len - 1], ialpha);
131
132 ++span;
133 }
134 //8bit grayscale
135 } else if (surface->channelSize == sizeof(uint8_t)) {
136 TVGLOG("SW_ENGINE", "Require Neon Optimization, Channel Size = %d", surface->channelSize);
137 uint8_t src;
138 for (uint32_t i = 0; i < rle->size; ++i, ++span) {
139 auto dst = &surface->buf8[span->y * surface->stride + span->x];
140 if (span->coverage < 255) src = MULTIPLY(span->coverage, a);
141 else src = a;
142 auto ialpha = ~a;
143 for (uint32_t x = 0; x < span->len; ++x, ++dst) {
144 *dst = src + MULTIPLY(*dst, ialpha);
145 }
146 }
147 }
148 return true;
149 }
150
151
neonRasterTranslucentRect(SwSurface * surface,const SwBBox & region,uint8_t r,uint8_t g,uint8_t b,uint8_t a)152 static bool neonRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
153 {
154 auto h = static_cast<uint32_t>(region.max.y - region.min.y);
155 auto w = static_cast<uint32_t>(region.max.x - region.min.x);
156
157 //32bits channels
158 if (surface->channelSize == sizeof(uint32_t)) {
159 auto color = surface->join(r, g, b, a);
160 auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
161 auto ialpha = 255 - a;
162
163 auto vColor = vdup_n_u32(color);
164 auto vIalpha = vdup_n_u8((uint8_t) ialpha);
165
166 uint8x8_t* vDst = nullptr;
167 uint32_t align;
168
169 for (uint32_t y = 0; y < h; ++y) {
170 auto dst = &buffer[y * surface->stride];
171
172 if ((((uintptr_t) dst) & 0x7) != 0) {
173 //fill not aligned byte
174 *dst = color + ALPHA_BLEND(*dst, ialpha);
175 vDst = (uint8x8_t*) (dst + 1);
176 align = 1;
177 } else {
178 vDst = (uint8x8_t*) dst;
179 align = 0;
180 }
181
182 for (uint32_t x = 0; x < (w - align) / 2; ++x)
183 vDst[x] = vadd_u8((uint8x8_t)vColor, ALPHA_BLEND(vDst[x], vIalpha));
184
185 auto leftovers = (w - align) % 2;
186 if (leftovers > 0) dst[w - 1] = color + ALPHA_BLEND(dst[w - 1], ialpha);
187 }
188 //8bit grayscale
189 } else if (surface->channelSize == sizeof(uint8_t)) {
190 TVGLOG("SW_ENGINE", "Require Neon Optimization, Channel Size = %d", surface->channelSize);
191 auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x;
192 auto ialpha = ~a;
193 for (uint32_t y = 0; y < h; ++y) {
194 auto dst = &buffer[y * surface->stride];
195 for (uint32_t x = 0; x < w; ++x, ++dst) {
196 *dst = a + MULTIPLY(*dst, ialpha);
197 }
198 }
199 }
200 return true;
201 }
202
203 #endif
204
205 #endif /* LV_USE_THORVG_INTERNAL */
206
207