1 /*
2  * Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved.
3 
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10 
11  * The above copyright notice and this permission notice shall be included in all
12  * copies or substantial portions of the Software.
13 
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20  * SOFTWARE.
21  */
22 
23 #include "../../lv_conf_internal.h"
24 #if LV_USE_THORVG_INTERNAL
25 
26 #ifdef THORVG_NEON_VECTOR_SUPPORT
27 
28 #include <arm_neon.h>
29 
30 //TODO : need to support windows ARM
31 
32 #if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
33 #define TVG_AARCH64 1
34 #else
35 #define TVG_AARCH64 0
36 #endif
37 
38 
//Per-channel blend helper: scales each 8-bit channel of c by the 8-bit
//alpha a, i.e. result = (c * a) >> 8, computed in widened 16-bit lanes
//so the multiply cannot overflow.
static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a)
{
    return vshrn_n_u16(vmull_u8(c, a), 8);
}
44 
45 
//Fills len bytes of the 8-bit grayscale buffer (starting at dst + offset)
//with val, using the widest NEON store available, then a scalar tail loop.
static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int32_t len)
{
    dst += offset;

    const uint8x16_t vVal = vdupq_n_u8(val);
    int32_t idx = 0;

#if TVG_AARCH64
    //AArch64: store 64 bytes per iteration via a quad of 16-byte vectors
    uint8x16x4_t vQuad = {vVal, vVal, vVal, vVal};
    while (idx + 16 * 4 <= len) {
        vst1q_u8_x4(dst + idx, vQuad);
        idx += 16 * 4;
    }
#else
    //AArch32: store 16 bytes per iteration
    while (idx + 16 <= len) {
        vst1q_u8(dst + idx, vVal);
        idx += 16;
    }
#endif
    //scalar tail for the remaining < vector-width bytes
    while (idx < len) {
        dst[idx] = val;
        ++idx;
    }
}
66 
67 
//Fills len 32-bit pixels (starting at dst + offset) with val: a vectorized
//bulk loop followed by a scalar loop for the leftover pixels.
static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
{
    dst += offset;

    uint32x4_t vectorVal = vdupq_n_u32(val);

#if TVG_AARCH64
    uint32_t iterations = len / 16;
    uint32_t neonFilled = iterations * 16;
    uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
    for (uint32_t i = 0; i < iterations; ++i) {
        //vst1q_u32_x4 is a plain consecutive store of 4 vectors (64 bytes),
        //matching vst1q_u8_x4 in neonRasterGrayscale8. The previously used
        //vst4q_u32 performs an *interleaving* store, which only produced the
        //correct result here because every lane holds the same value — and
        //costs extra lane-permute work for nothing.
        vst1q_u32_x4(dst, valQuad);
        dst += 16;
    }
#else
    uint32_t iterations = len / 4;
    uint32_t neonFilled = iterations * 4;
    for (uint32_t i = 0; i < iterations; ++i) {
        vst1q_u32(dst, vectorVal);
        dst += 4;
    }
#endif
    //scalar tail for the pixels the vector loop didn't cover
    int32_t leftovers = len - neonFilled;
    while (leftovers--) *dst++ = val;
}
93 
94 
//Blends a translucent solid color (r,g,b,a) over every span of an RLE shape.
//32-bit surfaces take a NEON path (two pixels per uint8x8_t); 8-bit surfaces
//fall back to a scalar loop. Always returns true.
static bool neonRasterTranslucentRle(SwSurface* surface, const SwRle* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
    auto span = rle->spans;

    //32bit channels
    if (surface->channelSize == sizeof(uint32_t)) {
        auto color = surface->join(r, g, b, a);  //pack r/g/b/a into the surface's 32-bit pixel layout
        uint32_t src;
        uint8x8_t *vDst = nullptr;
        uint16_t align;

        for (uint32_t i = 0; i < rle->size; ++i) {
            //pre-scale the source color by the span's anti-aliasing coverage
            //(scalar ALPHA_BLEND(uint32_t, ...) overload declared elsewhere)
            if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage);
            else src = color;

            auto dst = &surface->buf32[span->y * surface->stride + span->x];
            auto ialpha = IA(src);  //inverse alpha (255 - As) of the coverage-scaled source

            //a uint8x8_t spans two 32-bit pixels; if dst is not 8-byte aligned,
            //blend the first pixel scalar-wise so the vector loop runs aligned
            if ((((uintptr_t) dst) & 0x7) != 0) {
                //fill not aligned byte
                *dst = src + ALPHA_BLEND(*dst, ialpha);  //premultiplied "over": d = s + d * (255 - As)
                vDst = (uint8x8_t*)(dst + 1);
                align = 1;
            } else {
                vDst = (uint8x8_t*) dst;
                align = 0;
            }

            uint8x8_t vSrc = (uint8x8_t) vdup_n_u32(src);  //src replicated into both pixel lanes
            uint8x8_t vIalpha = vdup_n_u8((uint8_t) ialpha);

            //vectorized blend: two pixels (8 bytes) per iteration
            for (uint32_t x = 0; x < (span->len - align) / 2; ++x)
                vDst[x] = vadd_u8(vSrc, ALPHA_BLEND(vDst[x], vIalpha));

            //odd trailing pixel (indexing from the original dst, so span->len - 1
            //is the last pixel of the span regardless of the alignment fix-up)
            auto leftovers = (span->len - align) % 2;
            if (leftovers > 0) dst[span->len - 1] = src + ALPHA_BLEND(dst[span->len - 1], ialpha);

            ++span;
        }
    //8bit grayscale
    } else if (surface->channelSize == sizeof(uint8_t)) {
        //scalar fallback — no NEON optimization implemented for 8-bit yet
        TVGLOG("SW_ENGINE", "Require Neon Optimization, Channel Size = %d", surface->channelSize);
        uint8_t src;
        for (uint32_t i = 0; i < rle->size; ++i, ++span) {
            auto dst = &surface->buf8[span->y * surface->stride + span->x];
            if (span->coverage < 255) src = MULTIPLY(span->coverage, a);
            else src = a;
            auto ialpha = ~a;  //low byte equals 255 - a; NOTE(review): promoted to int — assumes MULTIPLY uses only the low 8 bits, confirm against its definition
            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
                *dst = src + MULTIPLY(*dst, ialpha);
            }
        }
    }
    return true;
}
150 
151 
//Blends a translucent solid color (r,g,b,a) over an axis-aligned rectangle.
//32-bit surfaces take a NEON path (two pixels per uint8x8_t); 8-bit surfaces
//fall back to a scalar loop. Always returns true.
static bool neonRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
    auto h = static_cast<uint32_t>(region.max.y - region.min.y);
    auto w = static_cast<uint32_t>(region.max.x - region.min.x);

    //32bits channels
    if (surface->channelSize == sizeof(uint32_t)) {
        auto color = surface->join(r, g, b, a);  //pack r/g/b/a into the surface's 32-bit pixel layout
        auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
        auto ialpha = 255 - a;  //inverse source alpha

        auto vColor = vdup_n_u32(color);  //color replicated into both pixel lanes
        auto vIalpha = vdup_n_u8((uint8_t) ialpha);

        uint8x8_t* vDst = nullptr;
        uint32_t align;

        for (uint32_t y = 0; y < h; ++y) {
            auto dst = &buffer[y * surface->stride];

            //a uint8x8_t spans two 32-bit pixels; if the row start is not
            //8-byte aligned, blend its first pixel scalar-wise first
            if ((((uintptr_t) dst) & 0x7) != 0) {
                //fill not aligned byte
                *dst = color + ALPHA_BLEND(*dst, ialpha);  //premultiplied "over": d = s + d * (255 - As)
                vDst = (uint8x8_t*) (dst + 1);
                align = 1;
            } else {
                vDst = (uint8x8_t*) dst;
                align = 0;
            }

            //vectorized blend: two pixels (8 bytes) per iteration
            for (uint32_t x = 0; x <  (w - align) / 2; ++x)
                vDst[x] = vadd_u8((uint8x8_t)vColor, ALPHA_BLEND(vDst[x], vIalpha));

            //odd trailing pixel (w - 1 indexes from the original dst, so it is
            //the row's last pixel regardless of the alignment fix-up)
            auto leftovers = (w - align) % 2;
            if (leftovers > 0) dst[w - 1] = color + ALPHA_BLEND(dst[w - 1], ialpha);
        }
    //8bit grayscale
    } else if (surface->channelSize == sizeof(uint8_t)) {
        //scalar fallback — no NEON optimization implemented for 8-bit yet
        TVGLOG("SW_ENGINE", "Require Neon Optimization, Channel Size = %d", surface->channelSize);
        auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x;
        auto ialpha = ~a;  //low byte equals 255 - a; NOTE(review): promoted to int — assumes MULTIPLY uses only the low 8 bits, confirm against its definition
        for (uint32_t y = 0; y < h; ++y) {
            auto dst = &buffer[y * surface->stride];
            for (uint32_t x = 0; x < w; ++x, ++dst) {
                *dst = a + MULTIPLY(*dst, ialpha);
            }
        }
    }
    return true;
}
202 
203 #endif
204 
205 #endif /* LV_USE_THORVG_INTERNAL */
206 
207