/**
 * @file lv_blend_neon.S
 *
 * NEON blend routines. Everything after the include is assembled only when
 * LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_NEON.
 */

/* Define __ASSEMBLY__ before the header is included
 * (presumably so the header can hide its C-only parts - confirm in the header). */
#ifndef __ASSEMBLY__
#define __ASSEMBLY__
#endif

#include "lv_blend_neon.h"

#if LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_NEON

.text
.fpu neon
.arch armv7a
.syntax unified
.p2align 2              @ align to 4 bytes

@ NEON register plan
@ d0 ~ d3 : src B,G,R,A
@ d4 ~ d7 : dst B,G,R,A
@ q8 : src RGB565 raw
@ q9 : dst RGB565 raw
@ q10 ~ q12: pre-multiplied src
@ d26~29 : temp
@ d30 : mask
@ d31 : opa

@ Core registers.
@ r0 is the base register the init macro reads its parameters from; init
@ overwrites r0 and r1 (as FG_MASK and BG_MASK) once those reads are done.
FG_MASK     .req r0
BG_MASK     .req r1
DST_ADDR    .req r2
DST_W       .req r3
DST_H       .req r4
DST_STRIDE  .req r5
SRC_ADDR    .req r6
SRC_STRIDE  .req r7
MASK_ADDR   .req r8
MASK_STRIDE .req r9
W           .req r10
H           .req r11

@ NEON registers. The indented names are the D halves of the Q register
@ named just above them (q0 = d0:d1, q1 = d2:d3, ...).
S_8888_L    .qn  q0
S_8888_H    .qn  q1
D_8888_L    .qn  q2
D_8888_H    .qn  q3
                    S_B       .dn  d0
                    S_G       .dn  d1
                    S_R       .dn  d2
                    S_A       .dn  d3
                    D_B       .dn  d4
                    D_G       .dn  d5
                    D_R       .dn  d6
                    D_A       .dn  d7
S_565       .qn  q8
D_565       .qn  q9
                    S_565_L   .dn  d16
                    S_565_H   .dn  d17
                    D_565_L   .dn  d18
                    D_565_H   .dn  d19
PREMULT_B   .qn  q10
PREMULT_G   .qn  q11
PREMULT_R   .qn  q12
TMP_Q0      .qn  q13
                    TMP_D0    .dn  d26
                    TMP_D1    .dn  d27
TMP_Q1      .qn  q14
                    TMP_D2    .dn  d28
                    TMP_D3    .dn  d29
                    M_A       .dn  d30
                    OPA       .dn  d31

@ convert reg, bpp, intlv
@ Switches a group of 8 pixels between the planar form (one D register per
@ channel: reg_B, reg_G, reg_R, reg_A) and the memory pixel layout.
@   reg   : register-name prefix, S or D
@   bpp   : pixel format selector (>= 31, 24 or 16; other values emit nothing)
@   intlv : 1 = planar -> memory layout, 0 = memory layout -> planar
@ The 24 bpp case only exists for intlv = 1 and is only valid when all lanes
@ of each channel hold the same value (see the note on that branch).
@ The 16 bpp cases clobber TMP_Q0 and TMP_Q1.
.macro convert reg, bpp, intlv
.if \bpp >= 31
    .if \intlv
        vzip.8          \reg\()_B, \reg\()_R   @ BRBRBRBR GGGGGGGG BRBRBRBR AAAAAAAA
        vzip.8          \reg\()_G, \reg\()_A   @ BRBRBRBR GAGAGAGA BRBRBRBR GAGAGAGA
        vzip.8          \reg\()_R, \reg\()_A   @ BRBRBRBR GAGAGAGA BGRABGRA BGRABGRA
        vzip.8          \reg\()_B, \reg\()_G   @ BGRABGRA BGRABGRA BGRABGRA BGRABGRA
    .else
        vuzp.8          \reg\()_B, \reg\()_G   @ BRBRBRBR GAGAGAGA BGRABGRA BGRABGRA
        vuzp.8          \reg\()_R, \reg\()_A   @ BRBRBRBR GAGAGAGA BRBRBRBR GAGAGAGA
        vuzp.8          \reg\()_G, \reg\()_A   @ BRBRBRBR GGGGGGGG BRBRBRBR AAAAAAAA
        vuzp.8          \reg\()_B, \reg\()_R   @ BBBBBBBB GGGGGGGG RRRRRRRR AAAAAAAA
    .endif
.elseif \bpp == 24
    .if \intlv   @ for init only (same B,G,R for all channel)
        vzip.8          \reg\()_B, \reg\()_G                @ BGBGBGBG BGBGBGBG RRRRRRRR
        vzip.16         \reg\()_B, \reg\()_R                @ BGRRBGRR BGBGBGBG BGRRBGRR
        vsli.64         \reg\()_8888_L, \reg\()_8888_L, #24 @ BGRBGRRB BGBBGBGB
        vsli.64         \reg\()_B, \reg\()_G, #48           @ BGRBGRBG
        vsri.64         \reg\()_R, \reg\()_B, #8            @                   GRBGRBGR
        vsri.64         \reg\()_G, \reg\()_R, #8            @          RBGRBGRB
    .endif
.elseif \bpp == 16
    .if \intlv
        @ pack the three 8-bit planes into RGB565 halfwords in reg_565
        vshll.u8        \reg\()_565, \reg\()_R, #8 @ RRRrrRRR 00000000
        vshll.u8        TMP_Q0, \reg\()_G, #8      @ GGGgggGG 00000000
        vshll.u8        TMP_Q1, \reg\()_B, #8      @ BBBbbBBB 00000000
        vsri.16         \reg\()_565, TMP_Q0, #5    @ RRRrrGGG gggGG000
        vsri.16         \reg\()_565, TMP_Q1, #11   @ RRRrrGGG gggBBBbb
    .else
        @ unpack reg_565 into 8-bit planes, replicating the top bits of each
        @ channel into the vacated low bits
        vshr.u8         TMP_Q0, \reg\()_565, #3    @ 000RRRrr 000gggBB
        vshrn.i16       \reg\()_G, \reg\()_565, #5 @ rrGGGggg
        vshrn.i16       \reg\()_R, TMP_Q0, #5      @ RRRrr000
        vshl.i8         \reg\()_G, \reg\()_G, #2   @ GGGggg00
        vshl.i16        TMP_Q1, \reg\()_565, #3    @ rrGGGggg BBBbb000
        vsri.8          \reg\()_R, \reg\()_R, #5   @ RRRrrRRR
        vmovn.i16       \reg\()_B, TMP_Q1          @ BBBbb000
        vsri.8          \reg\()_G, \reg\()_G, #6   @ GGGgggGG
        vsri.8          \reg\()_B, \reg\()_B, #5   @ BBBbbBBB
    .endif
.endif
.endm

@ ldst op, bpp, len, mem, reg, cvt, wb
@ Loads or stores len pixels between memory and a register group.
@   op  : ld or st (pasted into the mnemonic: vld1/vst1, vld3/vst3, ...)
@   bpp : 32 or 31 (4 bytes per pixel), 24, 16, 8 (one byte per pixel, uses
@         reg_A) or 0 (single colour value, load only)
@   len : number of pixels, 1..8 (0 bpp accepts only 8)
@   mem : pointer-name prefix: SRC, DST or MASK (uses mem_ADDR)
@   reg : register-name prefix: S, D or M
@   cvt : non-zero = keep the planar register form (de-interleaving loads or
@         an explicit convert); zero = move the raw bytes as they are
@   wb  : "!" to leave mem_ADDR advanced past the data, blank to leave it
@         unchanged. Multi-instruction transfers always post-increment and,
@         when wb is blank, rewind the pointer with a final sub.
@ After a converting load with 8 < bpp < 32 the alpha plane reg_A is set to
@ 0xFF. Register conversion may clobber TMP_Q0 and TMP_Q1 (16 bpp).
.macro ldst op, bpp, len, mem, reg, cvt, wb
.if \bpp >= 31
    .if \len == 8
        .if \cvt
            v\op\()4.8        {\reg\()_B, \reg\()_G, \reg\()_R, \reg\()_A}, [\mem\()_ADDR]\wb
        .else
            v\op\()1.32       {\reg\()_8888_L, \reg\()_8888_H}, [\mem\()_ADDR]\wb
        .endif
    .else
        @ partial group: data moves in interleaved form, so convert around it
        .ifc \op,st
            .if \cvt
                convert       \reg, \bpp, 1
            .endif
        .endif
        .if \len == 7
            v\op\()1.32       {\reg\()_8888_L}, [\mem\()_ADDR]!
            v\op\()1.32       {\reg\()_R}, [\mem\()_ADDR]!
            v\op\()1.32       {\reg\()_A[0]}, [\mem\()_ADDR]!
        .elseif \len == 6
            v\op\()1.32       {\reg\()_8888_L}, [\mem\()_ADDR]!
            v\op\()1.32       {\reg\()_R}, [\mem\()_ADDR]!
        .elseif \len == 5
            v\op\()1.32       {\reg\()_8888_L}, [\mem\()_ADDR]!
            v\op\()1.32       {\reg\()_R[0]}, [\mem\()_ADDR]!
        .elseif \len == 4
            v\op\()1.32       {\reg\()_8888_L}, [\mem\()_ADDR]\wb
        .elseif \len == 3
            v\op\()1.32       {\reg\()_B}, [\mem\()_ADDR]!
            v\op\()1.32       {\reg\()_G[0]}, [\mem\()_ADDR]!
        .elseif \len == 2
            v\op\()1.32       {\reg\()_B}, [\mem\()_ADDR]\wb
        .elseif \len == 1
            v\op\()1.32       {\reg\()_B[0]}, [\mem\()_ADDR]\wb
        .else
            .error "[32bpp]len should be 1~8"
        .endif
        .ifc \op,ld
            .if \cvt
                convert       \reg, \bpp, 0
            .endif
        .endif
        .ifb \wb
            @ lengths 4, 2 and 1 used a single access that honoured wb itself
            .if (\len != 4) && (\len != 2) && (\len != 1)
                sub           \mem\()_ADDR, #4*\len
            .endif
        .endif
    .endif
.elseif \bpp == 24
    .if \len == 8
        .if \cvt
            v\op\()3.8        {\reg\()_B, \reg\()_G, \reg\()_R}, [\mem\()_ADDR]\wb
        .else
            v\op\()1.8        {\reg\()_B, \reg\()_G, \reg\()_R}, [\mem\()_ADDR]\wb
        .endif
    .elseif (\len < 8) && (\len > 0)
        .if \cvt
            @ one 3-byte pixel per access, lane by lane
            v\op\()3.8        {\reg\()_B[0], \reg\()_G[0], \reg\()_R[0]}, [\mem\()_ADDR]!
            .if \len > 1
                v\op\()3.8    {\reg\()_B[1], \reg\()_G[1], \reg\()_R[1]}, [\mem\()_ADDR]!
            .endif
            .if \len > 2
                v\op\()3.8    {\reg\()_B[2], \reg\()_G[2], \reg\()_R[2]}, [\mem\()_ADDR]!
            .endif
            .if \len > 3
                v\op\()3.8    {\reg\()_B[3], \reg\()_G[3], \reg\()_R[3]}, [\mem\()_ADDR]!
            .endif
            .if \len > 4
                v\op\()3.8    {\reg\()_B[4], \reg\()_G[4], \reg\()_R[4]}, [\mem\()_ADDR]!
            .endif
            .if \len > 5
                v\op\()3.8    {\reg\()_B[5], \reg\()_G[5], \reg\()_R[5]}, [\mem\()_ADDR]!
            .endif
            .if \len > 6
                v\op\()3.8    {\reg\()_B[6], \reg\()_G[6], \reg\()_R[6]}, [\mem\()_ADDR]!
            .endif
            .ifb \wb
                sub           \mem\()_ADDR, #3*\len
            .endif
        .else
            @ raw copy of 3*len bytes, split into 16/8/4/2/1-byte pieces
            .if \len == 7
                v\op\()1.32   {\reg\()_8888_L}, [\mem\()_ADDR]!
                v\op\()1.32   {\reg\()_R[0]}, [\mem\()_ADDR]!
                v\op\()1.8    {\reg\()_R[4]}, [\mem\()_ADDR]!
            .elseif \len == 6
                v\op\()1.32   {\reg\()_8888_L}, [\mem\()_ADDR]!
                v\op\()1.16   {\reg\()_R[0]}, [\mem\()_ADDR]!
            .elseif \len == 5
                v\op\()1.32   {\reg\()_B}, [\mem\()_ADDR]!
                v\op\()1.32   {\reg\()_G[0]}, [\mem\()_ADDR]!
                v\op\()1.16   {\reg\()_G[2]}, [\mem\()_ADDR]!
                v\op\()1.8    {\reg\()_G[6]}, [\mem\()_ADDR]!
            .elseif \len == 4
                v\op\()1.32   {\reg\()_B}, [\mem\()_ADDR]!
                v\op\()1.32   {\reg\()_G[0]}, [\mem\()_ADDR]!
            .elseif \len == 3
                v\op\()1.32   {\reg\()_B}, [\mem\()_ADDR]!
                v\op\()1.8    {\reg\()_G[0]}, [\mem\()_ADDR]!
            .elseif \len == 2
                v\op\()1.32   {\reg\()_B[0]}, [\mem\()_ADDR]!
                v\op\()1.16   {\reg\()_B[2]}, [\mem\()_ADDR]!
            .elseif \len == 1
                v\op\()1.16   {\reg\()_B[0]}, [\mem\()_ADDR]!
                v\op\()1.8    {\reg\()_B[2]}, [\mem\()_ADDR]!
            .endif
            .ifb \wb
                sub           \mem\()_ADDR, #3*\len
            .endif
        .endif
    .else
        .error "[24bpp]len should be 1~8"
    .endif
.elseif \bpp == 16
    @ RGB565 always moves through reg_565; planar data is packed before a
    @ store and unpacked after a load when cvt is set
    .ifc \op,st
        .if \cvt
            convert           \reg, \bpp, 1
        .endif
    .endif
    .if \len == 8
        v\op\()1.16           {\reg\()_565}, [\mem\()_ADDR]\wb
    .elseif \len == 7
        v\op\()1.16           {\reg\()_565_L}, [\mem\()_ADDR]!
        v\op\()1.32           {\reg\()_565_H[0]}, [\mem\()_ADDR]!
        v\op\()1.16           {\reg\()_565_H[2]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #14
        .endif
    .elseif \len == 6
        v\op\()1.16           {\reg\()_565_L}, [\mem\()_ADDR]!
        v\op\()1.32           {\reg\()_565_H[0]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #12
        .endif
    .elseif \len == 5
        v\op\()1.16           {\reg\()_565_L}, [\mem\()_ADDR]!
        v\op\()1.16           {\reg\()_565_H[0]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #10
        .endif
    .elseif \len == 4
        v\op\()1.16           {\reg\()_565_L}, [\mem\()_ADDR]\wb
    .elseif \len == 3
        v\op\()1.32           {\reg\()_565_L[0]}, [\mem\()_ADDR]!
        v\op\()1.16           {\reg\()_565_L[2]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #6
        .endif
    .elseif \len == 2
        v\op\()1.32           {\reg\()_565_L[0]}, [\mem\()_ADDR]\wb
    .elseif \len == 1
        v\op\()1.16           {\reg\()_565_L[0]}, [\mem\()_ADDR]\wb
    .else
        .error "[16bpp]len should be 1~8"
    .endif
    .ifc \op,ld
        .if \cvt
            convert           \reg, \bpp, 0
        .endif
    .endif
.elseif \bpp == 8
    @ one byte per pixel, held in reg_A
    .if \len == 8
        v\op\()1.8            {\reg\()_A}, [\mem\()_ADDR]\wb
    .elseif \len == 7
        v\op\()1.32           {\reg\()_A[0]}, [\mem\()_ADDR]!
        v\op\()1.16           {\reg\()_A[2]}, [\mem\()_ADDR]!
        v\op\()1.8            {\reg\()_A[6]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #7
        .endif
    .elseif \len == 6
        v\op\()1.32           {\reg\()_A[0]}, [\mem\()_ADDR]!
        v\op\()1.16           {\reg\()_A[2]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #6
        .endif
    .elseif \len == 5
        v\op\()1.32           {\reg\()_A[0]}, [\mem\()_ADDR]!
        v\op\()1.8            {\reg\()_A[4]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #5
        .endif
    .elseif \len == 4
        v\op\()1.32           {\reg\()_A[0]}, [\mem\()_ADDR]\wb
    .elseif \len == 3
        v\op\()1.16           {\reg\()_A[0]}, [\mem\()_ADDR]!
        v\op\()1.8            {\reg\()_A[2]}, [\mem\()_ADDR]!
        .ifb \wb
            sub               \mem\()_ADDR, #3
        .endif
    .elseif \len == 2
        v\op\()1.16           {\reg\()_A[0]}, [\mem\()_ADDR]\wb
    .elseif \len == 1
        v\op\()1.8            {\reg\()_A[0]}, [\mem\()_ADDR]\wb
    .else
        .error "[8bpp]len should be 1~8"
    .endif
.elseif \bpp == 0
    @ single colour: one B,G,R triple replicated to all lanes. Emitted only
    @ when wb is blank; with a writeback marker this branch emits nothing.
    .ifb \wb
        .if \len == 8
            v\op\()3.8        {\reg\()_B[], \reg\()_G[], \reg\()_R[]}, [\mem\()_ADDR]
        .else
            .error "[color]len should be 8"
        .endif
    .endif
.endif
.ifc \op,ld
    @ formats without their own alpha byte get a fully opaque alpha plane
    .if \cvt && (\bpp > 8) && (\bpp < 32)
        vmov.u8               \reg\()_A, #0xFF
    .endif
.endif
.endm

@ premult alpha
@ Widening multiply of the three source colour planes by the 8-bit plane
@ given as alpha: PREMULT_B/G/R = S_B/G/R * alpha (16-bit lanes).
.macro premult alpha
    vmull.u8        PREMULT_B, S_B, \alpha
    vmull.u8        PREMULT_G, S_G, \alpha
    vmull.u8        PREMULT_R, S_R, \alpha
.endm

@ init src_bpp, dst_bpp, mask, opa
@ Reads the blend parameters from the block addressed by r0 and prepares the
@ per-row bookkeeping. Offsets read from r0:
@   +0  opacity byte (only when opa is set)    +4  DST_ADDR
@   +8  DST_W                                  +12 DST_H
@   +16 DST_STRIDE                             +20 SRC_ADDR
@   +24 SRC_STRIDE (src_bpp > 0)               +28 MASK_ADDR   (mask set)
@   +32 MASK_STRIDE (mask set)
@ Each stride then has the bytes consumed per row (DST_W pixels) subtracted,
@ so adding it after a row moves the pointer to the start of the next row.
@ r0 and r1 are overwritten at the end (FG_MASK and BG_MASK = all ones).
.macro init src_bpp, dst_bpp, mask, opa
    ldr             DST_ADDR, [r0, #4]
    ldr             DST_W, [r0, #8]
    ldr             DST_H, [r0, #12]
    ldr             DST_STRIDE, [r0, #16]
    ldr             SRC_ADDR, [r0, #20]
.if \src_bpp > 0
    ldr             SRC_STRIDE, [r0, #24]
.endif
.if \mask
    ldr             MASK_ADDR, [r0, #28]
    ldr             MASK_STRIDE, [r0, #32]
    sub             MASK_STRIDE, MASK_STRIDE, DST_W     @ one mask byte per pixel
.endif
.if \opa
    vld1.8          {OPA[]}, [r0]                       @ replicate opacity to all lanes
.else
    vmov.u8         OPA, #0xFF
.endif

    vmvn            D_A, OPA                            @ D_A = bitwise NOT of OPA
.if \dst_bpp == 16
    sub             DST_STRIDE, DST_STRIDE, DST_W, lsl #1
.elseif \dst_bpp == 24
    sub             DST_STRIDE, DST_STRIDE, DST_W       @ minus 3 * DST_W in two steps
    sub             DST_STRIDE, DST_STRIDE, DST_W, lsl #1
.elseif \dst_bpp >= 31
    sub             DST_STRIDE, DST_STRIDE, DST_W, lsl #2
.endif
.if \src_bpp == 0
    @ constant colour source: load it once here instead of once per group
    .if \mask || \opa
        ldst        ld, \src_bpp, 8, SRC, S, 1
        vmov.u8     S_A, #0xFF
        premult     OPA
    .else
        @ plain fill: keep the colour already laid out in the dst format
        ldst        ld, \src_bpp, 8, SRC, D, 1
        vmov.u8     D_A, #0xFF
        convert     D, \dst_bpp, 1
    .endif
.else
.if \src_bpp == 16
    sub             SRC_STRIDE, SRC_STRIDE, DST_W, lsl #1
.elseif \src_bpp == 24
    sub             SRC_STRIDE, SRC_STRIDE, DST_W
    sub             SRC_STRIDE, SRC_STRIDE, DST_W, lsl #1
.elseif \src_bpp >= 31
    sub             SRC_STRIDE, SRC_STRIDE, DST_W, lsl #2
.endif
.endif
    mvn             FG_MASK, #0                         @ r0 is no longer needed as a pointer
    mvn             BG_MASK, #0
.endm

@ calc_alpha len
@ Alpha computation used when the destination itself carries alpha.
@ input: M_A = 255 - fg.alpha, S_A = fg.alpha, D_A = bg.alpha
@ output:
@   FG_MASK : one nibble per lane (lanes 0..3 are used), clear where the
@             first test (fg.alpha >= LV_OPA_MAX or bg.alpha <= LV_OPA_MIN)
@             holds; zero means every such lane takes fg as is
@   BG_MASK : same layout, clear where fg.alpha <= LV_OPA_MIN; zero means
@             every such lane keeps bg
@   D_A     : resulting alpha, 255 - MIX2(255 - fg.alpha, 255 - bg.alpha)
@   PREMULT_B/G/R and M_A : source weights for the blend step (only when
@             neither early exit is taken)
@ Clobbers TMP_Q0, TMP_Q1 and, for len > 4, S_565 and D_565.
@ The instruction order below interleaves two dependency chains; keep it.
.macro calc_alpha len
    vmov.u8             TMP_D0, #0xFD
    vmvn                D_A, D_A
    vcge.u8             TMP_D1, S_A, TMP_D0      @ if (fg.alpha >= LV_OPA_MAX
    vcge.u8             TMP_D2, D_A, TMP_D0      @ || bg.alpha <= LV_OPA_MIN)
    vorr                TMP_D2, TMP_D1
    vcge.u8             TMP_D3, M_A, TMP_D0      @ elseif (fg.alpha <= LV_OPA_MIN)
    vmvn                TMP_Q1, TMP_Q1
    vshrn.i16           TMP_D0, TMP_Q1, #4       @ squeeze each byte mask to a nibble
    vmov                FG_MASK, BG_MASK, TMP_D0
    cbz                 FG_MASK, 99f             @ return fg;
    vmull.u8            TMP_Q0, M_A, D_A         @ D_A = 255 - LV_OPA_MIX2(255 - fg.alpha, 255 - bg.alpha)
    vqrshrn.u16         M_A, TMP_Q0, #8
    vbif                M_A, D_A, TMP_D3         @ insert original D_A when fg.alpha <= LV_OPA_MIN
    vmvn                D_A, M_A
    cbz                 BG_MASK, 99f             @ return bg;
    @ source weight = fg.alpha * 255 / D_A, computed in float through a
    @ reciprocal estimate; lanes 4..7 go through S_565/D_565 when len > 4
    vmov.u8             TMP_D2, #0xFF
    vmovl.u8            TMP_Q0, D_A
    .if \len > 4
        vmovl.u16       S_565, TMP_D1
    .endif
    vmovl.u16           TMP_Q0, TMP_D0
    vmull.u8            TMP_Q1, S_A, TMP_D2
    vcvt.f32.u32        TMP_Q0, TMP_Q0
    .if \len > 4
        vmovl.u16       D_565, TMP_D3
        vcvt.f32.u32    S_565, S_565
    .endif
    vmovl.u16           TMP_Q1, TMP_D2
    vrecpe.f32          TMP_Q0, TMP_Q0
    vcvt.f32.u32        TMP_Q1, TMP_Q1
    .if \len > 4
        vcvt.f32.u32    D_565, D_565
        vrecpe.f32      S_565, S_565
    .endif
    vmul.f32            TMP_Q0, TMP_Q0, TMP_Q1
    .if \len > 4
        vmul.f32        S_565, S_565, D_565
    .endif
    vcvt.u32.f32        TMP_Q0, TMP_Q0
    .if \len > 4
        vcvt.u32.f32    S_565, S_565
    .endif
    vmovn.u32           TMP_D0, TMP_Q0
    .if \len > 4
    vmovn.u32           TMP_D1, S_565
    .endif
    vmovn.u16           TMP_D0, TMP_Q0
    premult             TMP_D0
    vmvn                M_A, TMP_D0
99:
.endm

@ blend mode, dst_bpp
@ Mixes source into destination: D = round((PREMULT + D * M_A) / 256) for
@ the B, G and R planes. Only mode "normal" is implemented.
@ For dst_bpp == 32 the FG_MASK and BG_MASK nibble masks select, per lane,
@ between three results: the source as is, the untouched destination, or
@ the blended value. FG_MASK and BG_MASK are modified (inverted) on the way.
@ Clobbers TMP_Q0 and, for dst_bpp == 32, S_565 and D_565_L.
.macro blend mode, dst_bpp
.if \dst_bpp == 32
    @ expand the nibble masks to byte masks: TMP_D0 from FG_MASK, TMP_D1 from BG_MASK
    vmov            TMP_D0, FG_MASK, BG_MASK
    vmovl.s8        TMP_Q0, TMP_D0
    vsli.8          TMP_Q0, TMP_Q0, #4
    cbz             FG_MASK, 98f            @ every lane takes the source
.endif
.ifc \mode,normal
.if \dst_bpp == 32
    cbz             BG_MASK, 97f            @ every lane keeps the destination
    mvns            BG_MASK, BG_MASK
    beq             96f                     @ no lane keeps the destination: skip the backup
    vmov            S_565_L, D_B            @ back up the destination colour planes
    vmov            S_565_H, D_G
    vmov            D_565_L, D_R
.endif
96:
    vmlal.u8        PREMULT_B, D_B, M_A
    vmlal.u8        PREMULT_G, D_G, M_A
    vmlal.u8        PREMULT_R, D_R, M_A
    vqrshrn.u16     D_B, PREMULT_B, #8
    vqrshrn.u16     D_G, PREMULT_G, #8
    vqrshrn.u16     D_R, PREMULT_R, #8
.if \dst_bpp == 32
    beq             97f                     @ flags still come from the mvns above
    vbif            D_B, S_565_L, TMP_D1    @ restore the lanes that keep the destination
    vbif            D_G, S_565_H, TMP_D1
    vbif            D_R, D_565_L, TMP_D1
97:
    mvns            FG_MASK, FG_MASK
    beq             99f                     @ no lane takes the source unchanged
.endif
.else
    .error "blend mode is unsupported"
.endif
.if \dst_bpp == 32
98:
    vbif            D_B, S_B, TMP_D0        @ copy source into the lanes flagged in the fg mask
    vbif            D_G, S_G, TMP_D0
    vbif            D_R, S_R, TMP_D0
    vbif            D_A, S_A, TMP_D0
99:
.endif
.endm

@ process len, src_bpp, dst_bpp, mask, opa, mode
@ Handles one group of len pixels (1..8) and advances the SRC, DST and MASK
@ pointers past it. Three variants are selected at assembly time:
@   1. source without alpha, no mask, no opa : plain copy or format conversion
@   2. source without alpha, with mask and/or opa
@   3. source with alpha (src_bpp == 32), optionally with mask and/or opa
@ In variants 2 and 3 the effective alpha ends up in S_A and its complement
@ in M_A before the blend macro runs.
.macro process len, src_bpp, dst_bpp, mask, opa, mode
.if (\src_bpp < 32) && (\mask == 0) && (\opa == 0)
@ no blend
    .if \src_bpp == 0 || \src_bpp == \dst_bpp
        @ same layout (or colour prepared by init): move raw bytes
        ldst            ld, \src_bpp, \len, SRC, D, 0, !
        ldst            st, \dst_bpp, \len, DST, D, 0, !
    .else
        ldst            ld, \src_bpp, \len, SRC, D, 1, !
        ldst            st, \dst_bpp, \len, DST, D, 1, !
    .endif
.elseif \src_bpp < 32
@ no src_a
    .if \src_bpp > 0
        ldst            ld, \src_bpp, \len, SRC, S, 1, !
    .endif
    ldst                ld, \dst_bpp, \len, DST, D, 1       @ no writeback: stored to below
    .if \mask
        ldst            ld, 8, \len, MASK, S, 1, !          @ mask becomes the source alpha
        .if \opa
            vmull.u8    TMP_Q0, S_A, OPA
            vqrshrn.u16 S_A, TMP_Q0, #8
        .endif
        vmvn            M_A, S_A
        .if \dst_bpp < 32
            premult     S_A
        .else
            calc_alpha  \len
        .endif
    .else
        vmvn            M_A, OPA
        .if \dst_bpp < 32
            premult     OPA
        .else
            vmov        S_A, OPA
            calc_alpha  \len
        .endif
    .endif
    blend               \mode, \dst_bpp
    ldst                st, \dst_bpp, \len, DST, D, 1, !
.else
@ source carries its own alpha, optionally scaled by mask and by opa
    ldst                ld, \src_bpp, \len, SRC, S, 1, !
    ldst                ld, \dst_bpp, \len, DST, D, 1       @ no writeback: stored to below
    .if \mask == 0
        .if \opa
            vmull.u8    TMP_Q0, S_A, OPA
            vqrshrn.u16 S_A, TMP_Q0, #8
        .endif
    .else
        ldst            ld, 8, \len, MASK, M, 1, !
        vmull.u8        TMP_Q0, S_A, M_A
        vqrshrn.u16     S_A, TMP_Q0, #8
        .if \opa
            vmull.u8    TMP_Q0, S_A, OPA
            vqrshrn.u16 S_A, TMP_Q0, #8
        .endif
    .endif
    vmvn                M_A, S_A
    .if \dst_bpp < 32
        premult         S_A
    .else
        calc_alpha      \len
    .endif
    blend               \mode, \dst_bpp
    ldst                st, \dst_bpp, \len, DST, D, 1, !
.endif
.endm

@ tail src_bpp, dst_bpp, mask, opa, mode
@ Handles the 1..7 pixels left over at the end of a row by dispatching on
@ the low three bits of DST_W to the matching fixed-length process.
@ When all three bits are clear the chain still ends at "process 1", so
@ this macro must only be reached when DST_W & 7 is non-zero.
@ Uses local labels 0..6 and changes the condition flags.
.macro tail src_bpp, dst_bpp, mask, opa, mode
    tst         DST_W, #4
    beq         3f
    tst         DST_W, #2
    beq         5f
    tst         DST_W, #1
    beq         6f
    process     7, \src_bpp, \dst_bpp, \mask, \opa, \mode
    b           0f
6:
    process     6, \src_bpp, \dst_bpp, \mask, \opa, \mode
    b           0f
5:
    tst         DST_W, #1
    beq         4f
    process     5, \src_bpp, \dst_bpp, \mask, \opa, \mode
    b           0f
4:
    process     4, \src_bpp, \dst_bpp, \mask, \opa, \mode
    b           0f
3:
    tst         DST_W, #2
    beq         1f
    tst         DST_W, #1
    beq         2f
    process     3, \src_bpp, \dst_bpp, \mask, \opa, \mode
    b           0f
2:
    process     2, \src_bpp, \dst_bpp, \mask, \opa, \mode
    b           0f
1:
    process     1, \src_bpp, \dst_bpp, \mask, \opa, \mode
0:
.endm

@ next src_bpp, mask
@ Moves the row pointers on by their stride registers. The source pointer
@ moves only when src_bpp is non-zero, the mask pointer only when mask is set.
.macro next src_bpp, mask
    add         DST_ADDR, DST_ADDR, DST_STRIDE
.if \src_bpp
    add         SRC_ADDR, SRC_ADDR, SRC_STRIDE
.endif
.if \mask
    add         MASK_ADDR, MASK_ADDR, MASK_STRIDE
.endif
.endm

@ enter
@ Function prologue: saves r4-r11 and lr on the stack.
.macro enter
    push        {r4-r11, lr}
.endm

@ exit
@ Function epilogue: restores r4-r11 and returns by popping the saved lr
@ value into pc. Must pair with the push done by enter.
.macro exit
    pop         {r4-r11, pc}
.endm

@ preload mem, bpp
@ Issues a pld hint for the address DST_W pixels beyond mem_ADDR, using the
@ pixel size selected by bpp. Emits nothing for other bpp values (such as 0).
@ The 24 bpp form uses W as scratch.
.macro preload mem, bpp
.if \bpp >= 31
    pld         [\mem\()_ADDR, DST_W, lsl #2]
.elseif \bpp == 24
    add         W, DST_W, DST_W, lsl #1         @ W = 3 * DST_W
    pld         [\mem\()_ADDR, W]
.elseif \bpp == 16
    pld         [\mem\()_ADDR, DST_W, lsl #1]
.elseif \bpp == 8
    pld         [\mem\()_ADDR, DST_W]
.endif
.endm

@ blender src_bpp, dst_bpp, mask, opa, mode
@ Complete function body: prologue, parameter setup, row loop, epilogue.
@ Each row is handled in groups of 8 pixels followed by a 1..7 pixel tail.
@ Rows narrower than 8 pixels use a separate loop (label 7) that only runs
@ the tail.
@ An empty area (DST_H == 0 or DST_W == 0) returns without touching memory.
@ The early exits use the dedicated label 10: a "0f" reference here would
@ bind to the private label 0 inside the first tail expansion, and a zero
@ width would otherwise reach "process 1" through the tail dispatch.
@ Uses local labels 7, 8, 9 and 10. W counts pixels left in the row, H rows left.
.macro blender src_bpp, dst_bpp, mask, opa, mode
    enter
    init        \src_bpp, \dst_bpp, \mask, \opa
    movs        H, DST_H
    beq         10f                 @ no rows
    cmp         DST_W, #0
    beq         10f                 @ no columns
    preload     SRC, \src_bpp
.if \mask || \opa || (\src_bpp == 32)
    preload     DST, \dst_bpp       @ destination is read only when blending
.endif
    subs        W, DST_W, #8
    blt         7f                  @ narrower than one full group
9:
    process     8, \src_bpp, \dst_bpp, \mask, \opa, \mode
    subs        W, W, #8
    bge         9b
    tst         DST_W, #7
    beq         8f                  @ width is a multiple of 8: no tail
    tail        \src_bpp, \dst_bpp, \mask, \opa, \mode
8:
    next        \src_bpp, \mask
    preload     SRC, \src_bpp
.if \mask || \opa || (\src_bpp == 32)
    preload     DST, \dst_bpp
.endif
    sub         W, DST_W, #8        @ re-arm the column counter (preload may use W)
    subs        H, H, #1
    bgt         9b
    exit
7:
    tail        \src_bpp, \dst_bpp, \mask, \opa, \mode
    next        \src_bpp, \mask
    subs        H, H, #1
    bgt         7b
10:
    exit
.endm

@ export name, src_bpp, dst_bpp, mask, opa, mode
@ Emits one function called name whose body is the blender macro expanded
@ with the given parameters. The symbol is global with hidden visibility
@ and is marked as a Thumb function.
.macro export name, src_bpp, dst_bpp, mask, opa, mode
.thumb_func
.func \name
.global \name
.hidden \name
\name\():
    blender     \src_bpp, \dst_bpp, \mask, \opa, \mode
.endfunc
.endm

@ export_set src, dst, src_bpp, dst_bpp, mode
@ Emits the four variants of one source/destination pair: plain, with opa,
@ with mask, and with both mask and opa. The last two numeric arguments of
@ each export line are the mask and opa switches.
@ Functions for the "color" source carry no mode in their names; all other
@ sources include it.
.macro export_set src, dst, src_bpp, dst_bpp, mode
.ifc \src,color
    export _lv_\src\()_blend_to_\dst\()_neon, \src_bpp, \dst_bpp, 0, 0, \mode
    export _lv_\src\()_blend_to_\dst\()_with_opa_neon, \src_bpp, \dst_bpp, 0, 1, \mode
    export _lv_\src\()_blend_to_\dst\()_with_mask_neon, \src_bpp, \dst_bpp, 1, 0, \mode
    export _lv_\src\()_blend_to_\dst\()_mix_mask_opa_neon, \src_bpp, \dst_bpp, 1, 1, \mode
.else
    export _lv_\src\()_blend_\mode\()_to_\dst\()_neon, \src_bpp, \dst_bpp, 0, 0, \mode
    export _lv_\src\()_blend_\mode\()_to_\dst\()_with_opa_neon, \src_bpp, \dst_bpp, 0, 1, \mode
    export _lv_\src\()_blend_\mode\()_to_\dst\()_with_mask_neon, \src_bpp, \dst_bpp, 1, 0, \mode
    export _lv_\src\()_blend_\mode\()_to_\dst\()_mix_mask_opa_neon, \src_bpp, \dst_bpp, 1, 1, \mode
.endif
.endm

@ Function instances: every source format against every destination format.
@ bpp selector values: 0 = colour, 16 = RGB565, 24 = RGB888,
@ 31 = XRGB8888, 32 = ARGB8888.

@ destination RGB565
export_set color, rgb565, 0, 16, normal
export_set rgb565, rgb565, 16, 16, normal
export_set rgb888, rgb565, 24, 16, normal
export_set xrgb8888, rgb565, 31, 16, normal
export_set argb8888, rgb565, 32, 16, normal
@ destination RGB888
export_set color, rgb888, 0, 24, normal
export_set rgb565, rgb888, 16, 24, normal
export_set rgb888, rgb888, 24, 24, normal
export_set xrgb8888, rgb888, 31, 24, normal
export_set argb8888, rgb888, 32, 24, normal
@ destination XRGB8888
export_set color, xrgb8888, 0, 31, normal
export_set rgb565, xrgb8888, 16, 31, normal
export_set rgb888, xrgb8888, 24, 31, normal
export_set xrgb8888, xrgb8888, 31, 31, normal
export_set argb8888, xrgb8888, 32, 31, normal
@ destination ARGB8888
export_set color, argb8888, 0, 32, normal
export_set rgb565, argb8888, 16, 32, normal
export_set rgb888, argb8888, 24, 32, normal
export_set xrgb8888, argb8888, 31, 32, normal
export_set argb8888, argb8888, 32, 32, normal

#endif /*LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_NEON*/
