/**
 * @file lv_blend_helium.S
 *
 */

#ifndef __ASSEMBLY__
#define __ASSEMBLY__
#endif

#include "lv_blend_helium.h"

#if LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_HELIUM && defined(__ARM_FEATURE_MVE) && __ARM_FEATURE_MVE && LV_USE_NATIVE_HELIUM_ASM

.data
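@ Lookup table of initial reciprocal estimates for the alpha renormalization on
@ the ARGB8888-destination path: the blend macro below indexes it with
@ (v >> 4) - 8 (RCP already holds reciprocal - 8) and refines the estimate with
@ two Newton-Raphson steps.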
reciprocal:
.byte 0xFF, 0xE2, 0xCC, 0xB9, 0xAA, 0x9C, 0x91, 0x88

.text
.syntax unified
.p2align 2

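@ General-purpose register aliases. r0 holds the single pointer argument (the
@ blend parameter block); the init macro below loads its fields at the offsets
@ shown there (opa at 0, destination buffer/width/height/stride at 4/8/12/16,
@ source buffer/stride at 20/24, mask buffer/stride at 28/32).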
TMP         .req r0
DST_ADDR    .req r1
DST_W       .req r2
DST_H       .req r3
DST_STRIDE  .req r4
SRC_ADDR    .req r5
SRC_STRIDE  .req r6
MASK_ADDR   .req r7
MASK_STRIDE .req r8
H           .req r9
OPA         .req r10
RCP         .req r11

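@ Vector register aliases. The source and destination colour planes occupy
@ q0-q7; the remaining names (N, V, R, L, the *_565/*_L/*_T pairs and BITMASK)
@ intentionally overlay the same q registers and are only live on the RGB565
@ and reciprocal code paths, which is why some paths spill d0-d5/d14-d15 below.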
S_B         .req q0
S_G         .req q1
S_R         .req q2
S_A         .req q3
D_B         .req q4
D_G         .req q5
D_R         .req q6
D_A         .req q7
N           .req q0
V           .req q1
R           .req q2
L           .req q4
S_565       .req q0
D_565       .req q1
S_L         .req q2
D_L         .req q4
D_T         .req q5
BITMASK     .req q6

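@ ldst: load (op = ld) or store (op = st) one block of pixels, 16 wide for the
@ 8-bit plane formats and 8 wide for packed RGB565.
@   st        - 0 for loads, 1 for stores (selects the direction of the RGB565
@               pack/unpack when cvt is set, and whether alpha is transferred)
@   bpp       - 0 solid colour, 8 A8 mask, 16 RGB565, 24 RGB888,
@               31 XRGB8888, 32 ARGB8888
@   mem/reg   - address base (SRC/DST/MASK) and plane prefix (S/D) to use
@   areg      - prefix whose _A register holds the gather/scatter byte offsets
@   cvt       - convert between the packed format and 8-bit planes (or widen
@               the A8 mask to 16-bit lanes)
@   alt_index - use the pre-built index vectors from init instead of \reg\()_A
@   wb        - pass ! to advance the address past the block
@   aligned   - 32-bit blocks only: use the interleaving vld4x/vst4x forms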
.macro ldst st, op, bpp, mem, reg, areg, cvt, alt_index, wb, aligned
.if \bpp == 0
.if \cvt
    ldr             TMP, [\mem\()_ADDR]
    bfi             TMP, TMP, #2, #8
    bfi             TMP, TMP, #3, #16
    lsr             TMP, TMP, #8
    vdup.16         \reg\()_565, TMP
.else
    ldr             TMP, [\mem\()_ADDR]
    vdup.8          \reg\()_B, TMP
    lsr             TMP, #8
    vdup.8          \reg\()_G, TMP
    lsr             TMP, #8
    vdup.8          \reg\()_R, TMP
.endif
.elseif \bpp == 8
.if \cvt
    v\op\()rb.u16   \reg\()_A, [\mem\()_ADDR], #8
.else
    v\op\()rb.8     \reg\()_A, [\mem\()_ADDR], #16
.endif
.elseif \bpp == 16
.if \cvt
.if \st
    vsri.8          \reg\()_R, \reg\()_G, #5
    vshr.u8         \reg\()_G, \reg\()_G, #2
    vshr.u8         \reg\()_B, \reg\()_B, #3
    vsli.8          \reg\()_B, \reg\()_G, #5
.endif
.if \alt_index
    v\op\()rb.8     \reg\()_B, [\mem\()_ADDR, S_B]
    v\op\()rb.8     \reg\()_R, [\mem\()_ADDR, S_G]
.else
    v\op\()rb.8     \reg\()_B, [\mem\()_ADDR, \reg\()_A]
    add             \mem\()_ADDR, #1
    v\op\()rb.8     \reg\()_R, [\mem\()_ADDR, \reg\()_A]
.endif
.if \st == 0
    vshl.u8         \reg\()_G, \reg\()_R, #5
    vsri.u8         \reg\()_G, \reg\()_B, #3
    vshl.u8         \reg\()_B, \reg\()_B, #3
    vsri.u8         \reg\()_R, \reg\()_R, #5
    vsri.u8         \reg\()_G, \reg\()_G, #6
    vsri.u8         \reg\()_B, \reg\()_B, #5
.endif
.ifc \wb, !
.if \alt_index
    add             \mem\()_ADDR, #32
.else
    add             \mem\()_ADDR, #31
.endif
.elseif \alt_index == 0
    sub             \mem\()_ADDR, #1
.endif
.else @ cvt
.ifc \wb, !
    v\op\()rh.16    \reg\()_565, [\mem\()_ADDR], #16
.else
    v\op\()rh.16    \reg\()_565, [\mem\()_ADDR]
.endif
.endif
.elseif \bpp == 24
.if \alt_index == 1
    v\op\()rb.8     \reg\()_B, [\mem\()_ADDR, S_B]
    v\op\()rb.8     \reg\()_G, [\mem\()_ADDR, S_G]
    v\op\()rb.8     \reg\()_R, [\mem\()_ADDR, S_R]
.elseif \alt_index == 2
    v\op\()rb.8     \reg\()_B, [\mem\()_ADDR, S_R]
    v\op\()rb.8     \reg\()_G, [\mem\()_ADDR, S_A]
    v\op\()rb.8     \reg\()_R, [\mem\()_ADDR, D_A]
.else
    v\op\()rb.8     \reg\()_B, [\mem\()_ADDR, \reg\()_A]
    add             \mem\()_ADDR, #1
    v\op\()rb.8     \reg\()_G, [\mem\()_ADDR, \reg\()_A]
    add             \mem\()_ADDR, #1
    v\op\()rb.8     \reg\()_R, [\mem\()_ADDR, \reg\()_A]
.endif
.ifc \wb, !
.if \alt_index
    add             \mem\()_ADDR, #48
.else
    add             \mem\()_ADDR, #46
.endif
.elseif \alt_index == 0
    sub             \mem\()_ADDR, #2
.endif
.elseif \aligned
    v\op\()40.8     {\reg\()_B, \reg\()_G, \reg\()_R, \reg\()_A}, [\mem\()_ADDR]
    v\op\()41.8     {\reg\()_B, \reg\()_G, \reg\()_R, \reg\()_A}, [\mem\()_ADDR]
    v\op\()42.8     {\reg\()_B, \reg\()_G, \reg\()_R, \reg\()_A}, [\mem\()_ADDR]
    v\op\()43.8     {\reg\()_B, \reg\()_G, \reg\()_R, \reg\()_A}, [\mem\()_ADDR]\wb
.else
    v\op\()rb.8     \reg\()_B, [\mem\()_ADDR, \areg\()_A]
    add             \mem\()_ADDR, #1
    v\op\()rb.8     \reg\()_G, [\mem\()_ADDR, \areg\()_A]
    add             \mem\()_ADDR, #1
    v\op\()rb.8     \reg\()_R, [\mem\()_ADDR, \areg\()_A]
.if (\bpp == 32) || (\bpp == 31) && \st
    add             \mem\()_ADDR, #1
    v\op\()rb.8     \reg\()_A, [\mem\()_ADDR, \areg\()_A]
.endif
.ifc \wb, !
    .if (\bpp == 32) || (\bpp == 31) && \st
        add         \mem\()_ADDR, #61
    .else
        add         \mem\()_ADDR, #62
    .endif
.else
    .if (\bpp == 32) || (\bpp == 31) && \st
        sub         \mem\()_ADDR, #3
    .else
        sub         \mem\()_ADDR, #2
    .endif
.endif
.endif
.endm

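@ load_index: fill the _A register of \reg (\areg for 31/32 bpp) with an
@ incrementing byte-offset vector (step 1 for 8 bpp, 2 for RGB565, 3 for RGB888,
@ 4 for unaligned 32-bit pixels) used by ldst as gather/scatter offsets.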
.macro load_index bpp, reg, areg, aligned
.if (\bpp > 0) && ((\bpp < 31) || (\aligned == 0))
    mov             TMP, #0
.if \bpp == 8
    vidup.u8        \reg\()_A, TMP, #1
.elseif \bpp == 16
    vidup.u8        \reg\()_A, TMP, #2
.elseif \bpp == 24
    vidup.u8        \reg\()_A, TMP, #1
    mov             TMP, #3
    vmul.u8         \reg\()_A, \reg\()_A, TMP
.else
    vidup.u8        \areg\()_A, TMP, #4
.endif
.endif
.endm

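@ init: load the parameter block, round the width up to the block size (8 for
@ the packed RGB565 path, 16 otherwise), convert the strides into end-of-row
@ increments for that rounded width, and pre-build whatever the variant needs:
@ the 0x07E0F81F channel mask, the reciprocal table base, the splatted solid
@ colour and the constant gather/scatter index vectors described below.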
.macro init src_bpp, dst_bpp, mask, opa
    ldr             DST_ADDR, [r0, #4]
    ldr             DST_W, [r0, #8]
    ldr             DST_H, [r0, #12]
    ldr             DST_STRIDE, [r0, #16]
    ldr             SRC_ADDR, [r0, #20]
.if \src_bpp > 0
    ldr             SRC_STRIDE, [r0, #24]
.endif
.if \mask
    ldr             MASK_ADDR, [r0, #28]
    ldr             MASK_STRIDE, [r0, #32]
.endif
.if \opa
    ldr             OPA, [r0]
.endif
.if (\src_bpp <= 16) && (\dst_bpp == 16)
.if \opa || \mask
    mov             TMP, #0xF81F
    movt            TMP, #0x7E0
    vdup.32         BITMASK, TMP
.endif
    add             TMP, DST_W, #0x7
    bic             TMP, TMP, #0x7
.else
    add             TMP, DST_W, #0xF
    bic             TMP, TMP, #0xF
.endif
.if \dst_bpp == 32
    ldr             RCP, =(reciprocal - 8)
.endif

.if \dst_bpp == 16
    sub             DST_STRIDE, DST_STRIDE, TMP, lsl #1
.elseif \dst_bpp == 24
    sub             DST_STRIDE, DST_STRIDE, TMP
    sub             DST_STRIDE, DST_STRIDE, TMP, lsl #1
.elseif \dst_bpp >= 31
    sub             DST_STRIDE, DST_STRIDE, TMP, lsl #2
.endif
.if \mask
    sub             MASK_STRIDE, MASK_STRIDE, TMP
.endif
.if \src_bpp == 0
.if \mask || \opa
    .if \dst_bpp > 16
        ldst        0, ld, \src_bpp, SRC, S, D, 0, 0
        vmov.u8     S_A, #0xFF
    .else
        ldst        0, ld, \src_bpp, SRC, S, D, 1, 0
        vmovlb.u16  S_L, S_565
        vsli.32     S_L, S_L, #16
        vand        S_L, S_L, BITMASK
    .endif
.else
    .if \dst_bpp > 16
        ldst        0, ld, \src_bpp, SRC, D, S, 0, 0
    .else
        ldst        0, ld, \src_bpp, SRC, D, S, 1, 0
    .endif
.endif
.else
    .if \src_bpp == 16
        sub         SRC_STRIDE, SRC_STRIDE, TMP, lsl #1
    .elseif \src_bpp == 24
        sub         SRC_STRIDE, SRC_STRIDE, TMP
        sub         SRC_STRIDE, SRC_STRIDE, TMP, lsl #1
    .elseif \src_bpp >= 31
        sub         SRC_STRIDE, SRC_STRIDE, TMP, lsl #2
    .endif
.endif
.if (\src_bpp < 32) && (\mask == 0) && (\opa == 0) && !((\src_bpp <= 16) && (\dst_bpp == 16))
@ 16 to 31/32 or reverse: index @ q0, q1
@ 24 to 31/32 or reverse: index @ q0, q1, q2
@ 16 to 24 or reverse: 16 index @ q0, q1, 24 index @ q2, q3, q7
@ 31 to 31/32: index @ q3 (tail only)
    mov         TMP, #0
.if (\src_bpp == 16) || (\dst_bpp == 16)
    vidup.u8    S_B, TMP, #2
    mov         TMP, #1
    vadd.u8     S_G, S_B, TMP
.if (\src_bpp == 24) || (\dst_bpp == 24)
    vshl.u8     S_R, S_B, #1
    vadd.u8     S_R, S_R, S_B
    vshr.u8     S_R, S_R, #1
    vadd.u8     S_A, S_R, TMP
    vadd.u8     D_A, S_A, TMP
.endif
.elseif (\src_bpp == 24) || (\dst_bpp == 24)
    vidup.u8    S_B, TMP, #1
    mov         TMP, #3
    vmul.u8     S_B, S_B, TMP
    mov         TMP, #1
    vadd.u8     S_G, S_B, TMP
    vadd.u8     S_R, S_G, TMP
.endif
.if \dst_bpp >= 31
    load_index  \dst_bpp, D, S, 0
    vmov.u8     D_A, #0xFF
.endif
.endif
.endm

.macro vqrdmulh_u8 Qd, Qn, Qm      @ 1 bit precision loss
    vmulh.u8       \Qd, \Qn, \Qm
    vqshl.u8       \Qd, \Qd, #1
.endm

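@ premult: scale the B/G/R planes of \mem by the alpha vector \alpha.
@ vrmulh.u8 keeps the rounded high byte of the 8x8 product, i.e. roughly
@ x * a / 255.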
.macro premult mem, alpha
    vrmulh.u8       \mem\()_B, \mem\()_B, \alpha
    vrmulh.u8       \mem\()_G, \mem\()_G, \alpha
    vrmulh.u8       \mem\()_R, \mem\()_R, \alpha
.endm

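@ blend_565: alpha-blend one half (p = b for bottom, t for top) of a packed
@ RGB565 vector. Each pixel is widened to 32 bits and spread out with BITMASK
@ (0x07E0F81F) so that R, G and B can be interpolated together with a single
@ multiply by the alpha prepared in the 0..32 range, a shift by 5 and an add,
@ before being packed and narrowed back into D_565.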
.macro blend_565 p
    vmovl\p\().u16  D_L, D_565
    vsli.32         D_L, D_L, #16
    vand            D_L, D_L, BITMASK
    vsub.u32        D_T, S_L, D_L
    vmovl\p\().u16  D_A, S_A
    vmul.u32        D_T, D_T, D_A
    vshr.u32        D_T, D_T, #5
    vadd.u32        D_L, D_L, D_T
    vand            D_L, D_L, BITMASK
    vshr.u32        D_T, D_L, #16
    vorr            D_L, D_L, D_T
    vmovn\p\().u32  D_565, D_L
.endm

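@ late_init: per-row setup for the RGB565 destination paths. opa == 2 is the
@ 50% fast case and loads the 0x7BEF halving mask (pre-halving the solid colour
@ when src_bpp == 0); opa == 1 rescales OPA from 0..255 to the 0..32 range used
@ by blend_565.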
.macro late_init src_bpp, dst_bpp, mask, opa, mode
.if (\src_bpp <= 16) && (\dst_bpp == 16) && (\mask == 0)
.if \opa == 2
    mov             TMP, #0x7BEF
    vdup.16         BITMASK, TMP
.if \src_bpp == 0
    vshr.u16        S_L, S_565, #1
    vand            S_L, S_L, BITMASK
.endif
.elseif \opa == 1
    vdup.16         S_A, OPA
    mov             TMP, #4
    vadd.u16        S_A, S_A, TMP
    vshr.u16        S_A, S_A, #3
.endif
.endif
.endm

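@ blend: combine one block of already-loaded source and destination planes.
@ Handles the 50% shortcut (halving add), the packed RGB565 path via blend_565
@ (run twice, for the bottom and top halves), and the 8-bit plane paths:
@ all-opaque / fully-transparent early outs, a simple mix for destinations
@ without alpha, and the ARGB8888-destination case, which computes the combined
@ alpha and renormalizes the source alpha through the Newton-Raphson reciprocal
@ seeded from the table at the top of the file.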
.macro blend src_bpp, dst_bpp, mask, opa, mode
.if (\mask == 0) && (\opa == 2)
.if (\src_bpp <= 16) && (\dst_bpp == 16)
.if \src_bpp > 0
    vshr.u16        S_L, S_565, #1
    vand            S_L, S_L, BITMASK
.endif
    vshr.u16        D_L, D_565, #1
    vand            D_L, D_L, BITMASK
    vadd.u16        D_565, S_L, D_L
.else
    vhadd.u8        D_B, D_B, S_B
    vhadd.u8        D_G, D_G, S_G
    vhadd.u8        D_R, D_R, S_R
.endif
.elseif (\src_bpp <= 16) && (\dst_bpp == 16)
    lsl             lr, #1
.if \src_bpp > 0
    vmovlb.u16      S_L, S_565
    vsli.32         S_L, S_L, #16
    vand            S_L, S_L, BITMASK
.endif
    blend_565       b
.if \src_bpp > 0
    vmovlt.u16      S_L, S_565
    vsli.32         S_L, S_L, #16
    vand            S_L, S_L, BITMASK
.endif
    blend_565       t
    lsr             lr, #1
.else
.if \dst_bpp < 32
.if (\opa == 0) && (\mask == 0)
    vmov.u8         D_A, #0xFF
    mov             TMP, #0
    vabav.u8        TMP, S_A, D_A
    cbnz            TMP, 91f
    vmov            D_B, S_B
    vmov            D_G, S_G
    vmov            D_R, S_R
    b               88f
91:
.endif
    vmvn            D_A, S_A
    premult         S, S_A
    premult         D, D_A
.else
    vpush           {d0-d5}
    vmov.u8         S_B, #0xFF
    vmov.u8         S_G, #0
    mov             TMP, #0
    vabav.u8        TMP, S_A, S_B
    cbz             TMP, 91f        @ if(fg.alpha == 255
    mov             TMP, #0
    vabav.u8        TMP, D_A, S_G
    cbnz            TMP, 90f        @    || bg.alpha == 0)
91:
    vpop            {d8-d13}        @   return fg;
    vmov.u8         D_A, #0xFF
    b               88f
90:
    mov             TMP, #0
    vabav.u8        TMP, S_A, S_G
    cmp             TMP, #2         @ if(fg.alpha <= LV_OPA_MIN)
    itt             le              @   return bg;
    vpople          {d0-d5}
    ble             88f
    mov             TMP, #0
    vabav.u8        TMP, D_A, S_B   @ if (bg.alpha == 255)
    cbnz            TMP, 89f        @   return lv_color_mix32(fg, bg);
    vpop            {d0-d5}
    vmvn            D_A, S_A
    premult         S, S_A
    premult         D, D_A
    vqadd.u8        D_B, D_B, S_B
    vqadd.u8        D_G, D_G, S_G
    vqadd.u8        D_R, D_R, S_R
    vmov.u8         D_A, #0xFF
    b               88f
89:
    vmvn            N, S_A
    vmvn            D_A, D_A
    vrmulh.u8       D_A, N, D_A
    vmvn            D_A, D_A        @ D_A = 255 - LV_OPA_MIX2(255 - fg.alpha, 255 - bg.alpha)
    vclz.i8         N, D_A          @ n = clz(D_A)
    vshl.u8         V, D_A, N       @ v = D_A << n
    vshl.u8         S_A, S_A, N
    vshr.u8         N, V, #4        @ N is used as tmp from now on
    vldrb.u8        R, [RCP, N]     @ r = reciprocal[(v >> 4) - 8]
    vrmulh.u8       N, V, R         @ r = newton(v,r)
    vmvn            N, N            @   = vqrdmulh.u8(vmvn(vrmulh(v, r)), r)
    vqrdmulh_u8     R, N, R         @ but vqrdmulh does not support u8, so we implement one
    vrmulh.u8       N, V, R         @ and do it twice
    vmvn            N, N
    vqrdmulh_u8     R, N, R
    vqrdmulh_u8     S_A, S_A, R     @ S_A' = S_A * 255 / D_A = vrdmulh(S_A << n, r)
    vpop            {d0-d5}
    premult         S, S_A
    vmvn            S_A, S_A
    premult         D, S_A
.endif
    vqadd.u8        D_B, D_B, S_B
    vqadd.u8        D_G, D_G, S_G
    vqadd.u8        D_R, D_R, S_R
.endif
.if \dst_bpp == 31
    vmov.u8         D_A, #0xFF
.endif
88:
.endm

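@ blend_line: process one row. Rows of 31/32 bpp surfaces are split into a
@ multiple-of-16 part, which can use the interleaving vld4x/vst4x block forms,
@ and a remainder handled with gathers/scatters under tail predication.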
.macro blend_line src_bpp, dst_bpp, mask, opa, mode
.if (\src_bpp < 31) && (\dst_bpp < 31)
    blend_block     \src_bpp, \dst_bpp, \mask, \opa, \mode, DST_W, 0
.else
    bics            TMP, DST_W, #0xF
    beq             87f
    blend_block     \src_bpp, \dst_bpp, \mask, \opa, \mode, TMP, 1
87:
    ands            TMP, DST_W, #0xF
    beq             86f
    blend_block     \src_bpp, \dst_bpp, \mask, \opa, \mode, TMP, 0
86:
.endif
.endm

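@ blend_block: tail-predicated low-overhead loop (wlstp/letp) over \w pixels,
@ 8 per iteration for packed RGB565 destinations and 16 otherwise. The format,
@ mask and opa parameters select between a plain copy/convert, the RGB565
@ blend, the no-source-alpha blend and the full source-alpha blend; where the
@ eight q registers are not enough, S_* (d0-d5) or D_A (d14-d15) is spilled to
@ the stack around the step that needs the extra temporaries.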
.macro blend_block src_bpp, dst_bpp, mask, opa, mode, w, aligned
.if (\src_bpp <= 16) && (\dst_bpp == 16)
    wlstp.16            lr, \w, 1f
.else
    wlstp.8             lr, \w, 1f
.endif
2:
.if (\src_bpp < 32) && (\mask == 0) && (\opa == 0)
@ no blend
    .if \src_bpp == 0
        ldst            1, st, \dst_bpp, DST, D, S, 0, 1, !, \aligned
    .elseif (\src_bpp == \dst_bpp) || (\src_bpp == 31) && (\dst_bpp == 32)
        .if \dst_bpp < 31
            .if \src_bpp < 31
                ldst    0, ld, \src_bpp, SRC, D, S, 0, 1, !, \aligned
            .else
                ldst    0, ld, \src_bpp, SRC, D, S, 0, 1, !, \aligned
            .endif
            ldst        1, st, \dst_bpp, DST, D, S, 0, 1, !, \aligned
        .else
            ldst        0, ld, \src_bpp, SRC, D, S, 0, 1, !, \aligned
            ldst        1, st, \dst_bpp, DST, D, S, 0, 1, !, \aligned
        .endif
    .else
        .if (\dst_bpp < 31) && (\src_bpp < 31)
            ldst        0, ld, \src_bpp, SRC, D, S, 1, 2, !, \aligned
            ldst        1, st, \dst_bpp, DST, D, S, 1, 2, !, \aligned
        .else
            ldst        0, ld, \src_bpp, SRC, D, S, 1, 1, !, \aligned
            ldst        1, st, \dst_bpp, DST, D, S, 1, 1, !, \aligned
        .endif
    .endif
.elseif (\src_bpp <= 16) && (\dst_bpp == 16)
    .if \src_bpp > 0
        ldst            0, ld, \src_bpp, SRC, S, D, 0, 0, !, \aligned
    .endif
        ldst            0, ld, \dst_bpp, DST, D, S, 0, 0, , \aligned
    .if \mask
        ldst            0, ld, 8, MASK, S, D, 1, 0, !
        .if \opa == 2
            vshr.u16    S_A, S_A, #1
        .elseif \opa == 1
            vmul.u16    S_A, S_A, OPA
            vshr.u16    S_A, S_A, #8
        .endif
        mov             TMP, #4
        vadd.u16        S_A, S_A, TMP
        vshr.u16        S_A, S_A, #3
    .endif
    blend               \src_bpp, \dst_bpp, \mask, \opa, \mode
    ldst                1, st, \dst_bpp, DST, D, S, 0, 0, !, \aligned
.elseif \src_bpp < 32
@ no src_a
.if \src_bpp > 0
    load_index          \src_bpp, S, D, \aligned
    ldst                0, ld, \src_bpp, SRC, S, D, 1, 0, !, \aligned
.elseif (\opa == 1) || \mask
    vpush               {d0-d5}
.endif
    load_index          \dst_bpp, D, S, \aligned
    ldst                0, ld, \dst_bpp, DST, D, S, 1, 0, , \aligned
    .if \mask
        ldst            0, ld, 8, MASK, S, D, 0, 0, !, \aligned
        .if \opa == 2
            vshr.u8     S_A, S_A, #1
        .elseif \opa == 1
        .if \dst_bpp == 32
            vpush       {d14-d15}
        .endif
            vdup.8      D_A, OPA
            vrmulh.u8   S_A, S_A, D_A
        .if \dst_bpp == 32
            vpop        {d14-d15}
        .endif
        .endif
    .elseif \opa == 1
        vdup.8          S_A, OPA
    .endif
    blend               \src_bpp, \dst_bpp, \mask, \opa, \mode
.if (\src_bpp == 0) && ((\opa == 1) || \mask)
    vpop                {d0-d5}
.endif
    .if (\dst_bpp == 32) || \mask || (\opa == 1)
        load_index      \dst_bpp, D, S, \aligned
    .endif
    ldst                1, st, \dst_bpp, DST, D, S, 1, 0, !, \aligned
.else
@ src_a (+\mask) (+\opa)
    load_index          \dst_bpp, D, S, \aligned
    ldst                0, ld, \dst_bpp, DST, D, S, 1, 0, , \aligned
    .if (\dst_bpp == 32) && (\mask || \opa || (\aligned == 0))
        vpush           {d14-d15}
    .endif
    load_index          \src_bpp, S, D, \aligned
    ldst                0, ld, \src_bpp, SRC, S, D, 1, 0, !, \aligned
    .if \mask == 0
        .if \opa
            vdup.8      D_A, OPA
            vrmulh.u8   S_A, S_A, D_A
        .endif
    .else
        ldst            0, ld, 8, MASK, D, S, 0, 0, !, \aligned
        vrmulh.u8       S_A, S_A, D_A
        .if \opa
            vdup.8      D_A, OPA
            vrmulh.u8   S_A, S_A, D_A
        .endif
    .endif
    .if (\dst_bpp == 32) && (\mask || \opa || (\aligned == 0))
        vpop            {d14-d15}
    .endif
    blend               \src_bpp, \dst_bpp, \mask, \opa, \mode
    load_index          \dst_bpp, D, S, \aligned
    ldst                1, st, \dst_bpp, DST, D, S, 1, 0, !, \aligned
.endif
    letp                lr, 2b
1:
.endm

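@ enter/exit: function prologue and epilogue. The simple src <= 16 bpp to
@ RGB565 copies (no opa, no mask) only need the GP registers (complex = 0);
@ every other variant also saves and restores q4-q7 (d8-d15).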
.macro enter complex
    push        {r4-r11, lr}
.if \complex
    vpush       {d8-d15}
.endif
.endm

.macro exit complex
.if \complex
    vpop        {d8-d15}
.endif
    pop         {r4-r11, pc}
.endm

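@ preload: hint the prefetcher with a PLD roughly one row's worth of \bpp
@ pixels ahead of \mem.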
.macro preload mem, bpp
.if \bpp >= 31
    pld         [\mem\()_ADDR, DST_W, lsl #2]
.elseif \bpp == 24
    add         TMP, DST_W, DST_W, lsl #1
    pld         [\mem\()_ADDR, TMP]
.elseif \bpp == 16
    pld         [\mem\()_ADDR, DST_W, lsl #1]
.elseif \bpp == 8
    pld         [\mem\()_ADDR, DST_W]
.endif
.endm

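@ next: step the row pointers by the end-of-row increments prepared in init.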
.macro next src_bpp, mask
    add         DST_ADDR, DST_ADDR, DST_STRIDE
.if \src_bpp > 0
    add         SRC_ADDR, SRC_ADDR, SRC_STRIDE
.endif
.if \mask
    add         MASK_ADDR, MASK_ADDR, MASK_STRIDE
.endif
.endm

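@ blender: complete function body for one variant: prologue, init, then the
@ per-row loop. Variants with an opa parameter test it at run time and divert
@ OPA values of 0x7F/0x80 (about 50%) to the cheaper opa == 2 code path.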
.macro blender src_bpp, dst_bpp, mask, opa, mode
.if (\src_bpp <= 16) && (\dst_bpp == 16) && (\opa == 0) && (\mask == 0)
    enter       0
.else
    enter       1
.endif
    init        \src_bpp, \dst_bpp, \mask, \opa
    movs        H, DST_H
    beq         0f
    preload     SRC, \src_bpp
.if \mask || \opa || (\src_bpp == 32)
    preload     DST, \dst_bpp
.endif
.if \opa && (\src_bpp < 32) && (\dst_bpp < 32)
4:
@ 50% OPA can be accelerated (OPA == 0x7F/0x80)
    add         TMP, OPA, #1
    tst         TMP, #0x7E
    bne         3f
    late_init   \src_bpp, \dst_bpp, \mask, 2, \mode
    blend_line  \src_bpp, \dst_bpp, \mask, 2, \mode
    next        \src_bpp, \mask
    subs        H, #1
    bne         4b
    b           0f
.endif
3:
    late_init   \src_bpp, \dst_bpp, \mask, \opa, \mode
    blend_line  \src_bpp, \dst_bpp, \mask, \opa, \mode
    next        \src_bpp, \mask
    subs        H, #1
    bne         3b
0:
.if (\src_bpp <= 16) && (\dst_bpp == 16) && (\opa == 0) && (\mask == 0)
    exit        0
.else
    exit        1
.endif
.ltorg
.endm

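@ export / export_set: emit the public Thumb entry points. Each source/
@ destination format pair below gets four symbols: plain, _with_opa,
@ _with_mask and _mix_mask_opa.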
.macro export name, src_bpp, dst_bpp, mask, opa, mode
.thumb_func
.global \name
\name\():
    blender     \src_bpp, \dst_bpp, \mask, \opa, \mode
.endm

.macro export_set src, dst, src_bpp, dst_bpp, mode
.ifc \src, color
    export lv_\src\()_blend_to_\dst\()_helium, \src_bpp, \dst_bpp, 0, 0, \mode
    export lv_\src\()_blend_to_\dst\()_with_opa_helium, \src_bpp, \dst_bpp, 0, 1, \mode
    export lv_\src\()_blend_to_\dst\()_with_mask_helium, \src_bpp, \dst_bpp, 1, 0, \mode
    export lv_\src\()_blend_to_\dst\()_mix_mask_opa_helium, \src_bpp, \dst_bpp, 1, 1, \mode
.else
    export lv_\src\()_blend_\mode\()_to_\dst\()_helium, \src_bpp, \dst_bpp, 0, 0, \mode
    export lv_\src\()_blend_\mode\()_to_\dst\()_with_opa_helium, \src_bpp, \dst_bpp, 0, 1, \mode
    export lv_\src\()_blend_\mode\()_to_\dst\()_with_mask_helium, \src_bpp, \dst_bpp, 1, 0, \mode
    export lv_\src\()_blend_\mode\()_to_\dst\()_mix_mask_opa_helium, \src_bpp, \dst_bpp, 1, 1, \mode
.endif
.endm

export_set color, rgb565, 0, 16, normal
export_set rgb565, rgb565, 16, 16, normal
export_set rgb888, rgb565, 24, 16, normal
export_set xrgb8888, rgb565, 31, 16, normal
export_set argb8888, rgb565, 32, 16, normal
export_set color, rgb888, 0, 24, normal
export_set rgb565, rgb888, 16, 24, normal
export_set rgb888, rgb888, 24, 24, normal
export_set xrgb8888, rgb888, 31, 24, normal
export_set argb8888, rgb888, 32, 24, normal
export_set color, xrgb8888, 0, 31, normal
export_set rgb565, xrgb8888, 16, 31, normal
export_set rgb888, xrgb8888, 24, 31, normal
export_set xrgb8888, xrgb8888, 31, 31, normal
export_set argb8888, xrgb8888, 32, 31, normal
export_set color, argb8888, 0, 32, normal
export_set rgb565, argb8888, 16, 32, normal
export_set rgb888, argb8888, 24, 32, normal
export_set xrgb8888, argb8888, 31, 32, normal
export_set argb8888, argb8888, 32, 32, normal

#endif /*LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_HELIUM && defined(__ARM_FEATURE_MVE) && __ARM_FEATURE_MVE && LV_USE_NATIVE_HELIUM_ASM*/