1/*
2 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#include "pico/asm_helper.S"
8
9#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
10
11#ifndef PICO_FLOAT_IN_RAM
12#define PICO_FLOAT_IN_RAM 0
13#endif
14
15pico_default_asm_setup
16
// float_section <name>
// Switches the assembler to the section that holds the shim group <name>.
// The section name is built with RAM_SECTION_NAME when PICO_FLOAT_IN_RAM is
// non-zero and with SECTION_NAME otherwise; the flags are "ax" in both cases.
// TODO: separate flag for shims?
.macro float_section name
#if !PICO_FLOAT_IN_RAM
    .section SECTION_NAME(\name), "ax"
#else
    .section RAM_SECTION_NAME(\name), "ax"
#endif
.endm
25
// float_table_shim_on_use_helper
//
// In:   ip = address to start scanning from (a zero value hits a breakpoint
//            in builds without NDEBUG)
// Out:  does not return to lr; control continues at the address found below,
//       with r0-r2 holding the values they had on entry
//
// Halfwords are read upwards from ip until one whose top byte is 0xdf is
// found; the low byte of that halfword is used as a byte offset into
// sf_table. The 32-bit word stored directly after that halfword is then
// written to sf_table + offset and also over the saved lr slot on the stack,
// so the closing pop loads it into pc.
float_section float_table_shim_on_use_helper
regular_func float_table_shim_on_use_helper
    push    {r0, r1, r2, lr}
    mov     r0, ip
#ifndef NDEBUG
    // sanity check: ip must be non-zero, i.e. the caller has to be the
    // (shimmable_) table_tail_call macro
    cmp     r0, #0
    bne     .Lshim_scan
    bkpt    #0
#endif
.Lshim_scan:
    ldrh    r1, [r0]
    lsrs    r2, r1, #8              // r2 = top byte of the halfword just read
    adds    r0, #2                  // r0 now points past that halfword
    cmp     r2, #0xdf
    bne     .Lshim_scan
    uxtb    r1, r1                  // r1 holds table offset
    lsrs    r2, r0, #2              // carry = bit 1 of r0
    bcc     .Lshim_load_aligned
    // r0 is only halfword aligned: assemble the word from two halfwords
    ldrh    r2, [r0]
    ldrh    r0, [r0, #2]
    lsls    r0, #16
    orrs    r0, r2
    b       .Lshim_install
.Lshim_load_aligned:
    ldr     r0, [r0]
.Lshim_install:
    ldr     r2, =sf_table
    str     r0, [r2, r1]            // record the word in the table
    str     r0, [sp, #12]           // replace the saved lr with it
    pop     {r0, r1, r2, pc}        // restore r0-r2 and jump to the word
58
float_section 642float_shims

// 64-bit integer and fixed-point to float conversions, with rounding.
//
// In:   r0 = low word, r1 = high word of the 64-bit value
//       r2 = number of bits after the binary point (fix variants only; the
//            integer variants load 0 and fall through)
// Out:  a zero input returns with r0 = 0 (+0); every other input is handed
//       to the ROM routine at 0x29ef (packx) and its result is returned
//       unchanged (presumably the packed float in r0 - confirm against the
//       ROM source)
// Saves r4, r5 and lr on the stack.

@ convert uint64 to float, rounding
regular_func uint642float_shim
    movs    r2, #0                  @ no fractional bits; fall through

@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func ufix642float_shim
    push    {r4, r5, lr}
    cmp     r1, #0
    bpl     .Li642_signed           @ top bit clear: the signed code can be used
    lsls    r5, r1, #31             @ bit about to be shifted out of r1 joins the sticky bits
    orrs    r5, r0
    lsrs    r0, r1, #1              @ r0 = high word shifted down one place
    subs    r2, #1                  @ one fewer bit after the point
    b       .Li642_pack

@ convert int64 to float, rounding
regular_func int642float_shim
    movs    r2, #0                  @ no fractional bits; fall through

@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func fix642float_shim
    push    {r4, r5, lr}
.Li642_signed:
    movs    r5, r0
    orrs    r5, r1
    beq     ret_pop45               @ value is zero: return +0
    asrs    r5, r1, #31             @ r5 = sign bits
.Li642_normalise:
    asrs    r4, r1, #24             @ try shifting 7 bits at a time
    cmp     r4, r5
    bne     .Li642_normalised       @ next shift would overflow
    lsls    r1, #7                  @ r0:r1 <<= 7
    lsrs    r4, r0, #25
    orrs    r1, r4
    lsls    r0, #7
    adds    r2, #7                  @ seven more bits after the point
    b       .Li642_normalise
.Li642_normalised:
    movs    r5, r0                  @ low word moves to r5
    movs    r0, r1                  @ high word moves to r0
.Li642_pack:
    negs    r2, r2
    adds    r2, #32+29              @ r2 = 61 - (bits after point)

    // bl packx
    ldr     r1, =0x29ef             // packx
    blx     r1
ret_pop45:
    pop     {r4, r5, pc}
110
float_section fatan2_shim

// fatan2_shim
//
// Two-argument arctangent assembled from ROM routines reached through fixed
// addresses: unpackx (0x29c1), cordic_vec (0x2b97) and the pi_q29 constant
// (0x2cfc). In the comments below x is the operand that arrived in r1 and y
// the one that arrived in r0; they end up in r0 and r1 respectively after the
// two unpackx calls.
//
// The routine finishes with a bx to ROM address 0x2b19 while r4, r5 and lr
// are still on the stack. That routine is not named in this file; presumably
// it packs the Q29 angle in r0 (exponent in r2) and performs the return -
// confirm against the ROM source.
regular_func fatan2_shim
    push    {r4, r5, lr}

    ldr     r4, =0x29c1             // unpackx
    mov     ip, r4
@ unpack arguments and shift one down to have common exponent
    blx     ip
    mov     r4, r0                  @ swap r0 <-> r1 ...
    mov     r0, r1
    mov     r1, r4
    mov     r4, r2                  @ ... and r2 <-> r3
    mov     r2, r3
    mov     r3, r4
    blx     ip
    lsls    r0, #5                  @ Q28
    lsls    r1, #5                  @ Q28
    adds    r4, r2, r3              @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
    asrs    r4, #9
    adds    r4, #1
    bmi     .Latan2_on_axis         @ force y to 0 proper, so result will be zero
    subs    r4, r2, r3              @ calculate shift
    bge     .Latan2_shift_y         @ ex>=ey?
    negs    r4, r4                  @ make shift positive
    asrs    r0, r4
    cmp     r4, #28
    blo     .Latan2_cordic
    asrs    r0, #31
    b       .Latan2_cordic
.Latan2_shift_y:
    asrs    r1, r4
    cmp     r4, #28
    blo     .Latan2_cordic
.Latan2_on_axis:
@ here |x|>>|y| or both x and y are +/-0
    cmp     r0, #0
    bge     .Latan2_signed_zero     @ x positive, return signed 0
    ldr     r3, =0x2cfc             @ &pi_q29, circular coefficients
    ldr     r0, [r3]                @ x negative, return +/- pi
    asrs    r1, #31
    eors    r0, r1
    b       .Latan2_pack
.Latan2_signed_zero:
    asrs    r0, r1, #31
    b       .Latan2_pack
.Latan2_cordic:
    movs    r2, #0                  @ initial angle
    ldr     r3, =0x2cfc             @ &pi_q29, circular coefficients
    cmp     r0, #0                  @ x negative
    bge     .Latan2_vector
    negs    r0, r0                  @ rotate to 1st/4th quadrants
    negs    r1, r1
    ldr     r2, [r3]                @ pi Q29
.Latan2_vector:
    movs    r4, #1                  @ m=1
    ldr     r5, =0x2b97             @ cordic_vec
    blx     r5                      @ also produces magnitude (with scaling factor 1.646760119), which is discarded
    mov     r0, r2                  @ result here is -pi/2..3pi/2 Q29
@ disabled in the original source:
@   asrs r2,#29
@   subs r0,r2
    ldr     r3, =0x2cfc             @ &pi_q29, circular coefficients
    ldr     r2, [r3]                @ pi Q29
    adds    r4, r0, r2              @ attempt to fix -3pi/2..-pi case
    bcs     .Latan2_fiddle          @ -pi/2..0? leave result as is
    subs    r4, r0, r2              @ <pi? leave as is
    bmi     .Latan2_fiddle
    subs    r0, r4, r2              @ >pi: take off 2pi
.Latan2_fiddle:
    subs    r0, #1                  @ fiddle factor so atan2(0,1)==0
.Latan2_pack:
    movs    r2, #0                  @ exponent for pack
    ldr     r3, =0x2b19
    bx      r3
184
float_section float232_shims

// float2int_shim / float2fix_shim
//
// In:   r0 = float bit pattern
//       r1 = second argument passed on to the ROM routine (float2int_shim
//            loads 0 and falls through)
// Out:  r0 = 0 when the input has its sign bit set and an all-zero exponent
//       field (-0 or a negative denormal); every other input is forwarded
//       unchanged to the ROM routine at 0x2acd, which returns to the caller.
regular_func float2int_shim
    movs    r1, #0                  @ fall through
regular_func float2fix_shim
    // check for -0 or -denormal upfront
    asrs    r2, r0, #23             // r2 = sign-extended sign and exponent bits
    adds    r2, #128                // add 256 in two steps of 128 ...
    adds    r2, #128                // ... result is zero only when r2 was -256
    beq     .Lf2fix_neg_tiny
    // call original
    ldr     r2, =0x2acd
    bx      r2
.Lf2fix_neg_tiny:
    movs    r0, #0
    bx      lr
201
float_section float264_shims

// Float to 64-bit integer and fixed-point conversions.
//
// In:   r0 = float bit pattern
//       r1 = number of places after the point (the integer variants load 0
//            and fall through)
// Out:  r0:r1 = 64-bit result
//
// The signed entry points push lr, call f2fix and then branch to d2f64_a,
// which checks the result against the sign extension in r3 and returns.
// The unsigned entry points return 0 for inputs with the sign bit set and
// otherwise fall straight into f2fix, which returns to the caller itself.

regular_func float2int64_shim
    movs    r1, #0                  @ and fall through
regular_func float2fix64_shim
    push    {lr}
    bl      f2fix
    b       d2f64_a

regular_func float2uint64_shim
    movs    r1, #0                  @ and fall through
regular_func float2ufix64_shim
    asrs    r3, r0, #23             @ negative? return 0
    bmi     ret_dzero
@ and fall through

@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
f2fix:
    push    {r4, lr}
    mov     ip, r1                  @ keep the requested point position for d2fix_a
    asrs    r3, r0, #31             @ r3 = sign extension bits (0 or -1)
    lsls    r0, #1                  @ drop the sign bit
    lsrs    r2, r0, #24             @ r2 = exponent field
    beq     .Lf2fix_zero            @ zero?
    cmp     r2, #0xff               @ Inf?
    beq     .Lf2fix_extreme
    subs    r1, r2, #1
    subs    r2, #0x7f               @ remove exponent bias
    lsls    r1, #24
    subs    r0, r1                  @ insert implied 1
    eors    r0, r3
    subs    r0, r3                  @ negate the mantissa when the input is negative
    asrs    r1, r0, #4              @ convert to double format
    lsls    r0, #28
    ldr     r4, =d2fix_a
    bx      r4                      @ d2fix_a pops r4 and returns
.Lf2fix_zero:
    movs    r0, #0
    movs    r1, r0
    movs    r3, r0
    pop     {r4, pc}
.Lf2fix_extreme:
    mvns    r0, r3                  @ return max/min value
    mvns    r1, r3
    pop     {r4, pc}

ret_dzero:
    movs    r0, #0
    movs    r1, #0
    bx      lr
255
float_section d2fix_a_float

// d2fix_a
//
// Shifts a signed mantissa into its final fixed-point position.
//
// In:   r0:r1 = signed mantissa (low word : high word)
//       r2    = unbiased exponent
//       r3    = mantissa sign extension bits
//       r12   = offset for the required binary point position
//       stack = r4 and a return address, as pushed by the caller (f2fix in
//               this file pushes r4 and lr before jumping here)
// Out:  r0:r1 = shifted value, clamped to the extreme values when a shift up
//               of 12 or more places would be required; r4 restored
//
// d2f64_a, which follows, is a separate tail used by float2fix64_shim.
.weak d2fix_a // weak because it exists in float shims too
.thumb_func
d2fix_a:
    add     r2, ip                  @ exponent plus offset for required binary point position
    subs    r2, #52                 @ required shift
    bmi     .Ld2fix_shift_down      @ shift down?
@ here a shift up by r2 places
    cmp     r2, #12                 @ will clamp?
    bge     .Ld2fix_clamp
    movs    r4, r0
    lsls    r1, r2
    lsls    r0, r2
    negs    r2, r2
    adds    r2, #32                 @ complementary shift
    lsrs    r4, r2                  @ bits carried from the low word into the high word
    orrs    r1, r4
    pop     {r4, pc}
.Ld2fix_clamp:
    mvns    r0, r3
    mvns    r1, r3                  @ overflow: clamp to extreme fixed-point values
    pop     {r4, pc}
.Ld2fix_shift_down:
@ here a shift down by -r2 places
    adds    r2, #32
    bmi     .Ld2fix_long_shift      @ long shift?
    mov     r4, r1
    lsls    r4, r2                  @ bits carried from the high word into the low word
    negs    r2, r2
    adds    r2, #32                 @ complementary shift
    asrs    r1, r2
    lsrs    r0, r2
    orrs    r0, r4
    pop     {r4, pc}
.Ld2fix_long_shift:
@ here a long shift down
    movs    r0, r1
    asrs    r1, #31                 @ shift down 32 places
    adds    r2, #32
    bmi     .Ld2fix_very_long_shift @ very long shift?
    negs    r2, r2
    adds    r2, #32
    asrs    r0, r2
    pop     {r4, pc}
.Ld2fix_very_long_shift:
    movs    r0, r3                  @ result very near zero: use sign extension bits
    movs    r1, r3
    pop     {r4, pc}

// d2f64_a
// In:   r0:r1 = 64-bit result, r3 = sign extension bits,
//       stack = return address only
// Out:  r0:r1 unchanged when the sign of r1 matches r3, otherwise replaced by
//       the extreme fixed-point value selected by r3
d2f64_a:
    asrs    r2, r1, #31
    cmp     r2, r3
    bne     .Ld2f64_saturate        @ sign extension bits fail to match sign of result?
    pop     {pc}
.Ld2f64_saturate:
    mvns    r0, r3
    movs    r1, #1
    lsls    r1, #31
    eors    r1, r0                  @ generate extreme fixed-point values
    pop     {pc}
320
float_section float2double_shim

// float2double_shim
//
// In:   r0 = float bit pattern
// Out:  r0 = low word, r1 = high word of the double bit pattern
// Uses r2 and r3 as scratch; nothing is pushed on the stack.
//
// Only the exponent field selects the special cases: an all-zero exponent
// returns a signed zero and an exponent of 0xff returns a signed infinity,
// whatever the mantissa bits are.
regular_func float2double_shim
    lsrs    r3, r0, #31             @ sign bit
    lsls    r3, #31                 @ ... moved back to bit 31, rest cleared
    lsls    r1, r0, #1              @ r1 = input with the sign shifted out
    lsrs    r2, r1, #24             @ exponent
    beq     .Lf2d_zero              @ zero?
    cmp     r2, #0xff               @ Inf?
    beq     .Lf2d_inf
    lsrs    r1, #4                  @ exponent and top 20 bits of mantissa
    ldr     r2, =(0x3ff-0x7f)<<20   @ difference in exponent offsets
    adds    r1, r2
    orrs    r1, r3                  @ put the sign back
    lsls    r0, #29                 @ bottom 3 bits of mantissa
    bx      lr
.Lf2d_zero:
    movs    r1, r3                  @ return signed zero
.Lf2d_clear_low:
    movs    r0, #0
    bx      lr
.Lf2d_inf:
    ldr     r1, =0x7ff00000         @ return signed infinity
    adds    r1, r3
    b       .Lf2d_clear_low
345
346#endif