1/*
2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#if !PICO_RP2040
8#include "pico/asm_helper.S"
9
10pico_default_asm_setup
11
12.macro float_section name
13#if PICO_FLOAT_IN_RAM
14.section RAM_SECTION_NAME(\name), "ax"
15#else
16.section SECTION_NAME(\name), "ax"
17#endif
18.endm
19
20.macro float_wrapper_section func
21float_section WRAPPER_FUNC_NAME(\func)
22.endm
23
24@ load a 32-bit constant n into register rx
25.macro movlong rx,n
26 movw \rx,#(\n)&0xffff
27 movt \rx,#((\n)>>16)&0xffff
28.endm
29
30float_section frrcore_v
31
32// 1/2π to plenty of accuracy
33.long 0                      @ this allows values of e down to -32
34rtwopi:
35.long 0,0
36.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410
37
38@ input:
39@ r0 mantissa m Q23
40@ r1 exponent e>=-32, typically offset by +9
41@ output:
42@ r0..r1 preserved
43@ r6 range reduced result in revolutions Q32
44@ r2,r3,r4,r5 trashed
45.thumb_func
46frr_core:
47 adr r2,rtwopi
48 asrs r3,r1,#5               @ k=e/32, k<=5 for e offsets up to 9+32
49 add r2,r2,r3,lsl#2          @ p
50 and r3,r1,#31               @ s=e%32
51 mov r4,#1
52 lsls r4,r4,r3               @ 1<<s
53 umull r3,r4,r4,r0
54@ r2    p
55@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
56 ldr r5,[r2,#12]             @ a0=p[3]
57 umull r5,r6,r5,r4           @ r0=a0*u1 hi, discard lo
58@ r6  r0
59 ldr r5,[r2,#8]              @ a1=p[2]
60 mla r6,r5,r4,r6             @ a1*u1 lo, discard hi
61 umlal r5,r6,r5,r3           @ a1*u0 hi, discard lo
62@ r6  r0
63 ldr r5,[r2,#4]              @ a2=p[1]
64 mla r6,r5,r3,r6             @ r0+=a2*u0
65 bx r14
66
67float_wrapper_section expf
68
69wrapper_func expf
70@ soft float version, via 2^x
71 asrs r1,r0,#23
72 bmi 1f
73 cmp r1,#0x85
74 bge 3f
7510:
76 movs r2,#1
77 bfi r0,r2,#23,#9
78 subs r1,#0x7e
79 bmi 2f
80 lsl r0,r1                   @ x Q24
8111:
82 movlong r3,0x5c551d95       @ 1/log(2) Q30
83 smull r0,r1,r0,r3           @ Q54
84 adr r2,k_exp2
85 vldmia r2,{s8-s10}
86 lsrs r2,r0,#22
87 bfi r2,r1,#10,#17           @ ε Q32
88 vmov s0,r2
89 vcvt.f32.u32 s0,s0,#32
90 vmul.f32 s1,s0,s0
91 ubfx r2,r1,#17,#5
92 vmul.f32 s4,s0,s8
93 adr r3,exptab3
94 vmul.f32 s2,s0,s1
95 ldr r2,[r3,r2,lsl#2]
96 vmla.f32 s4,s1,s9
97 asrs r1,#22
98 vmla.f32 s4,s2,s10
99 add r2,r2,r1,lsl#23
100 vmov s0,r2
101 vmla.f32 s0,s0,s4
102 vmov r0,s0
103 bx r14
104
1052:                           @ x0.5
106 rsbs r1,#0
107 lsrs r0,r1
108@ adc r0,#0                  @ rounding not needed
109 b 11b
110
1113:                           @ risk of overflow, Inf,NaN
112 movlong r2,0x42B17218
113 cmp r0,r2
114 blo 10b                     @ in range after all
115 cmp r0,#0x7f800000
116 bls 4f                      @ not NaN?
117 orrs r0,#0x00400000
118 bx r14
119
1204:
121 movlong r0,0x7f800000       @ return +Inf
122 bx r14
123
1241:                           @ x<0, r1=0xffffffXX where XX is biased exponent
125 cmn r1,#0x7b
126 bge 5f                      @ risk of underflow, -Inf, -NaN?
12713:
128 movs r2,#1
129 bfi r0,r2,#23,#9
130 adds r1,#0x82
131 bpl 6f
132 rsbs r1,#0
133 lsrs r0,r1
134 adc r0,r0,#0                @ rounding
13512:
136 rsbs r0,#0
137 b 11b
138
1396:
140 lsls r0,r1
141 b 12b
142
1435:
144 movlong r2,0xC2AEAC4F
145 cmp r0,r2
146 bls 13b
147 cmp r0,#0xff800000
148 bls 14f
149 orrs r0,#0x00400000
150 bx r14
15114:
152 mov r0,#0
153 bx r14
154
155.align 3
156k_exp2:
157.float 0.693147181           @ log2
158.float 0.240226507           @ log²2/2
159.float 0.055504109           @ log³2/6
160
161exptab3:                     @ pow(2,[0..31]/32)
162.word 0x3f800000
163.word 0x3f82cd87
164.word 0x3f85aac3
165.word 0x3f88980f
166.word 0x3f8b95c2
167.word 0x3f8ea43a
168.word 0x3f91c3d3
169.word 0x3f94f4f0
170.word 0x3f9837f0
171.word 0x3f9b8d3a
172.word 0x3f9ef532
173.word 0x3fa27043
174.word 0x3fa5fed7
175.word 0x3fa9a15b
176.word 0x3fad583f
177.word 0x3fb123f6
178.word 0x3fb504f3
179.word 0x3fb8fbaf
180.word 0x3fbd08a4
181.word 0x3fc12c4d
182.word 0x3fc5672a
183.word 0x3fc9b9be
184.word 0x3fce248c
185.word 0x3fd2a81e
186.word 0x3fd744fd
187.word 0x3fdbfbb8
188.word 0x3fe0ccdf
189.word 0x3fe5b907
190.word 0x3feac0c7
191.word 0x3fefe4ba
192.word 0x3ff5257d
193.word 0x3ffa83b3
194
195float_wrapper_section logf
196
197wrapper_func logf
198 cmp r0,#0x7f800000          @ catch Inf, NaN, -ve
199 bhs 1f
200 asrs r1,r0,#23              @ get exponent; C from here is preserved...
201 beq 2f                      @ ±0?
202 mov r2,#1
203 bfi r0,r2,#23,#9            @ fix implied 1
204 it cc                       @ 50% ... to here...
205 lslcc r0,#1                 @ this plus sbc below means we work relative to nearby power of 2
206 adr r3,#k_log3
207 vldmia r3,{s8-s10}
208@ 0x00c00000r0 < 0x017fffff
209 adr r3,logtab3-24*8+4
210 add r3,r3,r0,lsr#16         @ look up r0>>19 rounded, preserving flags
211 bic r3,#7
212
213 ldrd r2,r3,[r3]
214 mul r0,r0,r2                @ ε
215 vmov s0,s1,r3,r0            @ s0=-log u, s1216
217 vcvt.f32.s32 s1,s1,#32
218 vmul.f32 s2,s1,s1           @ power series in ε
219 sbc r1,r1,#0x7e             @ ... and here
220 vmul.f32 s3,s1,s2
221 lsls r1,#23                 @ e Q23
222 vmul.f32 s4,s2,s2           @ to ε⁴
223@ movlong r2,0x58b90bfc      @ log 2 Q31, more accurate than we deserve
224 movw r2,0x0bfc
225 vmul.f32 s2,s2,s8
226 movt r2,0x58b9
227 vmul.f32 s3,s3,s9
228 smmulr r1,r1,r2             @ Q22
229 vmul.f32 s4,s4,s10
230 vmov s7,r1
231 vsub.f32 s3,s3,s4
232 vcvt.f32.s32 s7,s7,#22
233 vsub.f32 s2,s2,s3
234 vsub.f32 s1,s1,s2
235 vadd.f32 s0,s0,s1           @ log ε - log u
236 vadd.f32 s0,s0,s7           @ e log 2 + log ε - log u
237 vmov r0,s0
238 bx r14
239
2401:
241 bgt 3f                      @ +NaN?
242 beq 10f                     @ +Inf?
2432:
244 cmp r0,#0x80800000          @ -0?
245 blo 11f
246 cmp r0,#0xff800000          @ -NaN/-Inf?
2473:
248 orr r0,#0x00400000
249 bhi 10f
250 movlong r0,0xffc00000
25110:
252 bx r14
25311:
254 movlong r0,0xff800000
255 bx r14
256
257.align 3
258k_log3:
259.float 0.5
260.float 0.333333333333333
261.float 0.25
262.float 0                     @ alignment
263
264logtab3:
265@ u=64/[48:2:96]; u Q8, -log u F32
266.word 0x0155,0xbe92cb01      @ 00003e9b..00004145
267.word 0x0148,0xbe7dc8c3      @ 00003ec8..00004158
268.word 0x013b,0xbe545f68      @ 00003ec1..00004137
269.word 0x012f,0xbe2c99c7      @ 00003ebb..00004119
270.word 0x0125,0xbe0a3c2c      @ 00003ef3..0000413d
271.word 0x011a,0xbdc61a2f      @ 00003eca..000040fe
272.word 0x0111,0xbd83acc2      @ 00003eeb..0000410d
273.word 0x0108,0xbcfc14d8      @ 00003ee8..000040f8
274.word 0x0100,0x00000000      @ 00003f00..00004100
275.word 0x00f8,0x3d020aec      @ 00003ef8..000040e8
276.word 0x00f1,0x3d77518e      @ 00003f13..000040f5
277.word 0x00ea,0x3db80698      @ 00003f12..000040e6
278.word 0x00e4,0x3ded393b      @ 00003f3c..00004104
279.word 0x00dd,0x3e168b08      @ 00003f05..000040bf
280.word 0x00d8,0x3e2dfa03      @ 00003f48..000040f8
281.word 0x00d2,0x3e4ad2d7      @ 00003f2a..000040ce
282.word 0x00cd,0x3e637fde      @ 00003f43..000040dd
283.word 0x00c8,0x3e7cc8e3      @ 00003f48..000040d8
284.word 0x00c3,0x3e8b5ae6      @ 00003f39..000040bf
285.word 0x00bf,0x3e95f784      @ 00003f6b..000040e9
286.word 0x00ba,0x3ea38c6e      @ 00003f36..000040aa
287.word 0x00b6,0x3eaeadef      @ 00003f46..000040b2
288.word 0x00b2,0x3eba0ec4      @ 00003f46..000040aa
289.word 0x00ae,0x3ec5b1cd      @ 00003f36..00004092
290.word 0x00ab,0x3ece995f      @ 00003f75..000040cb
291
292float_wrapper_section fsin_fcos
293
29430:
295 lsls r1,r0,#9
296 bne 1f                      @ NaN? return it
297 orrs r0,r0,#0x80000000      @ Inf: make a NaN
2981:
299 orrs r0,r0,#0x00400000      @ set top mantissa bit of NaN
300 bx r14
301
302@ heavy-duty range reduction
303@ here x256, -e in r1
30440:
305 push {r4-r7,r14}
306 movs r3,#1
307 bfi r0,r3,#23,#9            @ insert implied 1 in mantissa, clear sign
308 rsb r1,#9                   @ e+9
309 mov r7,#0x7e                @ this will be the exponent of the reduced angle - 1
31042:
311 bl frr_core
312@ here r6 is revolutions Q32
313 lsrs r3,r6,#30              @ quadrant count
314 adcs r3,r3,#0               @ rounded
315 add r12,r12,r3
316 subs r6,r6,r3,lsl#30        @ reduced angle/2π Q32 -.125x<+.125
317@ comment out from here...
318 lsls r2,r6,#2               @ Q34
319 it cs
320 rsbcs r2,r2,#0              @ absolute value
321 cmp r2,#1<<28               @ big enough for accuracy?
322 bhs 41f
323@ ... to here for slightly better accuracy
32443:
325 adds r1,r1,#2               @ try again with increased exponent
326 bl frr_core
327 eors r2,r6,r6,asr#32        @ absolute value
328 adc r2,r2,#0
329 cmp r2,#1<<28               @ big enough yet?
330 bhs 44f
331 subs r7,r7,#2
332 bpl 43b                     @ safety net
33344:
334
33541:
336 ldr r4,=0xC90FDAA2          @ 2π Q29
337 umull r2,r4,r2,r4           @ r4 has reduced angle Q34+Q29-Q32=Q31
338@ add r4,r4,r2,lsr#31
339 clz r2,r4                   @ normalise
340 lsls r4,r4,r2
341 lsrs r4,r4,#8
342 sub r2,r7,r2
343 adc r0,r4,r2,lsl#23         @ with rounding
344 lsrs r1,r0,#23              @ re-extract exponent as there may have been a carry into it
345 rsbs r1,r1,#0x7f            @ prepare exponent for re-entry
346 lsrs r6,r6,#31
347 add r3,r0,r6,lsl#31         @ apply sign of reduced angle
348 pop {r4-r7,r14}
349 b 5f                        @ re-enter with no risk of looping
350
351.ltorg
352
353@ light-duty range reduction
354@ here argument1
355@ r0: argument
356@ r1: -e
357@ r12: quadrant count
358@ required result is sin(r0+r12*π/2)
35910:
360 cmn r1,#0x80
361 beq 30b                     @ Inf/NaN
362 bics r2,r0,r12,lsl#31       @ negative argument,doing sin -> +2 quadrants
363 it mi
364 addmi r12,r12,#2
365 bic r0,r0,#0x80000000       @ make positive: original sign is now captured in quadrant count in r12
366
367@ this may not actually be faster than doing it in integer registers
368 vmov s0,r0
369 adr r2,k_sc4
370 vldmia r2!,{s5-s7}
371@ vmul.f32 s4,s4,s0          @ this accurate calculation of the quadrant count does not seem necessary
372@ vfma.f32 s4,s5,s0
373 vmul.f32 s4,s5,s0           @ this is BALGE
374 cmn r1,#8                   @ ≥256?
375 vrintn.f32.f32 s4,s4        @ round to quadrant count: x<256 so count163
376 ble 40b                     @ then do heavy-duty range reduction
377 vfms.f32 s0,s4,s7
378 vfms.f32 s0,s4,s6
379 vmov r3,s0                  @ reduced angle
380 vcvt.s32.f32 s3,s4
381 ubfx r2,r3,#23,#8           @ get exponent
382 cmp r2,#0x78
383 blo 40b                     @ very small result? use heavy-duty reduction to get a more accurate answer
384 rsbs r1,r2,#0x7f            @ ready for re-entry
385 vmov r2,s3                  @ integer quadrant count
386 add r12,r12,r2
387@ prepare to re-enter with no risk of looping
388 b 5f
389
390k_sc4:
391@ 2/π=0.A2F9836E4E441529FC...
392.word 0x3f22f983             @ 2393@ π/2=1.921FB54442D1846989...
394.word 0xb695777a,0x3fc91000  @ these two add up to π/2 with error ~1.6e-13
395
396wrapper_func sincosf
397
398 push {r0-r2,r14}
399 ubfx r1,r0,#23,#8
400 cmp r1,#0xff                @ Inf/NaN?
401 beq 2f
402 bl cosf_entry               @ this will exit via 1f or 2f...
403 pop {r1-r2,r14}
404 str r0,[r14]
405@ here C is still set from lsrs r12,r12,#1
406 bcs 1f
407 mvns r1,r1
408 eor r12,r12,r1,lsr#31
409@ this is fsc_costail:
410@ here calculate cos φ+ε = cosθ
411 vmul.f32 s5,s7,s1           @ sinφ sinε
412 vfma.f32 s5,s2,s6           @ sinφ sinε + cosφ(1-cosε)
413 vsub.f32 s5,s6,s5           @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
414 vmov.f32 r0,s5
415 eor r0,r0,r12,lsl#31
416 str r0,[r2]
417 pop {r15}
418
4191:
420 eor r12,r12,r1,lsr#31
421@ this is fsc_sintail:
422@ here calculate sin φ+ε = sinθ
423 vmul.f32 s4,s2,s7           @ sinφ(1-cosε)
424 vfms.f32 s4,s6,s1           @ sinφ(1-cosε) - cosφ sinε
425 eor r1,r12,r3,lsr#31        @ flip sign if (reduced) argument was negative
426 vsub.f32 s4,s7,s4           @ cosφ sinε + sinφ cosε
427 vmov.f32 r0,s4
428 eor r0,r0,r1,lsl#31
429 str r0,[r2]                 @ save cos result
430 pop {r15}
431
432@ sincos of Inf or NaN
4332:
434 lsls r1,r0,#9
435 pop {r1-r3,r14}
436 bne 1f                      @ NaN? return it
437 orrs r0,r0,#0x80000000      @ Inf: make a NaN
4381:
439 orrs r0,r0,#0x00400000      @ set top mantissa bit of NaN
440 str r0,[r2]                 @ both sin and cos results
441 str r0,[r3]
442 bx r14
443
444wrapper_func sinf
445@ r12b1..0: quadrant count
446 movs r12,#0
447 b 1f
448
449wrapper_func cosf
450.thumb_func
451cosf_entry:
452 movs r12,#1                 @ cos -> +1 quadrant
4531:
454 ubfx r1,r0,#23,#8           @ get exponent
455 cbz r1,20f                  @ 0/denormal?
45622:
457 rsbs r1,r1,#0x7f
458 bls 10b                     @ argument1? needs reduction; also Inf/NaN handling
459 bic r3,r0,r12,lsl#31        @ this would mess up NaNs so do it here
4605:
461@ here we have a quadrant count in r12 and a signed offset r0 from r12*π/2
462 bic r0,r3,#0x80000000       @ this would mess up NaNs so do it here
463 vmov s0,r0
464 ubfx r0,r0,#18,#5           @ extract top of mantissa
465 adds r0,r0,#32              @ insert implied 1
466 lsrs r1,r0,r1               @ to fixed point Q5
467 ldr r2,=k_sc3
468 adcs r1,r1,#0               @ rounding
469 vldmia r2!,{s8-s9}
470 add r2,r2,r1,lsl#2          @ 12 bytes per entry
471 add r2,r2,r1,lsl#3
472
473 vldmia r2,{s5-s7}           @ φ, cosφ, sinφ
474 vsub.f32 s1,s0,s5           @ ε
475 vmul.f32 s2,s1,s1           @ ε²
476 lsrs r12,r12,#1             @ computing cosine?
477 vmul.f32 s3,s2,s1           @ ε³
478 bcs 2f
479
480 vmul.f32 s2,s2,s8           @ ε²/2! ~ 1-cosε
481 vmul.f32 s3,s3,s9           @ ε³/3!
482 vsub.f32 s1,s1,s3           @ ε-ε³/3! ~ sinε
483
484@ here:
485@ s1: sinε
486@ s2: 1-cosε
487@ s6: cosφ
488@ s7: sinφ
489@ r12: quadrant count
490fsc_sintail:
491@ here calculate sin φ+ε = sinθ
492 vmul.f32 s4,s2,s7           @ sinφ(1-cosε)
493 vfms.f32 s4,s6,s1           @ sinφ(1-cosε) - cosφ sinε
494 eor r1,r12,r3,lsr#31        @ flip sign if (reduced) argument was negative
495 vsub.f32 s4,s7,s4           @ cosφ sinε + sinφ cosε
496 vmov.f32 r0,s4
497 eor r0,r0,r1,lsl#31
498 bx r14
499
50020:
501 and r0,r0,#0x80000000       @ make signed zero
502 b 22b
503
504.align 2
5052:
506 vmul.f32 s3,s3,s9           @ ε³/3!
507 vsub.f32 s1,s1,s3           @ ε-ε³/3! ~ sinε
508 vmul.f32 s2,s2,s8           @ ε²/2! ~ 1-cosε
509fsc_costail:
510@ here calculate cos φ+ε = cosθ
511 vmul.f32 s5,s7,s1           @ sinφ sinε
512 vfma.f32 s5,s2,s6           @ sinφ sinε + cosφ(1-cosε)
513 vsub.f32 s5,s6,s5           @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
514 vmov.f32 r0,s5
515 eor r0,r0,r12,lsl#31
516 bx r14
517
518.align 3
519k_sc3:
520.word 0x3EFFFEC1             @ ~ 1/2! with PMC
521.word 0x3e2aaa25             @ ~ 1/3! with PMC
522
523trigtab2:
524//      φ          cos φ      sin φ
525.word 0x00000000,0x3f800000,0x00000000
526.word 0x3cfcc961,0x3f7fe0cd,0x3cfcbf1c           @ φ=0.03085774 : cos φ=3feffc199ff28ef4 33.3b; sin φ=3f9f97e38006c678 39.2b
527.word 0x3d810576,0x3f7f7dfe,0x3d80ef9e           @ φ=0.06299870 : cos φ=3fefefbfc00d6b6d 33.3b; sin φ=3fb01df3c000dfd5 40.2b
528.word 0x3dbf0c09,0x3f7ee30f,0x3dbec522           @ φ=0.09328467 : cos φ=3fefdc61dff4f58e 33.5b; sin φ=3fb7d8a43ffdf9ac 39.0b
529.word 0x3dff24b6,0x3f7e0414,0x3dfe7be2           @ φ=0.12458174 : cos φ=3fefc0827fdaf90f 31.8b; sin φ=3fbfcf7c3ff9dd0c 37.4b
530.word 0x3e1f0713,0x3f7ceb48,0x3e1e63a0           @ φ=0.15530042 : cos φ=3fef9d68ffe680a0 32.3b; sin φ=3fc3cc73fffa6d09 36.5b
531.word 0x3e40306d,0x3f7b811d,0x3e3f1015           @ φ=0.18768473 : cos φ=3fef70239fe32301 32.1b; sin φ=3fc7e2029ffdbc2c 37.8b
532.word 0x3e60ada2,0x3f79dccf,0x3e5ee13e           @ φ=0.21941236 : cos φ=3fef3b99e023f5aa 31.8b; sin φ=3fcbdc27bffe216d 38.1b
533.word 0x3e800d7b,0x3f7808fa,0x3e7d7196           @ φ=0.25010285 : cos φ=3fef011f401572a6 32.6b; sin φ=3fcfae32c00328bb 37.3b
534.word 0x3e8f986e,0x3f75ff65,0x3e8db868           @ φ=0.28045982 : cos φ=3feebfeca0aaaf99 29.6b; sin φ=3fd1b70cfffc1468 36.0b
535.word 0x3e9fe1f4,0x3f739e93,0x3e9d4bfd           @ φ=0.31227076 : cos φ=3fee73d25fbf733b 31.0b; sin φ=3fd3a97fa0002ced 40.5b
536.word 0x3eb054c6,0x3f70f7ae,0x3eacddb3           @ φ=0.34439677 : cos φ=3fee1ef5bfcf70cb 31.4b; sin φ=3fd59bb65fff5c30 38.6b
537.word 0x3ebf89c5,0x3f6e4b60,0x3ebb1a0a           @ φ=0.37409797 : cos φ=3fedc96bffdebb8a 31.9b; sin φ=3fd763414003344b 36.3b
538.word 0x3ecfc426,0x3f6b35ca,0x3eca1c63           @ φ=0.40579337 : cos φ=3fed66b93fe27dc6 32.1b; sin φ=3fd9438c5ffe5d45 37.3b
539.word 0x3ee054f2,0x3f67d166,0x3ed93907           @ φ=0.43814808 : cos φ=3fecfa2cbffc16e9 35.0b; sin φ=3fdb2720dffef5b6 37.9b
540.word 0x3eeff0dd,0x3f64664b,0x3ee74116           @ φ=0.46863452 : cos φ=3fec8cc95f714272 29.8b; sin φ=3fdce822c00479ad 35.8b
541.word 0x3f002b31,0x3f609488,0x3ef5c30f           @ φ=0.50065905 : cos φ=3fec1290ffc99208 31.2b; sin φ=3fdeb861dfff3932 38.4b
542.word 0x3f07e407,0x3f5cc5a2,0x3f01992b           @ φ=0.53082317 : cos φ=3feb98b44034cd46 31.3b; sin φ=3fe033255ffff628 41.7b
543.word 0x3f101fc5,0x3f587d8f,0x3f08a165           @ φ=0.56298476 : cos φ=3feb0fb1e0ceda6f 29.3b; sin φ=3fe1142c9ffd5ae4 35.6b
544.word 0x3f17f68a,0x3f5434b5,0x3f0f31ca           @ φ=0.59360564 : cos φ=3fea8696a038a06f 31.2b; sin φ=3fe1e639400269fb 35.7b
545.word 0x3f1fffe2,0x3f4f9b59,0x3f15c8d7           @ φ=0.62499821 : cos φ=3fe9f36b1f428363 29.4b; sin φ=3fe2b91ae001d55d 36.1b
546.word 0x3f280646,0x3f4acf6b,0x3f1c37c4           @ φ=0.65634573 : cos φ=3fe959ed61449f08 28.7b; sin φ=3fe386f87ffd9617 35.7b
547.word 0x3f303041,0x3f45b9e0,0x3f229ae4           @ φ=0.68823630 : cos φ=3fe8b73c0047ae7a 30.8b; sin φ=3fe4535c7ffdf1ac 36.0b
548.word 0x3f381da7,0x3f4098ca,0x3f28a620           @ φ=0.71920246 : cos φ=3fe81319402ae6e1 31.6b; sin φ=3fe514c3ffff423c 37.4b
549.word 0x3f3fc72f,0x3f3b76ac,0x3f2e564a           @ φ=0.74913305 : cos φ=3fe76ed5809d419f 29.7b; sin φ=3fe5cac93fffaf1d 38.7b
550.word 0x3f4813db,0x3f35b6cc,0x3f34526b           @ φ=0.78155297 : cos φ=3fe6b6d9800e8b52 33.1b; sin φ=3fe68a4d5ffe89fc 36.5b
551.word 0x3f4fc779,0x3f30352f,0x3f39b4d0           @ φ=0.81163746 : cos φ=3fe606a5dfdc2b5b 31.8b; sin φ=3fe73699fffd7fc8 35.7b
552.word 0x3f57dd52,0x3f2a4170,0x3f3f2d91           @ φ=0.84322083 : cos φ=3fe5482e011ba752 28.9b; sin φ=3fe7e5b21ffcb223 35.3b
553.word 0x3f5fce26,0x3f243e9f,0x3f445dc3           @ φ=0.87423933 : cos φ=3fe487d3e0b9864b 29.5b; sin φ=3fe88bb85ffde6d5 35.9b
554.word 0x3f6825f1,0x3f1dc250,0x3f499d1c           @ φ=0.90682894 : cos φ=3fe3b849ffea9b8f 32.6b; sin φ=3fe933a38002730d 35.7b
555.word 0x3f703be1,0x3f175041,0x3f4e7ebf           @ φ=0.93841368 : cos φ=3fe2ea0820791b4e 30.1b; sin φ=3fe9cfd7e0053e65 34.6b
556.word 0x3f781078,0x3f10ed71,0x3f5306af           @ φ=0.96900129 : cos φ=3fe21dae1fdea23e 31.9b; sin φ=3fea60d5e001b90b 36.2b
557.word 0x3f7ff4d4,0x3f0a5aa7,0x3f57649b           @ φ=0.99982953 : cos φ=3fe14b54deeaa407 28.9b; sin φ=3feaec9360012825 36.8b
558
559float_wrapper_section tanf
560
561wrapper_func tanf
562 push {r0,r14}
563 ubfx r1,r0,#23,#8
564 cmp r1,#0xff                @ Inf/NaN?
565 beq 2f
566 bl cosf_entry               @ this will exit via sintail or costail...
567 ldr r1,[sp,#0]
568@ here C is still set from lsrs r12,r12,#1
569 bcs 1f
570@ we exited via sintail
571@ this is fsc_costail:
572@ here calculate cos φ+ε = cosθ
573 vmul.f32 s5,s7,s1           @ sinφ sinε
574 vfma.f32 s5,s2,s6           @ sinφ sinε + cosφ(1-cosε)
575 eors r1,r1,r3
576 vsub.f32 s5,s6,s5           @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
577 vdiv.f32 s0,s5,s4
578 vmov.f32 r0,s0
579 it pl
580 eorpl r0,r0,#0x80000000
581 pop {r1,r15}
582
5831:
584@ we exited via costail
585@ this is fsc_sintail:
586@ here calculate sin φ+ε = sinθ
587 vmul.f32 s4,s2,s7           @ sinφ(1-cosε)
588 vfms.f32 s4,s6,s1           @ sinφ(1-cosε) - cosφ sinε
589 eors r1,r1,r3
590 vsub.f32 s4,s7,s4           @ cosφ sinε + sinφ cosε
591 vdiv.f32 s0,s4,s5
592 vmov.f32 r0,s0
593 it mi
594 eormi r0,r0,#0x80000000
595 pop {r1,r15}
596
597@ tan of Inf or NaN
5982:
599 lsls r1,r0,#9
600 bne 1f                      @ NaN? return it
601 orrs r0,r0,#0x80000000      @ Inf: make a NaN
6021:
603 orrs r0,r0,#0x00400000      @ set top mantissa bit of NaN
604 pop {r3,r15}
605
606
607float_wrapper_section atan2f
608
60950:
61060:
611 orrs r0,r1,#0x00400000
612 bx r14
613
61451:
615 bne 52f                     @ NaN?
616 cmp r3,#0x7f800000          @ y an infinity; x an infinity too?
617 bne 55f                     @ no: carry on
618@ here x and y are both infinities
619 b 66f
620
62152:
62262:
623 orrs r0,r0,#0x00400000
624 bx r14
625
62661:
627 bne 62b                     @ NaN?
628 cmp r3,#0x7f800000          @ y an infinity; x an infinity too?
629 bne 65f                     @ no: carry on
63066:
631@ here x and y are both infinities
632 subs r0,r0,#1               @ make both finite (and equal) with same sign and retry
633 subs r1,r1,#1
634 b 86f
635
63670:
637 and r3,#0x80000000
638 cmp r2,#0x00800000
639 bhs 72f                     @ y 0 or denormal?
640@ here both x and y are zeros
641 b 85f
64271:
643 and r2,#0x80000000
64472:
645 vmov s0,s1,r2,r3
646 vdiv.f32 s2,s0,s1           @ restart the division
647 b 73f                       @ and go back and check for NaNs
648
64980:
650 and r3,#0x80000000
651 cmp r2,#0x00800000
652 bhs 82f                     @ y 0 or denormal?
65385:
654@ here both x and y are zeros
655 orr r1,r1,0x3f800000        @ retry with x replaced by ~1 with appropriate sign
656 b 86f
657
65881:
659 and r2,#0x80000000
66082:
661 vmov s0,s1,r2,r3
662 vdiv.f32 s2,s1,s0           @ restart the division
663 b 83f                       @ and go back and check for NaNs
664
665wrapper_func atan2f
66686:
667 bic r2,r0,#0x80000000
668 bic r3,r1,#0x80000000
669 vmov s0,s1,r2,r3
670 cmp r2,r3                   @ |y| vs. |x|
671 bhi 1f
672@ here |x|≥|y| so we need |y|/|x|; octant/xs/ys: 0++,3-+,4--,7+-
673 vdiv.f32 s2,s0,s1           @ get this division started; result1
674 cmp r3,#0x00800000
675 blo 70b                     @ x 0 or denormal?
676 cmp r2,#0x00800000
677 blo 71b                     @ y 0 or denormal?
67873:
679 cmp r3,#0x7f800000
680 bhi 50b                     @ x NaN?
681 cmp r2,#0x7f800000
682 bhs 51b                     @ y Inf or NaN?
68355:
684 cmp r1,#0
685 ite mi
686 ldrmi r12,pi                @ if x<0, need two extra quadrants
687 movpl r12,#0
688                             @ inner negation is the sign of x
689 b 2f
690
6911:
692@ here |x|<|y| so we need |x|/|y|; octant/xs/ys: 1++,2-+,5--,6+-
693 vdiv.f32 s2,s1,s0           @ result <1
694 cmp r3,#0x00800000
695 blo 80b                     @ x 0 or denormal?
696 cmp r2,#0x00800000
697 blo 81b                     @ y 0 or denormal?
69883:
699 cmp r3,#0x7f800000
700 bhi 60b                     @ x NaN?
701 cmp r2,#0x7f800000
702 bhs 61b                     @ y Inf or NaN?
70365:
704 ldr r12,piover2             @ always one extra quadrant in this path
705 eors r1,r1,#0x80000000      @ inner negation is the complement of the sign of x
706
7072:
708@ here
709@ r0 y
710@ r1 ±x
711@ r2 |y|
712@ r3 |x|
713@ s0,s1 = |x|,|y|
714@ s2=s0/s1 or s1/s0, 0≤s21
715@ r12=quadrant count * π/2
716@ where the final result is
717@ ± (r12 ± atn s2) where the inner negation is given by r1b31 and the outer negation by r0b31
718
719 adr r2,trigtab3
720 vmov.f32 s3,s2
721 vcvt.u32.f32 s3,s3,#6
722 vmov.f32 r3,s3
723 lsrs r3,r3,#1
724 adcs r3,r3,#0               @ rounding; set Z if in φ==0 case
725 add r2,r2,r3,lsl#3
726 vldr s5,[r2,#4]             @ t=tanφ
727 vmul.f32 s0,s5,s2           @ ty
728 vsub.f32 s1,s2,s5           @ y-t
729 vmov.f32 s5,#1.0
730 vadd.f32 s0,s5,s0           @ 1+ty
731 beq 9f                      @ did we look up zeroth table entry?
732
733@ now (s0,s1) = (x,y)
734 vdiv.f32 s0,s1,s0           @ ε
735 ldr r2,[r2]                 @ φ Q29
736@ result is now ±(r12±(r2+atn(s0))
737 cmp r1,#0                   @ inner negation
738 it mi
739 rsbmi r2,r2,#0
740 add r2,r12,r2               @ Q29
741 cmp r0,#0                   @ outer negation
742 it mi
743 rsbmi r2,r2,#0
744 cmp r2,#0
745 bpl 1f
746 rsbs r2,r2,#0
747 clz r3,r2
748 lsls r2,r2,r3
749 beq 3f
750 rsb r3,#0x180
751 b 2f
7521:
753 clz r3,r2
754 lsls r2,r2,r3
755 beq 3f
756 rsb r3,#0x80
7572:
758 lsrs r2,r2,#8               @ rounding bit to carry
759 adc r2,r2,r3,lsl#23         @ with rounding
7603:
761 vmul.f32 s2,s0,s0           @ ε²
762 vldr.f32 s3,onethird
763 vmul.f32 s2,s2,s0           @ ε³
764 teq r0,r1
765 vmul.f32 s2,s2,s3           @ ε³/3
766 vmov.f32 s4,r2
767 vsub.f32 s0,s0,s2           @ ~atn(ε)
768 ite pl
769 vaddpl.f32 s0,s4,s0
770 vsubmi.f32 s0,s4,s0
771 vmov.f32 r0,s0
772 bx r14
773
7749:                           @ we looked up the zeroth table entry; we could generate slightly more accurate results here
775@ now (s0,s1) = (x,y)
776 vdiv.f32 s0,s1,s0           @ ε
777@ result is now ±(r12±(0+atn(s0))
778 mov r2,r12                  @ Q29; in fact r12 is only ±π/2 or ±π so can probably simplify this
779 cmp r0,#0                   @ outer negation
780 it mi
781 rsbmi r2,r2,#0
782 cmp r2,#0
783 bpl 1f
784 rsbs r2,r2,#0
785 clz r3,r2
786 lsls r2,r2,r3
787 beq 3f
788 rsb r3,#0x180
789 b 2f
7901:
791 clz r3,r2
792 lsls r2,r2,r3
793 beq 3f
794 rsb r3,#0x80
7952:
796 lsrs r2,r2,#8               @ rounding bit to carry
797 adc r2,r2,r3,lsl#23         @ with rounding
7983:
799 vmul.f32 s2,s0,s0           @ ε²
800 vldr.f32 s3,onethird
801 vmul.f32 s2,s2,s0           @ ε³
802 teq r0,r1
803 vmul.f32 s2,s2,s3           @ ε³/3
804 vmov.f32 s4,r2
805 vsub.f32 s0,s0,s2           @ ~atn(ε)
806 ite pl
807 vaddpl.f32 s0,s4,s0
808 vsubmi.f32 s0,s4,s0
809 vmov.f32 r0,s0
810 tst r0,#0x7f800000          @ about to return a denormal?
811 it ne
812 bxne r14
813 and r0,r0,#0x80000000       @ make it zero
814 bx r14
815
816piover2:  .word 0x3243f6a9   @ Q29
817pi:       .word 0x6487ed51   @ Q29
818onethird: .float 0.33333333
819
820trigtab3:
821//      φ Q29      tan φ SP
822.word 0x00000000,0x00000000
823.word 0x00ffee23,0x3d0001bb  @ φ=0.03124148 : tan φ=3fa000375fffff9d 50.4b
824.word 0x01fe88dc,0x3d7f992a  @ φ=0.06232112 : tan φ=3faff3253fffea1f 44.5b
825.word 0x02fe0a70,0x3dc01203  @ φ=0.09351084 : tan φ=3fb8024060002522 42.8b
826.word 0x03fad228,0x3e000368  @ φ=0.12436779 : tan φ=3fc0006cfffffc90 45.2b
827.word 0x04f5ab70,0x3e1ffdea  @ φ=0.15498897 : tan φ=3fc3ffbd400014d5 42.6b
828.word 0x05ed56f8,0x3e3fdddc  @ φ=0.18522213 : tan φ=3fc7fbbb80000beb 43.4b
829.word 0x06e4cfa0,0x3e601425  @ φ=0.21543103 : tan φ=3fcc02849fffe817 42.4b
830.word 0x07d8d3e0,0x3e80215d  @ φ=0.24521822 : tan φ=3fd0042b9ffff89f 43.1b
831.word 0x08c60460,0x3e9000a5  @ φ=0.27417201 : tan φ=3fd20014a000182b 41.4b
832.word 0x09b26770,0x3ea01492  @ φ=0.30302784 : tan φ=3fd402923ffff932 43.2b
833.word 0x0a996d50,0x3eb01377  @ φ=0.33122888 : tan φ=3fd6026ee0001062 42.0b
834.word 0x0b7a6d10,0x3ebff4a0  @ φ=0.35869458 : tan φ=3fd7fe93ffff8c38 39.1b
835.word 0x0c593ce0,0x3ed0019f  @ φ=0.38589329 : tan φ=3fda0033e0001354 41.7b
836.word 0x0d33ebd0,0x3ee01bbc  @ φ=0.41258803 : tan φ=3fdc0377800162a1 37.5b
837.word 0x0e087ab0,0x3ef01fbd  @ φ=0.43853506 : tan φ=3fde03f79fffddf2 40.9b
838.word 0x0ed56180,0x3effef98  @ φ=0.46354747 : tan φ=3fdffdf30000767d 39.1b
839.word 0x0fa1de80,0x3f080ebf  @ φ=0.48850942 : tan φ=3fe101d7dfffb9fc 38.9b
840.word 0x10639d00,0x3f0fec31  @ φ=0.51215982 : tan φ=3fe1fd862000aad5 37.6b
841.word 0x112690e0,0x3f180cfd  @ φ=0.53595775 : tan φ=3fe3019fa00069ea 38.3b
842.word 0x11e014c0,0x3f200065  @ φ=0.55860364 : tan φ=3fe4000ca00022e5 39.9b
843.word 0x129651e0,0x3f2808be  @ φ=0.58084959 : tan φ=3fe50117c00015a7 40.6b
844.word 0x1346d400,0x3f300a7d  @ φ=0.60239601 : tan φ=3fe6014f9fffa020 38.4b
845.word 0x13efc7c0,0x3f37ee2f  @ φ=0.62302005 : tan φ=3fe6fdc5dfff98d7 38.3b
846.word 0x14988960,0x3f400c32  @ φ=0.64362019 : tan φ=3fe801863fffff81 46.0b
847.word 0x1537a8c0,0x3f47ef42  @ φ=0.66304433 : tan φ=3fe8fde8400062a4 38.4b
848.word 0x15d4cc60,0x3f4ff630  @ φ=0.68222636 : tan φ=3fe9fec5ffff76e2 37.9b
849.word 0x166ef280,0x3f581534  @ φ=0.70104337 : tan φ=3feb02a680004e91 38.7b
850.word 0x16ff75c0,0x3f5fef1e  @ φ=0.71868408 : tan φ=3febfde3c0001404 40.7b
851.word 0x179116a0,0x3f68184d  @ φ=0.73646098 : tan φ=3fed03099ffed6e5 36.8b
852.word 0x181b5aa0,0x3f701722  @ φ=0.75333911 : tan φ=3fee02e43fffd351 39.5b
853.word 0x18a10560,0x3f781071  @ φ=0.76965588 : tan φ=3fef020e20005c05 38.5b
854.word 0x19214060,0x3f7ff451  @ φ=0.78530902 : tan φ=3feffe8a1fffe11b 40.1b
855
856#endif
857