1/* 2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd. 3 * 4 * SPDX-License-Identifier: BSD-3-Clause 5 */ 6 7#if !PICO_RP2040 8#include "pico/asm_helper.S" 9 10pico_default_asm_setup 11 12.macro float_section name 13#if PICO_FLOAT_IN_RAM 14.section RAM_SECTION_NAME(\name), "ax" 15#else 16.section SECTION_NAME(\name), "ax" 17#endif 18.endm 19 20.macro float_wrapper_section func 21float_section WRAPPER_FUNC_NAME(\func) 22.endm 23 24@ load a 32-bit constant n into register rx 25.macro movlong rx,n 26 movw \rx,#(\n)&0xffff 27 movt \rx,#((\n)>>16)&0xffff 28.endm 29 30float_section frrcore_v 31 32// 1/2π to plenty of accuracy 33.long 0 @ this allows values of e down to -32 34rtwopi: 35.long 0,0 36.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410 37 38@ input: 39@ r0 mantissa m Q23 40@ r1 exponent e>=-32, typically offset by +9 41@ output: 42@ r0..r1 preserved 43@ r6 range reduced result in revolutions Q32 44@ r2,r3,r4,r5 trashed 45.thumb_func 46frr_core: 47 adr r2,rtwopi 48 asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32 49 add r2,r2,r3,lsl#2 @ p 50 and r3,r1,#31 @ s=e%32 51 mov r4,#1 52 lsls r4,r4,r3 @ 1<<s 53 umull r3,r4,r4,r0 54@ r2 p 55@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23 56 ldr r5,[r2,#12] @ a0=p[3] 57 umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo 58@ r6 r0 59 ldr r5,[r2,#8] @ a1=p[2] 60 mla r6,r5,r4,r6 @ a1*u1 lo, discard hi 61 umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo 62@ r6 r0 63 ldr r5,[r2,#4] @ a2=p[1] 64 mla r6,r5,r3,r6 @ r0+=a2*u0 65 bx r14 66 67float_wrapper_section expf 68 69wrapper_func expf 70@ soft float version, via 2^x 71 asrs r1,r0,#23 72 bmi 1f 73 cmp r1,#0x85 74 bge 3f 7510: 76 movs r2,#1 77 bfi r0,r2,#23,#9 78 subs r1,#0x7e 79 bmi 2f 80 lsl r0,r1 @ x Q24 8111: 82 movlong r3,0x5c551d95 @ 1/log(2) Q30 83 smull r0,r1,r0,r3 @ Q54 84 adr r2,k_exp2 85 vldmia r2,{s8-s10} 86 lsrs r2,r0,#22 87 bfi r2,r1,#10,#17 @ ε Q32 88 vmov s0,r2 89 vcvt.f32.u32 s0,s0,#32 90 vmul.f32 s1,s0,s0 91 ubfx r2,r1,#17,#5 92 vmul.f32 s4,s0,s8 93 adr r3,exptab3 94 vmul.f32 s2,s0,s1 95 ldr r2,[r3,r2,lsl#2] 96 vmla.f32 s4,s1,s9 97 asrs r1,#22 98 vmla.f32 s4,s2,s10 99 add r2,r2,r1,lsl#23 100 vmov s0,r2 101 vmla.f32 s0,s0,s4 102 vmov r0,s0 103 bx r14 104 1052: @ x≤0.5 106 rsbs r1,#0 107 lsrs r0,r1 108@ adc r0,#0 @ rounding not needed 109 b 11b 110 1113: @ risk of overflow, Inf,NaN 112 movlong r2,0x42B17218 113 cmp r0,r2 114 blo 10b @ in range after all 115 cmp r0,#0x7f800000 116 bls 4f @ not NaN? 117 orrs r0,#0x00400000 118 bx r14 119 1204: 121 movlong r0,0x7f800000 @ return +Inf 122 bx r14 123 1241: @ x<0, r1=0xffffffXX where XX is biased exponent 125 cmn r1,#0x7b 126 bge 5f @ risk of underflow, -Inf, -NaN? 12713: 128 movs r2,#1 129 bfi r0,r2,#23,#9 130 adds r1,#0x82 131 bpl 6f 132 rsbs r1,#0 133 lsrs r0,r1 134 adc r0,r0,#0 @ rounding 13512: 136 rsbs r0,#0 137 b 11b 138 1396: 140 lsls r0,r1 141 b 12b 142 1435: 144 movlong r2,0xC2AEAC4F 145 cmp r0,r2 146 bls 13b 147 cmp r0,#0xff800000 148 bls 14f 149 orrs r0,#0x00400000 150 bx r14 15114: 152 mov r0,#0 153 bx r14 154 155.align 3 156k_exp2: 157.float 0.693147181 @ log2 158.float 0.240226507 @ log²2/2 159.float 0.055504109 @ log³2/6 160 161exptab3: @ pow(2,[0..31]/32) 162.word 0x3f800000 163.word 0x3f82cd87 164.word 0x3f85aac3 165.word 0x3f88980f 166.word 0x3f8b95c2 167.word 0x3f8ea43a 168.word 0x3f91c3d3 169.word 0x3f94f4f0 170.word 0x3f9837f0 171.word 0x3f9b8d3a 172.word 0x3f9ef532 173.word 0x3fa27043 174.word 0x3fa5fed7 175.word 0x3fa9a15b 176.word 0x3fad583f 177.word 0x3fb123f6 178.word 0x3fb504f3 179.word 0x3fb8fbaf 180.word 0x3fbd08a4 181.word 0x3fc12c4d 182.word 0x3fc5672a 183.word 0x3fc9b9be 184.word 0x3fce248c 185.word 0x3fd2a81e 186.word 0x3fd744fd 187.word 0x3fdbfbb8 188.word 0x3fe0ccdf 189.word 0x3fe5b907 190.word 0x3feac0c7 191.word 0x3fefe4ba 192.word 0x3ff5257d 193.word 0x3ffa83b3 194 195float_wrapper_section logf 196 197wrapper_func logf 198 cmp r0,#0x7f800000 @ catch Inf, NaN, -ve 199 bhs 1f 200 asrs r1,r0,#23 @ get exponent; C from here is preserved... 201 beq 2f @ ±0? 202 mov r2,#1 203 bfi r0,r2,#23,#9 @ fix implied 1 204 it cc @ 50% ... to here... 205 lslcc r0,#1 @ this plus sbc below means we work relative to nearby power of 2 206 adr r3,#k_log3 207 vldmia r3,{s8-s10} 208@ 0x00c00000 ≤ r0 < 0x017fffff 209 adr r3,logtab3-24*8+4 210 add r3,r3,r0,lsr#16 @ look up r0>>19 rounded, preserving flags 211 bic r3,#7 212 213 ldrd r2,r3,[r3] 214 mul r0,r0,r2 @ ε 215 vmov s0,s1,r3,r0 @ s0=-log u, s1=ε 216 217 vcvt.f32.s32 s1,s1,#32 218 vmul.f32 s2,s1,s1 @ power series in ε 219 sbc r1,r1,#0x7e @ ... and here 220 vmul.f32 s3,s1,s2 221 lsls r1,#23 @ e Q23 222 vmul.f32 s4,s2,s2 @ to ε⁴ 223@ movlong r2,0x58b90bfc @ log 2 Q31, more accurate than we deserve 224 movw r2,0x0bfc 225 vmul.f32 s2,s2,s8 226 movt r2,0x58b9 227 vmul.f32 s3,s3,s9 228 smmulr r1,r1,r2 @ Q22 229 vmul.f32 s4,s4,s10 230 vmov s7,r1 231 vsub.f32 s3,s3,s4 232 vcvt.f32.s32 s7,s7,#22 233 vsub.f32 s2,s2,s3 234 vsub.f32 s1,s1,s2 235 vadd.f32 s0,s0,s1 @ log ε - log u 236 vadd.f32 s0,s0,s7 @ e log 2 + log ε - log u 237 vmov r0,s0 238 bx r14 239 2401: 241 bgt 3f @ +NaN? 242 beq 10f @ +Inf? 2432: 244 cmp r0,#0x80800000 @ -0? 245 blo 11f 246 cmp r0,#0xff800000 @ -NaN/-Inf? 2473: 248 orr r0,#0x00400000 249 bhi 10f 250 movlong r0,0xffc00000 25110: 252 bx r14 25311: 254 movlong r0,0xff800000 255 bx r14 256 257.align 3 258k_log3: 259.float 0.5 260.float 0.333333333333333 261.float 0.25 262.float 0 @ alignment 263 264logtab3: 265@ u=64/[48:2:96]; u Q8, -log u F32 266.word 0x0155,0xbe92cb01 @ 00003e9b..00004145 267.word 0x0148,0xbe7dc8c3 @ 00003ec8..00004158 268.word 0x013b,0xbe545f68 @ 00003ec1..00004137 269.word 0x012f,0xbe2c99c7 @ 00003ebb..00004119 270.word 0x0125,0xbe0a3c2c @ 00003ef3..0000413d 271.word 0x011a,0xbdc61a2f @ 00003eca..000040fe 272.word 0x0111,0xbd83acc2 @ 00003eeb..0000410d 273.word 0x0108,0xbcfc14d8 @ 00003ee8..000040f8 274.word 0x0100,0x00000000 @ 00003f00..00004100 275.word 0x00f8,0x3d020aec @ 00003ef8..000040e8 276.word 0x00f1,0x3d77518e @ 00003f13..000040f5 277.word 0x00ea,0x3db80698 @ 00003f12..000040e6 278.word 0x00e4,0x3ded393b @ 00003f3c..00004104 279.word 0x00dd,0x3e168b08 @ 00003f05..000040bf 280.word 0x00d8,0x3e2dfa03 @ 00003f48..000040f8 281.word 0x00d2,0x3e4ad2d7 @ 00003f2a..000040ce 282.word 0x00cd,0x3e637fde @ 00003f43..000040dd 283.word 0x00c8,0x3e7cc8e3 @ 00003f48..000040d8 284.word 0x00c3,0x3e8b5ae6 @ 00003f39..000040bf 285.word 0x00bf,0x3e95f784 @ 00003f6b..000040e9 286.word 0x00ba,0x3ea38c6e @ 00003f36..000040aa 287.word 0x00b6,0x3eaeadef @ 00003f46..000040b2 288.word 0x00b2,0x3eba0ec4 @ 00003f46..000040aa 289.word 0x00ae,0x3ec5b1cd @ 00003f36..00004092 290.word 0x00ab,0x3ece995f @ 00003f75..000040cb 291 292float_wrapper_section fsin_fcos 293 29430: 295 lsls r1,r0,#9 296 bne 1f @ NaN? return it 297 orrs r0,r0,#0x80000000 @ Inf: make a NaN 2981: 299 orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN 300 bx r14 301 302@ heavy-duty range reduction 303@ here x≥256, -e in r1 30440: 305 push {r4-r7,r14} 306 movs r3,#1 307 bfi r0,r3,#23,#9 @ insert implied 1 in mantissa, clear sign 308 rsb r1,#9 @ e+9 309 mov r7,#0x7e @ this will be the exponent of the reduced angle - 1 31042: 311 bl frr_core 312@ here r6 is revolutions Q32 313 lsrs r3,r6,#30 @ quadrant count 314 adcs r3,r3,#0 @ rounded 315 add r12,r12,r3 316 subs r6,r6,r3,lsl#30 @ reduced angle/2π Q32 -.125≤x<+.125 317@ comment out from here... 318 lsls r2,r6,#2 @ Q34 319 it cs 320 rsbcs r2,r2,#0 @ absolute value 321 cmp r2,#1<<28 @ big enough for accuracy? 322 bhs 41f 323@ ... to here for slightly better accuracy 32443: 325 adds r1,r1,#2 @ try again with increased exponent 326 bl frr_core 327 eors r2,r6,r6,asr#32 @ absolute value 328 adc r2,r2,#0 329 cmp r2,#1<<28 @ big enough yet? 330 bhs 44f 331 subs r7,r7,#2 332 bpl 43b @ safety net 33344: 334 33541: 336 ldr r4,=0xC90FDAA2 @ 2π Q29 337 umull r2,r4,r2,r4 @ r4 has reduced angle Q34+Q29-Q32=Q31 338@ add r4,r4,r2,lsr#31 339 clz r2,r4 @ normalise 340 lsls r4,r4,r2 341 lsrs r4,r4,#8 342 sub r2,r7,r2 343 adc r0,r4,r2,lsl#23 @ with rounding 344 lsrs r1,r0,#23 @ re-extract exponent as there may have been a carry into it 345 rsbs r1,r1,#0x7f @ prepare exponent for re-entry 346 lsrs r6,r6,#31 347 add r3,r0,r6,lsl#31 @ apply sign of reduced angle 348 pop {r4-r7,r14} 349 b 5f @ re-enter with no risk of looping 350 351.ltorg 352 353@ light-duty range reduction 354@ here argument ≥1 355@ r0: argument 356@ r1: -e 357@ r12: quadrant count 358@ required result is sin(r0+r12*π/2) 35910: 360 cmn r1,#0x80 361 beq 30b @ Inf/NaN 362 bics r2,r0,r12,lsl#31 @ negative argument,doing sin -> +2 quadrants 363 it mi 364 addmi r12,r12,#2 365 bic r0,r0,#0x80000000 @ make positive: original sign is now captured in quadrant count in r12 366 367@ this may not actually be faster than doing it in integer registers 368 vmov s0,r0 369 adr r2,k_sc4 370 vldmia r2!,{s5-s7} 371@ vmul.f32 s4,s4,s0 @ this accurate calculation of the quadrant count does not seem necessary 372@ vfma.f32 s4,s5,s0 373 vmul.f32 s4,s5,s0 @ this is BALGE 374 cmn r1,#8 @ ≥256? 375 vrintn.f32.f32 s4,s4 @ round to quadrant count: x<256 so count≤163 376 ble 40b @ then do heavy-duty range reduction 377 vfms.f32 s0,s4,s7 378 vfms.f32 s0,s4,s6 379 vmov r3,s0 @ reduced angle 380 vcvt.s32.f32 s3,s4 381 ubfx r2,r3,#23,#8 @ get exponent 382 cmp r2,#0x78 383 blo 40b @ very small result? use heavy-duty reduction to get a more accurate answer 384 rsbs r1,r2,#0x7f @ ready for re-entry 385 vmov r2,s3 @ integer quadrant count 386 add r12,r12,r2 387@ prepare to re-enter with no risk of looping 388 b 5f 389 390k_sc4: 391@ 2/π=0.A2F9836E4E441529FC... 392.word 0x3f22f983 @ 2/π 393@ π/2=1.921FB54442D1846989... 394.word 0xb695777a,0x3fc91000 @ these two add up to π/2 with error ~1.6e-13 395 396wrapper_func sincosf 397 398 push {r0-r2,r14} 399 ubfx r1,r0,#23,#8 400 cmp r1,#0xff @ Inf/NaN? 401 beq 2f 402 bl cosf_entry @ this will exit via 1f or 2f... 403 pop {r1-r2,r14} 404 str r0,[r14] 405@ here C is still set from lsrs r12,r12,#1 406 bcs 1f 407 mvns r1,r1 408 eor r12,r12,r1,lsr#31 409@ this is fsc_costail: 410@ here calculate cos φ+ε = cosθ 411 vmul.f32 s5,s7,s1 @ sinφ sinε 412 vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε) 413 vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε 414 vmov.f32 r0,s5 415 eor r0,r0,r12,lsl#31 416 str r0,[r2] 417 pop {r15} 418 4191: 420 eor r12,r12,r1,lsr#31 421@ this is fsc_sintail: 422@ here calculate sin φ+ε = sinθ 423 vmul.f32 s4,s2,s7 @ sinφ(1-cosε) 424 vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε 425 eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative 426 vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε 427 vmov.f32 r0,s4 428 eor r0,r0,r1,lsl#31 429 str r0,[r2] @ save cos result 430 pop {r15} 431 432@ sincos of Inf or NaN 4332: 434 lsls r1,r0,#9 435 pop {r1-r3,r14} 436 bne 1f @ NaN? return it 437 orrs r0,r0,#0x80000000 @ Inf: make a NaN 4381: 439 orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN 440 str r0,[r2] @ both sin and cos results 441 str r0,[r3] 442 bx r14 443 444wrapper_func sinf 445@ r12b1..0: quadrant count 446 movs r12,#0 447 b 1f 448 449wrapper_func cosf 450.thumb_func 451cosf_entry: 452 movs r12,#1 @ cos -> +1 quadrant 4531: 454 ubfx r1,r0,#23,#8 @ get exponent 455 cbz r1,20f @ 0/denormal? 45622: 457 rsbs r1,r1,#0x7f 458 bls 10b @ argument ≥1? needs reduction; also Inf/NaN handling 459 bic r3,r0,r12,lsl#31 @ this would mess up NaNs so do it here 4605: 461@ here we have a quadrant count in r12 and a signed offset r0 from r12*π/2 462 bic r0,r3,#0x80000000 @ this would mess up NaNs so do it here 463 vmov s0,r0 464 ubfx r0,r0,#18,#5 @ extract top of mantissa 465 adds r0,r0,#32 @ insert implied 1 466 lsrs r1,r0,r1 @ to fixed point Q5 467 ldr r2,=k_sc3 468 adcs r1,r1,#0 @ rounding 469 vldmia r2!,{s8-s9} 470 add r2,r2,r1,lsl#2 @ 12 bytes per entry 471 add r2,r2,r1,lsl#3 472 473 vldmia r2,{s5-s7} @ φ, cosφ, sinφ 474 vsub.f32 s1,s0,s5 @ ε 475 vmul.f32 s2,s1,s1 @ ε² 476 lsrs r12,r12,#1 @ computing cosine? 477 vmul.f32 s3,s2,s1 @ ε³ 478 bcs 2f 479 480 vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε 481 vmul.f32 s3,s3,s9 @ ε³/3! 482 vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε 483 484@ here: 485@ s1: sinε 486@ s2: 1-cosε 487@ s6: cosφ 488@ s7: sinφ 489@ r12: quadrant count 490fsc_sintail: 491@ here calculate sin φ+ε = sinθ 492 vmul.f32 s4,s2,s7 @ sinφ(1-cosε) 493 vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε 494 eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative 495 vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε 496 vmov.f32 r0,s4 497 eor r0,r0,r1,lsl#31 498 bx r14 499 50020: 501 and r0,r0,#0x80000000 @ make signed zero 502 b 22b 503 504.align 2 5052: 506 vmul.f32 s3,s3,s9 @ ε³/3! 507 vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε 508 vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε 509fsc_costail: 510@ here calculate cos φ+ε = cosθ 511 vmul.f32 s5,s7,s1 @ sinφ sinε 512 vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε) 513 vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε 514 vmov.f32 r0,s5 515 eor r0,r0,r12,lsl#31 516 bx r14 517 518.align 3 519k_sc3: 520.word 0x3EFFFEC1 @ ~ 1/2! with PMC 521.word 0x3e2aaa25 @ ~ 1/3! with PMC 522 523trigtab2: 524// φ cos φ sin φ 525.word 0x00000000,0x3f800000,0x00000000 526.word 0x3cfcc961,0x3f7fe0cd,0x3cfcbf1c @ φ=0.03085774 : cos φ=3feffc199ff28ef4 33.3b; sin φ=3f9f97e38006c678 39.2b 527.word 0x3d810576,0x3f7f7dfe,0x3d80ef9e @ φ=0.06299870 : cos φ=3fefefbfc00d6b6d 33.3b; sin φ=3fb01df3c000dfd5 40.2b 528.word 0x3dbf0c09,0x3f7ee30f,0x3dbec522 @ φ=0.09328467 : cos φ=3fefdc61dff4f58e 33.5b; sin φ=3fb7d8a43ffdf9ac 39.0b 529.word 0x3dff24b6,0x3f7e0414,0x3dfe7be2 @ φ=0.12458174 : cos φ=3fefc0827fdaf90f 31.8b; sin φ=3fbfcf7c3ff9dd0c 37.4b 530.word 0x3e1f0713,0x3f7ceb48,0x3e1e63a0 @ φ=0.15530042 : cos φ=3fef9d68ffe680a0 32.3b; sin φ=3fc3cc73fffa6d09 36.5b 531.word 0x3e40306d,0x3f7b811d,0x3e3f1015 @ φ=0.18768473 : cos φ=3fef70239fe32301 32.1b; sin φ=3fc7e2029ffdbc2c 37.8b 532.word 0x3e60ada2,0x3f79dccf,0x3e5ee13e @ φ=0.21941236 : cos φ=3fef3b99e023f5aa 31.8b; sin φ=3fcbdc27bffe216d 38.1b 533.word 0x3e800d7b,0x3f7808fa,0x3e7d7196 @ φ=0.25010285 : cos φ=3fef011f401572a6 32.6b; sin φ=3fcfae32c00328bb 37.3b 534.word 0x3e8f986e,0x3f75ff65,0x3e8db868 @ φ=0.28045982 : cos φ=3feebfeca0aaaf99 29.6b; sin φ=3fd1b70cfffc1468 36.0b 535.word 0x3e9fe1f4,0x3f739e93,0x3e9d4bfd @ φ=0.31227076 : cos φ=3fee73d25fbf733b 31.0b; sin φ=3fd3a97fa0002ced 40.5b 536.word 0x3eb054c6,0x3f70f7ae,0x3eacddb3 @ φ=0.34439677 : cos φ=3fee1ef5bfcf70cb 31.4b; sin φ=3fd59bb65fff5c30 38.6b 537.word 0x3ebf89c5,0x3f6e4b60,0x3ebb1a0a @ φ=0.37409797 : cos φ=3fedc96bffdebb8a 31.9b; sin φ=3fd763414003344b 36.3b 538.word 0x3ecfc426,0x3f6b35ca,0x3eca1c63 @ φ=0.40579337 : cos φ=3fed66b93fe27dc6 32.1b; sin φ=3fd9438c5ffe5d45 37.3b 539.word 0x3ee054f2,0x3f67d166,0x3ed93907 @ φ=0.43814808 : cos φ=3fecfa2cbffc16e9 35.0b; sin φ=3fdb2720dffef5b6 37.9b 540.word 0x3eeff0dd,0x3f64664b,0x3ee74116 @ φ=0.46863452 : cos φ=3fec8cc95f714272 29.8b; sin φ=3fdce822c00479ad 35.8b 541.word 0x3f002b31,0x3f609488,0x3ef5c30f @ φ=0.50065905 : cos φ=3fec1290ffc99208 31.2b; sin φ=3fdeb861dfff3932 38.4b 542.word 0x3f07e407,0x3f5cc5a2,0x3f01992b @ φ=0.53082317 : cos φ=3feb98b44034cd46 31.3b; sin φ=3fe033255ffff628 41.7b 543.word 0x3f101fc5,0x3f587d8f,0x3f08a165 @ φ=0.56298476 : cos φ=3feb0fb1e0ceda6f 29.3b; sin φ=3fe1142c9ffd5ae4 35.6b 544.word 0x3f17f68a,0x3f5434b5,0x3f0f31ca @ φ=0.59360564 : cos φ=3fea8696a038a06f 31.2b; sin φ=3fe1e639400269fb 35.7b 545.word 0x3f1fffe2,0x3f4f9b59,0x3f15c8d7 @ φ=0.62499821 : cos φ=3fe9f36b1f428363 29.4b; sin φ=3fe2b91ae001d55d 36.1b 546.word 0x3f280646,0x3f4acf6b,0x3f1c37c4 @ φ=0.65634573 : cos φ=3fe959ed61449f08 28.7b; sin φ=3fe386f87ffd9617 35.7b 547.word 0x3f303041,0x3f45b9e0,0x3f229ae4 @ φ=0.68823630 : cos φ=3fe8b73c0047ae7a 30.8b; sin φ=3fe4535c7ffdf1ac 36.0b 548.word 0x3f381da7,0x3f4098ca,0x3f28a620 @ φ=0.71920246 : cos φ=3fe81319402ae6e1 31.6b; sin φ=3fe514c3ffff423c 37.4b 549.word 0x3f3fc72f,0x3f3b76ac,0x3f2e564a @ φ=0.74913305 : cos φ=3fe76ed5809d419f 29.7b; sin φ=3fe5cac93fffaf1d 38.7b 550.word 0x3f4813db,0x3f35b6cc,0x3f34526b @ φ=0.78155297 : cos φ=3fe6b6d9800e8b52 33.1b; sin φ=3fe68a4d5ffe89fc 36.5b 551.word 0x3f4fc779,0x3f30352f,0x3f39b4d0 @ φ=0.81163746 : cos φ=3fe606a5dfdc2b5b 31.8b; sin φ=3fe73699fffd7fc8 35.7b 552.word 0x3f57dd52,0x3f2a4170,0x3f3f2d91 @ φ=0.84322083 : cos φ=3fe5482e011ba752 28.9b; sin φ=3fe7e5b21ffcb223 35.3b 553.word 0x3f5fce26,0x3f243e9f,0x3f445dc3 @ φ=0.87423933 : cos φ=3fe487d3e0b9864b 29.5b; sin φ=3fe88bb85ffde6d5 35.9b 554.word 0x3f6825f1,0x3f1dc250,0x3f499d1c @ φ=0.90682894 : cos φ=3fe3b849ffea9b8f 32.6b; sin φ=3fe933a38002730d 35.7b 555.word 0x3f703be1,0x3f175041,0x3f4e7ebf @ φ=0.93841368 : cos φ=3fe2ea0820791b4e 30.1b; sin φ=3fe9cfd7e0053e65 34.6b 556.word 0x3f781078,0x3f10ed71,0x3f5306af @ φ=0.96900129 : cos φ=3fe21dae1fdea23e 31.9b; sin φ=3fea60d5e001b90b 36.2b 557.word 0x3f7ff4d4,0x3f0a5aa7,0x3f57649b @ φ=0.99982953 : cos φ=3fe14b54deeaa407 28.9b; sin φ=3feaec9360012825 36.8b 558 559float_wrapper_section tanf 560 561wrapper_func tanf 562 push {r0,r14} 563 ubfx r1,r0,#23,#8 564 cmp r1,#0xff @ Inf/NaN? 565 beq 2f 566 bl cosf_entry @ this will exit via sintail or costail... 567 ldr r1,[sp,#0] 568@ here C is still set from lsrs r12,r12,#1 569 bcs 1f 570@ we exited via sintail 571@ this is fsc_costail: 572@ here calculate cos φ+ε = cosθ 573 vmul.f32 s5,s7,s1 @ sinφ sinε 574 vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε) 575 eors r1,r1,r3 576 vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε 577 vdiv.f32 s0,s5,s4 578 vmov.f32 r0,s0 579 it pl 580 eorpl r0,r0,#0x80000000 581 pop {r1,r15} 582 5831: 584@ we exited via costail 585@ this is fsc_sintail: 586@ here calculate sin φ+ε = sinθ 587 vmul.f32 s4,s2,s7 @ sinφ(1-cosε) 588 vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε 589 eors r1,r1,r3 590 vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε 591 vdiv.f32 s0,s4,s5 592 vmov.f32 r0,s0 593 it mi 594 eormi r0,r0,#0x80000000 595 pop {r1,r15} 596 597@ tan of Inf or NaN 5982: 599 lsls r1,r0,#9 600 bne 1f @ NaN? return it 601 orrs r0,r0,#0x80000000 @ Inf: make a NaN 6021: 603 orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN 604 pop {r3,r15} 605 606 607float_wrapper_section atan2f 608 60950: 61060: 611 orrs r0,r1,#0x00400000 612 bx r14 613 61451: 615 bne 52f @ NaN? 616 cmp r3,#0x7f800000 @ y an infinity; x an infinity too? 617 bne 55f @ no: carry on 618@ here x and y are both infinities 619 b 66f 620 62152: 62262: 623 orrs r0,r0,#0x00400000 624 bx r14 625 62661: 627 bne 62b @ NaN? 628 cmp r3,#0x7f800000 @ y an infinity; x an infinity too? 629 bne 65f @ no: carry on 63066: 631@ here x and y are both infinities 632 subs r0,r0,#1 @ make both finite (and equal) with same sign and retry 633 subs r1,r1,#1 634 b 86f 635 63670: 637 and r3,#0x80000000 638 cmp r2,#0x00800000 639 bhs 72f @ y 0 or denormal? 640@ here both x and y are zeros 641 b 85f 64271: 643 and r2,#0x80000000 64472: 645 vmov s0,s1,r2,r3 646 vdiv.f32 s2,s0,s1 @ restart the division 647 b 73f @ and go back and check for NaNs 648 64980: 650 and r3,#0x80000000 651 cmp r2,#0x00800000 652 bhs 82f @ y 0 or denormal? 65385: 654@ here both x and y are zeros 655 orr r1,r1,0x3f800000 @ retry with x replaced by ~1 with appropriate sign 656 b 86f 657 65881: 659 and r2,#0x80000000 66082: 661 vmov s0,s1,r2,r3 662 vdiv.f32 s2,s1,s0 @ restart the division 663 b 83f @ and go back and check for NaNs 664 665wrapper_func atan2f 66686: 667 bic r2,r0,#0x80000000 668 bic r3,r1,#0x80000000 669 vmov s0,s1,r2,r3 670 cmp r2,r3 @ |y| vs. |x| 671 bhi 1f 672@ here |x|≥|y| so we need |y|/|x|; octant/xs/ys: 0++,3-+,4--,7+- 673 vdiv.f32 s2,s0,s1 @ get this division started; result ≤1 674 cmp r3,#0x00800000 675 blo 70b @ x 0 or denormal? 676 cmp r2,#0x00800000 677 blo 71b @ y 0 or denormal? 67873: 679 cmp r3,#0x7f800000 680 bhi 50b @ x NaN? 681 cmp r2,#0x7f800000 682 bhs 51b @ y Inf or NaN? 68355: 684 cmp r1,#0 685 ite mi 686 ldrmi r12,pi @ if x<0, need two extra quadrants 687 movpl r12,#0 688 @ inner negation is the sign of x 689 b 2f 690 6911: 692@ here |x|<|y| so we need |x|/|y|; octant/xs/ys: 1++,2-+,5--,6+- 693 vdiv.f32 s2,s1,s0 @ result <1 694 cmp r3,#0x00800000 695 blo 80b @ x 0 or denormal? 696 cmp r2,#0x00800000 697 blo 81b @ y 0 or denormal? 69883: 699 cmp r3,#0x7f800000 700 bhi 60b @ x NaN? 701 cmp r2,#0x7f800000 702 bhs 61b @ y Inf or NaN? 70365: 704 ldr r12,piover2 @ always one extra quadrant in this path 705 eors r1,r1,#0x80000000 @ inner negation is the complement of the sign of x 706 7072: 708@ here 709@ r0 y 710@ r1 ±x 711@ r2 |y| 712@ r3 |x| 713@ s0,s1 = |x|,|y| 714@ s2=s0/s1 or s1/s0, 0≤s2≤1 715@ r12=quadrant count * π/2 716@ where the final result is 717@ ± (r12 ± atn s2) where the inner negation is given by r1b31 and the outer negation by r0b31 718 719 adr r2,trigtab3 720 vmov.f32 s3,s2 721 vcvt.u32.f32 s3,s3,#6 722 vmov.f32 r3,s3 723 lsrs r3,r3,#1 724 adcs r3,r3,#0 @ rounding; set Z if in φ==0 case 725 add r2,r2,r3,lsl#3 726 vldr s5,[r2,#4] @ t=tanφ 727 vmul.f32 s0,s5,s2 @ ty 728 vsub.f32 s1,s2,s5 @ y-t 729 vmov.f32 s5,#1.0 730 vadd.f32 s0,s5,s0 @ 1+ty 731 beq 9f @ did we look up zeroth table entry? 732 733@ now (s0,s1) = (x,y) 734 vdiv.f32 s0,s1,s0 @ ε 735 ldr r2,[r2] @ φ Q29 736@ result is now ±(r12±(r2+atn(s0)) 737 cmp r1,#0 @ inner negation 738 it mi 739 rsbmi r2,r2,#0 740 add r2,r12,r2 @ Q29 741 cmp r0,#0 @ outer negation 742 it mi 743 rsbmi r2,r2,#0 744 cmp r2,#0 745 bpl 1f 746 rsbs r2,r2,#0 747 clz r3,r2 748 lsls r2,r2,r3 749 beq 3f 750 rsb r3,#0x180 751 b 2f 7521: 753 clz r3,r2 754 lsls r2,r2,r3 755 beq 3f 756 rsb r3,#0x80 7572: 758 lsrs r2,r2,#8 @ rounding bit to carry 759 adc r2,r2,r3,lsl#23 @ with rounding 7603: 761 vmul.f32 s2,s0,s0 @ ε² 762 vldr.f32 s3,onethird 763 vmul.f32 s2,s2,s0 @ ε³ 764 teq r0,r1 765 vmul.f32 s2,s2,s3 @ ε³/3 766 vmov.f32 s4,r2 767 vsub.f32 s0,s0,s2 @ ~atn(ε) 768 ite pl 769 vaddpl.f32 s0,s4,s0 770 vsubmi.f32 s0,s4,s0 771 vmov.f32 r0,s0 772 bx r14 773 7749: @ we looked up the zeroth table entry; we could generate slightly more accurate results here 775@ now (s0,s1) = (x,y) 776 vdiv.f32 s0,s1,s0 @ ε 777@ result is now ±(r12±(0+atn(s0)) 778 mov r2,r12 @ Q29; in fact r12 is only ±π/2 or ±π so can probably simplify this 779 cmp r0,#0 @ outer negation 780 it mi 781 rsbmi r2,r2,#0 782 cmp r2,#0 783 bpl 1f 784 rsbs r2,r2,#0 785 clz r3,r2 786 lsls r2,r2,r3 787 beq 3f 788 rsb r3,#0x180 789 b 2f 7901: 791 clz r3,r2 792 lsls r2,r2,r3 793 beq 3f 794 rsb r3,#0x80 7952: 796 lsrs r2,r2,#8 @ rounding bit to carry 797 adc r2,r2,r3,lsl#23 @ with rounding 7983: 799 vmul.f32 s2,s0,s0 @ ε² 800 vldr.f32 s3,onethird 801 vmul.f32 s2,s2,s0 @ ε³ 802 teq r0,r1 803 vmul.f32 s2,s2,s3 @ ε³/3 804 vmov.f32 s4,r2 805 vsub.f32 s0,s0,s2 @ ~atn(ε) 806 ite pl 807 vaddpl.f32 s0,s4,s0 808 vsubmi.f32 s0,s4,s0 809 vmov.f32 r0,s0 810 tst r0,#0x7f800000 @ about to return a denormal? 811 it ne 812 bxne r14 813 and r0,r0,#0x80000000 @ make it zero 814 bx r14 815 816piover2: .word 0x3243f6a9 @ Q29 817pi: .word 0x6487ed51 @ Q29 818onethird: .float 0.33333333 819 820trigtab3: 821// φ Q29 tan φ SP 822.word 0x00000000,0x00000000 823.word 0x00ffee23,0x3d0001bb @ φ=0.03124148 : tan φ=3fa000375fffff9d 50.4b 824.word 0x01fe88dc,0x3d7f992a @ φ=0.06232112 : tan φ=3faff3253fffea1f 44.5b 825.word 0x02fe0a70,0x3dc01203 @ φ=0.09351084 : tan φ=3fb8024060002522 42.8b 826.word 0x03fad228,0x3e000368 @ φ=0.12436779 : tan φ=3fc0006cfffffc90 45.2b 827.word 0x04f5ab70,0x3e1ffdea @ φ=0.15498897 : tan φ=3fc3ffbd400014d5 42.6b 828.word 0x05ed56f8,0x3e3fdddc @ φ=0.18522213 : tan φ=3fc7fbbb80000beb 43.4b 829.word 0x06e4cfa0,0x3e601425 @ φ=0.21543103 : tan φ=3fcc02849fffe817 42.4b 830.word 0x07d8d3e0,0x3e80215d @ φ=0.24521822 : tan φ=3fd0042b9ffff89f 43.1b 831.word 0x08c60460,0x3e9000a5 @ φ=0.27417201 : tan φ=3fd20014a000182b 41.4b 832.word 0x09b26770,0x3ea01492 @ φ=0.30302784 : tan φ=3fd402923ffff932 43.2b 833.word 0x0a996d50,0x3eb01377 @ φ=0.33122888 : tan φ=3fd6026ee0001062 42.0b 834.word 0x0b7a6d10,0x3ebff4a0 @ φ=0.35869458 : tan φ=3fd7fe93ffff8c38 39.1b 835.word 0x0c593ce0,0x3ed0019f @ φ=0.38589329 : tan φ=3fda0033e0001354 41.7b 836.word 0x0d33ebd0,0x3ee01bbc @ φ=0.41258803 : tan φ=3fdc0377800162a1 37.5b 837.word 0x0e087ab0,0x3ef01fbd @ φ=0.43853506 : tan φ=3fde03f79fffddf2 40.9b 838.word 0x0ed56180,0x3effef98 @ φ=0.46354747 : tan φ=3fdffdf30000767d 39.1b 839.word 0x0fa1de80,0x3f080ebf @ φ=0.48850942 : tan φ=3fe101d7dfffb9fc 38.9b 840.word 0x10639d00,0x3f0fec31 @ φ=0.51215982 : tan φ=3fe1fd862000aad5 37.6b 841.word 0x112690e0,0x3f180cfd @ φ=0.53595775 : tan φ=3fe3019fa00069ea 38.3b 842.word 0x11e014c0,0x3f200065 @ φ=0.55860364 : tan φ=3fe4000ca00022e5 39.9b 843.word 0x129651e0,0x3f2808be @ φ=0.58084959 : tan φ=3fe50117c00015a7 40.6b 844.word 0x1346d400,0x3f300a7d @ φ=0.60239601 : tan φ=3fe6014f9fffa020 38.4b 845.word 0x13efc7c0,0x3f37ee2f @ φ=0.62302005 : tan φ=3fe6fdc5dfff98d7 38.3b 846.word 0x14988960,0x3f400c32 @ φ=0.64362019 : tan φ=3fe801863fffff81 46.0b 847.word 0x1537a8c0,0x3f47ef42 @ φ=0.66304433 : tan φ=3fe8fde8400062a4 38.4b 848.word 0x15d4cc60,0x3f4ff630 @ φ=0.68222636 : tan φ=3fe9fec5ffff76e2 37.9b 849.word 0x166ef280,0x3f581534 @ φ=0.70104337 : tan φ=3feb02a680004e91 38.7b 850.word 0x16ff75c0,0x3f5fef1e @ φ=0.71868408 : tan φ=3febfde3c0001404 40.7b 851.word 0x179116a0,0x3f68184d @ φ=0.73646098 : tan φ=3fed03099ffed6e5 36.8b 852.word 0x181b5aa0,0x3f701722 @ φ=0.75333911 : tan φ=3fee02e43fffd351 39.5b 853.word 0x18a10560,0x3f781071 @ φ=0.76965588 : tan φ=3fef020e20005c05 38.5b 854.word 0x19214060,0x3f7ff451 @ φ=0.78530902 : tan φ=3feffe8a1fffe11b 40.1b 855 856#endif 857