1/* 2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd. 3 * 4 * SPDX-License-Identifier: BSD-3-Clause 5 */ 6 7#if !PICO_RP2040 8#include "pico/asm_helper.S" 9 10pico_default_asm_setup 11 12.macro double_section name 13#if PICO_DOUBLE_IN_RAM 14.section RAM_SECTION_NAME(\name), "ax" 15#else 16.section SECTION_NAME(\name), "ax" 17#endif 18.endm 19 20.macro double_wrapper_section func 21double_section WRAPPER_FUNC_NAME(\func) 22.endm 23 24.global rtwopi 25.global logtab0 26.global exptab0 27.global exptab1 28.global exptab2 29.global trigtab 30 31@ load a 32-bit constant n into register rx 32.macro movlong rx,n 33 movw \rx,#(\n)&0xffff 34 movt \rx,#((\n)>>16)&0xffff 35.endm 36 37double_section rtwopi 38 39// 1/2π to plenty of accuracy, 256 bits per line 40.long 0 @ this allows values of e down to -32 41rtwopi: 42.long 0,0 43.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410, 0x7F9458EA, 0xF7AEF158 44.long 0x6DC91B8E, 0x909374B8, 0x01924BBA, 0x82746487, 0x3F877AC7, 0x2C4A69CF, 0xBA208D7D, 0x4BAED121 45.long 0x3A671C09, 0xAD17DF90, 0x4E64758E, 0x60D4CE7D, 0x272117E2, 0xEF7E4A0E, 0xC7FE25FF, 0xF7816603 46.long 0xFBCBC462, 0xD6829B47, 0xDB4D9FB3, 0xC9F2C26D, 0xD3D18FD9, 0xA797FA8B, 0x5D49EEB1, 0xFAF97C5E 47.long 0xCF41CE7D, 0xE294A4BA, 0x9AFED7EC, 0x47E35742, 0x1580CC11 48 49double_section drrcore 50 51@ input: 52@ r0:r1 mantissa m Q52 53@ r2 exponent e>=-32, typically offset by +12 54@ r3 pointer to rtwopi table 55@ output: 56@ r0..r2 preserved 57@ r7:r8 range reduced result 58@ r3,r4,r5,r6,r9,r10 trashed 59.thumb_func 60drr_core: 61 ldr r3,=rtwopi 62 and r5,r2,#31 @ s=e%32 63 mov r7,#1 64 lsls r7,r7,r5 @ 1<<s 65 asrs r8,r2,#5 @ k=e/32, k<=32 for double with e offset <32; with e offsets up to 12+64, k<=34 66 umull r4,r5,r7,r0 67 movs r6,#0 68 umlal r5,r6,r7,r1 69@ r2 e 70@ r4:r5:r6 u0:u1:u2 = m<<(e%32); u2 is never more than 2<<20 71@ r8 e/32 72 add r3,r3,r8,lsl#2 @ p 73 74 ldr r9,[r3,#16] @ a0=p[4] 75 umull r10,r7,r9,r6 @ r0=a0*u2 hi, discard lo 76@ r7 r0 77 ldr r9,[r3,#12] @ a1=p[3] 78 umull r10,r8,r9,r5 @ r1=a1*u1 hi, discard lo 79 umaal r7,r8,r9,r6 @ r0:r1=r0+r1+a1*u2 80@ r7:r8 r0:r1 81 ldr r9,[r3,#8] @ a2=p[2] 82 mla r8,r9,r6,r8 @ r1+=a2*u2 83 umlal r7,r8,r9,r5 @ r0:r1+=a2*u1 84 umull r10,r6,r9,r4 @ r0x=a2*u0 hi, discard lo; u2 no longer needed 85@ r7:r8 r0:r1 86@ r6 r0x 87 ldr r9,[r3,#4] @ a3=p[1] 88 mla r8,r9,r5,r8 @ r1+=a3*u1 89 umlal r7,r8,r9,r4 @ r0:r1+=a3*u0 90@ r7:r8 r0:r1 91@ r6 r0x 92 ldr r9,[r3,#0] @ a4=p[0] 93 mla r8,r9,r4,r8 @ r1+=a4*u0 94@ r7:r8 r0:r1 95@ r6 r0x 96 adds r7,r6 97 adc r8, r8, #0 98 bx r14 99 100double_wrapper_section tan 101wrapper_func tan 102 push {r14} 103 bl sincos_raw 104 bl __aeabi_ddiv 105 pop {r15} 106 107double_wrapper_section sin_cos 108 10910: @ process Inf/NaN for dsin and dcos 110 orrs r2,r0,r1,lsl#12 111 it eq 112 orreq r1,#0x80000000 113 orr r1,#0x00080000 114 movs r3,r1 @ copy results to cosine output 115 movs r2,r0 116 bx r14 117 118@ case where angle is very small (<2^-32) before reduction 11932: 120 cbnz r3,33f 121 movs r0,#0 @ flush denormal 122 movs r1,#0 12333: 124 movs r2,#0 @ return x for sine, 1 for cosine 125 movlong r3,0x3ff00000 126 tst r12,#2 127 itt ne 128 movne r0,r2 @ calculating cosine? move to result registers 129 movne r1,r3 130 bx r14 131 132@ case where angle is fairly small after reduction 13330: 134 movs r2,#0 135 movs r3,#0 136 movs r4,#0 137 movs r5,#0x80000000 138 b 31f 139 14040: 141@ case where range-reduced angle is very small 142@ here 143@ r0:r1 mantissa 144@ r2 exponent+12 145@ r7:r8 abs range-reduced angle < 2^-10 Q64 146@ r12b31: dsincos flag 147@ r12b30: original argument sign 148@ r12b2..1: quadrant count 149@ r12b0: sign of reduced angle 150 push {r12} 151 movs r12,#0 1522: 153 add r12,#4 154 add r2,#4 155 bl drr_core @ repeat range reduction with extra factor of 2^4 (, 2^8, 2^12, 2^16,...) 156 eors r9,r8,r8,asr#31 @ we want to keep the sign in r8b31 for later 157 cmp r9,#1<<24 @ ≥ 2^-8? 158 bhs 1f @ loop until the result is big enough 159 cmp r12,#64 @ safety net 160 bne 2b 1611: 162 eors r7,r7,r8,asr#31 163@ here r7:r9 is the abs range-reduced angle Q64+r12, 2^-8..2^-4 in Q64 terms 164 165@ 2π=6.487ED511 0B4611A6 (2633...) 166 movlong r6,0x0B4611A6 @ 2π Q64 low fractional word 167 umull r10,r0,r9,r6 168 movlong r6,0x487ED511 @ 2π Q64 high fractional word 169 umull r10,r1,r7,r6 170 umaal r0,r1,r9,r6 171 movs r6,#6 @ 2π integer part 172 umlal r0,r1,r7,r6 173 mla r1,r9,r6,r1 174 175@ here 176@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12 177@ r8b31: sign of reduced angle 178@ r12: excess exponent ≥4, multiple of 4 179@ r0:r1 / 2^r12 < ~ 2π * 2^-10 so for sin we need to go to term in x^5 180 rsbs r10,r12,#32 181 bmi 1f @ very small result? 182 183 lsr r3,r1,r12 184 lsr r2,r0,r12 185 lsl r9,r1,r10 186 orr r2,r9 187@ r2:r3 θ Q64 (with r12 bits of loss of precision) 188 umull r9,r4,r2,r3 189 umull r9,r5,r2,r3 190 umaal r4,r5,r3,r3 191@ r4:r5 θ² Q64 = θ²/2 Q65 < ~ 4π² * 2^-20 192 umull r9,r6,r4,r5 193 umull r9,r7,r4,r5 194 umaal r6,r7,r5,r5 195@ r6:r7 θ⁴ Q64 < ~ 16π⁴ * 2^-40 < 2^-29 196 lsrs r6,#3 197 orrs r6,r6,r7,lsl#29 198@ r6 θ⁴ Q61 199 mov r9,#0xaaaaaaaa @ 2/3 Q32 200 umull r6,r7,r6,r9 201@ r7 θ⁴ * 2/3 Q61 = θ⁴/24 Q65 202 rsbs r6,r4,r7 203 sbcs r7,r5,r5,lsl#1 204@ r6:r7 - θ²/2 + θ⁴/24 Q65, <=0 205 asrs r9,r7,#12 206 lsrs r6,#12 207 adcs r6,r6,r7,lsl#20 208 adcs r7,r9,#0x0ff00000 209 adds r7,#0x30000000 210 push {r6,r7} @ packed cos θ 211 212@ here 213@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12 214@ r4:r5 θ² Q64 = θ²/2 Q65 215 umull r9,r2,r0,r5 216 umull r9,r3,r1,r4 217 umaal r2,r3,r1,r5 218@ r2:r3 θ³ Q64+r12 219 umull r9,r6,r2,r5 220 umull r9,r7,r3,r4 221 umaal r6,r7,r3,r5 222@ r6:r7 θ⁵ Q64+r12; in fact r7 is always 0 223 mov r9,#0xaaaaaaaa @ 2/3 Q32 224 umull r10,r4,r2,r9 225 movs r5,#0 226 umlal r4,r5,r3,r9 227 adds r4,r4,r5 228 adcs r5,r5,#0 229@ r4:r5 θ³*2/3 Q64+r12 = θ³/6 Q66+r12 230 mov r9,#0x88888888 @ 8/15 Q32 231 umull r2,r3,r6,r9 232@ r3 θ⁵*8/15 Q64+r12 = θ⁵/120 Q70+r12 233 subs r4,r4,r3,lsr#4 234 sbc r5,r5,#0 235@ r4:r5 θ³/6-θ⁵/120 Q66+r12 236 subs r0,r0,r4,lsr#2 237 sbc r1,r1,r5,asr#2 238 subs r0,r0,r5,lsl#30 239 sbc r1,r1,#0 240@ r0:r1 θ-θ³/6+θ⁵/120 Q64+r12 241 rsb r4,r12,#0x400 242 sub r4,#14 243 bl dpack_q 244 pop {r6,r7,r12} @ get cosine, flags 245 b 2f 246 2471: 248@ case where range reduction result has excess exponent ≥ 32 249@ so sin x=x, cos x=1 250 rsb r4,r12,#0x400 251 sub r4,#14 252 bl dpack_q 253 pop {r12} @ get flags 254 movs r6,#0 255 movlong r7,0x3ff00000 2562: 257@ here r0:r1 is packed sine, r6:r7 is packed cosine 258 lsrs r8,#31 259 bfi r12,r8,#0,#1 260 261 asrs r9,r12,#1 262 bmi 23f @ doing dsincos? 263 asrs r9,#1 264 bcc 21f @ need sine? 265@ need cosine: 266 ands r12,#4 267 orrs r1,r7,r12,lsl#29 @ insert sign 268 movs r0,r6 269 pop {r4-r10,r15} 270 27121: 272 eors r12,r12,r12,lsr#2 273 orrs r1,r1,r12,lsl#31 @ insert sign 274 pop {r4-r10,r15} 275 27623: 277 ands r4,r12,#4 278 eors r7,r7,r4,lsl#29 @ insert sign 279 push {r6,r7} 280 b 20f 281 282@ sincos à la GNU with pointers to where to put results 283wrapper_func sincos 284 push {r2-r5, lr} 285 bl sincos_raw 286 pop {r4-r5} 287 stmia r4!, {r0, r1} 288 stmia r5!, {r2, r3} 289 pop {r4, r5, pc} 290 291@ sincos with results in r0:r1 and r2:r3 292.thumb_func 293sincos_raw: 294 ands r12,r1,#1<<31 295 lsrs r12,#1 @ save argument sign in r12b30 296 orrs r12,r12,#1<<31 @ flag we want both results in r12b31 297 b 1f 298 299wrapper_func sin 300 lsrs r12,r1,#29 @ negative argument -> +2 quadrants 301 ands r12,#4 302 b 1f 303 304wrapper_func cos 305 movs r12,#2 @ cos -> +1 quadrant 3061: 307 ubfx r3,r1,#20,#11 @ get exponent 308 sub r2,r3,#0x3ff 309 cmp r2,#0x400 310 beq 10b @ Inf or NaN? 311 cmn r2,#32 312 blt 32b @ very small argument? 313 movs r3,#1 314 bfi r1,r3,#20,#12 @ fix implied 1 in mantissa 315 push {r4-r10,r14} 316 add r2,#12 @ e+12 317 bl drr_core 318@ r7:r8 θ/2π 0..1 Q64 319 lsrs r4,r8,#30 @ quadrant count 320 adc r4,r4,#0 @ rounded 321 sub r8,r8,r4,lsl#30 @ now -0.125≤r7:r8<+0.125 Q64 322 add r12,r12,r4,lsl#1 323 orr r12,r12,r8,lsr#31 @ sign into r12b0 324@ r12b2..1: quadrant count 325@ r12b0: sign of reduced angle 326 eors r7,r7,r8,asr#31 327 eors r8,r8,r8,asr#31 @ absolute value of reduced angle 0≤r7:r8<0.125 Q64 328 cmp r8,#1<<22 @ < 2^-10? 329 blo 40b 330 331@ 2π=6.487ED511 0B4611A6 (2633...) 332 movlong r9,0x0B4611A6 @ 2π Q64 low fractional word 333 umull r10,r0,r8,r9 334 movlong r9,0x487ED511 @ 2π Q64 high fractional word 335 umull r10,r1,r7,r9 336 umaal r0,r1,r8,r9 337 movs r9,#6 @ 2π integer part 338 umlal r0,r1,r7,r9 339 mla r1,r8,r9,r1 340@ r0:r1 range reduced angle θ 0..π/4 Q64 341 342 cmp r1,#1<<25 @ θ < 2^-7? 343 blo 30b 344 lsrs r2,r1,#27 345 ldr r3,=trigtab 346 add r3,r3,r2,lsl#4 347 ldmia r3,{r2-r5} 34831: 349 subs r0,r0,r2 350 sbcs r1,r1,r3 @ ε=Θ-φ Q64 351 bmi 2f @ ε negative? 352 353 asrs r6,r12,#1 354 bmi 5f @ doing dsincos? 355 asrs r6,#1 356 bcs 3f @ need cosine? 357 358@ here ε is positive, we need the sin of θ and the sign of the result is r12b0^r12b2 359 bl dsc_h0 360 bl dsc_h0s 361 bl dpack_q63 36221: 363 eors r12,r12,r12,lsr#2 364 orrs r1,r1,r12,lsl#31 @ insert sign 365 pop {r4-r10,r15} 366 3672: 368 asrs r6,r12,#1 369 bmi 6f @ doing dsincos? 370 asrs r6,#1 371 bcs 4f @ need cosine? 372 373@ here ε is negative, we need the sin of θ and the sign of the result is r12b0^r12b2 374 bl dsc_h1 375 bl dsc_h1s 376 bl dnegpack_q63 377 eors r12,r12,r12,lsr#2 378 orrs r1,r1,r12,lsl#31 @ insert sign 379 pop {r4-r10,r15} 380 381@ here ε is positive, we need the cos of θ and the sign of the result is r12b2 3823: 383 bl dsc_h0 384 bl dsc_h0c 385 bl dnegpack_q63 38622: 387 ands r12,#4 388 orrs r1,r1,r12,lsl#29 @ insert sign 389 pop {r4-r10,r15} 390 391@ here ε is negative, we need the cos of θ and the sign of the result is r12b2 3924: 393 bl dsc_h1 394 bl dsc_h1c 39515: 396 bl dpack_q63 397 ands r12,#4 398 orrs r1,r1,r12,lsl#29 @ insert sign 399 pop {r4-r10,r15} 400 4015: 402@ dsincos, ε positive 403 bl dsc_h0 404 push {r2-r7} 405 bl dsc_h0c 406 bl dnegpack_q63 407 ands r4,r12,#4 408 eors r1,r1,r4,lsl#29 @ negate cosine in quadrants 2 and 3 409 pop {r2-r7} 410 push {r0,r1} 411 bl dsc_h0s 412 bl dpack_q63 41320: 414 eors r4,r12,r12,lsr#1 415 eors r4,r4,r12,lsr#2 416 eors r1,r1,r4,lsl#31 @ negate sine on b0^b1^b2 417 tst r12,#2 @ exchange sine and cosine in odd quadrants 418 ittte ne 419 movne r2,r0 420 movne r3,r1 421 popne {r0,r1} 422 popeq {r2,r3} 423 ands r4,r12,#1<<30 424 eors r1,r1,r4,lsl#1 @ negate sine result if argument was negative 425 pop {r4-r10,r15} 426 4276: 428@ dsincos, ε negative 429 bl dsc_h1 430 push {r2-r7} 431 bl dsc_h1c 432 bl dpack_q63 433 ands r4,r12,#4 434 eors r1,r1,r4,lsl#29 @ negate cosine in quadrants 2 and 3 435 pop {r2-r7} 436 push {r0,r1} 437 bl dsc_h1s 438 bl dnegpack_q63 439 b 20b 440 441@ sin/cos power series for negative ε 442dsc_h1: 443 rsbs r0,r0,#0 444 sbc r1,r1,r1,lsl#1 445@ drop into positive ε code 446 447@ sin/cos power series for positive ε 448dsc_h0: 449@ r0:r1 ε Q64 450@ r4: sin φ Q32 451@ r5: cos φ Q32 452 umull r6,r7,r1,r1 453 umull r2,r3,r0,r1 454 lsrs r7,#1 455 rrx r6,r6 456 adds r2,r6,r3 457 adc r3,r7,#0 458@ r2:r3 ε²/2 Q64 459 umull r7,r6,r3,r0 460 umlal r7,r6,r2,r1 461 movs r7,#0 462 umlal r6,r7,r3,r1 463@ r6:r7 ε³/2 Q64 464 mov r8,#0x55555555 465 umull r9,r6,r6,r8 466 mov r9,#0 467 umlal r6,r9,r7,r8 468 adds r6,r6,r9 469 adcs r7,r9,#0 470@ r6:r7 ε³/6 Q64 471 subs r6,r0,r6 472 sbcs r7,r1,r7 473@ r6:r7 ε-ε³/6 Q64 474 mov r0,r3,lsl#12 @ ε²/2 Q44 475 orr r0,r0,r2,lsr#20 476 umull r0,r9,r0,r0 @ r9: ε⁴/4 Q88-32=Q56 = ε⁴/32 Q59 477 umlal r0,r9,r9,r8 @ r9: ε⁴/24 Q59 478 479 umull r0,r8,r9,r1 @ ε⁵/24 Q59 480 mov r0,#0x33333333 481 umull r0,r8,r8,r0 @ r8: ε⁵/120 Q59 482@ r6:r7 ε-ε³/6+ε⁵/120 Q64 483 smmulr r10,r8,r1 @ r10: ε⁶/120 Q59 484 movlong r0,0x2aaaaaaa 485 smmlsr r9,r0,r10,r9 @ ε⁴/24-ε⁶/720 486 subs r2,r2,r9,lsl#5 487 sbcs r3,r3,r9,lsr#27 488@ r2:r3 ε²/2-ε⁴/24+ε⁶/720 Q64 489 smmulr r10,r10,r1 @ r10: ε⁷/120 Q59 490 mov r0,#0x6180000 @ 1/42 Q64 491 smmlsr r8,r10,r0,r8 @ r8: ε⁵/120-ε⁷/5040 Q59 492 adds r6,r6,r8,lsl#5 493 adcs r7,r7,r8,lsr#27 494 bx r14 495 496dsc_h0s: 497@ postprocess for sine, positive ε 498@ r2:r3 1-cos |ε| Q64 499@ r4: sin φ Q31 500@ r5: cos φ Q31 501@ r6:r7 sin |ε| Q64 502 umull r8,r0,r2,r4 503 mov r1,#0 504 umlal r0,r1,r3,r4 505@ r0:r1 sin φ(1-cos ε) 506 umull r8,r3,r6,r5 507 umlal r3,r4,r7,r5 508@ r3:r4 sin φ+cos φ.sin ε 509 subs r0,r3,r0 510 sbc r1,r4,r1 511@ r0:r1 sin φ+cos φ.sin ε - sin φ(1-cos ε) = sin φ.cos ε + cos φ.sin ε = sin(φ+ε) 512 bx r14 513 514dsc_h1s: 515@ postprocess for sine, negative ε 516@ r2:r3 1-cos |ε| Q64 517@ r4: sin φ Q31 518@ r5: cos φ Q31 519@ r6:r7 sin |ε| Q64 520 umull r8,r0,r2,r4 521 mov r1,#0 522 umlal r0,r1,r3,r4 523@ r0:r1 sin φ(1-cos ε) 524 umull r8,r3,r6,r5 525 rsbs r4,r4,#0 526 umlal r3,r4,r7,r5 527@ r3:r4 -sin φ-cos φ.sin ε 528 adds r0,r3,r0 529 adc r1,r4,r1 530@ r0:r1 -sin φ-cos φ.sin ε + sin φ(1-cos ε) = -sin φ.cos ε - cos φ.sin ε = -sin(φ+ε) 531 bx r14 532 533dsc_h0c: 534@ postprocess for cosine, positive ε 535@ r2:r3 1-cos |ε| Q64 536@ r4: sin φ Q32 537@ r5: cos φ Q32 538@ r6:r7 sin |ε| Q64 539 umull r8,r0,r2,r5 540 mov r1,#0 541 umlal r0,r1,r3,r5 542@ r0:r1 cos φ(1-cos ε) 543 umull r8,r3,r6,r4 544 rsbs r5,#0 545 umlal r3,r5,r7,r4 546@ r3:r5 -cos φ+sin φ.sin ε 547 adds r0,r3,r0 548 adc r1,r5,r1 549@ r0:r1 -cos φ+sin φ.sin ε + cos φ(1-cos ε) = - cos φ.cos ε + sin φ.sin ε = -cos(φ+ε) 550 bx r14 551 552dsc_h1c: 553@ postprocess for cosine, negative ε 554@ r2:r3 1-cos |ε| Q64 555@ r4: sin φ Q32 556@ r5: cos φ Q32 557@ r6:r7 sin |ε| Q64 558 umull r8,r0,r2,r5 559 mov r1,#0 560 umlal r0,r1,r3,r5 561@ r0:r1 cos φ(1-cos ε) 562 umull r8,r3,r6,r4 563 umlal r3,r5,r7,r4 564@ r3:r5 cos φ-sin φ.sin ε 565 subs r0,r3,r0 566 sbc r1,r5,r1 567@ r0:r1 cos φ-sin φ.sin ε - cos φ(1-cos ε) = cos φ.cos ε - sin φ.sin ε = cos(φ+ε) 568 bx r14 569 570double_section dpack 571 572@ dnegpack: negate and pack 573@ dpack_q63: 574@ input 575@ r0:r1 Q63 result, must not be zero 576@ r4 exponent offset [dpack_q only] 577@ output 578@ r0:r1 IEEE double 579@ trashes r2,r3,r4 580.thumb_func 581dnegpack_q63: 582 rsbs r0,r0,#0 583 sbc r1,r1,r1,lsl#1 584.thumb_func 585dpack_q63: 586 mov r4,#0x3ff-12 @ exponent 587.thumb_func 588dpack_q: 589 clz r2,r1 590 cmp r2,#12 591 bhs 1f 592 adds r2,#21 593 lsls r3,r1,r2 594 rsb r2,#32 595 lsrs r0,r0,r2 @ save rounding bit in carry 596 lsr r1,r1,r2 597 add r2,r2,r4 598 adcs r0,r0,r3 @ with rounding 599 adc r1,r1,r2,lsl#20 @ insert exponent 600 bx r14 601 6021: 603 cbz r1,2f 604 rsb r2,#43 605 lsrs r3,r0,r2 606 rsb r2,#32 607 lsls r1,r1,r2 608 lsls r0,r0,r2 609 orrs r1,r3 610 sub r2,r4,r2 611 add r1,r1,r2,lsl#20 612 bx r14 613 6142: 615 movs r1,r0 616 mov r0,#0 617 sub r4,#32 618 bne dpack_q 619 bx r14 620 621 622 623double_section dreduce 624 625@ input: 626@ r0:r1 x, double (only mantissa bits used) 627@ r2 quotient offset 628@ r3 exponent e of x with 0x3ff bias removed, -32≤e<12 so 2^-32≤x<2^12 629@ r4:r5 r Q64, 0.5≤r<1 630@ r6 1/r underestimate Q31 631@ output: 632@ r0:r1 x mod r Q64 [possibly slightly > r?] 633@ r2 quotient+offset 634@ r4:r5 r preserved 635@ trashes r7,r8 636@ increases r2 by up to 2^13 637@ this version only used by dexp 638.thumb_func 639dreduce: 640 movs r7,#1 641 bfi r1,r7,#20,#12 @ insert implied 1, clear exponent and sign 642 lsls r8,r7,r3 643 beq 1f @ e<0, x<1 644 umull r0,r7,r0,r8 645 mla r1,r1,r8,r7 646@ r0:r1 x Q52 647 umull r7,r8,r1,r6 @ Q83-32=Q51 648 lsrs r6,r8,#19 @ q Q0 649 adds r2,r2,r6 650 umull r7,r8,r6,r4 651 mla r8,r6,r5,r8 @ r7:r8 q*r Q64 652 lsls r1,#12 653 orrs r1,r1,r0,lsr#20 654 rsbs r0,r7,r0,lsl#12 655 sbc r1,r1,r8 @ x-qr Q64 656@ check we never return slightly more than r 657 cmp r1,r5 @ quick comparison 658 it lo 659 bxlo r14 660 b 2f 661 6621: 663 adds r3,#12 664 movs r7,#1 665 lsls r8,r7,r3 666 beq 1f @ e<-12 667 umull r0,r7,r0,r8 668 mla r1,r1,r8,r7 @ x Q64 669 cmp r1,r5 @ quick comparison 670 it lo 671 bxlo r14 6722: 673 it eq 674 cmpeq r0,r4 675 it lo 676 bxlo r14 677 subs r0,r0,r4 @ subtract one r 678 sbc r1,r1,r5 679 adds r2,#1 680 bx r14 681 6821: 683@ here e<-12, have to shift r0:r1 down by -r3 places 684 add r3,#32 685 lsls r6,r1,r3 686 rsbs r3,#32 687 lsrs r0,r0,r3 688 lsrs r1,r1,r3 689 orrs r0,r0,r6 690 bx r14 691 692double_section exptab 693 694.align 2 695exptab0: 696.quad 0x0000000000000000,0x0f85186008b15304,0x1e27076e2af2e5c8,0x2f57120421b2120d 697.quad 0x3f7230dabc7c5512,0x4e993155a517a717,0x5fabe0ee0abf0d9d,0x6fad36769c6defe3 698.quad 0x7ebd623de3cc7b69,0x8f42faf3820681f0,0x9ec813538ab7d537,0xaf70154920b3ab7f 699exptab1: 700.quad 0x0000000000000000,0x00ff805515885e02,0x01fe02a6b1067890,0x02fb88ebf0214edc 701.quad 0x03f815161f807c7a,0x04f3a910d1a95d3c,0x05ee46c1f56c46aa,0x06e7f009ebe465ff 702.quad 0x07e0a6c39e0cc013,0x08d86cc491ecbfe1,0x09cf43dcff5eafd5,0x0ac52dd7e4726a46 703.quad 0x0bba2c7b196e7e23,0x0cae41876471f5bf,0x0da16eb88cb8df61,0x0e93b5c56d85a909 704.quad 0x0f85186008b15331,0x1075983598e47130 705exptab2: 706.quad 0x000fff8005551559,0x002ffb808febc309,0x004ff3829a0e91b1,0x006fe78722fde71f 707.quad 0x008fd78f299aa0c3,0x00afc39bac66434f,0x00cfabada9832a41,0x00ef8fc61eb4b74f 708.quad 0x010f6fe6095f81b6,0x012f4c0e66898567,0x014f244032da521a,0x016ef87c6a9b3a48 709.quad 0x018ec8c409b781ff,0x01ae95180bbc8d9c,0x01ce5d796bda1070,0x01ee21e924e23b3a 710logtab0: 711.quad 0xa0ec7f4233957338,0x918986bdf5fa1431,0x8391f2e0e6fa026b,0x7751a813071282e6 712.quad 0x6a73b26a68212621,0x5fabe0ee0abf0d9d,0x546ab61cb7e0b419,0x48a507ef3de59695 713.quad 0x3c4e0edc55e5cbd1,0x32a4b539e8ad68ce,0x289a56d996fa3ccb,0x21aefcf9a11cb2c9 714.quad 0x16f0d28ae56b4b86,0x0f85186008b15304,0x07e0a6c39e0cc002,0x0000000000000000 715 716.align 2 717exprrdata: 718.quad 0xB17217F7D1CF79AC @ ln2 Q64 719.long 0xB8AA3B29 @ 1/ln2 Q31, rounded down 720 721double_wrapper_section exp 722 7232: 724@ could use dadd macro to calculate x+1 here 725 lsl r0,r1,#11 726 orr r0,#0x80000000 727 lsls r1,#1 728 adc r3,r3,#32 729 movlong r1,0x3ff00000 730 rsb r3,#11 731 lsr r0,r3 732 it cc 733 bxcc r14 734 rsbs r0,#0 735 sbc r1,r1,#0 736 bx r14 737 738wrapper_func exp 739 movs r12,r1,lsr#31 @ save sign 740 ubfx r3,r1,#20,#11 @ get exponent 741 sub r3,r3,#0x3ff 742 cmp r3,#12 743 bge 20f @ overflow, Inf or NaN? 744 cmn r3,#0x20 745 ble 2b @ <2^-32? return x+1 746 push {r4-r8,r14} 747 ldr r4,=exprrdata 748 ldmia r4,{r4-r6} 749 mov r2,#0 750 bl dreduce 751 tst r12,#1 752 beq 1f 753 mvn r2,r2 @ quotient is now signed 754 subs r0,r4,r0 755 sbc r1,r5,r1 7561: 757 add r12,r2,#0x3fe @ exponent offset 758 mov r3,#0x7fe 759 cmp r12,r3 760 bhs 1f @ under/overflow 761 lsrs r2,r1,#28 762 ldr r3,=exptab0 763 add r3,r3,r2,lsl#3 764 ldmia r3,{r2-r3} 765 and r5,r2,#63 766 orr r5,#64 @ y=(t&0x3f)+0x40; Q6 767 subs r0,r2 768 sbcs r1,r3 769 lsrs r2,r1,#24 770 ldr r3,=exptab1 771 add r3,r3,r2,lsl#3 772 ldmia r3,{r3-r4} 773 add r2,#256 774 muls r5,r5,r2 @ y Q14 775 subs r0,r3 776 sbcs r1,r4 777 lsrs r2,r1,#21 778 ldr r3,=exptab2 779 add r3,r3,r2,lsl#3 780 ldmia r3,{r3-r4} 781 add r2,r2,r2 782 add r2,#4096 783 mla r5,r5,r2,r5 @ y Q26 784 subs r0,r3 785 sbcs r1,r4 786 787 movs r2,r1,lsl#10 788 orrs r2,r2,r0,lsr#22 789 adc r2,r2,#0 @ ε Q42, rounded 790 smull r3,r4,r2,r2 @ ε² Q84-32=Q52 791 lsrs r3,#21 792 orrs r3,r3,r4,lsl#11 793 adds r0,r0,r3 794 adc r1,r1,r4,lsr#21 @ ε+ε²/2 Q64 795 smull r3,r4,r4,r2 @ Q52*Q42=Q94; Q94-32=Q62 796 mov r3,#0x55555555 @ 1/6 Q33 797 smull r3,r4,r3,r4 @ ε³/6 Q63 798 smmulr r3,r4,r1 @ ε⁴/6+ε⁵/12 Q63+Q32-32=Q63 799 add r4,r4,r3,lsr#2 800 adds r2,r0,r4,lsl#1 801 adc r3,r1,r4,asr#31 802 lsls r1,r5,#3 @ y Q29 803 umull r4,r0,r1,r2 @ εlo * y Q61+32 804 smlal r0,r1,r1,r3 @ εhi * y + y Q61 805@ assert result is in range 1..2 806 lsrs r0,#9 807 adcs r0,r0,r1,lsl#23 808 lsr r1,#9 809 adcs r1,r1,r12,lsl#20 810 pop {r4-r8,r15} 811 81220: 813@ process Inf/NaN for dexp 814 cmp r3,#0x400 815 bne 22f 816 orrs r2,r0,r1,lsl#12 817 ite eq 818 biceq r1,r1,r1,asr#31 @ +Inf -> + Inf; -Inf -> +0 819 orrne r1,r1,#0x00080000 820 bx r14 821 82222: 823 movs r0,#0 824 movs r1,#0 825 tst r12,#1 826 it eq 827 movteq r1,0x7ff0 828 bx r14 829 8301: @ under/overflow 831 mov r0,#0 832 mov r1,#0 833 it ge 834 movtge r1,#0x7ff0 835 pop {r4-r8,r15} 836 837 838 839 840double_wrapper_section log 841 8421: 843 movlong r1,0xfff00000 @ return -Inf 844 movs r0,#0 845 bx r14 846 8474: 848 orrs r2,r0,r1,lsl#12 849 it ne 850 orrne r1,#0x00080000 851 bx r14 852 85310: 854 mvns r5,r6,asr#22 @ very small argument? 855 bne 10f 856 mov r4,#4096 857 b 11f 858 859@ check for argument near 1: here 860@ r1 : mantissa 861@ r12: exponent, -1 or 0 86212: 863 eor r3,r12,r1,lsr#12 864 lsls r3,r3,#24 @ check 8 bits of mantissa 865 bne 12f @ not very close to 1 866 cmp r12,#0 867 bne 13f 868@ argument is 1+ε, result will be positive 869 lsls r1,#19 870 orrs r1,r1,r0,lsr#13 871 lsls r0,#19 872@ r0:r1 ε Q71 0≤ε<2^-8 873 clz r4,r1 @ r4≥1 874 cmp r4,#32 875 bhs 14f 876 movs r5,#1 877 lsls r5,r4 878 umull r2,r3,r0,r5 879 mla r3,r1,r5,r3 @ r2:r3 ε Q71+r4 880 umull r12,r5,r0,r3 881 umull r12,r6,r1,r2 882 umaal r5,r6,r1,r3 @ r5:r6 ε² Q142+r4-64 = Q78+r4 883 884 subs r2,r2,r5,lsr#8 885 sbc r3,r3,#0 886 subs r2,r2,r6,lsl#24 887 sbcs r3,r3,r6,lsr#8 888 889 umull r12,r7,r0,r6 890 umull r12,r8,r1,r5 891 umaal r7,r8,r1,r6 @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=1 and Q86, so r8<0x40000000 892 mov r5,#0x55555555 @ ~1/3 Q32 893 894 umull r12,r6,r7,r5 895 movs r12,#0 896 umlal r6,r12,r8,r5 897 adds r6,r6,r12 898 adc r12,r12,#0 @ multiply by 0x5555555555555555 899 900 adds r2,r2,r6,lsr#14 901 adc r3,r3,#0 902 adds r2,r2,r12,lsl#18 903 adc r3,r3,r12,lsr#14 904 905 smmulr r5,r8,r1 @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32 906 movs r7,#0x33333333 @ 1/5 Q32 907 smmulr r6,r5,r1 @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40 908 smmulr r8,r6,r7 @ ε⁵/5 Q67+r4 ~ 2^-42 909 sub r7,r5,r8,lsr#5 910 smmulr r5,r6,r1 @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48 911 movt r6,#0x2a80 @ 1/6 Q32 fiddled slightly (PMC) 912 smmulr r5,r6,r5 @ ε⁶/6 Q75+r4 ~ 2^-50 913 add r7,r7,r5,lsr#12 914 915 subs r0,r2,r7,lsl#9 916 sbc r1,r3,r7,lsr#23 917 918 rsb r4,#0x400 919 sub r4,#0x15 920 bl dpack_q 921 pop {r4-r8,r15} 922 923@ here we have positive ε sufficiently small we need (at most) a quadratic term 92414: 925@ here r0=ε Q71, 0≤ε<2^-40 926 clz r4,r0 927 lsls r1,r0,r4 @ ε Q71+r4 928 umull r2,r3,r0,r1 @ ε² Q142+r4 929 mov r0,#0 930 subs r0,r0,r3,lsr#8 931 sbc r1,r1,#0 932 rsb r4,#0x400 933 sub r4,#0x35 934 bl dpack_q 935 pop {r4-r8,r15} 936 937 938 93913: @ argument is 1-ε, result will be negative 940 movs r1,r1,lsl#18 941 orrs r1,r1,r0,lsr#14 942 movs r0,r0,lsl#18 943 rsbs r0,#0 944 sbc r1,r1,r1,lsl#1 945@ r0:r1 -ε Q71 -2^-9≤ε<0 946 clz r4,r1 947 cmp r4,#32 948 bhs 15f 949 subs r4,#1 @ 0≤r4<31 950 movs r5,#1 951 lsls r5,r4 952 umull r2,r3,r0,r5 953 mla r3,r1,r5,r3 @ r2:r3 ε Q71+r4 954 umull r12,r5,r0,r3 955 umull r12,r6,r1,r2 956 umaal r5,r6,r1,r3 @ r5:r6 ε² Q142+r4-64 = Q78+r4 957 958 adds r2,r2,r5,lsr#8 959 adc r3,r3,#0 960 adds r2,r2,r6,lsl#24 961 adcs r3,r3,r6,lsr#8 962 963 umull r12,r7,r0,r6 964 umull r12,r8,r1,r5 965 umaal r7,r8,r1,r6 @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=0 and Q85, so r8<0x20000000 966 mov r5,#0x55555555 @ ~1/3 Q32 967 968 umull r12,r6,r7,r5 969 movs r12,#0 970 umlal r6,r12,r8,r5 971 adds r6,r6,r12 972 adc r12,r12,#0 @ multiply by 0x5555555555555555 973 974 adds r2,r2,r6,lsr#14 975 adc r3,r3,#0 976 adds r2,r2,r12,lsl#18 977 adc r3,r3,r12,lsr#14 978 979 smmulr r5,r8,r1 @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32 980 movs r7,#0x33333333 @ 1/5 Q32 981 smmulr r6,r5,r1 @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40 982 smmulr r8,r6,r7 @ ε⁵/5 Q67+r4 ~ 2^-42 983 add r7,r5,r8,lsr#5 984 smmulr r5,r6,r1 @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48 985 movt r6,#0x2a80 @ 1/6 Q32 fiddled slightly (PMC) 986 smmulr r5,r6,r5 @ ε⁶/6 Q75+r4 ~ 2^-50 987 add r7,r7,r5,lsr#12 988 989 adds r0,r2,r7,lsl#9 990 adc r1,r3,r7,lsr#23 991 992 rsb r4,#0x400 993 sub r4,#0x15 994 bl dpack_q 995 orr r1,r1,#1<<31 996 pop {r4-r8,r15} 997 998@ here we have negative ε sufficiently small we need (at most) a quadratic term 999@ here r0=ε Q71, |ε|<2^-41 100015: 1001 clz r4,r0 1002 lsls r1,r0,r4 @ ε Q71+r4 1003 umull r2,r3,r0,r1 @ ε² Q142+r4 1004 mov r0,r3,lsr#8 1005 rsb r4,#0x400 1006 sub r4,#0x35 1007 bl dpack_q 1008 eors r1,r1,#0x80000000 1009 pop {r4-r8,r15} 1010 1011wrapper_func log 1012 lsls r12,r1,#1 1013 bcs 1b @ x<0? 1014 lsrs r12,#21 1015 beq 1b @ x==0/denormal? 1016 sub r12,#0x3ff 1017 cmp r12,#0x400 @ +Inf/NaN? 1018 beq 4b 1019 movs r2,#1 1020 bfi r1,r2,#20,#12 @ set implied 1, clear exponent Q52 1021 push {r4-r8,r14} 1022 cmp r12,r12,asr#31 @ exponent = -1 or 0? 1023 beq 12b 102412: 1025 lsrs r4,r1,#16 1026 ldr r5,=logtab0-16*8 1027 add r5,r5,r4,lsl#3 1028 ldmia r5,{r2-r3} 1029 and r5,r2,#63 1030 add r5,#64 1031 umull r0,r6,r5,r0 1032 mla r1,r5,r1,r6 @ Q59 1033 1034 mvn r4,r1,asr#19 1035 and r4,#31 1036 ldr r5,=exptab1 1037 add r5,r5,r4,lsl#3 1038 ldmia r5,{r5-r6} 1039 adds r2,r2,r5 1040 adc r3,r3,r6 1041 add r4,#256 1042 umull r0,r6,r4,r0 1043 mla r6,r4,r1,r6 @ r0:r6 Q67 1044 1045 mvns r4,r6,asr#24 1046 beq 10b @ small argument at this stage? 104710: 1048 ldr r5,=exptab2 1049 add r5,r5,r4,lsl#3 1050 ldmia r5,{r1,r5} 1051 adds r2,r2,r1 1052 adc r3,r3,r5 1053 mov r5,#4097 1054 add r4,r5,r4,lsl#1 @ 4097+2k 105511: 1056 lsls r4,#17 1057 umull r5,r0,r4,r0 1058 rsb r1,r4,r4,lsl#3 1059 umlal r0,r1,r4,r6 @ Q96=Q64 1060 1061@ r0:r1 ε Q64 1062@ r2:r3 y Q64 1063 1064 eor r4,r0,r1,asr#31 1065 eor r5,r1,r1,asr#31 @ r4:r5 |ε| Q64 1066 umull r6,r7,r5,r5 1067 umull r4,r8,r4,r5 1068 lsrs r7,#1 1069 rrx r6,r6 1070 adds r6,r6,r8 1071 adc r7,r7,#0 @ r6:r7 ε²/2 Q64 1072 1073 movs r4,r1,lsl#10 1074 orrs r4,r4,r0,lsr#22 1075 adc r4,r4,#0 @ ε Q42, rounded 1076 1077 subs r0,r0,r6 1078 sbc r1,r1,r7 @ r0:r1 ε-ε²/2 Q64 1079 1080 smmulr r5,r4,r4 @ ε² Q42+42-32=Q52 1081 smmulr r6,r5,r4 @ ε³ Q52+42-32=Q62 1082 smmulr r7,r5,r5 @ ε⁴ Q52+52-32=Q72 ε⁴/4 Q74 1083 mov r4,#0x55555555 @ 1/3 Q32 1084 smmulr r6,r6,r4 @ ε³/3 Q62 1085 subs r6,r6,r7,lsr#12 @ ε³/3-ε⁴/4 Q62 1086 1087 adds r0,r0,r6,lsl#2 @ Q64 1088 adc r1,r1,r6,asr#30 1089 1090 subs r0,r2,r0 1091 sbc r1,r3,r1 1092 ldr r2,=exprrdata 1093 ldmia r2,{r2,r5} 1094 adds r12,#1 1095 ble 1f 1096@ positive result 1097 umull r2,r3,r2,r12 1098 movs r4,#0 1099 umlal r3,r4,r5,r12 1100 subs r2,r2,r0 1101 sbcs r3,r3,r1 1102 sbc r4,r4,#0 1103 movs r1,#0x40000000 1104 b 2f @ to pack result 1105 1106@ negative result 11071: 1108 rsbs r12,#0 1109 umull r2,r3,r2,r12 1110 movs r4,#0 1111 umlal r3,r4,r5,r12 1112 adds r2,r0,r2 1113 adcs r3,r1,r3 1114 adc r4,r4,#0 1115 movs r1,#0xc0000000 11162: 1117 cbnz r4,2f 1118 movs r4,r3 1119 movs r3,r2 1120 movs r2,#0 1121 subs r1,#32<<20 11221: 1123@ here r3:r4 is guaranteed nonzero 1124 cbnz r4,2f 1125 movs r4,r3 1126 movs r3,#0 1127 subs r1,#32<<20 11282: 1129 clz r5,r4 1130 sub r6,r5,#0x1d 1131 sub r1,r1,r6,lsl#20 1132 lsls r4,r5 1133 lsls r0,r3,r5 1134 rsb r5,#32 1135 lsrs r3,r5 1136 orrs r4,r4,r3 1137 lsrs r2,r5 1138 orrs r0,r0,r2 1139@ now r0:r4 is normalised to Q63 1140 lsrs r0,#11 1141 adcs r0,r0,r4,lsl#21 @ with rounding 1142 adc r1,r1,r4,lsr#11 1143 pop {r4-r8,r15} 1144 1145@=========================================== 1146 1147double_section trigtab 1148trigtab: 1149// φ Q64 lo φ Q64 hi sin φ Q31 cos φ Q31 1150.long 0x25735c0b, 0x03f574a9, 0x01fab529, 0x7ffc1500 1151.long 0x00aa2feb, 0x0c44d954, 0x0621d2c8, 0x7fda601f 1152.long 0x42e86336, 0x13d9d3cf, 0x09ea5e3c, 0x7f9d88f0 1153.long 0x046dc42f, 0x1c0a86d9, 0x0dfe171f, 0x7f3b9ed0 1154.long 0xec509ba7, 0x23ebf53e, 0x11e6e7bc, 0x7ebdefc7 1155.long 0x039c1cd2, 0x2bf112b2, 0x15dcf546, 0x7e1e7749 1156.long 0x3d7a05ca, 0x33f293f4, 0x19cbc014, 0x7d5fac9c 1157.long 0x7aefa0a0, 0x3c19321d, 0x1dc6221d, 0x7c7d2f2f 1158.long 0x12fb0fb5, 0x4450ef70, 0x21c10c53, 0x7b782235 1159.long 0x476b8d5c, 0x4bf1a045, 0x256adc18, 0x7a68ad16 1160.long 0x576940c1, 0x53ead96d, 0x29361639, 0x792f2d3c 1161.long 0xd34eeadf, 0x5bb34289, 0x2ce039d6, 0x77e02625 1162.long 0x31ea5069, 0x641107d9, 0x30c4d3c0, 0x76586270 1163.long 0xf36756cb, 0x6bfda2b3, 0x34687e1c, 0x74c77a22 1164.long 0x2f7aed0e, 0x73e07531, 0x37fae7cc, 0x731c1127 1165.long 0xfcc48ec0, 0x7c14790e, 0x3ba3a70e, 0x7141cd8e 1166.long 0x3b04b713, 0x83547b8f, 0x3ed289d4, 0x6f85d8eb 1167.long 0x135b369a, 0x8c0b6fd9, 0x4294ead4, 0x6d51efa3 1168.long 0x4aca525f, 0x9439d326, 0x460a6e70, 0x6b230540 1169.long 0x41c44083, 0x9c0e0b34, 0x4948b03d, 0x68f1ef07 1170.long 0xa36d08bf, 0xa47961fd, 0x4cb1f285, 0x667a849d 1171.long 0xc8f81636, 0xabe547d2, 0x4fa2221e, 0x6436622f 1172.long 0xd0c8c9ad, 0xb3feade1, 0x52c37024, 0x61a4af86 1173.long 0x64730e73, 0xbbfe356a, 0x55c5f028, 0x5f02a3a3 1174.long 0xb08ca5c6, 0xc412b955, 0x58ba9208, 0x5c4194a3 1175.long 0x54530090, 0xcbb02d37, 0x5b6ef419, 0x59938ed8 1176.long 0xa18d7c36, 0xd3aa3c6e, 0x5e2e0225, 0x56af32fc 1177.long 0x48a2c7d0, 0xdbff19f6, 0x60f35332, 0x5392ed7e 1178.long 0x7aad9a94, 0xe422ade1, 0x638edfca, 0x50732bc7 1179.long 0x19ef15ff, 0xebfb85bd, 0x65fa18bb, 0x4d5c61c7 1180.long 0x7e0f96bd, 0xf44c4b49, 0x686f81f7, 0x4a0217e6 1181.long 0x1def09eb, 0xfc4dbdfd, 0x6ab2d2c4, 0x46b4e413 1182// maximum e=0.002617 = 0.167497*2^-6 1183 1184double_wrapper_section atan2 1185 1186@ datan small reduced angle case I 118720: 1188@ r0:r1 has z=y/x in IEEE format, <2^-11 1189@ r2 e+11 1190 rsbs r12,r2,#0 @ shift down of mantissa required to get to Q63 >0 1191 subs r10,r12,#32 1192 bge 1f @ very small reduced angle? 1193 bfi r1,r3,#20,#12 @ fix up mantissa 1194 cmp r7,#4 1195 bhs 2f @ at least one quadrant to add? then don't need extreme accuracy 1196@ otherwise we need to do a power series for accuracy 1197 1198 rsbs r10,#0 1199 lsr r3,r1,r12 1200 lsr r2,r0,r12 1201 lsl r9,r1,r10 1202 orr r2,r9 1203@ r2:r3 z Q63 (with r12 bits of loss of precision) 1204 lsls r1,#11 1205 orrs r1,r1,r0,lsr#21 1206 lsls r0,#11 1207@ r0:r1 z Q74+r12 1208 umull r9,r4,r2,r3 1209 umull r9,r5,r2,r3 1210 umaal r4,r5,r3,r3 1211@ r4:r5 z² Q62 < 2^-22 1212 umull r9,r2,r0,r5 1213 umull r9,r3,r1,r4 1214 umaal r2,r3,r1,r5 1215@ r2:r3 z³ Q72+r12 <2^-33 1216 umull r9,r6,r2,r5 1217 umull r9,r8,r3,r4 1218 umaal r6,r8,r3,r5 1219@ r6:r8 z⁵ Q70+r12 <2^-55; in fact r8 is always 0 1220 mov r9,#0xaaaaaaaa @ 2/3 Q32 1221 umull r10,r4,r2,r9 1222 movs r5,#0 1223 umlal r4,r5,r3,r9 1224 adds r4,r4,r5 1225 adcs r5,r5,#0 1226@ r4:r5 z³*2/3 Q72+r12 = z³/3 Q73+r12 1227 mov r9,#0x33333333 @ 1/5 Q32 1228 umull r2,r3,r6,r9 1229@ r3 z⁵*1/5 Q70+r12 1230 subs r4,r4,r3,lsl#3 1231 sbc r5,r5,#0 1232@ r4:r5 z³/3-z⁵/5 Q73+r12 1233 subs r0,r0,r4,lsl#1 1234 sbc r1,r1,r5,lsl#1 1235 sub r1,r1,r4,lsr#31 1236@ r0:r1 z-z³/3+z⁵/5 Q74+r12 1237 rsb r4,r12,#0x400 1238 sub r4,#24 1239 b 60f @ pack and return 1240 12412: 1242 rsbs r10,#0 1243 lsls r4,r1,r10 1244 lsrs r0,r0,r12 1245 lsrs r1,r1,r12 1246 orrs r0,r0,r4 @ shift down r12 places 1247 b 50f 1248 1249 12501: 1251 cmp r7,#4 1252 bhs 2f @ at least one quadrant to add? 1253 eors r1,r1,r7,lsl#31 @ no: return y/x with the correct sign 1254 pop {r4-r11,r15} 1255 12562: 1257 bfi r1,r3,#20,#12 @ fix up mantissa 1258 usat r10,#6,r10 @ saturate r10 to 63 1259 lsrs r0,r1,r10 1260 movs r1,#0 @ shift down 32+r10 places 1261 b 40f 1262 1263@ datan small reduced angle case II 126410: 1265 lsrs r1,#1 1266 rrxs r0,r0 1267 movs r2,#0 1268 movs r3,#0 1269 movs r6,#0 1270 mov r7,#1<<30 1271 b 11f 1272 1273@ case where reduced (x',y') has x' infinite 127471: 1275 sbfx r4,r1,#20,#11 1276 movs r0,#0 1277 movs r1,#0 1278 cmn r4,#1 @ y' also infinite? 1279 bne 80f 1280 movt r1,#0x3ff0 @ both infinite: pretend ∞/∞=1 1281 b 80f 1282 1283@ case where reduced (x',y') has y' zero 128470: 1285 ubfx r4,r3,#20,#11 1286 movs r0,#0 1287 movs r1,#0 1288 cbnz r4,80f @ x' also zero? 1289 tst r7,#4 1290 beq 80f @ already in quadrants 0/±2? then 0/0 result will be correct 1291 tst r7,#2 1292 ite eq 1293 addeq r7,#6 1294 subne r7,#6 @ fix up when in quadrants ±0 1295 b 80f 1296 129790: 1298 movs r0,r2 1299 movs r1,r3 130091: 1301 orrs r1,r1,#0x00080000 1302 bx r14 1303 1304wrapper_func atan2 1305 cmp r2,#1 @ set C if low word is non-zero 1306 adc r12,r3,r3 1307 cmn r12,#0x00200000 @ y NaN? 1308 bhi 90b 1309 cmp r0,#1 @ set C if low word is non-zero 1310 adc r12,r1,r1 1311 cmn r12,#0x00200000 @ x NaN? 1312 bhi 91b 1313 push {r4-r11,r14} 1314 lsrs r7,r1,#31 @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition 1315 bic r1,#1<<31 1316@ y now positive 1317 movs r3,r3 1318 bpl 1f 1319 1320@ here y positive, x negative 1321 adds r7,#10 1322 bic r3,r3,#1<<31 1323 cmp r3,r1 1324 bhi 4f @ |x| > y: 3rd octant 1325@ |x| < y: 2nd octant 1326 subs r7,#6 1327 b 3f 1328 13291: 1330@ here x and y positive 1331 cmp r3,r1 1332 bhi 4f 1333@ x < y: 1st octant 1334 adds r7,#6 13353: 1336 movs r4,r2 @ exchange x and y 1337 movs r5,r3 1338 movs r2,r0 1339 movs r3,r1 1340 movs r0,r4 1341 movs r1,r5 13424: 1343 1344@ here 1345@ r0:r1 y' 1346@ r2:r3 x' 1347@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is 1348@ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r7b3..2, the inner negation 1349@ is given by r7b1 and the outer negation by r7b0. x' can be infinite, or both x' and 1350@ y' can be infinite, but not y' alone. 1351 1352 sbfx r4,r3,#20,#11 1353 cmn r4,#1 1354 beq 71b @ x' infinite? 1355 ubfx r4,r1,#20,#11 1356 cmp r4,#0 1357 beq 70b @ y' zero? 1358 bl __aeabi_ddiv 135980: 1360@ r0:r1 y/x in IEEE format, 0..1 1361 lsr r2,r1,#20 @ exponent 1362 movs r3,#1 1363 subs r2,#0x3ff-11 1364 bmi 20b 1365 bfi r1,r3,#20,#12 @ fix up mantissa 1366 movs r3,#1 1367 lsl r3,r2 1368 umull r0,r4,r0,r3 1369 mla r1,r1,r3,r4 137050: 1371 push {r7} @ save flags 1372 1373@ from here atan2(y,1) where 1 implied 1374@ r0:r1 y Q63 0≤y<1+δ 1375 1376 lsrs r2,r1,#16 1377 cmp r2,#0x100 1378 blo 10b @ small angle? 1379 mul r3,r2,r2 @ y^2 1380 movw r4,#0x895c 1381 muls r2,r2,r4 @ y*0x895c 1382 movw r5,#0x1227 1383 lsrs r3,#14 1384 mls r2,r3,r5,r2 1385 subs r2,#0x330000 @ Chebyshev approximation to atn(y) 1386 lsrs r2,#25 1387 ldr r3,=trigtab 1388 add r3,r3,r2,lsl#4 1389 ldmia r3,{r2-r5} 1390 lsrs r3,#1 1391 rrxs r2,r2 1392@ r2:r3 phi0 Q63 1393@ r4 sphi0 1394@ r5 cphi0 1395 umull r12,r6,r4,r0 1396 movs r7,#0 1397 umlal r6,r7,r4,r1 1398 adds r6,r6,r5,lsl#31 1399 adc r7,r7,r5,lsr#1 @ x0= ((i128)cphi0<<31)+(((i128)sphi0*(i128)y)>>32); // Q62 1400@ r6:r7 x0 1401 umull r12,r0,r5,r0 1402 movs r8,#0 1403 umlal r0,r8,r5,r1 1404 subs r0,r0,r4,lsl#31 1405 sbc r1,r8,r4,lsr#1 @ y0=-((i128)sphi0<<31)+(((i128)cphi0*(i128)y)>>32); // Q62 140611: 1407@ r0:r1 y0 1408@ r2:r3 phi0 1409@ r6:r7 x0 1410 1411 lsls r4,r1,#6 1412 orr r4,r4,r0,lsr#26 1413 lsrs r5,r7,#15 1414 sdiv r4,r4,r5 @ t2=(y0>>26)/(x0>>47); // Q62-26/Q62-47=Q21 1415 1416 mul r5,r4,r4 @ t2_2 Q42 1417 add r3,r3,r4,lsl#10 @ phi0+t2 1418 smull r8,r9,r4,r5 @ t2_3 Q63 1419 mov r10,r9,lsl#16 1420 orr r10,r10,r8,lsr#16 1421 smmulr r10,r10,r5 @ t2_5 Q57 1422 mov r12,#0x66666666 @ 1/5 Q33 1423 smmulr r11,r10,r5 @ t2_7 Q67 1424 smmulr r10,r10,r12 @ t2_5/5 Q57+33-32=Q58 1425 1426 movlong r12,0x124925 @ 1/7 Q23 1427 smmulr r11,r11,r12 @ t2_7/7 Q67+23-32=Q58 1428 mov r12,#0x55555555 1429 sub r11,r11,r11,asr#12 @ Q58 PMC correction 1430 sub r10,r10,r11 1431 1432 adds r2,r2,r10,lsl#5 1433 adc r3,r3,r10,asr#27 @ Q63 phi0 + t_2 + t2_5/5 - t2_7/7 + t2_7/7/4096 1434 umull r5,r10,r8,r12 1435 mov r11,#0 1436 smlal r10,r11,r9,r12 @ t2_3 * 0x55555555 1437 adds r10,r10,r11 1438 adc r11,r11,r11,asr#31 @ t2_3/3 Q63 1439 subs r2,r2,r10 1440 sbc r3,r3,r11 @ Q63 phi0+phi1 1441 1442 lsls r4,r4,#11 @ t2 Q32 1443 umull r5,r8,r4,r0 @ t2*y0l 1444 it mi 1445 submi r8,r8,r0 @ correction if t2 is negative 1446 mov r9,r8,asr#31 @ sign extend 1447 smlal r8,r9,r4,r1 @ t2*y0h 1448@ r8:r9 (t2*y0)<<11 1449 1450 umull r5,r10,r4,r6 @ t2*x0l 1451 it mi 1452 submi r10,r10,r6 @ correction if t2 is negative 1453 mov r11,r10,asr#31 @ sign extend 1454 smlal r10,r11,r4,r7 1455@ r10:r11 (t2*x0)<<11 1456 1457 adds r6,r8 1458 adc r7,r7,r9 1459 subs r0,r10 1460 sbc r1,r1,r11 1461@ r0:r1 y1=y0-t2*x0 1462@ r2:r3 phi0+phi1 1463@ r6:r7 x1=x0+t2*y0 1464 1465 mov r4,#0xffffffff 1466 lsrs r5,r7,#14 1467 udiv r4,r4,r5 @ rx1 Q16 1468 lsrs r5,r0,#11 1469 orrs r5,r5,r1,lsl#21 @ N set according to y1, hence also t3 1470 smmul r5,r4,r5 @ t3=(y1>>11)*rx1 Q35 1471 lsr r6,r6,#3 1472 orr r6,r6,r7,lsl#29 1473 umull r11,r8,r5,r6 @ t3*x1l 1474 lsr r10,r7,#3 1475 it mi 1476 submi r8,r8,r6 @ correction if t3 is negative 1477 mla r8,r5,r10,r8 1478 adds r2,r2,r5,lsl#28 1479 adc r3,r3,r5,asr#4 1480 sub r0,r0,r8 1481@ r0: y2 1482@ r2:r3 phi0+phi1+phi2 1483@ r4: rx1 1484@ r5: t3 1485 1486 smull r8,r9,r0,r4 @ y2*rx1 1487@ stall 1488 lsrs r8,#14 1489 orr r8,r8,r9,lsl#18 @ t4 1490 smmlsr r0,r8,r7,r0 1491 adds r2,r2,r8,asr#1 1492 adc r3,r3,r8,asr#31 1493@ r0: y3 1494@ r4: rx1 1495 mul r4,r4,r0 1496 adds r0,r2,r4,asr#15 1497 adc r1,r3,r4,asr#31 1498@ r0:r1 result over reduced range Q63 1499 pop {r7} @ restore flags 150040: 1501 lsrs r1,#1 1502 rrxs r0,r0 1503@ r0:r1 result over reduced range Q62 1504 lsl r6,r7,#30 @ b1 -> b31 1505 eor r0,r0,r6,asr#31 @ negate if required 1506 eor r1,r1,r6,asr#31 1507 movlong r2,0x10B4611A @ π/2 Q62 low word 1508 movlong r3,0x6487ED51 @ π/2 Q62 high word 1509 lsr r6,r7,#2 @ quadrants to add 1510 umlal r0,r1,r6,r2 1511 mla r1,r6,r3,r1 1512 mov r4,#0x400-12 @ for packing Q62 151360: 1514 bl dpack_q 1515 eors r1,r1,r7,lsl#31 1516 pop {r4-r11,r15} 1517 1518#endif 1519