1/* 2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd. 3 * 4 * SPDX-License-Identifier: BSD-3-Clause 5 */ 6 7#if !PICO_RP2040 8#include "pico/asm_helper.S" 9 10pico_default_asm_setup 11 12.macro float_section name 13#if PICO_FLOAT_IN_RAM 14.section RAM_SECTION_NAME(\name), "ax" 15#else 16.section SECTION_NAME(\name), "ax" 17#endif 18.endm 19 20.macro float_wrapper_section func 21float_section WRAPPER_FUNC_NAME(\func) 22.endm 23 24.extern rtwopi 25.extern logtab0 26.extern exptab0 27.extern exptab1 28.extern exptab2 29.extern trigtab 30 31@ load a 32-bit constant n into register rx 32.macro movlong rx,n 33 movw \rx,#(\n)&0xffff 34 movt \rx,#((\n)>>16)&0xffff 35.endm 36 37float_section frrcore 38 39@ input: 40@ r0 mantissa m Q23 41@ r1 exponent e>=-32, typically offset by +9 42@ output: 43@ r0..r1 preserved 44@ r6 range reduced result 45@ r2,r3,r4,r5 trashed 46frr_core: 47 ldr r2,=rtwopi 48 asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32 49 add r2,r2,r3,lsl#2 @ p 50 and r3,r1,#31 @ s=e%32 51 mov r4,#1 52 lsls r4,r4,r3 @ 1<<s 53 umull r3,r4,r4,r0 54@ r2 p 55@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23 56 ldr r5,[r2,#12] @ a0=p[3] 57 umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo 58@ r6 r0 59 ldr r5,[r2,#8] @ a1=p[2] 60 mla r6,r5,r4,r6 @ a1*u1 lo, discard hi 61 umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo 62@ r6 r0 63 ldr r5,[r2,#4] @ a2=p[1] 64 mla r6,r5,r3,r6 @ r0+=a2*u0 65 bx r14 66 67 68float_wrapper_section expf 69 702: 71@ we could use fadd macro to calculate x+1 here 72@ need to add quadratic term 73 rsb r3,#1 @ 1-(e+9) 74 lsrs r0,r3 @ Q31 75 smmulr r1,r0,r0 @ Q30 76 cmp r12,#0 77 bne 1f 78 add r0,r0,r1 79 lsrs r0,#8 80 adc r0,r0,#0x3f800000 @ result is just over 1 81 bx r14 821: 83 sub r0,r1,r0 84 asrs r0,#7 85 adc r0,r0,#0x3f800000 @ result is just under 1 86 bx r14 87 887: 89 movs r1,#1 90 bfi r0,r1,#23,#9 @ implied 1, clear sign bit 91 adds r3,#9 92 bmi 2b @ <~2^-9? return 1+x+x²/2 93 movlong r1,0xb17217f8 @ ln2 Q32 94 lsls r0,r3 @ Q32 95 cmp r12,#0 96 bne 8f 97 cmp r0,r1 98 blo 6f 99 sub r0,r0,r1 100 mov r12,#1 101 b 6f 102 1038: 104 mvn r12,#0 105 subs r0,r1,r0 106 bhs 6f 107 add r0,r0,r1 108 mvn r12,#1 109 b 6f 110 111wrapper_func expf 112 ands r12,r0,#1<<31 @ save sign 113 ubfx r3,r0,#23,#8 @ get exponent 114 subs r3,r3,#0x7f 115 bmi 7b @ e<0? 116 cmp r3,#7 117 bge 20f @ overflow, Inf or NaN? 118 movs r1,#1 119 bfi r0,r1,#23,#9 @ implied 1, clear sign bit 120 lsls r0,r3 @ x Q23 121 eors r0,r0,r12,asr#31 122 adds r0,r0,r12,lsr#31 @ negate if sign bit set 123 mov r1,#0xB8AA @ 1/ln2 Q15, rounded down 124 add r1,r1,r12,lsr#31 @ rounded up if we have a negative argument 125 smull r1,r2,r1,r0 @ Q38 126 movlong r1,0xb17217f8 @ ln2 Q32 127 asr r12,r2,#6 @ Q0 exponent candidate 128 lsls r0,#9 @ Q32 129 mls r0,r12,r1,r0 @ Q32-Q0*Q32=Q32 130 1316: 132 lsrs r1,r0,#28 133 ldr r2,=exptab0 134 add r2,r2,r1,lsl#3 135 ldmia r2,{r2-r3} 136 and r2,r2,#63 137 orr r2,#64 @ y=(t&0x3f)+0x40; Q6 138 sub r0,r3 139 140 lsrs r1,r0,#24 141 ldr r3,=exptab1+4 142 ldr r3,[r3,r1,lsl#3] 143 add r1,#256 144 muls r2,r2,r1 @ y Q14 145 sub r0,r3 146 147 lsrs r1,r0,#21 148 ldr r3,=exptab2+4 149 ldr r3,[r3,r1,lsl#3] 150 add r1,r1,r1 151 add r1,#4096 152 mla r2,r2,r1,r2 @ y Q26 153 sub r0,r3 @ ε Q32 154 smmlar r2,r2,r0,r2 @ y(1+ε) Q26 155 156 cmp r2,#1<<27 157 bhs 1f 1582: 159 cmp r12,#0x7f 160 bgt 23f 161 cmn r12,#0x7e 162 blt 23f 163 lsls r0,r12,#23 164 adds r0,r0,r2,lsr#3 165 adc r0,r0,#0x3f000000 166 bx r14 167 168@ mantissa calculation has overflowed 1691: 170 lsrs r2,#1 171 adds r12,#1 172 b 2b 173 17420: 175@ process Inf/NaN for fexp 176 cmp r3,#0x80 177 bne 22f 178 lsls r2,r0,#9 179 ite eq 180 biceq r0,r0,r0,asr#31 @ +Inf -> + Inf; -Inf -> +0 181 orrne r0,r0,#0x00400000 182 bx r14 183 18423: 185 ands r12,r12,#1<<31 18622: 187 eors r0,r12,#1<<31 188 subs r0,r0,r0,lsr#8 @ overflow: convert to +0 or +Inf 189 bx r14 190 191 192 193 194float_wrapper_section logf 195 1961: 197 movlong r0,0xff800000 @ return -Inf 198 bx r14 199 2004: 201 lsls r1,r0,#9 202 it ne 203 orrne r0,r0,#0x00400000 204 bx r14 205 206@ here result may be close to zero 2075: 208 sub r1,r0,#0x3f800000 209 bmi 7f 210 cmp r1,#0x8000 211 bge 6f @ |ε|>~2^-8? 212@ here ε positive 213 clz r12,r1 214 sub r3,r12,#1 215 b 8f 216 2177: 218 rsbs r2,r1,#0 219 cmp r2,#0x8000 220 bge 6f @ |ε|>~2^-8? 221@ here ε negative 222@ r1: x Q24 223 clz r12,r2 224 sub r3,r12,#2 2258: 226 sub r12,#10 227 lsls r0,r1,r3 @ ε Q24+r3 228 beq 10f @ ε=0? 229 smmulr r1,r0,r0 230 asrs r1,r1,r12 231 sub r0,r0,r1,asr#1 @ - ε²/2 232 smmulr r1,r1,r0 233 asrs r2,r1,r12 234 movs r3,#0x55 235 mul r2,r2,r3 236 adds r0,r0,r2,asr#8 @ + ~ ε³/3 237 rsb r1,r12,#117 238 b 9f 239 240wrapper_func logf 241 lsls r3,r0,#1 242 bcs 1b @ x<0? 243 lsrs r3,#24 244 beq 1b @ x==0/denormal? 245 sub r12,r3,#0x7e 246 cmp r12,#0x81 @ +Inf/NaN? 247 beq 4b 248 push {r14} 249 cmp r12,#1 @ result will be near zero? 250 bls 5b 2516: 252 movs r2,#1 253 bfi r0,r2,#23,#9 @ set implied 1, clear exponent Q52 254 255 lsls r0,#8 256 257 lsrs r1,r0,#27 258 ldr r2,=logtab0-16*8 259 add r2,r2,r1,lsl#3 260 ldmia r2,{r2-r3} 261 lsls r2,#26 262 umlal r2,r0,r2,r0 263 264 mvn r1,r0,asr#24 265 ldr r2,=exptab1+4 266 ldr r2,[r2,r1,lsl#3] 267 add r3,r3,r2 268 lsls r1,#24 269 umlal r1,r0,r1,r0 270 271 ldr r2,=exptab2+4 272 mvn r1,r0,asr#21 273 ldr r2,[r2,r1,lsl#3] 274 lsls r1,#21 275 orr r1,#0x00100000 276 umlal r1,r0,r1,r0 277 add r3,r3,r2 278 279@ r0: ε Q32 280@ r3: log y 281 smmulr r1,r0,r0 @ ε² Q32 282 sub r3,r0 283 add r3,r3,r1,lsr#1 @ log u - ε + ε²/2 Q32 284 movlong r2,0xb17218 @ ln2 Q24, but accurate to 2^-29 or so 285 cmp r12,#1 @ result will be near zero? 286 bls 2f 287 mul r2,r2,r12 288 mov r1,#125 289 add r3,#255 @ reduce systematic error 290 subs r0,r2,r3,lsr#8 2919: 292 bmi 1f 293 bl fpack_q 29410: 295 pop {r15} 296 2972: 298 mov r1,#117 299 bne 3f @ r12=0? 300@ here r12=1 301 movlong r2,0xb17217f8 302 sub r0,r2,r3 303 bl fpack_q 304 pop {r15} 305 3063: 307 mov r0,r3 308 bl fpack_q 309 orrs r0,r0,#1<<31 310 pop {r15} 311 3121: 313 rsbs r0,#0 314 bl fpack_q 315 orrs r0,r0,#1<<31 316 pop {r15} 317 318 319float_wrapper_section atan2f 320 321@ fatan small reduced angle case 322@ r0 y/x in IEEE format, 0..2^-8 323@ r2 e+6 <0 324@ r3 #1 325@ r6 flags 32620: 327 rsbs r4,r2,#0 @ -e-6 = shift down of mantissa required to get to Q29 >0 328 cmp r4,#7 @ -e-6 ≥ 7 [ e ≤ -13 ] 329 bge 1f @ very small reduced angle? atn θ ~ θ 330@ do a power series 331 bfi r0,r3,#23,#9 @ fix up mantissa Q23-e 332 lsls r0,#7 @ Q30-e 333 smmulr r1,r0,r0 @ θ² Q60-2e-Q32=Q28-2e 334 lsrs r1,r4 @ Q28-2e + (e+6) = Q34-e 335 smmulr r1,r1,r0 @ θ³ Q34-e+Q30-e-Q32=Q32-2e 336 mov r3,#0x55555555 @ 1/3 Q32 337 lsrs r1,r4 @ Q32-2e + e+6 = Q38-e 338 smmulr r1,r1,r3 @ θ³/3 Q38-e 339 sub r0,r0,r1,lsr#8 @ Q30-e 340 cmp r6,#4 @ at least one quadrant to add? 341 bhs 2f 342 add r1,r2,#113 343 bl fpack_q 344 eors r0,r0,r6,lsl#31 @ fix sign 345 pop {r4-r6,r15} 346 347@ here reduced angle is < 2^-12 3481: 349 cmp r6,#4 350 bhs 3f @ at least one quadrant to add? 351 eors r0,r0,r6,lsl#31 @ no: return y/x with the correct sign 352 pop {r4-r6,r15} 353 3543: 355 bfi r0,r3,#23,#9 @ fix up mantissa 356 lsrs r0,r4 @ Q29 357 lsls r0,#1 @ Q30 358 b 40f 359 3602: 361 lsrs r0,r4 @ Q30-e + e+6 = Q36 362 lsrs r0,#6 @ Q30 363 b 40f 364 365@ case where reduced (x',y') has x' infinite 36671: 367 sbfx r4,r0,#23,#8 368 movs r0,#0 369 cmn r4,#1 @ y' also infinite? 370 bne 80f 371 mov r0,#0x3f800000 @ both infinite: pretend ∞/∞=1 372 b 80f 373 374@ case where reduced (x',y') has y' zero 37570: 376 ubfx r4,r1,#23,#8 377 movs r0,#0 378 cbnz r4,80f @ x' also zero? 379 tst r6,#4 380 beq 80f @ already in quadrants 0/±2? then 0/0 result will be correct 381 tst r6,#2 382 ite eq 383 addeq r6,#6 384 subne r6,#6 @ fix up when in quadrants ±0 385 b 80f 386 38790: 388 movs r0,r1 38991: 390 orrs r0,r0,#0x00400000 391 pop {r4-r6,r15} 392 393wrapper_func atan2f 394 push {r4-r6,r14} 395 lsls r4,r1,#1 396 cmp r4,#0xff000000 397 bhi 90b @ y NaN? 398 lsls r4,r0,#1 399 cmp r4,#0xff000000 400 bhi 91b @ x NaN? 401 lsrs r6,r0,#31 @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition 402 bic r0,#1<<31 403@ y now positive 404 movs r1,r1 405 bpl 1f 406 407@ here y positive, x negative 408 adds r6,#10 409 bic r1,r1,#1<<31 410 cmp r1,r0 411 bhi 4f @ |x| > y: 3rd octant 412@ |x| < y: 2nd octant 413 subs r6,#6 414 b 3f 415 4161: 417@ here x and y positive 418 cmp r1,r0 419 bhs 4f 420@ x < y: 1st octant 421 adds r6,#6 4223: 423 movs r2,r1 @ exchange x and y 424 movs r1,r0 425 movs r0,r2 4264: 427@ here 428@ r0 y' 429@ r1 x' 430@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is 431@ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r6b3..2, the inner negation 432@ is given by r6b1 and the outer negation by r6b0. x' can be infinite, or both x' and 433@ y' can be infinite, but not y' alone. 434 435 sbfx r2,r1,#23,#8 436 cmn r2,#1 437 beq 71b @ x' infinite? 438 ubfx r2,r0,#23,#8 439 cmp r2,#0 440 beq 70b @ y' zero? 441 bl __aeabi_fdiv 44280: 443 @ r0 y/x in IEEE format, 0..1 444 lsr r2,r0,#23 @ exponent 445 movs r3,#1 446 subs r2,#0x7f-6 447 bmi 20b 448 bfi r0,r3,#23,#9 @ fix up mantissa 449 lsls r0,r2 450 lsls r0,#2 45150: 452 453@ from here atan2(y,1) where 1 implied 454@ r0 y Q31 0≤y<1+δ 455 456 lsrs r2,r0,#16 457 mul r3,r2,r2 @ y^2 458 movw r4,#0x895c 459 muls r2,r2,r4 @ y*0x895c 460 movw r5,#0x1227 461 lsrs r3,#14 462 mls r2,r3,r5,r2 463 subs r2,#0x330000 @ Chebyshev approximation to atn(y) 464 lsrs r2,#25 465 ldr r3,=trigtab+4 466 add r3,r3,r2,lsl#4 467 ldmia r3,{r3-r5} 468@ r0 y Q30 469@ r3 phi0 Q32 470@ r4 sphi0 Q31 471@ r5 cphi0 Q31 472@ r6 flags 473@ x0= cphi0 + sphi0*y 474@ y0=-sphi0 + cphi0*y 475 umull r12,r1,r0,r4 @ sphi0*y 476 umull r12,r2,r0,r5 @ cphi0*y 477 add r1,r1,r5,lsr#1 @ x0 Q30 478 sub r2,r2,r4,lsr#1 @ y0 Q30 47911: 480@ r1 x0 Q30 481@ r2 y0 Q30 482@ r3 phi0 Q32 483@ r6 flags 484 mov r0,#0xffffffff 485 lsrs r4,r1,#15 486 udiv r12,r0,r4 @ rx0 Q17 487 lsls r4,r2,#6 @ y0 Q36 488 smmul r4,r4,r12 @ t2=y0*rx0 Q21 489 lsls r5,r4,#11 @ t2 Q32 490 smmlar r0,r5,r2,r1 491 smmlsr r1,r5,r1,r2 492@ r0 x1 Q30 493@ r1 y1 Q30 494@ r3 phi0 495@ r4 r2 Q21 496@ r12 rx0 Q17 497 mul r5,r4,r4 @ t2_2 Q42 498 smull r2,r5,r4,r5 @ t2_3 Q63 499 add r3,r3,r4,lsl#11 @ Q32 500 lsls r5,#1 @ Q32 501 mov r2,#0x55555555 @ 1/3 502 smmlsr r3,r2,r5,r3 @ t2_3/3 503 504@ r0 x1 Q30 505@ r1 y1 Q30 506@ r3 phi0+phi1 Q32 507@ r12 rx0 Q17 508 mul r0,r1,r12 @ y1*rx0 Q30+Q17=Q47 509 add r3,r3,r0,asr#15 510@ r3 phi0+phi1+phi2, result over reduced range Q32 511@ r6 flags 512 513 lsrs r0,r3,#2 @ Q30 514@ adc r0,#0 @ rounding 51540: 516 lsl r5,r6,#30 @ b1 -> b31 517 eor r0,r0,r5,asr#31 @ negate if required 518 movlong r1,0x6487ED51 @ π/2 Q30 519 520 lsr r5,r6,#2 @ quadrants to add 521 mla r0,r5,r1,r0 522 mov r1,#0x80-9 @ for packing Q30 52360: 524 bl fpack_q 525 eors r0,r0,r6,lsl#31 526 pop {r4-r6,r15} 527 528@======================================= 529 530float_wrapper_section fpack 531 532@ fnegpack: negate and pack 533@ fpack_q31: 534@ input 535@ r0 Q31 result, must not be zero 536@ r1 exponent offset [fpack_q only] 537@ output 538@ r0 IEEE single 539@ trashes (r1,)r2 540fnegpack_q31: 541 rsbs r0,r0,#0 542fpack_q31: 543 mov r1,#0x7f-9 @ exponent 544fpack_q: 545 clz r2,r0 546 rsbs r2,#8 547 bmi 1f 548 lsrs r0,r0,r2 @ save rounding bit in carry 549 add r2,r2,r1 550 adc r0,r0,r2,lsl#23 @ insert exponent 551 bx r14 552 5531: 554 rsb r2,#0 555 lsls r0,r0,r2 556 sub r2,r1,r2 557 add r0,r0,r2,lsl#23 558 bx r14 559 560float_wrapper_section tanf 561 562wrapper_func tanf 563 push {r14} 564 bl sincosf_raw 565 bl __aeabi_fdiv 566 pop {r15} 567 568float_wrapper_section fsin_fcos 569 57010: @ process Inf/NaN for sinf and fcos 571 lsls r1,r0,#9 572 it eq 573 orreq r0,#0x80000000 574 orr r0,#0x00400000 @ turn Inf into NaN 575 movs r1,r0 @ copy result to cosine output 576 bx r14 577 578@ case where angle is very small (<2^-32) before reduction 57932: 580 adds r1,r1,#0x7f 581 it eq 582 moveq r0,#0 @ flush denormal to zero 583 584 movs r1,0x3f800000 @ return x for sine, 1 for cosine 585 tst r12,#2 586 it ne 587 movne r0,r1 @ calculating cosine? move to result registers 588 bx r14 589 59040: 591@ case where range-reduced angle is very small 592@ here 593@ r0 mantissa 594@ r1 exponent+9 595@ r6 abs range-reduced angle / 2π < 2^-7 Q32 596@ r12b31: dsincos flag 597@ r12b30: original argument sign 598@ r12b2..1: quadrant count 599@ r12b0: sign of reduced angle 600 push {r12} 601 movs r12,#0 6022: 603 add r12,#2 604 add r1,#2 605 bl frr_core @ repeat range reduction with extra factor of 2^2 (, 2^4, 2^6, 2^8,...) 606 eors r4,r6,r6,asr#31 @ we want to keep the sign in r6b31 for later 607 cmp r4,#1<<26 @ ≥ 2^-6? 608 bhs 1f @ loop until the result is big enough 609 cmp r12,#32 @ safety net 610 bne 2b 6111: 612@ here r4 is the abs range-reduced angle Q32+r12, 2^-6..2^-4 in Q32 terms 613 614@ 2π=6.487ED511 (0B4611A6...) 615 movlong r5,0x487ED511 @ 2π Q64 high fractional word 616 umull r2,r0,r4,r5 617 movs r5,#6 @ 2π integer part 618 mla r0,r4,r5,r0 619 620@ here 621@ r0 θ, abs range reduced angle θ 0..π/4 Q32+r12, 2π * 2^-6..2^-4 in Q32 terms (so top bit is clear) 622@ r6b31: sign of reduced angle 623@ r12: excess exponent ≥2, multiple of 2 624@ r0 / 2^r12 < ~ 2π * 2^-7 so for sin we need to go to term in x^3 625 626 smmulr r1,r0,r0 @ θ² Q32+2*r12 627 lsrs r1,r1,r12 @ θ² Q32+r12 628 lsrs r1,r1,r12 @ θ² Q32 629 movs r4,#0x55555555 @ 1/3 Q32 630 smmulr r4,r1,r4 @ θ²/3 Q32 631 smmulr r2,r4,r1 @ θ⁴/3 Q32 632 rsb r3,r1,r2,lsr#2 @ -θ²+θ⁴/12) Q32 633 asrs r3,#9 @ -θ²/2+θ⁴/24 Q24 634 adc r3,r3,#0x3f800000 @ IEEE single with rounding 635 636 smmulr r2,r4,r0 @ θ³/3 Q32+r12 637 sub r0,r0,r2,lsr#1 @ θ-θ³/6 Q32+r12 638 rsb r1,r12,#117 639 bl fpack_q 640@ here 641@ r0 packed sin 642@ r3 packed cos 643@ r6b31: sign of reduced angle 644 pop {r12} @ get flags 645 lsrs r6,#31 646 bfi r12,r6,#0,#1 647 648 asrs r2,r12,#1 649 bmi 23f @ doing sincos? 650 asrs r2,#1 651 bcc 21f @ need sine? 652@ need cosine: 653 ands r12,#4 654 orrs r0,r3,r12,lsl#29 @ insert sign 655 pop {r4-r6,r15} 656 65721: 658 eors r12,r12,r12,lsr#2 659 orrs r0,r0,r12,lsl#31 @ insert sign 660 pop {r4-r6,r15} 661 66223: 663 ands r2,r12,#4 664 orrs r3,r3,r2,lsl#29 @ insert sign 665 push {r3} 666@ drop into sincosf code below... 667 b 20f 668 669wrapper_func sincosf 670 push {r1, r2, lr} 671 bl sincosf_raw 672 pop {r2, r3} 673 str r0, [r2] 674 str r1, [r3] 675 pop {pc} 676 677sincosf_raw: 678 ands r12,r0,#1<<31 679 lsrs r12,#1 @ save argument sign in r12b30 680 orrs r12,r12,#1<<31 @ flag we want both results in r12b31 681 b 1f 682 683wrapper_func sinf 684 lsrs r12,r0,#29 @ negative argument -> +2 quadrants 685 ands r12,#4 686 b 1f 687 688wrapper_func cosf 689 movs r12,#2 @ cos -> +1 quadrant 6901: 691 ubfx r1,r0,#23,#8 @ get exponent 692 sub r1,r1,#0x7f 693 cmp r1,#0x80 694 beq 10b @ Inf or NaN? 695 cmn r1,#32 696 blt 32b @ very small argument? 697 movs r3,#1 698 bfi r0,r3,#23,#9 @ fix implied 1 in mantissa 699 push {r4-r6,r14} 700 add r1,#9 @ e+9 701 bl frr_core 702@ r6 θ/2π 0..1 Q64 703 lsrs r4,r6,#30 @ quadrant count 704 adc r4,r4,#0 @ rounded 705 sub r6,r6,r4,lsl#30 @ now -0.125≤r6<+0.125 Q32 706 add r12,r12,r4,lsl#1 707 orr r12,r12,r6,lsr#31 @ sign into r12b0 708@ r12b2..1: quadrant count 709@ r12b0: sign of reduced angle 710 eors r6,r6,r6,asr#31 @ absolute value of reduced angle 0≤r7<0.125 Q32 711 cmp r6,#1<<25 @ θ / 2π < 2^-7? 712 blo 40b 713 714@ 2π=6.487ED511 (0B4611A6...) 715 movlong r5,0x487ED511 @ 2π Q64 high fractional word 716 umull r2,r0,r6,r5 717 movs r5,#6 @ 2π integer part 718 mla r0,r6,r5,r0 719@ r0 range reduced angle θ 0..π/4 Q32 720 lsrs r2,r0,#27 721 ldr r3,=trigtab+4 722 add r3,r3,r2,lsl#4 723 ldmia r3,{r1-r3} 724 subs r0,r1 725@ r0: ε Q32 |ε| < 1.17*2^-6 726@ r2: sin φ Q31 727@ r3: cos φ Q31 728 asrs r1,r12,#1 729 bmi 5f @ doing sincosf? 730 asrs r1,#1 731 bcs 3f @ need cosine? 732@ here need sine 733 smmulr r1,r0,r0 @ ε² Q32 734 mov r4,#0x55555555 @ 1/3 Q32 735 asrs r1,#1 736 smmlsr r2,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε 737 smmulr r1,r1,r0 @ ε³/2 738 smmlsr r1,r1,r4,r0 @ ε - ε³/6 739 740 smmlar r0,r1,r3,r2 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε) 741@ the sign of the result is r12b0^r12b2 742 bl fpack_q31 743 eors r12,r12,r12,lsr#2 744 orrs r0,r0,r12,lsl#31 @ insert sign 745 pop {r4-r6,r15} 746 7473: 748@ here need cosine 749 smmulr r1,r0,r0 @ ε² Q32 750 mov r4,#0x55555555 @ 1/3 Q32 751 asrs r1,#1 752 smmlsr r3,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε 753 smmulr r1,r1,r0 @ ε³/2 754 smmlsr r1,r1,r4,r0 @ ε - ε³/6 755 756 smmlsr r0,r1,r2,r3 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε) 757@ the sign of the result is r12b2 758 bl fpack_q31 759 ands r12,#4 760 orrs r0,r0,r12,lsl#29 @ insert sign 761 pop {r4-r6,r15} 762 7635: 764@ here doing sincosf 765 smmulr r1,r0,r0 @ ε² Q32 766 mov r6,#0x55555555 @ 1/3 Q32 767 asrs r1,#1 768 smmlsr r4,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε 769 smmlsr r5,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε 770 smmulr r1,r1,r0 @ ε³/2 771 smmlsr r6,r1,r6,r0 @ ε - ε³/6 772@ here 773@ r2 sin φ 774@ r3 cos φ 775@ r4 sin φ cos ε 776@ r5 cos φ cos ε 777@ r6 ε - ε³/6 ~ sin ε 778 smmlsr r0,r6,r2,r5 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε) 779 bl fpack_q31 780 ands r5,r12,#4 781 eors r0,r0,r5,lsl#29 @ negate cosine in quadrants 2 and 3 782 push {r0} 783 smmlar r0,r6,r3,r4 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε) 784 bl fpack_q31 78520: 786 eors r4,r12,r12,lsr#1 787 eors r4,r4,r12,lsr#2 788 ands r5,r12,#1<<30 789 tst r12,#2 @ exchange sine and cosine in odd quadrants 790 beq 1f 791 eors r1,r0,r4,lsl#31 @ negate sine on b0^b1^b2 792 pop {r0} 793 eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative 794 pop {r4-r6,r15} 7951: 796 pop {r1} 797 eors r0,r0,r4,lsl#31 @ negate sine on b0^b1^b2 798 eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative 799 pop {r4-r6,r15} 800 801#endif 802