/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"

pico_default_asm_setup

@ Place the named function either in RAM or flash depending on build config.
@ regular_func / SECTION_NAME / RAM_SECTION_NAME come from pico/asm_helper.S.
.macro double_section name
// todo separate flag for shims?
#if PICO_DOUBLE_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

@ On first use of a shimmed double function: scan forward from the return
@ address (passed in ip) for a 16-bit marker word whose top byte is 0xdf; its
@ low byte is the table offset. The (possibly unaligned) 32-bit word following
@ the marker is the real function address: patch it into sd_table and also
@ overwrite the saved lr on the stack so the final pop tail-calls it.
@ NOTE(review): sd_table is defined elsewhere (the double function pointer
@ table) — confirm against the rest of pico_double.
double_section double_table_shim_on_use_helper
regular_func double_table_shim_on_use_helper
    push {r0-r2, lr}
    mov r0, ip
#ifndef NDEBUG
    // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
    cmp r0, #0
    bne 1f
    bkpt #0
#endif
1:
    ldrh r1, [r0]               @ scan halfwords for the 0xdfXX marker
    lsrs r2, r1, #8
    adds r0, #2
    cmp r2, #0xdf
    bne 1b
    uxtb r1, r1 // r1 holds table offset
    lsrs r2, r0, #2             @ bit 1 of the address into carry
    bcc 1f
    // unaligned
    ldrh r2, [r0, #0]
    ldrh r0, [r0, #2]
    lsls r0, #16
    orrs r0, r2
    b 2f
1:
    ldr r0, [r0]
2:
    ldr r2, =sd_table
    str r0, [r2, r1]            @ patch table entry
    str r0, [sp, #12]           @ replace stacked lr so pop below jumps to the real function
    pop {r0-r2, pc}

#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
// Note that the V1 ROM has no double support, so this is basically the identical
// library, and shim inter-function calls do not bother to redirect back thru the
// wrapper functions

.equ use_hw_div,1
.equ IOPORT       ,0xd0000000
.equ DIV_UDIVIDEND,0x00000060
.equ DIV_UDIVISOR ,0x00000064
.equ DIV_QUOTIENT ,0x00000070
.equ DIV_CSR      ,0x00000078

@ Notation:
@ rx:ry means the concatenation of rx and ry with rx having the less significant bits

.equ debug,0
@ Debug-only register dump; compiles to nothing when debug==0.
@ NOTE(review): osp/o1ch/dump/o8hex/onl are external debug output routines — confirm.
.macro mdump k
.if debug
    push {r0-r3}
    push {r14}
    push {r0-r3}
    bl osp
    movs r0,#\k
    bl o1ch
    pop {r0-r3}
    bl dump
    bl osp
    bl osp
    ldr r0,[r13]
    bl o8hex @ r14
    bl onl
    pop {r0}
    mov r14,r0
    pop {r0-r3}
.endif
.endm


@ IEEE double in ra:rb ->
@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set
@ exponent in re
@ sign in rs
@ trashes rt
.macro mdunpack ra,rb,re,rs,rt
    lsrs \re,\rb,#20 @ extract sign and exponent
    subs \rs,\re,#1
    lsls \rs,#20
    subs \rb,\rs @ clear sign and exponent in mantissa; insert implied 1
    lsrs \rs,\re,#11 @ sign
    lsls \re,#21
    lsrs \re,#21 @ exponent
    beq l\@_1 @ zero exponent?
    adds \rt,\re,#1
    lsrs \rt,#11
    beq l\@_2 @ exponent != 0x7ff? then done
l\@_1:
    @ here zero/denormal or Inf/NaN: substitute a +1 mantissa and an extreme exponent
    movs \ra,#0
    movs \rb,#1
    lsls \rb,#20
    subs \re,#128
    lsls \re,#12
l\@_2:
.endm

@ IEEE double in ra:rb ->
@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1
@ exponent in re
@ trashes rt0 and rt1
@ +zero, +denormal -> exponent=-0x80000
@ -zero, -denormal -> exponent=-0x80000
@ +Inf, +NaN -> exponent=+0x77f000
@ -Inf, -NaN -> exponent=+0x77e000
.macro mdunpacks ra,rb,re,rt0,rt1
    lsrs \re,\rb,#20 @ extract sign and exponent
    lsrs \rt1,\rb,#31 @ sign only
    subs \rt0,\re,#1
    lsls \rt0,#20
    subs \rb,\rt0 @ clear sign and exponent in mantissa; insert implied 1
    lsls \re,#21
    bcc l\@_1 @ skip on positive
    mvns \rb,\rb @ negate mantissa
    negs \ra,\ra
    bcc l\@_1
    adds \rb,#1
l\@_1:
    lsrs \re,#21
    beq l\@_2 @ zero exponent?
    adds \rt0,\re,#1
    lsrs \rt0,#11
    beq l\@_3 @ exponent != 0x7ff? then done
    subs \re,\rt1 @ Inf/NaN: make -Inf exponent one less than +Inf
l\@_2:
    movs \ra,#0
    lsls \rt1,#1 @ +ve: 0 -ve: 2
    adds \rb,\rt1,#1 @ +ve: 1 -ve: 3
    lsls \rb,#30 @ create +/-1 mantissa
    asrs \rb,#10
    subs \re,#128
    lsls \re,#12
l\@_3:
.endm

double_section WRAPPER_FUNC_NAME(__aeabi_dsub)

# frsub first because it is the only one that needs alignment
@ reverse subtract: swap the two packed double arguments then fall into dsub
regular_func drsub_shim
    push {r0-r3}
    pop {r0-r1}
    pop {r2-r3}
    // fall thru

@ double subtraction: (r0:r1) - (r2:r3) -> r0:r1 (packed IEEE doubles)
regular_func dsub_shim
    push {r4-r7,r14}
    movs r4,#1
    lsls r4,#31
    eors r3,r4 @ flip sign on second argument
    b da_entry @ continue in dadd

.align 2
@ double addition: (r0:r1) + (r2:r3) -> r0:r1 (packed IEEE doubles)
double_section dadd_shim
regular_func dadd_shim
    push {r4-r7,r14}
da_entry:
    mdunpacks r0,r1,r4,r6,r7
    mdunpacks r2,r3,r5,r6,r7
    subs r7,r5,r4 @ ye-xe
    subs r6,r4,r5 @ xe-ye
    bmi da_ygtx
@ here xe>=ye: need to shift y down r6 places
    mov r12,r4 @ save exponent
    cmp r6,#32
    bge da_xrgty @ xe rather greater than ye?
    adds r7,#32
    movs r4,r2
    lsls r4,r4,r7 @ rounding bit + sticky bits
da_xgty0:
    movs r5,r3
    lsls r5,r5,r7
    lsrs r2,r6
    asrs r3,r6
    orrs r2,r5
da_add:
    adds r0,r2
    adcs r1,r3
da_pack:
@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4
@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we
@ cannot have not gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4
@ to provide a correct continuation of the exact result.
@ now pack result back up
    lsrs r3,r1,#31 @ get sign bit
    beq 1f @ skip on positive
    mvns r1,r1 @ negate mantissa
    mvns r0,r0
    movs r2,#0
    negs r4,r4 @ carry out feeds the 65-bit negation below
    adcs r0,r2
    adcs r1,r2
1:
    mov r2,r12 @ get exponent
    lsrs r5,r1,#21
    bne da_0 @ shift down required?
    lsrs r5,r1,#20
    bne da_1 @ normalised?
    cmp r0,#0
    beq da_5 @ could mantissa be zero?
da_2:
    @ normalisation loop: shift mantissa (and sticky bits) up until bit 52 set
    adds r4,r4
    adcs r0,r0
    adcs r1,r1
    subs r2,#1 @ adjust exponent
    lsrs r5,r1,#20
    beq da_2
da_1:
    lsls r4,#1 @ check rounding bit
    bcc da_3
da_4:
    adds r0,#1 @ round up
    bcc 2f
    adds r1,#1
2:
    cmp r4,#0 @ sticky bits zero?
    bne da_3
    lsrs r0,#1 @ round to even
    lsls r0,#1
da_3:
    subs r2,#1
    bmi da_6
    adds r4,r2,#2 @ check if exponent is overflowing
    lsrs r4,#11
    bne da_7
    lsls r2,#20 @ pack exponent and sign
    add r1,r2
    lsls r3,#31
    add r1,r3
    pop {r4-r7,r15}

da_7:
@ here exponent overflow: return signed infinity
    lsls r1,r3,#31
    ldr r3,=0x7ff00000
    orrs r1,r3
    b 1f
da_6:
@ here exponent underflow: return signed zero
    lsls r1,r3,#31
1:
    movs r0,#0
    pop {r4-r7,r15}

da_5:
@ here mantissa could be zero
    cmp r1,#0
    bne da_2
    cmp r4,#0
    bne da_2
@ inputs must have been of identical magnitude and opposite sign, so return +0
    pop {r4-r7,r15}

da_0:
@ here a shift down by one place is required for normalisation
    adds r2,#1 @ adjust exponent
    lsls r6,r0,#31 @ save rounding bit
    lsrs r0,#1
    lsls r5,r1,#31
    orrs r0,r5
    lsrs r1,#1
    cmp r6,#0
    beq da_3
    b da_4

da_xrgty: @ xe>ye and shift>=32 places
    cmp r6,#60
    bge da_xmgty @ xe much greater than ye?
    subs r6,#32
    adds r7,#64

    movs r4,r2
    lsls r4,r4,r7 @ these would be shifted off the bottom of the sticky bits
    beq 1f
    movs r4,#1 @ collapse lost bits into a single sticky bit
1:
    lsrs r2,r2,r6
    orrs r4,r2
    movs r2,r3
    lsls r3,r3,r7
    orrs r4,r3
    asrs r3,r2,#31 @ propagate sign bit
    b da_xgty0

da_ygtx:
@ here ye>xe: need to shift x down r7 places
    mov r12,r5 @ save exponent
    cmp r7,#32
    bge da_yrgtx @ ye rather greater than xe?
    adds r6,#32
    movs r4,r0
    lsls r4,r4,r6 @ rounding bit + sticky bits
da_ygtx0:
    movs r5,r1
    lsls r5,r5,r6
    lsrs r0,r7
    asrs r1,r7
    orrs r0,r5
    b da_add

da_yrgtx:
    cmp r7,#60
    bge da_ymgtx @ ye much greater than xe?
    subs r7,#32
    adds r6,#64

    movs r4,r0
    lsls r4,r4,r6 @ these would be shifted off the bottom of the sticky bits
    beq 1f
    movs r4,#1 @ collapse lost bits into a single sticky bit
1:
    lsrs r0,r0,r7
    orrs r4,r0
    movs r0,r1
    lsls r1,r1,r6
    orrs r4,r1
    asrs r1,r0,#31 @ propagate sign bit
    b da_ygtx0

da_ymgtx: @ result is just y
    movs r0,r2
    movs r1,r3
da_xmgty: @ result is just x
    movs r4,#0 @ clear sticky bits
    b da_pack

.ltorg

@ equivalent of UMULL
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
    @ t0 t1 t2 t3 t4
    @ (x) (y)
    uxth \rt0,\rx @ xl
    uxth \rt1,\ry @ yl
    muls \rt0,\rt1 @ xlyl=L
    lsrs \rt2,\rx,#16 @ xh
    muls \rt1,\rt2 @ xhyl=M0
    lsrs \rt4,\ry,#16 @ yh
    muls \rt2,\rt4 @ xhyh=H
    uxth \rt3,\rx @ xl
    muls \rt3,\rt4 @ xlyh=M1
    adds \rt1,\rt3 @ M0+M1=M
    bcc l\@_1 @ addition of the two cross terms can overflow, so add carry into H
    movs \rt3,#1 @ 1
    lsls \rt3,#16 @ 0x10000
    adds \rt2,\rt3 @ H'
l\@_1:
    @ t0 t1 t2 t3 t4
    @ (zl) (zh)
    lsls \rzl,\rt1,#16 @ ML
    lsrs \rzh,\rt1,#16 @ MH
    adds \rzl,\rt0 @ ZL
    adcs \rzh,\rt2 @ ZH
.endm

@ SUMULL: x signed, y unsigned
@ in table below ¯ means signed variable
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
    @ t0 t1 t2 t3 t4
    @ ¯(x) (y)
    uxth \rt0,\rx @ xl
    uxth \rt1,\ry @ yl
    muls \rt0,\rt1 @ xlyl=L
    asrs \rt2,\rx,#16 @ ¯xh
    muls \rt1,\rt2 @ ¯xhyl=M0
    lsrs \rt4,\ry,#16 @ yh
    muls \rt2,\rt4 @ ¯xhyh=H
    uxth \rt3,\rx @ xl
    muls \rt3,\rt4 @ xlyh=M1
    asrs \rt4,\rt1,#31 @ M0sx (M1 sign extension is zero)
    adds \rt1,\rt3 @ M0+M1=M
    movs \rt3,#0 @ 0
    adcs \rt4,\rt3 @ ¯Msx
    lsls \rt4,#16 @ ¯Msx<<16
    adds \rt2,\rt4 @ H'

    @ t0 t1 t2 t3 t4
    @ (zl) (zh)
    lsls \rzl,\rt1,#16 @ M~
    lsrs \rzh,\rt1,#16 @ M~
    adds \rzl,\rt0 @ ZL
    adcs \rzh,\rt2 @ ¯ZH
.endm

@ SSMULL: x signed, y signed
@ in table below ¯ means signed variable
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
    @ t0 t1 t2 t3 t4
    @ ¯(x) (y)
    uxth \rt0,\rx @ xl
    uxth \rt1,\ry @ yl
    muls \rt0,\rt1 @ xlyl=L
    asrs \rt2,\rx,#16 @ ¯xh
    muls \rt1,\rt2 @ ¯xhyl=M0
    asrs \rt4,\ry,#16 @ ¯yh
    muls \rt2,\rt4 @ ¯xhyh=H
    uxth \rt3,\rx @ xl
    muls \rt3,\rt4 @ ¯xlyh=M1
    adds \rt1,\rt3 @ ¯M0+M1=M
    asrs \rt3,\rt1,#31 @ Msx
    bvc l\@_1 @
    mvns \rt3,\rt3 @ ¯Msx flip sign extension bits if overflow
l\@_1:
    lsls \rt3,#16 @ ¯Msx<<16
    adds \rt2,\rt3 @ H'

    @ t0 t1 t2 t3 t4
    @ (zl) (zh)
    lsls \rzl,\rt1,#16 @ M~
    lsrs \rzh,\rt1,#16 @ M~
    adds \rzl,\rt0 @ ZL
    adcs \rzh,\rt2 @ ¯ZH
.endm

@ 32x32 -> 64 bit squaring (cheaper than mul32_32_64 with rx==ry)
@ can have rt2==rx, in which case rx trashed
@ can have rzl==rx
@ can have rzh==rt1
.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2
    @ t0 t1 t2 zl zh
    uxth \rt0,\rx @ xl
    muls \rt0,\rt0 @ xlxl=L
    uxth \rt1,\rx @ xl
    lsrs \rt2,\rx,#16 @ xh
    muls \rt1,\rt2 @ xlxh=M
    muls \rt2,\rt2 @ xhxh=H
    lsls \rzl,\rt1,#17 @ ML
    lsrs \rzh,\rt1,#15 @ MH
    adds \rzl,\rt0 @ ZL
    adcs \rzh,\rt2 @ ZH
.endm

@ double multiplication: (r0:r1) * (r2:r3) -> r0:r1 (packed IEEE doubles)
double_section dmul_shim
regular_func dmul_shim
    push {r4-r7,r14}
    mdunpack r0,r1,r4,r6,r5
    mov r12,r4
    mdunpack r2,r3,r4,r7,r5
    eors r7,r6 @ sign of result
    add r4,r12 @ exponent of result
    push {r0-r2,r4,r7}

@ accumulate full product in r12:r5:r6:r7
    mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5 @ XL*YL
    mov r12,r0 @ save LL bits

    mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7 @ XH*YH

    pop {r0} @ XL
    mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3 @ XL*YH
    adds r5,r0
    adcs r6,r3
    movs r0,#0
    adcs r7,r0

    pop {r1,r2} @ XH,YL
    mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2 @ XH*YL
    adds r5,r1
    adcs r6,r2
    movs r0,#0
    adcs r7,r0

@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12
    pop {r3,r4} @ exponent in r3, sign in r4
    lsls r1,r7,#11
    lsrs r2,r6,#21
    orrs r1,r2
    lsls r0,r6,#11
    lsrs r2,r5,#21
    orrs r0,r2
    lsls r5,#11 @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12
    lsrs r2,r1,#20
    bne 1f @ skip if in range [2..4)
    adds r5,r5 @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)
    adcs r0,r0
    adcs r1,r1
    subs r3,#1 @ correct exponent
1:
    ldr r6,=0x3ff
    subs r3,r6 @ correct for exponent bias
    lsls r6,#1 @ 0x7fe
    cmp r3,r6
    bhs dm_0 @ exponent over- or underflow
    lsls r5,#1 @ rounding bit to carry
    bcc 1f @ result is correctly rounded
    adds r0,#1
    movs r6,#0
    adcs r1,r6 @ round up
    mov r6,r12 @ remaining sticky bits
    orrs r5,r6
    bne 1f @ some sticky bits set?
    lsrs r0,#1
    lsls r0,#1 @ round to even
1:
    lsls r3,#20
    adds r1,r3
dm_2:
    lsls r4,#31
    add r1,r4
    pop {r4-r7,r15}

@ here for exponent over- or underflow
dm_0:
    bge dm_1 @ overflow?
    adds r3,#1 @ would-be zero exponent?
    bne 1f
    adds r0,#1
    bne 1f @ all-ones mantissa?
    adds r1,#1
    lsrs r7,r1,#21
    beq 1f
    lsrs r1,#1 @ rounds up to smallest normal: return it
    b dm_2
1:
    lsls r1,r4,#31 @ underflow: return signed zero
    movs r0,#0
    pop {r4-r7,r15}

@ here for exponent overflow
dm_1:
    adds r6,#1 @ 0x7ff
    lsls r1,r6,#20 @ return signed infinity
    movs r0,#0
    b dm_2

.ltorg

@ Approach to division y/x is as follows.
@
@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top
@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).
@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction
@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact
@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to
@ round up or down.
@
@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively
@ that | u1*x-1 | < 10*2^-32.
@
@ More precisely:
@
@ x0=(q16)x;
@ x1=(q30)x;
@ y0=(q31)y;
@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation
@ v=(q30)(u0*x1-1);
@ u1=(q30)u0-(q30~)(u0*v);
@
@ a0=(q30)(u1*y0);
@ r0=(q82)y-a0*x;
@ r0x=(q57)r0;
@ d0=r0x*u1;
@ a1=d0+a0;
@
@ Error analysis
@
@ Use Greek letters to represent the errors introduced by rounding and truncation.
@
@ r₀ = y - a₀x
@    = y - [ u₁ ( y - α ) - β ] x where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30
@    = y ( 1 - u₁x ) + ( u₁α + β ) x
@
@ Hence
@
@ | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30
@            = 26*2^-32
@
@ r₁ = y - a₁x
@    = y - a₀x - d₀x
@    = r₀ - d₀x
@    = r₀ - u₁ ( r₀ - γ ) x where 0 ≤ γ < 2^-57
@    = r₀ ( 1 - u₁x ) + u₁γx
@
@ Hence
@
@ | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57
@            = (260+128)*2^-64
@            < 2^-55
@
@ Empirically it seems to be nearly twice as good as this.
@
@ To determine correctly whether the exact remainder calculation can be skipped we need a result
@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation
@ and so 1ulp is 2^-53 and so the calculation above suffices.

@ double division: (r0:r1) / (r2:r3) -> r0:r1 (packed IEEE doubles)
double_section ddiv_shim
regular_func ddiv_shim
    push {r4-r7,r14}
ddiv0: @ entry point from dtan
    mdunpack r2,r3,r4,r7,r6 @ unpack divisor

.if use_hw_div

    @ start the SIO hardware divider on 0xffffffff / (q16)x; quotient is read much later
    movs r5,#IOPORT>>24
    lsls r5,#24
    movs r6,#0
    mvns r6,r6
    str r6,[r5,#DIV_UDIVIDEND]
    lsrs r6,r3,#4 @ x0=(q16)x
    str r6,[r5,#DIV_UDIVISOR]
@ if there are not enough cycles from now to the read of the quotient for
@ the divider to do its stuff we need a busy-wait here

.endif

@ unpack dividend by hand to save on register use
    lsrs r6,r1,#31
    adds r6,r7
    mov r12,r6 @ result sign in r12b0; r12b1 trashed
    lsls r1,#1
    lsrs r7,r1,#21 @ exponent
    beq 1f @ zero exponent?
    adds r6,r7,#1
    lsrs r6,#11
    beq 2f @ exponent != 0x7ff? then done
1:
    movs r0,#0
    movs r1,#0
    subs r7,#64 @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct
    lsls r7,#12
2:
    subs r6,r7,r4
    lsls r6,#2
    add r12,r12,r6 @ (signed) exponent in r12[31..8]
    subs r7,#1 @ implied 1
    lsls r7,#21
    subs r1,r7
    lsrs r1,#1

.if use_hw_div

    ldr r6,[r5,#DIV_QUOTIENT]
    adds r6,#1
    lsrs r6,#1 @ u0 ~ 1/x Q15, rounded

.else

@ this is not beautiful; could be replaced by better code that uses knowledge of divisor range
    push {r0-r3}
    movs r0,#0
    mvns r0,r0
    lsrs r1,r3,#4 @ x0=(q16)x
    bl __aeabi_uidiv @ !!! this could (but apparently does not) trash R12
    adds r6,r0,#1
    lsrs r6,#1
    pop {r0-r3}

.endif

@ here
@ r0:r1 y mantissa
@ r2:r3 x mantissa
@ r6 u0, first approximation to 1/x Q15
@ r12: result sign, exponent

    @ one Newton-Raphson step: u1 = u0 - u0*(u0*x1 - 1)
    lsls r4,r3,#10
    lsrs r5,r2,#22
    orrs r5,r4 @ x1=(q30)x
    muls r5,r6 @ u0*x1 Q45
    asrs r5,#15 @ v=u0*x1-1 Q30
    muls r5,r6 @ u0*v Q45
    asrs r5,#14
    adds r5,#1
    asrs r5,#1 @ round u0*v to Q30
    lsls r6,#15
    subs r6,r5 @ u1 Q30

@ here
@ r0:r1 y mantissa
@ r2:r3 x mantissa
@ r6 u1, second approximation to 1/x Q30
@ r12: result sign, exponent

    push {r2,r3}
    lsls r4,r1,#11
    lsrs r5,r0,#21
    orrs r4,r5 @ y0=(q31)y
    mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5 @ y0*u1 Q61
    adds r4,r4
    adcs r5,r5 @ a0=(q30)(y0*u1)

@ here
@ r0:r1 y mantissa
@ r5 a0, first approximation to y/x Q30
@ r6 u1, second approximation to 1/x Q30
@ r12 result sign, exponent

    ldr r2,[r13,#0] @ xL
    mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3 @ xL*a0
    ldr r4,[r13,#4] @ xH
    muls r4,r5 @ xH*a0
    adds r3,r4 @ r2:r3 now x*a0 Q82
    lsrs r2,#25
    lsls r1,r3,#7
    orrs r2,r1 @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89
    lsls r4,r0,#5 @ y Q57
    subs r0,r4,r2 @ r0x=y-x*a0 Q57 (signed)

@ here
@ r0 r0x Q57
@ r5 a0, first approximation to y/x Q30
@ r4 yL Q57
@ r6 u1 Q30
@ r12 result sign, exponent

    muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6 @ r7:r6 r0x*u1 Q87
    asrs r3,r6,#25
    adds r5,r3
    lsls r3,r6,#7 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)
@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7
@ but these bits are thrown away in the rounding and conversion to Q52 below

@ here
@ r3:r5 a1 Q62 candidate quotient [0.5,2) or so
@ r4 yL Q57
@ r12 result sign, exponent

    movs r6,#0
    adds r3,#128 @ for initial rounding to Q53
    adcs r5,r5,r6
    lsrs r1,r5,#30
    bne dd_0
@ here candidate quotient a1 is in range [0.5,1)
@ so 30 significant bits in r5

    lsls r4,#1 @ y now Q58
    lsrs r1,r5,#9 @ to Q52
    lsls r0,r5,#23
    lsrs r3,#9 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
    orrs r0,r3
    bcs dd_1
    b dd_2
dd_0:
@ here candidate quotient a1 is in range [1,2)
@ so 31 significant bits in r5

    movs r2,#4
    add r12,r12,r2 @ fix exponent; r3:r5 now effectively Q61
    adds r3,#128 @ complete rounding to Q53
    adcs r5,r5,r6
    lsrs r1,r5,#10
    lsls r0,r5,#22
    lsrs r3,#10 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
    orrs r0,r3
    bcc dd_2
dd_1:

@ here
@ r0:r1 rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest
@ r4 yL Q58 or Q57
@ r12 result sign, exponent
@ carry set

    adcs r0,r0,r0
    adcs r1,r1,r1 @ z Q53 with 1 in LSB
    lsls r4,#16 @ Q105-32=Q73
    ldr r2,[r13,#0] @ xL Q52
    ldr r3,[r13,#4] @ xH Q20

    @ exact remainder y-xz decides the final rounding direction
    movs r5,r1 @ zH Q21
    muls r5,r2 @ zH*xL Q73
    subs r4,r5
    muls r3,r0 @ zL*xH Q73
    subs r4,r3
    mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3 @ xL*zL
    negs r2,r2 @ borrow from low half?
    sbcs r4,r3 @ y-xz Q73 (remainder bits 52..73)

    cmp r4,#0

    bmi 1f
    movs r2,#0 @ round up
    adds r0,#1
    adcs r1,r2
1:
    lsrs r0,#1 @ shift back down to Q52
    lsls r2,r1,#31
    orrs r0,r2
    lsrs r1,#1
dd_2:
    add r13,#8 @ discard saved divisor mantissa
    mov r2,r12
    lsls r7,r2,#31 @ result sign
    asrs r2,#2 @ result exponent
    ldr r3,=0x3fd
    adds r2,r3
    ldr r3,=0x7fe
    cmp r2,r3
    bhs dd_3 @ over- or underflow?
    lsls r2,#20
    adds r1,r2 @ pack exponent
dd_5:
    adds r1,r7 @ pack sign
    pop {r4-r7,r15}

dd_3:
    movs r0,#0
    cmp r2,#0
    bgt dd_4 @ overflow?
    movs r1,r7 @ underflow: return signed zero
    pop {r4-r7,r15}

dd_4:
    adds r3,#1 @ 0x7ff
    lsls r1,r3,#20 @ return signed infinity
    b dd_5

.section SECTION_NAME(dsqrt_shim)
/*
Approach to square root x=sqrt(y) is as follows.

First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y
to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because
d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is
a better approximation to sqrt(y). If this is near a rounding boundary we
compute an exact remainder y-a5*a5 to decide whether to round up or down.

The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively
that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.

More precisely, with "y" representing y truncated to 30 binary places:

u=(q3)y; // 24-entry table
a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u); // first approximation from table
p0=(q16)(a0*a0) * (q16)y;
r0=(q20)(p0-1);
dy0=(q15)(r0*a0); // Newton-Raphson correction term
a1=(q16)a0-dy0/2; // good to ~9 bits

p1=(q19)(a1*a1)*(q19)y;
r1=(q23)(p1-1);
dy1=(q15~)(r1*a1); // second Newton-Raphson correction
a2x=(q16)a1-dy1/2; // good to ~16 bits
a2=a2x-a2x/1t16; // prevent overflow of a2*a2 in 32 bits

p2=(a2*a2)*(q30)y; // Q62
r2=(q36)(p2-1+1t-31);
dy2=(q30)(r2*a2); // Q52->Q30
a3=(q31)a2-dy2/2; // good to about 30 bits
a4=(q30)(a3*(q30)y+1t-31); // good to about 28 bits

Error analysis

 r₄ = y - a₄²
 d₄ = 1/2 a₃r₄
 a₅ = a₄ + d₄
 r₅ = y - a₅²
    = y - ( a₄ + d₄ )²
    = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄²
    = r₄ - a₃a₄r₄ - 1/4 a₃²r₄²

 | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄²

 a₅ = √y √( 1 - r₅/y )
    = √y ( 1 - 1/2 r₅/y + ...
 )

So to first order (second order being very tiny)

 √y - a₅ = 1/2 r₅/y

and

 | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y )

From dsqrttest.c (conservatively):

 < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )
 = 1/2 ( 200 + 200 ) * 2^-64
 < 2^-56

Empirically we see about 1ulp worst-case error including rounding at Q57.

To determine correctly whether the exact remainder calculation can be skipped we need a result
accurate to < 0.25ulp at Q52, or 2^-54.
*/

@ special-case exits for dsqrt (reached with exponent in r2, packed input in r0:r1)
dq_2:
    bge dq_3 @ +Inf?
    movs r1,#0 @ +0: return +0
    b dq_4

dq_0:
@ here argument is negative (or -0)
    lsrs r1,#31
    lsls r1,#31 @ preserve sign bit
    lsrs r2,#21 @ extract exponent
    beq dq_4 @ -0? return it
    asrs r1,#11 @ make -Inf
    b dq_4

dq_3:
    ldr r1,=0x7ff
    lsls r1,#20 @ return +Inf
dq_4:
    movs r0,#0
dq_1:
    bx r14

.align 2
@ double square root: sqrt(r0:r1) -> r0:r1 (packed IEEE doubles)
regular_func dsqrt_shim
    lsls r2,r1,#1
    bcs dq_0 @ negative?
    lsrs r2,#21 @ extract exponent
    subs r2,#1
    ldr r3,=0x7fe
    cmp r2,r3
    bhs dq_2 @ catches 0 and +Inf
    push {r4-r7,r14}
    lsls r4,r2,#20
    subs r1,r4 @ insert implied 1
    lsrs r2,#1
    bcc 1f @ even exponent? skip
    adds r0,r0,r0 @ odd exponent: shift up mantissa
    adcs r1,r1,r1
1:
    lsrs r3,#2
    adds r2,r3
    lsls r2,#20
    mov r12,r2 @ save result exponent

@ here
@ r0:r1 y mantissa Q52 [1,4)
@ r12 result exponent
.equ drsqrtapp_minus_8, (drsqrtapp-8)
    adr r4,drsqrtapp_minus_8 @ first eight table entries are never accessed because of the mantissa's leading 1
    lsrs r2,r1,#17 @ y Q3
    ldrb r2,[r4,r2] @ initial approximation to reciprocal square root a0 Q8
    lsrs r3,r1,#4 @ first Newton-Raphson iteration
    muls r3,r2
    muls r3,r2 @ i32 p0=a0*a0*(y>>14); // Q32
    asrs r3,r3,#12 @ i32 r0=p0>>12; // Q20
    muls r3,r2
    asrs r3,#13 @ i32 dy0=(r0*a0)>>13; // Q15
    lsls r2,#8
    subs r2,r3 @ i32 a1=(a0<<8)-dy0; // Q16

    @ second Newton-Raphson iteration
    movs r3,r2
    muls r3,r3
    lsrs r3,#13
    lsrs r4,r1,#1
    muls r3,r4 @ i32 p1=((a1*a1)>>11)*(y>>11); // Q19*Q19=Q38
    asrs r3,#15 @ i32 r1=p1>>15; // Q23
    muls r3,r2
    asrs r3,#23
    adds r3,#1
    asrs r3,#1 @ i32 dy1=(r1*a1+(1<<23))>>24; // Q23*Q16=Q39; Q15
    subs r2,r3 @ i32 a2=a1-dy1; // Q16
    lsrs r3,r2,#16
    subs r2,r3 @ if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2

@ here
@ r0:r1 y mantissa
@ r2 a2 ~ 1/sqrt(y) Q16
@ r12 result exponent

    @ third Newton-Raphson iteration, now in 64-bit precision
    movs r3,r2
    muls r3,r3
    lsls r1,#10
    lsrs r4,r0,#22
    orrs r1,r4 @ y Q30
    mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3 @ i64 p2=(ui64)(a2*a2)*(ui64)y; // Q62 r4:r3
    lsls r5,r3,#6
    lsrs r4,#26
    orrs r4,r5
    adds r4,#0x20 @ i32 r2=(p2>>26)+0x20; // Q36 r4
    uxth r5,r4
    muls r5,r2
    asrs r4,#16
    muls r4,r2
    lsrs r5,#16
    adds r4,r5
    asrs r4,#6 @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30
    lsls r2,#15
    subs r2,r4

@ here
@ r0 y low bits
@ r1 y Q30
@ r2 a3 ~ 1/sqrt(y) Q31
@ r12 result exponent

    mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4
    adds r3,r3,r3
    adcs r4,r4,r4
    adds r3,r3,r3
    movs r3,#0
    adcs r3,r4 @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30

@ here
@ r0 y low bits
@ r1 y Q30
@ r2 a3 Q31 ~ 1/sqrt(y)
@ r3 a4 Q30 ~ sqrt(y)
@ r12 result exponent

    square32_64 r3, r4,r5, r6,r5,r7
    lsls r6,r0,#8
    lsrs r7,r1,#2
    subs r6,r4
    sbcs r7,r5 @ r4=(q60)y-a4*a4

@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60

    lsls r5,r7,#29
    lsrs r6,#3
    adcs r6,r5 @ r4 Q57 with rounding
    muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2 @ d4=a3*r4/2 Q89
@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30

    adds r2,#8
    asrs r2,#5 @ d4 Q52, rounded to Q53 with spare bit in carry

@ here
@ r0 y low bits
@ r1 y Q30
@ r2 d4 Q52, rounded to Q53
@ C flag contains d4_b53
@ r3 a4 Q30

    bcs dq_5 @ near a rounding boundary? go and compute exact remainder

    lsrs r5,r3,#10 @ a4 Q52
    lsls r4,r3,#22

    asrs r1,r2,#31
    adds r0,r2,r4
    adcs r1,r5 @ a4+d4

    add r1,r12 @ pack exponent
    pop {r4-r7,r15}

.ltorg


@ round(sqrt(2^22./[68:8:252]))
drsqrtapp:
.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8
.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95
.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81

dq_5:
@ here we are near a rounding boundary, C is set
    adcs r2,r2,r2 @ d4 Q53+1ulp
    lsrs r5,r3,#9
    lsls r4,r3,#23 @ r4:r5 a4 Q53
    asrs r1,r2,#31
    adds r4,r2,r4
    adcs r5,r1 @ r4:r5 a5=a4+d4 Q53+1ulp
    movs r3,r5
    muls r3,r4
    square32_64 r4,r1,r2,r6,r2,r7
    adds r2,r3
    adds r2,r3 @ r1:r2 a5^2 Q106
    lsls r0,#22 @ y Q84

    negs r1,r1
    sbcs r0,r2 @ remainder y-a5^2
    bmi 1f @ y<a5^2: no need to increment a5
    movs r3,#0
    adds r4,#1
    adcs r5,r3 @ bump a5 if over rounding boundary
1:
    lsrs r0,r4,#1 @ a5 back down to Q52
    lsrs r1,r5,#1
    lsls r5,#31
    orrs r0,r5
    add r1,r12 @ pack exponent
    pop {r4-r7,r15}

@ "scientific"
functions start here 1105 1106@ double-length CORDIC rotation step 1107 1108@ r0:r1 ω 1109@ r6 32-i (complementary shift) 1110@ r7 i (shift) 1111@ r8:r9 x 1112@ r10:r11 y 1113@ r12 coefficient pointer 1114 1115@ an option in rotation mode would be to compute the sequence of σ values 1116@ in one pass, rotate the initial vector by the residual ω and then run a 1117@ second pass to compute the final x and y. This would relieve pressure 1118@ on registers and hence possibly be faster. The same trick does not work 1119@ in vectoring mode (but perhaps one could work to single precision in 1120@ a first pass and then double precision in a second pass?). 1121 1122double_section dcordic_vec_step 1123 regular_func dcordic_vec_step 1124 mov r2,r12 1125 ldmia r2!,{r3,r4} 1126 mov r12,r2 1127 mov r2,r11 1128 cmp r2,#0 1129 blt 1f 1130 b 2f 1131 1132double_section dcordic_rot_step 1133 regular_func dcordic_rot_step 1134 mov r2,r12 1135 ldmia r2!,{r3,r4} 1136 mov r12,r2 1137 cmp r1,#0 1138 bge 1f 11392: 1140@ ω<0 / y>=0 1141@ ω+=dω 1142@ x+=y>>i, y-=x>>i 1143 adds r0,r3 1144 adcs r1,r4 1145 1146 mov r3,r11 1147 asrs r3,r7 1148 mov r4,r11 1149 lsls r4,r6 1150 mov r2,r10 1151 lsrs r2,r7 1152 orrs r2,r4 @ r2:r3 y>>i, rounding in carry 1153 mov r4,r8 1154 mov r5,r9 @ r4:r5 x 1155 adcs r2,r4 1156 adcs r3,r5 @ r2:r3 x+(y>>i) 1157 mov r8,r2 1158 mov r9,r3 1159 1160 mov r3,r5 1161 lsls r3,r6 1162 asrs r5,r7 1163 lsrs r4,r7 1164 orrs r4,r3 @ r4:r5 x>>i, rounding in carry 1165 mov r2,r10 1166 mov r3,r11 1167 sbcs r2,r4 1168 sbcs r3,r5 @ r2:r3 y-(x>>i) 1169 mov r10,r2 1170 mov r11,r3 1171 bx r14 1172 1173 1174@ ω>0 / y<0 1175@ ω-=dω 1176@ x-=y>>i, y+=x>>i 11771: 1178 subs r0,r3 1179 sbcs r1,r4 1180 1181 mov r3,r9 1182 asrs r3,r7 1183 mov r4,r9 1184 lsls r4,r6 1185 mov r2,r8 1186 lsrs r2,r7 1187 orrs r2,r4 @ r2:r3 x>>i, rounding in carry 1188 mov r4,r10 1189 mov r5,r11 @ r4:r5 y 1190 adcs r2,r4 1191 adcs r3,r5 @ r2:r3 y+(x>>i) 1192 mov r10,r2 1193 mov r11,r3 1194 1195 mov r3,r5 1196 lsls 
r3,r6 1197 asrs r5,r7 1198 lsrs r4,r7 1199 orrs r4,r3 @ r4:r5 y>>i, rounding in carry 1200 mov r2,r8 1201 mov r3,r9 1202 sbcs r2,r4 1203 sbcs r3,r5 @ r2:r3 x-(y>>i) 1204 mov r8,r2 1205 mov r9,r3 1206 bx r14 1207 1208@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf 1209@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works 1210 1211double_section double2int_shim 1212 regular_func double2int_shim 1213 movs r2,#0 @ and fall through 1214regular_func double2fix_shim 1215 push {r14} 1216 adds r2,#32 1217 bl double2fix64_shim 1218 movs r0,r1 1219 pop {r15} 1220 1221double_section double2uint_shim 1222 regular_func double2uint_shim 1223 movs r2,#0 @ and fall through 1224regular_func double2ufix_shim 1225 push {r14} 1226 adds r2,#32 1227 bl double2ufix64_shim 1228 movs r0,r1 1229 pop {r15} 1230 1231double_section double2int64_shim 1232 regular_func double2int64_shim 1233 movs r2,#0 @ and fall through 1234regular_func double2fix64_shim 1235 push {r14} 1236 bl d2fix 1237 1238 asrs r2,r1,#31 1239 cmp r2,r3 1240 bne 1f @ sign extension bits fail to match sign of result? 1241 pop {r15} 12421: 1243 mvns r0,r3 1244 movs r1,#1 1245 lsls r1,#31 1246 eors r1,r1,r0 @ generate extreme fixed-point values 1247 pop {r15} 1248 1249double_section double2uint64_shim 1250 regular_func double2uint64_shim 1251 movs r2,#0 @ and fall through 1252regular_func double2ufix64_shim 1253 asrs r3,r1,#20 @ negative? 
return 0 1254 bmi ret_dzero 1255@ and fall through 1256 1257@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf 1258@ result clamped so that r3 can only be 0 or -1 1259@ trashes r12 1260.thumb_func 1261d2fix: 1262 push {r4,r14} 1263 mov r12,r2 1264 bl dunpacks 1265 asrs r4,r2,#16 1266 adds r4,#1 1267 bge 1f 1268 movs r1,#0 @ -0 -> +0 12691: 1270 asrs r3,r1,#31 1271 ldr r4, =d2fix_a 1272 bx r4 1273 1274ret_dzero: 1275 movs r0,#0 1276 movs r1,#0 1277 bx r14 1278 1279.weak d2fix_a // weak because it exists in float code too 1280.thumb_func 1281d2fix_a: 1282@ here 1283@ r0:r1 two's complement mantissa 1284@ r2 unbaised exponent 1285@ r3 mantissa sign extension bits 1286 add r2,r12 @ exponent plus offset for required binary point position 1287 subs r2,#52 @ required shift 1288 bmi 1f @ shift down? 1289@ here a shift up by r2 places 1290 cmp r2,#12 @ will clamp? 1291 bge 2f 1292 movs r4,r0 1293 lsls r1,r2 1294 lsls r0,r2 1295 negs r2,r2 1296 adds r2,#32 @ complementary shift 1297 lsrs r4,r2 1298 orrs r1,r4 1299 pop {r4,r15} 13002: 1301 mvns r0,r3 1302 mvns r1,r3 @ overflow: clamp to extreme fixed-point values 1303 pop {r4,r15} 13041: 1305@ here a shift down by -r2 places 1306 adds r2,#32 1307 bmi 1f @ long shift? 1308 mov r4,r1 1309 lsls r4,r2 1310 negs r2,r2 1311 adds r2,#32 @ complementary shift 1312 asrs r1,r2 1313 lsrs r0,r2 1314 orrs r0,r4 1315 pop {r4,r15} 13161: 1317@ here a long shift down 1318 movs r0,r1 1319 asrs r1,#31 @ shift down 32 places 1320 adds r2,#32 1321 bmi 1f @ very long shift? 
@ (continuation of d2fix_a: shift down between 32 and 64 places)
        negs    r2,r2
        adds    r2,#32          @ complementary shift
        asrs    r0,r2
        pop     {r4,r15}
1:
        movs    r0,r3           @ result very near zero: use sign extension bits
        movs    r1,r3
        pop     {r4,r15}

double_section double2float_shim
@ convert packed double in r0:r1 to packed float in r0, round-to-nearest-even
 regular_func double2float_shim
        lsls    r2,r1,#1
        lsrs    r2,#21          @ exponent
        ldr     r3,=0x3ff-0x7f  @ difference of double and float exponent biases
        subs    r2,r3           @ fix exponent bias
        ble     1f              @ underflow or zero
        cmp     r2,#0xff
        bge     2f              @ overflow or infinity
        lsls    r2,#23          @ position exponent of result
        lsrs    r3,r1,#31
        lsls    r3,#31
        orrs    r2,r3           @ insert sign
        lsls    r3,r0,#3        @ rounding bits
        lsrs    r0,#29
        lsls    r1,#12
        lsrs    r1,#9
        orrs    r0,r1           @ assemble mantissa
        orrs    r0,r2           @ insert exponent and sign
        lsls    r3,#1           @ round bit -> carry, sticky bits -> Z
        bcc     3f              @ no rounding
        beq     4f              @ all sticky bits 0?
5:
        adds    r0,#1           @ round up
3:
        bx      r14
4:
        lsrs    r3,r0,#1        @ odd? then round up (ties to even)
        bcs     5b
        bx      r14
1:
        beq     6f              @ check case where value is just less than smallest normal
7:
        lsrs    r0,r1,#31       @ underflow: return signed zero
        lsls    r0,#31
        bx      r14
6:
        lsls    r2,r1,#12       @ 20 1:s at top of mantissa?
        asrs    r2,#12
        adds    r2,#1
        bne     7b
        lsrs    r2,r0,#29       @ and 3 more 1:s?
@ (continuation of double2float_shim: borderline-subnormal and overflow exits)
        cmp     r2,#7
        bne     7b              @ not all ones: plain underflow to signed zero
        movs    r2,#1           @ return smallest normal with correct sign
        b       8f
2:
        movs    r2,#0xff        @ overflow: exponent field all ones
8:
        lsrs    r0,r1,#31       @ return signed infinity (or smallest normal)
        lsls    r0,#8
        adds    r0,r2
        lsls    r0,#23
        bx      r14

double_section x2double_shims
@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2
@ places after point] to packed double in r0:r1, with rounding

.align 2
@ u32 -> double
regular_func uint2double_shim
        movs    r1,#0           @ and fall through
@ u32 fixed point (r1 fraction bits) -> double
regular_func ufix2double_shim
        movs    r2,r1
        movs    r1,#0
        b       ufix642double_shim

.align 2
@ i32 -> double
regular_func int2double_shim
        movs    r1,#0           @ and fall through
@ i32 fixed point (r1 fraction bits) -> double
regular_func fix2double_shim
        movs    r2,r1
        asrs    r1,r0,#31       @ sign extend
        b       fix642double_shim

.align 2
@ u64 -> double
regular_func uint642double_shim
        movs    r2,#0           @ and fall through
@ u64 fixed point (r2 fraction bits) -> double
regular_func ufix642double_shim
        movs    r3,#0           @ sign = +
        b       uf2d

.align 2
@ i64 -> double
regular_func int642double_shim
        movs    r2,#0           @ and fall through
@ i64 fixed point (r2 fraction bits) -> double
regular_func fix642double_shim
        asrs    r3,r1,#31       @ sign bit across all bits
        eors    r0,r3           @ |value| via conditional two's complement
        eors    r1,r3
        subs    r0,r3
        sbcs    r1,r3
@ common normalise/pack path for all integer/fixed -> double conversions
uf2d:
        push    {r4,r5,r14}
        ldr     r4,=0x432
        subs    r2,r4,r2        @ form biased exponent
@ here
@ r0:r1 unnormalised mantissa
@ r2    -Q (will become exponent)
@ r3    sign across all bits
        cmp     r1,#0
        bne     1f              @ short normalising shift?
        movs    r1,r0
        beq     2f              @ zero? return it
        movs    r0,#0
        subs    r2,#32          @ fix exponent
1:
        asrs    r4,r1,#21
        bne     3f              @ will need shift down (and rounding?)
        bcs     4f              @ normalised already?
5:
        subs    r2,#1
        adds    r0,r0           @ shift up
        adcs    r1,r1
        lsrs    r4,r1,#21
        bcc     5b
4:
        ldr     r4,=0x7fe
        cmp     r2,r4
        bhs     6f              @ over/underflow? return signed zero/infinity
@ (continuation of uf2d: pack result / clamp / normalise-down-with-rounding)
7:
        lsls    r2,#20          @ pack and return
        adds    r1,r2
        lsls    r3,#31
        adds    r1,r3           @ insert sign
2:
        pop     {r4,r5,r15}
6:      @ return signed zero/infinity according to unclamped exponent in r2
        mvns    r2,r2
        lsrs    r2,#21
        movs    r0,#0
        movs    r1,#0
        b       7b

3:
@ here we need to shift down to normalise and possibly round
        bmi     1f              @ already normalised to Q63?
2:
        subs    r2,#1
        adds    r0,r0           @ shift up
        adcs    r1,r1
        bpl    2b
1:
@ here we have a 1 in b63 of r0:r1
        adds    r2,#11          @ correct exponent for subsequent shift down
        lsls    r4,r0,#21       @ save bits for rounding
        lsrs    r0,#11
        lsls    r5,r1,#21
        orrs    r0,r5
        lsrs    r1,#11
        lsls    r4,#1           @ round bit -> carry, sticky bits -> Z
        beq     1f              @ sticky bits are zero?
8:
        movs    r4,#0
        adcs    r0,r4           @ add the rounding bit in
        adcs    r1,r4
        b       4b
1:
        bcc     4b              @ sticky bits are zero but not on rounding boundary
        lsrs    r4,r0,#1        @ increment if odd (force round to even)
        b       8b


.ltorg

double_section dunpacks
@ unpack double: signed mantissa in r0:r1 (Q52), unbiased exponent in r2
 regular_func dunpacks
        mdunpacks r0,r1,r2,r3,r4
        ldr     r3,=0x3ff
        subs    r2,r3           @ exponent without offset
        bx      r14

@ r0:r1 signed mantissa Q52
@ r2    unbiased exponent < 10 (i.e., |x|<2^10)
@ r4    pointer to:
@       - divisor reciprocal approximation r=1/d Q15
@       - divisor d Q62 0..20
@       - divisor d Q62 21..41
@       - divisor d Q62 42..62
@ returns:
@ r0:r1 reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
@ r2    quotient q (number of reductions)
@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign
@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
double_section dreduce
 regular_func dreduce
        adds    r2,#2           @ e+2
        bmi     1f              @ |x|<0.25, too small to need adjustment
        cmp     r2,#12
        bge     4f              @ |x|>=1024: clamp
2:
        movs    r5,#17
        subs    r5,r2           @ 15-e
        movs    r3,r1           @ Q20
        asrs    r3,r5           @ x Q5
        adds    r2,#8           @ e+10
        adds    r5,#7           @ 22-e = 32-(e+10)
        movs    r6,r0
@ (continuation of dreduce: estimate q = round(r·x), then subtract q·d built
@ from three 21-bit pieces so the Q62 product is exact)
        lsrs    r6,r5
        lsls    r0,r2
        lsls    r1,r2
        orrs    r1,r6           @ r0:r1 x Q62
        ldmia   r4,{r4-r7}      @ r4=1/d Q15; r5:r6:r7 = d Q62 in 21-bit pieces
        muls    r3,r4           @ rx Q20
        asrs    r2,r3,#20
        movs    r3,#0
        adcs    r2,r3           @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10
        muls    r5,r2           @ qd in pieces: L Q62
        muls    r6,r2           @ M Q41
        muls    r7,r2           @ H Q20
        lsls    r7,#10
        asrs    r4,r6,#11
        lsls    r6,#21
        adds    r6,r5
        adcs    r7,r4
        asrs    r5,#31
        adds    r7,r5           @ r6:r7 qd Q62
        subs    r0,r6
        sbcs    r1,r7           @ remainder Q62
        bx      r14
4:
        movs    r2,#12          @ overflow: clamp to +/-1024
        movs    r0,#0
        asrs    r1,#31
        lsls    r1,#1
        adds    r1,#1
        lsls    r1,#20
        b       2b

1:
@ |x|<0.25: no reduction needed, just rescale the mantissa to Q62
        lsls    r1,#8
        lsrs    r3,r0,#24
        orrs    r1,r3
        lsls    r0,#8           @ r0:r1 Q60, to be shifted down -r2 places
        negs    r3,r2
        adds    r2,#32          @ shift down in r3, complementary shift in r2
        bmi     1f              @ long shift?
2:
        movs    r4,r1
        asrs    r1,r3
        lsls    r4,r2
        lsrs    r0,r3
        orrs    r0,r4
        movs    r2,#0           @ rounding
        adcs    r0,r2
        adcs    r1,r2
        bx      r14

1:
        movs    r0,r1           @ down 32 places
        asrs    r1,#31
        subs    r3,#32
        adds    r2,#32
        bpl     2b
        movs    r0,#0           @ very long shift? return 0
@ (continuation of dreduce: result underflows to zero)
        movs    r1,#0
        movs    r2,#0
        bx      r14

@ NOTE(review): push_r8_r11/pop_r8_r11, dcordic_rot_step, dcordic_vec_step and
@ ddiv0/ddiv_shim are defined outside this excerpt.

double_section dtan_shim
@ tan θ = sin θ / cos θ, sharing one range reduction + CORDIC pass
 regular_func dtan_shim
        push    {r4-r7,r14}
        bl      push_r8_r11
        bl      dsincos_internal
        mov     r12,r0          @ save ε
        bl      dcos_finish
        push    {r0,r1}
        mov     r0,r12
        bl      dsin_finish
        pop     {r2,r3}
        bl      pop_r8_r11
        b       ddiv0           @ compute sin θ/cos θ

double_section dcos_shim
@ cos θ
 regular_func dcos_shim
        push    {r4-r7,r14}
        bl      push_r8_r11
        bl      dsincos_internal
        bl      dcos_finish
        b       1f

double_section dsin_shim
@ sin θ
 regular_func dsin_shim
        push    {r4-r7,r14}
        bl      push_r8_r11
        bl      dsincos_internal
        bl      dsin_finish
1:
        bl      pop_r8_r11
        pop     {r4-r7,r15}

double_section dsincos_shim

 @ Note that this function returns in r0-r3
 regular_func dsincos_shim

        push    {r4-r7,r14}
        bl      push_r8_r11
        bl      dsincos_internal
        mov     r12,r0          @ save ε
        bl      dcos_finish
        push    {r0,r1}
        mov     r0,r12
        bl      dsin_finish
        pop     {r2,r3}
        bl      pop_r8_r11
        pop     {r4-r7,r15}

double_section dtrig_guts

@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
@ θ=α+ε and |ε|≤2^-32
@ on return:
@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9   cos α Q62
@ r10:r11 sin α Q62
.align 2
.thumb_func
dsincos_internal:
        push    {r14}
        bl      dunpacks
        adr     r4,dreddata0
        bl      dreduce         @ reduce modulo π/2; quadrant count in r2

        movs    r4,#0
        ldr     r5,=0x9df04dbb  @ this value compensates for the non-unity scaling of the CORDIC rotations
        ldr     r6,=0x36f656c5
        lsls    r2,#31
        bcc     1f
@ quadrant 2 or 3
        mvns    r6,r6           @ negate the starting vector
        negs    r5,r5
        adcs    r6,r4
1:
        lsls    r2,#1
        bcs     1f
@ even quadrant
        mov     r10,r4          @ sin α = 0
        mov     r11,r4
        mov     r8,r5           @ cos α = ±scale constant
        mov     r9,r6
        b       2f
1:
@ odd quadrant
        mov     r8,r4           @ cos α = 0
        mov     r9,r4
        mov     r10,r5          @ sin α = ±scale constant
        mov     r11,r6
2:
        adr     r4,dtab_cc
        mov     r12,r4
        movs    r7,#1
        movs    r6,#31
@ (continuation of dsincos_internal: 32 CORDIC rotation steps, i = 1..32)
1:
        bl      dcordic_rot_step
        adds    r7,#1
        subs    r6,#1
        cmp     r7,#33
        bne     1b
        pop     {r15}

.thumb_func
dcos_finish:
@ here
@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9   cos α Q62
@ r10:r11 sin α Q62
@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
        mov     r1,r11
@       mov     r2,r10
@       lsrs    r2,#31
@       adds    r1,r2           @ rounding improves accuracy very slightly
        muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
@ r2:r3 ε sin α Q(62+62-32)=Q92
        mov     r0,r8
        mov     r1,r9
        lsls    r5,r3,#2        @ realign the Q92 product to Q62
        asrs    r3,r3,#30
        lsrs    r2,r2,#30
        orrs    r2,r5
        sbcs    r0,r2           @ include rounding
        sbcs    r1,r3
        movs    r2,#62
        b       fix642double_shim

.thumb_func
dsin_finish:
@ here
@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9   cos α Q62
@ r10:r11 sin α Q62
@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
        mov     r1,r9
        muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
@ r2:r3 ε cos α Q(62+62-32)=Q92
        mov     r0,r10
        mov     r1,r11
        lsls    r5,r3,#2
        asrs    r3,r3,#30
        lsrs    r2,r2,#30
        orrs    r2,r5
        adcs    r0,r2           @ include rounding
        adcs    r1,r3
        movs    r2,#62
        b       fix642double_shim

.ltorg
.align 2
@ dreduce constants for the trig functions: reciprocal, then divisor pieces
dreddata0:
.word 0x0000517d                @ 2/π Q15
.word 0x0014611A                @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
.word 0x000A8885
.word 0x001921FB


.align 2
@ atan2(y,x) via CORDIC vectoring after folding into octant -π/4..π/4
regular_func datan2_shim
@ r0:r1 y
@ r2:r3 x
        push    {r4-r7,r14}
        bl      push_r8_r11
        ldr     r5,=0x7ff00000
        movs    r4,r1
        ands    r4,r5           @ y==0?
        beq     1f
        cmp     r4,r5           @ or Inf/NaN?
        bne     2f
1:
        lsrs    r1,#20          @ flush mantissa: exact ±0 or ±Inf
        lsls    r1,#20
        movs    r0,#0
2:
        movs    r4,r3
        ands    r4,r5           @ x==0?
        beq     1f
        cmp     r4,r5           @ or Inf/NaN?
@ (continuation of datan2_shim: flush x, then fold the argument into the
@ octant -π/4..π/4, tracking the rotation in a quadrant offset r6)
        bne     2f
1:
        lsrs    r3,#20          @ flush
        lsls    r3,#20
        movs    r2,#0
2:
        movs    r6,#0           @ quadrant offset
        lsls    r5,#11          @ constant 0x80000000
        cmp     r3,#0
        bpl     1f              @ skip if x positive
        movs    r6,#2
        eors    r3,r5           @ negate x
        eors    r1,r5           @ negate y
        bmi     1f              @ quadrant offset=+2 if y was positive
        negs    r6,r6           @ quadrant offset=-2 if y was negative
1:
@ now in quadrant 0 or 3
        adds    r7,r1,r5        @ r7=-r1
        bpl     1f
@ y>=0: in quadrant 0
        cmp     r1,r3
        ble     2f              @ y<~x so 0≤θ<~π/4: skip
        adds    r6,#1
        eors    r1,r5           @ negate x
        b       3f              @ and exchange x and y = rotate by -π/2
1:
        cmp     r3,r7
        bge     2f              @ -y<~x so -π/4<~θ≤0: skip
        subs    r6,#1
        eors    r3,r5           @ negate y and ...
3:
        movs    r7,r0           @ exchange x and y
        movs    r0,r2
        movs    r2,r7
        movs    r7,r1
        movs    r1,r3
        movs    r3,r7
2:
@ here -π/4<~θ<~π/4
@ r6 has quadrant offset
        push    {r6}
        cmp     r2,#0
        bne     1f
        cmp     r3,#0
        beq     10f             @ x==0 going into division?
        lsls    r4,r3,#1
        asrs    r4,#21
        adds    r4,#1
        bne     1f              @ x==Inf going into division?
        lsls    r4,r1,#1
        asrs    r4,#21
        adds    r4,#1           @ y also ±Inf?
@ (continuation of datan2_shim: CORDIC vectoring on y/x, residual correction,
@ then reassemble the angle from the saved quadrant offset)
        bne     10f
        subs    r1,#1           @ make them both just finite
        subs    r3,#1
        b       1f

10:
        movs    r0,#0           @ in-octant angle = 0 (quadrant offset gives the rest)
        movs    r1,#0
        b       12f

1:
        bl      ddiv_shim
        movs    r2,#62
        bl      double2fix64_shim
@ r0:r1 y/x
        mov     r10,r0
        mov     r11,r1
        movs    r0,#0           @ ω=0
        movs    r1,#0
        mov     r8,r0
        movs    r2,#1
        lsls    r2,#30
        mov     r9,r2           @ x=1

        adr     r4,dtab_cc
        mov     r12,r4
        movs    r7,#1
        movs    r6,#31
1:
        bl      dcordic_vec_step
        adds    r7,#1
        subs    r6,#1
        cmp     r7,#33
        bne     1b
@ r0:r1   atan(y/x) Q62
@ r8:r9   x residual Q62
@ r10:r11 y residual Q62
        mov     r2,r9
        mov     r3,r10
        subs    r2,#12          @ this makes atan(0)==0
@ the following is basically a division residual y/x ~ atan(residual y/x)
        movs    r4,#1
        lsls    r4,#29
        movs    r7,#0
2:
        lsrs    r2,#1
        movs    r3,r3           @ preserve carry
        bmi     1f
        sbcs    r3,r2
        adds    r0,r4
        adcs    r1,r7
        lsrs    r4,#1
        bne     2b
        b       3f
1:
        adcs    r3,r2
        subs    r0,r4
        sbcs    r1,r7
        lsrs    r4,#1
        bne     2b
3:
        lsls    r6,r1,#31
        asrs    r1,#1
        lsrs    r0,#1
        orrs    r0,r6           @ Q61

12:
        pop     {r6}            @ recover quadrant offset

        cmp     r6,#0
        beq     1f
        ldr     r4,=0x885A308D  @ π/2 Q61
        ldr     r5,=0x3243F6A8
        bpl     2f
        mvns    r4,r4           @ negative quadrant offset
        mvns    r5,r5
2:
        lsls    r6,#31
        bne     2f              @ skip if quadrant offset is ±1
        adds    r0,r4           @ offset ±2: add π/2 a second time
        adcs    r1,r5
2:
        adds    r0,r4
        adcs    r1,r5
1:
        movs    r2,#61
        bl      fix642double_shim

        bl      pop_r8_r11
        pop     {r4-r7,r15}

.ltorg

@ CORDIC angle table atan 2^-i Q62, shared by the rotation (sin/cos) and
@ vectoring (atan2) loops; continues in the next run of .word lines
dtab_cc:
.word 0x61bb4f69, 0x1dac6705    @ atan 2^-1 Q62
.word 0x96406eb1, 0x0fadbafc    @ atan 2^-2 Q62
.word 0xab0bdb72, 0x07f56ea6    @ atan 2^-3 Q62
.word 0xe59fbd39, 0x03feab76    @ atan 2^-4 Q62
.word 0xba97624b, 0x01ffd55b    @ atan 2^-5 Q62
.word 0xdddb94d6, 0x00fffaaa    @ atan 2^-6 Q62
.word 0x56eeea5d, 0x007fff55    @ atan 2^-7 Q62
.word 0xaab7776e, 0x003fffea    @ atan 2^-8 Q62
@ (continuation of dtab_cc: smaller angles, where atan 2^-i -> 2^-i)
.word 0x5555bbbc, 0x001ffffd    @ atan 2^-9 Q62
.word 0xaaaaadde, 0x000fffff    @ atan 2^-10 Q62
.word 0xf555556f, 0x0007ffff    @ atan 2^-11 Q62
.word 0xfeaaaaab, 0x0003ffff    @ atan 2^-12 Q62
.word 0xffd55555, 0x0001ffff    @ atan 2^-13 Q62
.word 0xfffaaaab, 0x0000ffff    @ atan 2^-14 Q62
.word 0xffff5555, 0x00007fff    @ atan 2^-15 Q62
.word 0xffffeaab, 0x00003fff    @ atan 2^-16 Q62
.word 0xfffffd55, 0x00001fff    @ atan 2^-17 Q62
.word 0xffffffab, 0x00000fff    @ atan 2^-18 Q62
.word 0xfffffff5, 0x000007ff    @ atan 2^-19 Q62
.word 0xffffffff, 0x000003ff    @ atan 2^-20 Q62
.word 0x00000000, 0x00000200    @ atan 2^-21 Q62 @ consider optimising these
.word 0x00000000, 0x00000100    @ atan 2^-22 Q62
.word 0x00000000, 0x00000080    @ atan 2^-23 Q62
.word 0x00000000, 0x00000040    @ atan 2^-24 Q62
.word 0x00000000, 0x00000020    @ atan 2^-25 Q62
.word 0x00000000, 0x00000010    @ atan 2^-26 Q62
.word 0x00000000, 0x00000008    @ atan 2^-27 Q62
.word 0x00000000, 0x00000004    @ atan 2^-28 Q62
.word 0x00000000, 0x00000002    @ atan 2^-29 Q62
.word 0x00000000, 0x00000001    @ atan 2^-30 Q62
.word 0x80000000, 0x00000000    @ atan 2^-31 Q62
.word 0x40000000, 0x00000000    @ atan 2^-32 Q62

double_section dexp_guts
@ exp x: reduce x = q·ln2 + a + ε, build exp a by multiplicative normalisation
@ against dtab_exp, then apply the first-order correction for ε
regular_func dexp_shim
        push    {r4-r7,r14}
        bl      dunpacks
        adr     r4,dreddata1
        bl      dreduce         @ r0:r1 residual, r2 = q
        cmp     r1,#0
        bge     1f              @ if residual negative, borrow one ln2 ...
        ldr     r4,=0xF473DE6B
        ldr     r5,=0x2C5C85FD  @ ln2 Q62
        adds    r0,r4
        adcs    r1,r5
        subs    r2,#1           @ ... so the residual is non-negative
1:
        push    {r2}            @ save q for final scaling
        movs    r7,#1           @ shift
        adr     r6,dtab_exp
        movs    r2,#0
        movs    r3,#1
        lsls    r3,#30          @ x=1 Q62

3:
        ldmia   r6!,{r4,r5}
        mov     r12,r6
        subs    r0,r4           @ trial-subtract log(1+2^-i)
        sbcs    r1,r5
        bmi     1f              @ went negative: this factor not taken

        negs    r6,r7
        adds    r6,#32          @ complementary shift
        movs    r5,r3
        asrs    r5,r7
        movs    r4,r3
        lsls    r4,r6
        movs    r6,r2
        lsrs    r6,r7           @ rounding bit in carry
        orrs    r4,r6
        adcs    r2,r4
        adcs    r3,r5           @ x+=x>>i
        b       2f

1:
        adds    r0,r4           @ restore argument
@ (continuation of dexp_shim: finish the loop, apply the ε correction, pack)
        adcs    r1,r5
2:
        mov     r6,r12
        adds    r7,#1
        cmp     r7,#33
        bne     3b

@ here
@ r0:r1 ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r2:r3 exp a Q62
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
        muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
@ r4:r1 ε exp a Q(62+62-32)=Q92
        lsrs    r4,#30
        lsls    r0,r1,#2
        orrs    r0,r4
        asrs    r1,#30
        adds    r0,r2
        adcs    r1,r3

        pop     {r2}            @ recover q: result is Q62 scaled by 2^q
        negs    r2,r2
        adds    r2,#62
        bl      fix642double_shim @ in principle we can pack faster than this because we know the exponent
        pop     {r4-r7,r15}

.ltorg

.align 2
@ ln x: additive-normalisation against dtab_exp on the mantissa, then add
@ exponent·ln2; returns ±Inf for non-positive / Inf / NaN input
regular_func dln_shim
        push    {r4-r7,r14}
        lsls    r7,r1,#1
        bcs     5f              @ <0 ...
        asrs    r7,#21
        beq     5f              @ ... or =0? return -Inf
        adds    r7,#1
        beq     6f              @ Inf/NaN? return +Inf
        bl      dunpacks
        push    {r2}            @ save unbiased exponent
        lsls    r1,#9
        lsrs    r2,r0,#23
        orrs    r1,r2
        lsls    r0,#9
@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1

        movs    r7,#1           @ shift
        adr     r6,dtab_exp
        mov     r12,r6
        movs    r2,#0
        movs    r3,#0           @ y=0 Q62

3:
        negs    r6,r7
        adds    r6,#32          @ complementary shift
        movs    r5,r1
        asrs    r5,r7
        movs    r4,r1
        lsls    r4,r6
        movs    r6,r0
        lsrs    r6,r7
        orrs    r4,r6           @ x>>i, rounding bit in carry
        adcs    r4,r0
        adcs    r5,r1           @ x+(x>>i)

        lsrs    r6,r5,#30
        bne     1f              @ x+(x>>i)>1?
@ (continuation of dln_shim: accept the step while x stays below 1, then
@ assemble exponent·ln2 + ln(m/2) + (x-1) as a three-word Q62 value)
        movs    r0,r4
        movs    r1,r5           @ x+=x>>i
        mov     r6,r12
        ldmia   r6!,{r4,r5}
        subs    r2,r4           @ y -= log(1+2^-i)
        sbcs    r3,r5

1:
        movs    r4,#8
        add     r12,r4          @ advance table pointer (one 8-byte entry)
        adds    r7,#1
        cmp     r7,#33
        bne     3b
@ here:
@ r0:r1 residual x, nearly 1 Q62
@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
@ result is y + ln2 + ln x ~ y + ln2 + (x-1)
        lsls    r1,#2
        asrs    r1,#2           @ x-1
        adds    r2,r0
        adcs    r3,r1

        pop     {r7}
@ here:
@ r2:r3 ln m/2 = ln m - ln2 Q62
@ r7    unbiased exponent
.equ dreddata1_plus_4, (dreddata1+4)
        adr     r4,dreddata1_plus_4
        ldmia   r4,{r0,r1,r4}   @ ln2 Q62 in 21-bit pieces
        adds    r7,#1           @ +1 supplies the ln2 missing from y (see above)
        muls    r0,r7           @ Q62
        muls    r1,r7           @ Q41
        muls    r4,r7           @ Q20
        lsls    r7,r1,#21
        asrs    r1,#11
        asrs    r5,r1,#31
        adds    r0,r7
        adcs    r1,r5
        lsls    r7,r4,#10
        asrs    r4,#22
        asrs    r5,r1,#31
        adds    r1,r7
        adcs    r4,r5
@ r0:r1:r4 exponent*ln2 Q62
        asrs    r5,r3,#31
        adds    r0,r2
        adcs    r1,r3
        adcs    r4,r5
@ r0:r1:r4 result Q62
        movs    r2,#62
1:
        asrs    r5,r1,#31
        cmp     r4,r5
        beq     2f              @ r4 a sign extension of r1?
@ (continuation of dln_shim: squeeze the 96-bit value into 64 bits, adjusting
@ the binary point in r2, then pack; plus the ±Inf error exits)
        lsrs    r0,#4           @ no: shift down 4 places and try again
        lsls    r6,r1,#28
        orrs    r0,r6
        lsrs    r1,#4
        lsls    r6,r4,#28
        orrs    r1,r6
        asrs    r4,#4
        subs    r2,#4           @ track the binary point
        b       1b
2:
        bl      fix642double_shim
        pop     {r4-r7,r15}

5:
        ldr     r1,=0xfff00000  @ -Inf: log of zero or negative input
        movs    r0,#0
        pop     {r4-r7,r15}

6:
        ldr     r1,=0x7ff00000  @ +Inf: log of Inf/NaN
        movs    r0,#0
        pop     {r4-r7,r15}

.ltorg

.align 2
@ dreduce constants for exp/ln: reciprocal, then divisor pieces
dreddata1:
.word 0x0000B8AA                @ 1/ln2 Q15
.word 0x0013DE6B                @ ln2 Q62 Q62=2C5C85FDF473DE6B split into 21-bit pieces
.word 0x000FEFA3
.word 0x000B1721

@ log(1+2^-i) Q62 table, shared by dexp_shim and dln_shim; continues in the
@ next run of .word lines
dtab_exp:
.word 0xbf984bf3, 0x19f323ec    @ log 1+2^-1 Q62
.word 0xcd4d10d6, 0x0e47fbe3    @ log 1+2^-2 Q62
.word 0x8abcb97a, 0x0789c1db    @ log 1+2^-3 Q62
.word 0x022c54cc, 0x03e14618    @ log 1+2^-4 Q62
.word 0xe7833005, 0x01f829b0    @ log 1+2^-5 Q62
.word 0x87e01f1e, 0x00fe0545    @ log 1+2^-6 Q62
.word 0xac419e24, 0x007f80a9    @ log 1+2^-7 Q62
.word 0x45621781, 0x003fe015    @ log 1+2^-8 Q62
.word 0xa9ab10e6, 0x001ff802    @ log 1+2^-9 Q62
.word 0x55455888, 0x000ffe00    @ log 1+2^-10 Q62
.word 0x0aa9aac4, 0x0007ff80    @ log 1+2^-11 Q62
.word 0x01554556, 0x0003ffe0    @ log 1+2^-12 Q62
.word 0x002aa9ab, 0x0001fff8    @ log 1+2^-13 Q62
.word 0x00055545, 0x0000fffe    @ log 1+2^-14 Q62
.word 0x8000aaaa, 0x00007fff    @ log 1+2^-15 Q62
.word 0xe0001555, 0x00003fff    @ log 1+2^-16 Q62
.word 0xf80002ab, 0x00001fff    @ log 1+2^-17 Q62
.word 0xfe000055, 0x00000fff    @ log 1+2^-18 Q62
.word 0xff80000b, 0x000007ff    @ log 1+2^-19 Q62
.word 0xffe00001, 0x000003ff    @ log 1+2^-20 Q62
.word 0xfff80000, 0x000001ff    @ log 1+2^-21 Q62
.word 0xfffe0000, 0x000000ff    @ log 1+2^-22 Q62
.word 0xffff8000, 0x0000007f    @ log 1+2^-23 Q62
.word 0xffffe000, 0x0000003f    @ log 1+2^-24 Q62
.word 0xfffff800, 0x0000001f    @ log 1+2^-25 Q62
.word 0xfffffe00, 0x0000000f    @ log 1+2^-26 Q62
.word 0xffffff80, 0x00000007    @ log 1+2^-27 Q62
@ (continuation of dtab_exp)
.word 0xffffffe0, 0x00000003    @ log 1+2^-28 Q62
.word 0xfffffff8, 0x00000001    @ log 1+2^-29 Q62
.word 0xfffffffe, 0x00000000    @ log 1+2^-30 Q62
.word 0x80000000, 0x00000000    @ log 1+2^-31 Q62
.word 0x40000000, 0x00000000    @ log 1+2^-32 Q62


#endif // PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED