/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"
#include "hardware/regs/addressmap.h"
#include "hardware/divider_helper.S"

#if !HAS_SIO_DIVIDER
#warning "Building divider_hardware.S on a platform with no SIO divider hardware"
#endif

// PICO_CONFIG: PICO_DIVIDER_DISABLE_INTERRUPTS, Disable interrupts around division such that divider state need not be saved/restored in exception handlers, default=0, group=pico_divider

// PICO_CONFIG: PICO_DIVIDER_CALL_IDIV0, Whether 32 bit division by zero should call __aeabi_idiv0, default=1, group=pico_divider
#ifndef PICO_DIVIDER_CALL_IDIV0
#define PICO_DIVIDER_CALL_IDIV0 1
#endif

// PICO_CONFIG: PICO_DIVIDER_CALL_LDIV0, Whether 64 bit division by zero should call __aeabi_ldiv0, default=1, group=pico_divider
#ifndef PICO_DIVIDER_CALL_LDIV0
#define PICO_DIVIDER_CALL_LDIV0 1
#endif

pico_default_asm_setup

// PICO_CONFIG: PICO_DIVIDER_IN_RAM, Whether divider functions should be placed in RAM, default=0, group=pico_divider
@ Open the section for a divider function, in RAM or flash per PICO_DIVIDER_IN_RAM.
.macro div_section name
#if PICO_DIVIDER_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

@ wait 8-n cycles for the hardware divider
@ The SIO divider always completes in 8 cycles; \n is the number of cycles the
@ code between the operand writes and this macro already takes. Each
@ branch-to-next-instruction burns 2 cycles, plus one nop if (8-n) is odd.
.macro wait_div n
.rept (8-\n) / 2
    b 9f
9:
.endr
.if (8-\n) % 2
    nop
.endif
.endm

#if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
#error register layout has changed - we rely on this order to make sure we save/restore in the right order
#endif

#if !PICO_DIVIDER_DISABLE_INTERRUPTS
@ Capture the current divider state into r4 (dividend), r5 (divisor),
@ r7 (remainder), r6 (quotient), saving the previous r4-r7 and lr on the stack
@ for restore_div_state_and_return_64.
.macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
    // No need to wait before reading result as long as preceding code takes more than 8 cycles
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm

@ Write back the divider state captured by save_div_state_and_lr_64, then pop
@ r4-r7 and return to the saved lr.
.macro restore_div_state_and_return_64
    // writing sdividend (r4), sdivisor (r5), quotient (r6), remainder (r7) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    // ... an interruptor using the divider will complete based on incorrect inputs, but the dividend at
    //     least will be saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    // ... an interruptor using the divider will complete based on possibly wrongly signed inputs, but the
    //     dividend and divisor at least will be saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5, r6: we are DIRTY and READY
    // ... an interruptor using the divider will save/restore the dividend, divisor and quotient registers
    //     as-is (what we just restored ourselves), and we'll restore the remainder after the fact

    mov ip, r2
    ldr r2, =SIO_BASE
    // note we do not use STM - not because it can be restarted by an interrupt (which would be harmless),
    // but because this is 1-cycle IO space and so 4 separate writes are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    mov r2, ip
    pop {r4, r5, r6, r7, pc}
.endm

#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

// since idiv and idivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_idiv)
.align 2
@ int32_t __aeabi_idiv(int32_t y, int32_t x)   -> quotient in r0
@ {quot, rem} __aeabi_idivmod(int32_t y, int32_t x) -> quotient r0, remainder r1
@ Signed 32/32 divide using the SIO hardware divider.
wrapper_func __aeabi_idiv
wrapper_func __aeabi_idivmod
regular_func div_s32s32
regular_func divmod_s32s32
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY   @ shift the DIRTY bit into carry
    bcs divmod_s32s32_savestate
regular_func divmod_s32s32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
// are the hardware_divider functions that can be used instead anyway
regular_func divmod_s32s32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f                                        @ divisor == 0: divide-by-zero path
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    @ divide-by-zero result: 0 if y == 0, INT_MAX if y > 0, INT_MIN if y < 0
    movs r1, #0x80
    lsls r1, #24                                  @ r1 = 0x80000000
    asrs r2, r0, #31                              @ r2 = (y < 0) ? 0xffffffff : 0
    eors r1, r2                                   @ r1 = (y < 0) ? 0x7fffffff : 0x80000000
    cmp r0, #0
    beq 1f                                        @ y == 0: quotient stays 0
    mvns r0, r1                                   @ y != 0: r0 = ~r1 = INT_MIN or INT_MAX
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
@ Dirty-divider slow path: save divider state, do the divide, restore and return.
regular_func divmod_s32s32_savestate
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

// since uidiv and uidivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_uidiv)
@ uint32_t __aeabi_uidiv(uint32_t y, uint32_t x)   -> quotient in r0
@ {quot, rem} __aeabi_uidivmod(uint32_t y, uint32_t x) -> quotient r0, remainder r1
@ Unsigned 32/32 divide using the SIO hardware divider.
regular_func div_u32u32
regular_func divmod_u32u32
wrapper_func __aeabi_uidiv
wrapper_func __aeabi_uidivmod
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY   @ shift the DIRTY bit into carry
    bcs divmod_u32u32_savestate
regular_func divmod_u32u32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
// are the hardware_divider functions that can be used instead anyway
regular_func divmod_u32u32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f                                        @ divisor == 0: divide-by-zero path
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    @ divide-by-zero result: 0 if y == 0, UINT_MAX otherwise
    cmp r0, #0
    beq 1f
    movs r0, #0
    mvns r0, r0                                   @ r0 = 0xffffffff
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
@ Dirty-divider slow path: save divider state, do the divide, restore and return.
regular_func divmod_u32u32_savestate
    save_div_state_and_lr
    bl divmod_u32u32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

div_section WRAPPER_FUNC_NAME(__aeabi_ldiv)

.align 2
@ {int64_t quot, int64_t rem} __aeabi_ldivmod(int64_t y (r0:r1), int64_t x (r2:r3))
@ Signed 64/64 divide: quotient in r0:r1, remainder in r2:r3.
wrapper_func __aeabi_ldivmod
regular_func div_s64s64
regular_func divmod_s64s64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2                                    @ preserve argument r2 across the CSR check
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY   @ shift the DIRTY bit into carry
    mov r2, ip
    bcs divmod_s64s64_savestate
    b divmod_s64s64_unsafe
.align 2
@ Dirty-divider slow path: save divider state, do the divide, restore and return.
divmod_s64s64_savestate:
    save_div_state_and_lr_64
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_s64s64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

.align 2
@ {uint64_t quot, uint64_t rem} __aeabi_uldivmod(uint64_t y (r0:r1), uint64_t x (r2:r3))
@ Unsigned 64/64 divide: quotient in r0:r1, remainder in r2:r3.
wrapper_func __aeabi_uldivmod
regular_func div_u64u64
regular_func divmod_u64u64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2                                    @ preserve argument r2 across the CSR check
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY   @ shift the DIRTY bit into carry
    mov r2, ip
    bcs divmod_u64u64_savestate
    b divmod_u64u64_unsafe
.align 2
@ Dirty-divider slow path: save divider state, do the divide, restore and return.
regular_func divmod_u64u64_savestate
    save_div_state_and_lr_64
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_u64u64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

@ Negate the 64-bit value held in \lo:\hi (two's complement).
.macro dneg lo,hi
    mvns \hi,\hi
    negs \lo,\lo
    bne l\@_1                @ low word non-zero: negs already produced the borrow
    adds \hi,#1              @ low word zero: result is (~hi + 1):0
l\@_1:
.endm

.align 2
@ Signed 64/64 divide core: y in r0:r1, x in r2:r3.
@ Reduces to divmod_u64u64_unsafe on the magnitudes and fixes up the signs of
@ quotient (r0:r1) and remainder (r2:r3) afterwards. Divide-by-zero passes the
@ AEABI-suggested value to __aeabi_ldiv0 when PICO_DIVIDER_CALL_LDIV0 is set.
regular_func divmod_s64s64_unsafe
    cmp r3,#0
    blt 1f
@ here x +ve
    beq 2f                   @ could x be zero?
3:
    cmp r1,#0
    bge divmod_u64u64_unsafe @ both positive
@ y -ve, x +ve
    push {r14}
    dneg r0,r1
    bl divmod_u64u64_unsafe
    dneg r0,r1               @ negate quotient
    dneg r2,r3               @ negate remainder (takes the sign of y)
    pop {r15}

2:
    cmp r2,#0
    bne 3b                   @ back if x not zero

    cmp r0,#0                @ y==0?
    bne 4f
    cmp r1,#0
    beq 5f                   @ then pass 0 to __aeabi_ldiv0
4:
    movs r0,#0
    lsrs r1,#31
    lsls r1,#31              @ get sign bit
    bne 5f                   @ y -ve? pass -2^63 to __aeabi_ldiv0
    mvns r0,r0
    lsrs r1,r0,#1            @ y +ve: pass 2^63-1 to __aeabi_ldiv0
5:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0               @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
@ here x -ve
    push {r14}
    cmp r1,#0
    blt 1f
@ y +ve, x -ve
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r0,r1               @ negate quotient; remainder keeps sign of y (+ve)
    pop {r15}

1:
@ y -ve, x -ve
    dneg r0,r1
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r2,r3               @ negate remainder (takes the sign of y)
    pop {r15}

@ Unsigned 64/64 divide core: y in r0:r1, x in r2:r3.
@ Returns quotient in r0:r1 and remainder in r2:r3. Dispatches on operand
@ sizes: 32/32 goes straight to the hardware divider; larger cases use the
@ divider to form a reciprocal estimate and refine in software. The wait_div
@ counts are matched to the cycle cost of the interleaved instructions.
regular_func divmod_u64u64_unsafe
    cmp r1,#0
    bne y64                  @ y fits in 32 bits?
    cmp r3,#0                @ yes; and x?
    bne 1f
    cmp r2,#0
    beq 2f                   @ x==0?
    @ 32-bit / 32-bit: single hardware divide
    mov r12,r7
    ldr r7,=SIO_BASE
    str r0,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    movs r1,#0               @ high words of quotient/remainder are zero
    movs r3,#0
    wait_div 2
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]
    mov r7,r12
    bx r14

2:                           @ divide by 0 with y<2^32
    cmp r0,#0                @ y==0?
    beq 3f                   @ then pass 0 to __aeabi_ldiv0
udiv0:
    ldr r0,=0xffffffff
    movs r1,r0               @ pass 2^64-1 to __aeabi_ldiv0
3:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0               @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
    movs r2,r0               @ x>y, so result is 0 remainder y
    movs r3,r1
    movs r0,#0
    movs r1,#0
    bx r14

.ltorg

@ here y occupies more than 32 bits
@ split into cases according to the size of x
y64:
    cmp r3,#0
    beq 1f
    b y64_x48                @ if x does not fit in 32 bits, go to 48- and 64-bit cases
1:
    lsrs r3,r2,#16
    bne y64_x32              @ jump if x is 17..32 bits

@ here x is at most 16 bits: do three chained 32/16 hardware divides,
@ feeding each remainder into the next dividend 16 bits at a time

    cmp r2,#0
    beq udiv0                @ x==0? exit as with y!=0 case above
    push {r7}
    ldr r7,=SIO_BASE
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    wait_div 4
    push {r4, r5}
    lsrs r4,r0,#16
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET]  @ r0=y0-q0*x; 0<=r0<x
    ldr r1,[r7,#SIO_DIV_QUOTIENT_OFFSET]   @ q0=y0/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET]  @ y1=(r0<<16)+(((ui32)y)>>16);
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]   @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
    wait_div 1
    uxth r4,r0
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET]  @ r1=y1-q1*x; 0<=r1<x
    ldr r5,[r7,#SIO_DIV_QUOTIENT_OFFSET]   @ q1=y1/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET]  @ y1=(r0<<16)+(((ui32)y)>>16);
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]   @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
    wait_div 3
    movs r3,#0
    lsls r4,r5,#16                         @ quotient=(q0<<32)+(q1<<16)+q2
    lsrs r5,#16
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]  @ r2=y2-q2*x; 0<=r2<x
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]   @ q2=y2/x;
    adds r0,r4
    adcs r1,r5
    pop {r4,r5,r7}
    bx r14

.ltorg

y64_x32:
@ here x is 17..32 bits: normalise x, build a 16-bit reciprocal estimate r with
@ one hardware divide, then peel quotient digits off y by multiply-and-subtract
    push {r4-r7,r14}
    mov r12,r2               @ save x
    movs r5,#0               @ xsh=0
    lsrs r4,r2,#24
    bne 1f
    lsls r2,#8               @ if(x0<1U<<24) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r2,#28
    bne 1f
    lsls r2,#4               @ if(x0<1U<<28) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r2,#30
    bne 1f
    lsls r2,#2               @ if(x0<1U<<30) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r2,#31
    bne 1f
    lsls r2,#1               @ if(x0<1U<<31) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^31<=x0<2^32, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+33 33<=qb<49
    lsrs r4,r2,#15
    adds r4,#1               @ x1=(x0>>15)+1; 2^16<x1<=2^17

    ldr r7,=SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    uxth r3,r2               @ x0l
    wait_div 2
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]   @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh
@ r12   x

    muls r6,r4
    lsrs r6,#16              @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7               @ quh=q0<<13

    muls r3,r6               @ x0l*q
    lsrs r7,r3,#15
    lsls r3,#17              @ r3:r7 is (x0l*q)<<17
    subs r0,r3
    sbcs r1,r7               @ y-=(x0l*q)<<17

    lsrs r3,r2,#16           @ x0h
    muls r3,r6               @ q*x0h
    adds r3,r3
    subs r1,r3               @ y-=(x0h*q)<<17

    lsrs r6,r1,#3
    muls r6,r4
    lsrs r6,#16              @ q=((ui32)(y>>35)*r)>>16;
    add r14,r6               @ quh+=q1

    uxth r3,r2               @ x0l
    muls r3,r6               @ x0l*q
    lsrs r7,r3,#28
    lsls r3,#4               @ r3:r7 is (x0l*q)<<4
    subs r0,r3
    sbcs r1,r7               @ y-=(x0l*q)<<4

    lsrs r3,r2,#16           @ x0h
    muls r3,r6               @ x0h*q
    lsrs r7,r3,#12
    lsls r3,#20              @ r3:r7 is (x0h*q)<<4
    subs r0,r3
    sbcs r1,r7               @ y-=(x0h*q)<<4

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7               @ y>>22
    muls r6,r4
    lsrs r6,#16              @ q=((ui32)(y>>22)*r)>>16;

    cmp r5,#9
    blt last0                @ if(xsh<9) goto last0;

@ on this path xsh>=9, which means x<2^23
    lsrs r2,#9               @ x0>>9: this shift loses no bits
@ the remainder y-x0*q is guaranteed less than a very small multiple of the remaining quotient
@ bits (at most 6 bits) times x, and so fits in one word
    muls r2,r6               @ x0*q
    subs r0,r2               @ y-x0*q
    lsls r7,r6,#13           @ qul=q<<13
1:
    lsrs r6,r0,#9
    muls r6,r4
    lsrs r6,#16              @ q=((ui32)(y>>9)*r)>>16;

@ here
@ r0    y
@ r2    x0>>9
@ r5    xsh
@ r6    q
@ r7    qul
@ r12   x
@ r14   quh

    movs r3,#22
    subs r3,r5               @ 22-xsh
    lsrs r6,r3               @ q>>=22-xsh
    lsrs r7,r3               @ qul>>=22-xsh
    adds r7,r6               @ qul+=q
    mov r4,r12
    muls r6,r4               @ x*q
    subs r2,r0,r6            @ y-=x*q
    mov r0,r14               @ quh
    adds r5,#4               @ xsh+4
    adds r3,#6               @ 28-xsh
    movs r1,r0
    lsrs r1,r3
    lsls r0,r5               @ r0:r1 is quh<<(4+xsh)
    adds r0,r7
    bcc 1f
2:
    adds r1,#1
1:                           @ qu=((ui64)quh<<(4+xsh))+qul
    cmp r2,r4
    bhs 3f
    movs r3,#0
    pop {r4-r7,r15}

.ltorg

3:
    subs r2,r4
    adds r0,#1
    bcc 1b
    b 2b                     @ while(y>=x) y-=x,qu++;

@ here:
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh; xsh<9
@ r6    q

last0:
    movs r7,#9
    subs r7,r5               @ 9-xsh
    lsrs r6,r7
    mov r4,r12               @ x
    uxth r2,r4
    muls r2,r6               @ q*xlo
    subs r0,r2
    bcs 1f
    subs r1,#1               @ y-=q*xlo
1:
    lsrs r2,r4,#16           @ xhi
    muls r2,r6               @ q*xhi
    lsrs r3,r2,#16
    lsls r2,#16
    subs r2,r0,r2
    sbcs r1,r3               @ y-q*xhi
    movs r3,r1               @ y now in r2:r3
    mov r0,r14               @ quh
    adds r5,#4               @ xsh+4
    adds r7,#19              @ 28-xsh
    movs r1,r0
    lsrs r1,r7
    lsls r0,r5               @ r0:r1 is quh<<(4+xsh)
    adds r0,r6
    bcc 1f
    adds r1,#1               @ quh<<(xsh+4))+q
1:
    cmp r3,#0                @ y>=2^32?
    bne 3f
    cmp r2,r4                @ y>=x?
    bhs 4f
    pop {r4-r7,r15}

3:
    adds r0,#1               @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4               @ y-=x
    bcs 3b
    subs r3,#1
    bne 3b

1:
    cmp r2,r4
    bhs 4f
    pop {r4-r7,r15}

4:
    adds r0,#1               @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4               @ y-=x
    b 1b

y64_x48:
@ here x is 33..64 bits
    push {r4-r7,r14}         @ save working registers and lr
    lsrs r4,r3,#16
    beq 1f
    b y64_x64                @ jump if x is 49..64 bits
1:
    push {r2-r3}             @ save a copy of x
@ here x is 33..48 bits: normalise x0=x<<xsh to 2^47..2^48, build reciprocal
@ estimate, then peel quotient digits as in the 17..32-bit case
    movs r5,#0               @ xsh=0
    lsrs r4,r3,#8
    bne 1f
    lsls r3,#8
    lsrs r6,r2,#24
    orrs r3,r6
    lsls r2,#8               @ if(x0<1U<<40) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r3,#12
    bne 1f
    lsls r3,#4
    lsrs r6,r2,#28
    orrs r3,r6
    lsls r2,#4               @ if(x0<1U<<44) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r3,#14
    bne 1f
    lsls r3,#2
    lsrs r6,r2,#30
    orrs r3,r6
    lsls r2,#2               @ if(x0<1U<<46) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r3,#15
    bne 1f
    adds r2,r2
    adcs r3,r3               @ if(x0<1U<<47) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^47<=x0<2^48, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+17 17<=qb<33
    movs r4,r3
    adds r7,r2,r2
    adcs r4,r4
    adds r4,#1               @ x1=(ui32)(x0>>31)+1; // 2^16<x1<=2^17

    ldr r7,=SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    wait_div 1
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]   @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2:r3 x0
@ r4    r
@ r5    xsh 0<=xsh<16

    muls r6,r4
    lsrs r6,#16              @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7               @ save q<<13
    @ subtract (q*x0)<<1 from y: each partial product is subtracted twice
    uxth r7,r2               @ x0l
    muls r7,r6
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3               @ x0h
    muls r7,r6
    subs r1,r7
    subs r1,r7
    lsrs r7,r2,#16           @ x0m
    muls r7,r6
    lsls r6,r7,#17
    lsrs r7,#15
    subs r0,r6
    sbcs r1,r7               @ y-=((ui64)q*x0)<<1;

    lsrs r6,r1,#3            @ y>>35
    muls r6,r4
    lsrs r6,#16              @ q=((ui32)(y>>35)*r)>>16;

    cmp r5,#12
    blt last1                @ if(xsh<12) goto last1;

    add r14,r6               @ qu<<13+q
    lsrs r2,#12
    lsls r7,r3,#20
    orrs r2,r7
    lsrs r3,#12              @ x0>>12

    uxth r7,r2               @ x0l
    muls r7,r6
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3               @ x0h
    muls r7,r6
    subs r1,r7
    lsrs r7,r2,#16           @ x0m
    muls r7,r6
    lsls r6,r7,#16
    lsrs r7,#16
    subs r0,r6
    sbcs r1,r7               @ y-=((ui64)q*x0)>>12

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7               @ y>>22
    muls r6,r4
    movs r7,#41
    subs r7,r5
    lsrs r6,r7               @ q=((ui32)(y>>22)*r)>>(16+25-xsh)

    subs r5,#12
    mov r7,r14
    lsls r7,r5
2:
    adds r7,r6               @ qu=(qu<<(xsh-12))+q
    pop {r4,r5}              @ recall x

@ here
@ r0:r1 y
@ r4:r5 x
@ r6    q
@ r7    qu

    uxth r2,r4
    uxth r3,r5
    muls r2,r6               @ xlo*q
    muls r3,r6               @ xhi*q
    subs r0,r2
    sbcs r1,r3
    lsrs r2,r4,#16
    muls r2,r6
    lsrs r3,r2,#16
    lsls r2,#16              @ xm*q
    subs r0,r2
    sbcs r1,r3               @ y-=(ui64)q*x

1:
    movs r2,r0
    movs r3,r1
    adds r7,#1
    subs r0,r4
    sbcs r1,r5               @ while(y>=x) y-=x,qu++;
    bhs 1b
    subs r0,r7,#1            @ correction to qu
    movs r1,#0
    pop {r4-r7,r15}

last1:
@ r0:r1 y
@ r2:r3 x0
@ r5    xsh
@ r6    q

    movs r7,#12
    subs r7,r5
    lsrs r6,r7               @ q>>=12-xsh
    mov r7,r14
    lsrs r7,#13
    lsls r7,r5
    adds r7,r7               @ qu<<(xsh+1)
    b 2b

y64_x64:
@ here x is 49..64 bits: estimate quotient as (y>>32)/((x>>32)+1) with one
@ hardware divide, then correct by repeated subtraction (at most a few steps)
    movs r4,#0               @ q=0 if x>>32==0xffffffff
    adds r5,r3,#1
    beq 1f

    ldr r7,=SIO_BASE
    str r5,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    wait_div 0
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]   @ q=(ui32)(y>>32)/((x>>32)+1)
1:
    uxth r5,r2
    uxth r6,r3
    muls r5,r4
    muls r6,r4
    subs r0,r5
    sbcs r1,r6
    lsrs r5,r2,#16
    lsrs r6,r3,#16
    muls r5,r4
    muls r6,r4
    lsls r6,#16
    lsrs r7,r5,#16
    orrs r6,r7
    lsls r5,#16
    subs r0,r5
    sbcs r1,r6               @ y-=(ui64)q*x

    cmp r1,r3                @ while(y>=x) y-=x,q++
    bhs 1f
3:
    movs r2,r0
    movs r3,r1
    movs r0,r4
    movs r1,#0
    pop {r4-r7,r15}

1:
    bne 2f
    cmp r0,r2
    blo 3b
2:
    subs r0,r2
    sbcs r1,r3
    adds r4,#1
    cmp r1,r3
    blo 3b
    b 1b

div_section divmod_s64s64_rem
@ int64_t divmod_s64s64_rem(int64_t y, int64_t x, int64_t *rem)
@ Returns the quotient; stores the remainder through the pointer passed on the
@ stack (5th argument slot, at [sp, #8] after the push below).
regular_func divmod_s64s64_rem
    push {r4, lr}
    bl divmod_s64s64
    ldr r4, [sp, #8]
    stmia r4!, {r2,r3}
    pop {r4, pc}

div_section divmod_u64u64_rem
@ uint64_t divmod_u64u64_rem(uint64_t y, uint64_t x, uint64_t *rem)
@ Returns the quotient; stores the remainder through the pointer passed on the
@ stack (5th argument slot, at [sp, #8] after the push below).
regular_func divmod_u64u64_rem
    push {r4, lr}
    bl divmod_u64u64
    ldr r4, [sp, #8]
    stmia r4!, {r2,r3}
    pop {r4, pc}