/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"
#include "hardware/regs/addressmap.h"
#include "hardware/divider_helper.S"

#if !HAS_SIO_DIVIDER
#warning "Building divider_hardware.S on a platform with no SIO divider hardware"
#endif

// PICO_CONFIG: PICO_DIVIDER_DISABLE_INTERRUPTS, Disable interrupts around division such that divider state need not be saved/restored in exception handlers, default=0, group=pico_divider

// PICO_CONFIG: PICO_DIVIDER_CALL_IDIV0, Whether 32 bit division by zero should call __aeabi_idiv0, default=1, group=pico_divider
#ifndef PICO_DIVIDER_CALL_IDIV0
#define PICO_DIVIDER_CALL_IDIV0 1
#endif

// PICO_CONFIG: PICO_DIVIDER_CALL_LDIV0, Whether 64 bit division by zero should call __aeabi_ldiv0, default=1, group=pico_divider
#ifndef PICO_DIVIDER_CALL_LDIV0
#define PICO_DIVIDER_CALL_LDIV0 1
#endif
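
// Note: these PICO_DIVIDER_* options are ordinary compile-time defines; as an
// illustration (assuming a CMake executable target named `app`) they can be set
// from the build with:
//   target_compile_definitions(app PRIVATE PICO_DIVIDER_DISABLE_INTERRUPTS=1)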

pico_default_asm_setup

// PICO_CONFIG: PICO_DIVIDER_IN_RAM, Whether divider functions should be placed in RAM, default=0, group=pico_divider
.macro div_section name
#if PICO_DIVIDER_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

@ wait 8-n cycles for the hardware divider
.macro wait_div n
.rept (8-\n) / 2
    b 9f
9:
.endr
.if (8-\n) % 2
    nop
.endif
.endm
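
// For illustration: the divider result is valid 8 cycles after the operands are
// written, and `wait_div n` assumes the surrounding code already accounts for n
// of those cycles. So `wait_div 2` emits (8-2)/2 = 3 branch-to-next instructions
// (2 cycles each on Cortex-M0+); had the remaining count been odd, a final
// 1-cycle nop would make up the difference.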

#if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
#error register layout has changed - we rely on this order to make sure we save/restore in the right order
#endif

#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
    // No need to wait before reading result as long as preceding code takes more than 8 cycles
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm

.macro restore_div_state_and_return_64
    // writing sdividend (r4), sdivisor (r5), quotient (r6), remainder (r7) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on incorrect inputs, but the dividend at least will be
    //        saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on possibly wrongly signed inputs, but the dividend and divisor
    //        at least will be saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5, r6: we are DIRTY and READY
    //    ... an interruptor using the divider will use the dividend, divisor and quotient registers as is (what we just restored ourselves),
    //        and we'll restore the remainder after the fact

    mov ip, r2
    ldr r2, =SIO_BASE
    // note we do not use STM, not because it could be restarted by an interrupt (which is harmless), but because this is
    // 1-cycle IO space, so four separate stores are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    mov r2, ip
    pop {r4, r5, r6, r7, pc}
.endm

#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
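
// In C-like pseudocode, the dirty-state handling used by the wrappers below
// amounts to (a sketch, not the literal implementation):
//
//   if (sio_div_csr & DIRTY) {        // an interrupted division is in flight
//       save dividend, divisor, remainder, quotient (quotient read last);
//       do our division;
//       rewrite dividend, divisor, quotient, remainder (see ordering notes above);
//   } else {
//       do our division;              // nothing to preserve
//   }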

// since idiv and idivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_idiv)
.align 2
wrapper_func __aeabi_idiv
wrapper_func __aeabi_idivmod
regular_func div_s32s32
regular_func divmod_s32s32
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_s32s32_savestate
regular_func divmod_s32s32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as the
// hardware_divider functions can be used instead anyway
regular_func divmod_s32s32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    // divide by zero: quotient is INT_MAX (y > 0), INT_MIN (y < 0) or 0 (y == 0)
    movs r1, #0x80
    lsls r1, #24
    asrs r2, r0, #31
    eors r1, r2
    cmp r0, #0
    beq 1f
    mvns r0, r1
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds the SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
regular_func divmod_s32s32_savestate
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
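
// Contract sketch in C (illustrative only): for divmod_s32s32(a, b) the quotient
// a / b comes back in r0 and the remainder a % b in r1, i.e. the pair packs into
// a single 64-bit return value; this matches the AEABI __aeabi_idivmod
// convention, which is why the four entry points above can share one body.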

// since uidiv and uidivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_uidiv)
.align 2
regular_func div_u32u32
regular_func divmod_u32u32
wrapper_func __aeabi_uidiv
wrapper_func __aeabi_uidivmod
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_u32u32_savestate
regular_func divmod_u32u32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as the
// hardware_divider functions can be used instead anyway
regular_func divmod_u32u32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    // divide by zero: quotient is UINT_MAX (y != 0) or 0 (y == 0)
    cmp r0, #0
    beq 1f
    movs r0, #0
    mvns r0, r0
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds the SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
regular_func divmod_u32u32_savestate
    save_div_state_and_lr
    bl divmod_u32u32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
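
// The unsigned flavour mirrors the signed one exactly, but writes the
// UDIVIDEND/UDIVISOR aliases so the hardware performs an unsigned divide; per
// AEABI, __aeabi_uidivmod likewise returns the quotient in r0 and the remainder
// in r1.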

div_section WRAPPER_FUNC_NAME(__aeabi_ldiv)

.align 2
wrapper_func __aeabi_ldivmod
regular_func div_s64s64
regular_func divmod_s64s64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_s64s64_savestate
    b divmod_s64s64_unsafe
.align 2
divmod_s64s64_savestate:
    save_div_state_and_lr_64
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_s64s64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

.align 2
wrapper_func __aeabi_uldivmod
regular_func div_u64u64
regular_func divmod_u64u64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_u64u64_savestate
    b divmod_u64u64_unsafe
.align 2
regular_func divmod_u64u64_savestate
    save_div_state_and_lr_64
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_u64u64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

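@ dneg negates the 64-bit value held in the register pair \lo:\hi; an
@ illustrative C equivalent is v = ~v + 1 across both words: invert the high
@ word, negate the low word, and bump the high word only when the negated low
@ word is zero (i.e. when the +1 carries out of the low word).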
.macro dneg lo,hi
 mvns \hi,\hi
 negs \lo,\lo
 bne l\@_1
 adds \hi,#1
l\@_1:
.endm

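@ Sign handling below follows C truncated division (a sketch: the quotient sign
@ is sign(y) ^ sign(x), and the remainder takes the sign of the dividend y):
@   y<0, x>0:  q = -((-y)/x),    r = -((-y)%x)
@   y>0, x<0:  q = -(y/(-x)),    r =   y%(-x)
@   y<0, x<0:  q = (-y)/(-x),    r = -((-y)%(-x))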
.align 2
regular_func divmod_s64s64_unsafe
 cmp r3,#0
 blt 1f
@ here x +ve
 beq 2f                    @ could x be zero?
3:
 cmp r1,#0
 bge divmod_u64u64_unsafe  @ both positive
@ y -ve, x +ve
 push {r14}
 dneg r0,r1
 bl divmod_u64u64_unsafe
 dneg r0,r1
 dneg r2,r3
 pop {r15}

2:
 cmp r2,#0
 bne 3b                    @ back if x not zero

 cmp r0,#0                 @ y==0?
 bne 4f
 cmp r1,#0
 beq 5f                    @ then pass 0 to __aeabi_ldiv0
4:
 movs r0,#0
 lsrs r1,#31
 lsls r1,#31               @ get sign bit
 bne 5f                    @ y -ve? pass -2^63 to __aeabi_ldiv0
 mvns r0,r0
 lsrs r1,r0,#1             @ y +ve: pass 2^63-1 to __aeabi_ldiv0
5:
 push {r14}
#if PICO_DIVIDER_CALL_LDIV0
 bl __aeabi_ldiv0
#endif
 movs r2,#0                @ and return 0 for the remainder
 movs r3,#0
 pop {r15}

1:
@ here x -ve
 push {r14}
 cmp r1,#0
 blt 1f
@ y +ve, x -ve
 dneg r2,r3
 bl divmod_u64u64_unsafe
 dneg r0,r1
 pop {r15}

1:
@ y -ve, x -ve
 dneg r0,r1
 dneg r2,r3
 bl divmod_u64u64_unsafe
 dneg r2,r3
 pop {r15}

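@ divmod_u64u64_unsafe dispatches on operand width (illustrative outline):
@   y < 2^32           : one hardware 32/32 divide (x >= 2^32 gives q=0, r=y)
@   x < 2^16           : three chained hardware divides, 16 bits of y at a time
@   2^16 <= x < 2^32   : normalise x, then refine with a reciprocal estimate
@   2^32 <= x < 2^48   : the same idea with a 48-bit normalised x
@   x >= 2^48          : one estimate from the top words, then correct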
regular_func divmod_u64u64_unsafe
 cmp r1,#0
 bne y64                   @ y fits in 32 bits?
 cmp r3,#0                 @ yes; and x?
 bne 1f
 cmp r2,#0
 beq 2f                    @ x==0?
 mov r12,r7
 ldr r7,=SIO_BASE
 str r0,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 movs r1,#0
 movs r3,#0
 wait_div 2
 ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]
 ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]
 mov r7,r12
 bx r14

2:                         @ divide by 0 with y<2^32
 cmp r0,#0                 @ y==0?
 beq 3f                    @ then pass 0 to __aeabi_ldiv0
udiv0:
 ldr r0,=0xffffffff
 movs r1,r0                @ pass 2^64-1 to __aeabi_ldiv0
3:
 push {r14}
#if PICO_DIVIDER_CALL_LDIV0
 bl __aeabi_ldiv0
#endif
 movs r2,#0                @ and return 0 for the remainder
 movs r3,#0
 pop {r15}

1:
 movs r2,r0                @ x>y, so result is 0 remainder y
 movs r3,r1
 movs r0,#0
 movs r1,#0
 bx r14

.ltorg

@ here y occupies more than 32 bits
@ split into cases according to the size of x
y64:
 cmp r3,#0
 beq 1f
 b y64_x48                 @ if x does not fit in 32 bits, go to 48- and 64-bit cases
1:
 lsrs r3,r2,#16
 bne y64_x32               @ jump if x is 17..32 bits

@ here x is at most 16 bits

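@ Plan (an illustrative C sketch of the three chained divides below, where yh/yl
@ are the high/low words of y and every divide uses the 32-bit hardware divider;
@ each intermediate dividend fits in 32 bits since the remainder is < x < 2^16):
@   q0 = yh / x;                      r0 = yh % x;
@   y1 = (r0 << 16) | (yl >> 16);     q1 = y1 / x;  r1 = y1 % x;
@   y2 = (r1 << 16) | (yl & 0xffff);  q2 = y2 / x;  remainder = y2 % x;
@   quotient = ((ui64)q0 << 32) + (q1 << 16) + q2;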
 cmp r2,#0
 beq udiv0                 @ x==0? exit as with y!=0 case above
 push {r7}
 ldr r7,=SIO_BASE
 str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 wait_div 4
 push {r4, r5}
 lsrs r4,r0,#16
 ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r0=y0-q0*x; 0<=r0<x
 ldr r1,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q0=y0/x;
 lsls r3,#16
 orrs r3,r4
 str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y1=(r0<<16)+(((ui32)y)>>16);
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]  @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
 wait_div 1
 uxth r4,r0
 ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r1=y1-q1*x; 0<=r1<x
 ldr r5,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q1=y1/x;
 lsls r3,#16
 orrs r3,r4
 str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y2=(r1<<16)+((ui16)y);
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]  @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
 wait_div 3
 movs r3,#0
 lsls r4,r5,#16             @ quotient=(q0<<32)+(q1<<16)+q2
 lsrs r5,#16
 ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r2=y2-q2*x; 0<=r2<x
 ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q2=y2/x;
 adds r0,r4
 adcs r1,r5
 pop {r4,r5,r7}
 bx r14

.ltorg

y64_x32:
@ here x is 17..32 bits
 push {r4-r7,r14}
 mov r12,r2                @ save x
 movs r5,#0                @ xsh=0
 lsrs r4,r2,#24
 bne 1f
 lsls r2,#8                @ if(x0<1U<<24) x0<<=8,xsh =8;
 adds r5,#8
1:
 lsrs r4,r2,#28
 bne 1f
 lsls r2,#4                @ if(x0<1U<<28) x0<<=4,xsh+=4;
 adds r5,#4
1:
 lsrs r4,r2,#30
 bne 1f
 lsls r2,#2                @ if(x0<1U<<30) x0<<=2,xsh+=2;
 adds r5,#2
1:
 lsrs r4,r2,#31
 bne 1f
 lsls r2,#1                @ if(x0<1U<<31) x0<<=1,xsh+=1;
 adds r5,#1
1:
@ now 2^31<=x0<2^32, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+33 33<=qb<49
 lsrs r4,r2,#15
 adds r4,#1                @ x1=(x0>>15)+1; 2^16<x1<=2^17

 ldr r7,=SIO_BASE
 str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 ldr r4,=0xffffffff
 str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 lsrs r6,r1,#16
 uxth r3,r2                @ x0l
 wait_div 2
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

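@ Here r approximates a fixed-point reciprocal: r = floor(0xffffffff/((x0>>15)+1)),
@ so r <= 2^47/x0 and each q estimate below slightly underestimates the true
@ quotient digit; the leftover error stays in y and is swept up by the later
@ estimates and the final while(y>=x) correction.
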
@ here
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh
@ r12   x

 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
 lsls r7,r6,#13
 mov r14,r7                @ quh=q0<<13

 muls r3,r6                @ x0l*q
 lsrs r7,r3,#15
 lsls r3,#17               @ r3:r7 is (x0l*q)<<17
 subs r0,r3
 sbcs r1,r7                @ y-=(x0l*q)<<17

 lsrs r3,r2,#16            @ x0h
 muls r3,r6                @ q*x0h
 adds r3,r3
 subs r1,r3                @ y-=(x0h*q)<<17

 lsrs r6,r1,#3
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;
 add r14,r6                @ quh+=q1

 uxth r3,r2                @ x0l
 muls r3,r6                @ x0l*q
 lsrs r7,r3,#28
 lsls r3,#4                @ r3:r7 is (x0l*q)<<4
 subs r0,r3
 sbcs r1,r7                @ y-=(x0l*q)<<4

 lsrs r3,r2,#16            @ x0h
 muls r3,r6                @ x0h*q
 lsrs r7,r3,#12
 lsls r3,#20               @ r3:r7 is (x0h*q)<<20
 subs r0,r3
 sbcs r1,r7                @ y-=(x0h*q)<<20

 lsrs r6,r0,#22
 lsls r7,r1,#10
 orrs r6,r7                @ y>>22
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>22)*r)>>16;

 cmp r5,#9
 blt last0                 @ if(xsh<9) goto last0;

@ on this path xsh>=9, which means x<2^23
 lsrs r2,#9                @ x0>>9: this shift loses no bits
@ the remainder y-x0*q is guaranteed less than a very small multiple of the remaining quotient
@ bits (at most 6 bits) times x, and so fits in one word
 muls r2,r6                @ x0*q
 subs r0,r2                @ y-x0*q
 lsls r7,r6,#13            @ qul=q<<13
1:
 lsrs r6,r0,#9
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>9)*r)>>16;

@ here
@ r0 y
@ r2 x0>>9
@ r5 xsh
@ r6 q
@ r7 qul
@ r12 x
@ r14 quh

 movs r3,#22
 subs r3,r5                @ 22-xsh
 lsrs r6,r3                @ q>>=22-xsh
 lsrs r7,r3                @ qul>>=22-xsh
 adds r7,r6                @ qul+=q
 mov r4,r12
 muls r6,r4                @ x*q
 subs r2,r0,r6             @ y-=x*q
 mov r0,r14                @ quh
 adds r5,#4                @ xsh+4
 adds r3,#6                @ 28-xsh
 movs r1,r0
 lsrs r1,r3
 lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
 adds r0,r7
 bcc 1f
2:
 adds r1,#1
1:                         @ qu=((ui64)quh<<(4+xsh))+qul
 cmp r2,r4
 bhs 3f
 movs r3,#0
 pop {r4-r7,r15}

.ltorg

3:
 subs r2,r4
 adds r0,#1
 bcc 1b
 b 2b                      @ while(y>=x) y-=x,qu++;

@ here:
@ r0:r1 y
@ r2 x0
@ r4 r
@ r5 xsh; xsh<9
@ r6 q

last0:
 movs r7,#9
 subs r7,r5                @ 9-xsh
 lsrs r6,r7
 mov r4,r12                @ x
 uxth r2,r4
 muls r2,r6                @ q*xlo
 subs r0,r2
 bcs 1f
 subs r1,#1                @ y-=q*xlo
1:
 lsrs r2,r4,#16            @ xhi
 muls r2,r6                @ q*xhi
 lsrs r3,r2,#16
 lsls r2,#16
 subs r2,r0,r2
 sbcs r1,r3                @ y-q*xhi
 movs r3,r1                @ y now in r2:r3
 mov r0,r14                @ quh
 adds r5,#4                @ xsh+4
 adds r7,#19               @ 28-xsh
 movs r1,r0
 lsrs r1,r7
 lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
 adds r0,r6
 bcc 1f
 adds r1,#1                @ (quh<<(xsh+4))+q
1:
 cmp r3,#0                 @ y>=2^32?
 bne 3f
 cmp r2,r4                 @ y>=x?
 bhs 4f
 pop {r4-r7,r15}

3:
 adds r0,#1                @ qu++
 bcc 2f
 adds r1,#1
2:
 subs r2,r4                @ y-=x
 bcs 3b
 subs r3,#1
 bne 3b

1:
 cmp r2,r4
 bhs 4f
 pop {r4-r7,r15}

4:
 adds r0,#1                @ qu++
 bcc 2f
 adds r1,#1
2:
 subs r2,r4                @ y-=x
 b 1b

y64_x48:
@ here x is 33..64 bits
 push {r4-r7,r14}
 lsrs r4,r3,#16
 beq 1f
 b y64_x64                 @ jump if x is 49..64 bits
1:
 push {r2-r3}              @ save a copy of x
@ here x is 33..48 bits
 movs r5,#0                @ xsh=0
 lsrs r4,r3,#8
 bne 1f
 lsls r3,#8
 lsrs r6,r2,#24
 orrs r3,r6
 lsls r2,#8                @ if(x0<1U<<40) x0<<=8,xsh =8;
 adds r5,#8
1:
 lsrs r4,r3,#12
 bne 1f
 lsls r3,#4
 lsrs r6,r2,#28
 orrs r3,r6
 lsls r2,#4                @ if(x0<1U<<44) x0<<=4,xsh+=4;
 adds r5,#4
1:
 lsrs r4,r3,#14
 bne 1f
 lsls r3,#2
 lsrs r6,r2,#30
 orrs r3,r6
 lsls r2,#2                @ if(x0<1U<<46) x0<<=2,xsh+=2;
 adds r5,#2
1:
 lsrs r4,r3,#15
 bne 1f
 adds r2,r2
 adcs r3,r3                @ if(x0<1U<<47) x0<<=1,xsh+=1;
 adds r5,#1
1:
@ now 2^47<=x0<2^48, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+17 17<=qb<33
 movs r4,r3
 adds r7,r2,r2
 adcs r4,r4
 adds r4,#1                @ x1=(ui32)(x0>>31)+1; 2^16<x1<=2^17

 ldr r7,=SIO_BASE
 str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 ldr r4,=0xffffffff
 str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 lsrs r6,r1,#16
 wait_div 1
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2:r3 x0
@ r4    r
@ r5    xsh 0<=xsh<16

 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
 lsls r7,r6,#13
 mov r14,r7                @ save q<<13
 uxth r7,r2                @ x0l
 muls r7,r6
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 uxth r7,r3                @ x0h
 muls r7,r6
 subs r1,r7
 subs r1,r7
 lsrs r7,r2,#16            @ x0m
 muls r7,r6
 lsls r6,r7,#17
 lsrs r7,#15
 subs r0,r6
 sbcs r1,r7                @ y-=((ui64)q*x0)<<1;

 lsrs r6,r1,#3             @ y>>35
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;

 cmp r5,#12
 blt last1                 @ if(xsh<12) goto last1;

 add r14,r6                @ qu<<13+q
 lsrs r2,#12
 lsls r7,r3,#20
 orrs r2,r7
 lsrs r3,#12               @ x0>>12

 uxth r7,r2                @ x0l
 muls r7,r6
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 uxth r7,r3                @ x0h
 muls r7,r6
 subs r1,r7
 lsrs r7,r2,#16            @ x0m
 muls r7,r6
 lsls r6,r7,#16
 lsrs r7,#16
 subs r0,r6
 sbcs r1,r7                @ y-=((ui64)q*x0)>>12

 lsrs r6,r0,#22
 lsls r7,r1,#10
 orrs r6,r7                @ y>>22
 muls r6,r4
 movs r7,#41
 subs r7,r5
 lsrs r6,r7                @ q=((ui32)(y>>22)*r)>>(16+25-xsh)

 subs r5,#12
 mov r7,r14
 lsls r7,r5
2:
 adds r7,r6                @ qu=(qu<<(xsh-12))+q
 pop {r4,r5}               @ recall x

@ here
@ r0:r1 y
@ r4:r5 x
@ r6 q
@ r7 qu

 uxth r2,r4
 uxth r3,r5
 muls r2,r6                @ xlo*q
 muls r3,r6                @ xhi*q
 subs r0,r2
 sbcs r1,r3
 lsrs r2,r4,#16
 muls r2,r6
 lsrs r3,r2,#16
 lsls r2,#16               @ xm*q
 subs r0,r2
 sbcs r1,r3                @ y-=(ui64)q*x

1:
 movs r2,r0
 movs r3,r1
 adds r7,#1
 subs r0,r4
 sbcs r1,r5                @ while(y>=x) y-=x,qu++;
 bhs 1b
 subs r0,r7,#1             @ correction to qu
 movs r1,#0
 pop {r4-r7,r15}

last1:
@ r0:r1 y
@ r2:r3 x0
@ r5 xsh
@ r6 q

 movs r7,#12
 subs r7,r5
 lsrs r6,r7                @ q>>=12-xsh
 mov r7,r14
 lsrs r7,#13
 lsls r7,r5
 adds r7,r7                @ qu<<(xsh+1)
 b 2b

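@ Sketch of the wide-divisor case below: with x >= 2^48 the quotient fits in
@ 16 bits, so a single estimate q = (ui32)(y>>32) / ((ui32)(x>>32)+1), computed
@ on the hardware divider (or q=0 when x>>32 is 0xffffffff), is already within a
@ small distance of the true quotient; the trailing loop finishes the job with
@ repeated subtraction.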
y64_x64:
@ here x is 49..64 bits
 movs r4,#0                @ q=0 if x>>32==0xffffffff
 adds r5,r3,#1
 beq 1f

 ldr r7,=SIO_BASE
 str r5,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 wait_div 0
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q=(ui32)(y>>32)/((x>>32)+1)
1:
 uxth r5,r2
 uxth r6,r3
 muls r5,r4
 muls r6,r4
 subs r0,r5
 sbcs r1,r6
 lsrs r5,r2,#16
 lsrs r6,r3,#16
 muls r5,r4
 muls r6,r4
 lsls r6,#16
 lsrs r7,r5,#16
 orrs r6,r7
 lsls r5,#16
 subs r0,r5
 sbcs r1,r6                @   y-=(ui64)q*x

 cmp r1,r3                 @   while(y>=x) y-=x,q++
 bhs 1f
3:
 movs r2,r0
 movs r3,r1
 movs r0,r4
 movs r1,#0
 pop {r4-r7,r15}

1:
 bne 2f
 cmp r0,r2
 blo 3b
2:
 subs r0,r2
 sbcs r1,r3
 adds r4,#1
 cmp r1,r3
 blo 3b
 b 1b

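// Illustrative C view of the two *_rem entry points below (the remainder
// pointer is the third argument, passed on the stack per the AAPCS and fetched
// at [sp, #8] after the push {r4, lr}):
//   int64_t divmod_s64s64_rem(int64_t y, int64_t x, int64_t *rem);
//   uint64_t divmod_u64u64_rem(uint64_t y, uint64_t x, uint64_t *rem);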
div_section divmod_s64s64_rem
regular_func divmod_s64s64_rem
    push {r4, lr}
    bl divmod_s64s64
    ldr r4, [sp, #8]
    stmia r4!, {r2,r3}
    pop {r4, pc}

div_section divmod_u64u64_rem
regular_func divmod_u64u64_rem
    push {r4, lr}
    bl divmod_u64u64
    ldr r4, [sp, #8]
    stmia r4!, {r2,r3}
    pop {r4, pc}
