1/*
2 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#include "pico/asm_helper.S"
8
9pico_default_asm_setup
10
.macro double_section name
// todo separate flag for shims?
// Place the named function either in RAM or in flash according to the
// PICO_DOUBLE_IN_RAM build flag; both variants are marked "ax"
// (allocatable + executable).
#if PICO_DOUBLE_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
19
double_section double_table_shim_on_use_helper
@ On-first-use helper for shimmed ROM table entries.
@ On entry ip points at (or just before) an inline marker halfword whose
@ top byte is 0xdf (Thumb SVC encoding — presumably emitted by the
@ shimmable_ table_tail_call macro; confirm against that macro) and whose
@ bottom byte is the sd_table offset, followed by a possibly unaligned
@ 32-bit function pointer.  The helper patches sd_table[offset] with that
@ pointer, then jumps to it by overwriting the saved lr slot on the stack
@ before popping pc.
regular_func double_table_shim_on_use_helper
    push {r0-r2, lr}
    mov r0, ip
#ifndef NDEBUG
    // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
    cmp r0, #0
    bne 1f
    bkpt #0
#endif
1:
    ldrh r1, [r0]                 @ scan forward for the 0xdfXX marker halfword
    lsrs r2, r1, #8
    adds r0, #2
    cmp r2, #0xdf
    bne 1b
    uxtb r1, r1 // r1 holds table offset
    lsrs r2, r0, #2               @ carry = bit 1 of r0: is the pointer word-aligned?
    bcc 1f
    // unaligned: assemble the pointer from two halfword loads
    ldrh r2, [r0, #0]
    ldrh r0, [r0, #2]
    lsls r0, #16
    orrs r0, r2
    b 2f
1:
    ldr r0, [r0]
2:
    ldr r2, =sd_table
    str r0, [r2, r1]              @ patch the table so future calls go direct
    str r0, [sp, #12]             @ overwrite saved lr: pop below jumps to the real function
    pop {r0-r2, pc}
52
53#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
54// Note that the V1 ROM has no double support, so this is basically the identical
55// library, and shim inter-function calls do not bother to redirect back thru the
56// wrapper functions
57
.equ use_hw_div,1              @ 1: use the RP2040 SIO hardware divider; 0: call __aeabi_uidiv
@ RP2040 SIO (single-cycle I/O) base address and hardware-divider register
@ offsets, used by ddiv_shim to seed the reciprocal approximation.
.equ IOPORT       ,0xd0000000
.equ DIV_UDIVIDEND,0x00000060
.equ DIV_UDIVISOR ,0x00000064
.equ DIV_QUOTIENT ,0x00000070
.equ DIV_CSR      ,0x00000078
64
65@ Notation:
66@ rx:ry means the concatenation of rx and ry with rx having the less significant bits
67
.equ debug,0                   @ set to 1 to enable the mdump trace macro
@ Debug-only trace macro: prints a marker character \k, r0-r3 and the
@ saved r14 via external output routines (osp/o1ch/dump/o8hex/onl),
@ preserving r0-r3 and r14 around the calls.  Assembles to nothing when
@ debug==0.  NOTE(review): assumes the output routines preserve the
@ registers not explicitly saved here — confirm against their definitions.
.macro mdump k
.if debug
 push {r0-r3}
 push {r14}
 push {r0-r3}
 bl osp
 movs r0,#\k
 bl o1ch
 pop {r0-r3}
 bl dump
 bl osp
 bl osp
 ldr r0,[r13]
 bl o8hex                      @ r14
 bl onl
 pop {r0}
 mov r14,r0
 pop {r0-r3}
.endif
.endm
89
90
@ IEEE double in ra:rb ->
@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set
@ exponent in re
@ sign in rs
@ trashes rt
@ Zero/denormal inputs are replaced by a fixed 1.0 mantissa with a very
@ negative scaled exponent ((0-128)<<12); Inf/NaN get a very positive one
@ ((0x7ff-128)<<12) — presumably so that downstream magnitude-based paths
@ absorb the special cases without extra tests (confirm against callers).
.macro mdunpack ra,rb,re,rs,rt
 lsrs \re,\rb,#20              @ extract sign and exponent
 subs \rs,\re,#1
 lsls \rs,#20
 subs \rb,\rs                  @ clear sign and exponent in mantissa; insert implied 1
 lsrs \rs,\re,#11              @ sign
 lsls \re,#21
 lsrs \re,#21                  @ exponent
 beq l\@_1                     @ zero exponent?
 adds \rt,\re,#1
 lsrs \rt,#11
 beq l\@_2                     @ exponent != 0x7ff? then done
l\@_1:
 movs \ra,#0                   @ special case: force mantissa to exactly 1.0 (implied 1 only)
 movs \rb,#1
 lsls \rb,#20
 subs \re,#128
 lsls \re,#12                  @ scale exponent far out of the normal range
l\@_2:
.endm
116
@ IEEE double in ra:rb ->
@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1
@ exponent in re
@ trashes rt0 and rt1
@ +zero, +denormal -> exponent=-0x80000
@ -zero, -denormal -> exponent=-0x80000
@ +Inf, +NaN -> exponent=+0x77f000
@ -Inf, -NaN -> exponent=+0x77e000
.macro mdunpacks ra,rb,re,rt0,rt1
 lsrs \re,\rb,#20              @ extract sign and exponent
 lsrs \rt1,\rb,#31             @ sign only
 subs \rt0,\re,#1
 lsls \rt0,#20
 subs \rb,\rt0                 @ clear sign and exponent in mantissa; insert implied 1
 lsls \re,#21
 bcc l\@_1                     @ skip on positive
 mvns \rb,\rb                  @ negate mantissa
 negs \ra,\ra
 bcc l\@_1                     @ 64-bit negate: high word needs +1 only when low word was zero
 adds \rb,#1
l\@_1:
 lsrs \re,#21
 beq l\@_2                     @ zero exponent?
 adds \rt0,\re,#1
 lsrs \rt0,#11
 beq l\@_3                     @ exponent != 0x7ff? then done
 subs \re,\rt1                 @ distinguish +/-Inf/NaN exponents (0x77f vs 0x77e before <<12)
l\@_2:
 movs \ra,#0
 lsls \rt1,#1                  @ +ve: 0  -ve: 2
 adds \rb,\rt1,#1              @ +ve: 1  -ve: 3
 lsls \rb,#30                  @ create +/-1 mantissa
 asrs \rb,#10
 subs \re,#128
 lsls \re,#12
l\@_3:
.endm
154
double_section WRAPPER_FUNC_NAME(__aeabi_dsub)

# drsub first because it is the only one that needs alignment
@ drsub(x, y) returns y - x (reverse subtract).
@ BUG FIX: the original popped {r0-r1} then {r2-r3}, which restores the
@ registers in their original order (POP loads the lowest register from
@ the lowest address) and so never swapped the argument pair — drsub
@ would have computed x - y.  Popping {r2-r3} first exchanges the two
@ double arguments, then we fall through into dsub to compute y - x.
regular_func drsub_shim
    push {r0-r3}
    pop {r2-r3}
    pop {r0-r1}
    // fall thru

@ dsub(x, y) returns x - y: flip the sign of the second argument and
@ continue as an addition.
regular_func dsub_shim
 push {r4-r7,r14}
 movs r4,#1
 lsls r4,#31
 eors r3,r4                    @ flip sign on second argument
 b da_entry                    @ continue in dadd
170
.align 2
double_section dadd_shim
@ double __aeabi_dadd(double x, double y)
@ Entry: x in r0:r1, y in r2:r3 (IEEE doubles); result in r0:r1.
@ Strategy: unpack both operands to signed 12Q52 mantissas (mdunpacks),
@ shift the smaller-exponent operand down to align, keeping the shifted-out
@ rounding + sticky bits in r4, add, then renormalise, round to nearest
@ (ties to even) and repack at da_pack.
regular_func dadd_shim
 push {r4-r7,r14}
da_entry:                      @ shared entry from dsub_shim (sign of y already flipped)
 mdunpacks r0,r1,r4,r6,r7
 mdunpacks r2,r3,r5,r6,r7
 subs r7,r5,r4                 @ ye-xe
 subs r6,r4,r5                 @ xe-ye
 bmi da_ygtx
@ here xe>=ye: need to shift y down r6 places
 mov r12,r4                    @ save exponent
 cmp r6,#32
 bge da_xrgty                  @ xe rather greater than ye?
 adds r7,#32
 movs r4,r2
 lsls r4,r4,r7                 @ rounding bit + sticky bits
da_xgty0:
 movs r5,r3
 lsls r5,r5,r7
 lsrs r2,r6
 asrs r3,r6
 orrs r2,r5
da_add:
 adds r0,r2
 adcs r1,r3
da_pack:
@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4
@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we
@ cannot have gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4
@ to provide a correct continuation of the exact result.
@ now pack result back up
 lsrs r3,r1,#31                @ get sign bit
 beq 1f                        @ skip on positive
 mvns r1,r1                    @ negate mantissa
 mvns r0,r0
 movs r2,#0
 negs r4,r4
 adcs r0,r2
 adcs r1,r2
1:
 mov r2,r12                    @ get exponent
 lsrs r5,r1,#21
 bne da_0                      @ shift down required?
 lsrs r5,r1,#20
 bne da_1                      @ normalised?
 cmp r0,#0
 beq da_5                      @ could mantissa be zero?
da_2:
@ normalisation loop: shift mantissa (and sticky bits) up until bit 52 set
 adds r4,r4
 adcs r0,r0
 adcs r1,r1
 subs r2,#1                    @ adjust exponent
 lsrs r5,r1,#20
 beq da_2
da_1:
 lsls r4,#1                    @ check rounding bit
 bcc da_3
da_4:
 adds r0,#1                    @ round up
 bcc 2f
 adds r1,#1
2:
 cmp r4,#0                     @ sticky bits zero?
 bne da_3
 lsrs r0,#1                    @ round to even
 lsls r0,#1
da_3:
 subs r2,#1
 bmi da_6
 adds r4,r2,#2                 @ check if exponent is overflowing
 lsrs r4,#11
 bne da_7
 lsls r2,#20                   @ pack exponent and sign
 add r1,r2
 lsls r3,#31
 add r1,r3
 pop {r4-r7,r15}

da_7:
@ here exponent overflow: return signed infinity
 lsls r1,r3,#31
 ldr r3,=0x7ff00000
 orrs r1,r3
 b 1f
da_6:
@ here exponent underflow: return signed zero
 lsls r1,r3,#31
1:
 movs r0,#0
 pop {r4-r7,r15}

da_5:
@ here mantissa could be zero
 cmp r1,#0
 bne da_2
 cmp r4,#0
 bne da_2
@ inputs must have been of identical magnitude and opposite sign, so return +0
 pop {r4-r7,r15}

da_0:
@ here a shift down by one place is required for normalisation
 adds r2,#1                    @ adjust exponent
 lsls r6,r0,#31                @ save rounding bit
 lsrs r0,#1
 lsls r5,r1,#31
 orrs r0,r5
 lsrs r1,#1
 cmp r6,#0
 beq da_3
 b da_4

da_xrgty:                      @ xe>ye and shift>=32 places
 cmp r6,#60
 bge da_xmgty                  @ xe much greater than ye?
 subs r6,#32
 adds r7,#64

 movs r4,r2
 lsls r4,r4,r7                 @ these would be shifted off the bottom of the sticky bits
 beq 1f
 movs r4,#1                    @ collapse them into a single sticky bit
1:
 lsrs r2,r2,r6
 orrs r4,r2
 movs r2,r3
 lsls r3,r3,r7
 orrs r4,r3
 asrs r3,r2,#31                @ propagate sign bit
 b da_xgty0

da_ygtx:
@ here ye>xe: need to shift x down r7 places
 mov r12,r5                    @ save exponent
 cmp r7,#32
 bge da_yrgtx                  @ ye rather greater than xe?
 adds r6,#32
 movs r4,r0
 lsls r4,r4,r6                 @ rounding bit + sticky bits
da_ygtx0:
 movs r5,r1
 lsls r5,r5,r6
 lsrs r0,r7
 asrs r1,r7
 orrs r0,r5
 b da_add

da_yrgtx:
 cmp r7,#60
 bge da_ymgtx                  @ ye much greater than xe?
 subs r7,#32
 adds r6,#64

 movs r4,r0
 lsls r4,r4,r6                 @ these would be shifted off the bottom of the sticky bits
 beq 1f
 movs r4,#1                    @ collapse them into a single sticky bit
1:
 lsrs r0,r0,r7
 orrs r4,r0
 movs r0,r1
 lsls r1,r1,r6
 orrs r4,r1
 asrs r1,r0,#31                @ propagate sign bit
 b da_ygtx0

da_ymgtx:                      @ result is just y
 movs r0,r2
 movs r1,r3
da_xmgty:                      @ result is just x
 movs r4,#0                    @ clear sticky bits
 b da_pack

.ltorg
346
@ equivalent of UMULL
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
@ Schoolbook 16x16->32 partial products (Cortex-M0 has no UMULL):
@ result = L + ((M0+M1)<<16) + (H<<32); a carry out of M0+M1 is folded
@ into H by adding 0x10000.
.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
                               @   t0   t1   t2   t3   t4
                               @                  (x)  (y)
 uxth \rt0,\rx                 @   xl
 uxth \rt1,\ry                 @        yl
 muls \rt0,\rt1                @  xlyl=L
 lsrs \rt2,\rx,#16             @             xh
 muls \rt1,\rt2                @       xhyl=M0
 lsrs \rt4,\ry,#16             @                       yh
 muls \rt2,\rt4                @           xhyh=H
 uxth \rt3,\rx                 @                   xl
 muls \rt3,\rt4                @                  xlyh=M1
 adds \rt1,\rt3                @      M0+M1=M
 bcc l\@_1                     @ addition of the two cross terms can overflow, so add carry into H
 movs \rt3,#1                  @                   1
 lsls \rt3,#16                 @                0x10000
 adds \rt2,\rt3                @             H'
l\@_1:
                               @   t0   t1   t2   t3   t4
                               @                 (zl) (zh)
 lsls \rzl,\rt1,#16            @                  ML
 lsrs \rzh,\rt1,#16            @                       MH
 adds \rzl,\rt0                @                  ZL
 adcs \rzh,\rt2                @                       ZH
.endm
379
@ SUMULL: x signed, y unsigned
@ in table below ¯ means signed variable
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
@ As mul32_32_64 but the xh*yl cross term is signed, so the cross-term
@ sum M is sign-extended (Msx, with the add's carry folded in) before
@ being shifted up and added into H.
.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
                               @   t0   t1   t2   t3   t4
                               @                 ¯(x)  (y)
 uxth \rt0,\rx                 @   xl
 uxth \rt1,\ry                 @        yl
 muls \rt0,\rt1                @  xlyl=L
 asrs \rt2,\rx,#16             @            ¯xh
 muls \rt1,\rt2                @      ¯xhyl=M0
 lsrs \rt4,\ry,#16             @                       yh
 muls \rt2,\rt4                @          ¯xhyh=H
 uxth \rt3,\rx                 @                   xl
 muls \rt3,\rt4                @                 xlyh=M1
 asrs \rt4,\rt1,#31            @                      M0sx   (M1 sign extension is zero)
 adds \rt1,\rt3                @      M0+M1=M
 movs \rt3,#0                  @                    0
 adcs \rt4,\rt3                @                      ¯Msx
 lsls \rt4,#16                 @                    ¯Msx<<16
 adds \rt2,\rt4                @             H'

                               @   t0   t1   t2   t3   t4
                               @                 (zl) (zh)
 lsls \rzl,\rt1,#16            @                  M~
 lsrs \rzh,\rt1,#16            @                       M~
 adds \rzl,\rt0                @                  ZL
 adcs \rzh,\rt2                @                      ¯ZH
.endm
414
@ SSMULL: x signed, y signed
@ in table below ¯ means signed variable
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
@ Both cross terms are signed here; if their sum overflows (V set after
@ the adds) the asrs-computed sign extension is wrong, so it is inverted.
.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
                               @   t0   t1   t2   t3   t4
                               @                 ¯(x)  (y)
 uxth \rt0,\rx                 @   xl
 uxth \rt1,\ry                 @        yl
 muls \rt0,\rt1                @  xlyl=L
 asrs \rt2,\rx,#16             @            ¯xh
 muls \rt1,\rt2                @      ¯xhyl=M0
 asrs \rt4,\ry,#16             @                      ¯yh
 muls \rt2,\rt4                @          ¯xhyh=H
 uxth \rt3,\rx                 @                   xl
 muls \rt3,\rt4                @                ¯xlyh=M1
 adds \rt1,\rt3                @     ¯M0+M1=M
 asrs \rt3,\rt1,#31            @                  Msx
 bvc l\@_1                     @
 mvns \rt3,\rt3                @                 ¯Msx        flip sign extension bits if overflow
l\@_1:
 lsls \rt3,#16                 @                    ¯Msx<<16
 adds \rt2,\rt3                @             H'

                               @   t0   t1   t2   t3   t4
                               @                 (zl) (zh)
 lsls \rzl,\rt1,#16            @                  M~
 lsrs \rzh,\rt1,#16            @                       M~
 adds \rzl,\rt0                @                  ZL
 adcs \rzh,\rt2                @                      ¯ZH
.endm
450
@ can have rt2==rx, in which case rx trashed
@ can have rzl==rx
@ can have rzh==rt1
@ Unsigned 32x32->64 square: only three partial products are needed since
@ the two cross terms are equal; the doubled cross term M is formed by
@ shifting left 17 (low half) / right 15 (high half) instead of 16.
.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2
                               @   t0   t1   t2   zl   zh
 uxth \rt0,\rx                 @   xl
 muls \rt0,\rt0                @ xlxl=L
 uxth \rt1,\rx                 @        xl
 lsrs \rt2,\rx,#16             @             xh
 muls \rt1,\rt2                @      xlxh=M
 muls \rt2,\rt2                @           xhxh=H
 lsls \rzl,\rt1,#17            @                  ML
 lsrs \rzh,\rt1,#15            @                       MH
 adds \rzl,\rt0                @                  ZL
 adcs \rzh,\rt2                @                       ZH
.endm
467
double_section dmul_shim
@ double __aeabi_dmul(double x, double y)
@ Entry: x in r0:r1, y in r2:r3; result in r0:r1.
@ Forms the full 106-bit product from four 32x32->64 partial products,
@ normalises to [1,2), rounds to nearest (ties to even) and repacks.
 regular_func dmul_shim
 push {r4-r7,r14}
 mdunpack r0,r1,r4,r6,r5
 mov r12,r4
 mdunpack r2,r3,r4,r7,r5
 eors r7,r6                    @ sign of result
 add r4,r12                    @ exponent of result
 push {r0-r2,r4,r7}

@ accumulate full product in r12:r5:r6:r7
 mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5    @ XL*YL
 mov r12,r0                    @ save LL bits

 mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7    @ XH*YH

 pop {r0}                      @ XL
 mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3    @ XL*YH
 adds r5,r0
 adcs r6,r3
 movs r0,#0
 adcs r7,r0

 pop {r1,r2}                   @ XH,YL
 mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2   @ XH*YL
 adds r5,r1
 adcs r6,r2
 movs r0,#0
 adcs r7,r0

@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12
 pop {r3,r4}                   @ exponent in r3, sign in r4
 lsls r1,r7,#11
 lsrs r2,r6,#21
 orrs r1,r2
 lsls r0,r6,#11
 lsrs r2,r5,#21
 orrs r0,r2
 lsls r5,#11                   @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12
 lsrs r2,r1,#20
 bne 1f                        @ skip if in range [2..4)
 adds r5,r5                    @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)
 adcs r0,r0
 adcs r1,r1
 subs r3,#1                    @ correct exponent
1:
 ldr r6,=0x3ff
 subs r3,r6                    @ correct for exponent bias
 lsls r6,#1                    @ 0x7fe
 cmp r3,r6
 bhs dm_0                      @ exponent over- or underflow
 lsls r5,#1                    @ rounding bit to carry
 bcc 1f                        @ result is correctly rounded
 adds r0,#1
 movs r6,#0
 adcs r1,r6                    @ round up
 mov r6,r12                    @ remaining sticky bits
 orrs r5,r6
 bne 1f                        @ some sticky bits set?
 lsrs r0,#1
 lsls r0,#1                    @ round to even
1:
 lsls r3,#20
 adds r1,r3
dm_2:
 lsls r4,#31
 add r1,r4                     @ pack sign
 pop {r4-r7,r15}

@ here for exponent over- or underflow
dm_0:
 bge dm_1                      @ overflow?
 adds r3,#1                    @ would-be zero exponent?
 bne 1f
 adds r0,#1
 bne 1f                        @ all-ones mantissa?
 adds r1,#1
 lsrs r7,r1,#21
 beq 1f
 lsrs r1,#1
 b dm_2
1:
 lsls r1,r4,#31                @ underflow: return signed zero
 movs r0,#0
 pop {r4-r7,r15}

@ here for exponent overflow
dm_1:
 adds r6,#1                    @ 0x7ff
 lsls r1,r6,#20                @ return infinity (sign packed at dm_2)
 movs r0,#0
 b dm_2

.ltorg
562
563@ Approach to division y/x is as follows.
564@
565@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top
566@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).
567@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction
568@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact
569@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to
570@ round up or down.
571@
572@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively
573@ that | u1*x-1 | < 10*2^-32.
574@
575@ More precisely:
576@
577@ x0=(q16)x;
578@ x1=(q30)x;
579@ y0=(q31)y;
580@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation
581@ v=(q30)(u0*x1-1);
582@ u1=(q30)u0-(q30~)(u0*v);
583@
584@ a0=(q30)(u1*y0);
585@ r0=(q82)y-a0*x;
586@ r0x=(q57)r0;
587@ d0=r0x*u1;
588@ a1=d0+a0;
589@
590@ Error analysis
591@
592@ Use Greek letters to represent the errors introduced by rounding and truncation.
593@
594@               r₀ = y - a₀x
595@                  = y - [ u₁ ( y - α ) - β ] x    where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30
596@                  = y ( 1 - u₁x ) + ( u₁α + β ) x
597@
598@     Hence
599@
600@       | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30
601@                  = 26*2^-32
602@
603@               r₁ = y - a₁x
604@                  = y - a₀x - d₀x
605@                  = r₀ - d₀x
606@                  = r₀ - u₁ ( r₀ - γ ) x    where 0 ≤ γ < 2^-57
607@                  = r₀ ( 1 - u₁x ) + u₁γx
608@
609@     Hence
610@
611@       | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57
612@                  = (260+128)*2^-64
613@                  < 2^-55
614@
615@ Empirically it seems to be nearly twice as good as this.
616@
617@ To determine correctly whether the exact remainder calculation can be skipped we need a result
618@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation
619@ and so 1ulp is 2^-53 and so the calculation above suffices.
620
double_section ddiv_shim
@ double __aeabi_ddiv(double y, double x) — computes y/x.
@ Entry: dividend y in r0:r1, divisor x in r2:r3; result in r0:r1.
@ See the commentary above: a ~30-bit reciprocal u1 of x is built from a
@ hardware-divider (or __aeabi_uidiv) seed plus one Newton-Raphson step,
@ a candidate quotient a1 = a0 + r0x*u1 is formed, and an exact remainder
@ computation decides the final rounding when near a boundary.
 regular_func ddiv_shim
 push {r4-r7,r14}
ddiv0:                         @ entry point from dtan
 mdunpack r2,r3,r4,r7,r6       @ unpack divisor

.if use_hw_div

 movs r5,#IOPORT>>24
 lsls r5,#24                   @ r5 = SIO base
 movs r6,#0
 mvns r6,r6                    @ dividend 0xffffffff ~ 1.0, for the reciprocal seed
 str r6,[r5,#DIV_UDIVIDEND]
 lsrs r6,r3,#4                 @ x0=(q16)x
 str r6,[r5,#DIV_UDIVISOR]
@ if there are not enough cycles from now to the read of the quotient for
@ the divider to do its stuff we need a busy-wait here

.endif

@ unpack dividend by hand to save on register use
 lsrs r6,r1,#31
 adds r6,r7
 mov r12,r6                    @ result sign in r12b0; r12b1 trashed
 lsls r1,#1
 lsrs r7,r1,#21                @ exponent
 beq 1f                        @ zero exponent?
 adds r6,r7,#1
 lsrs r6,#11
 beq 2f                        @ exponent != 0x7ff? then done
1:
 movs r0,#0
 movs r1,#0
 subs r7,#64                   @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct
 lsls r7,#12
2:
 subs r6,r7,r4
 lsls r6,#2
 add r12,r12,r6                @ (signed) exponent in r12[31..8]
 subs r7,#1                    @ implied 1
 lsls r7,#21
 subs r1,r7
 lsrs r1,#1

.if use_hw_div

 ldr r6,[r5,#DIV_QUOTIENT]     @ ~2^32/x0
 adds r6,#1
 lsrs r6,#1                    @ round to Q15

.else

@ this is not beautiful; could be replaced by better code that uses knowledge of divisor range
 push {r0-r3}
 movs r0,#0
 mvns r0,r0
 lsrs r1,r3,#4                 @ x0=(q16)x
 bl __aeabi_uidiv              @ !!! this could (but apparently does not) trash R12
 adds r6,r0,#1
 lsrs r6,#1
 pop {r0-r3}

.endif

@ here
@ r0:r1 y mantissa
@ r2:r3 x mantissa
@ r6    u0, first approximation to 1/x Q15
@ r12: result sign, exponent

@ one Newton-Raphson step: u1 = u0 - u0*(u0*x1 - 1)
 lsls r4,r3,#10
 lsrs r5,r2,#22
 orrs r5,r4                    @ x1=(q30)x
 muls r5,r6                    @ u0*x1 Q45
 asrs r5,#15                   @ v=u0*x1-1 Q30
 muls r5,r6                    @ u0*v Q45
 asrs r5,#14
 adds r5,#1
 asrs r5,#1                    @ round u0*v to Q30
 lsls r6,#15
 subs r6,r5                    @ u1 Q30

@ here
@ r0:r1 y mantissa
@ r2:r3 x mantissa
@ r6    u1, second approximation to 1/x Q30
@ r12: result sign, exponent

 push {r2,r3}
 lsls r4,r1,#11
 lsrs r5,r0,#21
 orrs r4,r5                    @ y0=(q31)y
 mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5  @ y0*u1 Q61
 adds r4,r4
 adcs r5,r5                    @ a0=(q30)(y0*u1)

@ here
@ r0:r1 y mantissa
@ r5    a0, first approximation to y/x Q30
@ r6    u1, second approximation to 1/x Q30
@ r12   result sign, exponent

 ldr r2,[r13,#0]               @ xL
 mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3  @ xL*a0
 ldr r4,[r13,#4]               @ xH
 muls r4,r5                    @ xH*a0
 adds r3,r4                    @ r2:r3 now x*a0 Q82
 lsrs r2,#25
 lsls r1,r3,#7
 orrs r2,r1                    @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89
 lsls r4,r0,#5                 @ y Q57
 subs r0,r4,r2                 @ r0x=y-x*a0 Q57 (signed)

@ here
@ r0  r0x Q57
@ r5  a0, first approximation to y/x Q30
@ r4  yL  Q57
@ r6  u1 Q30
@ r12 result sign, exponent

 muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6   @ r7:r6 r0x*u1 Q87
 asrs r3,r6,#25
 adds r5,r3
 lsls r3,r6,#7                 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)
@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7
@ but these bits are thrown away in the rounding and conversion to Q52 below

@ here
@ r3:r5  a1 Q62 candidate quotient [0.5,2) or so
@ r4     yL Q57
@ r12    result sign, exponent

 movs r6,#0
 adds r3,#128                  @ for initial rounding to Q53
 adcs r5,r5,r6
 lsrs  r1,r5,#30
 bne dd_0
@ here candidate quotient a1 is in range [0.5,1)
@ so 30 significant bits in r5

 lsls r4,#1                    @ y now Q58
 lsrs r1,r5,#9                 @ to Q52
 lsls r0,r5,#23
 lsrs r3,#9                    @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
 orrs r0,r3
 bcs dd_1
 b dd_2
dd_0:
@ here candidate quotient a1 is in range [1,2)
@ so 31 significant bits in r5

 movs r2,#4
 add r12,r12,r2                @ fix exponent; r3:r5 now effectively Q61
 adds r3,#128                  @ complete rounding to Q53
 adcs r5,r5,r6
 lsrs r1,r5,#10
 lsls r0,r5,#22
 lsrs r3,#10                   @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
 orrs r0,r3
 bcc dd_2
dd_1:

@ here
@ r0:r1  rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest
@ r4     yL Q58 or Q57
@ r12    result sign, exponent
@ carry set

@ exact-remainder check: compute sign of y - x*z to decide round up/down
 adcs r0,r0,r0
 adcs r1,r1,r1                 @ z Q53 with 1 in LSB
 lsls r4,#16                   @ Q105-32=Q73
 ldr r2,[r13,#0]               @ xL Q52
 ldr r3,[r13,#4]               @ xH Q20

 movs r5,r1                    @ zH Q21
 muls r5,r2                    @ zH*xL Q73
 subs r4,r5
 muls r3,r0                    @ zL*xH Q73
 subs r4,r3
 mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3  @ xL*zL
 negs r2,r2                    @ borrow from low half?
 sbcs r4,r3                    @ y-xz Q73 (remainder bits 52..73)

 cmp r4,#0

 bmi 1f
 movs r2,#0                    @ round up
 adds r0,#1
 adcs r1,r2
1:
 lsrs r0,#1                    @ shift back down to Q52
 lsls r2,r1,#31
 orrs r0,r2
 lsrs r1,#1
dd_2:
 add r13,#8                    @ discard saved divisor mantissa
 mov r2,r12
 lsls r7,r2,#31                @ result sign
 asrs r2,#2                    @ result exponent
 ldr r3,=0x3fd
 adds r2,r3
 ldr r3,=0x7fe
 cmp r2,r3
 bhs dd_3                      @ over- or underflow?
 lsls r2,#20
 adds r1,r2                    @ pack exponent
dd_5:
 adds r1,r7                    @ pack sign
 pop {r4-r7,r15}

dd_3:
@ here exponent out of range: return signed zero or infinity
 movs r0,#0
 cmp r2,#0
 bgt dd_4                      @ overflow?
 movs r1,r7                    @ underflow: signed zero
 pop {r4-r7,r15}

dd_4:
 adds r3,#1                    @ 0x7ff
 lsls r1,r3,#20                @ infinity; sign packed at dd_5
 b dd_5
842
@ Use the file's double_section macro (as every other shim here does) so
@ the section gets the "ax" (alloc+exec) flags and honours the
@ PICO_DOUBLE_IN_RAM placement flag; the raw .section directive supplied
@ neither.
double_section dsqrt_shim
844/*
845Approach to square root x=sqrt(y) is as follows.
846
847First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y
848to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because
849d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is
850a better approximation to sqrt(y). If this is near a rounding boundary we
851compute an exact remainder y-a5*a5 to decide whether to round up or down.
852
853The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively
854that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.
855
856More precisely, with "y" representing y truncated to 30 binary places:
857
858u=(q3)y;                          // 24-entry table
859a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u);  // first approximation from table
860p0=(q16)(a0*a0) * (q16)y;
861r0=(q20)(p0-1);
862dy0=(q15)(r0*a0);                 // Newton-Raphson correction term
863a1=(q16)a0-dy0/2;                 // good to ~9 bits
864
865p1=(q19)(a1*a1)*(q19)y;
866r1=(q23)(p1-1);
867dy1=(q15~)(r1*a1);                // second Newton-Raphson correction
868a2x=(q16)a1-dy1/2;                // good to ~16 bits
869a2=a2x-a2x/1t16;                  // prevent overflow of a2*a2 in 32 bits
870
871p2=(a2*a2)*(q30)y;                // Q62
872r2=(q36)(p2-1+1t-31);
873dy2=(q30)(r2*a2);                 // Q52->Q30
874a3=(q31)a2-dy2/2;                 // good to about 30 bits
875a4=(q30)(a3*(q30)y+1t-31);        // good to about 28 bits
876
877Error analysis
878
879          r₄ = y - a₄²
880          d₄ = 1/2 a₃r₄
881          a₅ = a₄ + d₄
882          r₅ = y - a₅²
883             = y - ( a₄ + d₄ )²
884             = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄²
885             = r₄ - a₃a₄r₄ - 1/4 a₃²r₄²
886
887      | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄²
888
889          a₅ = √y √( 1 - r₅/y )
890             = √y ( 1 - 1/2 r₅/y + ... )
891
892So to first order (second order being very tiny)
893
894     √y - a₅ = 1/2 r₅/y
895
896and
897
898 | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y )
899
900From dsqrttest.c (conservatively):
901
902             < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )
903             = 1/2 ( 200 + 200 ) * 2^-64
904             < 2^-56
905
906Empirically we see about 1ulp worst-case error including rounding at Q57.
907
908To determine correctly whether the exact remainder calculation can be skipped we need a result
909accurate to < 0.25ulp at Q52, or 2^-54.
910*/
911
@ Out-of-line special cases for dsqrt_shim; flags are still those of the
@ cmp/lsls tests at the function head.
dq_2:
@ here exponent-1 was unsigned >= 0x7fe: input is +0/+denormal or +Inf/+NaN
 bge dq_3                      @ +Inf?
 movs r1,#0                    @ +0/+denormal: return +0
 b dq_4

dq_0:
@ here the input was negative
 lsrs r1,#31
 lsls r1,#31                   @ preserve sign bit
 lsrs r2,#21                   @ extract exponent
 beq dq_4                      @ -0/-denormal? return -0
 asrs r1,#11                   @ make -Inf (0xfff00000:00000000)
 b dq_4

dq_3:
 ldr r1,=0x7ff
 lsls r1,#20                   @ return +Inf
dq_4:
 movs r0,#0
dq_1:
 bx r14
932
.align 2
@ double sqrt(double y)
@ Entry: y in r0:r1; result in r0:r1.
@ Special cases are handled out of line: negative inputs at dq_0,
@ +0/+denormal and +Inf/+NaN at dq_2 (above).  Otherwise: table seed +
@ three Newton-Raphson refinements give a3 ~ 1/sqrt(y), a4 = a3*y ~
@ sqrt(y), then a correction d4 from the remainder; near a rounding
@ boundary dq_5 computes an exact remainder to round correctly.
regular_func dsqrt_shim
 lsls r2,r1,#1
 bcs dq_0                      @ negative?
 lsrs r2,#21                   @ extract exponent
 subs r2,#1
 ldr r3,=0x7fe
 cmp r2,r3
 bhs dq_2                      @ catches 0 and +Inf
 push {r4-r7,r14}
 lsls r4,r2,#20
 subs r1,r4                    @ insert implied 1
 lsrs r2,#1
 bcc 1f                        @ even exponent? skip
 adds r0,r0,r0                 @ odd exponent: shift up mantissa
 adcs r1,r1,r1
1:
 lsrs r3,#2
 adds r2,r3
 lsls r2,#20
 mov r12,r2                    @ save result exponent

@ here
@ r0:r1  y mantissa Q52 [1,4)
@ r12    result exponent
.equ drsqrtapp_minus_8, (drsqrtapp-8)
 adr r4,drsqrtapp_minus_8      @ first eight table entries are never accessed because of the mantissa's leading 1
 lsrs r2,r1,#17                @ y Q3
 ldrb r2,[r4,r2]               @ initial approximation to reciprocal square root a0 Q8
 lsrs r3,r1,#4                 @ first Newton-Raphson iteration
 muls r3,r2
 muls r3,r2                    @  i32 p0=a0*a0*(y>>14);          // Q32
 asrs r3,r3,#12                @  i32 r0=p0>>12;                 // Q20
 muls r3,r2
 asrs r3,#13                   @  i32 dy0=(r0*a0)>>13;           // Q15
 lsls r2,#8
 subs r2,r3                    @  i32 a1=(a0<<8)-dy0;         // Q16

@ second Newton-Raphson iteration
 movs r3,r2
 muls r3,r3
 lsrs r3,#13
 lsrs r4,r1,#1
 muls r3,r4                    @  i32 p1=((a1*a1)>>11)*(y>>11);  // Q19*Q19=Q38
 asrs r3,#15                   @  i32 r1=p1>>15;                 // Q23
 muls r3,r2
 asrs r3,#23
 adds r3,#1
 asrs r3,#1                    @  i32 dy1=(r1*a1+(1<<23))>>24;   // Q23*Q16=Q39; Q15
 subs r2,r3                    @  i32 a2=a1-dy1;                 // Q16
 lsrs r3,r2,#16
 subs r2,r3                    @  if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2

@ here
@ r0:r1 y mantissa
@ r2    a2 ~ 1/sqrt(y) Q16
@ r12   result exponent

@ third Newton-Raphson iteration
 movs r3,r2
 muls r3,r3
 lsls r1,#10
 lsrs r4,r0,#22
 orrs r1,r4                    @ y Q30
 mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3   @  i64 p2=(ui64)(a2*a2)*(ui64)y;  // Q62 r4:r3
 lsls r5,r3,#6
 lsrs r4,#26
 orrs r4,r5
 adds r4,#0x20                 @  i32 r2=(p2>>26)+0x20;          // Q36 r4
 uxth r5,r4
 muls r5,r2
 asrs r4,#16
 muls r4,r2
 lsrs r5,#16
 adds r4,r5
 asrs r4,#6                    @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30
 lsls r2,#15
 subs r2,r4

@ here
@ r0    y low bits
@ r1    y Q30
@ r2    a3 ~ 1/sqrt(y) Q31
@ r12   result exponent

 mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4
 adds r3,r3,r3
 adcs r4,r4,r4
 adds r3,r3,r3
 movs r3,#0
 adcs r3,r4                    @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30

@ here
@ r0    y low bits
@ r1    y Q30
@ r2    a3 Q31 ~ 1/sqrt(y)
@ r3    a4 Q30 ~ sqrt(y)
@ r12   result exponent

 square32_64 r3, r4,r5, r6,r5,r7
 lsls r6,r0,#8
 lsrs r7,r1,#2
 subs r6,r4
 sbcs r7,r5                    @ r4=(q60)y-a4*a4

@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60

 lsls r5,r7,#29
 lsrs r6,#3
 adcs r6,r5                    @ r4 Q57 with rounding
 muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2    @ d4=a3*r4/2 Q89
@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30

 adds r2,#8
 asrs r2,#5                    @ d4 Q52, rounded to Q53 with spare bit in carry

@ here
@ r0    y low bits
@ r1    y Q30
@ r2    d4 Q52, rounded to Q53
@ C flag contains d4_b53
@ r3    a4 Q30

 bcs dq_5                      @ near rounding boundary? take the exact-remainder path

 lsrs r5,r3,#10                @ a4 Q52
 lsls r4,r3,#22

 asrs r1,r2,#31                @ sign-extend d4
 adds r0,r2,r4
 adcs r1,r5                    @ a4+d4

 add r1,r12                    @ pack exponent
 pop {r4-r7,r15}
1065
1066.ltorg
1067
1068
@ round(sqrt(2^22./[68:8:252]))
@ 24-entry table of 8-bit seed values for the 1/sqrt(y) approximation;
@ presumably indexed by the top mantissa bits — the lookup site is earlier
@ in the file and not visible here (NOTE(review): confirm indexing)
drsqrtapp:
.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8
.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95
.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81
1074
dq_5:
@ here we are near a rounding boundary, C is set
@ sqrt slow path: recompute a5=a4+d4 at Q53+1ulp and compare a5^2 against
@ y exactly to decide the final rounding direction before packing
 adcs r2,r2,r2                 @ d4 Q53+1ulp
 lsrs r5,r3,#9
 lsls r4,r3,#23                @ r4:r5 a4 Q53
 asrs r1,r2,#31
 adds r4,r2,r4
 adcs r5,r1                    @ r4:r5 a5=a4+d4 Q53+1ulp
 movs r3,r5
 muls r3,r4
 square32_64 r4,r1,r2,r6,r2,r7
 adds r2,r3
 adds r2,r3                    @ r1:r2 a5^2 Q106
 lsls r0,#22                   @ y Q84

 negs r1,r1
 sbcs r0,r2                    @ remainder y-a5^2
 bmi 1f                        @ y<a5^2: no need to increment a5
 movs r3,#0
 adds r4,#1
 adcs r5,r3                    @ bump a5 if over rounding boundary
1:
 lsrs r0,r4,#1                 @ shift a5 down to final mantissa position
 lsrs r1,r5,#1
 lsls r5,#31
 orrs r0,r5
 add r1,r12                    @ pack result exponent
 pop {r4-r7,r15}
1103
1104@ "scientific" functions start here
1105
1106@ double-length CORDIC rotation step
1107
1108@ r0:r1   ω
1109@ r6      32-i (complementary shift)
1110@ r7      i (shift)
1111@ r8:r9   x
1112@ r10:r11 y
1113@ r12     coefficient pointer
1114
1115@ an option in rotation mode would be to compute the sequence of σ values
1116@ in one pass, rotate the initial vector by the residual ω and then run a
1117@ second pass to compute the final x and y. This would relieve pressure
1118@ on registers and hence possibly be faster. The same trick does not work
1119@ in vectoring mode (but perhaps one could work to single precision in
1120@ a first pass and then double precision in a second pass?).
1121
double_section dcordic_vec_step
@ one double-length CORDIC vectoring step (register map documented above):
@ fetch dω=atan 2^-i from the coefficient table at r12 (advancing the
@ pointer), then dispatch on the sign of y (high word r11) into the shared
@ step bodies inside dcordic_rot_step; the local-label branches 1f/2f
@ resolve into the next section, so the two sections must stay contiguous
 regular_func dcordic_vec_step
 mov r2,r12
 ldmia r2!,{r3,r4}             @ r3:r4 = dω; post-increment coefficient pointer
 mov r12,r2
 mov r2,r11
 cmp r2,#0                     @ sign of y selects rotation direction
 blt 1f                        @ y<0: ω-=dω, x-=y>>i, y+=x>>i
 b 2f                         @ y>=0: ω+=dω, x+=y>>i, y-=x>>i
1131
double_section dcordic_rot_step
@ one double-length CORDIC rotation step (register map documented above):
@ fetch dω=atan 2^-i from the coefficient table at r12 (advancing the
@ pointer), then rotate (x,y) by ±2^-i according to the sign of the
@ residual angle ω in r0:r1; the shift sequences deliberately leave a
@ rounding bit in the carry flag that feeds the following adcs/sbcs
 regular_func dcordic_rot_step
 mov r2,r12
 ldmia r2!,{r3,r4}             @ r3:r4 = dω; post-increment coefficient pointer
 mov r12,r2
 cmp r1,#0                     @ sign of ω selects rotation direction
 bge 1f
2:
@ ω<0 / y>=0
@ ω+=dω
@ x+=y>>i, y-=x>>i
 adds r0,r3
 adcs r1,r4

 mov r3,r11
 asrs r3,r7
 mov r4,r11
 lsls r4,r6
 mov r2,r10
 lsrs r2,r7
 orrs r2,r4                    @ r2:r3 y>>i, rounding in carry
 mov r4,r8
 mov r5,r9                     @ r4:r5 x
 adcs r2,r4
 adcs r3,r5                    @ r2:r3 x+(y>>i)
 mov r8,r2
 mov r9,r3

 mov r3,r5
 lsls r3,r6
 asrs r5,r7
 lsrs r4,r7
 orrs r4,r3                    @ r4:r5 x>>i, rounding in carry
 mov r2,r10
 mov r3,r11
 sbcs r2,r4
 sbcs r3,r5                    @ r2:r3 y-(x>>i)
 mov r10,r2
 mov r11,r3
 bx r14


@ ω>0 / y<0
@ ω-=dω
@ x-=y>>i, y+=x>>i
1:
 subs r0,r3
 sbcs r1,r4

 mov r3,r9
 asrs r3,r7
 mov r4,r9
 lsls r4,r6
 mov r2,r8
 lsrs r2,r7
 orrs r2,r4                    @ r2:r3 x>>i, rounding in carry
 mov r4,r10
 mov r5,r11                    @ r4:r5 y
 adcs r2,r4
 adcs r3,r5                    @ r2:r3 y+(x>>i)
 mov r10,r2
 mov r11,r3

 mov r3,r5
 lsls r3,r6
 asrs r5,r7
 lsrs r4,r7
 orrs r4,r3                    @ r4:r5 y>>i, rounding in carry
 mov r2,r8
 mov r3,r9
 sbcs r2,r4
 sbcs r3,r5                    @ r2:r3 x-(y>>i)
 mov r8,r2
 mov r9,r3
 bx r14
1207
1208@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf
1209@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works
1210
double_section double2int_shim
@ double -> int32 (round towards -Inf): implemented as a 64-bit fixed-point
@ conversion with 32 extra fraction bits, keeping only the high word
 regular_func double2int_shim
 movs r2,#0                    @ and fall through
regular_func double2fix_shim
@ double -> signed 32-bit fixed point, r2 places after the point
 push {r14}
 adds r2,#32                   @ 32 extra fraction bits; wanted result lands in high word
 bl double2fix64_shim
 movs r0,r1                    @ keep high word of the 64-bit result
 pop {r15}
1220
double_section double2uint_shim
@ double -> uint32 (round towards -Inf): implemented as a 64-bit fixed-point
@ conversion with 32 extra fraction bits, keeping only the high word
 regular_func double2uint_shim
 movs r2,#0                    @ and fall through
regular_func double2ufix_shim
@ double -> unsigned 32-bit fixed point, r2 places after the point
 push {r14}
 adds r2,#32                   @ 32 extra fraction bits; wanted result lands in high word
 bl double2ufix64_shim
 movs r0,r1                    @ keep high word of the 64-bit result
 pop {r15}
1230
double_section double2int64_shim
@ double -> int64 (round towards -Inf); falls through to the general
@ signed 64-bit fixed-point conversion with 0 places after the point
 regular_func double2int64_shim
 movs r2,#0                    @ and fall through
regular_func double2fix64_shim
@ double -> signed 64-bit fixed point in r0:r1, r2 places after the point;
@ out-of-range values are clamped to the extreme representable values
 push {r14}
 bl d2fix

 asrs r2,r1,#31
 cmp r2,r3
 bne 1f                        @ sign extension bits fail to match sign of result?
 pop {r15}
1:
 mvns r0,r3
 movs r1,#1
 lsls r1,#31
 eors r1,r1,r0                 @ generate extreme fixed-point values
 pop {r15}
1248
double_section double2uint64_shim
@ double -> uint64 (r2=0) / unsigned 64-bit fixed point (r2 places after
@ the point), rounding towards -Inf; negative inputs return 0
 regular_func double2uint64_shim
 movs r2,#0                    @ and fall through
regular_func double2ufix64_shim
 asrs r3,r1,#20                @ negative? return 0
 bmi ret_dzero
@ and fall through
1256
@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
d2fix:
 push {r4,r14}
 mov r12,r2                    @ save requested binary point position for d2fix_a
 bl dunpacks
 asrs r4,r2,#16
 adds r4,#1
 bge 1f                        @ skip unless exponent is hugely negative
 movs r1,#0                    @ -0 -> +0
1:
 asrs r3,r1,#31                @ r3 = mantissa sign extension bits
 ldr r4, =d2fix_a
 bx r4                         @ tail call the shared (weak) shift/clamp helper
1273
ret_dzero:
@ shared exit path: return the value +0 (r0:r1 = 0:0) to the caller
 movs r1,#0
 movs r0,#0
 bx r14
1278
.weak d2fix_a // weak because it exists in float code too
.thumb_func
d2fix_a:
@ here
@ r0:r1 two's complement mantissa
@ r2    unbiased exponent
@ r3    mantissa sign extension bits
@ r12   requested binary point position (set by d2fix)
@ shifts the mantissa to the requested fixed-point position, clamping to
@ the extreme values on overflow and collapsing to the sign extension
@ bits when the result is vanishingly small
 add r2,r12                    @ exponent plus offset for required binary point position
 subs r2,#52                   @ required shift
 bmi 1f                        @ shift down?
@ here a shift up by r2 places
 cmp r2,#12                    @ will clamp?
 bge 2f
 movs r4,r0
 lsls r1,r2
 lsls r0,r2
 negs r2,r2
 adds r2,#32                   @ complementary shift
 lsrs r4,r2
 orrs r1,r4
 pop {r4,r15}
2:
 mvns r0,r3
 mvns r1,r3                    @ overflow: clamp to extreme fixed-point values
 pop {r4,r15}
1:
@ here a shift down by -r2 places
 adds r2,#32
 bmi 1f                        @ long shift?
 mov r4,r1
 lsls r4,r2
 negs r2,r2
 adds r2,#32                   @ complementary shift
 asrs r1,r2
 lsrs r0,r2
 orrs r0,r4
 pop {r4,r15}
1:
@ here a long shift down
 movs r0,r1
 asrs r1,#31                   @ shift down 32 places
 adds r2,#32
 bmi 1f                        @ very long shift?
 negs r2,r2
 adds r2,#32
 asrs r0,r2
 pop {r4,r15}
1:
 movs r0,r3                    @ result very near zero: use sign extension bits
 movs r1,r3
 pop {r4,r15}
1330
double_section double2float_shim
 regular_func double2float_shim
@ convert packed double in r0:r1 to packed float in r0, round-to-nearest-
@ even; overflow saturates to signed infinity, underflow flushes to signed
@ zero (except values just below the smallest normal, which round up to it;
@ NOTE(review): denormal results are not generated by this path)
 lsls r2,r1,#1
 lsrs r2,#21                   @ exponent
 ldr r3,=0x3ff-0x7f
 subs r2,r3                    @ fix exponent bias
 ble 1f                        @ underflow or zero
 cmp r2,#0xff
 bge 2f                        @ overflow or infinity
 lsls r2,#23                   @ position exponent of result
 lsrs r3,r1,#31
 lsls r3,#31
 orrs r2,r3                    @ insert sign
 lsls r3,r0,#3                 @ rounding bits
 lsrs r0,#29
 lsls r1,#12
 lsrs r1,#9
 orrs r0,r1                    @ assemble mantissa
 orrs r0,r2                    @ insert exponent and sign
 lsls r3,#1                    @ rounding bit -> C, sticky bits -> Z
 bcc 3f                        @ no rounding
 beq 4f                        @ all sticky bits 0?
5:
 adds r0,#1                    @ round up
3:
 bx r14
4:
 lsrs r3,r0,#1                 @ odd? then round up
 bcs 5b
 bx r14
1:
 beq 6f                        @ check case where value is just less than smallest normal
7:
 lsrs r0,r1,#31                @ return signed zero
 lsls r0,#31
 bx r14
6:
 lsls r2,r1,#12                @ 20 1:s at top of mantissa?
 asrs r2,#12
 adds r2,#1
 bne 7b
 lsrs r2,r0,#29                @ and 3 more 1:s?
 cmp r2,#7
 bne 7b
 movs r2,#1                    @ return smallest normal with correct sign
 b 8f
2:
 movs r2,#0xff
8:
 lsrs r0,r1,#31                @ return signed infinity
 lsls r0,#8
 adds r0,r2
 lsls r0,#23
 bx r14
1385
double_section x2double_shims
@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding
@ all entry points funnel into uf2d with |mantissa| in r0:r1, -Q in r2 and
@ the sign replicated across all bits of r3

.align 2
regular_func uint2double_shim
@ uint32 -> double
 movs r1,#0                    @ and fall through
regular_func ufix2double_shim
@ unsigned 32-bit fixed point (r1 places after point) -> double
 movs r2,r1
 movs r1,#0
 b ufix642double_shim

.align 2
regular_func int2double_shim
@ int32 -> double
 movs r1,#0                    @ and fall through
regular_func fix2double_shim
@ signed 32-bit fixed point (r1 places after point) -> double
 movs r2,r1
 asrs r1,r0,#31                @ sign extend
 b fix642double_shim

.align 2
regular_func uint642double_shim
@ uint64 -> double
 movs r2,#0                    @ and fall through
regular_func ufix642double_shim
@ unsigned 64-bit fixed point (r2 places after point) -> double
 movs r3,#0                    @ sign = +
 b uf2d

.align 2
regular_func int642double_shim
@ int64 -> double
 movs r2,#0                    @ and fall through
regular_func fix642double_shim
@ signed 64-bit fixed point (r2 places after point) -> double
 asrs r3,r1,#31                @ sign bit across all bits
 eors r0,r3
 eors r1,r3
 subs r0,r3
 sbcs r1,r3                    @ r0:r1 = |value|
uf2d:
@ common tail: normalise the magnitude, round to 53 bits (to even on ties)
@ and pack with the sign from r3
 push {r4,r5,r14}
 ldr r4,=0x432
 subs r2,r4,r2                 @ form biased exponent
@ here
@ r0:r1 unnormalised mantissa
@ r2 -Q (will become exponent)
@ r3 sign across all bits
 cmp r1,#0
 bne 1f                        @ short normalising shift?
 movs r1,r0
 beq 2f                        @ zero? return it
 movs r0,#0
 subs r2,#32                   @ fix exponent
1:
 asrs r4,r1,#21
 bne 3f                        @ will need shift down (and rounding?)
 bcs 4f                        @ normalised already?
5:
 subs r2,#1
 adds r0,r0                    @ shift up
 adcs r1,r1
 lsrs r4,r1,#21
 bcc 5b
4:
 ldr r4,=0x7fe
 cmp r2,r4
 bhs 6f                        @ over/underflow? return signed zero/infinity
7:
 lsls r2,#20                   @ pack and return
 adds r1,r2
 lsls r3,#31
 adds r1,r3
2:
 pop {r4,r5,r15}
6:                             @ return signed zero/infinity according to unclamped exponent in r2
 mvns r2,r2
 lsrs r2,#21
 movs r0,#0
 movs r1,#0
 b 7b

3:
@ here we need to shift down to normalise and possibly round
 bmi 1f                        @ already normalised to Q63?
2:
 subs r2,#1
 adds r0,r0                    @ shift up
 adcs r1,r1
 bpl 2b
1:
@ here we have a 1 in b63 of r0:r1
 adds r2,#11                   @ correct exponent for subsequent shift down
 lsls r4,r0,#21                @ save bits for rounding
 lsrs r0,#11
 lsls r5,r1,#21
 orrs r0,r5
 lsrs r1,#11
 lsls r4,#1                    @ rounding bit -> C, sticky bits -> Z
 beq 1f                        @ sticky bits are zero?
8:
 movs r4,#0
 adcs r0,r4                    @ propagate rounding carry into mantissa
 adcs r1,r4
 b 4b
1:
 bcc 4b                        @ sticky bits are zero but not on rounding boundary
 lsrs r4,r0,#1                 @ increment if odd (force round to even)
 b 8b
1490
1491
1492.ltorg
1493
double_section dunpacks
@ unpack double in r0:r1 via the mdunpacks macro (defined earlier in the
@ file), then remove the IEEE bias so r2 holds the unbiased exponent
 regular_func dunpacks
 mdunpacks r0,r1,r2,r3,r4
 ldr r3,=0x3ff
 subs r2,r3                    @ exponent without offset
 bx r14
1500
@ r0:r1  signed mantissa Q52
@ r2     unbiased exponent < 10 (i.e., |x|<2^10)
@ r4     pointer to:
@          - divisor reciprocal approximation r=1/d Q15
@          - divisor d Q62  0..20
@          - divisor d Q62 21..41
@          - divisor d Q62 42..62
@ returns:
@ r0:r1  reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
@ r2     quotient q (number of reductions)
@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign
@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
double_section dreduce
 regular_func dreduce
 adds r2,#2                    @ e+2
 bmi 1f                        @ |x|<0.25, too small to need adjustment
 cmp r2,#12
 bge 4f                        @ |x|>=1024: clamp
2:
 movs r5,#17
 subs r5,r2                    @ 15-e
 movs r3,r1                    @ Q20
 asrs r3,r5                    @ x Q5
 adds r2,#8                    @ e+10
 adds r5,#7                    @ 22-e = 32-(e+10)
 movs r6,r0
 lsrs r6,r5
 lsls r0,r2
 lsls r1,r2
 orrs r1,r6                    @ r0:r1 x Q62
 ldmia r4,{r4-r7}              @ r4=1/d Q15; r5:r6:r7 = d in 21-bit limbs
 muls r3,r4                    @ rx Q20
 asrs r2,r3,#20
 movs r3,#0
 adcs r2,r3                    @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10
 muls r5,r2                    @ qd in pieces: L Q62
 muls r6,r2                    @               M Q41
 muls r7,r2                    @               H Q20
 lsls r7,#10
 asrs r4,r6,#11
 lsls r6,#21
 adds r6,r5
 adcs r7,r4
 asrs r5,#31
 adds r7,r5                    @ r6:r7 qd Q62
 subs r0,r6
 sbcs r1,r7                    @ remainder Q62
 bx r14
4:
 movs r2,#12                   @ overflow: clamp to +/-1024
 movs r0,#0
 asrs r1,#31                   @ mantissa = ±1 with sign of input
 lsls r1,#1
 adds r1,#1
 lsls r1,#20
 b 2b

1:
@ small argument: no reduction needed, just rescale to Q62 (q=0)
 lsls r1,#8
 lsrs r3,r0,#24
 orrs r1,r3
 lsls r0,#8                    @ r0:r1 Q60, to be shifted down -r2 places
 negs r3,r2
 adds r2,#32                   @ shift down in r3, complementary shift in r2
 bmi 1f                        @ long shift?
2:
 movs r4,r1
 asrs r1,r3
 lsls r4,r2
 lsrs r0,r3
 orrs r0,r4
 movs r2,#0                    @ rounding
 adcs r0,r2
 adcs r1,r2
 bx r14

1:
 movs r0,r1                    @ down 32 places
 asrs r1,#31
 subs r3,#32
 adds r2,#32
 bpl 2b
 movs r0,#0                    @ very long shift? return 0
 movs r1,#0
 movs r2,#0
 bx r14
1587
double_section dtan_shim
@ tan θ = sin θ / cos θ, both computed from one shared CORDIC pass
 regular_func dtan_shim
 push {r4-r7,r14}
 bl push_r8_r11
 bl dsincos_internal
 mov r12,r0                    @ save ε
 bl dcos_finish
 push {r0,r1}                  @ save cos θ
 mov r0,r12                    @ restore ε
 bl dsin_finish
 pop {r2,r3}                   @ r0:r1 = sin θ, r2:r3 = cos θ
 bl pop_r8_r11
 b ddiv0                       @ compute sin θ/cos θ
1601
double_section dcos_shim
@ cos θ: shared unpack/reduce/CORDIC pass, then the cos correction step;
@ the final branch targets the shared epilogue label in the dsin_shim
@ section, so the two sections must stay contiguous
 regular_func dcos_shim
 push {r4-r7,r14}
 bl push_r8_r11
 bl dsincos_internal
 bl dcos_finish
 b 1f                          @ shared epilogue in dsin_shim
1609
double_section dsin_shim
@ sin θ: shared unpack/reduce/CORDIC pass, then the sin correction step;
@ label 1 below is also the epilogue for dcos_shim above
 regular_func dsin_shim
 push {r4-r7,r14}
 bl push_r8_r11
 bl dsincos_internal
 bl dsin_finish
1:
 bl pop_r8_r11
 pop {r4-r7,r15}
1619
double_section dsincos_shim

 @ Note that this function returns in r0-r3
 regular_func dsincos_shim
@ compute sin θ (r0:r1) and cos θ (r2:r3) from a single CORDIC pass

 push {r4-r7,r14}
 bl push_r8_r11
 bl dsincos_internal
 mov r12,r0                    @ save ε
 bl dcos_finish
 push {r0,r1}                  @ save cos θ
 mov r0,r12                    @ restore ε
 bl dsin_finish
 pop {r2,r3}                   @ r0:r1 = sin θ, r2:r3 = cos θ
 bl pop_r8_r11
 pop {r4-r7,r15}
1636
double_section dtrig_guts

@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
@ θ=α+ε and |ε|≤2^-32
@ on return:
@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9   cos α Q62
@ r10:r11 sin α Q62
.align 2
.thumb_func
dsincos_internal:
 push {r14}
 bl dunpacks
 adr r4,dreddata0
 bl dreduce                    @ θ = q(π/2) + residual; quadrant count q in r2

 movs r4,#0
 ldr r5,=0x9df04dbb           @ this value compensates for the non-unity scaling of the CORDIC rotations
 ldr r6,=0x36f656c5
 lsls r2,#31                   @ C = bit 1 of q; bit 0 of q left in bit 31 of r2
 bcc 1f
@ quadrant 2 or 3
 mvns r6,r6                    @ negate the 64-bit starting value r5:r6
 negs r5,r5
 adcs r6,r4
1:
 lsls r2,#1                    @ C = bit 0 of q
 bcs 1f
@ even quadrant
 mov r10,r4                    @ sin α = 0
 mov r11,r4
 mov r8,r5                    @ cos α = ±scale constant
 mov r9,r6
 b 2f
1:
@ odd quadrant
 mov r8,r4                    @ cos α = 0
 mov r9,r4
 mov r10,r5                   @ sin α = ±scale constant
 mov r11,r6
2:
 adr r4,dtab_cc
 mov r12,r4                    @ coefficient pointer for the rotation steps
 movs r7,#1                    @ i (shift)
 movs r6,#31                   @ 32-i (complementary shift)
1:
 bl dcordic_rot_step
 adds r7,#1
 subs r6,#1
 cmp r7,#33
 bne 1b                        @ 32 steps, i=1..32
 pop {r15}
1689
.thumb_func
dcos_finish:
@ here
@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9   cos α Q62
@ r10:r11 sin α Q62
@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
 mov r1,r11
@ mov r2,r10
@ lsrs r2,#31
@ adds r1,r2                    @ rounding improves accuracy very slightly
 muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
@ r2:r3   ε sin α Q(62+62-32)=Q92
 mov r0,r8
 mov r1,r9
 lsls r5,r3,#2                 @ align the Q92 product down to Q62
 asrs r3,r3,#30
 lsrs r2,r2,#30                @ leaves rounding bit in carry for the sbcs
 orrs r2,r5
 sbcs r0,r2                    @ include rounding
 sbcs r1,r3
 movs r2,#62
 b fix642double_shim           @ pack Q62 fixed point as a double
1713
.thumb_func
dsin_finish:
@ here
@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9   cos α Q62
@ r10:r11 sin α Q62
@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
 mov r1,r9
 muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
@ r2:r3   ε cos α Q(62+62-32)=Q92
 mov r0,r10
 mov r1,r11
 lsls r5,r3,#2                 @ align the Q92 product down to Q62
 asrs r3,r3,#30
 lsrs r2,r2,#30                @ leaves rounding bit in carry for the adcs
 orrs r2,r5
 adcs r0,r2                    @ include rounding
 adcs r1,r3
 movs r2,#62
 b fix642double_shim           @ pack Q62 fixed point as a double
1734
1735.ltorg
.align 2
dreddata0:
@ range-reduction constants for the trig functions, in the layout that
@ dreduce expects: reciprocal Q15, then the divisor in three 21-bit limbs
.word 0x0000517d               @ 2/π Q15
.word 0x0014611A               @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
.word 0x000A8885
.word 0x001921FB
1742
1743
.align 2
regular_func datan2_shim
@ r0:r1 y
@ r2:r3 x
@ atan2(y,x): flush zero/Inf/NaN operands, fold into the octant
@ -π/4<~θ<~π/4 (tracking a quadrant offset in r6), run CORDIC vectoring
@ on y/x, then add back the quadrant offset in multiples of π/2
 push {r4-r7,r14}
 bl push_r8_r11
 ldr r5,=0x7ff00000
 movs r4,r1
 ands r4,r5                    @ y==0?
 beq 1f
 cmp r4,r5                     @ or Inf/NaN?
 bne 2f
1:
 lsrs r1,#20                   @ flush
 lsls r1,#20
 movs r0,#0
2:
 movs r4,r3
 ands r4,r5                    @ x==0?
 beq 1f
 cmp r4,r5                     @ or Inf/NaN?
 bne 2f
1:
 lsrs r3,#20                   @ flush
 lsls r3,#20
 movs r2,#0
2:
 movs r6,#0                    @ quadrant offset
 lsls r5,#11                   @ constant 0x80000000
 cmp r3,#0
 bpl 1f                        @ skip if x positive
 movs r6,#2
 eors r3,r5                    @ flip sign of x
 eors r1,r5                    @ flip sign of y
 bmi 1f                        @ quadrant offset=+2 if y was positive
 negs r6,r6                    @ quadrant offset=-2 if y was negative
1:
@ now in quadrant 0 or 3
 adds r7,r1,r5                 @ r7=-r1
 bpl 1f
@ y>=0: in quadrant 0
 cmp r1,r3
 ble 2f                        @ y<~x so 0≤θ<~π/4: skip
 adds r6,#1
 eors r1,r5                    @ negate x
 b 3f                          @ and exchange x and y = rotate by -π/2
1:
 cmp r3,r7
 bge 2f                        @ -y<~x so -π/4<~θ≤0: skip
 subs r6,#1
 eors r3,r5                    @ negate y and ...
3:
 movs r7,r0                    @ exchange x and y
 movs r0,r2
 movs r2,r7
 movs r7,r1
 movs r1,r3
 movs r3,r7
2:
@ here -π/4<~θ<~π/4
@ r6 has quadrant offset
 push {r6}
 cmp r2,#0
 bne 1f
 cmp r3,#0
 beq 10f                       @ x==0 going into division?
 lsls r4,r3,#1
 asrs r4,#21
 adds r4,#1
 bne 1f                        @ x==Inf going into division?
 lsls r4,r1,#1
 asrs r4,#21
 adds r4,#1                    @ y also ±Inf?
 bne 10f
 subs r1,#1                    @ make them both just finite
 subs r3,#1
 b 1f

10:
 movs r0,#0                    @ degenerate ratio: treat y/x as 0
 movs r1,#0
 b 12f

1:
 bl ddiv_shim
 movs r2,#62
 bl double2fix64_shim
@ r0:r1 y/x
 mov r10,r0
 mov r11,r1
 movs r0,#0                    @ ω=0
 movs r1,#0
 mov r8,r0
 movs r2,#1
 lsls r2,#30
 mov r9,r2                     @ x=1

 adr r4,dtab_cc
 mov r12,r4                    @ coefficient pointer for the vectoring steps
 movs r7,#1
 movs r6,#31
1:
 bl dcordic_vec_step
 adds r7,#1
 subs r6,#1
 cmp r7,#33
 bne 1b                        @ 32 steps, i=1..32
@ r0:r1   atan(y/x) Q62
@ r8:r9   x residual Q62
@ r10:r11 y residual Q62
 mov r2,r9
 mov r3,r10
 subs r2,#12                   @ this makes atan(0)==0
@ the following is basically a division residual y/x ~ atan(residual y/x)
 movs r4,#1
 lsls r4,#29
 movs r7,#0
2:
 lsrs r2,#1
 movs r3,r3                    @ preserve carry
 bmi 1f
 sbcs r3,r2
 adds r0,r4
 adcs r1,r7
 lsrs r4,#1
 bne 2b
 b 3f
1:
 adcs r3,r2
 subs r0,r4
 sbcs r1,r7
 lsrs r4,#1
 bne 2b
3:
 lsls r6,r1,#31
 asrs r1,#1
 lsrs r0,#1
 orrs r0,r6                    @ Q61

12:
 pop {r6}

 cmp r6,#0
 beq 1f                        @ no quadrant offset to add
 ldr r4,=0x885A308D           @ π/2 Q61
 ldr r5,=0x3243F6A8
 bpl 2f
 mvns r4,r4                    @ negative quadrant offset
 mvns r5,r5
2:
 lsls r6,#31
 bne 2f                        @ skip if quadrant offset is ±1
 adds r0,r4                    @ offset ±2: add π/2 twice in total
 adcs r1,r5
2:
 adds r0,r4
 adcs r1,r5
1:
 movs r2,#61
 bl fix642double_shim

 bl pop_r8_r11
 pop {r4-r7,r15}
1907
1908.ltorg
1909
dtab_cc:
@ CORDIC angle table: atan 2^-i Q62 for i=1..32, consumed sequentially
@ via r12 by dcordic_rot_step/dcordic_vec_step
.word 0x61bb4f69, 0x1dac6705   @ atan 2^-1 Q62
.word 0x96406eb1, 0x0fadbafc   @ atan 2^-2 Q62
.word 0xab0bdb72, 0x07f56ea6   @ atan 2^-3 Q62
.word 0xe59fbd39, 0x03feab76   @ atan 2^-4 Q62
.word 0xba97624b, 0x01ffd55b   @ atan 2^-5 Q62
.word 0xdddb94d6, 0x00fffaaa   @ atan 2^-6 Q62
.word 0x56eeea5d, 0x007fff55   @ atan 2^-7 Q62
.word 0xaab7776e, 0x003fffea   @ atan 2^-8 Q62
.word 0x5555bbbc, 0x001ffffd   @ atan 2^-9 Q62
.word 0xaaaaadde, 0x000fffff   @ atan 2^-10 Q62
.word 0xf555556f, 0x0007ffff   @ atan 2^-11 Q62
.word 0xfeaaaaab, 0x0003ffff   @ atan 2^-12 Q62
.word 0xffd55555, 0x0001ffff   @ atan 2^-13 Q62
.word 0xfffaaaab, 0x0000ffff   @ atan 2^-14 Q62
.word 0xffff5555, 0x00007fff   @ atan 2^-15 Q62
.word 0xffffeaab, 0x00003fff   @ atan 2^-16 Q62
.word 0xfffffd55, 0x00001fff   @ atan 2^-17 Q62
.word 0xffffffab, 0x00000fff   @ atan 2^-18 Q62
.word 0xfffffff5, 0x000007ff   @ atan 2^-19 Q62
.word 0xffffffff, 0x000003ff   @ atan 2^-20 Q62
.word 0x00000000, 0x00000200   @ atan 2^-21 Q62 @ consider optimising these
.word 0x00000000, 0x00000100   @ atan 2^-22 Q62
.word 0x00000000, 0x00000080   @ atan 2^-23 Q62
.word 0x00000000, 0x00000040   @ atan 2^-24 Q62
.word 0x00000000, 0x00000020   @ atan 2^-25 Q62
.word 0x00000000, 0x00000010   @ atan 2^-26 Q62
.word 0x00000000, 0x00000008   @ atan 2^-27 Q62
.word 0x00000000, 0x00000004   @ atan 2^-28 Q62
.word 0x00000000, 0x00000002   @ atan 2^-29 Q62
.word 0x00000000, 0x00000001   @ atan 2^-30 Q62
.word 0x80000000, 0x00000000   @ atan 2^-31 Q62
.word 0x40000000, 0x00000000   @ atan 2^-32 Q62
1943
double_section dexp_guts
regular_func dexp_shim
@ exp x: range-reduce by ln2 (x = q·ln2 + r, 0≤r<ln2 after fixup), then
@ compute exp r by multiplicative normalisation against dtab_exp, and
@ finally fold 2^q into the exponent of the packed result
 push {r4-r7,r14}
 bl dunpacks
 adr r4,dreddata1
 bl dreduce
 cmp r1,#0
 bge 1f                        @ negative remainder? add ln2 and decrement q
 ldr r4,=0xF473DE6B
 ldr r5,=0x2C5C85FD           @ ln2 Q62
 adds r0,r4
 adcs r1,r5
 subs r2,#1
1:
 push {r2}                     @ save quotient q (power of two applied at the end)
 movs r7,#1                    @ shift
 adr r6,dtab_exp
 movs r2,#0
 movs r3,#1
 lsls r3,#30                   @ x=1 Q62

3:
 ldmia r6!,{r4,r5}
 mov r12,r6
 subs r0,r4                    @ try subtracting log(1+2^-i)
 sbcs r1,r5
 bmi 1f                        @ went negative: restore and skip this factor

 negs r6,r7
 adds r6,#32                   @ complementary shift
 movs r5,r3
 asrs r5,r7
 movs r4,r3
 lsls r4,r6
 movs r6,r2
 lsrs r6,r7                    @ rounding bit in carry
 orrs r4,r6
 adcs r2,r4
 adcs r3,r5                    @ x+=x>>i
 b 2f

1:
 adds r0,r4                    @ restore argument
 adcs r1,r5
2:
 mov r6,r12
 adds r7,#1
 cmp r7,#33
 bne 3b                        @ 32 factors, i=1..32

@ here
@ r0:r1   ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r2:r3   exp a Q62
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
 muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
@ r4:r1 ε exp a Q(62+62-32)=Q92
 lsrs r4,#30                   @ align the Q92 product down to Q62
 lsls r0,r1,#2
 orrs r0,r4
 asrs r1,#30
 adds r0,r2
 adcs r1,r3

 pop {r2}
 negs r2,r2                    @ binary point position 62-q applies 2^q
 adds r2,#62
 bl fix642double_shim                 @ in principle we can pack faster than this because we know the exponent
 pop {r4-r7,r15}
2012
2013.ltorg
2014
.align 2
regular_func dln_shim
@ ln x: handle sign/zero (-Inf) and Inf/NaN (+Inf) specially; otherwise
@ write the mantissa as m/2 in [0.5,1), drive it towards 1 with factors
@ (1+2^-i) while accumulating -ln(1+2^-i) from dtab_exp, then add
@ (exponent+1)·ln2 (the +1 restores the halving of the mantissa)
 push {r4-r7,r14}
 lsls r7,r1,#1
 bcs 5f                        @ <0 ...
 asrs r7,#21
 beq 5f                        @ ... or =0? return -Inf
 adds r7,#1
 beq 6f                        @ Inf/NaN? return +Inf
 bl dunpacks
 push {r2}                     @ save unbiased exponent
 lsls r1,#9
 lsrs r2,r0,#23
 orrs r1,r2
 lsls r0,#9
@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1

 movs r7,#1                    @ shift
 adr r6,dtab_exp
 mov r12,r6
 movs r2,#0
 movs r3,#0                    @ y=0 Q62

3:
 negs r6,r7
 adds r6,#32                   @ complementary shift
 movs r5,r1
 asrs r5,r7
 movs r4,r1
 lsls r4,r6
 movs r6,r0
 lsrs r6,r7
 orrs r4,r6                    @ x>>i, rounding bit in carry
 adcs r4,r0
 adcs r5,r1                    @ x+(x>>i)

 lsrs r6,r5,#30
 bne 1f                        @ x+(x>>i)>1?
 movs r0,r4
 movs r1,r5                    @ x+=x>>i
 mov r6,r12
 ldmia r6!,{r4,r5}
 subs r2,r4                    @ y-=log(1+2^-i)
 sbcs r3,r5

1:
 movs r4,#8
 add r12,r4                    @ advance table pointer (whether or not used)
 adds r7,#1
 cmp r7,#33
 bne 3b                        @ 32 factors, i=1..32
@ here:
@ r0:r1 residual x, nearly 1 Q62
@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
@ result is y + ln2 + ln x ~ y + ln2 + (x-1)
 lsls r1,#2
 asrs r1,#2                    @ x-1
 adds r2,r0
 adcs r3,r1

 pop {r7}
@ here:
@ r2:r3 ln m/2 = ln m - ln2 Q62
@ r7    unbiased exponent
.equ dreddata1_plus_4, (dreddata1+4)
 adr r4,dreddata1_plus_4
 ldmia r4,{r0,r1,r4}           @ ln2 in three 21-bit limbs
 adds r7,#1                    @ e+1: the extra ln2 cancels the m/2 halving
 muls r0,r7                    @ Q62
 muls r1,r7                    @ Q41
 muls r4,r7                    @ Q20
 lsls r7,r1,#21
 asrs r1,#11
 asrs r5,r1,#31
 adds r0,r7
 adcs r1,r5
 lsls r7,r4,#10
 asrs r4,#22
 asrs r5,r1,#31
 adds r1,r7
 adcs r4,r5
@ r0:r1:r4 exponent*ln2 Q62
 asrs r5,r3,#31
 adds r0,r2
 adcs r1,r3
 adcs r4,r5
@ r0:r1:r4 result Q62
 movs r2,#62
1:
 asrs r5,r1,#31
 cmp r4,r5
 beq 2f                        @ r4 a sign extension of r1?
 lsrs r0,#4                    @ no: shift down 4 places and try again
 lsls r6,r1,#28
 orrs r0,r6
 lsrs r1,#4
 lsls r6,r4,#28
 orrs r1,r6
 asrs r4,#4
 subs r2,#4
 b 1b
2:
 bl fix642double_shim
 pop {r4-r7,r15}

5:
 ldr r1,=0xfff00000            @ -Inf
 movs r0,#0
 pop {r4-r7,r15}

6:
 ldr r1,=0x7ff00000            @ +Inf
 movs r0,#0
 pop {r4-r7,r15}
2129
2130.ltorg
2131
.align 2
dreddata1:
@ range-reduction constants for exp/ln, in the layout that dreduce
@ expects: reciprocal Q15, then the divisor in three 21-bit limbs
.word 0x0000B8AA               @ 1/ln2 Q15
.word 0x0013DE6B               @ ln2 Q62 Q62=2C5C85FDF473DE6B split into 21-bit pieces
.word 0x000FEFA3
.word 0x000B1721
2138
dtab_exp:
@ natural-log table: ln(1+2^-i) Q62 for i=1..32, used by dexp_shim
@ (subtractive normalisation of the argument) and dln_shim (accumulating
@ the logarithm of the applied factors)
.word 0xbf984bf3, 0x19f323ec   @ log 1+2^-1 Q62
.word 0xcd4d10d6, 0x0e47fbe3   @ log 1+2^-2 Q62
.word 0x8abcb97a, 0x0789c1db   @ log 1+2^-3 Q62
.word 0x022c54cc, 0x03e14618   @ log 1+2^-4 Q62
.word 0xe7833005, 0x01f829b0   @ log 1+2^-5 Q62
.word 0x87e01f1e, 0x00fe0545   @ log 1+2^-6 Q62
.word 0xac419e24, 0x007f80a9   @ log 1+2^-7 Q62
.word 0x45621781, 0x003fe015   @ log 1+2^-8 Q62
.word 0xa9ab10e6, 0x001ff802   @ log 1+2^-9 Q62
.word 0x55455888, 0x000ffe00   @ log 1+2^-10 Q62
.word 0x0aa9aac4, 0x0007ff80   @ log 1+2^-11 Q62
.word 0x01554556, 0x0003ffe0   @ log 1+2^-12 Q62
.word 0x002aa9ab, 0x0001fff8   @ log 1+2^-13 Q62
.word 0x00055545, 0x0000fffe   @ log 1+2^-14 Q62
.word 0x8000aaaa, 0x00007fff   @ log 1+2^-15 Q62
.word 0xe0001555, 0x00003fff   @ log 1+2^-16 Q62
.word 0xf80002ab, 0x00001fff   @ log 1+2^-17 Q62
.word 0xfe000055, 0x00000fff   @ log 1+2^-18 Q62
.word 0xff80000b, 0x000007ff   @ log 1+2^-19 Q62
.word 0xffe00001, 0x000003ff   @ log 1+2^-20 Q62
.word 0xfff80000, 0x000001ff   @ log 1+2^-21 Q62
.word 0xfffe0000, 0x000000ff   @ log 1+2^-22 Q62
.word 0xffff8000, 0x0000007f   @ log 1+2^-23 Q62
.word 0xffffe000, 0x0000003f   @ log 1+2^-24 Q62
.word 0xfffff800, 0x0000001f   @ log 1+2^-25 Q62
.word 0xfffffe00, 0x0000000f   @ log 1+2^-26 Q62
.word 0xffffff80, 0x00000007   @ log 1+2^-27 Q62
.word 0xffffffe0, 0x00000003   @ log 1+2^-28 Q62
.word 0xfffffff8, 0x00000001   @ log 1+2^-29 Q62
.word 0xfffffffe, 0x00000000   @ log 1+2^-30 Q62
.word 0x80000000, 0x00000000   @ log 1+2^-31 Q62
.word 0x40000000, 0x00000000   @ log 1+2^-32 Q62
2172
2173
2174#endif
2175