1/*
2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#if !PICO_RP2040
8#include "pico/asm_helper.S"
9
10pico_default_asm_setup
11
12.macro double_section name
13#if PICO_DOUBLE_IN_RAM
14.section RAM_SECTION_NAME(\name), "ax"
15#else
16.section SECTION_NAME(\name), "ax"
17#endif
18.endm
19
20.macro double_wrapper_section func
21double_section WRAPPER_FUNC_NAME(\func)
22.endm
23
24.global rtwopi
25.global logtab0
26.global exptab0
27.global exptab1
28.global exptab2
29.global trigtab
30
31@ load a 32-bit constant n into register rx
32.macro movlong rx,n
33 movw \rx,#(\n)&0xffff
34 movt \rx,#((\n)>>16)&0xffff
35.endm
36
37double_section rtwopi
38
39// 1/2π to plenty of accuracy, 256 bits per line
40.long 0                      @ this allows values of e down to -32
41rtwopi:
42.long 0,0
43.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410, 0x7F9458EA, 0xF7AEF158
44.long 0x6DC91B8E, 0x909374B8, 0x01924BBA, 0x82746487, 0x3F877AC7, 0x2C4A69CF, 0xBA208D7D, 0x4BAED121
45.long 0x3A671C09, 0xAD17DF90, 0x4E64758E, 0x60D4CE7D, 0x272117E2, 0xEF7E4A0E, 0xC7FE25FF, 0xF7816603
46.long 0xFBCBC462, 0xD6829B47, 0xDB4D9FB3, 0xC9F2C26D, 0xD3D18FD9, 0xA797FA8B, 0x5D49EEB1, 0xFAF97C5E
47.long 0xCF41CE7D, 0xE294A4BA, 0x9AFED7EC, 0x47E35742, 0x1580CC11
48
49double_section drrcore
50
51@ input:
52@ r0:r1 mantissa m Q52
53@ r2 exponent e>=-32, typically offset by +12
54@ r3 pointer to rtwopi table
55@ output:
56@ r0..r2 preserved
57@ r7:r8 range reduced result
58@ r3,r4,r5,r6,r9,r10 trashed
59.thumb_func
60drr_core:
61 ldr r3,=rtwopi
62 and r5,r2,#31               @ s=e%32
63 mov r7,#1
64 lsls r7,r7,r5               @ 1<<s
65 asrs r8,r2,#5               @ k=e/32, k<=32 for double with e offset <32; with e offsets up to 12+64, k<=34
66 umull r4,r5,r7,r0
67 movs r6,#0
68 umlal r5,r6,r7,r1
69@ r2       e
70@ r4:r5:r6 u0:u1:u2 = m<<(e%32); u2 is never more than 2<<20
71@ r8       e/32
72 add r3,r3,r8,lsl#2          @ p
73
74 ldr r9,[r3,#16]             @ a0=p[4]
75 umull r10,r7,r9,r6          @ r0=a0*u2 hi, discard lo
76@ r7  r0
77 ldr r9,[r3,#12]             @ a1=p[3]
78 umull r10,r8,r9,r5          @ r1=a1*u1 hi, discard lo
79 umaal r7,r8,r9,r6           @ r0:r1=r0+r1+a1*u2
80@ r7:r8  r0:r1
81 ldr r9,[r3,#8]              @ a2=p[2]
82 mla r8,r9,r6,r8             @ r1+=a2*u2
83 umlal r7,r8,r9,r5           @ r0:r1+=a2*u1
84 umull r10,r6,r9,r4          @ r0x=a2*u0 hi, discard lo; u2 no longer needed
85@ r7:r8 r0:r1
86@ r6 r0x
87 ldr r9,[r3,#4]              @ a3=p[1]
88 mla r8,r9,r5,r8             @ r1+=a3*u1
89 umlal r7,r8,r9,r4           @ r0:r1+=a3*u0
90@ r7:r8 r0:r1
91@ r6 r0x
92 ldr r9,[r3,#0]              @ a4=p[0]
93 mla r8,r9,r4,r8             @ r1+=a4*u0
94@ r7:r8 r0:r1
95@ r6 r0x
96 adds r7,r6
97 adc r8, r8, #0
98 bx r14
99
100double_wrapper_section tan
101wrapper_func tan
102 push {r14}
103 bl sincos_raw
104 bl __aeabi_ddiv
105 pop {r15}
106
107double_wrapper_section sin_cos
108
10910:                          @ process Inf/NaN for dsin and dcos
110 orrs r2,r0,r1,lsl#12
111 it eq
112 orreq r1,#0x80000000
113 orr r1,#0x00080000
114 movs r3,r1                  @ copy results to cosine output
115 movs r2,r0
116 bx r14
117
118@ case where angle is very small (<2^-32) before reduction
11932:
120 cbnz r3,33f
121 movs r0,#0                  @ flush denormal
122 movs r1,#0
12333:
124 movs r2,#0                  @ return x for sine, 1 for cosine
125 movlong r3,0x3ff00000
126 tst r12,#2
127 itt ne
128 movne r0,r2                 @ calculating cosine? move to result registers
129 movne r1,r3
130 bx r14
131
132@ case where angle is fairly small after reduction
13330:
134 movs r2,#0
135 movs r3,#0
136 movs r4,#0
137 movs r5,#0x80000000
138 b 31f
139
14040:
141@ case where range-reduced angle is very small
142@ here
143@ r0:r1 mantissa
144@ r2 exponent+12
145@ r7:r8 abs range-reduced angle < 2^-10 Q64
146@ r12b31: dsincos flag
147@ r12b30: original argument sign
148@ r12b2..1: quadrant count
149@ r12b0: sign of reduced angle
150 push {r12}
151 movs r12,#0
1522:
153 add r12,#4
154 add r2,#4
155 bl drr_core                 @ repeat range reduction with extra factor of 2^4 (, 2^8, 2^12, 2^16,...)
156 eors r9,r8,r8,asr#31        @ we want to keep the sign in r8b31 for later
157 cmp r9,#1<<24               @ ≥ 2^-8?
158 bhs 1f                      @ loop until the result is big enough
159 cmp r12,#64                 @ safety net
160 bne 2b
1611:
162 eors r7,r7,r8,asr#31
163@ here r7:r9 is the abs range-reduced angle Q64+r12, 2^-8..2^-4 in Q64 terms
164
165@ 2π=6.487ED511 0B4611A6 (2633...)
166 movlong r6,0x0B4611A6       @ 2π Q64 low fractional word
167 umull r10,r0,r9,r6
168 movlong r6,0x487ED511       @ 2π Q64 high fractional word
169 umull r10,r1,r7,r6
170 umaal r0,r1,r9,r6
171 movs r6,#6                  @ 2π integer part
172 umlal r0,r1,r7,r6
173 mla r1,r9,r6,r1
174
175@ here
176@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12
177@ r8b31: sign of reduced angle
178@ r12: excess exponent4, multiple of 4
179@ r0:r1 / 2^r12 < ~ 2π * 2^-10 so for sin we need to go to term in x^5
180 rsbs r10,r12,#32
181 bmi 1f                      @ very small result?
182
183 lsr r3,r1,r12
184 lsr r2,r0,r12
185 lsl r9,r1,r10
186 orr r2,r9
187@ r2:r3 θ Q64 (with r12 bits of loss of precision)
188 umull r9,r4,r2,r3
189 umull r9,r5,r2,r3
190 umaal r4,r5,r3,r3
191@ r4:r5 θ² Q64 = θ²/2 Q65 < ~ 4π² * 2^-20
192 umull r9,r6,r4,r5
193 umull r9,r7,r4,r5
194 umaal r6,r7,r5,r5
195@ r6:r7 θ⁴ Q64 < ~ 16π⁴ * 2^-40 < 2^-29
196 lsrs r6,#3
197 orrs r6,r6,r7,lsl#29
198@ r6 θ⁴ Q61
199 mov r9,#0xaaaaaaaa          @ 2/3 Q32
200 umull r6,r7,r6,r9
201@ r7 θ⁴ * 2/3 Q61 = θ⁴/24 Q65
202 rsbs r6,r4,r7
203 sbcs r7,r5,r5,lsl#1
204@ r6:r7  - θ²/2 + θ⁴/24 Q65, <=0
205 asrs r9,r7,#12
206 lsrs r6,#12
207 adcs r6,r6,r7,lsl#20
208 adcs r7,r9,#0x0ff00000
209 adds r7,#0x30000000
210 push {r6,r7}                @ packed cos θ
211
212@ here
213@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12
214@ r4:r5 θ² Q64 = θ²/2 Q65
215 umull r9,r2,r0,r5
216 umull r9,r3,r1,r4
217 umaal r2,r3,r1,r5
218@ r2:r3 θ³ Q64+r12
219 umull r9,r6,r2,r5
220 umull r9,r7,r3,r4
221 umaal r6,r7,r3,r5
222@ r6:r7 θ⁵ Q64+r12; in fact r7 is always 0
223 mov r9,#0xaaaaaaaa          @ 2/3 Q32
224 umull r10,r4,r2,r9
225 movs r5,#0
226 umlal r4,r5,r3,r9
227 adds r4,r4,r5
228 adcs r5,r5,#0
229@ r4:r5 θ³*2/3 Q64+r12 = θ³/6 Q66+r12
230 mov r9,#0x88888888          @ 8/15 Q32
231 umull r2,r3,r6,r9
232@ r3 θ⁵*8/15 Q64+r12 = θ⁵/120 Q70+r12
233 subs r4,r4,r3,lsr#4
234 sbc r5,r5,#0
235@ r4:r5 θ³/6-θ⁵/120 Q66+r12
236 subs r0,r0,r4,lsr#2
237 sbc r1,r1,r5,asr#2
238 subs r0,r0,r5,lsl#30
239 sbc r1,r1,#0
240@ r0:r1 θ-θ³/6+θ⁵/120 Q64+r12
241 rsb r4,r12,#0x400
242 sub r4,#14
243 bl dpack_q
244 pop {r6,r7,r12}             @ get cosine, flags
245 b 2f
246
2471:
248@ case where range reduction result has excess exponent32
249@ so sin x=x, cos x=1
250 rsb r4,r12,#0x400
251 sub r4,#14
252 bl dpack_q
253 pop {r12}                   @ get flags
254 movs r6,#0
255 movlong r7,0x3ff00000
2562:
257@ here r0:r1 is packed sine, r6:r7 is packed cosine
258 lsrs r8,#31
259 bfi r12,r8,#0,#1
260
261 asrs r9,r12,#1
262 bmi 23f                     @ doing dsincos?
263 asrs r9,#1
264 bcc 21f                     @ need sine?
265@ need cosine:
266 ands r12,#4
267 orrs r1,r7,r12,lsl#29       @ insert sign
268 movs r0,r6
269 pop {r4-r10,r15}
270
27121:
272 eors r12,r12,r12,lsr#2
273 orrs r1,r1,r12,lsl#31       @ insert sign
274 pop {r4-r10,r15}
275
27623:
277 ands r4,r12,#4
278 eors r7,r7,r4,lsl#29        @ insert sign
279 push {r6,r7}
280 b 20f
281
282@ sincos à la GNU with pointers to where to put results
283wrapper_func sincos
284 push {r2-r5, lr}
285 bl sincos_raw
286 pop {r4-r5}
287 stmia r4!, {r0, r1}
288 stmia r5!, {r2, r3}
289 pop {r4, r5, pc}
290
291@ sincos with results in r0:r1 and r2:r3
292.thumb_func
293sincos_raw:
294 ands r12,r1,#1<<31
295 lsrs r12,#1                 @ save argument sign in r12b30
296 orrs r12,r12,#1<<31         @ flag we want both results in r12b31
297 b 1f
298
299wrapper_func sin
300 lsrs r12,r1,#29             @ negative argument -> +2 quadrants
301 ands r12,#4
302 b 1f
303
304wrapper_func cos
305 movs r12,#2                 @ cos -> +1 quadrant
3061:
307 ubfx r3,r1,#20,#11          @ get exponent
308 sub r2,r3,#0x3ff
309 cmp r2,#0x400
310 beq 10b                     @ Inf or NaN?
311 cmn r2,#32
312 blt 32b                     @ very small argument?
313 movs r3,#1
314 bfi r1,r3,#20,#12           @ fix implied 1 in mantissa
315 push {r4-r10,r14}
316 add r2,#12                  @ e+12
317 bl drr_core
318@ r7:r8 θ/2π 0..1 Q64
319 lsrs r4,r8,#30              @ quadrant count
320 adc r4,r4,#0                @ rounded
321 sub r8,r8,r4,lsl#30         @ now -0.125r7:r8<+0.125 Q64
322 add r12,r12,r4,lsl#1
323 orr r12,r12,r8,lsr#31       @ sign into r12b0
324@ r12b2..1: quadrant count
325@ r12b0: sign of reduced angle
326 eors r7,r7,r8,asr#31
327 eors r8,r8,r8,asr#31        @ absolute value of reduced angle 0≤r7:r8<0.125 Q64
328 cmp r8,#1<<22               @ < 2^-10?
329 blo 40b
330
331@ 2π=6.487ED511 0B4611A6 (2633...)
332 movlong r9,0x0B4611A6       @ 2π Q64 low fractional word
333 umull r10,r0,r8,r9
334 movlong r9,0x487ED511       @ 2π Q64 high fractional word
335 umull r10,r1,r7,r9
336 umaal r0,r1,r8,r9
337 movs r9,#6                  @ 2π integer part
338 umlal r0,r1,r7,r9
339 mla r1,r8,r9,r1
340@ r0:r1 range reduced angle θ 0..π/4 Q64
341
342 cmp r1,#1<<25               @ θ < 2^-7?
343 blo 30b
344 lsrs r2,r1,#27
345 ldr r3,=trigtab
346 add r3,r3,r2,lsl#4
347 ldmia r3,{r2-r5}
34831:
349 subs r0,r0,r2
350 sbcs r1,r1,r3               @ ε=Θ-φ Q64
351 bmi 2f                      @ ε negative?
352
353 asrs r6,r12,#1
354 bmi 5f                      @ doing dsincos?
355 asrs r6,#1
356 bcs 3f                      @ need cosine?
357
358@ here ε is positive, we need the sin of θ and the sign of the result is r12b0^r12b2
359 bl dsc_h0
360 bl dsc_h0s
361 bl dpack_q63
36221:
363 eors r12,r12,r12,lsr#2
364 orrs r1,r1,r12,lsl#31       @ insert sign
365 pop {r4-r10,r15}
366
3672:
368 asrs r6,r12,#1
369 bmi 6f                      @ doing dsincos?
370 asrs r6,#1
371 bcs 4f                      @ need cosine?
372
373@ here ε is negative, we need the sin of θ and the sign of the result is r12b0^r12b2
374 bl dsc_h1
375 bl dsc_h1s
376 bl dnegpack_q63
377 eors r12,r12,r12,lsr#2
378 orrs r1,r1,r12,lsl#31       @ insert sign
379 pop {r4-r10,r15}
380
381@ here ε is positive, we need the cos of θ and the sign of the result is r12b2
3823:
383 bl dsc_h0
384 bl dsc_h0c
385 bl dnegpack_q63
38622:
387 ands r12,#4
388 orrs r1,r1,r12,lsl#29       @ insert sign
389 pop {r4-r10,r15}
390
391@ here ε is negative, we need the cos of θ and the sign of the result is r12b2
3924:
393 bl dsc_h1
394 bl dsc_h1c
39515:
396 bl dpack_q63
397 ands r12,#4
398 orrs r1,r1,r12,lsl#29       @ insert sign
399 pop {r4-r10,r15}
400
4015:
402@ dsincos, ε positive
403 bl dsc_h0
404 push {r2-r7}
405 bl dsc_h0c
406 bl dnegpack_q63
407 ands r4,r12,#4
408 eors r1,r1,r4,lsl#29        @ negate cosine in quadrants 2 and 3
409 pop {r2-r7}
410 push {r0,r1}
411 bl dsc_h0s
412 bl dpack_q63
41320:
414 eors r4,r12,r12,lsr#1
415 eors r4,r4,r12,lsr#2
416 eors r1,r1,r4,lsl#31        @ negate sine on b0^b1^b2
417 tst r12,#2                  @ exchange sine and cosine in odd quadrants
418 ittte ne
419 movne r2,r0
420 movne r3,r1
421 popne {r0,r1}
422 popeq {r2,r3}
423 ands r4,r12,#1<<30
424 eors r1,r1,r4,lsl#1         @ negate sine result if argument was negative
425 pop {r4-r10,r15}
426
4276:
428@ dsincos, ε negative
429 bl dsc_h1
430 push {r2-r7}
431 bl dsc_h1c
432 bl dpack_q63
433 ands r4,r12,#4
434 eors r1,r1,r4,lsl#29        @ negate cosine in quadrants 2 and 3
435 pop {r2-r7}
436 push {r0,r1}
437 bl dsc_h1s
438 bl dnegpack_q63
439 b 20b
440
441@ sin/cos power series for negative ε
442dsc_h1:
443 rsbs r0,r0,#0
444 sbc r1,r1,r1,lsl#1
445@ drop into positive ε code
446
447@ sin/cos power series for positive ε
448dsc_h0:
449@ r0:r1 ε Q64
450@ r4:   sin φ Q32
451@ r5:   cos φ Q32
452 umull r6,r7,r1,r1
453 umull r2,r3,r0,r1
454 lsrs r7,#1
455 rrx r6,r6
456 adds r2,r6,r3
457 adc r3,r7,#0
458@ r2:r3 ε²/2 Q64
459 umull r7,r6,r3,r0
460 umlal r7,r6,r2,r1
461 movs r7,#0
462 umlal r6,r7,r3,r1
463@ r6:r7 ε³/2 Q64
464 mov r8,#0x55555555
465 umull r9,r6,r6,r8
466 mov r9,#0
467 umlal r6,r9,r7,r8
468 adds r6,r6,r9
469 adcs r7,r9,#0
470@ r6:r7 ε³/6 Q64
471 subs r6,r0,r6
472 sbcs r7,r1,r7
473@ r6:r7 ε-ε³/6 Q64
474 mov r0,r3,lsl#12            @ ε²/2 Q44
475 orr r0,r0,r2,lsr#20
476 umull r0,r9,r0,r0           @ r9: ε⁴/4 Q88-32=Q56 = ε⁴/32 Q59
477 umlal r0,r9,r9,r8           @ r9: ε⁴/24 Q59
478
479 umull r0,r8,r9,r1           @ ε⁵/24 Q59
480 mov r0,#0x33333333
481 umull r0,r8,r8,r0           @ r8: ε⁵/120 Q59
482@ r6:r7 ε-ε³/6+ε⁵/120 Q64
483 smmulr r10,r8,r1            @ r10: ε⁶/120 Q59
484 movlong r0,0x2aaaaaaa
485 smmlsr r9,r0,r10,r9         @ ε⁴/24-ε⁶/720
486 subs r2,r2,r9,lsl#5
487 sbcs r3,r3,r9,lsr#27
488@ r2:r3 ε²/2-ε⁴/24+ε⁶/720 Q64
489 smmulr r10,r10,r1           @ r10: ε⁷/120 Q59
490 mov r0,#0x6180000           @ 1/42 Q64
491 smmlsr r8,r10,r0,r8         @ r8: ε⁵/120-ε⁷/5040 Q59
492 adds r6,r6,r8,lsl#5
493 adcs r7,r7,r8,lsr#27
494 bx r14
495
496dsc_h0s:
497@ postprocess for sine, positive ε
498@ r2:r3 1-cos |ε| Q64
499@ r4:   sin φ Q31
500@ r5:   cos φ Q31
501@ r6:r7 sin |ε| Q64
502 umull r8,r0,r2,r4
503 mov r1,#0
504 umlal r0,r1,r3,r4
505@ r0:r1 sin φ(1-cos ε)
506 umull r8,r3,r6,r5
507 umlal r3,r4,r7,r5
508@ r3:r4 sin φ+cos φ.sin ε
509 subs r0,r3,r0
510 sbc r1,r4,r1
511@ r0:r1 sin φ+cos φ.sin ε - sin φ(1-cos ε) = sin φ.cos ε + cos φ.sin ε = sin(φ+ε)
512 bx r14
513
514dsc_h1s:
515@ postprocess for sine, negative ε
516@ r2:r3 1-cos |ε| Q64
517@ r4:   sin φ Q31
518@ r5:   cos φ Q31
519@ r6:r7 sin |ε| Q64
520 umull r8,r0,r2,r4
521 mov r1,#0
522 umlal r0,r1,r3,r4
523@ r0:r1 sin φ(1-cos ε)
524 umull r8,r3,r6,r5
525 rsbs r4,r4,#0
526 umlal r3,r4,r7,r5
527@ r3:r4 -sin φ-cos φ.sin ε
528 adds r0,r3,r0
529 adc r1,r4,r1
530@ r0:r1 -sin φ-cos φ.sin ε + sin φ(1-cos ε) = -sin φ.cos ε - cos φ.sin ε = -sin(φ+ε)
531 bx r14
532
533dsc_h0c:
534@ postprocess for cosine, positive ε
535@ r2:r3 1-cos |ε| Q64
536@ r4:   sin φ Q32
537@ r5:   cos φ Q32
538@ r6:r7 sin |ε| Q64
539 umull r8,r0,r2,r5
540 mov r1,#0
541 umlal r0,r1,r3,r5
542@ r0:r1 cos φ(1-cos ε)
543 umull r8,r3,r6,r4
544 rsbs r5,#0
545 umlal r3,r5,r7,r4
546@ r3:r5 -cos φ+sin φ.sin ε
547 adds r0,r3,r0
548 adc r1,r5,r1
549@ r0:r1 -cos φ+sin φ.sin ε + cos φ(1-cos ε) = - cos φ.cos ε + sin φ.sin ε = -cos(φ+ε)
550 bx r14
551
552dsc_h1c:
553@ postprocess for cosine, negative ε
554@ r2:r3 1-cos |ε| Q64
555@ r4:   sin φ Q32
556@ r5:   cos φ Q32
557@ r6:r7 sin |ε| Q64
558 umull r8,r0,r2,r5
559 mov r1,#0
560 umlal r0,r1,r3,r5
561@ r0:r1 cos φ(1-cos ε)
562 umull r8,r3,r6,r4
563 umlal r3,r5,r7,r4
564@ r3:r5 cos φ-sin φ.sin ε
565 subs r0,r3,r0
566 sbc r1,r5,r1
567@ r0:r1 cos φ-sin φ.sin ε - cos φ(1-cos ε) = cos φ.cos ε - sin φ.sin ε = cos(φ+ε)
568 bx r14
569
570double_section dpack
571
572@ dnegpack: negate and pack
573@ dpack_q63:
574@ input
575@ r0:r1 Q63 result, must not be zero
576@ r4    exponent offset [dpack_q only]
577@ output
578@ r0:r1 IEEE double
579@ trashes r2,r3,r4
580.thumb_func
581dnegpack_q63:
582 rsbs r0,r0,#0
583 sbc r1,r1,r1,lsl#1
584.thumb_func
585dpack_q63:
586 mov r4,#0x3ff-12            @ exponent
587.thumb_func
588dpack_q:
589 clz r2,r1
590 cmp r2,#12
591 bhs 1f
592 adds r2,#21
593 lsls r3,r1,r2
594 rsb r2,#32
595 lsrs r0,r0,r2               @ save rounding bit in carry
596 lsr r1,r1,r2
597 add r2,r2,r4
598 adcs r0,r0,r3               @ with rounding
599 adc r1,r1,r2,lsl#20         @ insert exponent
600 bx r14
601
6021:
603 cbz r1,2f
604 rsb r2,#43
605 lsrs r3,r0,r2
606 rsb r2,#32
607 lsls r1,r1,r2
608 lsls r0,r0,r2
609 orrs r1,r3
610 sub r2,r4,r2
611 add r1,r1,r2,lsl#20
612 bx r14
613
6142:
615 movs r1,r0
616 mov r0,#0
617 sub r4,#32
618 bne dpack_q
619 bx r14
620
621
622
623double_section dreduce
624
625@ input:
626@ r0:r1 x, double (only mantissa bits used)
627@ r2    quotient offset
628@ r3    exponent e of x with 0x3ff bias removed, -32e<12 so 2^-32x<2^12
629@ r4:r5 r Q64, 0.5r<1
630@ r6    1/r underestimate Q31
631@ output:
632@ r0:r1 x mod r Q64 [possibly slightly > r?]
633@ r2    quotient+offset
634@ r4:r5 r preserved
635@ trashes r7,r8
636@ increases r2 by up to 2^13
637@ this version only used by dexp
638.thumb_func
639dreduce:
640 movs r7,#1
641 bfi r1,r7,#20,#12           @ insert implied 1, clear exponent and sign
642 lsls r8,r7,r3
643 beq 1f                      @ e<0, x<1
644 umull r0,r7,r0,r8
645 mla r1,r1,r8,r7
646@ r0:r1 x Q52
647 umull r7,r8,r1,r6           @ Q83-32=Q51
648 lsrs r6,r8,#19              @ q Q0
649 adds r2,r2,r6
650 umull r7,r8,r6,r4
651 mla r8,r6,r5,r8             @ r7:r8 q*r Q64
652 lsls r1,#12
653 orrs r1,r1,r0,lsr#20
654 rsbs r0,r7,r0,lsl#12
655 sbc r1,r1,r8                @ x-qr Q64
656@ check we never return slightly more than r
657 cmp r1,r5                   @ quick comparison
658 it lo
659 bxlo r14
660 b 2f
661
6621:
663 adds r3,#12
664 movs r7,#1
665 lsls r8,r7,r3
666 beq 1f                      @ e<-12
667 umull r0,r7,r0,r8
668 mla r1,r1,r8,r7             @ x Q64
669 cmp r1,r5                   @ quick comparison
670 it lo
671 bxlo r14
6722:
673 it eq
674 cmpeq r0,r4
675 it lo
676 bxlo r14
677 subs r0,r0,r4               @ subtract one r
678 sbc r1,r1,r5
679 adds r2,#1
680 bx r14
681
6821:
683@ here e<-12, have to shift r0:r1 down by -r3 places
684 add r3,#32
685 lsls r6,r1,r3
686 rsbs r3,#32
687 lsrs r0,r0,r3
688 lsrs r1,r1,r3
689 orrs r0,r0,r6
690 bx r14
691
692double_section exptab
693
694.align 2
695exptab0:
696.quad 0x0000000000000000,0x0f85186008b15304,0x1e27076e2af2e5c8,0x2f57120421b2120d
697.quad 0x3f7230dabc7c5512,0x4e993155a517a717,0x5fabe0ee0abf0d9d,0x6fad36769c6defe3
698.quad 0x7ebd623de3cc7b69,0x8f42faf3820681f0,0x9ec813538ab7d537,0xaf70154920b3ab7f
699exptab1:
700.quad 0x0000000000000000,0x00ff805515885e02,0x01fe02a6b1067890,0x02fb88ebf0214edc
701.quad 0x03f815161f807c7a,0x04f3a910d1a95d3c,0x05ee46c1f56c46aa,0x06e7f009ebe465ff
702.quad 0x07e0a6c39e0cc013,0x08d86cc491ecbfe1,0x09cf43dcff5eafd5,0x0ac52dd7e4726a46
703.quad 0x0bba2c7b196e7e23,0x0cae41876471f5bf,0x0da16eb88cb8df61,0x0e93b5c56d85a909
704.quad 0x0f85186008b15331,0x1075983598e47130
705exptab2:
706.quad 0x000fff8005551559,0x002ffb808febc309,0x004ff3829a0e91b1,0x006fe78722fde71f
707.quad 0x008fd78f299aa0c3,0x00afc39bac66434f,0x00cfabada9832a41,0x00ef8fc61eb4b74f
708.quad 0x010f6fe6095f81b6,0x012f4c0e66898567,0x014f244032da521a,0x016ef87c6a9b3a48
709.quad 0x018ec8c409b781ff,0x01ae95180bbc8d9c,0x01ce5d796bda1070,0x01ee21e924e23b3a
710logtab0:
711.quad 0xa0ec7f4233957338,0x918986bdf5fa1431,0x8391f2e0e6fa026b,0x7751a813071282e6
712.quad 0x6a73b26a68212621,0x5fabe0ee0abf0d9d,0x546ab61cb7e0b419,0x48a507ef3de59695
713.quad 0x3c4e0edc55e5cbd1,0x32a4b539e8ad68ce,0x289a56d996fa3ccb,0x21aefcf9a11cb2c9
714.quad 0x16f0d28ae56b4b86,0x0f85186008b15304,0x07e0a6c39e0cc002,0x0000000000000000
715
716.align 2
717exprrdata:
718.quad 0xB17217F7D1CF79AC     @ ln2 Q64
719.long 0xB8AA3B29             @ 1/ln2 Q31, rounded down
720
721double_wrapper_section exp
722
7232:
724@ could use dadd macro to calculate x+1 here
725 lsl r0,r1,#11
726 orr r0,#0x80000000
727 lsls r1,#1
728 adc r3,r3,#32
729 movlong r1,0x3ff00000
730 rsb r3,#11
731 lsr r0,r3
732 it cc
733 bxcc r14
734 rsbs r0,#0
735 sbc r1,r1,#0
736 bx r14
737
738wrapper_func exp
739 movs r12,r1,lsr#31          @ save sign
740 ubfx r3,r1,#20,#11          @ get exponent
741 sub r3,r3,#0x3ff
742 cmp r3,#12
743 bge 20f                     @ overflow, Inf or NaN?
744 cmn r3,#0x20
745 ble 2b                      @ <2^-32? return x+1
746 push {r4-r8,r14}
747 ldr r4,=exprrdata
748 ldmia r4,{r4-r6}
749 mov r2,#0
750 bl dreduce
751 tst r12,#1
752 beq 1f
753 mvn r2,r2                   @ quotient is now signed
754 subs r0,r4,r0
755 sbc r1,r5,r1
7561:
757 add r12,r2,#0x3fe           @ exponent offset
758 mov r3,#0x7fe
759 cmp r12,r3
760 bhs 1f                      @ under/overflow
761 lsrs r2,r1,#28
762 ldr r3,=exptab0
763 add r3,r3,r2,lsl#3
764 ldmia r3,{r2-r3}
765 and r5,r2,#63
766 orr r5,#64                  @ y=(t&0x3f)+0x40; Q6
767 subs r0,r2
768 sbcs r1,r3
769 lsrs r2,r1,#24
770 ldr r3,=exptab1
771 add r3,r3,r2,lsl#3
772 ldmia r3,{r3-r4}
773 add r2,#256
774 muls r5,r5,r2               @ y Q14
775 subs r0,r3
776 sbcs r1,r4
777 lsrs r2,r1,#21
778 ldr r3,=exptab2
779 add r3,r3,r2,lsl#3
780 ldmia r3,{r3-r4}
781 add r2,r2,r2
782 add r2,#4096
783 mla r5,r5,r2,r5             @ y Q26
784 subs r0,r3
785 sbcs r1,r4
786
787 movs r2,r1,lsl#10
788 orrs r2,r2,r0,lsr#22
789 adc r2,r2,#0                @ ε Q42, rounded
790 smull r3,r4,r2,r2           @ ε² Q84-32=Q52
791 lsrs r3,#21
792 orrs r3,r3,r4,lsl#11
793 adds r0,r0,r3
794 adc r1,r1,r4,lsr#21         @ ε+ε²/2 Q64
795 smull r3,r4,r4,r2           @ Q52*Q42=Q94; Q94-32=Q62
796 mov r3,#0x55555555          @ 1/6 Q33
797 smull r3,r4,r3,r4           @ ε³/6 Q63
798 smmulr r3,r4,r1             @ ε⁴/6+ε⁵/12 Q63+Q32-32=Q63
799 add r4,r4,r3,lsr#2
800 adds r2,r0,r4,lsl#1
801 adc r3,r1,r4,asr#31
802 lsls r1,r5,#3               @ y Q29
803 umull r4,r0,r1,r2           @ εlo * y Q61+32
804 smlal r0,r1,r1,r3           @ εhi * y + y Q61
805@ assert result is in range 1..2
806 lsrs r0,#9
807 adcs r0,r0,r1,lsl#23
808 lsr r1,#9
809 adcs r1,r1,r12,lsl#20
810 pop {r4-r8,r15}
811
81220:
813@ process Inf/NaN for dexp
814 cmp r3,#0x400
815 bne 22f
816 orrs r2,r0,r1,lsl#12
817 ite eq
818 biceq r1,r1,r1,asr#31       @ +Inf -> + Inf; -Inf -> +0
819 orrne r1,r1,#0x00080000
820 bx r14
821
82222:
823 movs r0,#0
824 movs r1,#0
825 tst r12,#1
826 it eq
827 movteq r1,0x7ff0
828 bx r14
829
8301:                           @ under/overflow
831 mov r0,#0
832 mov r1,#0
833 it ge
834 movtge r1,#0x7ff0
835 pop {r4-r8,r15}
836
837
838
839
840double_wrapper_section log
841
8421:
843 movlong r1,0xfff00000       @ return -Inf
844 movs r0,#0
845 bx r14
846
8474:
848 orrs r2,r0,r1,lsl#12
849 it ne
850 orrne r1,#0x00080000
851 bx r14
852
85310:
854 mvns r5,r6,asr#22           @ very small argument?
855 bne 10f
856 mov r4,#4096
857 b 11f
858
859@ check for argument near 1: here
860@ r1 : mantissa
861@ r12: exponent, -1 or 0
86212:
863 eor r3,r12,r1,lsr#12
864 lsls r3,r3,#24              @ check 8 bits of mantissa
865 bne 12f                     @ not very close to 1
866 cmp r12,#0
867 bne 13f
868@ argument is 1+ε, result will be positive
869 lsls r1,#19
870 orrs r1,r1,r0,lsr#13
871 lsls r0,#19
872@ r0:r1 ε Q71 0≤ε<2^-8
873 clz r4,r1                   @ r41
874 cmp r4,#32
875 bhs 14f
876 movs r5,#1
877 lsls r5,r4
878 umull r2,r3,r0,r5
879 mla r3,r1,r5,r3             @ r2:r3 ε Q71+r4
880 umull r12,r5,r0,r3
881 umull r12,r6,r1,r2
882 umaal r5,r6,r1,r3           @ r5:r6 ε² Q142+r4-64 = Q78+r4
883
884 subs r2,r2,r5,lsr#8
885 sbc r3,r3,#0
886 subs r2,r2,r6,lsl#24
887 sbcs r3,r3,r6,lsr#8
888
889 umull r12,r7,r0,r6
890 umull r12,r8,r1,r5
891 umaal r7,r8,r1,r6           @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=1 and Q86, so r8<0x40000000
892 mov r5,#0x55555555          @ ~1/3 Q32
893
894 umull r12,r6,r7,r5
895 movs r12,#0
896 umlal r6,r12,r8,r5
897 adds r6,r6,r12
898 adc r12,r12,#0              @ multiply by 0x5555555555555555
899
900 adds r2,r2,r6,lsr#14
901 adc r3,r3,#0
902 adds r2,r2,r12,lsl#18
903 adc r3,r3,r12,lsr#14
904
905 smmulr r5,r8,r1             @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32
906 movs r7,#0x33333333         @ 1/5 Q32
907 smmulr r6,r5,r1             @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40
908 smmulr r8,r6,r7             @ ε⁵/5 Q67+r4 ~ 2^-42
909 sub r7,r5,r8,lsr#5
910 smmulr r5,r6,r1             @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48
911 movt r6,#0x2a80             @ 1/6 Q32 fiddled slightly (PMC)
912 smmulr r5,r6,r5             @ ε⁶/6 Q75+r4 ~ 2^-50
913 add r7,r7,r5,lsr#12
914
915 subs r0,r2,r7,lsl#9
916 sbc r1,r3,r7,lsr#23
917
918 rsb r4,#0x400
919 sub r4,#0x15
920 bl dpack_q
921 pop {r4-r8,r15}
922
923@ here we have positive ε sufficiently small we need (at most) a quadratic term
92414:
925@ here r0Q71, 0≤ε<2^-40
926 clz r4,r0
927 lsls r1,r0,r4               @ ε Q71+r4
928 umull r2,r3,r0,r1           @ ε² Q142+r4
929 mov r0,#0
930 subs r0,r0,r3,lsr#8
931 sbc r1,r1,#0
932 rsb r4,#0x400
933 sub r4,#0x35
934 bl dpack_q
935 pop {r4-r8,r15}
936
937
938
93913:                          @ argument is 1-ε, result will be negative
940 movs r1,r1,lsl#18
941 orrs r1,r1,r0,lsr#14
942 movs r0,r0,lsl#18
943 rsbs r0,#0
944 sbc r1,r1,r1,lsl#1
945@ r0:r1Q71 -2^-9≤ε<0
946 clz r4,r1
947 cmp r4,#32
948 bhs 15f
949 subs r4,#1                  @ 0≤r4<31
950 movs r5,#1
951 lsls r5,r4
952 umull r2,r3,r0,r5
953 mla r3,r1,r5,r3             @ r2:r3 ε Q71+r4
954 umull r12,r5,r0,r3
955 umull r12,r6,r1,r2
956 umaal r5,r6,r1,r3           @ r5:r6 ε² Q142+r4-64 = Q78+r4
957
958 adds r2,r2,r5,lsr#8
959 adc r3,r3,#0
960 adds r2,r2,r6,lsl#24
961 adcs r3,r3,r6,lsr#8
962
963 umull r12,r7,r0,r6
964 umull r12,r8,r1,r5
965 umaal r7,r8,r1,r6           @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=0 and Q85, so r8<0x20000000
966 mov r5,#0x55555555          @ ~1/3 Q32
967
968 umull r12,r6,r7,r5
969 movs r12,#0
970 umlal r6,r12,r8,r5
971 adds r6,r6,r12
972 adc r12,r12,#0              @ multiply by 0x5555555555555555
973
974 adds r2,r2,r6,lsr#14
975 adc r3,r3,#0
976 adds r2,r2,r12,lsl#18
977 adc r3,r3,r12,lsr#14
978
979 smmulr r5,r8,r1             @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32
980 movs r7,#0x33333333         @ 1/5 Q32
981 smmulr r6,r5,r1             @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40
982 smmulr r8,r6,r7             @ ε⁵/5 Q67+r4 ~ 2^-42
983 add r7,r5,r8,lsr#5
984 smmulr r5,r6,r1             @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48
985 movt r6,#0x2a80             @ 1/6 Q32 fiddled slightly (PMC)
986 smmulr r5,r6,r5             @ ε⁶/6 Q75+r4 ~ 2^-50
987 add r7,r7,r5,lsr#12
988
989 adds r0,r2,r7,lsl#9
990 adc r1,r3,r7,lsr#23
991
992 rsb r4,#0x400
993 sub r4,#0x15
994 bl dpack_q
995 orr r1,r1,#1<<31
996 pop {r4-r8,r15}
997
998@ here we have negative ε sufficiently small we need (at most) a quadratic term
999@ here r0Q71, |ε|<2^-41
100015:
1001 clz r4,r0
1002 lsls r1,r0,r4               @ ε Q71+r4
1003 umull r2,r3,r0,r1           @ ε² Q142+r4
1004 mov r0,r3,lsr#8
1005 rsb r4,#0x400
1006 sub r4,#0x35
1007 bl dpack_q
1008 eors r1,r1,#0x80000000
1009 pop {r4-r8,r15}
1010
1011wrapper_func log
1012 lsls r12,r1,#1
1013 bcs 1b                      @ x<0?
1014 lsrs r12,#21
1015 beq 1b                      @ x==0/denormal?
1016 sub r12,#0x3ff
1017 cmp r12,#0x400              @ +Inf/NaN?
1018 beq 4b
1019 movs r2,#1
1020 bfi r1,r2,#20,#12           @ set implied 1, clear exponent Q52
1021 push {r4-r8,r14}
1022 cmp r12,r12,asr#31          @ exponent = -1 or 0?
1023 beq 12b
102412:
1025 lsrs r4,r1,#16
1026 ldr r5,=logtab0-16*8
1027 add r5,r5,r4,lsl#3
1028 ldmia r5,{r2-r3}
1029 and r5,r2,#63
1030 add r5,#64
1031 umull r0,r6,r5,r0
1032 mla r1,r5,r1,r6             @ Q59
1033
1034 mvn r4,r1,asr#19
1035 and r4,#31
1036 ldr r5,=exptab1
1037 add r5,r5,r4,lsl#3
1038 ldmia r5,{r5-r6}
1039 adds r2,r2,r5
1040 adc r3,r3,r6
1041 add r4,#256
1042 umull r0,r6,r4,r0
1043 mla r6,r4,r1,r6             @ r0:r6 Q67
1044
1045 mvns r4,r6,asr#24
1046 beq 10b                     @ small argument at this stage?
104710:
1048 ldr r5,=exptab2
1049 add r5,r5,r4,lsl#3
1050 ldmia r5,{r1,r5}
1051 adds r2,r2,r1
1052 adc r3,r3,r5
1053 mov r5,#4097
1054 add r4,r5,r4,lsl#1          @ 4097+2k
105511:
1056 lsls r4,#17
1057 umull r5,r0,r4,r0
1058 rsb r1,r4,r4,lsl#3
1059 umlal r0,r1,r4,r6           @ Q96=Q64
1060
1061@ r0:r1 ε Q64
1062@ r2:r3 y Q64
1063
1064 eor r4,r0,r1,asr#31
1065 eor r5,r1,r1,asr#31         @ r4:r5 |ε| Q64
1066 umull r6,r7,r5,r5
1067 umull r4,r8,r4,r5
1068 lsrs r7,#1
1069 rrx r6,r6
1070 adds r6,r6,r8
1071 adc r7,r7,#0                @ r6:r7 ε²/2 Q64
1072
1073 movs r4,r1,lsl#10
1074 orrs r4,r4,r0,lsr#22
1075 adc r4,r4,#0                @ ε Q42, rounded
1076
1077 subs r0,r0,r6
1078 sbc r1,r1,r7                @ r0:r1 ε-ε²/2 Q64
1079
1080 smmulr r5,r4,r4             @ ε² Q42+42-32=Q52
1081 smmulr r6,r5,r4             @ ε³ Q52+42-32=Q62
1082 smmulr r7,r5,r5             @ ε⁴ Q52+52-32=Q72 ε⁴/4 Q74
1083 mov r4,#0x55555555          @ 1/3 Q32
1084 smmulr r6,r6,r4             @ ε³/3 Q62
1085 subs r6,r6,r7,lsr#12        @ ε³/3-ε⁴/4 Q62
1086
1087 adds r0,r0,r6,lsl#2         @ Q64
1088 adc r1,r1,r6,asr#30
1089
1090 subs r0,r2,r0
1091 sbc r1,r3,r1
1092 ldr r2,=exprrdata
1093 ldmia r2,{r2,r5}
1094 adds r12,#1
1095 ble 1f
1096@ positive result
1097 umull r2,r3,r2,r12
1098 movs r4,#0
1099 umlal r3,r4,r5,r12
1100 subs r2,r2,r0
1101 sbcs r3,r3,r1
1102 sbc r4,r4,#0
1103 movs r1,#0x40000000
1104 b 2f                        @ to pack result
1105
1106@ negative result
11071:
1108 rsbs r12,#0
1109 umull r2,r3,r2,r12
1110 movs r4,#0
1111 umlal r3,r4,r5,r12
1112 adds r2,r0,r2
1113 adcs r3,r1,r3
1114 adc r4,r4,#0
1115 movs r1,#0xc0000000
11162:
1117 cbnz r4,2f
1118 movs r4,r3
1119 movs r3,r2
1120 movs r2,#0
1121 subs r1,#32<<20
11221:
1123@ here r3:r4 is guaranteed nonzero
1124 cbnz r4,2f
1125 movs r4,r3
1126 movs r3,#0
1127 subs r1,#32<<20
11282:
1129 clz r5,r4
1130 sub r6,r5,#0x1d
1131 sub r1,r1,r6,lsl#20
1132 lsls r4,r5
1133 lsls r0,r3,r5
1134 rsb r5,#32
1135 lsrs r3,r5
1136 orrs r4,r4,r3
1137 lsrs r2,r5
1138 orrs r0,r0,r2
1139@ now r0:r4 is normalised to Q63
1140 lsrs r0,#11
1141 adcs r0,r0,r4,lsl#21        @ with rounding
1142 adc r1,r1,r4,lsr#11
1143 pop {r4-r8,r15}
1144
1145@===========================================
1146
1147double_section trigtab
1148trigtab:
1149//     φ Q64 lo    φ Q64 hi    sin φ Q31   cos φ Q31
1150.long 0x25735c0b, 0x03f574a9, 0x01fab529, 0x7ffc1500
1151.long 0x00aa2feb, 0x0c44d954, 0x0621d2c8, 0x7fda601f
1152.long 0x42e86336, 0x13d9d3cf, 0x09ea5e3c, 0x7f9d88f0
1153.long 0x046dc42f, 0x1c0a86d9, 0x0dfe171f, 0x7f3b9ed0
1154.long 0xec509ba7, 0x23ebf53e, 0x11e6e7bc, 0x7ebdefc7
1155.long 0x039c1cd2, 0x2bf112b2, 0x15dcf546, 0x7e1e7749
1156.long 0x3d7a05ca, 0x33f293f4, 0x19cbc014, 0x7d5fac9c
1157.long 0x7aefa0a0, 0x3c19321d, 0x1dc6221d, 0x7c7d2f2f
1158.long 0x12fb0fb5, 0x4450ef70, 0x21c10c53, 0x7b782235
1159.long 0x476b8d5c, 0x4bf1a045, 0x256adc18, 0x7a68ad16
1160.long 0x576940c1, 0x53ead96d, 0x29361639, 0x792f2d3c
1161.long 0xd34eeadf, 0x5bb34289, 0x2ce039d6, 0x77e02625
1162.long 0x31ea5069, 0x641107d9, 0x30c4d3c0, 0x76586270
1163.long 0xf36756cb, 0x6bfda2b3, 0x34687e1c, 0x74c77a22
1164.long 0x2f7aed0e, 0x73e07531, 0x37fae7cc, 0x731c1127
1165.long 0xfcc48ec0, 0x7c14790e, 0x3ba3a70e, 0x7141cd8e
1166.long 0x3b04b713, 0x83547b8f, 0x3ed289d4, 0x6f85d8eb
1167.long 0x135b369a, 0x8c0b6fd9, 0x4294ead4, 0x6d51efa3
1168.long 0x4aca525f, 0x9439d326, 0x460a6e70, 0x6b230540
1169.long 0x41c44083, 0x9c0e0b34, 0x4948b03d, 0x68f1ef07
1170.long 0xa36d08bf, 0xa47961fd, 0x4cb1f285, 0x667a849d
1171.long 0xc8f81636, 0xabe547d2, 0x4fa2221e, 0x6436622f
1172.long 0xd0c8c9ad, 0xb3feade1, 0x52c37024, 0x61a4af86
1173.long 0x64730e73, 0xbbfe356a, 0x55c5f028, 0x5f02a3a3
1174.long 0xb08ca5c6, 0xc412b955, 0x58ba9208, 0x5c4194a3
1175.long 0x54530090, 0xcbb02d37, 0x5b6ef419, 0x59938ed8
1176.long 0xa18d7c36, 0xd3aa3c6e, 0x5e2e0225, 0x56af32fc
1177.long 0x48a2c7d0, 0xdbff19f6, 0x60f35332, 0x5392ed7e
1178.long 0x7aad9a94, 0xe422ade1, 0x638edfca, 0x50732bc7
1179.long 0x19ef15ff, 0xebfb85bd, 0x65fa18bb, 0x4d5c61c7
1180.long 0x7e0f96bd, 0xf44c4b49, 0x686f81f7, 0x4a0217e6
1181.long 0x1def09eb, 0xfc4dbdfd, 0x6ab2d2c4, 0x46b4e413
1182// maximum e=0.002617 = 0.167497*2^-6
1183
1184double_wrapper_section atan2
1185
1186@ datan small reduced angle case I
118720:
1188@ r0:r1 has z=y/x in IEEE format, <2^-11
1189@ r2 e+11
1190 rsbs r12,r2,#0              @ shift down of mantissa required to get to Q63 >0
1191 subs r10,r12,#32
1192 bge 1f                      @ very small reduced angle?
1193 bfi r1,r3,#20,#12           @ fix up mantissa
1194 cmp r7,#4
1195 bhs 2f                      @ at least one quadrant to add? then don't need extreme accuracy
1196@ otherwise we need to do a power series for accuracy
1197
1198 rsbs r10,#0
1199 lsr r3,r1,r12
1200 lsr r2,r0,r12
1201 lsl r9,r1,r10
1202 orr r2,r9
1203@ r2:r3 z Q63 (with r12 bits of loss of precision)
1204 lsls r1,#11
1205 orrs r1,r1,r0,lsr#21
1206 lsls r0,#11
1207@ r0:r1 z Q74+r12
1208 umull r9,r4,r2,r3
1209 umull r9,r5,r2,r3
1210 umaal r4,r5,r3,r3
1211@ r4:r5 z² Q62 < 2^-22
1212 umull r9,r2,r0,r5
1213 umull r9,r3,r1,r4
1214 umaal r2,r3,r1,r5
1215@ r2:r3 z³ Q72+r12 <2^-33
1216 umull r9,r6,r2,r5
1217 umull r9,r8,r3,r4
1218 umaal r6,r8,r3,r5
1219@ r6:r8 z⁵ Q70+r12 <2^-55; in fact r8 is always 0
1220 mov r9,#0xaaaaaaaa          @ 2/3 Q32
1221 umull r10,r4,r2,r9
1222 movs r5,#0
1223 umlal r4,r5,r3,r9
1224 adds r4,r4,r5
1225 adcs r5,r5,#0
1226@ r4:r5 z³*2/3 Q72+r12 = z³/3 Q73+r12
1227 mov r9,#0x33333333          @ 1/5 Q32
1228 umull r2,r3,r6,r9
1229@ r3 z⁵*1/5 Q70+r12
1230 subs r4,r4,r3,lsl#3
1231 sbc r5,r5,#0
1232@ r4:r5 z³/3-z⁵/5 Q73+r12
1233 subs r0,r0,r4,lsl#1
1234 sbc r1,r1,r5,lsl#1
1235 sub r1,r1,r4,lsr#31
1236@ r0:r1 z-z³/3+z⁵/5 Q74+r12
1237 rsb r4,r12,#0x400
1238 sub r4,#24
1239 b 60f                       @ pack and return
1240
12412:
1242 rsbs r10,#0
1243 lsls r4,r1,r10
1244 lsrs r0,r0,r12
1245 lsrs r1,r1,r12
1246 orrs r0,r0,r4               @ shift down r12 places
1247 b 50f
1248
1249
12501:
1251 cmp r7,#4
1252 bhs 2f                      @ at least one quadrant to add?
1253 eors r1,r1,r7,lsl#31        @ no: return y/x with the correct sign
1254 pop {r4-r11,r15}
1255
12562:
1257 bfi r1,r3,#20,#12           @ fix up mantissa
1258 usat r10,#6,r10             @ saturate r10 to 63
1259 lsrs r0,r1,r10
1260 movs r1,#0                  @ shift down 32+r10 places
1261 b 40f
1262
1263@ datan small reduced angle case II
126410:
1265 lsrs r1,#1
1266 rrxs r0,r0
1267 movs r2,#0
1268 movs r3,#0
1269 movs r6,#0
1270 mov r7,#1<<30
1271 b 11f
1272
1273@ case where reduced (x',y') has x' infinite
127471:
1275 sbfx r4,r1,#20,#11
1276 movs r0,#0
1277 movs r1,#0
1278 cmn r4,#1                   @ y' also infinite?
1279 bne 80f
1280 movt r1,#0x3ff0             @ both infinite: pretend ∞/∞=1
1281 b 80f
1282
1283@ case where reduced (x',y') has y' zero
128470:
1285 ubfx r4,r3,#20,#11
1286 movs r0,#0
1287 movs r1,#0
1288 cbnz r4,80f                 @ x' also zero?
1289 tst r7,#4
1290 beq 80f                     @ already in quadrants 0/±2? then 0/0 result will be correct
1291 tst r7,#2
1292 ite eq
1293 addeq r7,#6
1294 subne r7,#6                 @ fix up when in quadrants ±0
1295 b 80f
1296
129790:
1298 movs r0,r2
1299 movs r1,r3
130091:
1301 orrs r1,r1,#0x00080000
1302 bx r14
1303
1304wrapper_func atan2
1305 cmp r2,#1                   @ set C if low word is non-zero
1306 adc r12,r3,r3
1307 cmn r12,#0x00200000         @ y NaN?
1308 bhi 90b
1309 cmp r0,#1                   @ set C if low word is non-zero
1310 adc r12,r1,r1
1311 cmn r12,#0x00200000         @ x NaN?
1312 bhi 91b
1313 push {r4-r11,r14}
1314 lsrs r7,r1,#31              @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition
1315 bic r1,#1<<31
1316@ y now positive
1317 movs r3,r3
1318 bpl 1f
1319
1320@ here y positive, x negative
1321 adds r7,#10
1322 bic r3,r3,#1<<31
1323 cmp r3,r1
1324 bhi 4f                      @ |x| > y: 3rd octant
1325@ |x| < y: 2nd octant
1326 subs r7,#6
1327 b 3f
1328
13291:
1330@ here x and y positive
1331 cmp r3,r1
1332 bhi 4f
1333@ x < y: 1st octant
1334 adds r7,#6
13353:
1336 movs r4,r2                  @ exchange x and y
1337 movs r5,r3
1338 movs r2,r0
1339 movs r3,r1
1340 movs r0,r4
1341 movs r1,r5
13424:
1343
1344@ here
1345@ r0:r1 y'
1346@ r2:r3 x'
1347@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is
1348@ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r7b3..2, the inner negation
1349@ is given by r7b1 and the outer negation by r7b0. x' can be infinite, or both x' and
1350@ y' can be infinite, but not y' alone.
1351
1352 sbfx r4,r3,#20,#11
1353 cmn r4,#1
1354 beq 71b                     @ x' infinite?
1355 ubfx r4,r1,#20,#11
1356 cmp r4,#0
1357 beq 70b                     @ y' zero?
1358 bl __aeabi_ddiv
135980:
1360@ r0:r1 y/x in IEEE format, 0..1
1361 lsr r2,r1,#20               @ exponent
1362 movs r3,#1
1363 subs r2,#0x3ff-11
1364 bmi 20b
1365 bfi r1,r3,#20,#12           @ fix up mantissa
1366 movs r3,#1
1367 lsl r3,r2
1368 umull r0,r4,r0,r3
1369 mla r1,r1,r3,r4
137050:
1371 push {r7}                   @ save flags
1372
1373@ from here atan2(y,1) where 1 implied
1374@ r0:r1 y Q63 0≤y<1+δ
1375
1376 lsrs r2,r1,#16
1377 cmp r2,#0x100
1378 blo 10b                     @ small angle?
1379 mul r3,r2,r2                @ y^2
1380 movw r4,#0x895c
1381 muls r2,r2,r4               @ y*0x895c
1382 movw r5,#0x1227
1383 lsrs r3,#14
1384 mls r2,r3,r5,r2
1385 subs r2,#0x330000           @ Chebyshev approximation to atn(y)
1386 lsrs r2,#25
1387 ldr r3,=trigtab
1388 add r3,r3,r2,lsl#4
1389 ldmia r3,{r2-r5}
1390 lsrs r3,#1
1391 rrxs r2,r2
1392@ r2:r3 phi0 Q63
1393@ r4    sphi0
1394@ r5    cphi0
1395 umull r12,r6,r4,r0
1396 movs r7,#0
1397 umlal r6,r7,r4,r1
1398 adds r6,r6,r5,lsl#31
1399 adc r7,r7,r5,lsr#1          @ x0= ((i128)cphi0<<31)+(((i128)sphi0*(i128)y)>>32); // Q62
1400@ r6:r7 x0
1401 umull r12,r0,r5,r0
1402 movs r8,#0
1403 umlal r0,r8,r5,r1
1404 subs r0,r0,r4,lsl#31
1405 sbc r1,r8,r4,lsr#1          @ y0=-((i128)sphi0<<31)+(((i128)cphi0*(i128)y)>>32); // Q62
140611:
1407@ r0:r1 y0
1408@ r2:r3 phi0
1409@ r6:r7 x0
1410
1411 lsls r4,r1,#6
1412 orr r4,r4,r0,lsr#26
1413 lsrs r5,r7,#15
1414 sdiv r4,r4,r5               @  t2=(y0>>26)/(x0>>47); // Q62-26/Q62-47=Q21
1415
1416 mul r5,r4,r4                @ t2_2 Q42
1417 add r3,r3,r4,lsl#10         @ phi0+t2
1418 smull r8,r9,r4,r5           @ t2_3 Q63
1419 mov r10,r9,lsl#16
1420 orr r10,r10,r8,lsr#16
1421 smmulr r10,r10,r5           @ t2_5 Q57
1422 mov r12,#0x66666666         @ 1/5 Q33
1423 smmulr r11,r10,r5           @ t2_7 Q67
1424 smmulr r10,r10,r12          @ t2_5/5 Q57+33-32=Q58
1425
1426 movlong r12,0x124925        @ 1/7 Q23
1427 smmulr r11,r11,r12          @ t2_7/7 Q67+23-32=Q58
1428 mov r12,#0x55555555
1429 sub r11,r11,r11,asr#12      @ Q58 PMC correction
1430 sub r10,r10,r11
1431
1432 adds r2,r2,r10,lsl#5
1433 adc r3,r3,r10,asr#27        @ Q63 phi0 + t_2 + t2_5/5 - t2_7/7 + t2_7/7/4096
1434 umull r5,r10,r8,r12
1435 mov r11,#0
1436 smlal r10,r11,r9,r12        @ t2_3 * 0x55555555
1437 adds r10,r10,r11
1438 adc r11,r11,r11,asr#31      @ t2_3/3 Q63
1439 subs r2,r2,r10
1440 sbc r3,r3,r11               @ Q63 phi0+phi1
1441
1442 lsls r4,r4,#11              @ t2 Q32
1443 umull r5,r8,r4,r0           @ t2*y0l
1444 it mi
1445 submi r8,r8,r0              @ correction if t2 is negative
1446 mov r9,r8,asr#31            @ sign extend
1447 smlal r8,r9,r4,r1           @ t2*y0h
1448@ r8:r9 (t2*y0)<<11
1449
1450 umull r5,r10,r4,r6          @ t2*x0l
1451 it mi
1452 submi r10,r10,r6            @ correction if t2 is negative
1453 mov r11,r10,asr#31          @ sign extend
1454 smlal r10,r11,r4,r7
1455@ r10:r11 (t2*x0)<<11
1456
1457 adds r6,r8
1458 adc r7,r7,r9
1459 subs r0,r10
1460 sbc r1,r1,r11
1461@ r0:r1 y1=y0-t2*x0
1462@ r2:r3 phi0+phi1
1463@ r6:r7 x1=x0+t2*y0
1464
1465 mov r4,#0xffffffff
1466 lsrs r5,r7,#14
1467 udiv r4,r4,r5               @ rx1 Q16
1468 lsrs r5,r0,#11
1469 orrs r5,r5,r1,lsl#21        @ N set according to y1, hence also t3
1470 smmul r5,r4,r5              @ t3=(y1>>11)*rx1 Q35
1471 lsr r6,r6,#3
1472 orr r6,r6,r7,lsl#29
1473 umull r11,r8,r5,r6          @ t3*x1l
1474 lsr r10,r7,#3
1475 it mi
1476 submi r8,r8,r6              @ correction if t3 is negative
1477 mla r8,r5,r10,r8
1478 adds r2,r2,r5,lsl#28
1479 adc r3,r3,r5,asr#4
1480 sub r0,r0,r8
1481@ r0: y2
1482@ r2:r3 phi0+phi1+phi2
1483@ r4: rx1
1484@ r5: t3
1485
1486 smull r8,r9,r0,r4           @ y2*rx1
1487@ stall
1488 lsrs r8,#14
1489 orr r8,r8,r9,lsl#18         @ t4
1490 smmlsr r0,r8,r7,r0
1491 adds r2,r2,r8,asr#1
1492 adc r3,r3,r8,asr#31
1493@ r0: y3
1494@ r4: rx1
1495 mul r4,r4,r0
1496 adds r0,r2,r4,asr#15
1497 adc r1,r3,r4,asr#31
1498@ r0:r1 result over reduced range Q63
1499 pop {r7}                    @ restore flags
150040:
1501 lsrs r1,#1
1502 rrxs r0,r0
1503@ r0:r1 result over reduced range Q62
1504 lsl r6,r7,#30               @ b1 -> b31
1505 eor r0,r0,r6,asr#31         @ negate if required
1506 eor r1,r1,r6,asr#31
1507 movlong r2,0x10B4611A       @ π/2 Q62 low word
1508 movlong r3,0x6487ED51       @ π/2 Q62 high word
1509 lsr r6,r7,#2                @ quadrants to add
1510 umlal r0,r1,r6,r2
1511 mla r1,r6,r3,r1
1512 mov r4,#0x400-12            @ for packing Q62
151360:
1514 bl dpack_q
1515 eors r1,r1,r7,lsl#31
1516 pop {r4-r11,r15}
1517
1518#endif
1519