1/*
2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#if !PICO_RP2040
8#include "pico/asm_helper.S"
9
10pico_default_asm_setup
11
12.macro float_section name
13#if PICO_FLOAT_IN_RAM
14.section RAM_SECTION_NAME(\name), "ax"
15#else
16.section SECTION_NAME(\name), "ax"
17#endif
18.endm
19
20.macro float_wrapper_section func
21float_section WRAPPER_FUNC_NAME(\func)
22.endm
23
24.extern rtwopi
25.extern logtab0
26.extern exptab0
27.extern exptab1
28.extern exptab2
29.extern trigtab
30
31@ load a 32-bit constant n into register rx
32.macro movlong rx,n
33 movw \rx,#(\n)&0xffff
34 movt \rx,#((\n)>>16)&0xffff
35.endm
36
37float_section frrcore
38
39@ input:
40@ r0 mantissa m Q23
41@ r1 exponent e>=-32, typically offset by +9
42@ output:
43@ r0..r1 preserved
44@ r6 range reduced result
45@ r2,r3,r4,r5 trashed
46frr_core:
47 ldr r2,=rtwopi
48 asrs r3,r1,#5               @ k=e/32, k<=5 for e offsets up to 9+32
49 add r2,r2,r3,lsl#2 @ p
50 and r3,r1,#31               @ s=e%32
51 mov r4,#1
52 lsls r4,r4,r3               @ 1<<s
53 umull r3,r4,r4,r0
54@ r2    p
55@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
56 ldr r5,[r2,#12]             @ a0=p[3]
57 umull r5,r6,r5,r4           @ r0=a0*u1 hi, discard lo
58@ r6  r0
59 ldr r5,[r2,#8]              @ a1=p[2]
60 mla r6,r5,r4,r6             @ a1*u1 lo, discard hi
61 umlal r5,r6,r5,r3           @ a1*u0 hi, discard lo
62@ r6  r0
63 ldr r5,[r2,#4]              @ a2=p[1]
64 mla r6,r5,r3,r6             @ r0+=a2*u0
65 bx r14
66
67
68float_wrapper_section expf
69
702:
71@ we could use fadd macro to calculate x+1 here
72@ need to add quadratic term
73 rsb r3,#1                   @ 1-(e+9)
74 lsrs r0,r3                  @ Q31
75 smmulr r1,r0,r0             @ Q30
76 cmp r12,#0
77 bne 1f
78 add r0,r0,r1
79 lsrs r0,#8
80 adc r0,r0,#0x3f800000       @ result is just over 1
81 bx r14
821:
83 sub r0,r1,r0
84 asrs r0,#7
85 adc r0,r0,#0x3f800000       @ result is just under 1
86 bx r14
87
887:
89 movs r1,#1
90 bfi r0,r1,#23,#9            @ implied 1, clear sign bit
91 adds r3,#9
92 bmi 2b                      @ <~2^-9? return 1+x+x²/2
93 movlong r1,0xb17217f8       @ ln2 Q32
94 lsls r0,r3                  @ Q32
95 cmp r12,#0
96 bne 8f
97 cmp r0,r1
98 blo 6f
99 sub r0,r0,r1
100 mov r12,#1
101 b 6f
102
1038:
104 mvn r12,#0
105 subs r0,r1,r0
106 bhs 6f
107 add r0,r0,r1
108 mvn r12,#1
109 b 6f
110
111wrapper_func expf
112 ands r12,r0,#1<<31          @ save sign
113 ubfx r3,r0,#23,#8           @ get exponent
114 subs r3,r3,#0x7f
115 bmi 7b                      @ e<0?
116 cmp r3,#7
117 bge 20f                     @ overflow, Inf or NaN?
118 movs r1,#1
119 bfi r0,r1,#23,#9            @ implied 1, clear sign bit
120 lsls r0,r3                  @ x Q23
121 eors r0,r0,r12,asr#31
122 adds r0,r0,r12,lsr#31       @ negate if sign bit set
123 mov r1,#0xB8AA              @ 1/ln2 Q15, rounded down
124 add r1,r1,r12,lsr#31        @ rounded up if we have a negative argument
125 smull r1,r2,r1,r0           @ Q38
126 movlong r1,0xb17217f8       @ ln2 Q32
127 asr r12,r2,#6               @ Q0 exponent candidate
128 lsls r0,#9                  @ Q32
129 mls r0,r12,r1,r0            @ Q32-Q0*Q32=Q32
130
1316:
132 lsrs r1,r0,#28
133 ldr r2,=exptab0
134 add r2,r2,r1,lsl#3
135 ldmia r2,{r2-r3}
136 and r2,r2,#63
137 orr r2,#64                  @ y=(t&0x3f)+0x40; Q6
138 sub r0,r3
139
140 lsrs r1,r0,#24
141 ldr r3,=exptab1+4
142 ldr r3,[r3,r1,lsl#3]
143 add r1,#256
144 muls r2,r2,r1 @ y Q14
145 sub r0,r3
146
147 lsrs r1,r0,#21
148 ldr r3,=exptab2+4
149 ldr r3,[r3,r1,lsl#3]
150 add r1,r1,r1
151 add r1,#4096
152 mla r2,r2,r1,r2             @ y Q26
153 sub r0,r3                   @ ε Q32
154 smmlar r2,r2,r0,r2          @ y(1+ε) Q26
155
156 cmp r2,#1<<27
157 bhs 1f
1582:
159 cmp r12,#0x7f
160 bgt 23f
161 cmn r12,#0x7e
162 blt 23f
163 lsls r0,r12,#23
164 adds r0,r0,r2,lsr#3
165 adc r0,r0,#0x3f000000
166 bx r14
167
168@ mantissa calculation has overflowed
1691:
170 lsrs r2,#1
171 adds r12,#1
172 b 2b
173
17420:
175@ process Inf/NaN for fexp
176 cmp r3,#0x80
177 bne 22f
178 lsls r2,r0,#9
179 ite eq
180 biceq r0,r0,r0,asr#31       @ +Inf -> + Inf; -Inf -> +0
181 orrne r0,r0,#0x00400000
182 bx r14
183
18423:
185 ands r12,r12,#1<<31
18622:
187 eors r0,r12,#1<<31
188 subs r0,r0,r0,lsr#8         @ overflow: convert to +0 or +Inf
189 bx r14
190
191
192
193
194float_wrapper_section logf
195
1961:
197 movlong r0,0xff800000       @ return -Inf
198 bx r14
199
2004:
201 lsls r1,r0,#9
202 it ne
203 orrne r0,r0,#0x00400000
204 bx r14
205
206@ here result may be close to zero
2075:
208 sub r1,r0,#0x3f800000
209 bmi 7f
210 cmp r1,#0x8000
211 bge 6f                      @ |ε|>~2^-8?
212@ here ε positive
213 clz r12,r1
214 sub r3,r12,#1
215 b 8f
216
2177:
218 rsbs r2,r1,#0
219 cmp r2,#0x8000
220 bge 6f                      @ |ε|>~2^-8?
221@ here ε negative
222@ r1: x Q24
223 clz r12,r2
224 sub r3,r12,#2
2258:
226 sub r12,#10
227 lsls r0,r1,r3               @ ε Q24+r3
228 beq 10f                     @ ε=0?
229 smmulr r1,r0,r0
230 asrs r1,r1,r12
231 sub r0,r0,r1,asr#1          @ - ε²/2
232 smmulr r1,r1,r0
233 asrs r2,r1,r12
234 movs r3,#0x55
235 mul r2,r2,r3
236 adds r0,r0,r2,asr#8         @ + ~ ε³/3
237 rsb r1,r12,#117
238 b 9f
239
240wrapper_func logf
241 lsls r3,r0,#1
242 bcs 1b                      @ x<0?
243 lsrs r3,#24
244 beq 1b                      @ x==0/denormal?
245 sub r12,r3,#0x7e
246 cmp r12,#0x81               @ +Inf/NaN?
247 beq 4b
248 push {r14}
249 cmp r12,#1                  @ result will be near zero?
250 bls 5b
2516:
252 movs r2,#1
253 bfi r0,r2,#23,#9            @ set implied 1, clear exponent Q52
254
255 lsls r0,#8
256
257 lsrs r1,r0,#27
258 ldr r2,=logtab0-16*8
259 add r2,r2,r1,lsl#3
260 ldmia r2,{r2-r3}
261 lsls r2,#26
262 umlal r2,r0,r2,r0
263
264 mvn r1,r0,asr#24
265 ldr r2,=exptab1+4
266 ldr r2,[r2,r1,lsl#3]
267 add r3,r3,r2
268 lsls r1,#24
269 umlal r1,r0,r1,r0
270
271 ldr r2,=exptab2+4
272 mvn r1,r0,asr#21
273 ldr r2,[r2,r1,lsl#3]
274 lsls r1,#21
275 orr r1,#0x00100000
276 umlal r1,r0,r1,r0
277 add r3,r3,r2
278
279@ r0: ε Q32
280@ r3: log y
281 smmulr r1,r0,r0             @ ε² Q32
282 sub r3,r0
283 add r3,r3,r1,lsr#1          @ log u - ε + ε²/2 Q32
284 movlong r2,0xb17218         @ ln2 Q24, but accurate to 2^-29 or so
285 cmp r12,#1                  @ result will be near zero?
286 bls 2f
287 mul r2,r2,r12
288 mov r1,#125
289 add r3,#255                 @ reduce systematic error
290 subs r0,r2,r3,lsr#8
2919:
292 bmi 1f
293 bl fpack_q
29410:
295 pop {r15}
296
2972:
298 mov r1,#117
299 bne 3f @ r12=0?
300@ here r12=1
301 movlong r2,0xb17217f8
302 sub r0,r2,r3
303 bl fpack_q
304 pop {r15}
305
3063:
307 mov r0,r3
308 bl fpack_q
309 orrs r0,r0,#1<<31
310 pop {r15}
311
3121:
313 rsbs r0,#0
314 bl fpack_q
315 orrs r0,r0,#1<<31
316 pop {r15}
317
318
319float_wrapper_section atan2f
320
321@ fatan small reduced angle case
322@ r0 y/x in IEEE format, 0..2^-8
323@ r2 e+6 <0
324@ r3 #1
325@ r6 flags
32620:
327 rsbs r4,r2,#0               @ -e-6 = shift down of mantissa required to get to Q29 >0
328 cmp r4,#7                   @ -e-67 [ e-13 ]
329 bge 1f                      @ very small reduced angle? atn θ ~ θ
330@ do a power series
331 bfi r0,r3,#23,#9            @ fix up mantissa Q23-e
332 lsls r0,#7                  @ Q30-e
333 smmulr r1,r0,r0             @ θ² Q60-2e-Q32=Q28-2e
334 lsrs r1,r4                  @ Q28-2e + (e+6) = Q34-e
335 smmulr r1,r1,r0             @ θ³ Q34-e+Q30-e-Q32=Q32-2e
336 mov r3,#0x55555555          @ 1/3 Q32
337 lsrs r1,r4                  @ Q32-2e + e+6 = Q38-e
338 smmulr r1,r1,r3             @ θ³/3 Q38-e
339 sub r0,r0,r1,lsr#8          @ Q30-e
340 cmp r6,#4                   @ at least one quadrant to add?
341 bhs 2f
342 add r1,r2,#113
343 bl fpack_q
344 eors r0,r0,r6,lsl#31        @ fix sign
345 pop {r4-r6,r15}
346
347@ here reduced angle is < 2^-12
3481:
349 cmp r6,#4
350 bhs 3f                      @ at least one quadrant to add?
351 eors r0,r0,r6,lsl#31        @ no: return y/x with the correct sign
352 pop {r4-r6,r15}
353
3543:
355 bfi r0,r3,#23,#9            @ fix up mantissa
356 lsrs r0,r4                  @ Q29
357 lsls r0,#1                  @ Q30
358 b 40f
359
3602:
361 lsrs r0,r4                  @ Q30-e + e+6 = Q36
362 lsrs r0,#6                  @ Q30
363 b 40f
364
365@ case where reduced (x',y') has x' infinite
36671:
367 sbfx r4,r0,#23,#8
368 movs r0,#0
369 cmn r4,#1                   @ y' also infinite?
370 bne 80f
371 mov r0,#0x3f800000          @ both infinite: pretend ∞/∞=1
372 b 80f
373
374@ case where reduced (x',y') has y' zero
37570:
376 ubfx r4,r1,#23,#8
377 movs r0,#0
378 cbnz r4,80f                 @ x' also zero?
379 tst r6,#4
380 beq 80f                     @ already in quadrants 0/±2? then 0/0 result will be correct
381 tst r6,#2
382 ite eq
383 addeq r6,#6
384 subne r6,#6                 @ fix up when in quadrants ±0
385 b 80f
386
38790:
388 movs r0,r1
38991:
390 orrs r0,r0,#0x00400000
391 pop {r4-r6,r15}
392
393wrapper_func atan2f
394 push {r4-r6,r14}
395 lsls r4,r1,#1
396 cmp r4,#0xff000000
397 bhi 90b                     @ y NaN?
398 lsls r4,r0,#1
399 cmp r4,#0xff000000
400 bhi 91b                     @ x NaN?
401 lsrs r6,r0,#31              @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition
402 bic r0,#1<<31
403@ y now positive
404 movs r1,r1
405 bpl 1f
406
407@ here y positive, x negative
408 adds r6,#10
409 bic r1,r1,#1<<31
410 cmp r1,r0
411 bhi 4f                      @ |x| > y: 3rd octant
412@ |x| < y: 2nd octant
413 subs r6,#6
414 b 3f
415
4161:
417@ here x and y positive
418 cmp r1,r0
419 bhs 4f
420@ x < y: 1st octant
421 adds r6,#6
4223:
423 movs r2,r1                  @ exchange x and y
424 movs r1,r0
425 movs r0,r2
4264:
427@ here
428@ r0 y'
429@ r1 x'
430@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is
431@ ± (Q.π/2 ± atn y/x) where 0≤Q2 is a quadrant count in r6b3..2, the inner negation
432@ is given by r6b1 and the outer negation by r6b0. x' can be infinite, or both x' and
433@ y' can be infinite, but not y' alone.
434
435 sbfx r2,r1,#23,#8
436 cmn r2,#1
437 beq 71b                     @ x' infinite?
438 ubfx r2,r0,#23,#8
439 cmp r2,#0
440 beq 70b                     @ y' zero?
441 bl __aeabi_fdiv
44280:
443                             @ r0 y/x in IEEE format, 0..1
444 lsr r2,r0,#23               @ exponent
445 movs r3,#1
446 subs r2,#0x7f-6
447 bmi 20b
448 bfi r0,r3,#23,#9            @ fix up mantissa
449 lsls r0,r2
450 lsls r0,#2
45150:
452
453@ from here atan2(y,1) where 1 implied
454@ r0 y Q31 0≤y<1455
456 lsrs r2,r0,#16
457 mul r3,r2,r2                @ y^2
458 movw r4,#0x895c
459 muls r2,r2,r4               @ y*0x895c
460 movw r5,#0x1227
461 lsrs r3,#14
462 mls r2,r3,r5,r2
463 subs r2,#0x330000           @ Chebyshev approximation to atn(y)
464 lsrs r2,#25
465 ldr r3,=trigtab+4
466 add r3,r3,r2,lsl#4
467 ldmia r3,{r3-r5}
468@ r0 y  Q30
469@ r3 phi0  Q32
470@ r4 sphi0 Q31
471@ r5 cphi0 Q31
472@ r6 flags
473@ x0= cphi0 + sphi0*y
474@ y0=-sphi0 + cphi0*y
475 umull r12,r1,r0,r4          @ sphi0*y
476 umull r12,r2,r0,r5          @ cphi0*y
477 add r1,r1,r5,lsr#1          @ x0 Q30
478 sub r2,r2,r4,lsr#1          @ y0 Q30
47911:
480@ r1 x0 Q30
481@ r2 y0 Q30
482@ r3 phi0 Q32
483@ r6 flags
484 mov r0,#0xffffffff
485 lsrs r4,r1,#15
486 udiv r12,r0,r4              @ rx0 Q17
487 lsls r4,r2,#6               @ y0 Q36
488 smmul r4,r4,r12             @ t2=y0*rx0 Q21
489 lsls r5,r4,#11              @ t2 Q32
490 smmlar r0,r5,r2,r1
491 smmlsr r1,r5,r1,r2
492@ r0 x1 Q30
493@ r1 y1 Q30
494@ r3 phi0
495@ r4 r2 Q21
496@ r12 rx0 Q17
497 mul r5,r4,r4                @ t2_2 Q42
498 smull r2,r5,r4,r5           @ t2_3 Q63
499 add r3,r3,r4,lsl#11         @ Q32
500 lsls r5,#1                  @ Q32
501 mov r2,#0x55555555          @ 1/3
502 smmlsr r3,r2,r5,r3          @ t2_3/3
503
504@ r0 x1 Q30
505@ r1 y1 Q30
506@ r3 phi0+phi1 Q32
507@ r12 rx0 Q17
508 mul r0,r1,r12               @ y1*rx0 Q30+Q17=Q47
509 add r3,r3,r0,asr#15
510@ r3 phi0+phi1+phi2, result over reduced range Q32
511@ r6 flags
512
513 lsrs r0,r3,#2               @ Q30
514@ adc r0,#0                  @ rounding
51540:
516 lsl r5,r6,#30               @ b1 -> b31
517 eor r0,r0,r5,asr#31         @ negate if required
518 movlong r1,0x6487ED51       @ π/2 Q30
519
520 lsr r5,r6,#2                @ quadrants to add
521 mla r0,r5,r1,r0
522 mov r1,#0x80-9              @ for packing Q30
52360:
524 bl fpack_q
525 eors r0,r0,r6,lsl#31
526 pop {r4-r6,r15}
527
528@=======================================
529
530float_wrapper_section fpack
531
532@ fnegpack: negate and pack
533@ fpack_q31:
534@ input
535@ r0 Q31 result, must not be zero
536@ r1 exponent offset [fpack_q only]
537@ output
538@ r0 IEEE single
539@ trashes (r1,)r2
540fnegpack_q31:
541 rsbs r0,r0,#0
542fpack_q31:
543 mov r1,#0x7f-9              @ exponent
544fpack_q:
545 clz r2,r0
546 rsbs r2,#8
547 bmi 1f
548 lsrs r0,r0,r2               @ save rounding bit in carry
549 add r2,r2,r1
550 adc r0,r0,r2,lsl#23         @ insert exponent
551 bx r14
552
5531:
554 rsb r2,#0
555 lsls r0,r0,r2
556 sub r2,r1,r2
557 add r0,r0,r2,lsl#23
558 bx r14
559
560float_wrapper_section tanf
561
562wrapper_func tanf
563 push {r14}
564 bl sincosf_raw
565 bl __aeabi_fdiv
566 pop {r15}
567
568float_wrapper_section fsin_fcos
569
57010: @ process Inf/NaN for sinf and fcos
571 lsls r1,r0,#9
572 it eq
573 orreq r0,#0x80000000
574 orr r0,#0x00400000          @ turn Inf into NaN
575 movs r1,r0                  @ copy result to cosine output
576 bx r14
577
578@ case where angle is very small (<2^-32) before reduction
57932:
580 adds r1,r1,#0x7f
581 it eq
582 moveq r0,#0                 @ flush denormal to zero
583
584 movs r1,0x3f800000          @ return x for sine, 1 for cosine
585 tst r12,#2
586 it ne
587 movne r0,r1                 @ calculating cosine? move to result registers
588 bx r14
589
59040:
591@ case where range-reduced angle is very small
592@ here
593@ r0 mantissa
594@ r1 exponent+9
595@ r6 abs range-reduced angle / 2π < 2^-7 Q32
596@ r12b31: dsincos flag
597@ r12b30: original argument sign
598@ r12b2..1: quadrant count
599@ r12b0: sign of reduced angle
600 push {r12}
601 movs r12,#0
6022:
603 add r12,#2
604 add r1,#2
605 bl frr_core                 @ repeat range reduction with extra factor of 2^2 (, 2^4, 2^6, 2^8,...)
606 eors r4,r6,r6,asr#31        @ we want to keep the sign in r6b31 for later
607 cmp r4,#1<<26               @ ≥ 2^-6?
608 bhs 1f                      @ loop until the result is big enough
609 cmp r12,#32                 @ safety net
610 bne 2b
6111:
612@ here r4 is the abs range-reduced angle Q32+r12, 2^-6..2^-4 in Q32 terms
613
614@ 2π=6.487ED511 (0B4611A6...)
615 movlong r5,0x487ED511       @ 2π Q64 high fractional word
616 umull r2,r0,r4,r5
617 movs r5,#6                  @ 2π integer part
618 mla r0,r4,r5,r0
619
620@ here
621@ r0 θ, abs range reduced angle θ 0..π/4 Q32+r12, 2π * 2^-6..2^-4 in Q32 terms (so top bit is clear)
622@ r6b31: sign of reduced angle
623@ r12: excess exponent2, multiple of 2
624@ r0 / 2^r12 < ~ 2π * 2^-7 so for sin we need to go to term in x^3
625
626 smmulr r1,r0,r0             @ θ² Q32+2*r12
627 lsrs r1,r1,r12              @ θ² Q32+r12
628 lsrs r1,r1,r12              @ θ² Q32
629 movs r4,#0x55555555         @ 1/3 Q32
630 smmulr r4,r1,r4             @ θ²/3 Q32
631 smmulr r2,r4,r1             @ θ⁴/3 Q32
632 rsb r3,r1,r2,lsr#2          @ -θ²+θ⁴/12) Q32
633 asrs r3,#9                  @ -θ²/2+θ⁴/24 Q24
634 adc r3,r3,#0x3f800000       @ IEEE single with rounding
635
636 smmulr r2,r4,r0             @ θ³/3 Q32+r12
637 sub r0,r0,r2,lsr#1          @ θ-θ³/6 Q32+r12
638 rsb r1,r12,#117
639 bl fpack_q
640@ here
641@ r0 packed sin
642@ r3 packed cos
643@ r6b31: sign of reduced angle
644 pop {r12}                   @ get flags
645 lsrs r6,#31
646 bfi r12,r6,#0,#1
647
648 asrs r2,r12,#1
649 bmi 23f                     @ doing sincos?
650 asrs r2,#1
651 bcc 21f                     @ need sine?
652@ need cosine:
653 ands r12,#4
654 orrs r0,r3,r12,lsl#29       @ insert sign
655 pop {r4-r6,r15}
656
65721:
658 eors r12,r12,r12,lsr#2
659 orrs r0,r0,r12,lsl#31       @ insert sign
660 pop {r4-r6,r15}
661
66223:
663 ands r2,r12,#4
664 orrs r3,r3,r2,lsl#29        @ insert sign
665 push {r3}
666@ drop into sincosf code below...
667 b 20f
668
669wrapper_func sincosf
670 push {r1, r2, lr}
671 bl sincosf_raw
672 pop {r2, r3}
673 str r0, [r2]
674 str r1, [r3]
675 pop {pc}
676
677sincosf_raw:
678 ands r12,r0,#1<<31
679 lsrs r12,#1                 @ save argument sign in r12b30
680 orrs r12,r12,#1<<31         @ flag we want both results in r12b31
681 b 1f
682
683wrapper_func sinf
684 lsrs r12,r0,#29             @ negative argument -> +2 quadrants
685 ands r12,#4
686 b 1f
687
688wrapper_func cosf
689 movs r12,#2                 @ cos -> +1 quadrant
6901:
691 ubfx r1,r0,#23,#8           @ get exponent
692 sub r1,r1,#0x7f
693 cmp r1,#0x80
694 beq 10b                     @ Inf or NaN?
695 cmn r1,#32
696 blt 32b                     @ very small argument?
697 movs r3,#1
698 bfi r0,r3,#23,#9            @ fix implied 1 in mantissa
699 push {r4-r6,r14}
700 add r1,#9                   @ e+9
701 bl frr_core
702@ r6 θ/2π 0..1 Q64
703 lsrs r4,r6,#30              @ quadrant count
704 adc r4,r4,#0                @ rounded
705 sub r6,r6,r4,lsl#30         @ now -0.125r6<+0.125 Q32
706 add r12,r12,r4,lsl#1
707 orr r12,r12,r6,lsr#31       @ sign into r12b0
708@ r12b2..1: quadrant count
709@ r12b0: sign of reduced angle
710 eors r6,r6,r6,asr#31        @ absolute value of reduced angle 0≤r7<0.125 Q32
711 cmp r6,#1<<25               @ θ / 2π < 2^-7?
712 blo 40b
713
714@ 2π=6.487ED511 (0B4611A6...)
715 movlong r5,0x487ED511       @ 2π Q64 high fractional word
716 umull r2,r0,r6,r5
717 movs r5,#6                  @ 2π integer part
718 mla r0,r6,r5,r0
719@ r0 range reduced angle θ 0..π/4 Q32
720 lsrs r2,r0,#27
721 ldr r3,=trigtab+4
722 add r3,r3,r2,lsl#4
723 ldmia r3,{r1-r3}
724 subs r0,r1
725@ r0:   ε     Q32 |ε| < 1.17*2^-6
726@ r2:   sin φ Q31
727@ r3:   cos φ Q31
728 asrs r1,r12,#1
729 bmi 5f                      @ doing sincosf?
730 asrs r1,#1
731 bcs 3f                      @ need cosine?
732@ here need sine
733 smmulr r1,r0,r0             @ ε² Q32
734 mov r4,#0x55555555          @ 1/3 Q32
735 asrs r1,#1
736 smmlsr r2,r1,r2,r2          @ sin φ - ε²/2*sin φ ~ sin φ cos ε
737 smmulr r1,r1,r0             @ ε³/2
738 smmlsr r1,r1,r4,r0          @ ε - ε³/6
739
740 smmlar r0,r1,r3,r2          @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε)
741@ the sign of the result is r12b0^r12b2
742 bl fpack_q31
743 eors r12,r12,r12,lsr#2
744 orrs r0,r0,r12,lsl#31       @ insert sign
745 pop {r4-r6,r15}
746
7473:
748@ here need cosine
749 smmulr r1,r0,r0             @ ε² Q32
750 mov r4,#0x55555555          @ 1/3 Q32
751 asrs r1,#1
752 smmlsr r3,r1,r3,r3          @ cos φ - ε²/2*cos φ ~ cos φ cos ε
753 smmulr r1,r1,r0             @ ε³/2
754 smmlsr r1,r1,r4,r0          @ ε - ε³/6
755
756 smmlsr r0,r1,r2,r3 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε)
757@ the sign of the result is r12b2
758 bl fpack_q31
759 ands r12,#4
760 orrs r0,r0,r12,lsl#29       @ insert sign
761 pop {r4-r6,r15}
762
7635:
764@ here doing sincosf
765 smmulr r1,r0,r0             @ ε² Q32
766 mov r6,#0x55555555          @ 1/3 Q32
767 asrs r1,#1
768 smmlsr r4,r1,r2,r2          @ sin φ - ε²/2*sin φ ~ sin φ cos ε
769 smmlsr r5,r1,r3,r3          @ cos φ - ε²/2*cos φ ~ cos φ cos ε
770 smmulr r1,r1,r0             @ ε³/2
771 smmlsr r6,r1,r6,r0          @ ε - ε³/6
772@ here
773@ r2 sin φ
774@ r3 cos φ
775@ r4 sin φ cos ε
776@ r5 cos φ cos ε
777@ r6 ε - ε³/6 ~ sin ε
778 smmlsr r0,r6,r2,r5          @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε)
779 bl fpack_q31
780 ands r5,r12,#4
781 eors r0,r0,r5,lsl#29        @ negate cosine in quadrants 2 and 3
782 push {r0}
783 smmlar r0,r6,r3,r4          @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε)
784 bl fpack_q31
78520:
786 eors r4,r12,r12,lsr#1
787 eors r4,r4,r12,lsr#2
788 ands r5,r12,#1<<30
789 tst r12,#2                  @ exchange sine and cosine in odd quadrants
790 beq 1f
791 eors r1,r0,r4,lsl#31        @ negate sine on b0^b1^b2
792 pop {r0}
793 eors r0,r0,r5,lsl#1         @ negate sine result if argument was negative
794 pop {r4-r6,r15}
7951:
796 pop {r1}
797 eors r0,r0,r4,lsl#31        @ negate sine on b0^b1^b2
798 eors r0,r0,r5,lsl#1         @ negate sine result if argument was negative
799 pop {r4-r6,r15}
800
801#endif
802