1 /* Header for single-precision sin/cos/sincos functions.
2    Copyright (c) 2018 Arm Ltd.  All rights reserved.
3 
4    SPDX-License-Identifier: BSD-3-Clause
5 
6    Redistribution and use in source and binary forms, with or without
7    modification, are permitted provided that the following conditions
8    are met:
9    1. Redistributions of source code must retain the above copyright
10       notice, this list of conditions and the following disclaimer.
11    2. Redistributions in binary form must reproduce the above copyright
12       notice, this list of conditions and the following disclaimer in the
13       documentation and/or other materials provided with the distribution.
14    3. The name of the company may not be used to endorse or promote
15       products derived from this software without specific prior written
16       permission.
17 
18    THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
19    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
20    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21    IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
28 
29 #include <stdint.h>
30 #include <math.h>
31 #include "math_config.h"
32 
33 /* 2PI * 2^-64.  */
34 static const double pi63 = 0x1.921FB54442D18p-62;
35 /* PI / 4.  */
36 static const double pio4 = 0x1.921FB54442D18p-1;
37 
38 /* The constants and polynomials for sine and cosine.  */
39 typedef struct
40 {
41   double sign[4];		/* Sign of sine in quadrants 0..3.  */
42   double hpi_inv;		/* 2 / PI ( * 2^24 if !TOINT_INTRINSICS).  */
43   double hpi;			/* PI / 2.  */
44   double c0, c1, c2, c3, c4;	/* Cosine polynomial.  */
45   double s1, s2, s3;		/* Sine polynomial.  */
46 } sincos_t;
47 
48 /* Polynomial data (the cosine polynomial is negated in the 2nd entry).  */
49 extern const sincos_t __sincosf_table[2] HIDDEN;
50 
51 /* Table with 4/PI to 192 bit precision.  */
52 extern const uint32_t __inv_pio4[] HIDDEN;
53 
54 /* Top 12 bits of the float representation with the sign bit cleared.  */
55 static inline uint32_t
abstop12(float x)56 abstop12 (float x)
57 {
58   return (asuint (x) >> 20) & 0x7ff;
59 }
60 
61 /* Compute the sine and cosine of inputs X and X2 (X squared), using the
62    polynomial P and store the results in SINP and COSP.  N is the quadrant,
63    if odd the cosine and sine polynomials are swapped.  */
64 static inline void
sincosf_poly(double x,double x2,const sincos_t * p,int n,float * sinp,float * cosp)65 sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp,
66 	      float *cosp)
67 {
68   double x3, x4, x5, x6, s, c, c1, c2, s1;
69 
70   x4 = x2 * x2;
71   x3 = x2 * x;
72   c2 = p->c3 + x2 * p->c4;
73   s1 = p->s2 + x2 * p->s3;
74 
75   /* Swap sin/cos result based on quadrant.  */
76   float *tmp = (n & 1 ? cosp : sinp);
77   cosp = (n & 1 ? sinp : cosp);
78   sinp = tmp;
79 
80   c1 = p->c0 + x2 * p->c1;
81   x5 = x3 * x2;
82   x6 = x4 * x2;
83 
84   s = x + x3 * p->s1;
85   c = c1 + x4 * p->c2;
86 
87   *sinp = s + x5 * s1;
88   *cosp = c + x6 * c2;
89 }
90 
91 /* Return the sine of inputs X and X2 (X squared) using the polynomial P.
92    N is the quadrant, and if odd the cosine polynomial is used.  */
93 static inline float
sinf_poly(double x,double x2,const sincos_t * p,int n)94 sinf_poly (double x, double x2, const sincos_t *p, int n)
95 {
96   double x3, x4, x6, x7, s, c, c1, c2, s1;
97 
98   if ((n & 1) == 0)
99     {
100       x3 = x * x2;
101       s1 = p->s2 + x2 * p->s3;
102 
103       x7 = x3 * x2;
104       s = x + x3 * p->s1;
105 
106       return s + x7 * s1;
107     }
108   else
109     {
110       x4 = x2 * x2;
111       c2 = p->c3 + x2 * p->c4;
112       c1 = p->c0 + x2 * p->c1;
113 
114       x6 = x4 * x2;
115       c = c1 + x4 * p->c2;
116 
117       return c + x6 * c2;
118     }
119 }
120 
121 /* Fast range reduction using single multiply-subtract.  Return the modulo of
122    X as a value between -PI/4 and PI/4 and store the quadrant in NP.
123    The values for PI/2 and 2/PI are accessed via P.  Since PI/2 as a double
124    is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4,
125    the result is accurate for |X| <= 120.0.  */
126 static inline double
reduce_fast(double x,const sincos_t * p,int * np)127 reduce_fast (double x, const sincos_t *p, int *np)
128 {
129   double r;
130 #if TOINT_INTRINSICS
131   /* Use fast round and lround instructions when available.  */
132   r = x * p->hpi_inv;
133   *np = converttoint (r);
134   return x - roundtoint (r) * p->hpi;
135 #else
136   /* Use scaled float to int conversion with explicit rounding.
137      hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31.
138      This avoids inaccuracies introduced by truncating negative values.  */
139   r = x * p->hpi_inv;
140   int n = ((int32_t)r + 0x800000) >> 24;
141   *np = n;
142   return x - n * p->hpi;
143 #endif
144 }
145 
146 /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
147    XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
148    Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
149    Reduction uses a table of 4/PI with 192 bits of precision.  A 32x96->128 bit
150    multiply computes the exact 2.62-bit fixed-point modulo.  Since the result
151    can have at most 29 leading zeros after the binary point, the double
152    precision result is accurate to 33 bits.  */
153 static inline double
reduce_large(uint32_t xi,int * np)154 reduce_large (uint32_t xi, int *np)
155 {
156   const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];
157   int shift = (xi >> 23) & 7;
158   uint64_t n, res0, res1, res2;
159 
160   xi = (xi & 0xffffff) | 0x800000;
161   xi <<= shift;
162 
163   res0 = xi * arr[0];
164   res1 = (uint64_t)xi * arr[4];
165   res2 = (uint64_t)xi * arr[8];
166   res0 = (res2 >> 32) | (res0 << 32);
167   res0 += res1;
168 
169   n = (res0 + (1ULL << 61)) >> 62;
170   res0 -= n << 62;
171   double x = (int64_t)res0;
172   *np = n;
173   return x * pi63;
174 }
175