1 /* -------------------------------------------------------------- */
2 /* (C)Copyright 2001,2008, */
3 /* International Business Machines Corporation, */
4 /* Sony Computer Entertainment, Incorporated, */
5 /* Toshiba Corporation, */
6 /* */
7 /* All Rights Reserved. */
8 /* */
9 /* Redistribution and use in source and binary forms, with or */
10 /* without modification, are permitted provided that the */
11 /* following conditions are met: */
12 /* */
13 /* - Redistributions of source code must retain the above copyright*/
14 /* notice, this list of conditions and the following disclaimer. */
15 /* */
16 /* - Redistributions in binary form must reproduce the above */
17 /* copyright notice, this list of conditions and the following */
18 /* disclaimer in the documentation and/or other materials */
19 /* provided with the distribution. */
20 /* */
21 /* - Neither the name of IBM Corporation nor the names of its */
22 /* contributors may be used to endorse or promote products */
23 /* derived from this software without specific prior written */
24 /* permission. */
25 /* */
26 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
27 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
28 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
29 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
30 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */
31 /* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
32 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */
33 /* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */
34 /* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
35 /* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
36 /* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */
37 /* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */
38 /* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
39 /* -------------------------------------------------------------- */
40 /* PROLOG END TAG zYx */
41 #ifdef __SPU__
42 #ifndef _EXP2F4_H_
43 #define _EXP2F4_H_ 1
44
45
46 #include <spu_intrinsics.h>
47 #include "simdmath.h"
48
49 /*
50 * FUNCTION
51 * vector float _exp2f4(vector float x)
52 *
53 * DESCRIPTION
54 * The _exp2f4 function computes 2 raised to the input vector x.
 *      Computation is performed by observing that 2^(a+b) = 2^a * 2^b.
 *      We decompose x into a and b (above) by letting:
 *              a = ceil(x), b = x - a;
 *
 *      2^a is easily computed by placing a into the exponent
 *      of a floating point number whose mantissa is all zeros.
61 *
 *      2^b is computed as e^(b*ln(2)), using the following polynomial
 *      approximation of e^(-x) (C. Hastings, Jr, 1955) evaluated at
 *      x = -b*ln(2) = (a - x)*ln(2).
 *
 *                  __7__
 *                  \
 *                   \
 *      e^(-x)  =    /     Ci*x^i
 *                  /____
 *                   i=0
71 *
72 * for x in the range 0.0 to 1.0
73 *
74 * C0 = 1.0
75 * C1 = -0.9999999995
76 * C2 = 0.4999999206
77 * C3 = -0.1666653019
78 * C4 = 0.0416573475
79 * C5 = -0.0083013598
80 * C6 = 0.0013298820
81 * C7 = -0.0001413161
82 *
83 */
_exp2f4(vector float x)84 static __inline vector float _exp2f4(vector float x)
85 {
86 vector signed int ix;
87 vector unsigned int overflow, underflow;
88 vector float frac, frac2, frac4;
89 vector float exp_int, exp_frac;
90 vector float result;
91 vector float hi, lo;
92
93 vector float bias;
94 /* Break in the input x into two parts ceil(x), x - ceil(x).
95 */
96 bias = (vector float)(spu_rlmaska((vector signed int)(x), -31));
97 bias = (vector float)(spu_andc(spu_splats((unsigned int)0x3F7FFFFF), (vector unsigned int)bias));
98 ix = spu_convts(spu_add(x, bias), 0);
99 frac = spu_sub(spu_convtf(ix, 0), x);
100 frac = spu_mul(frac, spu_splats((float)SM_LN2));
101
102 overflow = spu_rlmask(spu_cmpgt(ix, 128), -1);
103 underflow = spu_cmpgt(ix, -128);
104
105 exp_int = (vector float)spu_and((vector unsigned int)spu_sl(spu_add(ix, 127), 23), underflow);
106
107 /* Instruction counts can be reduced if the polynomial was
108 * computed entirely from nested (dependent) fma's. However,
109 * to reduce the number of pipeline stalls, the polygon is evaluated
110 * in two halves (hi amd lo).
111 */
112 frac2 = spu_mul(frac, frac);
113 frac4 = spu_mul(frac2, frac2);
114
115 hi = spu_madd(frac, spu_splats(-0.0001413161f), spu_splats(0.0013298820f));
116 hi = spu_madd(frac, hi, spu_splats(-0.0083013598f));
117 hi = spu_madd(frac, hi, spu_splats(0.0416573475f));
118 lo = spu_madd(frac, spu_splats(-0.1666653019f), spu_splats(0.4999999206f));
119 lo = spu_madd(frac, lo, spu_splats(-0.9999999995f));
120 lo = spu_madd(frac, lo, spu_splats(1.0f));
121
122 exp_frac = spu_madd(frac4, hi, lo);
123 ix = spu_add(ix, spu_rlmask((vector signed int)(exp_frac), -23));
124 result = spu_mul(exp_frac, exp_int);
125
126 /* Handle overflow */
127 result = spu_or(result, (vector float)overflow);
128
129 return (result);
130
131 }
132
133 #endif /* _EXP2F4_H_ */
134 #endif /* __SPU__ */
135