/* -------------------------------------------------------------- */
/* (C)Copyright 2001,2008, */
/* International Business Machines Corporation, */
/* Sony Computer Entertainment, Incorporated, */
/* Toshiba Corporation, */
/* */
/* All Rights Reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the */
/* following conditions are met: */
/* */
/* - Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* */
/* - Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* - Neither the name of IBM Corporation nor the names of its */
/* contributors may be used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */
/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */
/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */
/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */
/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */
/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#ifdef __SPU__

#ifndef _DIVD2_H_
#define _DIVD2_H_ 1

#include <spu_intrinsics.h>

/*
 * FUNCTION
 *      vector double _divd2(vector double a, vector double b)
 *
 * DESCRIPTION
 *      _divd2 divides the vector dividend a by the vector divisor b and
 *      returns the resulting vector quotient.  The maximum error is about
 *      0.5 ulp over the entire double range, including denorms, compared to
 *      the true result in round-to-nearest rounding mode.  Inf and NaN
 *      operands and results are handled correctly.
 */
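/*
 * EXAMPLE
 *      Illustrative usage sketch; the function and array names below are
 *      hypothetical.  Each call divides two double-precision lanes at once.
 *
 *          void divide_arrays(vec_double2 *quot, const vec_double2 *num,
 *                             const vec_double2 *den, int n)
 *          {
 *            int i;
 *            for (i = 0; i < n; i++)
 *              quot[i] = _divd2(num[i], den[i]);
 *          }
 */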
static __inline vector double _divd2(vector double a_in, vector double b_in)
{
  /* Variables */
  vec_int4    exp, exp_bias;
  vec_uint4   no_underflow, overflow;
  vec_float4  mant_bf, inv_bf;
  vec_ullong2 exp_a, exp_b;
  vec_ullong2 a_nan, a_zero, a_inf, a_denorm;
  vec_ullong2 b_nan, b_zero, b_inf, b_denorm;
  vec_ullong2 nan;
  vec_double2 a, b;
  vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;

  /* Constants */
  vec_float4  onef = spu_splats(1.0f);
  vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
  vec_double2 one = spu_splats(1.0);

#ifdef __SPU_EDP__
  vec_double2 denorm_scale = (vec_double2)spu_splats(0x4330000000000000ULL);

  /* Identify all possible special values that must be accommodated including:
   * +-0, +-infinity, +-denorm, and NaNs.
   */
  a_nan    = spu_testsv(a_in, (SPU_SV_NAN));
  a_zero   = spu_testsv(a_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO));
  a_inf    = spu_testsv(a_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
  a_denorm = spu_testsv(a_in, (SPU_SV_NEG_DENORM   | SPU_SV_POS_DENORM));

  b_nan    = spu_testsv(b_in, (SPU_SV_NAN));
  b_zero   = spu_testsv(b_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO));
  b_inf    = spu_testsv(b_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
  b_denorm = spu_testsv(b_in, (SPU_SV_NEG_DENORM   | SPU_SV_POS_DENORM));

  /* Scale denorm inputs into normalized numbers by conditionally scaling the
   * input parameters.
   */
  a = spu_sel(a_in, spu_mul(a_in, denorm_scale), a_denorm);
  b = spu_sel(b_in, spu_mul(b_in, denorm_scale), b_denorm);
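  /* Note: denorm_scale is the bit pattern of 2^52, so a scaled denorm becomes
   * a normalized value whose exponent is too large by 52 (0x34).  The exponent
   * correction applied further below compensates for this.
   */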

#else /* !__SPU_EDP__ */
  vec_uint4 a_exp, b_exp;
  vec_ullong2 a_mant_0, b_mant_0;
  vec_ullong2 a_exp_1s, b_exp_1s;
  vec_ullong2 sign_exp_mask;

  vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
  vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11};
  vec_uchar16 swap_32  = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
  vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
  vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);

  sign_exp_mask = spu_or(sign_mask, exp_mask);

  /* Extract the floating-point components from each of the operands including
   * exponent and mantissa.
   */
  a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
  a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
  b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
  b_exp = spu_shuffle(b_exp, b_exp, splat_hi);

  a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
  a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));

  b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
  b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));

  a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
  b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);

  /* Identify all possible special values that must be accommodated including:
   * +-denorm, +-0, +-infinity, and NaNs.
   */
  a_denorm = (vec_ullong2)spu_cmpeq(a_exp, 0);   /* really is a_exp_0 */
  a_nan    = spu_andc(a_exp_1s, a_mant_0);
  a_zero   = spu_and (a_denorm, a_mant_0);
  a_inf    = spu_and (a_exp_1s, a_mant_0);

  b_denorm = (vec_ullong2)spu_cmpeq(b_exp, 0);   /* really is b_exp_0 */
  b_nan    = spu_andc(b_exp_1s, b_mant_0);
  b_zero   = spu_and (b_denorm, b_mant_0);
  b_inf    = spu_and (b_exp_1s, b_mant_0);

  /* Scale denorm inputs into normalized numbers by conditionally scaling the
   * input parameters.
   */
  a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
  a = spu_sel(a_in, a, a_denorm);

  b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
  b = spu_sel(b_in, b, b_denorm);
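  /* Note: the or/subtract sequence above rescales a denormal input by 2^52.
   * OR-ing in the exp_53 exponent field turns m*2^-1074 into (1 + m*2^-52)*2^-970,
   * and subtracting the sign-matched 2^-970 leaves m*2^-1022, i.e. the input
   * scaled by 2^52 (0x34).  The exponent correction below undoes this scaling.
   */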

#endif /* __SPU_EDP__ */

  /* Extract the divisor and dividend exponents and force the parameters into
   * the signed range [1.0,2.0) or (-2.0,-1.0].
   */
  exp_a = spu_and((vec_ullong2)a, exp_mask);
  exp_b = spu_and((vec_ullong2)b, exp_mask);

  mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
  mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);

  /* Approximate the reciprocal of b by using the single-precision
   * reciprocal estimate followed by one single-precision iteration of
   * Newton-Raphson.
   */
  mant_bf = spu_roundtf(mant_b);
  inv_bf  = spu_re(mant_bf);
  inv_bf  = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);
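  /* The update above is the standard Newton-Raphson refinement of a
   * reciprocal: given an estimate x ~= 1/b, compute
   *
   *     x' = x + x*(1 - b*x)
   *
   * spu_nmsub(mant_bf, inv_bf, onef) produces the residual (1 - b*x) and
   * spu_madd folds it back in, roughly doubling the number of correct bits.
   */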

  /* Perform 2 more Newton-Raphson iterations in double precision.  The
   * result (q1) is in the range (0.5, 2.0).
   */
  inv_b = spu_extend(inv_bf);
  inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
  q0 = spu_mul(mant_a, inv_b);
  q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);
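  /* The last step refines the quotient rather than the reciprocal: the fused
   * spu_nmsub forms the residual r = a - b*q0 in one operation, and
   * q1 = q0 + r*(1/b) recovers the bits lost when forming q0 = a*(1/b).
   */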


  /* Determine the exponent correction factor that must be applied
   * to q1 by taking into account the exponent of the normalized inputs
   * and the scale factors that were applied to normalize them.
   */
  exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
  exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));

  /* Bias the quotient exponent depending on the sign of the exponent correction
   * factor so that a single multiplier will ensure the entire double precision
   * domain (including denorms) can be achieved.
   *
   *      exp             bias q1     adjust exp
   *   =============     ========    ==========
   *   positive/zero      2^+64         -64
   *   negative           2^-65         +65
   */
  exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
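  /* spu_rlmaska(exp, -31) is an arithmetic right shift that yields 0 when
   * exp >= 0 and -1 when exp < 0; XOR-ing with 64 therefore produces +64 or
   * -65 respectively, matching the table above.
   */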


  exp = spu_sub(exp, exp_bias);

  q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);

  /* Compute a multiplier (mult) to be applied to the quotient (q1) to produce
   * the expected result.
   */
  exp = spu_add(exp, 0x3FF);
  no_underflow = spu_cmpgt(exp, 0);
  overflow = spu_cmpgt(exp, 0x7FF);
  exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
  exp = spu_and(exp, (vec_int4)exp_mask);
  mult = spu_sel((vec_double2)exp, (vec_double2)exp_mask, (vec_ullong2)overflow);
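  /* At this point mult holds the power of two that restores the final
   * exponent: it is forced to +0.0 when that exponent underflows (exp <= 0,
   * i.e. the result is below the representable range) and to the Inf bit
   * pattern when it overflows (exp > 0x7FF), so the final multiply also
   * yields the correct zero and infinity extremes.
   */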

  /* Handle special value conditions. These include:
   *
   * 1) IF either operand is a NaN, OR both operands are 0, OR both operands
   *    are INFINITY, THEN a NaN results.
   * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN an INFINITY results.
   * 3) ELSE IF the dividend is 0 OR the divisor is an INFINITY THEN a 0 results.
   */
  mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
  mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));

  nan = spu_or(a_nan, b_nan);
  nan = spu_or(nan, spu_and(a_zero, b_zero));
  nan = spu_or(nan, spu_and(a_inf, b_inf));

  mult = spu_or(mult, (vec_double2)nan);

  /* Scale the final quotient */
  q2 = spu_mul(q1, mult);

  return (q2);
}

#endif /* _DIVD2_H_ */
#endif /* __SPU__ */