1 /* --------------------------------------------------------------  */
2 /* (C)Copyright 2001,2008,                                         */
3 /* International Business Machines Corporation,                    */
4 /* Sony Computer Entertainment, Incorporated,                      */
5 /* Toshiba Corporation,                                            */
6 /*                                                                 */
7 /* All Rights Reserved.                                            */
8 /*                                                                 */
9 /* Redistribution and use in source and binary forms, with or      */
10 /* without modification, are permitted provided that the           */
11 /* following conditions are met:                                   */
12 /*                                                                 */
13 /* - Redistributions of source code must retain the above copyright*/
14 /*   notice, this list of conditions and the following disclaimer. */
15 /*                                                                 */
16 /* - Redistributions in binary form must reproduce the above       */
17 /*   copyright notice, this list of conditions and the following   */
18 /*   disclaimer in the documentation and/or other materials        */
19 /*   provided with the distribution.                               */
20 /*                                                                 */
21 /* - Neither the name of IBM Corporation nor the names of its      */
22 /*   contributors may be used to endorse or promote products       */
23 /*   derived from this software without specific prior written     */
24 /*   permission.                                                   */
25 /*                                                                 */
26 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
27 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
28 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
29 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
30 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
31 /* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
32 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
33 /* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
34 /* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
35 /* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
36 /* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
37 /* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
38 /* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
39 /* --------------------------------------------------------------  */
40 /* PROLOG END TAG zYx                                              */
41 #ifdef __SPU__
42 
43 #ifndef _DIVD2_H_
44 #define _DIVD2_H_		 1
45 
46 #include <spu_intrinsics.h>
47 
48 /*
49  * FUNCTION
50  * 	vector double _divd2(vector double a, vector double b)
51  *
52  * DESCRIPTION
53  * 	_divd2 divides the vector dividend a by the vector divisor b and
54  *      returns the resulting vector quotient.  Maximum error about 0.5 ulp
55  *      over entire double range including denorms, compared to true result
56  *      in round-to-nearest rounding mode.  Handles Inf or NaN operands and
57  *	results correctly.
58  */
_divd2(vector double a_in,vector double b_in)59 static __inline vector double _divd2(vector double a_in, vector double b_in)
60 {
61   /* Variables */
62   vec_int4    exp, exp_bias;
63   vec_uint4   no_underflow, overflow;
64   vec_float4  mant_bf, inv_bf;
65   vec_ullong2 exp_a, exp_b;
66   vec_ullong2 a_nan, a_zero, a_inf, a_denorm;
67   vec_ullong2 b_nan, b_zero, b_inf, b_denorm;
68   vec_ullong2 nan;
69   vec_double2 a, b;
70   vec_double2 mant_a, mant_b, inv_b, q0, q1, q2, mult;
71 
72   /* Constants */
73   vec_float4  onef = spu_splats(1.0f);
74   vec_ullong2 exp_mask = spu_splats(0x7FF0000000000000ULL);
75   vec_double2 one = spu_splats(1.0);
76 
77 #ifdef __SPU_EDP__
78   vec_double2 denorm_scale = (vec_double2)spu_splats(0x4330000000000000ULL);
79 
80   /* Identify all possible special values that must be accomodated including:
81    * +-0, +-infinity, +-denorm, and NaNs.
82    */
83   a_nan    = spu_testsv(a_in, (SPU_SV_NAN));
84   a_zero   = spu_testsv(a_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO));
85   a_inf    = spu_testsv(a_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
86   a_denorm = spu_testsv(a_in, (SPU_SV_NEG_DENORM   | SPU_SV_POS_DENORM));
87 
88   b_nan    = spu_testsv(b_in, (SPU_SV_NAN));
89   b_zero   = spu_testsv(b_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO));
90   b_inf    = spu_testsv(b_in, (SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
91   b_denorm = spu_testsv(b_in, (SPU_SV_NEG_DENORM   | SPU_SV_POS_DENORM));
92 
93   /* Scale denorm inputs to into normalized numbers by conditionally scaling the
94    * input parameters.
95    */
96   a = spu_sel(a_in, spu_mul(a_in, denorm_scale), a_denorm);
97   b = spu_sel(b_in, spu_mul(b_in, denorm_scale), b_denorm);
98 
99 #else /* !__SPU_EDP__ */
100   vec_uint4   a_exp, b_exp;
101   vec_ullong2 a_mant_0, b_mant_0;
102   vec_ullong2 a_exp_1s, b_exp_1s;
103   vec_ullong2 sign_exp_mask;
104 
105   vec_uint4   exp_mask_u32 = spu_splats((unsigned int)0x7FF00000);
106   vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3, 0,1,2,3,  8, 9,10,11, 8,9,10,11};
107   vec_uchar16 swap_32 = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
108   vec_ullong2 sign_mask = spu_splats(0x8000000000000000ULL);
109   vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);
110 
111   sign_exp_mask = spu_or(sign_mask, exp_mask);
112 
113   /* Extract the floating point components from each of the operands including
114    * exponent and mantissa.
115    */
116   a_exp = (vec_uint4)spu_and((vec_uint4)a_in, exp_mask_u32);
117   a_exp = spu_shuffle(a_exp, a_exp, splat_hi);
118   b_exp = (vec_uint4)spu_and((vec_uint4)b_in, exp_mask_u32);
119   b_exp = spu_shuffle(b_exp, b_exp, splat_hi);
120 
121   a_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)a_in, sign_exp_mask), 0);
122   a_mant_0 = spu_and(a_mant_0, spu_shuffle(a_mant_0, a_mant_0, swap_32));
123 
124   b_mant_0 = (vec_ullong2)spu_cmpeq((vec_uint4)spu_andc((vec_ullong2)b_in, sign_exp_mask), 0);
125   b_mant_0 = spu_and(b_mant_0, spu_shuffle(b_mant_0, b_mant_0, swap_32));
126 
127   a_exp_1s = (vec_ullong2)spu_cmpeq(a_exp, exp_mask_u32);
128   b_exp_1s = (vec_ullong2)spu_cmpeq(b_exp, exp_mask_u32);
129 
130   /* Identify all possible special values that must be accomodated including:
131    * +-denorm, +-0, +-infinity, and NaNs.
132    */
133   a_denorm = (vec_ullong2)spu_cmpeq(a_exp, 0);		/* really is a_exp_0 */
134   a_nan    = spu_andc(a_exp_1s, a_mant_0);
135   a_zero   = spu_and (a_denorm, a_mant_0);
136   a_inf    = spu_and (a_exp_1s, a_mant_0);
137 
138   b_denorm = (vec_ullong2)spu_cmpeq(b_exp, 0);		/* really is b_exp_0 */
139   b_nan    = spu_andc(b_exp_1s, b_mant_0);
140   b_zero   = spu_and (b_denorm, b_mant_0);
141   b_inf    = spu_and (b_exp_1s, b_mant_0);
142 
143   /* Scale denorm inputs to into normalized numbers by conditionally scaling the
144    * input parameters.
145    */
146   a = spu_sub(spu_or(a_in, exp_53), spu_sel(exp_53, a_in, sign_mask));
147   a = spu_sel(a_in, a, a_denorm);
148 
149   b = spu_sub(spu_or(b_in, exp_53), spu_sel(exp_53, b_in, sign_mask));
150   b = spu_sel(b_in, b, b_denorm);
151 
152 #endif /* __SPU_EDP__ */
153 
154   /* Extract the divisor and dividend exponent and force parameters into the signed
155    * range [1.0,2.0) or [-1.0,2.0).
156    */
157   exp_a = spu_and((vec_ullong2)a, exp_mask);
158   exp_b = spu_and((vec_ullong2)b, exp_mask);
159 
160   mant_a = spu_sel(a, one, (vec_ullong2)exp_mask);
161   mant_b = spu_sel(b, one, (vec_ullong2)exp_mask);
162 
163   /* Approximate the single reciprocal of b by using
164    * the single precision reciprocal estimate followed by one
165    * single precision iteration of Newton-Raphson.
166    */
167   mant_bf = spu_roundtf(mant_b);
168   inv_bf = spu_re(mant_bf);
169   inv_bf = spu_madd(spu_nmsub(mant_bf, inv_bf, onef), inv_bf, inv_bf);
170 
171   /* Perform 2 more Newton-Raphson iterations in double precision. The
172    * result (q1) is in the range (0.5, 2.0).
173    */
174   inv_b = spu_extend(inv_bf);
175   inv_b = spu_madd(spu_nmsub(mant_b, inv_b, one), inv_b, inv_b);
176   q0 = spu_mul(mant_a, inv_b);
177   q1 = spu_madd(spu_nmsub(mant_b, q0, mant_a), inv_b, q0);
178 
179 
180   /* Determine the exponent correction factor that must be applied
181    * to q1 by taking into account the exponent of the normalized inputs
182    * and the scale factors that were applied to normalize them.
183    */
184   exp = spu_rlmaska(spu_sub((vec_int4)exp_a, (vec_int4)exp_b), -20);
185   exp = spu_add(exp, (vec_int4)spu_add(spu_and((vec_int4)a_denorm, -0x34), spu_and((vec_int4)b_denorm, 0x34)));
186 
187   /* Bias the quotient exponent depending on the sign of the exponent correction
188    * factor so that a single multiplier will ensure the entire double precision
189    * domain (including denorms) can be achieved.
190    *
191    *    exp 	       bias q1     adjust exp
192    *   =====	       ========    ==========
193    *   positive         2^+65         -65
194    *   negative         2^-64         +64
195    */
196   exp_bias = spu_xor(spu_rlmaska(exp, -31), 64);
197 
198 
199   exp = spu_sub(exp, exp_bias);
200 
201   q1 = spu_sel(q1, (vec_double2)spu_add((vec_int4)q1, spu_sl(exp_bias, 20)), exp_mask);
202 
203   /* Compute a multiplier (mult) to applied to the quotient (q1) to produce the
204    * expected result.
205    */
206   exp = spu_add(exp, 0x3FF);
207   no_underflow = spu_cmpgt(exp, 0);
208   overflow = spu_cmpgt(exp, 0x7FF);
209   exp = spu_and(spu_sl(exp, 20), (vec_int4)no_underflow);
210   exp = spu_and(exp, (vec_int4)exp_mask);
211   mult = spu_sel((vec_double2)exp, (vec_double2)exp_mask, (vec_ullong2)overflow);
212 
213   /* Handle special value conditions. These include:
214    *
215    * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
216    *    results.
217    * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results.
218    * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
219    */
220   mult = spu_andc(mult, (vec_double2)spu_or(a_zero, b_inf));
221   mult = spu_sel(mult, (vec_double2)exp_mask, spu_or(a_inf, b_zero));
222 
223   nan = spu_or(a_nan, b_nan);
224   nan = spu_or(nan, spu_and(a_zero, b_zero));
225   nan = spu_or(nan, spu_and(a_inf, b_inf));
226 
227   mult = spu_or(mult, (vec_double2)nan);
228 
229   /* Scale the final quotient */
230 
231   q2 = spu_mul(q1, mult);
232 
233   return (q2);
234 }
235 
236 #endif /* _DIVD2_H_ */
237 #endif /* __SPU__ */
238