1 /* --------------------------------------------------------------  */
2 /* (C)Copyright 2001,2008,                                         */
3 /* International Business Machines Corporation,                    */
4 /* Sony Computer Entertainment, Incorporated,                      */
5 /* Toshiba Corporation,                                            */
6 /*                                                                 */
7 /* All Rights Reserved.                                            */
8 /*                                                                 */
9 /* Redistribution and use in source and binary forms, with or      */
10 /* without modification, are permitted provided that the           */
11 /* following conditions are met:                                   */
12 /*                                                                 */
13 /* - Redistributions of source code must retain the above copyright*/
14 /*   notice, this list of conditions and the following disclaimer. */
15 /*                                                                 */
16 /* - Redistributions in binary form must reproduce the above       */
17 /*   copyright notice, this list of conditions and the following   */
18 /*   disclaimer in the documentation and/or other materials        */
19 /*   provided with the distribution.                               */
20 /*                                                                 */
21 /* - Neither the name of IBM Corporation nor the names of its      */
22 /*   contributors may be used to endorse or promote products       */
23 /*   derived from this software without specific prior written     */
24 /*   permission.                                                   */
25 /*                                                                 */
26 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
27 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
28 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
29 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
30 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
31 /* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
32 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
33 /* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
34 /* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
35 /* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
36 /* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
37 /* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
38 /* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
39 /* --------------------------------------------------------------  */
40 /* PROLOG END TAG zYx                                              */
41 #ifdef __SPU__
42 
43 #ifndef _RECIPD2_H_
44 #define _RECIPD2_H_		1
45 
46 #include <spu_intrinsics.h>
47 
48 
49 /*
50  * FUNCTION
51  *	vector double _recipd2(vector double value)
52  *
53  * DESCRIPTION
54  * 	The _recipd2 function inverts "value" and returns the result.
55  *      Computation is performed using the single precision reciprocal
56  *      estimate and interpolate instructions to produce a 12 accurate
57  *      estimate.
58  *
59  *	One (1) iteration of a Newton-Raphson is performed to improve
60  *	accuracy to single precision floating point. Two additional double
61  *	precision iterations are  needed to achieve a full double
62  *	preicision result.
63  *
64  *	The Newton-Raphson iteration is of the form:
65  *	a)	X[i+1] = X[i] * (2.0 - b*X[i])
66  *          or
67  *      b)	X[i+1] = X[i] + X[i]*(1.0 - X[i]*b)
68  * 	where b is the input value to be inverted
69  *
70  *      The later (b) form improves the accuracy to 99.95% correctly rounded.
71  */
_recipd2(vector double value_in)72 static __inline vector double _recipd2(vector double value_in)
73 {
74   vec_float4  x0;
75   vec_float4  value;
76   vec_float4  one   = spu_splats(1.0f);
77   vec_double2 one_d = spu_splats(1.0);
78   vec_double2 x1, x2, x3;
79   vec_double2 scale;
80   vec_double2 exp, value_d;
81   vec_ullong2 expmask = spu_splats(0x7FF0000000000000ULL);
82   vec_ullong2 is0inf;
83 
84 #ifdef __SPU_EDP__
85   vec_ullong2 isdenorm;
86   vec_ullong2 expmask_minus1 = spu_splats(0x7FE0000000000000ULL);
87 
88   /* Determine special input values. For example, if the input is a denorm, infinity or 0 */
89 
90   isdenorm = spu_testsv(value_in, (SPU_SV_POS_DENORM   | SPU_SV_NEG_DENORM));
91   is0inf   = spu_testsv(value_in, (SPU_SV_NEG_ZERO     | SPU_SV_POS_ZERO |
92 				   SPU_SV_NEG_INFINITY | SPU_SV_POS_INFINITY));
93 
94   /* Scale the divisor to correct for double precision floating
95    * point exponents that are out of single precision range.
96    */
97   exp = spu_and(value_in, (vec_double2)expmask);
98   scale = spu_xor(exp, (vec_double2)spu_sel(expmask, expmask_minus1, isdenorm));
99   value_d = spu_mul(value_in, scale);
100   value = spu_roundtf(value_d);
101 
102   /* Perform reciprocal with 1 single precision and 2 double precision
103    * Newton-Raphson iterations.
104    */
105   x0 = spu_re(value);
106   x1 = spu_extend(spu_madd(spu_nmsub(value, x0, one), x0, x0));
107   x2 = spu_madd(spu_nmsub(value_d, x1, one_d), x1, x1);
108   x3 = spu_madd(spu_nmsub(value_d, x2, one_d), x2, x2);
109   x3 = spu_sel(spu_mul(x3, scale), spu_xor(value_in, (vector double)expmask), is0inf);
110 
111 #else /* !__SPU_EDP__ */
112 
113   vec_uint4 isinf, iszero, isdenorm0;
114   vec_double2 value_abs;
115   vec_double2 sign = spu_splats(-0.0);
116   vec_double2 denorm_scale = (vec_double2)spu_splats(0x4330000000000000ULL);
117   vec_double2 exp_53 = (vec_double2)spu_splats(0x0350000000000000ULL);
118   vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
119   vec_uchar16 swap = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
120 
121   value_abs = spu_andc(value_in, sign);
122   exp = spu_and(value_in, (vec_double2)expmask);
123 
124   /* Determine if the input is a special value. These include:
125    *  denorm   - then we must coerce it to a normal value.
126    *  zero     - then we must return an infinity
127    *  infinity - then we must return a zero.
128    */
129   isdenorm0 = spu_cmpeq(spu_shuffle((vec_uint4)exp, (vec_uint4)exp, splat_hi), 0);
130 
131   isinf  = spu_cmpeq((vec_uint4)value_abs, (vec_uint4)expmask);
132   iszero = spu_cmpeq((vec_uint4)value_abs, 0);
133   isinf  = spu_and(isinf,  spu_shuffle(isinf, isinf, swap));
134   iszero = spu_and(iszero, spu_shuffle(iszero, iszero, swap));
135   is0inf = (vec_ullong2)spu_or(isinf, iszero);
136 
137   /* If the inputs is a denorm, we must first convert it to a normal number since
138    * arithmetic operations on denormals produces 0 on Cell/B.E.
139    */
140   value_d = spu_sub(spu_or(value_abs, exp_53), exp_53);
141   value_d = spu_sel(value_abs, value_d, (vec_ullong2)isdenorm0);
142 
143   /* Scale the divisor to correct for double precision floating
144    * point exponents that are out of single precision range.
145    */
146   scale = spu_xor(spu_and(value_d, (vec_double2)expmask), (vec_double2)expmask);
147   value_d = spu_mul(value_d, scale);
148   value = spu_roundtf(value_d);
149 
150   /* Perform reciprocal with 1 single precision and 2 double precision
151    * Newton-Raphson iterations. The bias is removed after the single
152    * precision iteration.
153    */
154   x0 = spu_re(value);
155   x1 = spu_extend(spu_madd(spu_nmsub(value, x0, one), x0, x0));
156   x2 = spu_madd(spu_nmsub(value_d, x1, one_d), x1, x1);
157   x3 = spu_madd(spu_nmsub(value_d, x2, one_d), x2, x2);
158   x3 = spu_mul(x3, spu_sel(scale, value_in, (vec_ullong2)sign));
159   x3 = spu_sel(x3, spu_mul(x3, denorm_scale), (vec_ullong2)isdenorm0);
160   x3 = spu_sel(x3, spu_xor(value_in, (vector double)expmask), is0inf);
161 
162 #endif /* __SPU_EDP__ */
163 
164   return (x3);
165 }
166 
167 #endif /* _RECIPD2_H_ */
168 #endif /* __SPU__ */
169