/*
 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"
#include "hardware/hazard3.h"

// This file reimplements some common single-precision soft float routines
// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
// and is tuned for Hazard3 execution timings.

// Subnormal values are always flushed to zero on both input and output.
// Rounding is always to nearest (even on tie).

pico_default_asm_setup

.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

float_section __addsf3
.global __subsf3
.p2align 2
__subsf3:
    binvi a1, a1, 31
.global __addsf3
__addsf3:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Flush-to-zero => 0 + y = y applies, including nan, with the sole
    // exception of y being subnormal (which also needs to be flushed)
    beqz a2, __addsf_return_y_flushed
    // Don't have to handle this case for x + 0 = 0 because we already know x
    // is nonzero
    beqz a3, __addsf_return_x
    // Unpack significand, plus 3 extra zeroes for working space:
    slli a4, a0, 9
    slli a5, a1, 9
    // check nan/inf on input
    li t0, 255
    beq a2, t0, __addsf_x_nan_inf
    beq a3, t0, __addsf_y_nan_inf
    // (finish unpacking significand)
    srli a4, a4, 6
    srli a5, a5, 6

    // If we're still on the straight path then we are adding two normal
    // values. Add implicit one (1.xx...xx000)
    bseti a4, a4, 23 + 3
    bseti a5, a5, 23 + 3
    // Negate if sign bit is set
    bgez a0, 1f
    neg a4, a4
1:
    // (tuck this 16-bit instruction here to avoid alignment penalty)
    li t1, 25
    bgez a1, 1f
    neg a5, a5
1:

    bltu a2, a3, __addsf_ye_gt_xe

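// Illustrative summary of register state on entry to the two addsf_core
// instantiations below (added commentary, derived from the code above):
//   a2, a3 = biased exponents of x and y, each known to be in 1..254
//   a4, a5 = significands of x and y as (1 << 26) | (frac << 3), i.e. an
//            implicit one at bit 26 with 3 low guard bits, negated (two's
//            complement) when the corresponding input sign bit was set
//   t0     = 255 (all-ones exponent), t1 = 25 (alignment-shift cutoff)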

// The main body is repeated twice with different register assignments.
// lhs is the more-significant addend:
.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
    sub \packed_rhs, \exp_lhs, \exp_rhs
    // If there is a large exponent difference then there is no effect on lhs
.if \rhs_is_x
    bgeu \packed_rhs, t1, __addsf_return_y
.else
    bgeu \packed_rhs, t1, __addsf_return_x
.endif
    // Shift rhs down to correct relative significance
    sra \packed_lhs, \sig_rhs, \packed_rhs
    // Set sticky bit if ones were shifted out
    sll \packed_rhs, \packed_lhs, \packed_rhs
    sltu \packed_rhs, \packed_rhs, \sig_rhs
    or \packed_lhs, \packed_lhs, \packed_rhs
    // Add significands
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Detect exact cancellation (may be beyond max normalisation shift; also
    // IEEE 754 requires +0 for exact cancellation, no matter input signs)
    beqz \sig_lhs, __addsf_return_0
    // Convert two's complement back to sign + magnitude
    srai \exp_rhs, \sig_lhs, 31
    xor \sig_lhs, \sig_lhs, \exp_rhs
    sub \sig_lhs, \sig_lhs, \exp_rhs
    // Renormalise significand: bit 31 is now implicit one
    clz \packed_lhs, \sig_lhs
    sll \sig_lhs, \sig_lhs, \packed_lhs
    // Adjust exponent
    addi \packed_lhs, \packed_lhs, -5
    sub \exp_lhs, \exp_lhs, \packed_lhs

    // Round to nearest, even on tie (bias upward if above odd number)
    bexti \packed_lhs, \sig_lhs, 8
    addi \sig_lhs, \sig_lhs, 127
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Exponent may increase by one due to rounding up from all-ones; this is
    // detected by clearing of implicit one (there is a carry-out too)
    bgez \sig_lhs, 3f
4:
    // Detect underflow/overflow
    bgeu \exp_lhs, t0, 1f

    // Pack and return
    packh \exp_lhs, \exp_lhs, \exp_rhs
    slli \exp_lhs, \exp_lhs, 23
    slli \sig_lhs, \sig_lhs, 1
    srli \sig_lhs, \sig_lhs, 9
    add a0, \sig_lhs, \exp_lhs
    ret
1:
    bgez \exp_lhs, 2f
    // Signed zero on underflow
    slli a0, \exp_rhs, 31
    ret
2:
    // Signed infinity on overflow
    packh a0, t0, \exp_rhs
    slli a0, a0, 23
    ret
3:
    // Exponent increase due to rounding (uncommon)
    srli \sig_lhs, \sig_lhs, 1
    addi \exp_lhs, \exp_lhs, 1
    j 4b
.endm

__addsf_xe_gte_ye:
    addsf_core a0, a1, a4, a5, a2, a3, 0
.p2align 2
__addsf_ye_gt_xe:
    addsf_core a1, a0, a5, a4, a3, a2, 1
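
// Rough C model of addsf_core (added as an illustrative sketch only; the
// variable names below do not appear elsewhere in this file):
//
//   d = exp_lhs - exp_rhs;                        // >= 0 by choice of entry point
//   if ((unsigned)d >= 25) return lhs;            // rhs cannot affect the result
//   r = sig_rhs >> d;                             // arithmetic shift to align rhs
//   r |= (unsigned)(r << d) < (unsigned)sig_rhs;  // sticky bit for any lost ones
//   s = sig_lhs + r;                              // two's complement sum
//   if (s == 0) return +0.0f;                     // exact cancellation
//
// The remainder converts s back to sign + magnitude, renormalises so the
// leading one sits at bit 31, adjusts the exponent, rounds, and repacks
// (signed inf on overflow, signed zero on underflow).
//
// The "addi 127" plus "add bit 8" pair implements round-to-nearest-even:
// discarded bits above one half of an LSB always carry into bit 8, an exact
// half carries only when bit 8 (the result LSB) is already 1, and anything
// smaller never carries.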

__addsf_x_nan_inf:
    // When at least one operand is nan, we must propagate at least one of
    // those nan payloads (sign of nan result is unspecified, which we take
    // advantage of by implementing x - y as x + -y). Check x nan vs inf:
    bnez a4, __addsf_return_x
__addsf_x_inf:
    // If x is +-inf, need to distinguish the following cases:
    bne a3, t0, __addsf_return_x  // y is neither inf nor nan -> return x (propagate inf)
    bnez a5, __addsf_return_y     // y is nan: -> return y (propagate nan)
    xor a5, a0, a1
    srli a5, a5, 31
    beqz a5, __addsf_return_x     // y is inf of same sign -> return either x or y (x is faster)
    li a0, -1                     // y is inf of different sign -> return nan
    ret

__addsf_y_nan_inf:
    // Mirror of __addsf_x_nan_inf
    bnez a5, __addsf_return_y
__addsf_y_inf:
    bne a2, t0, __addsf_return_y
    bnez a4, __addsf_return_x
    xor a4, a0, a1
    srli a4, a4, 31
    beqz a4, __addsf_return_x
    li a0, -1
    ret

__addsf_return_y_flushed:
    bnez a3, 1f
    srli a1, a1, 23
    slli a1, a1, 23
1:
__addsf_return_y:
    mv a0, a1
__addsf_return_x:
    ret
__addsf_return_0:
    li a0, 0
    ret


float_section __mulsf3
.global __mulsf3
.p2align 2
__mulsf3:
    // Force y to be positive (by possibly negating x) *before* unpacking.
    // This allows many special cases to be handled without repacking.
    bgez a1, 1f
    binvi a0, a0, 31
1:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Check special cases
    li t0, 255
    beqz a2, __mulsf_x_0
    beqz a3, __mulsf_y_0
    beq a2, t0, __mulsf_x_nan_inf
    beq a3, t0, __mulsf_y_nan_inf

    // Finish unpacking sign
    srai a6, a0, 31
    // Unpack significand (with implicit one in MSB)
    slli a4, a0, 8
    slli a5, a1, 8
    bseti a4, a4, 31
    bseti a5, a5, 31
    // Get full 64-bit multiply result in a4:a1 (one cycle each half)
    // Going from Q1.23 to Q2.46 (both left-justified)
    mul a1, a4, a5
    mulhu a4, a4, a5
    // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
    // final significand (ignoring rounding)
    clz a0, a4
    sll a4, a4, a0
    sub a2, a2, a0
    add a2, a2, a3
    // Subtract redundant bias term (127), add 1 for normalisation correction
    addi a2, a2, -126
    blez a2, __mulsf_underflow

    // Gather sticky bits from low fraction:
    snez a1, a1
    or a4, a4, a1
    // Round to nearest, even on tie (aka bias upward if odd)
    bexti a1, a4, 8
    add a4, a4, a1
    addi a4, a4, 127
    // Check carry-out: exponent may increase due to rounding
    bgez a4, 2f
1:
    bge a2, t0, __mulsf_overflow
    // Pack it and ship it
    packh a2, a2, a6
    slli a2, a2, 23
    slli a4, a4, 1
    srli a4, a4, 9
    add a0, a4, a2
    ret
2:
    srli a4, a4, 1
    addi a2, a2, 1
    j 1b
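
// Added note on the exponent bookkeeping above (an illustrative derivation,
// not part of the original comments): with biased input exponents ex and ey,
// the true result exponent is (ex - 127) + (ey - 127) + c, where c = 1 when
// the Q2.46 product is >= 2.0 (clz result 0) and c = 0 otherwise (clz result
// 1). Re-biasing gives ex + ey - 127 + c = (ex - clz) + ey - 126, which is
// exactly the value accumulated into a2 before the underflow check.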

__mulsf_underflow:
    // Signed zero
    slli a0, a6, 31
    ret
__mulsf_overflow:
    // Signed inf
    packh a0, t0, a6
    slli a0, a0, 23
    ret

__mulsf_x_0:
    // 0 times nan -> propagate nan
    // 0 times inf -> generate nan
    // 0 times others -> 0 (need to flush significand too as we are FTZ)
    bne a3, t0, __mulsf_return_flushed_x
    slli a5, a1, 9
    beqz a5, 1f
    // Propagate nan from y
__mulsf_return_y:
    mv a0, a1
    ret
1:
    // Generate new nan
    li a0, -1
    ret

__mulsf_y_0:
    // Mirror image of x_0 except we still return x for signed 0, since the
    // signs were already resolved.
    bne a2, t0, __mulsf_return_flushed_x
    slli a1, a0, 9
    bnez a1, 1f
    li a0, -1
1:
    ret

__mulsf_return_flushed_x:
    // If we don't support subnormals we at least need to flush to a canonical
    // zero. This is just a sign bit in bit 31.
    srli a0, a0, 31
    slli a0, a0, 31
__mulsf_return_x:
    ret

__mulsf_x_nan_inf:
    // We know that y is not zero and is positive. So...
    // x is nan -> return x
    // else y is nan -> return y
    // else y is inf -> return x
    // else y is normal -> return x
    // (the order of the first two clauses is actually our free choice)
    slli a4, a0, 9
    bnez a4, __mulsf_return_x
    bne a3, t0, __mulsf_return_x
    slli a5, a1, 9
    bnez a5, __mulsf_return_y
    ret // return x

__mulsf_y_nan_inf:
    // We know that x is not zero, nan, nor inf. That just leaves normals.
    // y is nan -> return y
    // y is inf -> return inf * sgn(x) (since we already merged the signs)
    slli a5, a1, 9
    bnez a5, __mulsf_return_y
    srai a0, a0, 31
    packh a0, t0, a0
    slli a0, a0, 23
    ret


// This is a hack to improve soft float performance for the routines we don't
// implement here (e.g. those in libm) in libraries built against a non-Zbb
// ISA dialect:
float_section __clzsi2
.global __clzsi2
__clzsi2:
    clz a0, a0
    ret
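
// Usage note (added commentary, not part of the original file): these entry
// points follow the libgcc soft-float ABI for RV32 ilp32 builds, so each
// float operand and result is its raw IEEE-754 bit pattern in an integer
// argument register (a0/a1 in, a0 out). For example, a soft-float compile of
// the following function (the name is arbitrary):
//
//   float madd(float a, float b, float c) { return a * b + c; }
//
// emits calls to __mulsf3 followed by __addsf3.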