1/*
2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#include "pico/asm_helper.S"
8#include "hardware/hazard3.h"
9
10// This file reimplements some common single-precision soft float routines
11// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
12// and is tuned for Hazard3 execution timings.
13
14// Subnormal values are always flushed to zero on both input and output.
15// Rounding is always to nearest (even on tie).
16
pico_default_asm_setup

// Place each routine in its own named linker section so that unused routines
// can be garbage-collected, and so that PICO_FLOAT_IN_RAM can redirect the
// code into a RAM section (avoiding XIP flash fetch latency) instead of the
// default flash section.
.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
26
float_section __addsf3
.global __subsf3
.p2align 2
// float __subsf3(float x, float y): single-precision subtraction.
// Implemented as x + (-y); IEEE 754 leaves the sign of a nan result
// unspecified, so unconditionally flipping y's sign bit is legal even
// when y is nan.
__subsf3:
    binvi a1, a1, 31
.global __addsf3
// float __addsf3(float x, float y): single-precision addition.
// Register roles: a2/a3 = biased exponents of x/y; a4/a5 = working
// significands of x/y; t0 = 255 (all-ones exponent => inf/nan);
// t1 = 25 (alignment shift limit). FTZ on input and output; RNE rounding.
__addsf3:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Flush-to-zero => 0 + y = y applies, including nan, with the sole
    // exception of y being subnormal (which also needs to be flushed)
    beqz a2, __addsf_return_y_flushed
    // Don't have to handle this case for x + 0 = 0 because we already know x
    // is nonzero
    beqz a3, __addsf_return_x
    // Unpack significand, plus 3 extra zeroes for working space:
    slli a4, a0, 9
    slli a5, a1, 9
    // check nan/inf on input
    li t0, 255
    beq a2, t0, __addsf_x_nan_inf
    beq a3, t0, __addsf_y_nan_inf
    // (finish unpacking significand: fraction now in bits 25:3, with 3 low
    // round/sticky bits of working space)
    srli a4, a4, 6
    srli a5, a5, 6

    // If we're still on the straight path then we are adding two normal
    // values. Add implicit one (1.xx...xx000)
    bseti a4, a4, 23 + 3
    bseti a5, a5, 23 + 3
    // Negate significand if sign bit is set: two's complement form lets the
    // core handle mixed-sign operands with a single add
    bgez a0, 1f
    neg a4, a4
1:
    // (tuck this 16-bit instruction here to avoid alignment penalty)
    li t1, 25
    bgez a1, 1f
    neg a5, a5
1:

    // Dispatch on which operand has the larger exponent (falls through to
    // __addsf_xe_gte_ye when xe >= ye)
    bltu a2, a3, __addsf_ye_gt_xe
69
// The main body is repeated twice with different register assignments.
// lhs is the more-significant addend (exp_lhs >= exp_rhs). Significands are
// in two's complement with 3 low round bits; packed_lhs/packed_rhs (the
// original packed operands) are reused as scratch. Returns directly to the
// caller of __addsf3.
.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
    // packed_rhs = exponent difference = alignment shift for rhs
    sub \packed_rhs, \exp_lhs, \exp_rhs
    // If there is a large exponent difference then there is no effect on lhs
.if \rhs_is_x
    bgeu \packed_rhs, t1, __addsf_return_y
.else
    bgeu \packed_rhs, t1, __addsf_return_x
.endif
    // Shift rhs down to correct relative significance (arithmetic shift,
    // since significands are currently two's complement)
    sra \packed_lhs, \sig_rhs, \packed_rhs
    // Set sticky bit if ones were shifted out (shift back up and compare
    // against the original to detect lost bits)
    sll \packed_rhs, \packed_lhs, \packed_rhs
    sltu \packed_rhs, \packed_rhs, \sig_rhs
    or \packed_lhs, \packed_lhs, \packed_rhs
    // Add significands
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Detect exact cancellation (may be beyond max normalisation shift; also
    // IEEE 754 requires +0 for exact cancellation, no matter input signs)
    beqz \sig_lhs, __addsf_return_0
    // Convert two's complement back to sign + magnitude; exp_rhs now holds
    // the result sign mask (0 or -1), consumed by the packing code below
    srai \exp_rhs, \sig_lhs, 31
    xor \sig_lhs, \sig_lhs, \exp_rhs
    sub \sig_lhs, \sig_lhs, \exp_rhs
    // Renormalise significand: bit 31 is now implicit one
    clz \packed_lhs, \sig_lhs
    sll \sig_lhs, \sig_lhs, \packed_lhs
    // Adjust exponent: the pre-normalisation MSB position was bit 26, so the
    // exponent changes by (5 - clz)
    addi \packed_lhs, \packed_lhs, -5
    sub \exp_lhs, \exp_lhs, \packed_lhs

    // Round to nearest, even on tie (bias upward if above odd number):
    // final LSB is bit 8, so add 0x7f plus a copy of bit 8
    bexti \packed_lhs, \sig_lhs, 8
    addi \sig_lhs, \sig_lhs, 127
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Exponent may increase by one due to rounding up from all-ones; this is
    // detected by clearing of implicit one (there is a carry-out too)
    bgez \sig_lhs, 3f
4:
    // Detect underflow/overflow (single unsigned compare: a negative
    // exponent wraps to a large unsigned value, so both cases branch)
    bgeu \exp_lhs, t0, 1f

    // Pack and return: byte-pack exponent with sign mask, shift into place,
    // then drop implicit one and round bits from the significand
    packh \exp_lhs, \exp_lhs, \exp_rhs
    slli \exp_lhs, \exp_lhs, 23
    slli \sig_lhs, \sig_lhs, 1
    srli \sig_lhs, \sig_lhs, 9
    add a0, \sig_lhs, \exp_lhs
    ret
1:
    bgez \exp_lhs, 2f
    // Signed zero on underflow
    slli a0, \exp_rhs, 31
    ret
2:
    // Signed infinity on overflow
    packh a0, t0, \exp_rhs
    slli a0, a0, 23
    ret
3:
    // Exponent increase due to rounding (uncommon)
    srli \sig_lhs, \sig_lhs, 1
    addi \exp_lhs, \exp_lhs, 1
    j 4b
.endm
136
// Two instantiations of the core with mirrored register assignments:
// fall through from __addsf3 when xe >= ye (x is the more-significant
// addend); branch target __addsf_ye_gt_xe handles the opposite case.
__addsf_xe_gte_ye:
    addsf_core a0, a1, a4, a5, a2, a3, 0
.p2align 2
__addsf_ye_gt_xe:
    addsf_core a1, a0, a5, a4, a3, a2, 1
142
__addsf_x_nan_inf:
    // When at least one operand is nan, we must propagate at least one of
    // those nan payloads (sign of nan result is unspecified, which we take
    // advantage of by implementing x - y as x + -y). Check x nan vs inf:
    // a4 = x << 9 is nonzero iff x has a nan payload.
    bnez a4, __addsf_return_x
__addsf_x_inf:
    // If x is +-inf, need to distinguish the following cases:
    bne  a3, t0, __addsf_return_x // y is neither inf nor nan   -> return x (propagate inf)
    bnez a5,     __addsf_return_y // y is nan:                  -> return y (propagate nan)
    xor a5, a0, a1
    srli a5, a5, 31
    beqz a5,     __addsf_return_x // y is inf of same sign      -> return either x or y (x is faster)
    li a0, -1                     // y is inf of different sign -> return nan
    ret

__addsf_y_nan_inf:
    // Mirror of __addsf_x_nan_inf (x already known to be neither inf nor nan)
    bnez a5, __addsf_return_y
__addsf_y_inf:
    bne  a2, t0, __addsf_return_y
    bnez a4,     __addsf_return_x
    xor a4, a0, a1
    srli a4, a4, 31
    beqz a4,     __addsf_return_x
    li a0, -1
    ret
169
__addsf_return_y_flushed:
    // x was zero/subnormal. If y's exponent is also zero, flush y to a
    // signed zero by clearing its 23 fraction bits (sign bit preserved).
    bnez a3, 1f
    srli a1, a1, 23
    slli a1, a1, 23
1:
__addsf_return_y:
    mv a0, a1
__addsf_return_x:
    ret
__addsf_return_0:
    // Exact cancellation: IEEE 754 requires +0 regardless of input signs
    li a0, 0
    ret
182
183
float_section __mulsf3
.global __mulsf3
.p2align 2
// float __mulsf3(float x, float y): single-precision multiplication.
// Register roles: a2/a3 = biased exponents; a4/a5 = significands (then
// a4:a1 = 64-bit product); a6 = result sign mask (0 or -1); t0 = 255.
// FTZ on input and output; RNE rounding.
__mulsf3:
    // Force y to be positive (by possibly negating x) *before* unpacking.
    // This allows many special cases to be handled without repacking.
    bgez a1, 1f
    binvi a0, a0, 31
1:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Check special cases
    li t0, 255
    beqz a2, __mulsf_x_0
    beqz a3, __mulsf_y_0
    beq a2, t0, __mulsf_x_nan_inf
    beq a3, t0, __mulsf_y_nan_inf

    // Finish unpacking sign (a6 = 0 for positive, -1 for negative)
    srai a6, a0, 31
    // Unpack significand (with implicit one in MSB)
    slli a4, a0, 8
    slli a5, a1, 8
    bseti a4, a4, 31
    bseti a5, a5, 31
    // Get full 64-bit multiply result in a4:a1 (one cycle each half)
    // Going from Q1.23 to Q2.46 (both left-justified)
    mul a1, a4, a5
    mulhu a4, a4, a5
    // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
    // final significand (ignoring rounding)
    clz a0, a4
    sll a4, a4, a0
    sub a2, a2, a0
    add a2, a2, a3
    // Subtract redundant bias term (127), add 1 for normalisation correction
    addi a2, a2, -126
    blez a2, __mulsf_underflow

    // Gather sticky bits from low fraction:
    snez a1, a1
    or a4, a4, a1
    // Round to nearest, even on tie (aka bias upward if odd):
    // add 0x7f plus a copy of the final LSB (bit 8)
    bexti a1, a4, 8
    add a4, a4, a1
    addi a4, a4, 127
    // Check carry-out: exponent may increase due to rounding
    bgez a4, 2f
1:
    bge a2, t0, __mulsf_overflow
    // Pack it and ship it: sign byte over exponent byte, then fraction with
    // implicit one and round bits stripped by the slli/srli pair
    packh a2, a2, a6
    slli a2, a2, 23
    slli a4, a4, 1
    srli a4, a4, 9
    add a0, a4, a2
    ret
2:
    // Rounding carry-out cleared the implicit one: renormalise (uncommon)
    srli a4, a4, 1
    addi a2, a2, 1
    j 1b
246
__mulsf_underflow:
    // Signed zero (FTZ: subnormal results are flushed)
    slli a0, a6, 31
    ret
__mulsf_overflow:
    // Signed inf: sign byte packed over all-ones exponent, shifted into place
    packh a0, t0, a6
    slli a0, a0, 23
    ret
256
__mulsf_x_0:
    // 0 times nan    -> propagate nan
    // 0 times inf    -> generate nan
    // 0 times others -> 0 (need to flush significand too as we are FTZ)
    bne a3, t0, __mulsf_return_flushed_x
    // y is inf or nan: a nonzero fraction (y << 9) distinguishes nan from inf
    slli a5, a1, 9
    beqz a5, 1f
    // Propagate nan from y
__mulsf_return_y:
    mv a0, a1
    ret
1:
    // Generate new nan (0 * inf is an invalid operation)
    li a0, -1
    ret

__mulsf_y_0:
    // Mirror image of x_0 except we still return x for signed 0, since the
    // signs were already resolved.
    bne a2, t0, __mulsf_return_flushed_x
    slli a1, a0, 9
    bnez a1, 1f                   // x is nan -> return x (propagate nan)
    li a0, -1                     // x is inf -> inf * 0 = generated nan
1:
    ret
282
__mulsf_return_flushed_x:
    // If we don't support subnormals we at least need to flush to a canonical
    // zero. This is just a sign bit in bit 31 (all other bits cleared).
    srli a0, a0, 31
    slli a0, a0, 31
__mulsf_return_x:
    ret
290
__mulsf_x_nan_inf:
    // We know that y is not zero and is positive. So...
    //      x is nan    -> return x
    // else y is nan    -> return y
    // else y is inf    -> return x
    // else y is normal -> return x
    // (the order of the first two clauses is actually our free choice)
    slli a4, a0, 9                // nonzero fraction => x is nan
    bnez a4, __mulsf_return_x
    bne a3, t0, __mulsf_return_x
    slli a5, a1, 9                // nonzero fraction => y is nan
    bnez a5, __mulsf_return_y
    ret // return x

__mulsf_y_nan_inf:
    // We know that x is not zero, nan, nor inf. That just leaves normals.
    // y is nan -> return y
    // y is inf -> return inf * sgn(x) (since we already merged the signs)
    slli a5, a1, 9
    bnez a5, __mulsf_return_y
    srai a0, a0, 31
    packh a0, t0, a0
    slli a0, a0, 23
    ret
315
316
// This is a hack to improve soft float performance for the routines we don't
// implement (e.g. libm) in libraries built against a non-Zbb ISA dialect:
float_section __clzsi2
// int __clzsi2(unsigned int x): count leading zeros.
// __clzsi2 is the symbol name libgcc actually emits calls to, so it must be
// spelled exactly that way to override libgcc's generic implementation; the
// previous misspelt __clz2si symbol is kept as an alias for any existing
// direct references.
.global __clzsi2
.global __clz2si
__clzsi2:
__clz2si:
    clz a0, a0
    ret
324