Skip to content

Commit

Permalink
fixed Float16 from Float64 and BigFloat (#42837)
Browse files Browse the repository at this point in the history
* fixed Float16 from Float64 and BigFloat. Many thanks to Jamison.
  • Loading branch information
oscardssmith committed Nov 4, 2021
1 parent ee36c13 commit 707c57c
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 3 deletions.
19 changes: 17 additions & 2 deletions base/mpfr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -338,8 +338,23 @@ Float32(x::BigFloat, r::MPFRRoundingMode=ROUNDING_MODE[]) =
_cpynansgn(ccall((:mpfr_get_flt,:libmpfr), Float32, (Ref{BigFloat}, MPFRRoundingMode), x, r), x)
Float32(x::BigFloat, r::RoundingMode) = Float32(x, convert(MPFRRoundingMode, r))

# TODO: avoid double rounding
Float16(x::BigFloat) = Float16(Float64(x))
function Float16(x::BigFloat) :: Float16
res = Float32(x)
resi = reinterpret(UInt32, res)
if (resi&0x7fffffff) < 0x38800000 # if Float16(res) is subnormal
#shift so that the mantissa lines up where it would for normal Float16
shift = 113-((resi & 0x7f800000)>>23)
if shift<23
resi |= 0x0080_0000 # set implicit bit
resi >>= shift
end
end
if (resi & 0x1fff == 0x1000) # if we are halfway between 2 Float16 values
# adjust the value by 1 ULP in the direction that will make Float16(res) give the right answer
res = nextfloat(res, cmp(x, res))
end
return res
end

promote_rule(::Type{BigFloat}, ::Type{<:Real}) = BigFloat
promote_rule(::Type{BigInt}, ::Type{<:AbstractFloat}) = BigFloat
Expand Down
19 changes: 18 additions & 1 deletion src/runtime_intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,24 @@ JL_DLLEXPORT uint16_t __gnu_f2h_ieee(float param)

JL_DLLEXPORT uint16_t __truncdfhf2(double param)
{
return float_to_half((float)param);
float res = (float)param;
uint32_t resi;
memcpy(&resi, &res, sizeof(res));
if ((resi&0x7fffffffu) < 0x38800000u){ // if Float16(res) is subnormal
// shift so that the mantissa lines up where it would for normal Float16
uint32_t shift = 113u-((resi & 0x7f800000u)>>23u);
if (shift<23u) {
resi |= 0x00800000; // set implicit bit
resi >>= shift;
}
}
if ((resi & 0x1fffu) == 0x1000u) { // if we are halfway between 2 Float16 values
memcpy(&resi, &res, sizeof(res));
// adjust the value by 1 ULP in the direction that will make Float16(res) give the right answer
resi += (fabs(res) < fabs(param)) - (fabs(param) < fabs(res));
memcpy(&res, &resi, sizeof(res));
}
return float_to_half(res);
}

#endif
Expand Down
22 changes: 22 additions & 0 deletions test/float16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,25 @@ const minsubf16_32 = Float32(minsubf16)

# issues #33076
@test Float16(1f5) == Inf16

@testset "conversion to Float16 from" begin
for T in (Float32, Float64, BigFloat)
@testset "conversion from $T" begin
for i in 1:2^16
f = reinterpret(Float16, UInt16(i-1))
isfinite(f) || continue
if f < 0
epsdown = T(eps(f))/2
epsup = issubnormal(f) ? epsdown : T(eps(nextfloat(f)))/2
else
epsup = T(eps(f))/2
epsdown = issubnormal(f) ? epsup : T(eps(prevfloat(f)))/2
end
@test isequal(f*(-1)^(f === Float16(0)), Float16(nextfloat(T(f) - epsdown)))
@test isequal(f*(-1)^(f === -Float16(0)), Float16(prevfloat(T(f) + epsup)))
@test isequal(prevfloat(f), Float16(prevfloat(T(f) - epsdown)))
@test isequal(nextfloat(f), Float16(nextfloat(T(f) + epsup)))
end
end
end
end

0 comments on commit 707c57c

Please sign in to comment.