Skip to content

Commit

Permalink
Fix rounding of Float32->Float16 near eps(Float16(0.0))
Browse files Browse the repository at this point in the history
The originally described algorithm worked only for round-to-zero,
and our adjustment to make it round-to-nearest was incorrect
for values that result in Float16 subnormals. Fix this at the expense
of an extra `|` and `&`.
  • Loading branch information
Keno committed Jul 1, 2019
1 parent 7fa1332 commit df9eb7d
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 12 deletions.
29 changes: 20 additions & 9 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,19 +140,21 @@ end
# Float32 -> Float16 algorithm from:
# "Fast Half Float Conversion" by Jeroen van der Zijp
# ftp:https://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf

#
# With adjustments for round-to-nearest, ties to even.
#
let _basetable = Vector{UInt16}(undef, 512),
_shifttable = Vector{UInt8}(undef, 512)
for i = 0:255
e = i - 127
if e < -24 # Very small numbers map to zero
if e < -25 # Very small numbers map to zero
_basetable[i|0x000+1] = 0x0000
_basetable[i|0x100+1] = 0x8000
_shifttable[i|0x000+1] = 24
_shifttable[i|0x100+1] = 24
_shifttable[i|0x000+1] = 25
_shifttable[i|0x100+1] = 25
elseif e < -14 # Small numbers map to denorms
_basetable[i|0x000+1] = (0x0400>>(-e-14))
_basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
_basetable[i|0x000+1] = 0x0000
_basetable[i|0x100+1] = 0x8000
_shifttable[i|0x000+1] = -e-1
_shifttable[i|0x100+1] = -e-1
elseif e <= 15 # Normal numbers just lose precision
Expand Down Expand Up @@ -182,10 +184,14 @@ function Float16(val::Float32)
t = 0x8000 (0x8000 & ((f >> 0x10) % UInt16))
return reinterpret(Float16, t ((f >> 0xd) % UInt16))
end
i = (f >> 23) & 0x1ff + 1
i = ((f & ~significand_mask(Float32)) >> significand_bits(Float32)) + 1
@inbounds sh = shifttable[i]
f &= 0x007fffff
@inbounds h = (basetable[i] + (f >> sh)) % UInt16
f &= significand_mask(Float32)
# If `val` is subnormal, the tables are set up to force the
# result to 0, so the significand has an implicit `1` in the
# cases we care about.
f |= significand_mask(Float32) + 0x1
@inbounds h = (basetable[i] + (f >> sh) & significand_mask(Float16)) % UInt16
# round
# NOTE: we maybe should ignore NaNs here, but the payload is
# getting truncated anyway so "rounding" it might not matter
Expand Down Expand Up @@ -867,6 +873,11 @@ exponent_one(::Type{Float16}) = 0x3c00
exponent_half(::Type{Float16}) = 0x3800
significand_mask(::Type{Float16}) = 0x03ff

for T in (Float16, Float32, Float64)
@eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
@eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
end

# integer size of float
uinttype(::Type{Float64}) = UInt64
uinttype(::Type{Float32}) = UInt32
Expand Down
5 changes: 2 additions & 3 deletions base/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ import .Base: log, exp, sin, cos, tan, sinh, cosh, tanh, asin,
exp10, expm1, log1p

using .Base: sign_mask, exponent_mask, exponent_one,
exponent_half, uinttype, significand_mask
exponent_half, uinttype, significand_mask,
significand_bits, exponent_bits

using Core.Intrinsics: sqrt_llvm

Expand All @@ -38,8 +39,6 @@ end
end

for T in (Float16, Float32, Float64)
@eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
@eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
@eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T)))
# maximum float exponent
@eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T))
Expand Down
11 changes: 11 additions & 0 deletions test/float16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,14 @@ end

# issue #17148
@test rem(Float16(1.2), Float16(one(1.2))) == 0.20019531f0

# issue #32441
const f16eps2 = Float32(eps(Float16(0.0)))/2
const minsubf16 = nextfloat(Float16(0.0))
const minsubf16_32 = Float32(minsubf16)
@test Float16(f16eps2) == Float16(0.0)
@test Float16(nextfloat(f16eps2)) == minsubf16
@test Float16(prevfloat(minsubf16_32)) == minsubf16
# Ties to even, in this case up
@test Float16(minsubf16_32 + f16eps2) == nextfloat(minsubf16)
@test Float16(prevfloat(minsubf16_32 + f16eps2)) == minsubf16

0 comments on commit df9eb7d

Please sign in to comment.