Skip to content

Commit

Permalink
speed up Float16 conversions a bit (JuliaLang#29891)
Browse files Browse the repository at this point in the history
  • Loading branch information
JeffBezanson committed Nov 6, 2018
1 parent 618ee77 commit 0d4edb3
Showing 1 changed file with 37 additions and 35 deletions.
72 changes: 37 additions & 35 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,17 @@ function Float16(val::Float32)
return reinterpret(Float16, t ⊻ ((f >> 0xd) % UInt16))
end
i = (f >> 23) & 0x1ff + 1
sh = shifttable[i]
@inbounds sh = shifttable[i]
f &= 0x007fffff
h::UInt16 = basetable[i] + (f >> sh)
@inbounds h = (basetable[i] + (f >> sh)) % UInt16
# round
# NOTE: we maybe should ignore NaNs here, but the payload is
# getting truncated anyway so "rounding" it might not matter
nextbit = (f >> (sh-1)) & 1
if nextbit != 0
# Round halfway to even or check lower bits
if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
h += 1
h += UInt16(1)
end
end
reinterpret(Float16, h)
Expand All @@ -179,7 +179,7 @@ function Float32(val::Float16)
bit = bit >> 1
end
sign = sign << 31
exp = (-14 - n_bit + 127) << 23
exp = ((-14 - n_bit + 127) << 23) % UInt32
sig = ((sig & (~bit)) << n_bit) << (23 - 10)
ret = sign | exp | sig
end
Expand All @@ -195,7 +195,7 @@ function Float32(val::Float16)
end
else
sign = sign << 31
exp = (exp - 15 + 127) << 23
exp = ((exp - 15 + 127) << 23) % UInt32
sig = sig << (23 - 10)
ret = sign | exp | sig
end
Expand All @@ -206,37 +206,39 @@ end
# "Fast Half Float Conversion" by Jeroen van der Zijp
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf

const basetable = Vector{UInt16}(undef, 512)
const shifttable = Vector{UInt8}(undef, 512)

for i = 0:255
e = i - 127
if e < -24 # Very small numbers map to zero
basetable[i|0x000+1] = 0x0000
basetable[i|0x100+1] = 0x8000
shifttable[i|0x000+1] = 24
shifttable[i|0x100+1] = 24
elseif e < -14 # Small numbers map to denorms
basetable[i|0x000+1] = (0x0400>>(-e-14))
basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
shifttable[i|0x000+1] = -e-1
shifttable[i|0x100+1] = -e-1
elseif e <= 15 # Normal numbers just lose precision
basetable[i|0x000+1] = ((e+15)<<10)
basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
shifttable[i|0x000+1] = 13
shifttable[i|0x100+1] = 13
elseif e < 128 # Large numbers map to Infinity
basetable[i|0x000+1] = 0x7C00
basetable[i|0x100+1] = 0xFC00
shifttable[i|0x000+1] = 24
shifttable[i|0x100+1] = 24
else # Infinity and NaN's stay Infinity and NaN's
basetable[i|0x000+1] = 0x7C00
basetable[i|0x100+1] = 0xFC00
shifttable[i|0x000+1] = 13
shifttable[i|0x100+1] = 13
# Build the two 512-entry lookup tables used by the Float32 -> Float16
# conversion. Slot `k + 1` (k in 0:255) holds the entry for biased exponent `k`
# with the sign bit clear; slot `k + 257` holds the same entry with the sign
# bit set (0x8000 or-ed into the base value).
#
# The scratch vectors live only inside the `let`; the published constants are
# immutable tuples.
let bases = Vector{UInt16}(undef, 512),
    shifts = Vector{UInt8}(undef, 512)
    for biased in 0:255
        e = biased - 127          # unbiased Float32 exponent
        pos = biased + 1          # slot for sign bit clear
        neg = biased + 257        # slot for sign bit set
        if e < -24
            # Very small numbers map to zero
            base, shift = 0x0000, 24
        elseif e < -14
            # Small numbers map to denorms
            base, shift = 0x0400 >> (-e - 14), -e - 1
        elseif e <= 15
            # Normal numbers just lose precision
            base, shift = (e + 15) << 10, 13
        elseif e < 128
            # Large numbers map to Infinity
            base, shift = 0x7C00, 24
        else
            # Infinity and NaN's stay Infinity and NaN's
            base, shift = 0x7C00, 13
        end
        bases[pos] = base
        bases[neg] = base | 0x8000
        shifts[pos] = shift
        shifts[neg] = shift
    end
    global const shifttable = (shifts...,)
    global const basetable = (bases...,)
end

#convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x)
Expand Down

0 comments on commit 0d4edb3

Please sign in to comment.