Skip to content

Commit

Permalink
Move F16 table to a better place
Browse files Browse the repository at this point in the history
  • Loading branch information
Keno committed Jul 1, 2019
1 parent 00d8694 commit 7fa1332
Showing 1 changed file with 39 additions and 39 deletions.
78 changes: 39 additions & 39 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,45 @@ function Float32(x::Int128)
reinterpret(Float32, s | d + y)
end

# Float32 -> Float16 algorithm from:
# "Fast Half Float Conversion" by Jeroen van der Zijp
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf

# Build the two 512-entry lookup tables `basetable` and `shifttable`.
#
# For every value `biased` in 0:255 (treated as an exponent with bias 127) two
# slots are filled: slot `biased + 1` and slot `biased + 257`. The second slot
# holds the same shift and the same base with bit 0x8000 additionally set.
# NOTE(review): presumably the upper half corresponds to a set sign bit and the
# tables are consumed by `Float16(::Float32)` — confirm against that method.
let bases = Vector{UInt16}(undef, 512),
    shifts = Vector{UInt8}(undef, 512)
    for biased = 0:255
        unbiased = biased - 127
        lo = biased + 1      # same slot as (biased | 0x000) + 1
        hi = biased + 257    # same slot as (biased | 0x100) + 1
        # Choose the (base, shift) pair for this exponent class.
        magnitude, shift = if unbiased < -24
            # Very small numbers map to zero
            (0x0000, 24)
        elseif unbiased < -14
            # Small numbers map to denorms
            (0x0400 >> (-unbiased - 14), -unbiased - 1)
        elseif unbiased <= 15
            # Normal numbers just lose precision
            ((unbiased + 15) << 10, 13)
        elseif unbiased < 128
            # Large numbers map to Infinity
            (0x7C00, 24)
        else
            # Infinity and NaN's stay Infinity and NaN's
            (0x7C00, 13)
        end
        bases[lo] = magnitude
        bases[hi] = magnitude | 0x8000
        shifts[lo] = shift
        shifts[hi] = shift
    end
    # Freeze the scratch vectors into immutable tuples bound to global constants.
    global const shifttable = (shifts...,)
    global const basetable = (bases...,)
end

function Float16(val::Float32)
f = reinterpret(UInt32, val)
if isnan(val)
Expand Down Expand Up @@ -202,45 +241,6 @@ function Float32(val::Float16)
return reinterpret(Float32, ret)
end

# Float32 -> Float16 algorithm from:
# "Fast Half Float Conversion" by Jeroen van der Zijp
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf

# Fill the 512-entry `basetable` / `shifttable` lookup tables.
#
# The tables are laid out as two 256-entry halves. Entries in the second half
# (index offset 0x100) carry the same shift as the first half and the same
# base with bit 0x8000 OR-ed in. Within a half, index `i + 1` belongs to the
# exponent value `e = i - 127`.
# NOTE(review): the consumer of these tables is not visible here; presumably
# it is the Float32 -> Float16 conversion — confirm.
let _basetable = Vector{UInt16}(undef, 512),
    _shifttable = Vector{UInt8}(undef, 512)
    for negative in (false, true)
        offset = negative ? 0x100 : 0x000
        signbit = negative ? 0x8000 : 0x0000
        for i = 0:255
            e = i - 127
            idx = (i | offset) + 1
            if e >= 128
                # Infinity and NaN's stay Infinity and NaN's
                _basetable[idx] = 0x7C00 | signbit
                _shifttable[idx] = 13
            elseif e > 15
                # Large numbers map to Infinity
                _basetable[idx] = 0x7C00 | signbit
                _shifttable[idx] = 24
            elseif e >= -14
                # Normal numbers just lose precision
                _basetable[idx] = ((e + 15) << 10) | signbit
                _shifttable[idx] = 13
            elseif e >= -24
                # Small numbers map to denorms
                _basetable[idx] = (0x0400 >> (-e - 14)) | signbit
                _shifttable[idx] = -e - 1
            else
                # Very small numbers map to zero
                _basetable[idx] = signbit
                _shifttable[idx] = 24
            end
        end
    end
    # Publish the finished tables as immutable global tuples.
    global const shifttable = (_shifttable...,)
    global const basetable = (_basetable...,)
end

#convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x)
# Narrow a Float64 to Float32 via the `fptrunc` intrinsic.
function Float32(x::Float64)
    return fptrunc(Float32, x)
end

# Narrow a Float64 to Float16 in two steps: first to Float32, then to Float16.
# NOTE(review): going through Float32 may round twice for some inputs — confirm
# whether that is acceptable for callers.
function Float16(x::Float64)
    narrowed = Float32(x)
    return Float16(narrowed)
end
Expand Down

0 comments on commit 7fa1332

Please sign in to comment.