speed up Float16 conversions a bit (JuliaLang#29891)

halleysfifthinc · Nov 6, 2018 · 0d4edb3 · 0d4edb3
1 parent 618ee77
commit 0d4edb3
Showing 1 changed file with 37 additions and 35 deletions.
diff --git a/base/float.jl b/base/float.jl
@@ -144,17 +144,17 @@ function Float16(val::Float32)
  return reinterpret(Float16, t ⊻ ((f >> 0xd) % UInt16))
  end
  i = (f >> 23) & 0x1ff + 1
- sh = shifttable[i]
+ @inbounds sh = shifttable[i]
  f &= 0x007fffff
- h::UInt16 = basetable[i] + (f >> sh)
+ @inbounds h = (basetable[i] + (f >> sh)) % UInt16
  # round
  # NOTE: we maybe should ignore NaNs here, but the payload is
  # getting truncated anyway so "rounding" it might not matter
  nextbit = (f >> (sh-1)) & 1
  if nextbit != 0
  # Round halfway to even or check lower bits
  if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
- h += 1
+ h += UInt16(1)
  end
  end
  reinterpret(Float16, h)
@@ -179,7 +179,7 @@ function Float32(val::Float16)
  bit = bit >> 1
  end
  sign = sign << 31
- exp = (-14 - n_bit + 127) << 23
+ exp = ((-14 - n_bit + 127) << 23) % UInt32
  sig = ((sig & (~bit)) << n_bit) << (23 - 10)
  ret = sign | exp | sig
  end
@@ -195,7 +195,7 @@ function Float32(val::Float16)
  end
  else
  sign = sign << 31
- exp = (exp - 15 + 127) << 23
+ exp = ((exp - 15 + 127) << 23) % UInt32
  sig = sig << (23 - 10)
  ret = sign | exp | sig
  end
@@ -206,37 +206,39 @@ end
 # "Fast Half Float Conversion" by Jeroen van der Zijp
 # ftp:https://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
 
-const basetable = Vector{UInt16}(undef, 512)
-const shifttable = Vector{UInt8}(undef, 512)
-
-for i = 0:255
- e = i - 127
- if e < -24 # Very small numbers map to zero
- basetable[i|0x000+1] = 0x0000
- basetable[i|0x100+1] = 0x8000
- shifttable[i|0x000+1] = 24
- shifttable[i|0x100+1] = 24
- elseif e < -14 # Small numbers map to denorms
- basetable[i|0x000+1] = (0x0400>>(-e-14))
- basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
- shifttable[i|0x000+1] = -e-1
- shifttable[i|0x100+1] = -e-1
- elseif e <= 15 # Normal numbers just lose precision
- basetable[i|0x000+1] = ((e+15)<<10)
- basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
- shifttable[i|0x000+1] = 13
- shifttable[i|0x100+1] = 13
- elseif e < 128 # Large numbers map to Infinity
- basetable[i|0x000+1] = 0x7C00
- basetable[i|0x100+1] = 0xFC00
- shifttable[i|0x000+1] = 24
- shifttable[i|0x100+1] = 24
- else # Infinity and NaN's stay Infinity and NaN's
- basetable[i|0x000+1] = 0x7C00
- basetable[i|0x100+1] = 0xFC00
- shifttable[i|0x000+1] = 13
- shifttable[i|0x100+1] = 13
+let _basetable = Vector{UInt16}(undef, 512),
+ _shifttable = Vector{UInt8}(undef, 512)
+ for i = 0:255
+ e = i - 127
+  if e < -24 # Very small numbers map to zero
+  _basetable[i|0x000+1] = 0x0000
+  _basetable[i|0x100+1] = 0x8000
+  _shifttable[i|0x000+1] = 24
+  _shifttable[i|0x100+1] = 24
+ elseif e < -14 # Small numbers map to denorms
+  _basetable[i|0x000+1] = (0x0400>>(-e-14))
+  _basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
+  _shifttable[i|0x000+1] = -e-1
+  _shifttable[i|0x100+1] = -e-1
+ elseif e <= 15 # Normal numbers just lose precision
+  _basetable[i|0x000+1] = ((e+15)<<10)
+  _basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
+  _shifttable[i|0x000+1] = 13
+  _shifttable[i|0x100+1] = 13
+ elseif e < 128 # Large numbers map to Infinity
+  _basetable[i|0x000+1] = 0x7C00
+  _basetable[i|0x100+1] = 0xFC00
+  _shifttable[i|0x000+1] = 24
+  _shifttable[i|0x100+1] = 24
+ else # Infinity and NaN's stay Infinity and NaN's
+  _basetable[i|0x000+1] = 0x7C00
+  _basetable[i|0x100+1] = 0xFC00
+  _shifttable[i|0x000+1] = 13
+  _shifttable[i|0x100+1] = 13
+ end
  end
+ global const shifttable = (_shifttable...,)
+ global const basetable = (_basetable...,)
 end
 
 #convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x)