Add Float16 to supported x86 processors (#46499)

* Add float16 multiversioning for x86

Co-authored-by: pchintalapudi <[email protected]>
Co-authored-by: Mosè Giordano <[email protected]>
JuliaLang · Feb 8, 2023 · ee82caa · ee82caa
1 parent 8b4b1f0
commit ee82caa
Show file tree

Hide file tree

Showing 5 changed files with 66 additions and 30 deletions.
diff --git a/src/features_x86.h b/src/features_x86.h
@@ -74,11 +74,13 @@ JL_FEATURE_DEF(enqcmd, 32 * 3 + 29, 0)
 // EAX=7,ECX=0: EDX
 // JL_FEATURE_DEF(avx5124vnniw, 32 * 4 + 2, ?????)
 // JL_FEATURE_DEF(avx5124fmaps, 32 * 4 + 3, ?????)
+JL_FEATURE_DEF(uintr, 32 * 4 + 5, 140000)
 JL_FEATURE_DEF(avx512vp2intersect, 32 * 4 + 8, 0)
 JL_FEATURE_DEF(serialize, 32 * 4 + 14, 110000)
 JL_FEATURE_DEF(tsxldtrk, 32 * 4 + 16, 110000)
 JL_FEATURE_DEF(pconfig, 32 * 4 + 18, 0)
 JL_FEATURE_DEF_NAME(amx_bf16, 32 * 4 + 22, 110000, "amx-bf16")
+JL_FEATURE_DEF(avx512fp16, 32 * 4 + 23, 140000)
 JL_FEATURE_DEF_NAME(amx_tile, 32 * 4 + 24, 110000, "amx-tile")
 JL_FEATURE_DEF_NAME(amx_int8, 32 * 4 + 25, 110000, "amx-int8")
 

diff --git a/src/llvm-demote-float16.cpp b/src/llvm-demote-float16.cpp
@@ -47,35 +47,22 @@ INST_STATISTIC(FCmp);
 
 extern JuliaOJIT *jl_ExecutionEngine;
 
-Optional<bool> always_have_fp16() {
-#if defined(_CPU_X86_) || defined(_CPU_X86_64_)
- // x86 doesn't support fp16
- // TODO: update for sapphire rapids when it comes out
- return false;
-#else
- return {};
-#endif
-}
-
 namespace {
 
 // Returns true when the target-feature string that applies to `caller`
 // advertises native half-precision support: "+fp16fml" or "+fullfp16" when
 // built for AArch64, "+avx512fp16" when built for x86-64. Returns false in
 // every other case, including builds for any other architecture.
 bool have_fp16(Function &caller) {
- auto unconditional = always_have_fp16();
- if (unconditional.hasValue())
- return unconditional.getValue();
-
  // Use the function's own "target-features" attribute when it is valid;
  // otherwise fall back to the feature string reported by jl_ExecutionEngine.
  Attribute FSAttr = caller.getFnAttribute("target-features");
  StringRef FS =
  FSAttr.isValid() ? FSAttr.getValueAsString() : jl_ExecutionEngine->getTargetFeatureString();
 #if defined(_CPU_AARCH64_)
  if (FS.find("+fp16fml") != llvm::StringRef::npos || FS.find("+fullfp16") != llvm::StringRef::npos){
  return true;
  }
-#else
+#elif defined(_CPU_X86_64_)
  if (FS.find("+avx512fp16") != llvm::StringRef::npos){
  return true;
  }
 #endif
  // When neither architecture branch above is compiled in, FS is not read
  // anywhere else; the cast below is presumably there to avoid an
  // unused-variable warning in that configuration.
+ (void)FS;
  return false;
 }
 

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
@@ -45,8 +45,6 @@ using namespace llvm;
 
 extern Optional<bool> always_have_fma(Function&);
 
-extern Optional<bool> always_have_fp16();
-
 void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction &I)> should_replace, MDNode *tbaa_const);
 
 namespace {
@@ -490,13 +488,12 @@ uint32_t CloneCtx::collect_func_info(Function &F)
  flag |= JL_TARGET_CLONE_MATH;
  }
  }
- if(!always_have_fp16().hasValue()){
- for (size_t i = 0; i < I.getNumOperands(); i++) {
- if(I.getOperand(i)->getType()->isHalfTy()){
- flag |= JL_TARGET_CLONE_FLOAT16;
- }
- // Check for BFloat16 when they are added to julia can be done here
+
+ for (size_t i = 0; i < I.getNumOperands(); i++) {
+ if(I.getOperand(i)->getType()->isHalfTy()){
+ flag |= JL_TARGET_CLONE_FLOAT16;
  }
+ // Check for BFloat16 when they are added to julia can be done here
  }
  if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH) &&
  (flag & JL_TARGET_CLONE_CPU) && (flag & JL_TARGET_CLONE_FLOAT16)) {

diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp
@@ -154,6 +154,9 @@ static constexpr FeatureDep deps[] = {
  {avx512vnni, avx512f},
  {avx512vp2intersect, avx512f},
  {avx512vpopcntdq, avx512f},
+ {avx512fp16, avx512bw},
+ {avx512fp16, avx512dq},
+ {avx512fp16, avx512vl},
  {amx_int8, amx_tile},
  {amx_bf16, amx_tile},
  {sse4a, sse3},
@@ -208,8 +211,8 @@ constexpr auto tigerlake = icelake | get_feature_masks(avx512vp2intersect, movdi
 constexpr auto alderlake = skylake | get_feature_masks(clwb, sha, waitpkg, shstk, gfni, vaes, vpclmulqdq, pconfig,
  rdpid, movdiri, pku, movdir64b, serialize, ptwrite, avxvnni);
 constexpr auto sapphirerapids = icelake_server |
- get_feature_masks(amx_tile, amx_int8, amx_bf16, avx512bf16, serialize, cldemote, waitpkg,
- ptwrite, tsxldtrk, enqcmd, shstk, avx512vp2intersect, movdiri, movdir64b);
+ get_feature_masks(amx_tile, amx_int8, amx_bf16, avx512bf16, avx512fp16, serialize, cldemote, waitpkg,
+ avxvnni, uintr, ptwrite, tsxldtrk, enqcmd, shstk, avx512vp2intersect, movdiri, movdir64b);
 
 constexpr auto k8_sse3 = get_feature_masks(sse3, cx16);
 constexpr auto amdfam10 = k8_sse3 | get_feature_masks(sse4a, lzcnt, popcnt, sahf);
@@ -930,10 +933,10 @@ static void ensure_jit_target(bool imaging)
  Feature::avx512pf, Feature::avx512er,
  Feature::avx512cd, Feature::avx512bw,
  Feature::avx512vl, Feature::avx512vbmi,
- Feature::avx512vpopcntdq,
+ Feature::avx512vpopcntdq, Feature::avxvnni,
  Feature::avx512vbmi2, Feature::avx512vnni,
  Feature::avx512bitalg, Feature::avx512bf16,
- Feature::avx512vp2intersect};
+ Feature::avx512vp2intersect, Feature::avx512fp16};
  for (auto fe: clone_math) {
  if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
  t.en.flags |= JL_TARGET_CLONE_MATH;
@@ -946,6 +949,13 @@ static void ensure_jit_target(bool imaging)
  break;
  }
  }
+ static constexpr uint32_t clone_fp16[] = {Feature::avx512fp16};
+ for (auto fe: clone_fp16) {
+ if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
+ t.en.flags |= JL_TARGET_CLONE_FLOAT16;
+ break;
+ }
+ }
  }
 }
 

diff --git a/test/llvmpasses/float16.ll b/test/llvmpasses/float16.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p
-; RUN: opt -enable-new-pm=0 -load libjulia-codegen%shlibext -DemoteFloat16 -S %s | FileCheck %s
-; RUN: opt -enable-new-pm=1 --load-pass-plugin=libjulia-codegen%shlibext -passes='DemoteFloat16' -S %s | FileCheck %s
+; RUN: opt -enable-new-pm=0  -load libjulia-codegen%shlibext -DemoteFloat16 -S %s | FileCheck %s
+; RUN: opt -enable-new-pm=1  --load-pass-plugin=libjulia-codegen%shlibext -passes='DemoteFloat16' -S %s | FileCheck %s
 
-define half @demotehalf_test(half %a, half %b) {
+define half @demotehalf_test(half %a, half %b) #0 {
+top:
 ; CHECK-LABEL: @demotehalf_test(
 ; CHECK-NEXT: top:
 ; CHECK-NEXT: %0 = fpext half %a to float
@@ -44,6 +45,42 @@ define half @demotehalf_test(half %a, half %b) {
 ; CHECK-NEXT: %36 = fadd float %34, %35
 ; CHECK-NEXT: %37 = fptrunc float %36 to half
 ; CHECK-NEXT: ret half %37
+;
+ %0 = fadd half %a, %b
+ %1 = fadd half %0, %b
+ %2 = fadd half %1, %b
+ %3 = fmul half %2, %b
+ %4 = fdiv half %3, %b
+ %5 = insertelement <2 x half> undef, half %a, i32 0
+ %6 = insertelement <2 x half> %5, half %b, i32 1
+ %7 = insertelement <2 x half> undef, half %b, i32 0
+ %8 = insertelement <2 x half> %7, half %b, i32 1
+ %9 = fadd <2 x half> %6, %8
+ %10 = extractelement <2 x half> %9, i32 0
+ %11 = extractelement <2 x half> %9, i32 1
+ %12 = fadd half %10, %11
+ %13 = fadd half %12, %4
+ ret half %13
+}
+
+define half @native_half_test(half %a, half %b) #1 {
+; CHECK-LABEL: @native_half_test(
+; CHECK-NEXT top:
+; CHECK-NEXT %0 = fadd half %a, %b
+; CHECK-NEXT %1 = fadd half %0, %b
+; CHECK-NEXT %2 = fadd half %1, %b
+; CHECK-NEXT %3 = fmul half %2, %b
+; CHECK-NEXT %4 = fdiv half %3, %b
+; CHECK-NEXT %5 = insertelement <2 x half> undef, half %a, i32 0
+; CHECK-NEXT %6 = insertelement <2 x half> %5, half %b, i32 1
+; CHECK-NEXT %7 = insertelement <2 x half> undef, half %b, i32 0
+; CHECK-NEXT %8 = insertelement <2 x half> %7, half %b, i32 1
+; CHECK-NEXT %9 = fadd <2 x half> %6, %8
+; CHECK-NEXT %10 = extractelement <2 x half> %9, i32 0
+; CHECK-NEXT %11 = extractelement <2 x half> %9, i32 1
+; CHECK-NEXT %12 = fadd half %10, %11
+; CHECK-NEXT %13 = fadd half %12, %4
+; CHECK-NEXT ret half %13
 ;
 top:
  %0 = fadd half %a, %b
@@ -62,3 +99,6 @@ top:
  %13 = fadd half %12, %4
  ret half %13
 }
+
+attributes #0 = { "target-features"="-avx512fp16" }
+attributes #1 = { "target-features"="+avx512fp16" }