Backport LLVM patches to fix X86 partial register stall

Fix JuliaLang#19976
antoine-levitt · Jan 17, 2017 · df891e0
1 parent fab13f2
commit df891e0
Show file tree

Hide file tree

Showing 6 changed files with 1,928 additions and 1 deletion.
diff --git a/deps/llvm.mk b/deps/llvm.mk
@@ -488,7 +488,7 @@ $(eval $(call LLVM_PATCH,llvm-r282182)) # Remove for 4.0
 $(eval $(call LLVM_PATCH,llvm-3.9.0_cygwin)) # R283427, Remove for 4.0
 endif
 $(eval $(call LLVM_PATCH,llvm-PR22923)) # Remove for 4.0
-$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31))
+$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31)) # Remove for 4.0
 $(eval $(call LLVM_PATCH,llvm-D25865-cmakeshlib)) # Remove for 4.0
 # Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365
 $(eval $(call LLVM_PATCH,llvm-3.9.0_threads))
@@ -505,6 +505,11 @@ $(eval $(call LLVM_PATCH,llvm-D27397)) # Julia issue #19792, Remove for 4.0
 $(eval $(call LLVM_PATCH,llvm-D28009)) # Julia issue #19792, Remove for 4.0
 $(eval $(call LLVM_PATCH,llvm-D28215_FreeBSD_shlib))
 $(eval $(call LLVM_PATCH,llvm-D28221-avx512)) # mentioned in issue #19797
+$(eval $(call LLVM_PATCH,llvm-PR276266)) # Julia issue #19976, Remove for 4.0
+$(eval $(call LLVM_PATCH,llvm-PR278088)) # Julia issue #19976, Remove for 4.0
+$(eval $(call LLVM_PATCH,llvm-PR277939)) # Julia issue #19976, Remove for 4.0
+$(eval $(call LLVM_PATCH,llvm-PR278321)) # Julia issue #19976, Remove for 4.0
+$(eval $(call LLVM_PATCH,llvm-PR278923)) # Julia issue #19976, Remove for 4.0
 endif # LLVM_VER
 
 ifeq ($(LLVM_VER),3.7.1)

diff --git a/deps/patches/llvm-PR276266.patch b/deps/patches/llvm-PR276266.patch
@@ -0,0 +1,51 @@
+From 64d1e8b748bca22ce205eab7634cc5418c827f18 Mon Sep 17 00:00:00 2001
+From: Marina Yatsina <[email protected]>
+Date: Thu, 21 Jul 2016 12:37:07 +0000
+Subject: [PATCH 3/5] ExecutionDepsFix - Fix bug in clearance calculation
+
+The clearance calculation did not take into account registers defined as outputs or clobbers in inline assembly machine instructions because these register defs are implicit.
+
+Differential Revision: https://reviews.llvm.org/D22580
+
+
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@276266 91177308-0d34-0410-b5e6-96231b3b80d8
+---
+ lib/CodeGen/ExecutionDepsFix.cpp | 2 --
+ test/CodeGen/X86/break-false-dep.ll | 10 ++++++++++
+ 2 files changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
+index 566b8d507b2..1fe5f459b69 100644
+--- a/lib/CodeGen/ExecutionDepsFix.cpp
++++ b/lib/CodeGen/ExecutionDepsFix.cpp
+@@ -520,8 +520,6 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+- if (MO.isImplicit())
+- break;
+ if (MO.isUse())
+ continue;
+ for (int rx : regIndices(MO.getReg())) {
+diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
+index 74a0728f918..a7cda499dab 100644
+--- a/test/CodeGen/X86/break-false-dep.ll
++++ b/test/CodeGen/X86/break-false-dep.ll
+@@ -199,3 +199,13 @@ for.end16: ; preds = %for.inc14
+ ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
+ ;AVX-NEXT: vmovsd [[XMM0]],
+ }
++
++define double @inlineasmdep(i64 %arg) {
++top:
++ tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"()
++ %tmp1 = sitofp i64 %arg to double
++ ret double %tmp1
++;AVX-LABEL:@inlineasmdep
++;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]]
++;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
++}
+-- 
+2.11.0
+
diff --git a/deps/patches/llvm-PR277939.patch b/deps/patches/llvm-PR277939.patch
@@ -0,0 +1,169 @@
+From 9790ab8bccdbc71dfcc166860ab6ce9c369bf686 Mon Sep 17 00:00:00 2001
+From: Simon Pilgrim <[email protected]>
+Date: Sat, 6 Aug 2016 21:21:12 +0000
+Subject: [PATCH 1/5] [X86][AVX2] Improve sign/zero extension on AVX2 targets
+
+Split extensions to large vectors into 256-bit chunks - the equivalent of what we do with pre-AVX2 into 128-bit chunks
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277939 91177308-0d34-0410-b5e6-96231b3b80d8
+---
+ lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++++-------
+ test/CodeGen/X86/vec_int_to_fp.ll | 24 ++++++++----------------
+ test/CodeGen/X86/vector-sext.ll | 10 ++--------
+ 3 files changed, 25 insertions(+), 31 deletions(-)
+
+diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
+index ca205335013..2bbedd4bd97 100644
+--- a/lib/Target/X86/X86ISelLowering.cpp
++++ b/lib/Target/X86/X86ISelLowering.cpp
+@@ -30164,11 +30164,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
+ : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
+ }
+
+- // On pre-AVX2 targets, split into 128-bit nodes of
+- // ISD::*_EXTEND_VECTOR_INREG.
+- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
+- unsigned NumVecs = VT.getSizeInBits() / 128;
+- unsigned NumSubElts = 128 / SVT.getSizeInBits();
++ auto SplitAndExtendInReg = [&](unsigned SplitSize) {
++ unsigned NumVecs = VT.getSizeInBits() / SplitSize;
++ unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+ EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+@@ -30176,14 +30174,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
+ for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
+ SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+ DAG.getIntPtrConstant(Offset, DL));
+- SrcVec = ExtendVecSize(DL, SrcVec, 128);
++ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
+ SrcVec = Opcode == ISD::SIGN_EXTEND
+ ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
+ : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
+ Opnds.push_back(SrcVec);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+- }
++ };
++
++ // On pre-AVX2 targets, split into 128-bit nodes of
++ // ISD::*_EXTEND_VECTOR_INREG.
++ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
++ return SplitAndExtendInReg(128);
++
++ // On pre-AVX512 targets, split into 256-bit nodes of
++ // ISD::*_EXTEND_VECTOR_INREG.
++ if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
++ return SplitAndExtendInReg(256);
+
+ return SDValue();
+ }
+diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
+index 43f5318a607..5d8f91385c7 100644
+--- a/test/CodeGen/X86/vec_int_to_fp.ll
++++ b/test/CodeGen/X86/vec_int_to_fp.ll
+@@ -153,8 +153,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: sitofp_16i8_to_2f64:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+ ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+ ; AVX2-NEXT: # kill
+ ; AVX2-NEXT: vzeroupper
+@@ -325,8 +324,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: sitofp_16i8_to_4f64:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+ ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+ ; AVX2-NEXT: retq
+ %cvt = sitofp <16 x i8> %a to <16 x double>
+@@ -543,8 +541,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: uitofp_16i8_to_2f64:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+ ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+ ; AVX2-NEXT: # kill
+ ; AVX2-NEXT: vzeroupper
+@@ -778,8 +775,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: uitofp_16i8_to_4f64:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+ ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+ ; AVX2-NEXT: retq
+ %cvt = uitofp <16 x i8> %a to <16 x double>
+@@ -958,8 +954,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: sitofp_16i8_to_4f32:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+ ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+ ; AVX2-NEXT: # kill
+ ; AVX2-NEXT: vzeroupper
+@@ -1134,8 +1129,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: sitofp_16i8_to_8f32:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+ ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+ ; AVX2-NEXT: retq
+ %cvt = sitofp <16 x i8> %a to <16 x float>
+@@ -1456,8 +1450,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: uitofp_16i8_to_4f32:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+ ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+ ; AVX2-NEXT: # kill
+ ; AVX2-NEXT: vzeroupper
+@@ -1813,8 +1806,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
+ ;
+ ; AVX2-LABEL: uitofp_16i8_to_8f32:
+ ; AVX2: # BB#0:
+-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+ ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+ ; AVX2-NEXT: retq
+ %cvt = uitofp <16 x i8> %a to <16 x float>
+diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
+index 018c5922a43..e29f3e5f91f 100644
+--- a/test/CodeGen/X86/vector-sext.ll
++++ b/test/CodeGen/X86/vector-sext.ll
+@@ -407,15 +407,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
+ ;
+ ; AVX2-LABEL: sext_16i8_to_8i64:
+ ; AVX2: # BB#0: # %entry
+-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+-; AVX2-NEXT: vpslld $24, %xmm1, %xmm1
+-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
+-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2
++; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2
+ ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
+-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
++; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1
+ ; AVX2-NEXT: vmovdqa %ymm2, %ymm0
+ ; AVX2-NEXT: retq
+ ;
+-- 
+2.11.0
+