Skip to content

Commit

Permalink
Backport LLVM patches to fix X86 partial register stall
Browse files Browse the repository at this point in the history
  • Loading branch information
yuyichao authored and Keno committed Jan 17, 2017
1 parent fab13f2 commit df891e0
Show file tree
Hide file tree
Showing 6 changed files with 1,928 additions and 1 deletion.
7 changes: 6 additions & 1 deletion deps/llvm.mk
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ $(eval $(call LLVM_PATCH,llvm-r282182)) # Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-3.9.0_cygwin)) # R283427, Remove for 4.0
endif
$(eval $(call LLVM_PATCH,llvm-PR22923)) # Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31))
$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31)) # Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-D25865-cmakeshlib)) # Remove for 4.0
# Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365
$(eval $(call LLVM_PATCH,llvm-3.9.0_threads))
Expand All @@ -505,6 +505,11 @@ $(eval $(call LLVM_PATCH,llvm-D27397)) # Julia issue #19792, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-D28009)) # Julia issue #19792, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-D28215_FreeBSD_shlib))
$(eval $(call LLVM_PATCH,llvm-D28221-avx512)) # mentioned in issue #19797
$(eval $(call LLVM_PATCH,llvm-PR276266)) # Issue #19976, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-PR278088)) # Issue #19976, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-PR277939)) # Issue #19976, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-PR278321)) # Issue #19976, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-PR278923)) # Issue #19976, Remove for 4.0
endif # LLVM_VER

ifeq ($(LLVM_VER),3.7.1)
Expand Down
51 changes: 51 additions & 0 deletions deps/patches/llvm-PR276266.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
From 64d1e8b748bca22ce205eab7634cc5418c827f18 Mon Sep 17 00:00:00 2001
From: Marina Yatsina <[email protected]>
Date: Thu, 21 Jul 2016 12:37:07 +0000
Subject: [PATCH 3/5] ExecutionDepsFix - Fix bug in clearance calculation

The clearance calculation did not take into account registers defined as outputs or clobbers in inline assembly machine instructions because these register defs are implicit.

Differential Revision: https://reviews.llvm.org/D22580



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@276266 91177308-0d34-0410-b5e6-96231b3b80d8
---
lib/CodeGen/ExecutionDepsFix.cpp | 2 --
test/CodeGen/X86/break-false-dep.ll | 10 ++++++++++
2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 566b8d507b2..1fe5f459b69 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -520,8 +520,6 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
- if (MO.isImplicit())
- break;
if (MO.isUse())
continue;
for (int rx : regIndices(MO.getReg())) {
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
index 74a0728f918..a7cda499dab 100644
--- a/test/CodeGen/X86/break-false-dep.ll
+++ b/test/CodeGen/X86/break-false-dep.ll
@@ -199,3 +199,13 @@ for.end16: ; preds = %for.inc14
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmovsd [[XMM0]],
}
+
+define double @inlineasmdep(i64 %arg) {
+top:
+ tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"()
+ %tmp1 = sitofp i64 %arg to double
+ ret double %tmp1
+;AVX-LABEL:@inlineasmdep
+;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]]
+;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
+}
--
2.11.0

169 changes: 169 additions & 0 deletions deps/patches/llvm-PR277939.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
From 9790ab8bccdbc71dfcc166860ab6ce9c369bf686 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <[email protected]>
Date: Sat, 6 Aug 2016 21:21:12 +0000
Subject: [PATCH 1/5] [X86][AVX2] Improve sign/zero extension on AVX2 targets

Split extensions to large vectors into 256-bit chunks - the equivalent of what we do with pre-AVX2 into 128-bit chunks

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277939 91177308-0d34-0410-b5e6-96231b3b80d8
---
lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++++-------
test/CodeGen/X86/vec_int_to_fp.ll | 24 ++++++++----------------
test/CodeGen/X86/vector-sext.ll | 10 ++--------
3 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ca205335013..2bbedd4bd97 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -30164,11 +30164,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
: DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
}

- // On pre-AVX2 targets, split into 128-bit nodes of
- // ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
- unsigned NumVecs = VT.getSizeInBits() / 128;
- unsigned NumSubElts = 128 / SVT.getSizeInBits();
+ auto SplitAndExtendInReg = [&](unsigned SplitSize) {
+ unsigned NumVecs = VT.getSizeInBits() / SplitSize;
+ unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

@@ -30176,14 +30174,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
DAG.getIntPtrConstant(Offset, DL));
- SrcVec = ExtendVecSize(DL, SrcVec, 128);
+ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
SrcVec = Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
Opnds.push_back(SrcVec);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
- }
+ };
+
+ // On pre-AVX2 targets, split into 128-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
+ return SplitAndExtendInReg(128);
+
+ // On pre-AVX512 targets, split into 256-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
+ return SplitAndExtendInReg(256);

return SDValue();
}
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 43f5318a607..5d8f91385c7 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -153,8 +153,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
@@ -325,8 +324,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
@@ -543,8 +541,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
@@ -778,8 +775,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
@@ -958,8 +954,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
@@ -1134,8 +1129,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x float>
@@ -1456,8 +1450,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
;
; AVX2-LABEL: uitofp_16i8_to_4f32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
@@ -1813,8 +1806,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
;
; AVX2-LABEL: uitofp_16i8_to_8f32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x float>
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index 018c5922a43..e29f3e5f91f 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -407,15 +407,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX2-LABEL: sext_16i8_to_8i64:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
;
--
2.11.0

Loading

0 comments on commit df891e0

Please sign in to comment.