forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Backport LLVM patches to fix X86 partial register stall
Fix JuliaLang#19976
- Loading branch information
Showing
6 changed files
with
1,928 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
From 64d1e8b748bca22ce205eab7634cc5418c827f18 Mon Sep 17 00:00:00 2001 | ||
From: Marina Yatsina <[email protected]> | ||
Date: Thu, 21 Jul 2016 12:37:07 +0000 | ||
Subject: [PATCH 3/5] ExecutionDepsFix - Fix bug in clearance calculation | ||
|
||
The clearance calculation did not take into account registers defined as outputs or clobbers in inline assembly machine instructions because these register defs are implicit. | ||
|
||
Differential Revision: https://reviews.llvm.org/D22580 | ||
|
||
|
||
|
||
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@276266 91177308-0d34-0410-b5e6-96231b3b80d8 | ||
--- | ||
lib/CodeGen/ExecutionDepsFix.cpp | 2 -- | ||
test/CodeGen/X86/break-false-dep.ll | 10 ++++++++++ | ||
2 files changed, 10 insertions(+), 2 deletions(-) | ||
|
||
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp | ||
index 566b8d507b2..1fe5f459b69 100644 | ||
--- a/lib/CodeGen/ExecutionDepsFix.cpp | ||
+++ b/lib/CodeGen/ExecutionDepsFix.cpp | ||
@@ -520,8 +520,6 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { | ||
MachineOperand &MO = MI->getOperand(i); | ||
if (!MO.isReg()) | ||
continue; | ||
- if (MO.isImplicit()) | ||
- break; | ||
if (MO.isUse()) | ||
continue; | ||
for (int rx : regIndices(MO.getReg())) { | ||
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll | ||
index 74a0728f918..a7cda499dab 100644 | ||
--- a/test/CodeGen/X86/break-false-dep.ll | ||
+++ b/test/CodeGen/X86/break-false-dep.ll | ||
@@ -199,3 +199,13 @@ for.end16: ; preds = %for.inc14 | ||
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] | ||
;AVX-NEXT: vmovsd [[XMM0]], | ||
} | ||
+ | ||
+define double @inlineasmdep(i64 %arg) { | ||
+top: | ||
+ tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"() | ||
+ %tmp1 = sitofp i64 %arg to double | ||
+ ret double %tmp1 | ||
+;AVX-LABEL:@inlineasmdep | ||
+;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]] | ||
+;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}} | ||
+} | ||
-- | ||
2.11.0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
From 9790ab8bccdbc71dfcc166860ab6ce9c369bf686 Mon Sep 17 00:00:00 2001 | ||
From: Simon Pilgrim <[email protected]> | ||
Date: Sat, 6 Aug 2016 21:21:12 +0000 | ||
Subject: [PATCH 1/5] [X86][AVX2] Improve sign/zero extension on AVX2 targets | ||
|
||
Split extensions to large vectors into 256-bit chunks - the equivalent of what we do with pre-AVX2 into 128-bit chunks | ||
|
||
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277939 91177308-0d34-0410-b5e6-96231b3b80d8 | ||
--- | ||
lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++++------- | ||
test/CodeGen/X86/vec_int_to_fp.ll | 24 ++++++++---------------- | ||
test/CodeGen/X86/vector-sext.ll | 10 ++-------- | ||
3 files changed, 25 insertions(+), 31 deletions(-) | ||
|
||
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp | ||
index ca205335013..2bbedd4bd97 100644 | ||
--- a/lib/Target/X86/X86ISelLowering.cpp | ||
+++ b/lib/Target/X86/X86ISelLowering.cpp | ||
@@ -30164,11 +30164,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, | ||
: DAG.getZeroExtendVectorInReg(ExOp, DL, VT); | ||
} | ||
|
||
- // On pre-AVX2 targets, split into 128-bit nodes of | ||
- // ISD::*_EXTEND_VECTOR_INREG. | ||
- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) { | ||
- unsigned NumVecs = VT.getSizeInBits() / 128; | ||
- unsigned NumSubElts = 128 / SVT.getSizeInBits(); | ||
+ auto SplitAndExtendInReg = [&](unsigned SplitSize) { | ||
+ unsigned NumVecs = VT.getSizeInBits() / SplitSize; | ||
+ unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); | ||
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); | ||
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); | ||
|
||
@@ -30176,14 +30174,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, | ||
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { | ||
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, | ||
DAG.getIntPtrConstant(Offset, DL)); | ||
- SrcVec = ExtendVecSize(DL, SrcVec, 128); | ||
+ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); | ||
SrcVec = Opcode == ISD::SIGN_EXTEND | ||
? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) | ||
: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); | ||
Opnds.push_back(SrcVec); | ||
} | ||
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); | ||
- } | ||
+ }; | ||
+ | ||
+ // On pre-AVX2 targets, split into 128-bit nodes of | ||
+ // ISD::*_EXTEND_VECTOR_INREG. | ||
+ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) | ||
+ return SplitAndExtendInReg(128); | ||
+ | ||
+ // On pre-AVX512 targets, split into 256-bit nodes of | ||
+ // ISD::*_EXTEND_VECTOR_INREG. | ||
+ if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) | ||
+ return SplitAndExtendInReg(256); | ||
|
||
return SDValue(); | ||
} | ||
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll | ||
index 43f5318a607..5d8f91385c7 100644 | ||
--- a/test/CodeGen/X86/vec_int_to_fp.ll | ||
+++ b/test/CodeGen/X86/vec_int_to_fp.ll | ||
@@ -153,8 +153,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: sitofp_16i8_to_2f64: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 | ||
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 | ||
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 | ||
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 | ||
; AVX2-NEXT: # kill | ||
; AVX2-NEXT: vzeroupper | ||
@@ -325,8 +324,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: sitofp_16i8_to_4f64: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 | ||
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 | ||
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 | ||
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 | ||
; AVX2-NEXT: retq | ||
%cvt = sitofp <16 x i8> %a to <16 x double> | ||
@@ -543,8 +541,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: uitofp_16i8_to_2f64: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero | ||
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero | ||
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero | ||
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 | ||
; AVX2-NEXT: # kill | ||
; AVX2-NEXT: vzeroupper | ||
@@ -778,8 +775,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: uitofp_16i8_to_4f64: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero | ||
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero | ||
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero | ||
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 | ||
; AVX2-NEXT: retq | ||
%cvt = uitofp <16 x i8> %a to <16 x double> | ||
@@ -958,8 +954,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: sitofp_16i8_to_4f32: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 | ||
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 | ||
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 | ||
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 | ||
; AVX2-NEXT: # kill | ||
; AVX2-NEXT: vzeroupper | ||
@@ -1134,8 +1129,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: sitofp_16i8_to_8f32: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 | ||
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 | ||
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 | ||
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 | ||
; AVX2-NEXT: retq | ||
%cvt = sitofp <16 x i8> %a to <16 x float> | ||
@@ -1456,8 +1450,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: uitofp_16i8_to_4f32: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero | ||
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero | ||
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero | ||
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 | ||
; AVX2-NEXT: # kill | ||
; AVX2-NEXT: vzeroupper | ||
@@ -1813,8 +1806,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { | ||
; | ||
; AVX2-LABEL: uitofp_16i8_to_8f32: | ||
; AVX2: # BB#0: | ||
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero | ||
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero | ||
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero | ||
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 | ||
; AVX2-NEXT: retq | ||
%cvt = uitofp <16 x i8> %a to <16 x float> | ||
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll | ||
index 018c5922a43..e29f3e5f91f 100644 | ||
--- a/test/CodeGen/X86/vector-sext.ll | ||
+++ b/test/CodeGen/X86/vector-sext.ll | ||
@@ -407,15 +407,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp | ||
; | ||
; AVX2-LABEL: sext_16i8_to_8i64: | ||
; AVX2: # BB#0: # %entry | ||
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero | ||
-; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 | ||
-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 | ||
-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2 | ||
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2 | ||
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] | ||
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero | ||
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 | ||
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 | ||
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 | ||
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 | ||
; AVX2-NEXT: vmovdqa %ymm2, %ymm0 | ||
; AVX2-NEXT: retq | ||
; | ||
-- | ||
2.11.0 | ||
|
Oops, something went wrong.