[X86] LowerShiftByScalarImmediate - move shl(x,1) -> add(freeze(x),freeze(x)) to X86FixupInstTunings #161007
Conversation
Avoid the shl(x,1) -> add(freeze(x),freeze(x)) fold if the shift-by-immediate is legal, and leave it to X86FixupInstTunings. Helps avoid missed optimisations due to oneuse limits, avoids unnecessary freezes and allows AVX512 to fold to mi memory folding variants.

Fixes llvm#161006
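For context (illustration only, not part of the patch): the rewrite rests on the scalar identity that a left shift by one equals adding a value to itself, and on the fact that such a result always has a clear low bit. A minimal standalone C++ check of both facts, for concrete (non-undef) inputs:

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Illustration only: shl(x,1) and add(x,x) agree for any concrete value,
// and both always produce an even result (LSB == 0). The freeze subtlety
// the patch moves around only matters for *undefined* inputs.
int main() {
  for (uint32_t X : {0u, 1u, 0x7fffffffu, 0xdeadbeefu, 0xffffffffu}) {
    uint32_t Shl = X << 1; // wraps modulo 2^32, like each vector lane
    uint32_t Add = X + X;
    assert(Shl == Add);
    assert((Shl & 1u) == 0u); // the low bit is always clear
  }
  return 0;
}
```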
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: Avoid the shl(x,1) -> add(freeze(x),freeze(x)) fold if the shift-by-immediate is legal, and leave it to X86FixupInstTunings. Helps avoid missed optimisations due to oneuse limits, avoids unnecessary freezes and allows AVX512 to fold to mi memory folding variants. Fixes #161006

Patch is 58.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161007.diff

22 Files Affected:
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 33dc0a232815c..a1d4e0bc62310 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};
+ // Is ADD(X,X) more efficient than SHL(X,1)?
+ auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
+ if (MI.getOperand(NumOperands - 1).getImm() != 1)
+ return false;
+ if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
+ return false;
+ LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+ {
+ MI.setDesc(TII->get(AddOpc));
+ MI.removeOperand(NumOperands - 1);
+ MI.addOperand(MI.getOperand(NumOperands - 2));
+ }
+ LLVM_DEBUG(dbgs() << " With: " << MI);
+ return true;
+ };
+
switch (Opc) {
case X86::BLENDPDrri:
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
case X86::VUNPCKHPSZrmkz:
return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
+
+ case X86::PSLLWri:
+ return ProcessShiftLeftToAdd(X86::PADDWrr);
+ case X86::VPSLLWri:
+ return ProcessShiftLeftToAdd(X86::VPADDWrr);
+ case X86::VPSLLWYri:
+ return ProcessShiftLeftToAdd(X86::VPADDWYrr);
+ case X86::VPSLLWZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
+ case X86::VPSLLWZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
+ case X86::VPSLLWZri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZrr);
+ case X86::PSLLDri:
+ return ProcessShiftLeftToAdd(X86::PADDDrr);
+ case X86::VPSLLDri:
+ return ProcessShiftLeftToAdd(X86::VPADDDrr);
+ case X86::VPSLLDYri:
+ return ProcessShiftLeftToAdd(X86::VPADDDYrr);
+ case X86::VPSLLDZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
+ case X86::VPSLLDZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
+ case X86::VPSLLDZri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZrr);
+ case X86::PSLLQri:
+ return ProcessShiftLeftToAdd(X86::PADDQrr);
+ case X86::VPSLLQri:
+ return ProcessShiftLeftToAdd(X86::VPADDQrr);
+ case X86::VPSLLQYri:
+ return ProcessShiftLeftToAdd(X86::VPADDQYrr);
+ case X86::VPSLLQZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
+ case X86::VPSLLQZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
+ case X86::VPSLLQZri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZrr);
+
default:
return false;
}
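Illustration only (this is not the MachineInstr API): the new ProcessShiftLeftToAdd hook rewrites a shift-left-by-immediate-1 in place by retargeting the descriptor, dropping the immediate operand, and duplicating the source operand, so e.g. VPSLLD dst, src, 1 becomes VPADD dst, src, src. A rough standalone sketch of that operand surgery, with the NewOpcPreferable profitability check omitted and toy names made up for the example:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy stand-in for a machine instruction: an opcode name plus a flat
// operand list such as {"dst", "src", "imm:1"}. Purely illustrative,
// not a real LLVM class.
struct ToyInstr {
  std::string Opcode;
  std::vector<std::string> Ops;
};

// Mirrors the shape of ProcessShiftLeftToAdd: bail out unless the shift
// amount is 1, otherwise retarget the opcode, drop the immediate and
// append a second copy of the source register.
static bool shiftLeftToAdd(ToyInstr &MI, const std::string &AddOpc) {
  if (MI.Ops.back() != "imm:1")
    return false;
  MI.Opcode = AddOpc;              // MI.setDesc(TII->get(AddOpc))
  MI.Ops.pop_back();               // MI.removeOperand(NumOperands - 1)
  std::string Src = MI.Ops.back();
  MI.Ops.push_back(Src);           // MI.addOperand(<source operand>)
  return true;
}

int main() {
  ToyInstr MI{"VPSLLDZri", {"zmm0", "zmm1", "imm:1"}};
  assert(shiftLeftToAdd(MI, "VPADDDZrr"));
  assert(MI.Opcode == "VPADDDZrr");
  assert((MI.Ops == std::vector<std::string>{"zmm0", "zmm1", "zmm1"}));
  return 0;
}
```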
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3af673d951f65..f12eec1063389 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30312,22 +30312,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
- if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
- // Hardware support for vector shifts is sparse which makes us scalarize the
- // vector operations in many cases. Also, on sandybridge ADD is faster than
- // shl: (shl V, 1) -> (add (freeze V), (freeze V))
- if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
- // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
- // must be 0). (add undef, undef) however can be any value. To make this
- // safe, we must freeze R to ensure that register allocation uses the same
- // register for an undefined value. This ensures that the result will
- // still be even and preserves the original semantics.
- R = DAG.getFreeze(R);
- return DAG.getNode(ISD::ADD, dl, VT, R, R);
- }
-
+ if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
- }
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
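The comment being deleted above carries the key semantic point: a possibly-undef input must be frozen before it can be duplicated, because shl(x,1) always yields an even value while the add of two independent undef reads need not. A toy standalone model of that argument (not LLVM code; the "each use may read differently" behaviour of undef is simulated with a seeded RNG):

```cpp
#include <cassert>
#include <cstdint>
#include <random>

// Toy model of the deleted comment: an undef value may take a different
// arbitrary value at every use, whereas a frozen value is pinned to one
// arbitrary value for all of its uses.
struct UndefReg {
  std::mt19937 Rng{42};             // fixed seed, deterministic demo
  uint32_t read() { return Rng(); } // each use: a fresh arbitrary value
};

int main() {
  UndefReg U;

  // shl(undef, 1): a single read shifted left, so the LSB is always 0.
  assert(((U.read() << 1) & 1u) == 0u);

  // add(undef, undef): two independent reads; the sum can be odd, which
  // would break the "result of shl by 1 is even" guarantee.
  bool SawOdd = false;
  for (int I = 0; I < 1000 && !SawOdd; ++I)
    SawOdd = ((U.read() + U.read()) & 1u) != 0u;
  assert(SawOdd);

  // add(freeze(undef), freeze(undef)): freezing pins the value, so both
  // operands agree and the sum is even again, matching shl semantics.
  uint32_t Frozen = U.read();
  assert(((Frozen + Frozen) & 1u) == 0u);
  return 0;
}
```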
diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll
index ff9f995c4765b..51a8bf5b48415 100644
--- a/llvm/test/CodeGen/X86/combine-add.ll
+++ b/llvm/test/CodeGen/X86/combine-add.ll
@@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) {
; SSE-NEXT: psubd %xmm1, %xmm3
; SSE-NEXT: psubd %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: movdqu %xmm3, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 8e4a50ea266c3..ae4d24f91ffc0 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm2, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 98187d61c1f84..6bcbfe1808933 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2187,13 +2187,13 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: paddw %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: paddb %xmm1, %xmm2
@@ -2201,15 +2201,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: paddw %xmm0, %xmm3
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
-; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: psllw $7, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
+; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: psllw $7, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
@@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
-; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll
index 473fecc307ed4..57d557dec11b9 100644
--- a/llvm/test/CodeGen/X86/known-signbits-shl.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll
@@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind {
; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: por %xmm2, %xmm1
; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: paddw %xmm0, %xmm2
+; X64-NEXT: paddw %xmm2, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm3
; X64-NEXT: psraw $1, %xmm3
; X64-NEXT: pcmpeqw %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 4e6f666fa05de..4cde581c10508 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
@@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
@@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
@@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
@@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
@@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
@@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
+; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
; X64-KNL-NEXT: kmovw %k1, %k2
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm2
; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index f53983036a016..5df1867f73c8e 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) {
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2: # %bb.0:
+; SSE2-NEXT: movl b(%rip), %eax
; SSE2-NEXT: movdqa c+144(%rip), %xmm2
; SSE2-NEXT: movdqa c+128(%rip), %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: addl b(%rip), %eax
+; SSE2-NEXT: addl c+128(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
@@ -166,7 +166,7 @@ define void @PR42833() {
; SSE2-NEXT: psubd %xmm2, %xmm4
; SSE2-NEXT: paddd %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm0, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm5
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT: movdqa %xmm2, c+144(%rip)
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
@@ -191,17 +191,17 @@ define void @PR42833() {
;
; SSE42-LABEL: PR42833:
; SSE42: # %bb.0:
+; SSE42-NEXT: movl b(%rip), %eax
; SSE42-NEXT: movdqa c+144(%rip), %xmm1
; SSE42-NEXT: movdqa c+128(%rip), %xmm0
-; SSE42-NEXT: movd %xmm0, %eax
-; SSE42-NEXT: addl b(%rip), %eax
+; SSE42-NEXT: addl c+128(%rip), %eax
; SSE42-NEXT: movd %eax, %xmm2
; SSE42-NEXT: paddd %xmm0, %xmm2
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
; SSE42-NEXT: psubd %xmm1, %xmm3
; SSE42-NEXT: paddd %xmm1, %xmm1
; SSE42-NEXT: movdqa %xmm0, %xmm4
-; SSE42-NEXT: paddd %xmm0, %xmm4
+; SSE42-NEXT: paddd %xmm4, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: movdqa %xmm1, c+144(%rip)
; SSE42-NEXT: movdqa %xmm4, c+128(%rip)
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index ce03f8fad4a19..161e9651a9cf2 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) {
; AVX1-LABEL: PR62286:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX2-NEXT: vpaddd %ymm0, %ymm0, %...
[truncated]
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
Is the constant a different one than before? Otherwise it's not correct to switch the order.
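For reference, the algebra behind that question (a standalone check, not part of the patch, with made-up variable names): moving the mask across the add-to-self is only sound if the mask constant is shifted as well, so the two vpand constants above would indeed have to be different pool entries. Assuming 64-bit lanes to match the vpaddq:

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Standalone check: per 64-bit lane,
//   (x & m) + (x & m)  ==  (x + x) & (m << 1)
// so reordering AND and add-to-self requires the shifted mask constant.
int main() {
  for (uint64_t X : {0x0ull, 0x1ull, 0xdeadbeefcafef00dull, ~0ull})
    for (uint64_t M : {0xffull, 0x0f0f0f0f0f0f0f0full, 0x8000000000000000ull}) {
      uint64_t MaskThenDouble = (X & M) + (X & M);
      uint64_t DoubleThenMask = (X + X) & (M << 1);
      assert(MaskThenDouble == DoubleThenMask);
    }
  return 0;
}
```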