Skip to content

Commit 05d9ca9

Browse files
committed
recommit:[PowerPC] Eliminate loads/swap feeding swap/store for vector type by using big-endian load/store
On PowerPC, there are instructions that load a vector in big-endian element order even on a little-endian target. So we can combine a vector load + reverse into a big-endian load to eliminate the swap instruction, and likewise combine a vector reverse + store into a big-endian store. Differential Revision: https://reviews.llvm.org/D65063 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@367516 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 4ba0d42 commit 05d9ca9

File tree

6 files changed

+160
-117
lines changed

6 files changed

+160
-117
lines changed

lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
11181118
setTargetDAGCombine(ISD::ANY_EXTEND);
11191119

11201120
setTargetDAGCombine(ISD::TRUNCATE);
1121+
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1122+
11211123

11221124
if (Subtarget.useCRBits()) {
11231125
setTargetDAGCombine(ISD::TRUNCATE);
@@ -1352,6 +1354,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
13521354
case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
13531355
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
13541356
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
1357+
case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
1358+
case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";
13551359
case PPCISD::ST_VSR_SCAL_INT:
13561360
return "PPCISD::ST_VSR_SCAL_INT";
13571361
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
@@ -13113,6 +13117,61 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
1311313117
return Val;
1311413118
}
1311513119

13120+
/// Fold an element-reversing vector shuffle into the memory access next to it.
///
/// Two patterns are handled, both only on little-endian subtargets that have
/// VSX and P9 vector support:
///   (vector_shuffle <reverse> (load Ptr))      -> PPCISD::LOAD_VEC_BE Ptr
///   (store (vector_shuffle <reverse> V), Ptr)  -> PPCISD::STORE_VEC_BE V, Ptr
///
/// \param SVN    The shuffle that is a candidate element reversal.
/// \param LSBase The normal load feeding \p SVN, or the normal store that
///               consumes \p SVN.
/// \param DCI    Combiner state; only its SelectionDAG is used here.
/// \returns The replacement node, or an empty SDValue when the combine does
///          not apply or would not remove any work.
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
                                                LSBaseSDNode *LSBase,
                                                DAGCombinerInfo &DCI) const {
  assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
         "Not a reverse memop pattern!");

  // True when the mask is exactly <N-1, ..., 1, 0>, i.e. walking the mask
  // backwards yields 0, 1, 2, ... Every index then refers to the first
  // shuffle operand, so the second operand never contributes.
  auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
    auto Mask = SVN->getMask();
    int Expected = 0;
    for (auto I = Mask.rbegin(), E = Mask.rend(); I != E; ++I, ++Expected)
      if (*I != Expected)
        return false;
    return true;
  };

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = SVN->getValueType(0);

  if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
    return SDValue();

  // Before P9, the PPCVSXSwapRemoval pass adjusts the element order itself
  // (see the comment in PPCVSXSwapRemoval.cpp). This combine conflicts with
  // that optimization, so it is only done for P9 and later.
  if (!Subtarget.hasP9Vector())
    return SDValue();

  if (!IsElementReverse(SVN))
    return SDValue();

  if (LSBase->getOpcode() == ISD::LOAD) {
    // If the loaded value (result 0) has any user that is not a shuffle, the
    // original load has to stay, and the big-endian load would just be a
    // second memory access. Uses of the chain (result 1) do not matter here.
    for (auto UI = LSBase->use_begin(), UE = LSBase->use_end(); UI != UE; ++UI)
      if (UI.getUse().getResNo() == 0 &&
          UI->getOpcode() != ISD::VECTOR_SHUFFLE)
        return SDValue();

    SDLoc dl(SVN);
    SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
        LSBase->getMemoryVT(), LSBase->getMemOperand());
  }

  if (LSBase->getOpcode() == ISD::STORE) {
    // If the shuffle has other users, the swap cannot be removed. Switching
    // the store to the indexed-only big-endian form would then cost an
    // address computation without saving anything.
    if (!SVN->hasOneUse())
      return SDValue();

    SDLoc dl(LSBase);
    SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
                          LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
        LSBase->getMemoryVT(), LSBase->getMemOperand());
  }

  llvm_unreachable("Expected a load or store node here");
}
13174+
1311613175
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
1311713176
DAGCombinerInfo &DCI) const {
1311813177
SelectionDAG &DAG = DCI.DAG;
@@ -13159,6 +13218,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
1315913218
case ISD::SINT_TO_FP:
1316013219
case ISD::UINT_TO_FP:
1316113220
return combineFPToIntToFP(N, DCI);
13221+
case ISD::VECTOR_SHUFFLE:
13222+
if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
13223+
LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
13224+
return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
13225+
}
13226+
break;
1316213227
case ISD::STORE: {
1316313228

1316413229
EVT Op1VT = N->getOperand(1).getValueType();
@@ -13170,6 +13235,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
1317013235
return Val;
1317113236
}
1317213237

13238+
if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
13239+
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
13240+
SDValue Val= combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
13241+
if (Val)
13242+
return Val;
13243+
}
13244+
1317313245
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
1317413246
if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
1317513247
N->getOperand(1).getNode()->hasOneUse() &&

lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,11 @@ namespace llvm {
456456
/// an xxswapd.
457457
LXVD2X,
458458

459+
/// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian.
460+
/// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on
461+
/// the vector type to load vector in big-endian element order.
462+
LOAD_VEC_BE,
463+
459464
/// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
460465
/// v2f32 value into the lower half of a VSR register.
461466
LD_VSX_LH,
@@ -465,6 +470,11 @@ namespace llvm {
465470
/// an xxswapd.
466471
STXVD2X,
467472

473+
/// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian.
474+
/// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on
475+
/// the vector type to store vector in big-endian element order.
476+
STORE_VEC_BE,
477+
468478
/// Store scalar integers from VSR.
469479
ST_VSR_SCAL_INT,
470480

@@ -1167,6 +1177,8 @@ namespace llvm {
11671177
SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
11681178
SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
11691179
SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const;
1180+
SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
1181+
DAGCombinerInfo &DCI) const;
11701182

11711183
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
11721184
/// SETCC with integer subtraction when (1) there is a legal way of doing it

lib/Target/PowerPC/PPCInstrVSX.td

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,21 @@ def SDTVecConv : SDTypeProfile<1, 2, [
7878
def SDTVabsd : SDTypeProfile<1, 3, [
7979
SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>
8080
]>;
81-
81+
// Type profile for the big-endian vector load node: one result and one
// operand. The result must be a vector; the operand must be pointer-typed.
def SDT_PPCld_vec_be : SDTypeProfile<1, 1, [
82+
SDTCisVec<0>, SDTCisPtrTy<1>
83+
]>;
84+
// Type profile for the big-endian vector store node: no results and two
// operands. Operand 0 (the value stored) must be a vector; operand 1 must be
// pointer-typed.
def SDT_PPCst_vec_be : SDTypeProfile<0, 2, [
85+
SDTCisVec<0>, SDTCisPtrTy<1>
86+
]>;
8287

8388
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
8489
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
8590
def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
8691
[SDNPHasChain, SDNPMayStore]>;
92+
// DAG node for PPCISD::LOAD_VEC_BE (vector load in big-endian element order).
// It is chained, may load, and carries a memory operand.
def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be,
93+
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
94+
// DAG node for PPCISD::STORE_VEC_BE (vector store in big-endian element
// order). It is chained and may store.
// NOTE(review): unlike PPCld_vec_be, this node does not list SDNPMemOperand
// (same as PPCstxvd2x above) - confirm this is intended.
def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be,
94+
[SDNPHasChain, SDNPMayStore]>;
8796
def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
8897
def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
8998
def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
@@ -1088,6 +1097,19 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
10881097
(STXVD2X $rS, xoaddr:$dst)>;
10891098
def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
10901099
}
1100+
1101+
// Load/store vector in big-endian element order on little-endian VSX targets.
// Two-element types (v2f64, v2i64) select LXVD2X/STXVD2X; four-element types
// (v4f32, v4i32) select LXVW4X/STXVW4X. All patterns use xoaddr addressing.
1102+
let Predicates = [IsLittleEndian, HasVSX] in {
1103+
def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
1104+
def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
1105+
def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
1106+
def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
1107+
def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
1108+
def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
1109+
def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
1110+
def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
1111+
}
1112+
10911113
let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
10921114
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
10931115
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
@@ -3024,6 +3046,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
30243046
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
30253047
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
30263048
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
3049+
3050+
def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)),
3051+
(COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>;
3052+
def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst),
3053+
(STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
3054+
3055+
def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)),
3056+
(COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>;
3057+
def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst),
3058+
(STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
30273059
} // IsLittleEndian, HasP9Vector
30283060

30293061
let Predicates = [IsBigEndian, HasP9Vector] in {

test/CodeGen/PowerPC/build-vector-tests.ll

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -986,11 +986,7 @@ define <4 x i32> @fromDiffMemConsDi(i32* nocapture readonly %arr) {
986986
;
987987
; P9LE-LABEL: fromDiffMemConsDi:
988988
; P9LE: # %bb.0: # %entry
989-
; P9LE-NEXT: lxv v2, 0(r3)
990-
; P9LE-NEXT: addis r3, r2, .LCPI8_0@toc@ha
991-
; P9LE-NEXT: addi r3, r3, .LCPI8_0@toc@l
992-
; P9LE-NEXT: lxvx v3, 0, r3
993-
; P9LE-NEXT: vperm v2, v2, v2, v3
989+
; P9LE-NEXT: lxvw4x v2, 0, r3
994990
; P9LE-NEXT: blr
995991
;
996992
; P8BE-LABEL: fromDiffMemConsDi:
@@ -2570,11 +2566,7 @@ define <4 x i32> @fromDiffMemConsDui(i32* nocapture readonly %arr) {
25702566
;
25712567
; P9LE-LABEL: fromDiffMemConsDui:
25722568
; P9LE: # %bb.0: # %entry
2573-
; P9LE-NEXT: lxv v2, 0(r3)
2574-
; P9LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha
2575-
; P9LE-NEXT: addi r3, r3, .LCPI41_0@toc@l
2576-
; P9LE-NEXT: lxvx v3, 0, r3
2577-
; P9LE-NEXT: vperm v2, v2, v2, v3
2569+
; P9LE-NEXT: lxvw4x v2, 0, r3
25782570
; P9LE-NEXT: blr
25792571
;
25802572
; P8BE-LABEL: fromDiffMemConsDui:
@@ -4155,8 +4147,8 @@ define <2 x i64> @fromDiffMemConsDll(i64* nocapture readonly %arr) {
41554147
;
41564148
; P9LE-LABEL: fromDiffMemConsDll:
41574149
; P9LE: # %bb.0: # %entry
4158-
; P9LE-NEXT: lxv v2, 16(r3)
4159-
; P9LE-NEXT: xxswapd v2, v2
4150+
; P9LE-NEXT: addi r3, r3, 16
4151+
; P9LE-NEXT: lxvd2x v2, 0, r3
41604152
; P9LE-NEXT: blr
41614153
;
41624154
; P8BE-LABEL: fromDiffMemConsDll:
@@ -4235,9 +4227,8 @@ define <2 x i64> @fromDiffMemVarDll(i64* nocapture readonly %arr, i32 signext %e
42354227
; P9LE: # %bb.0: # %entry
42364228
; P9LE-NEXT: sldi r4, r4, 3
42374229
; P9LE-NEXT: add r3, r3, r4
4238-
; P9LE-NEXT: li r4, -8
4239-
; P9LE-NEXT: lxvx v2, r3, r4
4240-
; P9LE-NEXT: xxswapd v2, v2
4230+
; P9LE-NEXT: addi r3, r3, -8
4231+
; P9LE-NEXT: lxvd2x v2, 0, r3
42414232
; P9LE-NEXT: blr
42424233
;
42434234
; P8BE-LABEL: fromDiffMemVarDll:
@@ -4948,8 +4939,8 @@ define <2 x i64> @fromDiffMemConsDConvdtoll(double* nocapture readonly %ptr) {
49484939
;
49494940
; P9LE-LABEL: fromDiffMemConsDConvdtoll:
49504941
; P9LE: # %bb.0: # %entry
4951-
; P9LE-NEXT: lxv vs0, 16(r3)
4952-
; P9LE-NEXT: xxswapd vs0, vs0
4942+
; P9LE-NEXT: addi r3, r3, 16
4943+
; P9LE-NEXT: lxvd2x vs0, 0, r3
49534944
; P9LE-NEXT: xvcvdpsxds v2, vs0
49544945
; P9LE-NEXT: blr
49554946
;
@@ -5040,9 +5031,8 @@ define <2 x i64> @fromDiffMemVarDConvdtoll(double* nocapture readonly %arr, i32
50405031
; P9LE: # %bb.0: # %entry
50415032
; P9LE-NEXT: sldi r4, r4, 3
50425033
; P9LE-NEXT: add r3, r3, r4
5043-
; P9LE-NEXT: li r4, -8
5044-
; P9LE-NEXT: lxvx vs0, r3, r4
5045-
; P9LE-NEXT: xxswapd vs0, vs0
5034+
; P9LE-NEXT: addi r3, r3, -8
5035+
; P9LE-NEXT: lxvd2x vs0, 0, r3
50465036
; P9LE-NEXT: xvcvdpsxds v2, vs0
50475037
; P9LE-NEXT: blr
50485038
;
@@ -5402,8 +5392,8 @@ define <2 x i64> @fromDiffMemConsDull(i64* nocapture readonly %arr) {
54025392
;
54035393
; P9LE-LABEL: fromDiffMemConsDull:
54045394
; P9LE: # %bb.0: # %entry
5405-
; P9LE-NEXT: lxv v2, 16(r3)
5406-
; P9LE-NEXT: xxswapd v2, v2
5395+
; P9LE-NEXT: addi r3, r3, 16
5396+
; P9LE-NEXT: lxvd2x v2, 0, r3
54075397
; P9LE-NEXT: blr
54085398
;
54095399
; P8BE-LABEL: fromDiffMemConsDull:
@@ -5482,9 +5472,8 @@ define <2 x i64> @fromDiffMemVarDull(i64* nocapture readonly %arr, i32 signext %
54825472
; P9LE: # %bb.0: # %entry
54835473
; P9LE-NEXT: sldi r4, r4, 3
54845474
; P9LE-NEXT: add r3, r3, r4
5485-
; P9LE-NEXT: li r4, -8
5486-
; P9LE-NEXT: lxvx v2, r3, r4
5487-
; P9LE-NEXT: xxswapd v2, v2
5475+
; P9LE-NEXT: addi r3, r3, -8
5476+
; P9LE-NEXT: lxvd2x v2, 0, r3
54885477
; P9LE-NEXT: blr
54895478
;
54905479
; P8BE-LABEL: fromDiffMemVarDull:
@@ -6195,8 +6184,8 @@ define <2 x i64> @fromDiffMemConsDConvdtoull(double* nocapture readonly %ptr) {
61956184
;
61966185
; P9LE-LABEL: fromDiffMemConsDConvdtoull:
61976186
; P9LE: # %bb.0: # %entry
6198-
; P9LE-NEXT: lxv vs0, 16(r3)
6199-
; P9LE-NEXT: xxswapd vs0, vs0
6187+
; P9LE-NEXT: addi r3, r3, 16
6188+
; P9LE-NEXT: lxvd2x vs0, 0, r3
62006189
; P9LE-NEXT: xvcvdpuxds v2, vs0
62016190
; P9LE-NEXT: blr
62026191
;
@@ -6287,9 +6276,8 @@ define <2 x i64> @fromDiffMemVarDConvdtoull(double* nocapture readonly %arr, i32
62876276
; P9LE: # %bb.0: # %entry
62886277
; P9LE-NEXT: sldi r4, r4, 3
62896278
; P9LE-NEXT: add r3, r3, r4
6290-
; P9LE-NEXT: li r4, -8
6291-
; P9LE-NEXT: lxvx vs0, r3, r4
6292-
; P9LE-NEXT: xxswapd vs0, vs0
6279+
; P9LE-NEXT: addi r3, r3, -8
6280+
; P9LE-NEXT: lxvd2x vs0, 0, r3
62936281
; P9LE-NEXT: xvcvdpuxds v2, vs0
62946282
; P9LE-NEXT: blr
62956283
;

0 commit comments

Comments
 (0)