Skip to content

Commit 29724e2

Browse files
committed
[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1.
This extends the existing support for shufflevector to handle cases like <2 x float>, which we can implement by concatenating the vectors and using a TBL1. Differential Revision: https://reviews.llvm.org/D58684 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355104 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 0ad049b commit 29724e2

File tree

2 files changed

+165
-51
lines changed

2 files changed

+165
-51
lines changed

lib/Target/AArch64/AArch64InstructionSelector.cpp

Lines changed: 118 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ class AArch64InstructionSelector : public InstructionSelector {
8282
unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
8383
MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
8484
MachineIRBuilder &MIRBuilder) const;
85+
MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
86+
MachineIRBuilder &MIRBuilder) const;
8587

8688
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
8789

@@ -1965,6 +1967,98 @@ MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
19651967
return &*Load;
19661968
}
19671969

1970+
/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
1971+
/// size and RB.
1972+
static std::pair<unsigned, unsigned>
1973+
getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
1974+
unsigned Opc, SubregIdx;
1975+
if (RB.getID() == AArch64::GPRRegBankID) {
1976+
if (EltSize == 32) {
1977+
Opc = AArch64::INSvi32gpr;
1978+
SubregIdx = AArch64::ssub;
1979+
} else if (EltSize == 64) {
1980+
Opc = AArch64::INSvi64gpr;
1981+
SubregIdx = AArch64::dsub;
1982+
} else {
1983+
llvm_unreachable("invalid elt size!");
1984+
}
1985+
} else {
1986+
if (EltSize == 8) {
1987+
Opc = AArch64::INSvi8lane;
1988+
SubregIdx = AArch64::bsub;
1989+
} else if (EltSize == 16) {
1990+
Opc = AArch64::INSvi16lane;
1991+
SubregIdx = AArch64::hsub;
1992+
} else if (EltSize == 32) {
1993+
Opc = AArch64::INSvi32lane;
1994+
SubregIdx = AArch64::ssub;
1995+
} else if (EltSize == 64) {
1996+
Opc = AArch64::INSvi64lane;
1997+
SubregIdx = AArch64::dsub;
1998+
} else {
1999+
llvm_unreachable("invalid elt size!");
2000+
}
2001+
}
2002+
return std::make_pair(Opc, SubregIdx);
2003+
}
2004+
2005+
/// Emit the instructions that concatenate two 64 bit vectors into one 128 bit
/// value, with \p Op1 in the low half and \p Op2 in the high half.
///
/// \param Op1        register holding the lower vector.
/// \param Op2        register holding the upper vector; must have the same
///                   LLT as \p Op1.
/// \param MIRBuilder builder used to emit the new instructions.
/// \returns the final insert instruction, or nullptr when the operand types
/// differ, the operands are not exactly 64 bits wide, or widening either
/// operand fails.
MachineInstr *AArch64InstructionSelector::emitVectorConcat(
    unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
  // Strategy:
  //  1. Widen each source with scalar_to_vector so that it sits in the low
  //     element of a register of the destination size.
  //  2. Insert the widened upper source into element 1 of the widened lower
  //     source.
  // TODO: some of this code is common with G_BUILD_VECTOR handling.
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();

  const LLT SrcTy = MRI.getType(Op1);
  const LLT OtherTy = MRI.getType(Op2);
  if (SrcTy != OtherTy) {
    LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
    return nullptr;
  }
  assert(SrcTy.isVector() && "Expected a vector for vector concat");

  const unsigned SrcBits = SrcTy.getSizeInBits();
  if (SrcBits >= 128) {
    LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
    return nullptr;
  }

  // At the moment we just support 64 bit vector concats.
  if (SrcBits != 64) {
    LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
    return nullptr;
  }

  // Each source is treated as one scalar-sized element of a two element
  // destination vector.
  const LLT EltTy = LLT::scalar(SrcBits);
  const LLT WideTy = LLT::vector(2, EltTy);
  // NOTE(review): presumably always the FPR bank for vector operands; the
  // insert operands built below use the lane form — confirm.
  const RegisterBank &SrcBank = *RBI.getRegBank(Op1, MRI, TRI);
  const TargetRegisterClass *WideRC = getMinClassForRegBank(SrcBank, SrcBits * 2);

  // Both widenings are emitted before either result is inspected.
  MachineInstr *LowWide = emitScalarToVector(WideTy, WideRC, Op1, MIRBuilder);
  MachineInstr *HighWide = emitScalarToVector(WideTy, WideRC, Op2, MIRBuilder);
  if (!(LowWide && HighWide)) {
    LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
    return nullptr;
  }

  // Now do the insert of the upper element. Only the opcode half of the
  // returned pair is needed here.
  const std::pair<unsigned, unsigned> InsInfo =
      getInsertVecEltOpInfo(SrcBank, EltTy.getSizeInBits());
  const unsigned InsertOpc = InsInfo.first;

  auto InsElt =
      MIRBuilder
          .buildInstr(InsertOpc, {WideRC}, {LowWide->getOperand(0).getReg()})
          .addImm(1) /* Lane index */
          .addUse(HighWide->getOperand(0).getReg())
          .addImm(0);

  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
  return &*InsElt;
}
2061+
19682062
bool AArch64InstructionSelector::selectShuffleVector(
19692063
MachineInstr &I, MachineRegisterInfo &MRI) const {
19702064
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -2002,21 +2096,37 @@ bool AArch64InstructionSelector::selectShuffleVector(
20022096
}
20032097
}
20042098

2005-
if (DstTy.getSizeInBits() != 128) {
2006-
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
2007-
// This case can be done with TBL1.
2008-
return false;
2009-
}
2099+
MachineIRBuilder MIRBuilder(I);
20102100

20112101
// Use a constant pool to load the index vector for TBL.
20122102
Constant *CPVal = ConstantVector::get(CstIdxs);
2013-
MachineIRBuilder MIRBuilder(I);
20142103
MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
20152104
if (!IndexLoad) {
20162105
LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
20172106
return false;
20182107
}
20192108

2109+
if (DstTy.getSizeInBits() != 128) {
2110+
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
2111+
// This case can be done with TBL1.
2112+
MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
2113+
if (!Concat) {
2114+
LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
2115+
return false;
2116+
}
2117+
auto TBL1 = MIRBuilder.buildInstr(
2118+
AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
2119+
{Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
2120+
constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
2121+
2122+
auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
2123+
TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
2124+
.addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
2125+
RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
2126+
I.eraseFromParent();
2127+
return true;
2128+
}
2129+
20202130
// For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
20212131
// Q registers for regalloc.
20222132
auto RegSeq = MIRBuilder
@@ -2048,26 +2158,8 @@ bool AArch64InstructionSelector::selectBuildVector(
20482158
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
20492159
unsigned Opc;
20502160
unsigned SubregIdx;
2051-
if (RB.getID() == AArch64::GPRRegBankID) {
2052-
if (EltSize == 32) {
2053-
Opc = AArch64::INSvi32gpr;
2054-
SubregIdx = AArch64::ssub;
2055-
} else {
2056-
Opc = AArch64::INSvi64gpr;
2057-
SubregIdx = AArch64::dsub;
2058-
}
2059-
} else {
2060-
if (EltSize == 16) {
2061-
Opc = AArch64::INSvi16lane;
2062-
SubregIdx = AArch64::hsub;
2063-
} else if (EltSize == 32) {
2064-
Opc = AArch64::INSvi32lane;
2065-
SubregIdx = AArch64::ssub;
2066-
} else {
2067-
Opc = AArch64::INSvi64lane;
2068-
SubregIdx = AArch64::dsub;
2069-
}
2070-
}
2161+
2162+
std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);
20712163

20722164
MachineIRBuilder MIRBuilder(I);
20732165

test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir

Lines changed: 47 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# WARNING: update_mir_test_checks.py does not include the constant pools output,
3+
# so this test requires manual fixing up after running the script.
4+
25
# RUN: llc -mtriple=aarch64-- -O0 -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
36
--- |
4-
; ModuleID = 'shufflevec-only-legal.ll'
5-
source_filename = "shufflevec-only-legal.ll"
67
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
78
target triple = "aarch64"
89

10+
define <2 x float> @shuffle_v2f32(<2 x float> %a, <2 x float> %b) {
11+
%shuf = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 0>
12+
ret <2 x float> %shuf
13+
}
14+
915
define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
1016
%shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
1117
ret <4 x i32> %shuf
@@ -21,21 +27,52 @@
2127
ret <2 x i64> %shuf
2228
}
2329

30+
...
31+
---
32+
name: shuffle_v2f32
33+
alignment: 2
34+
legalized: true
35+
regBankSelected: true
36+
tracksRegLiveness: true
37+
body: |
38+
bb.1 (%ir-block.0):
39+
liveins: $d0, $d1
40+
41+
; CHECK-LABEL: name: shuffle_v2f32
42+
; CHECK: constants:
43+
; CHECK: - id: 0
44+
; CHECK: value: '<8 x i8> <i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>'
45+
; CHECK: alignment: 8
46+
; CHECK: liveins: $d0, $d1
47+
; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
48+
; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
49+
; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
50+
; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
51+
; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
52+
; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
53+
; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
54+
; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
55+
; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
56+
; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[LDRQui]]
57+
; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
58+
; CHECK: $d0 = COPY [[COPY2]]
59+
; CHECK: RET_ReallyLR implicit $d0
60+
%0:fpr(<2 x s32>) = COPY $d0
61+
%1:fpr(<2 x s32>) = COPY $d1
62+
%4:gpr(s32) = G_CONSTANT i32 1
63+
%5:gpr(s32) = G_CONSTANT i32 0
64+
%3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32)
65+
%2:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1, %3(<2 x s32>)
66+
$d0 = COPY %2(<2 x s32>)
67+
RET_ReallyLR implicit $d0
68+
2469
...
2570
---
2671
name: shuffle_v4i32
2772
alignment: 2
2873
legalized: true
2974
regBankSelected: true
3075
tracksRegLiveness: true
31-
registers:
32-
- { id: 0, class: fpr }
33-
- { id: 1, class: fpr }
34-
- { id: 2, class: fpr }
35-
- { id: 3, class: fpr }
36-
- { id: 4, class: gpr }
37-
- { id: 5, class: gpr }
38-
- { id: 6, class: gpr }
3976
body: |
4077
bb.1 (%ir-block.0):
4178
liveins: $q0, $q1
@@ -71,15 +108,6 @@ alignment: 2
71108
legalized: true
72109
regBankSelected: true
73110
tracksRegLiveness: true
74-
registers:
75-
- { id: 0, class: fpr }
76-
- { id: 1, class: fpr }
77-
- { id: 2, class: fpr }
78-
- { id: 3, class: fpr }
79-
- { id: 4, class: gpr }
80-
- { id: 5, class: gpr }
81-
- { id: 6, class: gpr }
82-
- { id: 7, class: gpr }
83111
body: |
84112
bb.1 (%ir-block.0):
85113
liveins: $q0, $q1
@@ -116,12 +144,6 @@ alignment: 2
116144
legalized: true
117145
regBankSelected: true
118146
tracksRegLiveness: true
119-
registers:
120-
- { id: 0, class: fpr }
121-
- { id: 1, class: fpr }
122-
- { id: 2, class: fpr }
123-
- { id: 3, class: fpr }
124-
- { id: 4, class: gpr }
125147
body: |
126148
bb.1 (%ir-block.0):
127149
liveins: $q0, $q1

0 commit comments

Comments
 (0)