
Commit 8da71ff

[NVPTX] Select atomic loads and stores
According to the PTX ISA, .volatile has the same memory synchronization semantics as .relaxed.sys, so it can be used to implement monotonic atomic loads and stores. This is important for OpenMP's atomic construct, where
- 'read's and 'write's are lowered to atomic loads and stores, and
- an update of float or double types is lowered into a cmpxchg loop.
(Note that PTX could do better because it has atom.add.f{32,64}, but LLVM's atomicrmw instruction only allows integer types.)

Higher levels of atomicity (like acquire and release) need additional synchronization properties, which were added with PTX ISA 6.0 / sm_70. So using these stronger orderings still results in an error.

Differential Revision: https://reviews.llvm.org/D50391

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@339316 91177308-0d34-0410-b5e6-96231b3b80d8
Parent: 8e0b205
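For context, here is a minimal hand-written LLVM IR sketch of the second bullet in the message: an atomic update of a double expressed as a monotonic cmpxchg loop over its i64 bit pattern, since atomicrmw cannot take floating-point types here. The function and value names are made up for illustration and the IR is not the output of any particular frontend; the part relevant to this patch is the monotonic atomic load, which tryLoad can now select as ld.volatile (the cmpxchg itself is selected elsewhere).

; Illustrative sketch only: an OpenMP-style atomic update of a double
; written as a monotonic cmpxchg loop over the i64 bit pattern.
; Names (@update_double, %p, %inc) are hypothetical.
define void @update_double(double* %p, double %inc) {
entry:
  %p.i = bitcast double* %p to i64*
  ; A monotonic atomic load like this is what this patch selects as ld.volatile.
  %old.init = load atomic i64, i64* %p.i monotonic, align 8
  br label %loop

loop:
  %old = phi i64 [ %old.init, %entry ], [ %cur, %loop ]
  %old.f = bitcast i64 %old to double
  %new.f = fadd double %old.f, %inc
  %new = bitcast double %new.f to i64
  %pair = cmpxchg i64* %p.i, i64 %old, i64 %new monotonic monotonic
  %cur = extractvalue { i64, i1 } %pair, 0
  %ok = extractvalue { i64, i1 } %pair, 1
  br i1 %ok, label %done, label %loop

done:
  ret void
}

The @monotonic function in the new test below exercises exactly the load/store half of this pattern.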

File tree

2 files changed: +170 lines, -34 lines

lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 82 additions & 34 deletions
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -81,10 +82,12 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
 
   switch (N->getOpcode()) {
   case ISD::LOAD:
+  case ISD::ATOMIC_LOAD:
     if (tryLoad(N))
       return;
     break;
   case ISD::STORE:
+  case ISD::ATOMIC_STORE:
     if (tryStore(N))
       return;
     break;
@@ -834,17 +837,27 @@ static Optional<unsigned> pickOpcodeForVT(
 
 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   SDLoc dl(N);
-  LoadSDNode *LD = cast<LoadSDNode>(N);
+  MemSDNode *LD = cast<MemSDNode>(N);
+  assert(LD->readMem() && "Expected load");
+  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
   EVT LoadedVT = LD->getMemoryVT();
   SDNode *NVPTXLD = nullptr;
 
   // do not support pre/post inc/dec
-  if (LD->isIndexed())
+  if (PlainLoad && PlainLoad->isIndexed())
     return false;
 
   if (!LoadedVT.isSimple())
     return false;
 
+  AtomicOrdering Ordering = LD->getOrdering();
+  // In order to lower atomic loads with stronger guarantees we would need to
+  // use load.acquire or insert fences. However these features were only added
+  // with PTX ISA 6.0 / sm_70.
+  // TODO: Check if we can actually use the new instructions and implement them.
+  if (isStrongerThanMonotonic(Ordering))
+    return false;
+
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
@@ -855,8 +868,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
       CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
 
   // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool isVolatile = LD->isVolatile();
+  // - .volatile is only available for .global and .shared
+  // - .volatile has the same memory synchronization semantics as .relaxed.sys
+  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
@@ -882,7 +896,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
     fromTypeWidth = 32;
   }
 
-  if ((LD->getExtensionType() == ISD::SEXTLOAD))
+  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
     fromType = NVPTX::PTXLdStInstCode::Signed;
   else if (ScalarVT.isFloatingPoint())
     // f16 uses .b16 as its storage type.
@@ -1691,25 +1705,38 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
 
 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   SDLoc dl(N);
-  StoreSDNode *ST = cast<StoreSDNode>(N);
+  MemSDNode *ST = cast<MemSDNode>(N);
+  assert(ST->writeMem() && "Expected store");
+  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
+  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+  assert((PlainStore || AtomicStore) && "Expected store");
   EVT StoreVT = ST->getMemoryVT();
   SDNode *NVPTXST = nullptr;
 
   // do not support pre/post inc/dec
-  if (ST->isIndexed())
+  if (PlainStore && PlainStore->isIndexed())
     return false;
 
   if (!StoreVT.isSimple())
     return false;
 
+  AtomicOrdering Ordering = ST->getOrdering();
+  // In order to lower atomic loads with stronger guarantees we would need to
+  // use store.release or insert fences. However these features were only added
+  // with PTX ISA 6.0 / sm_70.
+  // TODO: Check if we can actually use the new instructions and implement them.
+  if (isStrongerThanMonotonic(Ordering))
+    return false;
+
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
 
   // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool isVolatile = ST->isVolatile();
+  // - .volatile is only available for .global and .shared
+  // - .volatile has the same memory synchronization semantics as .relaxed.sys
+  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
@@ -1739,41 +1766,53 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
     toType = NVPTX::PTXLdStInstCode::Unsigned;
 
   // Create the machine instruction DAG
-  SDValue Chain = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDValue N2 = N->getOperand(2);
+  SDValue Chain = ST->getChain();
+  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
+  SDValue BasePtr = ST->getBasePtr();
   SDValue Addr;
   SDValue Offset, Base;
   Optional<unsigned> Opcode;
-  MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
+  MVT::SimpleValueType SourceVT =
+      Value.getNode()->getSimpleValueType(0).SimpleTy;
 
-  if (SelectDirectAddr(N2, Addr)) {
+  if (SelectDirectAddr(BasePtr, Addr)) {
     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                              NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                              NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                              NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
-                      Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     Addr,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
-  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
-                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+  } else if (PointerSize == 64
+                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
+                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                              NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                              NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                              NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
-                      Offset, Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
-  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
-                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+  } else if (PointerSize == 64
+                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
+                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
     if (PointerSize == 64)
       Opcode = pickOpcodeForVT(
           SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
@@ -1787,10 +1826,15 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
     if (!Opcode)
       return false;
 
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
-                      Offset, Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   } else {
     if (PointerSize == 64)
@@ -1806,10 +1850,14 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
                              NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
-                      Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     BasePtr,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   }
 
test/CodeGen/NVPTX/load-store.ll

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; CHECK-LABEL: plain
+define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+  ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, i8* %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, i8* %a
+
+  ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, i16* %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, i16* %b
+
+  ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, i32* %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, i32* %c
+
+  ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, i64* %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, i64* %d
+
+  ret void
+}
+
+; CHECK-LABEL: volatile
+define void @volatile(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, i8* %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, i8* %a
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, i16* %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, i16* %b
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, i32* %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, i32* %c
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, i64* %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, i64* %d
+
+  ret void
+}
+
+; CHECK-LABEL: monotonic
+define void @monotonic(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, i8* %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, i8* %a monotonic, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, i16* %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, i16* %b monotonic, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, i32* %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, i32* %c monotonic, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, i64* %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, i64* %d monotonic, align 8
+
+  ret void
+}
