forked from microsoft/CNTK
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGPUDataTransferer.cpp
228 lines (192 loc) · 7.32 KB
/
GPUDataTransferer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#include "stdafx.h"
#include "Basics.h"
#include "GPUDataTransferer.h"
#include "GPUMatrix.h"
#pragma comment(lib, "cudart.lib")
#pragma warning(disable : 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
namespace Microsoft { namespace MSR { namespace CNTK {
// Report a fatal CUDA error.
// By default this throws a RuntimeError; optionally (dieoncudafailure) it instead
// logs the message and terminates the process outright. The hard termination path
// exists for our GCD farm, where intermittent CUDA errors could make the DBN tool,
// when running with MPI, hang instead of terminating.
static void cudafail(const char* msg)
{
    // TODO: get from an env variable
    const bool dieoncudafailure = false;
    if (!dieoncudafailure)
        RuntimeError("%s", msg);

    // die-hard path: log, flush, and terminate without unwinding
    fprintf(stderr, "%s\n", msg);
    fprintf(stderr, "cudafail: terminating\n");
    fflush(stderr);
#ifdef WIN32
    TerminateProcess(GetCurrentProcess(), EXIT_FAILURE); // fail the hard way to ensure it won't hang elsewhere
#else
    exit(1);
#endif
}
// allows to write cudaFunction() || "error" (CUDA runtime):
// any non-success return code is turned into a message (with the CUDA error
// string and numeric code appended) and routed through cudafail().
static
#ifdef WIN32
__declspec(noinline)
#endif
void
operator||(cudaError_t rc, const char* msg)
{
    if (rc == cudaSuccess)
        return; // success: nothing to report

    char buf[1000];
    sprintf_s(buf, 1000, "%s: %s (cuda error %d)", msg, cudaGetErrorString(rc), rc);
    cudafail(buf);
}
//// Base class for different data transferers.

// Binds the transferer to a device and a pair of copy streams, and creates the
// three CUDA events used to track copy completion and compute-stream sync points.
// 'blocking' additionally sets cudaEventBlockingSync on the two copy-complete
// events so that host-side waits yield rather than spin.
GranularGPUDataTransferer::GranularGPUDataTransferer(int deviceId, const cudaStream_t& fetchStream, const cudaStream_t& assignStream, bool blocking)
    : m_fetchStream(fetchStream),
      m_assignStream(assignStream),
      m_deviceId(deviceId),
      m_fetchCompleteEvent(nullptr),
      m_assignCompleteEvent(nullptr),
      m_syncEvent(nullptr)
{
    PrepareDevice(m_deviceId);

    // Note: Do NOT use cudaEventBlockingSync (which supposedly yields the process)--it will totally break cudaEventSynchronize(), causing it to take 50 or 100 ms randomly.
    // NOTE: We never saw this in reading prefetch.
    const unsigned flags = blocking ? (cudaEventDisableTiming | cudaEventBlockingSync)
                                    : cudaEventDisableTiming;

    // events
    cudaEventCreateWithFlags(&m_fetchCompleteEvent, flags) || "cudaEventCreateWithFlags failed";
    cudaEventCreateWithFlags(&m_assignCompleteEvent, flags) || "cudaEventCreateWithFlags failed";
    cudaEventCreateWithFlags(&m_syncEvent, cudaEventDisableTiming) || "cudaEventCreateWithFlags failed";
}
// Destroys the three CUDA events created in the constructor.
// Error codes are deliberately ignored (destructors must not throw).
// NOTE(review): unlike the other members, this does not call PrepareDevice() first --
// presumably event destruction does not require the owning device to be current; confirm.
GranularGPUDataTransferer::~GranularGPUDataTransferer()
{
    // TODO: Check for error code and throw if !std::uncaught_exception()
    cudaEventDestroy(m_assignCompleteEvent);
    cudaEventDestroy(m_fetchCompleteEvent);
    cudaEventDestroy(m_syncEvent);
}
// Asynchronously copies numElements * elementSize bytes from gpuBuffer (device)
// to cpuBuffer (host) on the fetch stream. Completion is tracked separately via
// RecordGPUToCPUCopy() / WaitForCopyGPUToCPU(); failures die via operator||.
void GranularGPUDataTransferer::CopyGPUToCPUAsync(const void* gpuBuffer, size_t numElements, size_t elementSize, void* cpuBuffer)
{
    PrepareDevice(m_deviceId);
    cudaMemcpyAsync(cpuBuffer, gpuBuffer, numElements * elementSize, cudaMemcpyDeviceToHost, GetFetchStream()) || "cudaMemcpyAsync failed";
}
// Records the fetch-complete event on the fetch stream, marking the point that
// WaitForCopyGPUToCPU() will later synchronize on.
void GranularGPUDataTransferer::RecordGPUToCPUCopy()
{
    cudaEventRecord(m_fetchCompleteEvent, GetFetchStream()) || "cudaEventRecord failed";
}
// Blocks the host until the most recently recorded GPU-to-CPU copy
// (see RecordGPUToCPUCopy) has completed on the fetch stream.
void GranularGPUDataTransferer::WaitForCopyGPUToCPU()
{
    PrepareDevice(m_deviceId);
    cudaEventSynchronize(m_fetchCompleteEvent) || "cudaEventSynchronize failed";
}
// Asynchronously copies numElements * elementSize bytes from cpuBuffer (host)
// to gpuBuffer (device) on the assign stream. Completion is tracked separately
// via RecordCPUToGPUCopy() / WaitForCopyCPUToGPU().
void GranularGPUDataTransferer::CopyCPUToGPUAsync(const void* cpuBuffer, size_t numElements, size_t elementSize, void* gpuBuffer)
{
    PrepareDevice(m_deviceId);
    cudaMemcpyAsync(gpuBuffer, cpuBuffer, numElements * elementSize, cudaMemcpyHostToDevice, GetAssignStream()) || "cudaMemcpyAsync failed";
}
// Records the assign-complete event on the assign stream, marking the point that
// WaitForCopyCPUToGPU() will later synchronize on.
void GranularGPUDataTransferer::RecordCPUToGPUCopy()
{
    cudaEventRecord(m_assignCompleteEvent, GetAssignStream()) || "cudaEventRecord failed";
}
// Blocks the host until the most recently recorded CPU-to-GPU copy
// (see RecordCPUToGPUCopy) has completed on the assign stream.
void GranularGPUDataTransferer::WaitForCopyCPUToGPU()
{
    PrepareDevice(m_deviceId);
    cudaEventSynchronize(m_assignCompleteEvent) || "cudaEventSynchronize failed";
}
// Records m_syncEvent on the main compute stream (GetStream()) so that the copy
// streams can later be ordered after all compute work issued so far
// (see WaitForSyncPointOnFetchStreamAsync / WaitForSyncPointOnAssignStreamAsync).
void GranularGPUDataTransferer::RecordComputeStreamSyncPoint()
{
    PrepareDevice(m_deviceId);
    cudaEventRecord(m_syncEvent, GetStream()) || "cudaEventRecord failed"; // fixed typo: was "cudeEventRecord failed"
}
// Makes the fetch stream wait (device-side, host does not block) for the sync
// point recorded by RecordComputeStreamSyncPoint().
void GranularGPUDataTransferer::WaitForSyncPointOnFetchStreamAsync()
{
    PrepareDevice(m_deviceId);
    cudaStreamWaitEvent(GetFetchStream(), m_syncEvent, 0 /*flags 'must be 0'*/) || "cudaStreamWaitEvent failed";
}
// Makes the assign stream wait (device-side, host does not block) for the sync
// point recorded by RecordComputeStreamSyncPoint().
void GranularGPUDataTransferer::WaitForSyncPointOnAssignStreamAsync()
{
    PrepareDevice(m_deviceId);
    cudaStreamWaitEvent(GetAssignStream(), m_syncEvent, 0 /*flags 'must be 0'*/) || "cudaStreamWaitEvent failed";
}
//// GPUDataTransferer

// Waits for the given event to complete, skipping the (potentially expensive)
// cudaEventSynchronize() when a quick query shows the event has already fired.
void GPUDataTransferer::SyncEvent(cudaEvent_t ev)
{
    const auto rc = cudaEventQuery(ev);
    if (rc == cudaErrorNotReady)
    {
        // event not yet complete: we must wait
        cudaEventSynchronize(ev) || "cudaEventSynchronize failed";
        return;
    }
    // event is ready (or the query itself failed) -- no wait needed
    rc || "cudaEventQuery failed";
}
// Streams shared by all GPUDataTransferer instances; created lazily in the
// constructor when concurrent streams are requested, never destroyed (see dtor BUGBUG).
cudaStream_t GPUDataTransferer::s_fetchStream = nullptr;  // nullptr for consistency with the rest of the file (was NULL)
cudaStream_t GPUDataTransferer::s_assignStream = nullptr;
// Returns the shared fetch stream (may be null if concurrent streams were never enabled).
cudaStream_t GPUDataTransferer::GetFetchStream()
{
    return s_fetchStream;
}
// Creates a transferer for the given device. When useConcurrentStreams is set, the
// two shared non-blocking streams are created lazily on first use; otherwise the
// inner transferer gets null streams (i.e. the default CUDA stream).
// NOTE(review): the lazy init of the static streams is unguarded -- presumably all
// transferers are constructed from a single thread; confirm with callers.
GPUDataTransferer::GPUDataTransferer(int deviceId, bool useConcurrentStreams)
{
#pragma warning(disable : 4127)
    if (useConcurrentStreams && (s_fetchStream == nullptr)) // nullptr for consistency with the rest of the file (was NULL)
    {
        cudaStreamCreateWithFlags(&s_fetchStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
        cudaStreamCreateWithFlags(&s_assignStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
    }
    m_inner = make_unique<GranularGPUDataTransferer>(deviceId, s_fetchStream, s_assignStream);
}
// Intentionally empty: the streams are static and shared across instances, so they
// cannot be destroyed per-object.
GPUDataTransferer::~GPUDataTransferer()
{
    // BUGBUG: we don't destroy our streams (they are static variables); we need a static destructor, I am too lazy now
}
// Starts an async GPU-to-CPU copy of totalSize bytes (passed to the inner
// transferer as one element of totalSize bytes) and immediately records the
// fetch-complete event; wait via WaitForCopyGPUToCPUAsync().
void GPUDataTransferer::CopyGPUToCPUAsync(void* gpuBuffer, size_t totalSize, void* cpuBuffer)
{
    m_inner->CopyGPUToCPUAsync(gpuBuffer, 1, totalSize, cpuBuffer);
    m_inner->RecordGPUToCPUCopy();
}
// Starts an async CPU-to-GPU copy of totalSize bytes (passed to the inner
// transferer as one element of totalSize bytes) and immediately records the
// assign-complete event; wait via WaitForCopyCPUToGPUAsync().
void GPUDataTransferer::CopyCPUToGPUAsync(void* cpuBuffer, size_t totalSize, void* gpuBuffer)
{
    m_inner->CopyCPUToGPUAsync(cpuBuffer, 1, totalSize, gpuBuffer);
    m_inner->RecordCPUToGPUCopy();
}
// Waits for the last recorded GPU-to-CPU copy, using SyncEvent() to avoid a full
// cudaEventSynchronize() when the event has already completed.
void GPUDataTransferer::WaitForCopyGPUToCPUAsync()
{
    PrepareDevice(m_inner->m_deviceId);
    SyncEvent(m_inner->m_fetchCompleteEvent);
}
// Waits for the last recorded CPU-to-GPU copy, using SyncEvent() to avoid a full
// cudaEventSynchronize() when the event has already completed.
void GPUDataTransferer::WaitForCopyCPUToGPUAsync()
{
    PrepareDevice(m_inner->m_deviceId);
    SyncEvent(m_inner->m_assignCompleteEvent);
}
/// PrefetchGPUDataTransferer

// Constructs with blocking events (base ctor's last arg = true) and null shared
// streams, then creates its own private non-blocking stream m_stream.
// NOTE(review): presumably the Get*Stream() accessors are overridden in the header
// to return m_stream instead of the null streams passed to the base -- confirm.
PrefetchGPUDataTransferer::PrefetchGPUDataTransferer(int deviceId) : GranularGPUDataTransferer(deviceId, nullptr, nullptr, true)
{
    cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed (PrefetchGPUDataTransferer ctor)";
}
// Destroys the private prefetch stream. If switching to the owning device fails,
// the error was already logged by PrepareDevice and the destroy is skipped.
// Failures from cudaStreamDestroy are only logged -- destructors must not throw.
PrefetchGPUDataTransferer::~PrefetchGPUDataTransferer()
{
    try
    {
        PrepareDevice(m_deviceId);
    }
    catch (...)
    {
        // the error is already logged
        return;
    }

    auto code = cudaStreamDestroy(m_stream);
    if (code != cudaSuccess)
    {
        std::cerr << "cudaStreamDestroy failed (PrefetchGPUDataTransferer dtor): "
                  << cudaGetErrorString(code) << " (cuda error " << code << ")" << std::endl;
    }
}
}}}