Skip to content

Commit

Permalink
Merge pull request torch#610 from colesbury/lazy
Browse files Browse the repository at this point in the history
Lazily initialize CUDA devices
  • Loading branch information
soumith authored Nov 23, 2016
2 parents f593224 + 39a13d0 commit f46ca39
Show file tree
Hide file tree
Showing 7 changed files with 246 additions and 255 deletions.
10 changes: 5 additions & 5 deletions init.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,35 +776,35 @@ static int cutorch_getDeviceProperties(lua_State *L)

/* Lua binding: reseeds the RNG on the current device and returns the
   new seed. Uses unsigned long long to match the THCRandom_* API. */
static int cutorch_seed(lua_State *L)
{
  unsigned long long seed = THCRandom_seed(cutorch_getstate(L));
  lua_pushnumber(L, seed);
  return 1;
}

/* Lua binding: reseeds the RNGs on all devices and returns the new seed.
   Uses unsigned long long to match the THCRandom_* API. */
static int cutorch_seedAll(lua_State *L)
{
  unsigned long long seed = THCRandom_seedAll(cutorch_getstate(L));
  lua_pushnumber(L, seed);
  return 1;
}

/* Lua binding: returns the seed the current device's RNG was initialized
   with. Uses unsigned long long to match the THCRandom_* API. */
static int cutorch_initialSeed(lua_State *L)
{
  unsigned long long seed = THCRandom_initialSeed(cutorch_getstate(L));
  lua_pushnumber(L, seed);
  return 1;
}

/* Lua binding: seeds the current device's RNG with the number at stack
   index 1. NOTE(review): luaL_checknumber returns a double, so seeds
   above 2^53 lose precision on the way through Lua — inherent to the
   lua_Number-based API. */
static int cutorch_manualSeed(lua_State *L)
{
  unsigned long long seed = luaL_checknumber(L, 1);
  THCRandom_manualSeed(cutorch_getstate(L), seed);
  return 0;
}

/* Lua binding: seeds the RNGs on all devices with the number at stack
   index 1 (same precision caveat as cutorch_manualSeed). */
static int cutorch_manualSeedAll(lua_State* L)
{
  unsigned long long seed = luaL_checknumber(L, 1);
  THCRandom_manualSeedAll(cutorch_getstate(L), seed);
  return 0;
}
Expand Down
5 changes: 3 additions & 2 deletions lib/THC/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.7" OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "4.7" )
# add c++11 flag
set_source_files_properties(THCCachingAllocator.cpp PROPERTIES COMPILE_FLAGS -std=c++11)
set_source_files_properties(THCTensorRandom.cpp THCCachingAllocator.cpp PROPERTIES COMPILE_FLAGS -std=c++11)
else()
# add c++0x flag
set_source_files_properties(THCCachingAllocator.cpp PROPERTIES COMPILE_FLAGS -std=c++0x)
set_source_files_properties(THCTensorRandom.cpp THCCachingAllocator.cpp PROPERTIES COMPILE_FLAGS -std=c++0x)
endif()
else()
SET(CMAKE_CXX_STANDARD 11)
Expand Down Expand Up @@ -130,6 +130,7 @@ SET(src
THCStream.c
THCTensor.c
THCTensorCopy.c
THCTensorRandom.cpp
THCThreadLocal.c
)

Expand Down
211 changes: 89 additions & 122 deletions lib/THC/THCGeneral.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,21 @@ void THCudaInit(THCState* state)
state->cudaUVAAllocator = (THAllocator*)malloc(sizeof(THAllocator));
THCUVAAllocator_init(state->cudaUVAAllocator);

/* Enable P2P access between all pairs, if possible */
THCudaEnablePeerToPeerAccess(state);
// By default, all direct p2p kernel access (besides copy) is disallowed,
// since direct access without knowing whether or not a certain operation
// should be cross-GPU leads to synchronization errors. The user can choose
// to disable this functionality, however.
state->p2pKernelAccessEnabled = 0;

// p2pAccessEnabled records if p2p copies are allowed between pairs of
// devices. Values include "1" (copy allowed), "0" (copy not allowed), and
// "-1" (unknown).
state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
for (int i = 0; i < numDevices; ++i) {
state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
memset(state->p2pAccessEnabled[i], -1, sizeof(int) * numDevices);
state->p2pAccessEnabled[i][i] = 1;
}

for (int i = 0; i < numDevices; ++i) {
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);
Expand All @@ -98,22 +111,15 @@ void THCudaInit(THCState* state)
int numSM = state->deviceProperties[i].multiProcessorCount;
size_t sizePerStream = numSM * GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
res->scratchSpacePerStream = sizePerStream;

/* Allocate scratch space for each stream */
res->devScratchSpacePerStream = (void**) malloc(sizeof(void*));
THCudaCheck(THCudaMalloc(state, &res->devScratchSpacePerStream[0],
sizePerStream));
}

/* Restore to previous device */
THCudaCheck(cudaSetDevice(device));

/* There is no such thing as a default cublas handle.
To maintain consistency with streams API, handle 0 is always NULL and we
start counting at 1. If currentPerDeviceBlasHandle is 0 (the default
thread-local value), then we assume it means 1.
*/
THCState_reserveBlasHandles(state, 1);
// Unlike CUDA streams, there is no NULL cuBLAS handle. The default THC
// cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
// handles are created lazily.
state->numUserBlasHandles = 1;

state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
state->heapDelta = 0;
Expand Down Expand Up @@ -147,10 +153,9 @@ void THCudaShutdown(THCState* state)
for (int i = 1; i <= state->numUserStreams; ++i) {
THCStream_free(res->streams[i]);
}
/* Free Torch-defined handles (0 is NULL for consistency with streams API) */
for (int handle = 1; handle <= state->numUserBlasHandles; ++handle) {
THCublasCheck(cublasDestroy(
THCState_getDeviceBlasHandle(state, dev, handle)));
/* Free user defined BLAS handles */
for (int i = 0; i < res->numBlasHandles; ++i) {
THCublasCheck(cublasDestroy(res->blasHandles[i]));
}
/* Free per-stream scratch space; starts at 0 because there is space for
the default stream as well*/
Expand All @@ -174,79 +179,36 @@ void THCudaShutdown(THCState* state)
THCudaCheck(cudaSetDevice(prevDev));
}

/* Eagerly probes and enables CUDA peer-to-peer access between every pair of
   visible devices, caching the result in state->p2pAccessEnabled so the p2p
   status never has to be queried at runtime. Saves and restores the caller's
   current device. NOTE(review): runs O(numDevices^2) cudaSetDevice/probe
   calls and touches every GPU, which initializes all device contexts. */
void THCudaEnablePeerToPeerAccess(THCState* state)
{
/* By default, all direct p2p kernel access (besides copy) is disallowed, */
/* since direct access without knowing whether or not a certain operation */
/* should be cross-GPU leads to synchronization errors. The user can choose */
/* to disable this functionality, however. */
state->p2pKernelAccessEnabled = 0;

/* Remember the caller's device so it can be restored at the end */
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));

int numDevices = -1;
THCudaCheck(cudaGetDeviceCount(&numDevices));

/* numDevices x numDevices table of cached p2p flags (0 or 1) */
state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
for (int i = 0; i < numDevices; ++i) {
state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
}

/* Build a table of all allowed p2p accesses, to avoid checking the p2p
status at runtime. */
for (int i = 0; i < numDevices; ++i) {
/* cudaDeviceEnablePeerAccess acts on the current device, so switch to i */
THCudaCheck(cudaSetDevice(i));

for (int j = 0; j < numDevices; ++j) {
/* Presume no access by default */
state->p2pAccessEnabled[i][j] = 0;

if (i == j) {
/* A GPU can access itself */
state->p2pAccessEnabled[i][j] = 1;
} else {
int access = 0;
THCudaCheck(cudaDeviceCanAccessPeer(&access, i, j));

if (access) {
cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
/* It is possible that another thread has already enabled access. */
/* Any future call to cudaGetLastError will now return an error, */
/* even though we've already dealt with this specific error here. */
/* Call cudaGetLastError once to reset the last error state. */
cudaGetLastError();

/* The above should have cleared status */
THCudaCheck(cudaGetLastError());
} else {
/* In case there are other unhandled errors returned from the */
/* above */
THCudaCheck(err);
}

/* Access could be enabled, or was already enabled */
state->p2pAccessEnabled[i][j] = 1;
}
}
}
}

/* Restore previous device before continuing */
THCudaCheck(cudaSetDevice(prevDev));
}

/* Returns 1 if device `dev` can access memory on `devToAccess`, 0 otherwise.
   The p2p status is determined lazily on the first query for a given pair
   and cached in state->p2pAccessEnabled (-1 = unknown, 0 = no, 1 = yes),
   so devices are only initialized when actually used.
   Raises via THError when either device index is out of range. */
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices) {
    THError("%d is not a device", dev);
  }
  /* Bounds-check devToAccess against its own index (the stale duplicated
     line that re-checked `dev` here has been removed) */
  if (devToAccess < 0 || devToAccess >= state->numDevices) {
    THError("%d is not a device", devToAccess);
  }
  if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
    /* cudaDeviceEnablePeerAccess acts on the current device; switch to
       `dev` and restore the caller's device afterwards */
    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    int access = 0;
    THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
    if (access) {
      cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        /* ignore and clear the error if access was already enabled
           (possibly by another thread); cudaGetLastError resets the
           sticky error state */
        cudaGetLastError();
      } else {
        THCudaCheck(err);
      }
      state->p2pAccessEnabled[dev][devToAccess] = 1;
    } else {
      state->p2pAccessEnabled[dev][devToAccess] = 0;
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}

Expand Down Expand Up @@ -327,6 +289,20 @@ int THCState_getNumDevices(THCState *state)
return state->numDevices;
}

/* Lazily allocates the per-stream device scratch buffers for device `dev`.
   No-op if the scratch table already exists. One buffer is allocated per
   user stream plus one for the default stream (index 0). */
static void THCState_initializeScratchSpace(THCState* state, int dev)
{
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
  if (res->devScratchSpacePerStream != NULL) {
    return;  /* already initialized */
  }
  int numBuffers = state->numUserStreams + 1;  /* +1 for the default stream */
  void** buffers = (void**) malloc(numBuffers * sizeof(void*));
  for (int s = 0; s < numBuffers; ++s) {
    THCudaCheck(THCudaMalloc(state, &buffers[s], res->scratchSpacePerStream));
  }
  res->devScratchSpacePerStream = buffers;
}

void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
{
if (numStreams <= state->numUserStreams)
Expand All @@ -346,6 +322,7 @@ void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
THCStream** newStreams = realloc(res->streams, (numStreams + 1) * sizeof(THCStream*));
THAssert(newStreams);

THCState_initializeScratchSpace(state, dev);
void** newScratchSpace = realloc(res->devScratchSpacePerStream, (numStreams + 1) * sizeof(void*));
THAssert(newScratchSpace);

Expand All @@ -369,47 +346,39 @@ void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
THCudaCheck(cudaSetDevice(prevDev));
}

/* Ensures at least `numBlasHandles` cuBLAS handles exist for `device`,
   creating any missing ones. Handles are materialized per-device and
   lazily so that unused devices are never initialized. Saves and
   restores the caller's current device. */
void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasHandles)
{
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
  if (numBlasHandles <= res->numBlasHandles) {
    return;
  }

  /* cublasCreate allocates resources on the current device */
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));
  THCudaCheck(cudaSetDevice(device));

  size_t size = numBlasHandles * sizeof(cublasHandle_t);
  cublasHandle_t* handles = (cublasHandle_t*) realloc(res->blasHandles, size);
  THAssert(handles);  /* growing, so realloc failure must not go unnoticed */
  for (int i = res->numBlasHandles; i < numBlasHandles; ++i) {
    handles[i] = NULL;
    THCublasCheck(cublasCreate(&handles[i]));
  }

  res->blasHandles = handles;
  res->numBlasHandles = numBlasHandles;

  THCudaCheck(cudaSetDevice(prevDev));
}

/* Records the requested number of BLAS handles. The actual cuBLAS handles
   are created lazily by THCState_getDeviceBlasHandle (via
   THCState_reserveDeviceBlasHandles) to avoid initializing unused devices. */
void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
{
  if (numBlasHandles > state->numUserBlasHandles)
  {
    state->numUserBlasHandles = numBlasHandles;
  }
}

int THCState_getNumStreams(THCState* state)
{
return state->numUserStreams;
Expand Down Expand Up @@ -445,12 +414,13 @@ cudaStream_t THCState_getDeviceStream(THCState *state, int device, int streamInd

/* Returns cuBLAS handle number `handle` (1-based) for `device`, creating
   it lazily if it does not exist yet. Raises via THError when `handle`
   is outside [1, numUserBlasHandles]. */
cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle)
{
  if (handle <= 0 || handle > state->numUserBlasHandles) {
    THError("%d is not a valid handle, valid range is: (1, %d)",
            handle, state->numUserBlasHandles);
  }
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
  /* May realloc res->blasHandles, so read the array only afterwards */
  THCState_reserveDeviceBlasHandles(state, device, handle);
  return res->blasHandles[handle - 1];  /* handles are 1-based externally */
}

static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
Expand Down Expand Up @@ -592,16 +562,13 @@ void* THCState_getCurrentDeviceScratchSpace(THCState* state)
return THCState_getDeviceScratchSpace(state, device, stream);
}

/* Returns the device-resident scratch buffer for `stream` (0 = default
   stream) on device `dev`, lazily allocating the per-stream scratch table
   on first use. Raises via THError when `stream` is out of range. */
void* THCState_getDeviceScratchSpace(THCState* state, int dev, int stream)
{
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
  if (stream > state->numUserStreams || stream < 0) {
    THError("%d is not a stream", stream);
  }
  THCState_initializeScratchSpace(state, dev);
  return res->devScratchSpacePerStream[stream];
}

Expand Down
6 changes: 4 additions & 2 deletions lib/THC/THCGeneral.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,14 @@ typedef struct _THCDeviceAllocator {

/* Per-device bookkeeping for streams, cuBLAS handles, and scratch space.
   Heavy resources (handles, scratch buffers) are initialized lazily so
   that merely enumerating devices does not create CUDA contexts. */
typedef struct _THCCudaResourcesPerDevice {
THCStream** streams;
/* Number of cuBLAS handles actually created so far on this device */
int numBlasHandles;
/* cuBLAS handles are lazily initialized */
cublasHandle_t* blasHandles;
/* Size of the scratch space available to each stream on this device */
size_t scratchSpacePerStream;
/* Device-resident scratch space per stream, used for global memory
reduction kernels. Lazily initialized. */
void** devScratchSpacePerStream;
} THCCudaResourcesPerDevice;

Expand Down Expand Up @@ -115,7 +118,6 @@ THC_API void THCState_free(THCState* state);

THC_API void THCudaInit(THCState* state);
THC_API void THCudaShutdown(THCState* state);
THC_API void THCudaEnablePeerToPeerAccess(THCState* state);

/* If device `dev` can access allocations on device `devToAccess`, this will return */
/* 1; otherwise, 0. */
Expand Down
Loading

0 comments on commit f46ca39

Please sign in to comment.