Fix CUDA sharing across processes (pytorch#530)
colesbury authored and soumith committed Jan 20, 2017
1 parent c991258 commit 0c69fd5
Showing 3 changed files with 40 additions and 3 deletions.
37 changes: 37 additions & 0 deletions test/test_multiprocessing.py
@@ -38,7 +38,16 @@ def send_tensor(queue, event, tp):
     event.wait()
 
 
+def sum_tensors(inq, outq):
+    with torch.cuda.device(1):
+        tensors = inq.get()
+        for tensor in tensors:
+            outq.put((tensor.sum(), tensor.get_device(),
+                      tensor.numel(), tensor.storage().size()))
+
+
 def queue_get_exception(inqueue, outqueue):
     os.close(2)  # hide expected error message
     try:
         torch.zeros(5, 5).cuda()
     except Exception as e:
@@ -277,6 +286,34 @@ def test_cuda(self):
         torch.cuda.FloatTensor([1])  # initialize CUDA outside of leak checker
         self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
 
+
+    @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
+    def test_cuda_small_tensors(self):
+        # Check multiple small tensors which will likely use the same
+        # underlying cached allocation
+        ctx = mp.get_context('spawn')
+        tensors = []
+        for i in range(5):
+            tensors += [torch.range(i * 5, (i * 5) + 4).cuda()]
+
+        inq = ctx.Queue()
+        outq = ctx.Queue()
+        inq.put(tensors)
+        p = ctx.Process(target=sum_tensors, args=(inq, outq))
+        p.start()
+
+        results = []
+        for i in range(5):
+            results.append(outq.get())
+        p.join()
+
+        for i, tensor in enumerate(tensors):
+            v, device, tensor_size, storage_size = results[i]
+            self.assertEqual(v, torch.range(i * 5, (i * 5) + 4).sum())
+            self.assertEqual(device, 0)
+            self.assertEqual(tensor_size, 5)
+            self.assertEqual(storage_size, 5)
+
     @unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available')
     def test_cuda_bad_call(self):
         # Initialize CUDA
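Note on the new test: PyTorch's CUDA caching allocator will typically place several small tensors inside one cached device allocation, so each tensor is shared across processes as an IPC handle for the base block plus an offset and a view size, and the child uses the 'spawn' start method because a forked child cannot reuse the parent's CUDA context. A minimal pure-Python sketch of that bookkeeping (no CUDA required; `base` and the tuple layout are illustrative, not PyTorch APIs):

    # Five 5-element tensors carved out of one 25-element cached block,
    # each shared as (offset, view_size) into the same base allocation --
    # the scenario test_cuda_small_tensors exercises.
    base = list(range(25))  # stands in for a single cached CUDA allocation

    shared = [(i * 5, 5) for i in range(5)]  # (offset, view_size) per tensor

    for i, (offset, view_size) in enumerate(shared):
        view = base[offset:offset + view_size]
        assert len(view) == 5                              # tensor_size == 5
        assert sum(view) == sum(range(i * 5, i * 5 + 5))   # per-tensor sum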
4 changes: 2 additions & 2 deletions torch/csrc/generic/StorageSharing.cpp
@@ -209,7 +209,7 @@ static PyObject * THPStorage_(shareCuda)(THPStorage *self)
 
     _handle = PyBytes_FromStringAndSize((char *)&handle, CUDA_IPC_HANDLE_SIZE);
     _offset = PyLong_FromSsize_t((Py_ssize_t)offset);
-    size = PyLong_FromSize_t(base_size);
+    size = PyLong_FromSize_t(base_size / sizeof(real));
   }
   if (!tuple || !device || !_handle || !size || !_offset || !view_size) {
     return NULL;
@@ -244,7 +244,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
   ptrdiff_t offset = (ptrdiff_t)THPUtils_unpackLong(_offset);
   size_t view_size = (size_t)THPUtils_unpackLong(_view_size);
 
-  THCPAutoGPU((int)THPUtils_unpackLong(_device));
+  THCPAutoGPU __autogpu((int)THPUtils_unpackLong(_device));
 
   char *buffer;
   Py_ssize_t handle_size;
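Two separate bugs are fixed in StorageSharing.cpp. First, `shareCuda` advertised the base allocation's size in bytes, while storage sizes on the Python side count elements; dividing by `sizeof(real)` stops receivers from rebuilding storages `sizeof(real)` times too large. Second, `THCPAutoGPU((int)...)` constructed an unnamed temporary, which C++ destroys at the end of that statement, so the device switch was already undone by the time the IPC handle was opened; giving the guard a name (`__autogpu`) keeps it alive to the end of the scope. A toy Python analogue of the guard bug (CPython's reference counting destroys the unnamed object immediately, mirroring the C++ temporary; `DeviceGuard` is a stand-in, not a torch API):

    class DeviceGuard:
        """Toy RAII-style guard, analogous to THCPAutoGPU."""
        current = 0

        def __init__(self, device):
            self.prev = DeviceGuard.current
            DeviceGuard.current = device     # switch on construction

        def __del__(self):
            DeviceGuard.current = self.prev  # restore on destruction

    def broken():
        DeviceGuard(1)              # unnamed: destroyed right away
        return DeviceGuard.current  # 0 -- the switch did not stick

    def fixed():
        guard = DeviceGuard(1)      # named: lives until the function returns
        return DeviceGuard.current  # 1

    assert broken() == 0
    assert fixed() == 1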
2 changes: 1 addition & 1 deletion torch/multiprocessing/reductions.py
@@ -92,7 +92,7 @@ def rebuild_storage_filename(cls, manager, handle, size):
 def rebuild_storage_cuda(cls, device, handle, size, offset, view_size):
     storage = storage_from_cache(cls, handle)
     if storage is not None:
-        return storage._new_view(offset, size)
+        return storage._new_view(offset, view_size)
     torch.cuda._lazy_init()
     storage = cls._new_shared_cuda(device, handle, size, offset, view_size)
     shared_cache[handle] = storage._weak_ref(StorageRef)
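The Python-side fix completes the picture: `shared_cache` deduplicates mappings of the same base allocation within a process, and on a cache hit the storage must be viewed with `view_size` (the length of this tensor's storage), not `size` (the length of the whole shared block). A simplified, CUDA-free model of the rebuild path (the class and helper are stand-ins that mirror the diff, not torch internals; the real cache holds weak references and maps the allocation via `_new_shared_cuda`):

    shared_cache = {}  # handle -> base storage

    class FakeStorage:
        """Stand-in for a CUDA storage mapped from an IPC handle."""
        def __init__(self, data):
            self.data = data

        def _new_view(self, offset, length):
            return FakeStorage(self.data[offset:offset + length])

    def rebuild(handle, size, offset, view_size):
        base = shared_cache.get(handle)
        if base is None:  # first rebuild in this process: map the block
            base = FakeStorage([0.0] * size)
            shared_cache[handle] = base
        return base._new_view(offset, view_size)  # the fix: view_size, not size

    a = rebuild('h0', size=25, offset=0, view_size=5)  # cache miss
    b = rebuild('h0', size=25, offset=5, view_size=5)  # cache hit
    assert len(a.data) == len(b.data) == 5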
