diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index e1b85946fa6cbf..fe9b208b14f738 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -38,7 +38,16 @@ def send_tensor(queue, event, tp): event.wait() +def sum_tensors(inq, outq): + with torch.cuda.device(1): + tensors = inq.get() + for tensor in tensors: + outq.put((tensor.sum(), tensor.get_device(), + tensor.numel(), tensor.storage().size())) + + def queue_get_exception(inqueue, outqueue): + os.close(2) # hide expected error message try: torch.zeros(5, 5).cuda() except Exception as e: @@ -277,6 +286,34 @@ def test_cuda(self): torch.cuda.FloatTensor([1]) # initialize CUDA outside of leak checker self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor) + + @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') + def test_cuda_small_tensors(self): + # Check multiple small tensors which will likely use the same + # underlying cached allocation + ctx = mp.get_context('spawn') + tensors = [] + for i in range(5): + tensors += [torch.range(i * 5, (i * 5) + 4).cuda()] + + inq = ctx.Queue() + outq = ctx.Queue() + inq.put(tensors) + p = ctx.Process(target=sum_tensors, args=(inq, outq)) + p.start() + + results = [] + for i in range(5): + results.append(outq.get()) + p.join() + + for i, tensor in enumerate(tensors): + v, device, tensor_size, storage_size = results[i] + self.assertEqual(v, torch.range(i * 5, (i * 5) + 4).sum()) + self.assertEqual(device, 0) + self.assertEqual(tensor_size, 5) + self.assertEqual(storage_size, 5) + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available') def test_cuda_bad_call(self): # Initialize CUDA diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 868e4361800389..38fe3bda6f880d 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -209,7 +209,7 @@ static PyObject * THPStorage_(shareCuda)(THPStorage *self) _handle = PyBytes_FromStringAndSize((char *)&handle, CUDA_IPC_HANDLE_SIZE); _offset = PyLong_FromSsize_t((Py_ssize_t)offset); - size = PyLong_FromSize_t(base_size); + size = PyLong_FromSize_t(base_size / sizeof(real)); } if (!tuple || !device || !_handle || !size || !_offset || !view_size) { return NULL; @@ -244,7 +244,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) ptrdiff_t offset = (ptrdiff_t)THPUtils_unpackLong(_offset); size_t view_size = (size_t)THPUtils_unpackLong(_view_size); - THCPAutoGPU((int)THPUtils_unpackLong(_device)); + THCPAutoGPU __autogpu((int)THPUtils_unpackLong(_device)); char *buffer; Py_ssize_t handle_size; diff --git a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py index 967c0bd149338f..e7e384b25ab22a 100644 --- a/torch/multiprocessing/reductions.py +++ b/torch/multiprocessing/reductions.py @@ -92,7 +92,7 @@ def rebuild_storage_filename(cls, manager, handle, size): def reubild_storage_cuda(cls, device, handle, size, offset, view_size): storage = storage_from_cache(cls, handle) if storage is not None: - return storage._new_view(offset, size) + return storage._new_view(offset, view_size) torch.cuda._lazy_init() storage = cls._new_shared_cuda(device, handle, size, offset, view_size) shared_cache[handle] = storage._weak_ref(StorageRef)