Fix CUDA sharing across processes (pytorch#530)
colesbury authored and soumith committed Jan 20, 2017
1 parent c991258 commit 0c69fd5
Showing 3 changed files with 40 additions and 3 deletions.
37 changes: 37 additions & 0 deletions test/test_multiprocessing.py
@@ -38,7 +38,16 @@ def send_tensor(queue, event, tp):
     event.wait()
 
 
+def sum_tensors(inq, outq):
+    with torch.cuda.device(1):
+        tensors = inq.get()
+        for tensor in tensors:
+            outq.put((tensor.sum(), tensor.get_device(),
+                      tensor.numel(), tensor.storage().size()))
+
+
 def queue_get_exception(inqueue, outqueue):
     os.close(2)  # hide expected error message
     try:
         torch.zeros(5, 5).cuda()
     except Exception as e:
@@ -277,6 +286,34 @@ def test_cuda(self):
         torch.cuda.FloatTensor([1])  # initialize CUDA outside of leak checker
         self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
 
+
+    @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
+    def test_cuda_small_tensors(self):
+        # Check multiple small tensors which will likely use the same
+        # underlying cached allocation
+        ctx = mp.get_context('spawn')
+        tensors = []
+        for i in range(5):
+            tensors += [torch.range(i * 5, (i * 5) + 4).cuda()]
+
+        inq = ctx.Queue()
+        outq = ctx.Queue()
+        inq.put(tensors)
+        p = ctx.Process(target=sum_tensors, args=(inq, outq))
+        p.start()
+
+        results = []
+        for i in range(5):
+            results.append(outq.get())
+        p.join()
+
+        for i, tensor in enumerate(tensors):
+            v, device, tensor_size, storage_size = results[i]
+            self.assertEqual(v, torch.range(i * 5, (i * 5) + 4).sum())
+            self.assertEqual(device, 0)
+            self.assertEqual(tensor_size, 5)
+            self.assertEqual(storage_size, 5)
+
     @unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available')
     def test_cuda_bad_call(self):
         # Initialize CUDA
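Note on the new test: PyTorch's CUDA caching allocator will typically place several small tensors inside one cached device allocation, so each tensor is shared across processes as an IPC handle for the base block plus an offset and a view size, and the child uses the 'spawn' start method because a forked child cannot reuse the parent's CUDA context. A minimal pure-Python sketch of that bookkeeping (no CUDA required; `base` and the tuple layout are illustrative, not PyTorch APIs):

    # Five 5-element tensors carved out of one 25-element cached block,
    # each shared as (offset, view_size) into the same base allocation --
    # the scenario test_cuda_small_tensors exercises.
    base = list(range(25))  # stands in for a single cached CUDA allocation

    shared = [(i * 5, 5) for i in range(5)]  # (offset, view_size) per tensor

    for i, (offset, view_size) in enumerate(shared):
        view = base[offset:offset + view_size]
        assert len(view) == 5                              # tensor_size == 5
        assert sum(view) == sum(range(i * 5, i * 5 + 5))   # per-tensor sum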
4 changes: 2 additions & 2 deletions torch/csrc/generic/StorageSharing.cpp
@@ -209,7 +209,7 @@ static PyObject * THPStorage_(shareCuda)(THPStorage *self)
 
     _handle = PyBytes_FromStringAndSize((char *)&handle, CUDA_IPC_HANDLE_SIZE);
     _offset = PyLong_FromSsize_t((Py_ssize_t)offset);
-    size = PyLong_FromSize_t(base_size);
+    size = PyLong_FromSize_t(base_size / sizeof(real));
   }
   if (!tuple || !device || !_handle || !size || !_offset || !view_size) {
     return NULL;
@@ -244,7 +244,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
   ptrdiff_t offset = (ptrdiff_t)THPUtils_unpackLong(_offset);
   size_t view_size = (size_t)THPUtils_unpackLong(_view_size);
 
-  THCPAutoGPU((int)THPUtils_unpackLong(_device));
+  THCPAutoGPU __autogpu((int)THPUtils_unpackLong(_device));
 
   char *buffer;
   Py_ssize_t handle_size;
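Two separate bugs are fixed in StorageSharing.cpp. First, `shareCuda` advertised the base allocation's size in bytes, while storage sizes on the Python side count elements; dividing by `sizeof(real)` stops receivers from rebuilding storages `sizeof(real)` times too large. Second, `THCPAutoGPU((int)...)` constructed an unnamed temporary, which C++ destroys at the end of that statement, so the device switch was already undone by the time the IPC handle was opened; giving the guard a name (`__autogpu`) keeps it alive to the end of the scope. A toy Python analogue of the guard bug (CPython's reference counting destroys the unnamed object immediately, mirroring the C++ temporary; `DeviceGuard` is a stand-in, not a torch API):

    class DeviceGuard:
        """Toy RAII-style guard, analogous to THCPAutoGPU."""
        current = 0

        def __init__(self, device):
            self.prev = DeviceGuard.current
            DeviceGuard.current = device     # switch on construction

        def __del__(self):
            DeviceGuard.current = self.prev  # restore on destruction

    def broken():
        DeviceGuard(1)              # unnamed: destroyed right away
        return DeviceGuard.current  # 0 -- the switch did not stick

    def fixed():
        guard = DeviceGuard(1)      # named: lives until the function returns
        return DeviceGuard.current  # 1

    assert broken() == 0
    assert fixed() == 1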
2 changes: 1 addition & 1 deletion torch/multiprocessing/reductions.py
@@ -92,7 +92,7 @@ def rebuild_storage_filename(cls, manager, handle, size):
 def rebuild_storage_cuda(cls, device, handle, size, offset, view_size):
     storage = storage_from_cache(cls, handle)
     if storage is not None:
-        return storage._new_view(offset, size)
+        return storage._new_view(offset, view_size)
     torch.cuda._lazy_init()
     storage = cls._new_shared_cuda(device, handle, size, offset, view_size)
     shared_cache[handle] = storage._weak_ref(StorageRef)
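The Python-side fix completes the picture: `shared_cache` deduplicates mappings of the same base allocation within a process, and on a cache hit the storage must be viewed with `view_size` (the length of this tensor's storage), not `size` (the length of the whole shared block). A simplified, CUDA-free model of the rebuild path (the class and helper are stand-ins that mirror the diff, not torch internals; the real cache holds weak references and maps the allocation via `_new_shared_cuda`):

    shared_cache = {}  # handle -> base storage

    class FakeStorage:
        """Stand-in for a CUDA storage mapped from an IPC handle."""
        def __init__(self, data):
            self.data = data

        def _new_view(self, offset, length):
            return FakeStorage(self.data[offset:offset + length])

    def rebuild(handle, size, offset, view_size):
        base = shared_cache.get(handle)
        if base is None:  # first rebuild in this process: map the block
            base = FakeStorage([0.0] * size)
            shared_cache[handle] = base
        return base._new_view(offset, view_size)  # the fix: view_size, not size

    a = rebuild('h0', size=25, offset=0, view_size=5)  # cache miss
    b = rebuild('h0', size=25, offset=5, view_size=5)  # cache hit
    assert len(a.data) == len(b.data) == 5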
