
Commit 05c1dfa

Merge pull request tensorflow#10531 from llhe/rdma_fix
Improve RDMA rendezvous speed
2 parents df6a235 + 3ca692e commit 05c1dfa

File tree

4 files changed: +124 -16 lines changed

tensorflow/contrib/verbs/rdma.cc (+45 -10)
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -683,7 +684,6 @@ void RdmaTensorBuffer::SendNextItem() {
         << " error message: " << status.error_message();
     size_t buffer_size = RdmaMessage::kMessageTotalBytes;
     size_t tensor_bytes = 0;
-    TensorProto proto;
     // Figures out which device the tensor is hosted on.
     Device* src_dev = nullptr;
     Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
@@ -703,21 +703,47 @@ void RdmaTensorBuffer::SendNextItem() {
     CHECK(s.ok()) << "dst device not found";
     AllocatorAttributes dst_alloc_attr;
     dst_alloc_attr.set_on_host(true);
+
+    bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
     // string tensor needs to be serialized
+    Tensor copy;
+    StringPiece copy_buf;
+    TensorProto proto;
     if (src_dev->tensorflow_gpu_device_info() &&
         (!send_args.alloc_attrs.on_host())) {
       CHECK(send_args.device_context)
-          << "send dev name: " << src_dev->name()
-          << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-      // "val" is on a GPU. Uses GPUUtil to fill the proto.
-      s = VerbsUtil::SetProtoFromGPUSync(
-          in, src_dev, send_args.device_context, &proto, is_dead);
-      CHECK(s.ok()) << "set proto from gpu sync";
+          << "send dev name: " << src_dev->name()
+          << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+
+      if (can_memcpy) {
+        AllocatorAttributes host_alloc_attrs;
+        host_alloc_attrs.set_gpu_compatible(true);
+        host_alloc_attrs.set_on_host(true);
+        Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+        copy = Tensor(alloc, in.dtype(), in.shape());
+        s = VerbsUtil::CopyGPUTensorToCPUSync(
+            src_dev, send_args.device_context, &in, &copy);
+        CHECK(s.ok()) << "copy tensor from gpu sync";
+        copy_buf = copy.tensor_data();
+      } else {
+        // "val" is on a GPU. Uses GPUUtil to fill the proto.
+        s = VerbsUtil::SetProtoFromGPUSync(
+            in, src_dev, send_args.device_context, &proto, is_dead);
+        CHECK(s.ok()) << "set proto from gpu sync";
+      }
     } else {
       // tensor is in CPU memory.
-      in.AsProtoTensorContent(&proto);
+      if (can_memcpy) {
+        copy_buf = in.tensor_data();
+      } else {
+        in.AsProtoTensorContent(&proto);
+      }
+    }
+    if (can_memcpy) {
+      tensor_bytes = in.TotalBytes();
+    } else {
+      tensor_bytes = proto.ByteSize();
     }
-    tensor_bytes = proto.ByteSize();
     // maybe some margin for string tensor?
     buffer_size += tensor_bytes;
     // prepare message
@@ -771,7 +797,16 @@ void RdmaTensorBuffer::SendNextItem() {
           static_cast<void*>(static_cast<char*>(buffer_) +
                              RdmaMessage::kTensorBufferStartIndex);
      CHECK(tensor_bytes + RdmaMessage::kTensorBufferStartIndex <= size_);
-      proto.SerializeToArray(output, tensor_bytes);
+      if (can_memcpy) {
+        CHECK(copy_buf.size() == tensor_bytes)
+            << "unexpected tensor size: "
+            << copy_buf.size()
+            << " != "
+            << tensor_bytes;
+        memcpy(output, copy_buf.data(), tensor_bytes);
+      } else {
+        proto.SerializeToArray(output, tensor_bytes);
+      }
     } else {
       buffer_size = RdmaMessage::kMessageTotalBytes;
     }
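
The crux of the speed-up is visible in the hunks above: for dtypes where DataTypeCanUseMemcpy() is true, the sender now copies the tensor's raw bytes straight into the registered RDMA buffer (staging GPU tensors through a CUDA-pinned host copy first), and only falls back to TensorProto serialization for variable-length types such as strings. Below is a minimal standalone sketch of that split; FakeTensor, CanUseMemcpy, and FillBuffer are hypothetical stand-ins for tensorflow::Tensor, DataTypeCanUseMemcpy, and the SendNextItem() logic, not TensorFlow code.

// Sketch of the send-side split: raw memcpy for fixed-size dtypes,
// serialized bytes only as the fallback path.
#include <cassert>
#include <cstring>
#include <string>
#include <vector>

enum class DType { kFloat, kString };

// True for plain-old-data dtypes whose bytes can be copied verbatim.
bool CanUseMemcpy(DType dt) { return dt != DType::kString; }

struct FakeTensor {
  DType dtype;
  std::vector<float> data;   // backing store for the kFloat case
  std::string serialized;    // pretend proto bytes for the kString case
  size_t TotalBytes() const { return data.size() * sizeof(float); }
};

// Fill an RDMA-style output buffer: memcpy on the fast path, "serialized"
// bytes on the slow path, mirroring the patched SendNextItem().
size_t FillBuffer(const FakeTensor& t, char* out, size_t cap) {
  if (CanUseMemcpy(t.dtype)) {
    const size_t n = t.TotalBytes();
    assert(n <= cap);
    std::memcpy(out, t.data.data(), n);  // no encode/decode round trip
    return n;
  }
  const size_t n = t.serialized.size();
  assert(n <= cap);
  std::memcpy(out, t.serialized.data(), n);
  return n;
}

int main() {
  FakeTensor t{DType::kFloat, {1.f, 2.f, 3.f}, ""};
  char buf[64];
  return FillBuffer(t, buf, sizeof(buf)) == 3 * sizeof(float) ? 0 : 1;
}

The fast path skips the proto encode/decode round trip entirely: the bytes that land in the RDMA buffer are exactly the tensor's backing store.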

tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc (+35 -6)
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -99,12 +100,40 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
     if (!rm.is_dead_) {
       void* input = static_cast<char*>(rb->buffer_) +
                     RdmaMessage::kTensorBufferStartIndex;
-      TensorProto proto;
-      CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
-            rb->size_);
-      CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
-          << "fail to parse proto from array";
-      s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
+      bool can_memcpy = DataTypeCanUseMemcpy(rm.data_type_);
+      if (can_memcpy) {
+        if (dst_dev->tensorflow_gpu_device_info() &&
+            (!recv_args.alloc_attrs.on_host())) {
+          CHECK(recv_args.device_context)
+              << "send dev name: " << src_dev->name()
+              << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
+          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
+
+          Allocator* dst_alloc = dst_dev->GetAllocator(recv_args.alloc_attrs);
+          Tensor gpu_copy(dst_alloc, rm.data_type_, rm.tensor_shape_);
+          s = VerbsUtil::CopyCPUTensorToGPUSync(&copy, recv_args.device_context,
+                                                dst_dev, &gpu_copy);
+          CHECK(s.ok()) << "copy tensor to gpu sync";
+          val = std::move(gpu_copy);
+        } else {
+          AllocatorAttributes host_alloc_attrs;
+          host_alloc_attrs.set_gpu_compatible(true);
+          host_alloc_attrs.set_on_host(true);
+          Allocator* alloc = dst_dev->GetAllocator(host_alloc_attrs);
+          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
+          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
+          val = std::move(copy);
+        }
+      } else {
+        TensorProto proto;
+        CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
+              rb->size_);
+        CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
+            << "fail to parse proto from array";
+        s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
+      }
     }
 
     rc->RemoveRecvCallback(key_with_step_id);
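
The receive side applies the same gate in reverse: for memcpy-able dtypes the tensor is rebuilt directly from the wire bytes via a host tensor (CUDA-pinned when the destination is a GPU, so the follow-up host-to-device copy can use DMA), and ParseProtoUnlimited/MakeTensorFromProto survive only on the fallback path. A tiny standalone sketch of the raw-bytes reconstruction, with RebuildFloats as a hypothetical stand-in for the memcpy into DMAHelper::base(&copy):

#include <cstring>
#include <vector>

// Rebuild typed storage from the bytes sitting in the RDMA buffer; one copy,
// no proto parse, mirroring memcpy(DMAHelper::base(&copy), input, ...).
std::vector<float> RebuildFloats(const void* wire_bytes, size_t num_bytes) {
  std::vector<float> out(num_bytes / sizeof(float));
  std::memcpy(out.data(), wire_bytes, num_bytes);
  return out;
}

int main() {
  const float src[3] = {1.f, 2.f, 3.f};
  std::vector<float> t = RebuildFloats(src, sizeof(src));
  return t[2] == 3.f ? 0 : 1;
}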

tensorflow/contrib/verbs/verbs_util.cc (+34 -0)
@@ -20,6 +20,40 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 namespace tensorflow {
 
+// static sync wrapper:
+Status VerbsUtil::CopyGPUTensorToCPUSync(Device* gpu_device,
+                                         const DeviceContext* device_context,
+                                         const Tensor* gpu_tensor,
+                                         Tensor* cpu_tensor) {
+  Notification n;
+  Status status;
+  GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context,
+                              gpu_tensor, cpu_tensor,
+                              [&n, &status](const Status& s) {
+                                status = s;
+                                n.Notify();
+                              });
+  n.WaitForNotification();
+  return status;
+}
+
+// static sync wrapper:
+Status VerbsUtil::CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
+                                         const DeviceContext* device_context,
+                                         Device* gpu_device,
+                                         Tensor* gpu_tensor) {
+  Notification n;
+  Status status;
+  GPUUtil::CopyCPUTensorToGPU(cpu_tensor, device_context,
+                              gpu_device, gpu_tensor,
+                              [&n, &status](const Status& s) {
+                                status = s;
+                                n.Notify();
+                              });
+  n.WaitForNotification();
+  return status;
+}
+
 // static sync wrapper:
 Status VerbsUtil::SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                       const DeviceContext* device_context,
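
Both new wrappers use TensorFlow's Notification to turn GPUUtil's callback-style async copies into blocking calls: the caller parks until the done-callback fires, then returns the status it captured. A standalone sketch of that pattern, with MiniNotification and AsyncCopy as hypothetical stand-ins for tensorflow::Notification and the GPUUtil copy APIs:

#include <condition_variable>
#include <functional>
#include <mutex>
#include <string>
#include <thread>

// A tiny stand-in for tensorflow::Notification: one-shot wakeup.
class MiniNotification {
 public:
  void Notify() {
    std::lock_guard<std::mutex> l(mu_);
    notified_ = true;
    cv_.notify_all();
  }
  void WaitForNotification() {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return notified_; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  bool notified_ = false;
};

// An async API in the shape of GPUUtil::CopyGPUTensorToCPU: it completes on
// another thread and reports its status through a done-callback.
void AsyncCopy(std::function<void(const std::string&)> done) {
  std::thread([done] { done("OK"); }).detach();
}

// Sync wrapper in the same shape as VerbsUtil::CopyGPUTensorToCPUSync.
std::string SyncCopy() {
  MiniNotification n;
  std::string status;
  AsyncCopy([&n, &status](const std::string& s) {
    status = s;
    n.Notify();
  });
  n.WaitForNotification();
  return status;
}

int main() { return SyncCopy() == "OK" ? 0 : 1; }

The trade-off is that the calling thread blocks for the duration of the copy; that keeps the rendezvous code simple at the cost of one parked thread per in-flight transfer.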

tensorflow/contrib/verbs/verbs_util.h (+10 -0)
@@ -28,6 +28,16 @@ class TensorProto;
 
 class VerbsUtil {
  public:
+  // synchronous wrapper of CopyGPUTensorToCPU
+  static Status CopyGPUTensorToCPUSync(Device* gpu_device,
+                                       const DeviceContext* device_context,
+                                       const Tensor* gpu_tensor,
+                                       Tensor* cpu_tensor);
+  // synchronous wrapper of CopyCPUTensorToGPU
+  static Status CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
+                                       const DeviceContext* device_context,
+                                       Device* gpu_device,
+                                       Tensor* gpu_tensor);
   // synchronous wrapper of SetProtoFromGPU
   static Status SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                     const DeviceContext* device_context,