Commit 5fe90b5
Force CUDA runtime initialization before device creation.
This avoids silent failures and garbage results when two TensorFlow programs are launched simultaneously in different processes.

PiperOrigin-RevId: 173456597
Yangzihao Wang authored and tensorflower-gardener committed Oct 25, 2017
1 parent 71beb2d commit 5fe90b5
1 changed file with 28 additions and 0 deletions: tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -652,6 +652,34 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
   if (static_cast<size_t>(n) > valid_gpu_ids.size()) {
     n = valid_gpu_ids.size();
   }
+  // Save the original device.
+  int original_device = 0;
+  cudaError_t err = cudaGetDevice(&original_device);
+  if (err != cudaSuccess) {
+    return errors::Internal("cudaGetDevice() failed. Status: ",
+                            cudaGetErrorString(err));
+  }
+  // Force to implicitly initialize CUDA runtime on each valid GPU before
+  // CreateGPUDevice().
+  for (int gpu_id : valid_gpu_ids) {
+    err = cudaSetDevice(gpu_id);
+    if (err != cudaSuccess) {
+      return errors::Internal("cudaSetDevice() on GPU:", gpu_id,
+                              " failed. Status: ", cudaGetErrorString(err));
+    }
+    err = cudaFree(nullptr);
+    if (err != cudaSuccess) {
+      return errors::Internal(
+          "CUDA runtime implicit initialization on GPU:", gpu_id,
+          " failed. Status: ", cudaGetErrorString(err));
+    }
+  }
+  // Reset to the original device.
+  err = cudaSetDevice(original_device);
+  if (err != cudaSuccess) {
+    return errors::Internal("cudaSetDevice() on GPU:", original_device,
+                            " failed. Status: ", cudaGetErrorString(err));
+  }
   for (int i = 0; i < n; i++) {
     BaseGPUDevice* gpu_device;
     TF_RETURN_IF_ERROR(CreateGPUDevice(
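
For illustration, here is a minimal standalone sketch (not part of the commit) of the same idiom: cudaFree(nullptr) is a harmless call whose only observable effect is to force the CUDA runtime to initialize (create the primary context) on the current device, so initialization problems surface as explicit errors instead of silent failures later. The file name init_runtime.cu and the loop over cudaGetDeviceCount() are assumptions for this sketch; the commit itself iterates over TensorFlow's valid_gpu_ids and reports failures via errors::Internal.

// init_runtime.cu (hypothetical example) -- build with: nvcc -o init_runtime init_runtime.cu
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  // Save the currently active device so it can be restored afterwards.
  int original_device = 0;
  cudaError_t err = cudaGetDevice(&original_device);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaGetDevice failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  int device_count = 0;
  err = cudaGetDeviceCount(&device_count);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  for (int gpu_id = 0; gpu_id < device_count; ++gpu_id) {
    err = cudaSetDevice(gpu_id);
    if (err != cudaSuccess) {
      std::fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", gpu_id,
                   cudaGetErrorString(err));
      return 1;
    }
    // cudaFree(nullptr) does nothing to memory, but it forces the runtime to
    // initialize on the current device, surfacing any initialization error here.
    err = cudaFree(nullptr);
    if (err != cudaSuccess) {
      std::fprintf(stderr, "CUDA runtime init on GPU %d failed: %s\n", gpu_id,
                   cudaGetErrorString(err));
      return 1;
    }
  }

  // Restore the originally active device.
  err = cudaSetDevice(original_device);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", original_device,
                 cudaGetErrorString(err));
    return 1;
  }
  std::printf("CUDA runtime initialized on %d device(s)\n", device_count);
  return 0;
}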
