Tensorflow CUDA helper: custom impl of atomicMax(uint64..).
The motivation is that this particular overload is only available for compute
capability >= 3.5. We'd like to make sure any callers (the Multinomial GPU
kernel) have consistent semantics, so we replace the atomicExch() workaround
with this custom impl when need be.
Change: 123468056
concretevitamin authored and tensorflower-gardener committed May 27, 2016
1 parent 5185d44 commit ef7013c
Showing 2 changed files with 15 additions and 14 deletions.
9 changes: 0 additions & 9 deletions tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -151,17 +151,8 @@ __global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     const int maxima_idx = index / num_classes;
     if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-      // The uint64 overload of atomicMax() only works for a compute capability
-      // of >= 3.5. If not satisfied, we resort to using atomicExch() which
-      // does not guarantee deterministic results across runs (in the presence
-      // of multiple winners).
       CudaAtomicMax(reinterpret_cast<uint64*>(output + maxima_idx),
                     static_cast<uint64>(index % num_classes));
-#else
-      CudaAtomicExch(reinterpret_cast<uint64*>(output + maxima_idx),
-                     static_cast<uint64>(index % num_classes));
-#endif
     }
   }
 }
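For context (an illustration, not part of the commit): every thread whose score equals its row's maximum performs the atomic write, so when several classes tie, the combining rule decides which index survives. atomicExch() keeps whichever tying thread happened to write last, which can vary from run to run; atomicMax() always keeps the largest tying class index. A minimal standalone sketch of the difference, with hypothetical tie indices 2, 5, and 7 (kTies, ExchWinner, and MaxWinner are made-up names, not from the repository):

#include <cstdio>

// Hypothetical example: classes 2, 5, and 7 tie for the row maximum.
__constant__ unsigned long long kTies[3] = {2, 5, 7};

__global__ void ExchWinner(unsigned long long* out) {
  // Last writer wins; which thread writes last is scheduling-dependent.
  atomicExch(out, kTies[threadIdx.x]);
}

__global__ void MaxWinner(unsigned long long* out) {
  // Largest value wins regardless of write order: always 7 here.
  // Note: the 64-bit atomicMax() overload requires __CUDA_ARCH__ >= 350.
  atomicMax(out, kTies[threadIdx.x]);
}

// Build (hypothetical): nvcc -arch=sm_35 tie_demo.cu
int main() {
  unsigned long long *d_out, h_out;
  cudaMalloc(&d_out, sizeof(*d_out));

  cudaMemset(d_out, 0, sizeof(*d_out));
  ExchWinner<<<1, 3>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("atomicExch winner: %llu (2, 5, or 7; run-dependent)\n", h_out);

  cudaMemset(d_out, 0, sizeof(*d_out));
  MaxWinner<<<1, 3>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("atomicMax winner:  %llu (always 7)\n", h_out);

  cudaFree(d_out);
  return 0;
}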
20 changes: 15 additions & 5 deletions tensorflow/core/util/cuda_kernel_helper.h
@@ -91,17 +91,27 @@ USE_CUDA_ATOMIC(Add, int32);
USE_CUDA_ATOMIC(Add, uint32);
USE_CUDA_ATOMIC(Add, uint64);
USE_CUDA_ATOMIC(Add, float);

// For atomicMax.
USE_CUDA_ATOMIC(Max, int32);
USE_CUDA_ATOMIC(Max, uint32);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC(Max, uint64);
#else
// The uint64 overload of atomicMax() is only available for __CUDA_ARCH__ >=
// 350. If not satisfied, we provide a custom implementation using atomicCAS().
CUDA_ATOMIC_WRAPPER(Max, uint64) {
  uint64* address_as_ull = (uint64*)address;
  uint64 old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, max(val, assumed));
  } while (assumed != old);

  return old;
}
#endif

// For atomicExch.
USE_CUDA_ATOMIC(Exch, int32);
USE_CUDA_ATOMIC(Exch, uint32);
USE_CUDA_ATOMIC(Exch, uint64);
USE_CUDA_ATOMIC(Exch, float);

// Custom implementation of atomicAdd for double.
// This implementation is copied from CUDA manual.
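An aside on the comment just above: the double-precision atomicAdd itself is elided from this hunk. For reference, the version in the CUDA C Programming Guide uses the same atomicCAS() read-modify-retry loop as the new Max wrapper, only with the 64-bit integer bits reinterpreted as a double for the addition. Shown here under a hypothetical name, AtomicAddDouble, since the file's exact lines are not visible:

// As given (modulo naming) in the CUDA C Programming Guide; shown for
// reference only, not copied from cuda_kernel_helper.h.
__device__ double AtomicAddDouble(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    // Reinterpret the bits so the addition happens in double precision
    // while the CAS operates on the 64-bit integer view.
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);

  return __longlong_as_double(old);
}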
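To exercise the CAS-based pattern outside TensorFlow, here is a minimal standalone harness; a sketch, not part of the repository. It uses plain unsigned long long in place of TensorFlow's uint64, and restates the wrapper's loop as a free function named AtomicMaxU64 (a made-up name):

#include <cstdio>

// Restatement of the commit's CAS loop, assuming no hardware 64-bit
// atomicMax() is available on the target architecture.
__device__ unsigned long long AtomicMaxU64(unsigned long long* address,
                                           unsigned long long val) {
  unsigned long long old = *address, assumed;
  do {
    assumed = old;
    // If another thread changed *address after we read `assumed`, the CAS
    // fails and returns the current value; we retry against that value.
    old = atomicCAS(address, assumed, max(val, assumed));
  } while (assumed != old);
  return old;
}

__global__ void MaxKernel(unsigned long long* out) {
  unsigned long long idx =
      static_cast<unsigned long long>(blockIdx.x * blockDim.x + threadIdx.x);
  AtomicMaxU64(out, idx);
}

// Build (hypothetical): nvcc -arch=sm_30 atomic_max_demo.cu
// Only 64-bit atomicCAS() is needed, so this runs below sm_35.
int main() {
  unsigned long long* d_out;
  cudaMalloc(&d_out, sizeof(*d_out));
  cudaMemset(d_out, 0, sizeof(*d_out));

  MaxKernel<<<4, 256>>>(d_out);

  unsigned long long h_out = 0;
  cudaMemcpy(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("max = %llu\n", h_out);  // Expect 1023, the largest thread index.
  cudaFree(d_out);
  return 0;
}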
