Tensorflow CUDA helper: custom impl of atomicMax(uint64..).
The motivation is that this particular overload is only available for compute
capability >= 3.5. We'd like to make sure any callers (the Multinomial GPU
kernel) have consistent semantics, so we replace the atomicExch() workaround
with this custom impl when need be.
Change: 123468056
concretevitamin authored and tensorflower-gardener committed May 27, 2016
1 parent 5185d44 commit ef7013c
Showing 2 changed files with 15 additions and 14 deletions.
9 changes: 0 additions & 9 deletions tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -151,17 +151,8 @@ __global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     const int maxima_idx = index / num_classes;
     if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-      // The uint64 overload of atomicMax() only works for a compute capability
-      // of >= 3.5. If not satisfied, we resort to using atomicExch() which
-      // does not guarantee deterministic results across runs (in the presence
-      // of multiple winners).
       CudaAtomicMax(reinterpret_cast<uint64*>(output + maxima_idx),
                     static_cast<uint64>(index % num_classes));
-#else
-      CudaAtomicExch(reinterpret_cast<uint64*>(output + maxima_idx),
-                     static_cast<uint64>(index % num_classes));
-#endif
     }
   }
 }
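For context (an illustration, not part of the commit): every thread whose score equals its row's maximum performs the atomic write, so when several classes tie, the combining rule decides which index survives. atomicExch() keeps whichever tying thread happened to write last, which can vary from run to run; atomicMax() always keeps the largest tying class index. A minimal standalone sketch of the difference, with hypothetical tie indices 2, 5, and 7 (kTies, ExchWinner, and MaxWinner are made-up names, not from the repository):

#include <cstdio>

// Hypothetical example: classes 2, 5, and 7 tie for the row maximum.
__constant__ unsigned long long kTies[3] = {2, 5, 7};

__global__ void ExchWinner(unsigned long long* out) {
  // Last writer wins; which thread writes last is scheduling-dependent.
  atomicExch(out, kTies[threadIdx.x]);
}

__global__ void MaxWinner(unsigned long long* out) {
  // Largest value wins regardless of write order: always 7 here.
  // Note: the 64-bit atomicMax() overload requires __CUDA_ARCH__ >= 350.
  atomicMax(out, kTies[threadIdx.x]);
}

// Build (hypothetical): nvcc -arch=sm_35 tie_demo.cu
int main() {
  unsigned long long *d_out, h_out;
  cudaMalloc(&d_out, sizeof(*d_out));

  cudaMemset(d_out, 0, sizeof(*d_out));
  ExchWinner<<<1, 3>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("atomicExch winner: %llu (2, 5, or 7; run-dependent)\n", h_out);

  cudaMemset(d_out, 0, sizeof(*d_out));
  MaxWinner<<<1, 3>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("atomicMax winner:  %llu (always 7)\n", h_out);

  cudaFree(d_out);
  return 0;
}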
20 changes: 15 additions & 5 deletions tensorflow/core/util/cuda_kernel_helper.h
@@ -91,17 +91,27 @@ USE_CUDA_ATOMIC(Add, int32);
USE_CUDA_ATOMIC(Add, uint32);
USE_CUDA_ATOMIC(Add, uint64);
USE_CUDA_ATOMIC(Add, float);

// For atomicMax.
USE_CUDA_ATOMIC(Max, int32);
USE_CUDA_ATOMIC(Max, uint32);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC(Max, uint64);
#else
// The uint64 overload of atomicMax() is only available for __CUDA_ARCH__ >=
// 350. If not satisfied, we provide a custom implementation using atomicCAS().
CUDA_ATOMIC_WRAPPER(Max, uint64) {
  uint64* address_as_ull = (uint64*)address;
  uint64 old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, max(val, assumed));
  } while (assumed != old);

  return old;
}
#endif

// For atomicExch.
USE_CUDA_ATOMIC(Exch, int32);
USE_CUDA_ATOMIC(Exch, uint32);
USE_CUDA_ATOMIC(Exch, uint64);
USE_CUDA_ATOMIC(Exch, float);

// Custom implementation of atomicAdd for double.
// This implementation is copied from CUDA manual.
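An aside on the comment just above: the double-precision atomicAdd itself is elided from this hunk. For reference, the version in the CUDA C Programming Guide uses the same atomicCAS() read-modify-retry loop as the new Max wrapper, only with the 64-bit integer bits reinterpreted as a double for the addition. Shown here under a hypothetical name, AtomicAddDouble, since the file's exact lines are not visible:

// As given (modulo naming) in the CUDA C Programming Guide; shown for
// reference only, not copied from cuda_kernel_helper.h.
__device__ double AtomicAddDouble(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    // Reinterpret the bits so the addition happens in double precision
    // while the CAS operates on the 64-bit integer view.
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);

  return __longlong_as_double(old);
}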
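To exercise the CAS-based pattern outside TensorFlow, here is a minimal standalone harness; a sketch, not part of the repository. It uses plain unsigned long long in place of TensorFlow's uint64, and restates the wrapper's loop as a free function named AtomicMaxU64 (a made-up name):

#include <cstdio>

// Restatement of the commit's CAS loop, assuming no hardware 64-bit
// atomicMax() is available on the target architecture.
__device__ unsigned long long AtomicMaxU64(unsigned long long* address,
                                           unsigned long long val) {
  unsigned long long old = *address, assumed;
  do {
    assumed = old;
    // If another thread changed *address after we read `assumed`, the CAS
    // fails and returns the current value; we retry against that value.
    old = atomicCAS(address, assumed, max(val, assumed));
  } while (assumed != old);
  return old;
}

__global__ void MaxKernel(unsigned long long* out) {
  unsigned long long idx =
      static_cast<unsigned long long>(blockIdx.x * blockDim.x + threadIdx.x);
  AtomicMaxU64(out, idx);
}

// Build (hypothetical): nvcc -arch=sm_30 atomic_max_demo.cu
// Only 64-bit atomicCAS() is needed, so this runs below sm_35.
int main() {
  unsigned long long* d_out;
  cudaMalloc(&d_out, sizeof(*d_out));
  cudaMemset(d_out, 0, sizeof(*d_out));

  MaxKernel<<<4, 256>>>(d_out);

  unsigned long long h_out = 0;
  cudaMemcpy(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("max = %llu\n", h_out);  // Expect 1023, the largest thread index.
  cudaFree(d_out);
  return 0;
}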
