[cuDNN][cuDNN V8 API] Always build assuming cuDNN >= 8.0 (pytorch#91527)
We've been building with V8 (incl. V8 API) by default for a while now; this PR cleans up some guards for cuDNN < 8.0.

CC @ptrblck @ngimel
Pull Request resolved: pytorch#91527
Approved by: https://github.com/ngimel
eqy authored and pytorchmergebot committed Jan 13, 2023
1 parent 4d26903 commit 4d07ad7
Showing 23 changed files with 30 additions and 154 deletions.
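The cleanup pattern is the same throughout the diff below: wherever a compile-time check chose between a pre-8.0 fallback and the 8.0+ path, the guard is dropped and the 8.0+ branch is kept unconditionally. A minimal before/after sketch of the pattern (illustrative only, modeled on the AlgoIterator hunk in Conv_v7.cpp below, not a literal excerpt):

#include <cudnn.h>

// Before this PR: the FMA math type had to be guarded, because
// CUDNN_FMA_MATH only exists in cuDNN >= 8.0.
cudnnMathType_t pick_fp32_math_old(bool allow_tf32) {
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000
  return allow_tf32 ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH;
#else
  return CUDNN_DEFAULT_MATH;
#endif
}

// After this PR: cuDNN >= 8.0 is assumed, so the guard disappears.
cudnnMathType_t pick_fp32_math_new(bool allow_tf32) {
  return allow_tf32 ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH;
}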
1 change: 1 addition & 0 deletions BUILD.bazel
@@ -407,6 +407,7 @@ cc_library(
         "@cuda//:cusolver",
         "@cuda//:nvrtc",
         "@cudnn",
+        "@cudnn_frontend",
     ],
     alwayslink = True,
 )
3 changes: 0 additions & 3 deletions CMakeLists.txt
@@ -195,9 +195,6 @@ cmake_dependent_option(
 cmake_dependent_option(
     BUILD_NVFUSER_BENCHMARK "Build C++ binaries for nvfuser benchmarks" OFF
     "USE_CUDA" OFF)
-cmake_dependent_option(
-    USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" ON
-    "USE_CUDNN" OFF)
 option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
 option(USE_KINETO "Use Kineto profiling library" ON)
 option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
6 changes: 6 additions & 0 deletions WORKSPACE
@@ -203,6 +203,12 @@ new_local_repository(
     path = "/usr/",
 )
 
+new_local_repository(
+    name = "cudnn_frontend",
+    build_file = "@//third_party:cudnn_frontend.BUILD",
+    path = "third_party/cudnn_frontend/",
+)
+
 local_repository(
     name = "com_github_google_flatbuffers",
     path = "third_party/flatbuffers",
2 changes: 0 additions & 2 deletions aten/src/ATen/cudnn/Descriptors.h
@@ -305,15 +305,13 @@ struct TORCH_CUDA_CPP_API CTCLossDescriptor
   void set(cudnnDataType_t datatype) {
     AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype));
   }
-#if CUDNN_VERSION >= 7600
   void setEx(
       cudnnDataType_t datatype,
       cudnnLossNormalizationMode_t normMode,
       cudnnNanPropagation_t gradMode) {
     AT_CUDNN_CHECK(
         cudnnSetCTCLossDescriptorEx(mut_desc(), datatype, normMode, gradMode));
   }
-#endif
 };
 
 struct TORCH_CUDA_CPP_API ActivationDescriptor
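With the CUDNN_VERSION >= 7600 guard gone, setEx is compiled unconditionally. A minimal usage sketch, assuming a valid CUDA/cuDNN context (the enum values are standard cuDNN; the wrapper function below is ours for illustration):

#include <ATen/cudnn/Descriptors.h>

// Sketch: configure a CTC loss descriptor via the extended setter, which
// cuDNN has provided since 7.6 and which is now always available.
void configure_ctc_descriptor() {
  at::native::CTCLossDescriptor desc;
  desc.setEx(
      CUDNN_DATA_FLOAT,
      CUDNN_LOSS_NORMALIZATION_SOFTMAX,  // normalize inputs inside the loss
      CUDNN_NOT_PROPAGATE_NAN);          // do not propagate NaNs in gradients
}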
35 changes: 0 additions & 35 deletions aten/src/ATen/native/cudnn/BatchNorm.cpp
@@ -59,11 +59,7 @@ cudnnBatchNormMode_t getCudnnBatchNormMode(bool training, at::MemoryFormat memory_format)
     return CUDNN_BATCHNORM_PER_ACTIVATION;
   } else if (training && memory_format == at::MemoryFormat::ChannelsLast) {
 
-#if CUDNN_VERSION >= 7400
     return CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
-#else
-    return CUDNN_BATCHNORM_SPATIAL;
-#endif // CUDNN_VERSION >= 7400
 
   } else if (training && memory_format == at::MemoryFormat::ChannelsLast3d) {
 
@@ -152,7 +148,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
   save_mean = at::empty({ num_features }, weight_t.options());
   save_var = at::empty({ num_features }, weight_t.options());
 
-#if CUDNN_VERSION >= 7400
   auto op = CUDNN_BATCHNORM_OPS_BN;
   size_t workspace_size;
   AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
@@ -204,22 +199,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
       workspace_size,
       reserve.data_ptr(),
       reserve_size));
-#else
-    reserve = at::empty({0}, input->options().dtype(kByte));
-    AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
-      handle, mode, &one, &zero,
-      idesc.desc(), input->data_ptr(),
-      idesc.desc(), output->data_ptr(),
-      wdesc.desc(),
-      weight->data_ptr(),
-      bias->data_ptr(),
-      exponential_average_factor,
-      at::maybe_data_ptr(running_mean),
-      at::maybe_data_ptr(running_var),
-      epsilon,
-      save_mean.data_ptr(),
-      save_var.data_ptr()));
-#endif // CUDNN_VERSION >= 7400
   } else {
     reserve = at::empty({0}, input->options().dtype(kByte));
     // This keeps a consistent output with native_batch_norm
@@ -317,7 +296,6 @@ std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
   Constant one(dataType, 1);
   Constant zero(dataType, 0);
 
-#if CUDNN_VERSION >= 7400
   auto op = CUDNN_BATCHNORM_OPS_BN;
 
   size_t workspace_size;
@@ -354,19 +332,6 @@ std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
       workspace_size,
       reserve->data_ptr(),
       reserve->numel()));
-#else
-    AT_CUDNN_CHECK(cudnnBatchNormalizationBackward(
-        handle, mode, &one, &zero, &one, &zero,
-        idesc.desc(), input->data_ptr(),
-        odesc.desc(), grad_output->data_ptr(),
-        idesc.desc(), grad_input_t.data_ptr(),
-        wdesc.desc(), weight->data_ptr(),
-        grad_weight_t.data_ptr(),
-        grad_bias_t.data_ptr(),
-        epsilon,
-        save_mean->data_ptr(),
-        save_var->data_ptr()));
-#endif // CUDNN_VERSION >= 7400
 
   return std::tuple<Tensor,Tensor,Tensor>{grad_input_t, grad_weight_t, grad_bias_t};
 }
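The surviving code path is the *Ex batch-norm API, which requires sizing a workspace and a reserve buffer before the training call. A simplified sketch of those two queries, assuming a live handle and descriptors shaped like the idesc/wdesc wrappers above:

#include <cudnn.h>
#include <ATen/cuda/Exceptions.h>  // for AT_CUDNN_CHECK

// Sketch: the sizing calls the Ex-style batch norm needs up front.
void query_batch_norm_sizes(cudnnHandle_t handle,
                            cudnnBatchNormMode_t mode,
                            cudnnTensorDescriptor_t x_desc,
                            cudnnTensorDescriptor_t scale_bias_desc,
                            size_t* workspace_size,
                            size_t* reserve_size) {
  auto op = CUDNN_BATCHNORM_OPS_BN;  // plain BN, no fused add/activation
  AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
      handle, mode, op,
      x_desc,
      /*zDesc=*/nullptr,           // only used when an add is fused in
      /*yDesc=*/x_desc,            // output shares the input's shape
      scale_bias_desc,
      /*activationDesc=*/nullptr,  // only used when activation is fused in
      workspace_size));
  AT_CUDNN_CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
      handle, mode, op, /*activationDesc=*/nullptr, x_desc, reserve_size));
}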
2 changes: 0 additions & 2 deletions aten/src/ATen/native/cudnn/ConvShared.h
@@ -111,7 +111,6 @@ void raw_cudnn_convolution_add_relu_fallback_out(
 #if AT_CUDNN_ENABLED()
 #include <ATen/native/cudnn/Macros.h>
 
-#if HAS_CUDNN_V8()
 // v7 functions are preserved here to allow for runtime switching to v7
 // (e.g., TORCH_CUDNN_V8_API_DISABLED=1).
 // Note that v7 forward/backward out can have different behavior from the v8
@@ -149,5 +148,4 @@ void raw_cudnn_convolution_add_relu_out_v7(
     bool deterministic,
     bool allow_tf32);
 #endif
-#endif
 }}
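The comment above describes a runtime escape hatch: the v7 entry points stay compiled so that setting TORCH_CUDNN_V8_API_DISABLED=1 can route convolutions away from the v8 frontend. A hypothetical sketch of such a dispatch; the env var comes from the comment above, but the helper, its truthiness rules, and the v8 callee are illustrative guesses, not PyTorch's actual implementation:

#include <cstdlib>

// Hypothetical: treat the v8 API as enabled unless the env var is set to a
// non-empty, non-zero value.
static bool cudnn_v8_api_enabled() {
  const char* disabled = std::getenv("TORCH_CUDNN_V8_API_DISABLED");
  return disabled == nullptr || disabled[0] == '\0' || disabled[0] == '0';
}

void convolution_forward(/* tensors and conv params elided */) {
  if (cudnn_v8_api_enabled()) {
    // v8 path: build and execute a cudnn_frontend operation graph.
  } else {
    // legacy path: call raw_cudnn_convolution_forward_out_v7(...) as declared above.
  }
}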
47 changes: 0 additions & 47 deletions aten/src/ATen/native/cudnn/Conv_v7.cpp
@@ -3,7 +3,6 @@
 
 #if AT_CUDNN_ENABLED()
 
-#include <ATen/native/cudnn/Macros.h>
 #include <ATen/core/Tensor.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -60,10 +59,6 @@
 // with the best algo, under the hood, cudnn will run with the slower kernel
 // since it sees fastest algorithm combination with a sub optimal mathType.
 
-// Note [blocklist fft algorithms for strided dgrad]
-// This is a workaround for a CuDNN bug that gave wrong results in certain strided convolution
-// gradient setups. Check Issue #16610 for bug details. Bug is there for CUDNN version < 7.5 .
-
 constexpr size_t operator "" _TiB(unsigned long long n) {
   return size_t(n) * 1024 * 1024 * 1024 * 1024;
 }
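A sketch of the coupling the comment in this hunk warns about: an algorithm found by benchmarking is only reproduced if its math type is re-applied to the convolution descriptor as well, since cuDNN picks the kernel from the (algo, mathType) pair (simplified; assumes a live descriptor):

#include <cudnn.h>

// Sketch: perf results carry the math type the algorithm was timed with;
// re-applying only the algo can silently select a slower kernel.
void apply_benchmarked_choice(cudnnConvolutionDescriptor_t conv_desc,
                              const cudnnConvolutionFwdAlgoPerf_t& best) {
  cudnnSetConvolutionMathType(conv_desc, best.mathType);
  // ... then launch cudnnConvolutionForward(..., best.algo, ...) as usual.
}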
@@ -225,15 +220,6 @@ size_t getMaxWorkspaceSize(
 template<typename perf_t>
 std::vector<perf_t> getValidAlgorithms(perf_t *perfResults, const ConvolutionArgs& args, int n_algo) {
 
-  // See Note [blocklist fft algorithms for strided dgrad]
-#if CUDNN_VERSION < 7500
-  bool blocklist = std::is_same<decltype(perfResults[0].algo), cudnnConvolutionBwdDataAlgo_t>::value;
-  int stride_dim = args.input.dim() - 2;
-  blocklist &= std::any_of(std::begin(args.params.stride),
-                           std::begin(args.params.stride) + stride_dim,
-                           [=](int n){return n != 1;});
-#endif
-
   std::vector<perf_t> result;
   result.reserve(n_algo);
@@ -244,16 +230,6 @@ std::vector<perf_t> getValidAlgorithms(perf_t *perfResults, const ConvolutionArgs& args, int n_algo)
     if (perf.status == CUDNN_STATUS_SUCCESS) {
       if (!args.params.deterministic || perf.determinism == CUDNN_DETERMINISTIC) {
 
-        // See Note [blocklist fft algorithms for strided dgrad]
-#if CUDNN_VERSION < 7500
-        bool skip = blocklist;
-        skip &= (static_cast<cudnnConvolutionBwdDataAlgo_t>(perfResults[i].algo) == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
-                 static_cast<cudnnConvolutionBwdDataAlgo_t>(perfResults[i].algo) == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT);
-        if (skip) {
-          continue;
-        }
-#endif
-
         result.push_back(perf);
       }
     }
@@ -493,11 +469,9 @@ class AlgoIterator {
       perfResults[0].mathType = CUDNN_TENSOR_OP_MATH;
     } else {
       perfResults[0].mathType = CUDNN_DEFAULT_MATH;
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000
       if (args.params.dataType == CUDNN_DATA_FLOAT && !args.params.allow_tf32) {
         perfResults[0].mathType = CUDNN_FMA_MATH;
       }
-#endif
     }
     search::getWorkspaceSize(args, perfResults[0].algo, &(perfResults[0].memory));
     return perfResults;
@@ -610,14 +584,10 @@ static inline void split_batch_dim_to_32bit_out(
 }
 
 
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000
 #define ASSERT_CORRECT_PRECISION(math_type) \
   if (args.params.dataType == CUDNN_DATA_FLOAT) { \
     TORCH_INTERNAL_ASSERT(args.params.allow_tf32 || math_type == CUDNN_FMA_MATH); \
   }
-#else
-#define ASSERT_CORRECT_PRECISION(math_type)
-#endif // CUDNN_VERSION >= 8000
 
 
 // ---------------------------------------------------------------------
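With the version guard removed, ASSERT_CORRECT_PRECISION always enforces the TF32 invariant instead of compiling to a no-op on old cuDNN. A standalone sketch of what the macro checks, with a hypothetical stand-in for the real ConvolutionParams:

#include <cudnn.h>
#include <cassert>

// Hypothetical stand-in for the dataType/allow_tf32 fields of ConvolutionParams.
struct PrecisionParams {
  cudnnDataType_t dataType;
  bool allow_tf32;
};

// Sketch of the invariant: FP32 convolutions with TF32 disallowed must run
// with FMA math, never TF32 tensor-core math.
void assert_correct_precision(const PrecisionParams& params,
                              cudnnMathType_t math_type) {
  if (params.dataType == CUDNN_DATA_FLOAT) {
    assert(params.allow_tf32 || math_type == CUDNN_FMA_MATH);
  }
}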
@@ -672,11 +642,7 @@ void raw_cudnn_convolution_forward_out_32bit(
 }
 
 
-#if !HAS_CUDNN_V8()
-void raw_cudnn_convolution_forward_out(
-#else
 void raw_cudnn_convolution_forward_out_v7(
-#endif
     const Tensor& output, const Tensor& input, const Tensor& weight,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     bool benchmark, bool deterministic, bool allow_tf32) {
@@ -734,11 +700,7 @@ void raw_cudnn_convolution_backward_input_out_32bit(
   );
 }
 
-#if !HAS_CUDNN_V8()
-void raw_cudnn_convolution_backward_input_out(
-#else
 void raw_cudnn_convolution_backward_input_out_v7(
-#endif
     const at::Tensor& grad_input,
     const at::Tensor& grad_output,
     const at::Tensor& weight,
@@ -797,11 +759,7 @@ void raw_cudnn_convolution_backward_weight_out_32bit(
   );
 }
 
-#if !HAS_CUDNN_V8()
-void raw_cudnn_convolution_backward_weight_out(
-#else
 void raw_cudnn_convolution_backward_weight_out_v7(
-#endif
     const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     bool benchmark, bool deterministic, bool allow_tf32) {
@@ -853,12 +811,7 @@ void raw_cudnn_convolution_backward_weight_out_v7(
   TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN.");
 }
 
-#if !HAS_CUDNN_V8()
-void raw_cudnn_convolution_add_relu_out(
-#else
 void raw_cudnn_convolution_add_relu_out_v7(
-#endif
-
     const Tensor& output,
     const Tensor& input,
     const Tensor& weight,
5 changes: 0 additions & 5 deletions aten/src/ATen/native/cudnn/Conv_v8.cpp
@@ -4,10 +4,6 @@
 
 #if AT_CUDNN_ENABLED()
 
-#include <ATen/native/cudnn/Macros.h>
-
-#if HAS_CUDNN_V8()
-
 #include <ATen/cudnn/cudnn-wrapper.h>
 
 #include <c10/macros/Macros.h>
@@ -787,5 +783,4 @@ void raw_cudnn_convolution_add_relu_out(
 
 }} // at::native
 
-#endif // HAS_CUDNN_V8
 #endif // AT_CUDNN_ENABLED
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cudnn/LossCTC.cpp
@@ -18,7 +18,7 @@
 #include <ATen/ops/empty_like.h>
 #endif
 
-#if (!AT_CUDNN_ENABLED()) || (CUDNN_VERSION < 7600)
+#if (!AT_CUDNN_ENABLED())
 
 namespace at { namespace native {
 
3 changes: 0 additions & 3 deletions aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp
@@ -2,8 +2,6 @@
 #include <ATen/cuda/CUDAConfig.h> // for the definition of AT_CUDNN_ENABLED
 
 #if AT_CUDNN_ENABLED()
-#include <ATen/native/cudnn/Macros.h>
-#if HAS_CUDNN_V8()
 
 #include <ATen/core/TensorBase.h>
 #include <ATen/core/TensorBody.h>
@@ -259,6 +257,5 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
 } // namespace native
 } // namespace at
 
-#endif // HAS_CUDNN_V8
 #endif // AT_CUDNN_ENABLED
 #endif // USE_CUDA
4 changes: 0 additions & 4 deletions aten/src/ATen/native/quantized/cudnn/Conv.cpp
@@ -3,11 +3,8 @@
 
 #if AT_CUDNN_ENABLED()
 
-#include <ATen/native/cudnn/Macros.h>
 #include <c10/util/ArrayRef.h>
 
-#if HAS_CUDNN_V8()
-
 #include <ATen/ATen.h>
 #include <ATen/cuda/Exceptions.h>
 #include <ATen/cudnn/Handle.h>
@@ -432,6 +429,5 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
 } // namespace at
 
 
-#endif // HAS_CUDNN_V8
 #endif // AT_CUDNN_ENABLED
 #endif // USE_CUDA
5 changes: 0 additions & 5 deletions aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp
@@ -3,10 +3,6 @@
 
 #if AT_CUDNN_ENABLED()
 
-#include <ATen/native/cudnn/Macros.h>
-
-#if HAS_CUDNN_V8()
-
 #include <ATen/ATen.h>
 #include <torch/library.h>
 #include <ATen/native/quantized/cpu/QuantUtils.h>
@@ -212,6 +208,5 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
 } // namespace native
 } // namespace at
 
-#endif // HAS_CUDNN_V8
 #endif // AT_CUDNN_ENABLED
 #endif // USE_CUDA
5 changes: 0 additions & 5 deletions aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp
@@ -3,10 +3,6 @@
 
 #if AT_CUDNN_ENABLED()
 
-#include <ATen/native/cudnn/Macros.h>
-
-#if HAS_CUDNN_V8()
-
 #include <ATen/ATen.h>
 #include <ATen/native/quantized/cudnn/utils.h>
 #include <ATen/native/quantized/PackedParams.h>
@@ -23,6 +19,5 @@ std::tuple<at::Tensor, c10::optional<at::Tensor>> PackedConvWeightCudnn<
 template std::tuple<at::Tensor, c10::optional<at::Tensor>> PackedConvWeightCudnn<
     2>::unpack();
 
-#endif // HAS_CUDNN_V8
 #endif // AT_CUDNN_ENABLED
 #endif // USE_CUDA
4 changes: 0 additions & 4 deletions aten/src/ATen/native/quantized/cudnn/Linear.cpp
@@ -3,11 +3,8 @@
 
 #if AT_CUDNN_ENABLED()
 
-#include <ATen/native/cudnn/Macros.h>
 #include <c10/util/ArrayRef.h>
 
-#if HAS_CUDNN_V8()
-
 #include <ATen/ATen.h>
 #include <ATen/cuda/Exceptions.h>
 #include <ATen/cudnn/Handle.h>
@@ -367,6 +364,5 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
 } // namespace at
 
 
-#endif // HAS_CUDNN_V8
 #endif // AT_CUDNN_ENABLED
 #endif // USE_CUDA
5 changes: 0 additions & 5 deletions aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp
@@ -3,10 +3,6 @@
 
 #if AT_CUDNN_ENABLED()
 
-#include <ATen/native/cudnn/Macros.h>
-
-#if HAS_CUDNN_V8()
-
 #include <ATen/ATen.h>
 #include <torch/library.h>
 #include <ATen/native/quantized/cudnn/utils.h>
@@ -58,6 +54,5 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
 } // namespace native
 } // namespace at
 
-#endif // HAS_CUDNN_V8
 #endif // AT_CUDNN_ENABLED
 #endif // USE_CUDA