Switch cuda 12.1 docker images to gcc9 (pytorch#102380)
Update the CUDA 12.1 CI docker images to gcc-9, which should tentatively fix the internal compiler error in [libtorch-linux-bionic-cuda12.1-py3.7-gcc7 / build](https://github.com/pytorch/pytorch/actions/runs/5071681366/jobs/9135310361).

Co-authored-by: Nikita Shulga <[email protected]>

Fixes: pytorch#102372
Pull Request resolved: pytorch#102380
Approved by: https://github.com/malfet, https://github.com/huydhn
atalman authored and pytorchmergebot committed May 30, 2023
1 parent 9ff1932 commit 6ac8a11
Showing 7 changed files with 60 additions and 41 deletions.
18 changes: 16 additions & 2 deletions .ci/docker/build.sh
@@ -88,11 +88,25 @@ _UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7)
+  pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9)
     CUDA_VERSION=12.1.0
     CUDNN_VERSION=8
     ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=7
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc9)
+    CUDA_VERSION=11.8.0
+    CUDNN_VERSION=8
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
     PROTOBUF=yes
     DB=yes
     VISION=yes
3 changes: 2 additions & 1 deletion .github/workflows/docker-builds.yml
@@ -33,7 +33,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
+          - docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
+          - docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc9
           - docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
           - docker-image-name: pytorch-linux-bionic-py3.8-clang9
           - docker-image-name: pytorch-linux-bionic-py3.11-clang9
20 changes: 10 additions & 10 deletions .github/workflows/inductor-periodic.yml
@@ -15,12 +15,12 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build:
-    name: cuda12.1-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
+  linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build:
+    name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
+      docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
       cuda-arch-list: '8.6'
       test-matrix: |
         { include: [
@@ -38,11 +38,11 @@ jobs:
           { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
         ]}
-  linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-test:
-    name: cuda12.1-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
+  linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-test:
+    name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build
+    needs: linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build
     with:
-      build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
40 changes: 20 additions & 20 deletions .github/workflows/slow.yml
@@ -17,12 +17,12 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build:
-    name: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
+  linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build:
+    name: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
-      docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
+      docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
@@ -31,37 +31,37 @@ jobs:
           { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
         ]}
-  linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-test:
-    name: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
+  linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-test:
+    name: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build
+    needs: linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build
     with:
-      build-environment: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
-      docker-image: ${{ needs.linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
+      docker-image: ${{ needs.linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build.outputs.test-matrix }}
       timeout-minutes: 300

-  linux-bionic-cuda12_1-py3_10-gcc7-sm86-build:
-    name: linux-bionic-cuda12.1-py3.10-gcc7-sm86
+  linux-bionic-cuda12_1-py3_10-gcc9-sm86-build:
+    name: linux-bionic-cuda12.1-py3.10-gcc9-sm86
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
+      docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
           { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
         ]}
-  linux-bionic-cuda12_1-py3_10-gcc7-sm86-test:
-    name: linux-bionic-cuda12.1-py3.10-gcc7-sm86
+  linux-bionic-cuda12_1-py3_10-gcc9-sm86-test:
+    name: linux-bionic-cuda12.1-py3.10-gcc9-sm86
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda12_1-py3_10-gcc7-sm86-build
+    needs: linux-bionic-cuda12_1-py3_10-gcc9-sm86-build
     with:
-      build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-sm86-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-sm86-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-sm86-build.outputs.test-matrix }}

   linux-bionic-py3_8-clang9-build:
     name: linux-bionic-py3.8-clang9
8 changes: 4 additions & 4 deletions .github/workflows/trunk.yml
@@ -51,12 +51,12 @@ jobs:
       docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-build.outputs.test-matrix }}

-  libtorch-linux-bionic-cuda11_8-py3_7-gcc7-build:
-    name: libtorch-linux-bionic-cuda11.8-py3.7-gcc7
+  libtorch-linux-bionic-cuda11_8-py3_7-gcc9-build:
+    name: libtorch-linux-bionic-cuda11.8-py3.7-gcc9
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: libtorch-linux-bionic-cuda11.8-py3.7-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+      build-environment: libtorch-linux-bionic-cuda11.8-py3.7-gcc9
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc9
       build-generates-artifacts: false
       runner: linux.4xlarge
       test-matrix: |
9 changes: 6 additions & 3 deletions torch/csrc/cuda/nccl.cpp
@@ -392,8 +392,9 @@ void check_inputs(

     check_tensor(
         input,
-        i == static_cast<decltype(i)>(root) ? at::optional<at::Tensor>{output}
-            : at::nullopt,
+        i == static_cast<std::remove_cv_t<decltype(i)>>(root)
+            ? at::optional<at::Tensor>{output}
+            : at::nullopt,
         input_multiplier,
         output_multiplier,
         numel,
@@ -620,7 +621,9 @@ void reduce(
     ncclComm_t comm = comms_ref[i];
     NCCL_CHECK(ncclReduce(
         inputs[i].data_ptr(),
-        static_cast<decltype(i)>(root) == i ? output.data_ptr() : nullptr,
+        static_cast<std::remove_cv_t<decltype(i)>>(root) == i
+            ? output.data_ptr()
+            : nullptr,
         count,
         data_type,
         to_nccl_red_op(op),
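Both nccl.cpp hunks apply the same pattern: where the signed `root` rank is compared against an unsigned, possibly `const` loop index `i`, the cast target is spelled `std::remove_cv_t<decltype(i)>` instead of `decltype(i)`. A plausible motivation (not stated in the commit) is that when `i` is declared `const`, `decltype(i)` is a const-qualified type, and some newer compiler versions flag casts to cv-qualified scalar types (e.g. under `-Wignored-qualifiers`), which `-Werror` turns into a build failure; stripping the qualifier keeps the explicit signed-to-unsigned conversion without the warning. A minimal standalone sketch of the idiom, with hypothetical names rather than the actual PyTorch call sites:

    #include <cstddef>
    #include <type_traits>
    #include <vector>

    // Hypothetical stand-in for the call sites above: only the root rank gets a
    // real output pointer; every other rank passes nullptr.
    // Assumes i < outputs.size().
    int* output_ptr_for_rank(std::vector<int>& outputs, int root, const std::size_t i) {
      // decltype(i) is `const std::size_t`, so static_cast<decltype(i)>(root)
      // names a const-qualified cast target, which stricter warning settings
      // may reject. std::remove_cv_t strips the qualifier while keeping the
      // explicit signed-to-unsigned conversion that avoids -Wsign-compare.
      return static_cast<std::remove_cv_t<decltype(i)>>(root) == i
          ? &outputs[i]
          : nullptr;
    }

In the diff above, the same expression selects between `at::optional<at::Tensor>{output}` / `at::nullopt` and between `output.data_ptr()` / `nullptr`, but the cast mechanics are identical.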
3 changes: 2 additions & 1 deletion torch/csrc/lazy/core/shape_inference.cpp
@@ -514,7 +514,8 @@ std::vector<Shape> compute_shape_cat(at::TensorList tensors, int64_t dim) {
   }
   TORCH_CHECK(!out_shape.empty(), "Scalar tensors are not supported in cat.");
   TORCH_CHECK(
-      extended_dim_shape <= std::numeric_limits<int64_t>::max(),
+      extended_dim_shape <=
+          static_cast<size_t>(std::numeric_limits<int64_t>::max()),
       "Size overflow");
   out_shape[dim] = extended_dim_shape;
   return {Shape(tensors[0].scalar_type(), out_shape)};
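The shape_inference.cpp change addresses the companion signedness problem: `extended_dim_shape` is an unsigned `size_t`, while `std::numeric_limits<int64_t>::max()` is signed, so comparing them directly mixes signedness and can trip `-Wsign-compare` under `-Werror`. Casting the limit to `size_t` makes both operands unsigned, and on the 64-bit targets these images build for, the cast is lossless. A minimal, hypothetical version of such an overflow guard (names and error handling are illustrative, not taken from the PyTorch sources):

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <stdexcept>

    // Hypothetical guard: check that an accumulated unsigned dimension size can
    // be narrowed into a signed int64_t shape entry.
    std::int64_t checked_dim_size(std::size_t extended_dim_shape) {
      // Cast the signed limit to size_t so both sides of the comparison are
      // unsigned; otherwise -Wsign-compare fires and -Werror rejects the build.
      if (extended_dim_shape >
          static_cast<std::size_t>(std::numeric_limits<std::int64_t>::max())) {
        throw std::overflow_error("Size overflow");
      }
      return static_cast<std::int64_t>(extended_dim_shape);
    }

In the real code, `TORCH_CHECK` plays the role of the explicit throw used here.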
