forked from Lightning-AI/pytorch-lightning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu-tests-pytorch.yml
204 lines (185 loc) · 7.45 KB
/
gpu-tests-pytorch.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
# CI trigger: run for every tag and for pushes to the listed branch patterns.
trigger:
  tags:
    include:
      - "*"
  branches:
    include:
      - "master"
      - "release/*"
      - "refs/tags/*"
# PR trigger: only pull requests that target the listed branches AND touch one
# of the included paths (minus the excluded ones) start this pipeline.
pr:
  branches:
    include:
      - "master"
      - "release/*"
  paths:
    include:
      # CI helpers and this pipeline definition itself
      - ".actions/**"
      - ".azure/gpu-tests-pytorch.yml"
      # examples exercised by the 'Testing: PyTorch examples' step
      - "examples/run_pl_examples.sh"
      - "examples/pytorch/basics/backbone_image_classifier.py"
      - "examples/pytorch/basics/autoencoder.py"
      # PyTorch package: requirements, sources and tests
      - "requirements/pytorch/**"
      - "src/lightning/__about__.py"
      - "src/lightning/__init__.py"
      - "src/lightning/__main__.py"
      - "src/lightning/__setup__.py"
      - "src/lightning/__version__.py"
      - "src/lightning/pytorch/**"
      - "src/pytorch_lightning/*"
      - "tests/tests_pytorch/**"
      - "pyproject.toml"  # includes pytest config
      # Fabric package: requirements and sources
      - "requirements/fabric/**"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
    exclude:
      # docs-only requirements and Markdown files
      - "requirements/*/docs.txt"
      - "*.md"
      - "**/*.md"
# One GPU job, fanned out over the image / package matrix below. Each matrix
# entry sets `image`, `scope` and `PACKAGE_NAME`, which the steps read back via
# `$(image)`, `variables['scope']` and `variables['PACKAGE_NAME']`.
jobs:
  - job: testing
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "80"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    strategy:
      matrix:
        # this uses torch 1.12 as not all strategies support 1.13 yet
        'PyTorch & strategies':
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
          scope: "strategies"
          PACKAGE_NAME: "pytorch"
        'PyTorch | latest':
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.0-cuda11.7.1"
          scope: ""
          PACKAGE_NAME: "pytorch"
        'Lightning | latest':
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.0-cuda11.7.1"
          scope: ""
          PACKAGE_NAME: "lightning"
    pool: lit-rtx-3090
    variables:
      # Shell command substitution, evaluated when a bash step expands $(DEVICES):
      # it prints the part of the agent name after the last "_".
      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
      FREEZE_REQUIREMENTS: "1"
      PIP_CACHE_DIR: "/var/tmp/pip"
    container:
      image: $(image)
      # default shm size is 64m. Increase it to avoid:
      # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
      options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp"
    workspace:
      clean: all
    steps:
      # Publish CUDA_VISIBLE_DEVICES, CUDA_VERSION_MM, TORCH_URL and COVERAGE_SOURCE
      # for the following steps through `task.setvariable` logging commands.
      - bash: |
          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
          cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
          echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
          # map PACKAGE_NAME "pytorch" -> "pytorch_lightning"; any other name is used as-is
          coverage_source=$( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))' )
          echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$coverage_source"
        displayName: 'set env. vars'

      # Diagnostics only: dump the variables set above plus GPU / Python tooling info.
      - bash: |
          echo $(DEVICES)
          echo $CUDA_VISIBLE_DEVICES
          echo $CUDA_VERSION_MM
          echo $TORCH_URL
          echo $COVERAGE_SOURCE
          lspci | grep -E 'VGA|3D'
          whereis nvidia
          nvidia-smi
          which python && which pip
          python --version
          pip --version
          pip list
        displayName: 'Image info & NVIDIA'

      # Run adjust-versions.py on every requirements file, passing the torch
      # version that is already installed in the image.
      - bash: |
          PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
          for fpath in requirements/**/*.txt; do
            python ./requirements/pytorch/adjust-versions.py "$fpath" "${PYTORCH_VERSION}"
          done
        displayName: 'Adjust dependencies'

      # The extras spec is quoted so the shell cannot treat `[...]` as a glob pattern.
      - bash: pip install -e ".[extra,test,examples]" -U --find-links "${TORCH_URL}"
        displayName: 'Install package & dependencies'

      # Matrix entries whose scope is not "strategies" run without the strategy packages.
      - bash: |
          pip uninstall -y -r requirements/pytorch/strategies.txt \
            -r requirements/_integrations/strategies.txt
        condition: ne(variables['scope'], 'strategies')
        displayName: 'Uninstall strategies'

      # Only the "strategies" matrix entry installs the strategy packages and checks them.
      - bash: |
          set -e
          pip install -r requirements/pytorch/strategies.txt \
            -r requirements/_integrations/strategies.txt \
            --find-links "${TORCH_URL}"
          python requirements/pytorch/check-avail-strategies.py
        condition: eq(variables['scope'], 'strategies')
        displayName: 'Install strategies'

      # Fails the job early unless torch sees exactly 2 GPUs.
      - bash: |
          set -e
          pip list
          python requirements/collect_env_details.py
          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
          python requirements/pytorch/check-avail-extras.py
        displayName: 'Env details'

      - bash: bash .actions/pull_legacy_checkpoints.sh
        displayName: 'Get legacy checkpoints'

      # Runs pytest against the `pytorch_lightning` directory under src/ (standalone package only).
      - bash: python -m pytest pytorch_lightning
        workingDirectory: src
        condition: eq(variables['PACKAGE_NAME'], 'pytorch')
        displayName: 'Testing: PyTorch doctests'

      # Standalone package only: run the assistant's `copy_replace_imports` over the tests
      # and basic examples with lightning.fabric / lightning.pytorch as source imports and
      # lightning_fabric / pytorch_lightning as target imports.
      - bash: |
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \
            --source_import="lightning.fabric,lightning.pytorch" \
            --target_import="lightning_fabric,pytorch_lightning"
          python .actions/assistant.py copy_replace_imports --source_dir="./examples/pytorch/basics" \
            --source_import="lightning.fabric,lightning.pytorch" \
            --target_import="lightning_fabric,pytorch_lightning"
        condition: eq(variables['PACKAGE_NAME'], 'pytorch')
        displayName: 'Adjust tests & examples'

      # Main test run under coverage; the benchmarks directory is ignored here and
      # run by the last step instead.
      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest --ignore benchmarks -v --durations=50
        workingDirectory: tests/tests_pytorch
        env:
          PL_RUN_CUDA_TESTS: "1"
        displayName: 'Testing: PyTorch standard'
        timeoutInMinutes: "35"

      - bash: bash run_standalone_tests.sh
        workingDirectory: tests/tests_pytorch
        env:
          PL_USE_MOCKED_MNIST: "1"
          PL_RUN_CUDA_TESTS: "1"
          PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
        displayName: 'Testing: PyTorch standalone tests'
        timeoutInMinutes: "35"

      - bash: bash run_standalone_tasks.sh
        workingDirectory: tests/tests_pytorch
        env:
          PL_USE_MOCKED_MNIST: "1"
          PL_RUN_CUDA_TESTS: "1"
        displayName: 'Testing: PyTorch standalone tasks'
        timeoutInMinutes: "10"

      # Produce coverage reports (text, XML, HTML) and upload them with codecov.
      - bash: |
          python -m coverage report
          python -m coverage xml
          python -m coverage html
          python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
            --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
          ls -l
        workingDirectory: tests/tests_pytorch
        displayName: 'Statistics'

      # Examples on 1 GPU, then 2 GPUs with DDP, then 2 GPUs with DDP and precision 16.
      # Uses `bash:` like every other step in this job.
      - bash: |
          set -e
          bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1
          bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp
          bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp --trainer.precision=16
        workingDirectory: examples
        env:
          PL_USE_MOCKED_MNIST: "1"
        displayName: 'Testing: PyTorch examples'

      # Benchmarks stop after two failures (--maxfail=2).
      - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0
        workingDirectory: tests/tests_pytorch
        env:
          PL_RUN_CUDA_TESTS: "1"
        displayName: 'Testing: PyTorch benchmarks'