From db630799c9fc4dedfc37dd1c1efd9b8af2a1bbbf Mon Sep 17 00:00:00 2001
From: chenyaofo
Date: Thu, 16 Mar 2023 22:32:38 +0800
Subject: [PATCH] add code for torch compile and benchmarks.

---
 codebase/main.py            |  19 ++++-
 codebase/models/__init__.py |   6 ++
 conf/base.conf              |  15 +++-
 doc/benchmark.md            | 139 +++++++++++++++++++++++++++++++++++-
 4 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/codebase/main.py b/codebase/main.py
index fe30a9e..2131ff3 100644
--- a/codebase/main.py
+++ b/codebase/main.py
@@ -114,9 +114,22 @@ def prepare_for_training(conf: ConfigTree, output_dir: str, local_rank: int):
     model = model.to(device=get_device(), memory_format=getattr(torch, conf.get("memory_format")))
     criterion = criterion.to(device=get_device())
 
-    image_size = conf.get_int('data.image_size')
-    _logger.info(f"Model details: n_params={compute_nparam(model)/1e6:.2f}M, "
-                 f"flops={compute_flops(model,(1,3, image_size, image_size))/1e6:.2f}M.")
+    if conf.get_bool("use_compile"):
+        if hasattr(torch, "compile"):
+            _logger.info("Use torch.compile to optimize the model, please wait for a while.")
+            model = torch.compile(
+                model=model,
+                **conf.get("compile")
+            )
+        else:
+            _logger.info("PyTorch version is too old to support torch.compile, skipping it.")
+
+    if conf.get_bool("use_tf32"):
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+    # image_size = conf.get_int('data.image_size')
+    # _logger.info(f"Model details: n_params={compute_nparam(model)/1e6:.2f}M, "
+    #              f"flops={compute_flops(model,(1,3, image_size, image_size))/1e6:.2f}M.")
 
     writer = only_master(SummaryWriter(output_dir))
 
diff --git a/codebase/models/__init__.py b/codebase/models/__init__.py
index 05f574f..323fde2 100644
--- a/codebase/models/__init__.py
+++ b/codebase/models/__init__.py
@@ -2,6 +2,8 @@
 import torch.hub
 
 from torchvision.models import resnet18, resnet50
+from torchvision.models import mobilenet_v2, shufflenet_v2_x1_0
+from torchvision.models import vit_b_16, swin_t
 
 from .dummy_model import dummy_model
 from .register import MODEL
@@ -14,3 +16,7 @@ def PyTorchHub(repo: str, name: str, **kwargs):
 
 MODEL.register(resnet18)
 MODEL.register(resnet50)
+MODEL.register(mobilenet_v2)
+MODEL.register(shufflenet_v2_x1_0)
+MODEL.register(vit_b_16)
+MODEL.register(swin_t)
diff --git a/conf/base.conf b/conf/base.conf
index 29e35c6..ed38466 100644
--- a/conf/base.conf
+++ b/conf/base.conf
@@ -17,4 +17,17 @@ sync_batchnorm: false # if true, it will convert all the batchnorm layers into t
 accmulated_steps: 1
 
 set_reproducible: false # if true, the training will be set to reproducible (refer to https://pytorch.org/docs/stable/notes/randomness.html)
-                        # else torch.backends.cudnn.benchmark will be set to True for largest throughput
\ No newline at end of file
+                        # else torch.backends.cudnn.benchmark will be set to True for largest throughput
+
+use_tf32: false # if true, it will use TF32 on NVIDIA Ampere GPUs
+
+use_compile: false # if true, it will compile the model with torch.compile
+
+compile {
+    fullgraph: false
+    dynamic: false
+    backend: inductor
+    mode: null
+    options: null
+    disable: false
+}
\ No newline at end of file
diff --git a/doc/benchmark.md b/doc/benchmark.md
index 2cadd32..da38b63 100644
--- a/doc/benchmark.md
+++ b/doc/benchmark.md
@@ -1,4 +1,5 @@
 ## Throughput Benchmark
+
 We test this code on NVIDIA A100 and report the throughput in the followings.
 
 | settings | throughput (imgs/s) |
@@ -10,7 +11,7 @@ We test this code on NVIDIA A100 and report the throughput in the followings.
 
 > Check for NVIDIA impl and **Throughput Benchmark** at https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Classification/ConvNets/resnet50v1.5/README.md#training-performance-results
 
-## Test Environment
+Test environment:
 
 ```
 PyTorch version: 1.12.1+cu113
@@ -48,4 +49,140 @@ Versions of relevant libraries:
 [conda] torchaudio 0.12.1+cu113 pypi_0 pypi
 [conda] torchmetrics 0.9.3 pypi_0 pypi
 [conda] torchvision 0.13.1+cu113 pypi_0 pypi
-```
\ No newline at end of file
+```
+
+## More Benchmarks on PyTorch 2.0
+
+We test this code on NVIDIA V100 and report the throughput in the following tables.
+
+ - Benchmarks on ResNet-50
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 345 |
+| +channels_last | 345 |
+| +amp | 774 |
+| +channels_last&&amp | 1175 |
+| +channels_last&&amp&&compile(default) | 1228 |
+| +channels_last&&amp&&compile(default+fullgraph) | 1228 |
+| +channels_last&&amp&&compile(reduce-overhead) | 1234 |
+| +channels_last&&amp&&compile(max-autotune) | FAIL |
+
+ - Benchmarks on MobileNetV2
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 813 |
+| +channels_last | 420 |
+| +amp | 1315 |
+| +channels_last&&amp | 2100 |
+| +channels_last&&amp&&compile(default) | 2316 |
+
+ - Benchmarks on ShuffleNetV2
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 2342 |
+| +channels_last | 1854 |
+| +amp | 3250 |
+| +channels_last&&amp | 3862 |
+| +channels_last&&amp&&compile(default) | 4711 |
+
+ - Benchmarks on ViT-B16
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 102 |
+| +amp | 360 |
+| amp&&compile(default) | 289 |
+
+ - Benchmarks on SwinTransformer-tiny
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 264 |
+| +amp | 499 |
+| amp&&compile(default) | 789 |
+
+Test environment:
+
+```
+PyTorch version: 2.0.0+cu118
+Is debug build: False
+CUDA used to build PyTorch: 11.8
+ROCM used to build PyTorch: N/A
+
+OS: Ubuntu 22.04.1 LTS (x86_64)
+GCC version: (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0
+Clang version: Could not collect
+CMake version: version 3.25.0
+Libc version: glibc-2.35
+
+Python version: 3.10.9 | packaged by conda-forge | (main, Feb 2 2023, 20:20:04) [GCC 11.3.0] (64-bit runtime)
+Python platform: Linux-5.4.0-139-generic-x86_64-with-glibc2.35
+Is CUDA available: True
+CUDA runtime version: 11.8.89
+CUDA_MODULE_LOADING set to: LAZY
+GPU models and configuration: GPU 0: Tesla V100-SXM2-32GB
+Nvidia driver version: 525.85.12
+cuDNN version: Probably one of the following:
+/usr/lib/x86_64-linux-gnu/libcudnn.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.7.0
+HIP runtime version: N/A
+MIOpen runtime version: N/A
+Is XNNPACK available: True
+
+CPU:
+Architecture: x86_64
+CPU op-mode(s): 32-bit, 64-bit
+Address sizes: 46 bits physical, 48 bits virtual
+Byte Order: Little Endian
+CPU(s): 10
+On-line CPU(s) list: 0-9
+Vendor ID: GenuineIntel
+Model name: Intel(R) Xeon(R) Platinum 8255C CPU @ 2.50GHz
+CPU family: 6
+Model: 85
+Thread(s) per core: 1
+Core(s) per socket: 10
+Socket(s): 1
+Stepping: 5
+BogoMIPS: 4999.99
+Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 arat avx512_vnni
+Hypervisor vendor: KVM
+Virtualization type: full
+L1d cache: 320 KiB (10 instances)
+L1i cache: 320 KiB (10 instances)
+L2 cache: 40 MiB (10 instances)
+L3 cache: 35.8 MiB (1 instance)
+NUMA node(s): 1
+NUMA node0 CPU(s): 0-9
+Vulnerability Itlb multihit: KVM: Vulnerable
+Vulnerability L1tf: Mitigation; PTE Inversion
+Vulnerability Mds: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
+Vulnerability Meltdown: Mitigation; PTI
+Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
+Vulnerability Retbleed: Vulnerable
+Vulnerability Spec store bypass: Vulnerable
+Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
+Vulnerability Spectre v2: Mitigation; Retpolines, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected
+Vulnerability Srbds: Not affected
+Vulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
+
+Versions of relevant libraries:
+[pip3] numpy==1.23.5
+[pip3] torch==2.0.0+cu118
+[pip3] torchaudio==2.0.1+cu118
+[pip3] torchdata==0.6.0
+[pip3] torchvision==0.15.1+cu118
+[conda] numpy 1.23.5 pypi_0 pypi
+[conda] torch 2.0.0+cu118 pypi_0 pypi
+[conda] torchaudio 2.0.1+cu118 pypi_0 pypi
+[conda] torchdata 0.6.0 pypi_0 pypi
+[conda] torchvision 0.15.1+cu118 pypi_0 pypi
+```
\ No newline at end of file
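
The `compile { ... }` block in `conf/base.conf` is unpacked directly into `torch.compile`, so its keys are exactly the `torch.compile` keyword arguments (fullgraph, dynamic, backend, mode, options, disable), while `use_tf32` flips the two TF32 backend flags set in `codebase/main.py`. Below is a minimal stand-alone sketch of that wiring, assuming the config is parsed with pyhocon (the library behind `ConfigTree` in `main.py`); the helper name `apply_torch2_options` and the inline config string are only for illustration and are not part of the patch.

```python
# Stand-alone sketch (not part of the patch): how the new conf/base.conf options
# could be consumed. Assumes pyhocon; apply_torch2_options is a made-up helper name.
import torch
from pyhocon import ConfigFactory
from torchvision.models import resnet50

def apply_torch2_options(model, conf):
    """Enable TF32 and/or torch.compile according to the config, mirroring main.py."""
    if conf.get_bool("use_tf32"):
        # TF32 speeds up float32 matmuls/convolutions on Ampere or newer GPUs.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    if conf.get_bool("use_compile") and hasattr(torch, "compile"):
        # The whole `compile { ... }` block is unpacked into torch.compile's
        # keyword arguments: fullgraph, dynamic, backend, mode, options, disable.
        model = torch.compile(model=model, **conf.get("compile"))
    return model

conf = ConfigFactory.parse_string("""
use_tf32: true
use_compile: true
compile { fullgraph: false, dynamic: false, backend: inductor, mode: null, options: null, disable: false }
""")
model = apply_torch2_options(resnet50(), conf)
```

Unpacking the whole block with `**conf.get("compile")` means a future `torch.compile` argument can be exposed by editing the config file alone, without touching the training code.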
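
The tables in `doc/benchmark.md` report throughput in images per second, but the benchmarking script itself is not part of this patch; the reported numbers come from this repository's training loop. For orientation only, a generic forward-pass timing sketch on synthetic data is shown below. The batch size, image size, and iteration counts are arbitrary assumptions rather than the settings used for the tables.

```python
# Illustrative only: a generic forward-pass throughput timer on random data.
# This is NOT the script used to produce the tables; all sizes are assumptions.
import time

import torch
from torchvision.models import resnet50

@torch.no_grad()
def forward_throughput(model, batch_size=64, image_size=224, warmup=10, iters=50):
    """Return a rough imgs/s estimate for the forward pass on a CUDA device."""
    x = torch.randn(batch_size, 3, image_size, image_size, device="cuda")
    for _ in range(warmup):
        model(x)                  # warm-up also absorbs torch.compile's first-call compilation cost
    torch.cuda.synchronize()      # make sure warm-up kernels finished before starting the clock
    start = time.perf_counter()
    for _ in range(iters):
        model(x)
    torch.cuda.synchronize()      # wait for all queued kernels before stopping the clock
    return batch_size * iters / (time.perf_counter() - start)

if __name__ == "__main__":
    model = resnet50().cuda().eval()
    if hasattr(torch, "compile"):  # torch.compile only exists on PyTorch >= 2.0
        model = torch.compile(model)
    print(f"{forward_throughput(model):.1f} imgs/s")
```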