From db630799c9fc4dedfc37dd1c1efd9b8af2a1bbbf Mon Sep 17 00:00:00 2001
From: chenyaofo
Date: Thu, 16 Mar 2023 22:32:38 +0800
Subject: [PATCH] add code for torch compile and benchmarks.

---
 codebase/main.py            |  19 ++++-
 codebase/models/__init__.py |   6 ++
 conf/base.conf              |  15 +++-
 doc/benchmark.md            | 139 +++++++++++++++++++++++++++++++++++-
 4 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/codebase/main.py b/codebase/main.py
index fe30a9e..2131ff3 100644
--- a/codebase/main.py
+++ b/codebase/main.py
@@ -114,9 +114,22 @@ def prepare_for_training(conf: ConfigTree, output_dir: str, local_rank: int):
     model = model.to(device=get_device(), memory_format=getattr(torch, conf.get("memory_format")))
     criterion = criterion.to(device=get_device())
 
-    image_size = conf.get_int('data.image_size')
-    _logger.info(f"Model details: n_params={compute_nparam(model)/1e6:.2f}M, "
-                 f"flops={compute_flops(model,(1,3, image_size, image_size))/1e6:.2f}M.")
+    if conf.get_bool("use_compile"):
+        if hasattr(torch, "compile"):
+            _logger.info("Use torch.compile to optimize the model, please wait for a while.")
+            model = torch.compile(
+                model=model,
+                **conf.get("compile")
+            )
+        else:
+            _logger.info("PyTorch version is too old to support torch.compile, skipping it.")
+
+    if conf.get_bool("use_tf32"):
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+    # image_size = conf.get_int('data.image_size')
+    # _logger.info(f"Model details: n_params={compute_nparam(model)/1e6:.2f}M, "
+    #              f"flops={compute_flops(model,(1,3, image_size, image_size))/1e6:.2f}M.")
 
     writer = only_master(SummaryWriter(output_dir))
 
diff --git a/codebase/models/__init__.py b/codebase/models/__init__.py
index 05f574f..323fde2 100644
--- a/codebase/models/__init__.py
+++ b/codebase/models/__init__.py
@@ -2,6 +2,8 @@
 import torch.hub
 
 from torchvision.models import resnet18, resnet50
+from torchvision.models import mobilenet_v2, shufflenet_v2_x1_0
+from torchvision.models import vit_b_16, swin_t
 
 from .dummy_model import dummy_model
 from .register import MODEL
@@ -14,3 +16,7 @@ def PyTorchHub(repo: str, name: str, **kwargs):
 
 MODEL.register(resnet18)
 MODEL.register(resnet50)
+MODEL.register(mobilenet_v2)
+MODEL.register(shufflenet_v2_x1_0)
+MODEL.register(vit_b_16)
+MODEL.register(swin_t)
diff --git a/conf/base.conf b/conf/base.conf
index 29e35c6..ed38466 100644
--- a/conf/base.conf
+++ b/conf/base.conf
@@ -17,4 +17,17 @@ sync_batchnorm: false # if true, it will convert all the batchnorm layers into t
 accmulated_steps: 1
 
 set_reproducible: false # if true, the training will be set to reproducible (refer to https://pytorch.org/docs/stable/notes/randomness.html)
-                        # else torch.backends.cudnn.benchmark will be set to True for largest throughput
\ No newline at end of file
+                        # else torch.backends.cudnn.benchmark will be set to True for largest throughput
+
+use_tf32: false # if true, it will use TF32 on NVIDIA Ampere GPUs
+
+use_compile: false # if true, it will compile the model with torch.compile
+
+compile {
+    fullgraph: false
+    dynamic: false
+    backend: inductor
+    mode: null
+    options: null
+    disable: false
+}
\ No newline at end of file
diff --git a/doc/benchmark.md b/doc/benchmark.md
index 2cadd32..da38b63 100644
--- a/doc/benchmark.md
+++ b/doc/benchmark.md
@@ -1,4 +1,5 @@
 ## Throughput Benchmark
+
 We test this code on NVIDIA A100 and report the throughput in the followings.
 
 | settings | throughput (imgs/s) |
@@ -10,7 +11,7 @@ We test this code on NVIDIA A100 and report the throughput in the followings.
 
 > Check for NVIDIA impl and **Throughput Benchmark** at https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Classification/ConvNets/resnet50v1.5/README.md#training-performance-results
 
-## Test Environment
+Test environment:
 
 ```
 PyTorch version: 1.12.1+cu113
@@ -48,4 +49,140 @@ Versions of relevant libraries:
 [conda] torchaudio 0.12.1+cu113 pypi_0 pypi
 [conda] torchmetrics 0.9.3 pypi_0 pypi
 [conda] torchvision 0.13.1+cu113 pypi_0 pypi
-```
\ No newline at end of file
+```
+
+## More Benchmarks on PyTorch 2.0
+
+We test this code on NVIDIA V100 and report the throughput in the following tables.
+
+ - Benchmarks on ResNet-50
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 345 |
+| +channels_last | 345 |
+| +amp | 774 |
+| +channels_last&&amp | 1175 |
+| +channels_last&&amp&&compile(default) | 1228 |
+| +channels_last&&amp&&compile(default+fullgraph) | 1228 |
+| +channels_last&&amp&&compile(reduce-overhead) | 1234 |
+| +channels_last&&amp&&compile(max-autotune) | FAIL |
+
+ - Benchmarks on MobileNetV2
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 813 |
+| +channels_last | 420 |
+| +amp | 1315 |
+| +channels_last&&amp | 2100 |
+| +channels_last&&amp&&compile(default) | 2316 |
+
+ - Benchmarks on ShuffleNetV2
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 2342 |
+| +channels_last | 1854 |
+| +amp | 3250 |
+| +channels_last&&amp | 3862 |
+| +channels_last&&amp&&compile(default) | 4711 |
+
+ - Benchmarks on ViT-B16
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 102 |
+| +amp | 360 |
+| amp&&compile(default) | 289 |
+
+ - Benchmarks on SwinTransformer-tiny
+
+| settings | throughput (imgs/s) |
+| --- | --- |
+| baseline | 264 |
+| +amp | 499 |
+| amp&&compile(default) | 789 |
+
+Test environment:
+
+```
+PyTorch version: 2.0.0+cu118
+Is debug build: False
+CUDA used to build PyTorch: 11.8
+ROCM used to build PyTorch: N/A
+
+OS: Ubuntu 22.04.1 LTS (x86_64)
+GCC version: (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0
+Clang version: Could not collect
+CMake version: version 3.25.0
+Libc version: glibc-2.35
+
+Python version: 3.10.9 | packaged by conda-forge | (main, Feb 2 2023, 20:20:04) [GCC 11.3.0] (64-bit runtime)
+Python platform: Linux-5.4.0-139-generic-x86_64-with-glibc2.35
+Is CUDA available: True
+CUDA runtime version: 11.8.89
+CUDA_MODULE_LOADING set to: LAZY
+GPU models and configuration: GPU 0: Tesla V100-SXM2-32GB
+Nvidia driver version: 525.85.12
+cuDNN version: Probably one of the following:
+/usr/lib/x86_64-linux-gnu/libcudnn.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.7.0
+/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.7.0
+HIP runtime version: N/A
+MIOpen runtime version: N/A
+Is XNNPACK available: True
+
+CPU:
+Architecture: x86_64
+CPU op-mode(s): 32-bit, 64-bit
+Address sizes: 46 bits physical, 48 bits virtual
+Byte Order: Little Endian
+CPU(s): 10
+On-line CPU(s) list: 0-9
+Vendor ID: GenuineIntel
+Model name: Intel(R) Xeon(R) Platinum 8255C CPU @ 2.50GHz
+CPU family: 6
+Model: 85
+Thread(s) per core: 1
+Core(s) per socket: 10
+Socket(s): 1
+Stepping: 5
+BogoMIPS: 4999.99
+Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 arat avx512_vnni
+Hypervisor vendor: KVM
+Virtualization type: full
+L1d cache: 320 KiB (10 instances)
+L1i cache: 320 KiB (10 instances)
+L2 cache: 40 MiB (10 instances)
+L3 cache: 35.8 MiB (1 instance)
+NUMA node(s): 1
+NUMA node0 CPU(s): 0-9
+Vulnerability Itlb multihit: KVM: Vulnerable
+Vulnerability L1tf: Mitigation; PTE Inversion
+Vulnerability Mds: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
+Vulnerability Meltdown: Mitigation; PTI
+Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
+Vulnerability Retbleed: Vulnerable
+Vulnerability Spec store bypass: Vulnerable
+Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
+Vulnerability Spectre v2: Mitigation; Retpolines, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected
+Vulnerability Srbds: Not affected
+Vulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
+
+Versions of relevant libraries:
+[pip3] numpy==1.23.5
+[pip3] torch==2.0.0+cu118
+[pip3] torchaudio==2.0.1+cu118
+[pip3] torchdata==0.6.0
+[pip3] torchvision==0.15.1+cu118
+[conda] numpy 1.23.5 pypi_0 pypi
+[conda] torch 2.0.0+cu118 pypi_0 pypi
+[conda] torchaudio 2.0.1+cu118 pypi_0 pypi
+[conda] torchdata 0.6.0 pypi_0 pypi
+[conda] torchvision 0.15.1+cu118 pypi_0 pypi
+```
\ No newline at end of file
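
The `compile { ... }` block in `conf/base.conf` is unpacked directly into `torch.compile`, so its keys are exactly the `torch.compile` keyword arguments (fullgraph, dynamic, backend, mode, options, disable), while `use_tf32` flips the two TF32 backend flags set in `codebase/main.py`. Below is a minimal stand-alone sketch of that wiring, assuming the config is parsed with pyhocon (the library behind `ConfigTree` in `main.py`); the helper name `apply_torch2_options` and the inline config string are only for illustration and are not part of the patch.

```python
# Stand-alone sketch (not part of the patch): how the new conf/base.conf options
# could be consumed. Assumes pyhocon; apply_torch2_options is a made-up helper name.
import torch
from pyhocon import ConfigFactory
from torchvision.models import resnet50

def apply_torch2_options(model, conf):
    """Enable TF32 and/or torch.compile according to the config, mirroring main.py."""
    if conf.get_bool("use_tf32"):
        # TF32 speeds up float32 matmuls/convolutions on Ampere or newer GPUs.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    if conf.get_bool("use_compile") and hasattr(torch, "compile"):
        # The whole `compile { ... }` block is unpacked into torch.compile's
        # keyword arguments: fullgraph, dynamic, backend, mode, options, disable.
        model = torch.compile(model=model, **conf.get("compile"))
    return model

conf = ConfigFactory.parse_string("""
use_tf32: true
use_compile: true
compile { fullgraph: false, dynamic: false, backend: inductor, mode: null, options: null, disable: false }
""")
model = apply_torch2_options(resnet50(), conf)
```

Unpacking the whole block with `**conf.get("compile")` means a future `torch.compile` argument can be exposed by editing the config file alone, without touching the training code.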
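
The tables in `doc/benchmark.md` report throughput in images per second, but the benchmarking script itself is not part of this patch; the reported numbers come from this repository's training loop. For orientation only, a generic forward-pass timing sketch on synthetic data is shown below. The batch size, image size, and iteration counts are arbitrary assumptions rather than the settings used for the tables.

```python
# Illustrative only: a generic forward-pass throughput timer on random data.
# This is NOT the script used to produce the tables; all sizes are assumptions.
import time

import torch
from torchvision.models import resnet50

@torch.no_grad()
def forward_throughput(model, batch_size=64, image_size=224, warmup=10, iters=50):
    """Return a rough imgs/s estimate for the forward pass on a CUDA device."""
    x = torch.randn(batch_size, 3, image_size, image_size, device="cuda")
    for _ in range(warmup):
        model(x)                  # warm-up also absorbs torch.compile's first-call compilation cost
    torch.cuda.synchronize()      # make sure warm-up kernels finished before starting the clock
    start = time.perf_counter()
    for _ in range(iters):
        model(x)
    torch.cuda.synchronize()      # wait for all queued kernels before stopping the clock
    return batch_size * iters / (time.perf_counter() - start)

if __name__ == "__main__":
    model = resnet50().cuda().eval()
    if hasattr(torch, "compile"):  # torch.compile only exists on PyTorch >= 2.0
        model = torch.compile(model)
    print(f"{forward_throughput(model):.1f} imgs/s")
```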