Add training recipes for ViT + IN (facebookresearch#658)
Summary: Pull Request resolved: facebookresearch#658

Reviewed By: vreis

Differential Revision: D25004760

Pulled By: mannatsingh

fbshipit-source-id: 8204dc0ceeaecef86c504f55f1f44ac309f656ff
mannatsingh authored and facebook-github-bot committed Nov 19, 2020
1 parent 0e129dd commit f19ef29
Showing 9 changed files with 928 additions and 0 deletions.
44 changes: 44 additions & 0 deletions examples/vit/README.md
@@ -0,0 +1,44 @@
# An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale

*Dosovitskiy, Alexey, et al. "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale." arXiv preprint arXiv:2010.11929 (2020).*

https://arxiv.org/abs/2010.11929

## Introduction

This paper takes Transformer-based models, which have been extremely successful in NLP (e.g. GPT-3), and makes them work for computer vision by applying attention over image patches.

## Training Recipes

- These recipes were used to train models with mixed precision on 16GB V100 GPUs
- Depending on the GPUs available, the `batchsize_per_replica` in any config can be adjusted, and mixed precision training can be disabled
- We use gradient accumulation in all our training runs with a pre-defined global batch size (`simulated_global_batchsize`)
- This means these configs can be used with any number of GPUs, as long as `simulated_global_batchsize` is divisible by `batchsize_per_replica * num_gpus` (see the sketch after this list)
- Users need to download ImageNet 1K and modify the config so that the train and val paths point to the downloaded data
- In addition to the paper's training setup, we use label smoothing, mixup, and AutoAugment with reasonable defaults
- Note that the training hyperparameters here are borrowed from the paper, so the results can likely be improved with further tuning
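
For illustration, here is how the gradient accumulation step count falls out of these settings (a plain-Python sketch, not Classy Vision internals; the GPU count is an assumed example):

```python
# Values taken from the ViT-B/16 config; num_gpus is an assumed example.
simulated_global_batchsize = 4096
batchsize_per_replica = 64
num_gpus = 8

global_batchsize = batchsize_per_replica * num_gpus
assert simulated_global_batchsize % global_batchsize == 0, (
    "simulated_global_batchsize must be divisible by batchsize_per_replica * num_gpus"
)
# Gradients are accumulated over this many forward/backward passes per update
accumulation_steps = simulated_global_batchsize // global_batchsize
print(accumulation_steps)  # 8
```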

### Pre-training on ImageNet 1K

| Model | Training configuration | Top-1 Accuracy (%) |
| --- | --- | --- |
| ViT-B/32 | [vit_b32_in.json](vit_b32_in.json) | 73.30 |
| ViT-B/16 | [vit_b16_in.json](vit_b16_in.json) | 78.98 |
| ViT-L/32 | [vit_l32_in.json](vit_l32_in.json) | 73.60 |
| ViT-L/16 | [vit_l16_in.json](vit_l16_in.json) | 76.53 (\*)|

*(\*) training diverged and accuracy went to 0 around phase 87*

### Fine-tuning on ImageNet 1K

- The numbers reported in Table 5 of the paper for ImageNet include an additional fine-tuning step after pre-training, at a higher resolution of 384
- The `pretrained_checkpoint` in the config needs to point to a checkpoint from a pre-training run

| Model | Training configuration | Top-1 Accuracy (%) | Paper Top-1 Accuracy (%) |
| --- | --- | --- | --- |
| ViT-B/32 | [vit_b32_in_ft.json](vit_b32_in_ft.json) | 76.67 | 73.38 |
| ViT-B/16 | [vit_b16_in_ft.json](vit_b16_in_ft.json) | 79.76 | 77.91 |
| ViT-L/32 | [vit_l32_in_ft.json](vit_l32_in_ft.json) | 75.38 | 71.16 |
| ViT-L/16 | [vit_l16_in_ft.json](vit_l16_in_ft.json) | 77.02 (\*)| 76.53 |

*(\*) pre-training diverged and accuracy went to 0 around phase 87*
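
Fine-tuning at 384×384 after pre-training at 224×224 lengthens the patch sequence, so the learned position embeddings have to be adapted; the paper does this by 2D interpolation of the pre-trained position embeddings. A minimal sketch of that idea is below (it assumes a `[1, 1 + N, D]` embedding with a leading class token, and is an illustration rather than Classy Vision's actual code):

```python
import torch
import torch.nn.functional as F

def interpolate_pos_embed(pos_embed, old_grid=14, new_grid=24):
    """Resize ViT position embeddings from old_grid**2 to new_grid**2 patches.

    pos_embed: [1, 1 + old_grid**2, dim]; index 0 is the class token.
    For ViT-B/16: 224 / 16 = 14 -> 384 / 16 = 24.
    """
    cls_tok, patch_pos = pos_embed[:, :1], pos_embed[:, 1:]
    dim = pos_embed.shape[-1]
    # [1, N, D] -> [1, D, old_grid, old_grid] so 2D interpolation applies
    patch_pos = patch_pos.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2)
    patch_pos = F.interpolate(patch_pos, size=(new_grid, new_grid),
                              mode="bicubic", align_corners=False)
    patch_pos = patch_pos.permute(0, 2, 3, 1).reshape(1, new_grid * new_grid, dim)
    return torch.cat([cls_tok, patch_pos], dim=1)

print(interpolate_pos_embed(torch.randn(1, 1 + 14 * 14, 768)).shape)
# torch.Size([1, 577, 768]) -- 24 * 24 patches plus the class token
```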
117 changes: 117 additions & 0 deletions examples/vit/vit_b16_in.json
@@ -0,0 +1,117 @@
{
"name": "classification_task",
"num_epochs": 300,
"loss": {
"name": "label_smoothing_cross_entropy",
"smoothing_param": 0.1
},
"mixup": {
"alpha": 0.2,
"num_classes": 1000
},
"simulated_global_batchsize": 4096,
"clip_grad_norm": 1,
"amp_args": {
"opt_level": "O1"
},
"dataset": {
"train": {
"name": "image_path",
"image_folder": "PATH/TO/IMAGENET/TRAIN",
"batchsize_per_replica": 64,
"num_samples": null,
"use_shuffle": true,
"transforms": [
{
"name": "apply_transform_to_key",
"transforms": [
{"name": "RandomResizedCrop", "size": 224},
{"name": "RandomHorizontalFlip"},
{"name": "imagenet_autoaugment"},
{"name": "ToTensor"},
{
"name": "Normalize",
"mean": [0.485, 0.456, 0.406],
"std": [0.229, 0.224, 0.225]
}
],
"key": "input"
}
]
},
"test": {
"name": "image_path",
"image_folder": "PATH/TO/IMAGENET/VAL",
"batchsize_per_replica": 64,
"num_samples": null,
"use_shuffle": false,
"transforms": [
{
"name": "apply_transform_to_key",
"transforms": [
{"name": "Resize", "size": 256},
{"name": "CenterCrop", "size": 224},
{"name": "ToTensor"},
{
"name": "Normalize",
"mean": [0.485, 0.456, 0.406],
"std": [0.229, 0.224, 0.225]
}
],
"key": "input"
}
]
}
},
"meters": {
"accuracy": {
"topk": [1, 5]
}
},
"model": {
"name": "vision_transformer",
"image_size": 224,
"patch_size": 16,
"hidden_dim": 768,
"mlp_dim": 3072,
"num_heads": 12,
"num_layers": 12,
"attention_dropout_rate": 0,
"dropout_rate": 0.1,
"heads": [
{
"name": "vision_transformer_head",
"unique_id": "default_head",
"num_classes": 1000,
"fork_block": "trunk_output",
"in_plane": 768,
"hidden_dim": 3072
}
]
},
"optimizer": {
"name": "adamw",
"betas": [0.9, 0.999],
"weight_decay": 0.3,
"param_schedulers": {
"lr": {
"name": "composite",
"schedulers": [
{
"name": "linear",
"start_value": 0,
"end_value": 0.003
},
{
"name": "cosine",
"start_value": 0.003,
"end_value": 0
}
],
"update_interval": "step",
"interval_scaling": ["rescaled", "fixed"],
"lengths": [0.1, 0.9]
}
}
}
}
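
The composite scheduler above warms the learning rate up linearly from 0 to 0.003 over the first 10% of steps (its interval is "rescaled"), then decays it to 0 with a cosine curve; our reading of the "fixed" interval scaling is that the cosine is evaluated at the global training progress. A sketch of the resulting schedule (plain Python mirroring the config, not Classy Vision's scheduler classes):

```python
import math

def lr_at(progress, peak_lr=0.003, warmup_frac=0.1):
    """Learning rate as a function of overall training progress in [0, 1]."""
    if progress < warmup_frac:
        # "rescaled" linear warmup: 0 -> peak_lr over the first 10% of steps
        return peak_lr * progress / warmup_frac
    # "fixed" cosine: evaluated at global progress, reaching 0 at the end
    return peak_lr * 0.5 * (1 + math.cos(math.pi * progress))

print(lr_at(0.05))  # 0.0015, halfway through warmup
print(lr_at(0.5))   # 0.0015, the cosine's halfway point
print(lr_at(1.0))   # ~0 at the end of training
```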
104 changes: 104 additions & 0 deletions examples/vit/vit_b16_in_ft.json
@@ -0,0 +1,104 @@
{
"name": "fine_tuning",
"reset_heads": false,
"freeze_trunk": false,
"pretrained_checkpoint": "PATH/TO/PRETRAINED/CHECKPOINT",
"num_epochs": 8,
"loss": {
"name": "soft_target_cross_entropy"
},
"clip_grad_norm": 1,
"simulated_global_batchsize": 512,
"amp_args": {
"opt_level": "O1"
},
"dataset": {
"train": {
"name": "image_path",
"image_folder": "PATH/TO/IMAGENET/TRAIN",
"batchsize_per_replica": 16,
"num_samples": null,
"use_shuffle": true,
"transforms": [
{
"name": "apply_transform_to_key",
"transforms": [
{"name": "RandomResizedCrop", "size": 384},
{"name": "RandomHorizontalFlip"},
{"name": "imagenet_autoaugment"},
{"name": "ToTensor"},
{
"name": "Normalize",
"mean": [0.485, 0.456, 0.406],
"std": [0.229, 0.224, 0.225]
}
],
"key": "input"
}
]
},
"test": {
"name": "image_path",
"image_folder": "PATH/TO/IMAGENET/VAL",
"batchsize_per_replica": 16,
"num_samples": null,
"use_shuffle": false,
"transforms": [
{
"name": "apply_transform_to_key",
"transforms": [
{"name": "Resize", "size": 438},
{"name": "CenterCrop", "size": 384},
{"name": "ToTensor"},
{
"name": "Normalize",
"mean": [0.485, 0.456, 0.406],
"std": [0.229, 0.224, 0.225]
}
],
"key": "input"
}
]
}
},
"meters": {
"accuracy": {
"topk": [1, 5]
}
},
"model": {
"name": "vision_transformer",
"image_size": 384,
"patch_size": 16,
"hidden_dim": 768,
"mlp_dim": 3072,
"num_heads": 12,
"num_layers": 12,
"attention_dropout_rate": 0,
"dropout_rate": 0.1,
"heads": [
{
"name": "vision_transformer_head",
"unique_id": "default_head",
"num_classes": 1000,
"fork_block": "trunk_output",
"in_plane": 768,
"hidden_dim": 3072
}
]
},
"optimizer": {
"name": "sgd",
"weight_decay": 0,
"momentum": 0.9,
"nesterov": true,
"param_schedulers": {
"lr": {
"name": "cosine",
"start_value": 0.01,
"end_value": 0
}
}
}
}
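
Note that fine-tuning keeps `patch_size` at 16 while `image_size` grows to 384, so the token sequence grows from (224 / 16)² = 196 to (384 / 16)² = 576 patches (plus the class token). A small sketch of the standard ViT patch-embedding stem that produces these tokens (an illustration under those settings, not Classy Vision's exact module):

```python
import torch
from torch import nn

hidden_dim, patch_size = 768, 16  # from the config
# Conv2d with kernel = stride = patch_size cuts the image into
# non-overlapping patches and linearly projects each one to hidden_dim.
patch_embed = nn.Conv2d(3, hidden_dim, kernel_size=patch_size, stride=patch_size)

for image_size in (224, 384):
    x = torch.randn(1, 3, image_size, image_size)
    tokens = patch_embed(x).flatten(2).transpose(1, 2)  # [1, num_patches, hidden_dim]
    print(image_size, tokens.shape)  # 224 -> [1, 196, 768]; 384 -> [1, 576, 768]
```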
117 changes: 117 additions & 0 deletions examples/vit/vit_b32_in.json
@@ -0,0 +1,117 @@
{
"name": "classification_task",
"num_epochs": 300,
"loss": {
"name": "label_smoothing_cross_entropy",
"smoothing_param": 0.1
},
"mixup": {
"alpha": 0.2,
"num_classes": 1000
},
"simulated_global_batchsize": 4096,
"clip_grad_norm": 1,
"amp_args": {
"opt_level": "O1"
},
"dataset": {
"train": {
"name": "image_path",
"image_folder": "PATH/TO/IMAGENET/TRAIN",
"batchsize_per_replica": 256,
"num_samples": null,
"use_shuffle": true,
"transforms": [
{
"name": "apply_transform_to_key",
"transforms": [
{"name": "RandomResizedCrop", "size": 224},
{"name": "RandomHorizontalFlip"},
{"name": "imagenet_autoaugment"},
{"name": "ToTensor"},
{
"name": "Normalize",
"mean": [0.485, 0.456, 0.406],
"std": [0.229, 0.224, 0.225]
}
],
"key": "input"
}
]
},
"test": {
"name": "image_path",
"image_folder": "PATH/TO/IMAGENET/VAL",
"batchsize_per_replica": 256,
"num_samples": null,
"use_shuffle": false,
"transforms": [
{
"name": "apply_transform_to_key",
"transforms": [
{"name": "Resize", "size": 256},
{"name": "CenterCrop", "size": 224},
{"name": "ToTensor"},
{
"name": "Normalize",
"mean": [0.485, 0.456, 0.406],
"std": [0.229, 0.224, 0.225]
}
],
"key": "input"
}
]
}
},
"meters": {
"accuracy": {
"topk": [1, 5]
}
},
"model": {
"name": "vision_transformer",
"image_size": 224,
"patch_size": 32,
"hidden_dim": 768,
"mlp_dim": 3072,
"num_heads": 12,
"num_layers": 12,
"attention_dropout_rate": 0,
"dropout_rate": 0.1,
"heads": [
{
"name": "vision_transformer_head",
"unique_id": "default_head",
"num_classes": 1000,
"fork_block": "trunk_output",
"in_plane": 768,
"hidden_dim": 3072
}
]
},
"optimizer": {
"name": "adamw",
"betas": [0.9, 0.999],
"weight_decay": 0.3,
"param_schedulers": {
"lr": {
"name": "composite",
"schedulers": [
{
"name": "linear",
"start_value": 0,
"end_value": 0.003
},
{
"name": "cosine",
"start_value": 0.003,
"end_value": 0
}
],
"update_interval": "step",
"interval_scaling": ["rescaled", "fixed"],
"lengths": [0.1, 0.9]
}
}
}
}
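
Both pre-training configs pair label smoothing (`smoothing_param: 0.1`) with mixup (`alpha: 0.2`), which blends pairs of images and their targets. A hedged sketch of the mixup idea in generic PyTorch (not Classy Vision's actual `mixup` transform):

```python
import torch

def mixup_batch(images, one_hot_targets, alpha=0.2):
    """Blend each sample with a shuffled partner; lambda ~ Beta(alpha, alpha)."""
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(images.size(0))
    mixed_images = lam * images + (1 - lam) * images[perm]
    mixed_targets = lam * one_hot_targets + (1 - lam) * one_hot_targets[perm]
    return mixed_images, mixed_targets

images = torch.randn(8, 3, 224, 224)
targets = torch.eye(1000)[torch.randint(1000, (8,))]  # one-hot labels
mixed_images, mixed_targets = mixup_batch(images, targets)
```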