diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index f15871eccd..8bd1bb121c 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -18,7 +18,8 @@ For more advanced tutorials, refer to our [documentation](https://detectron2.rea
   for example, `mask_rcnn_R_50_FPN_3x.yaml`.
 2. We provide `demo.py` that is able to run builtin standard models. Run it with:
 ```
-python demo/demo.py --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+cd demo/
+python demo.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
   --input input1.jpg input2.jpg \
   [--other-options]
   --opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
@@ -45,29 +46,28 @@ setup the corresponding datasets following
 [datasets/README.md](https://github.com/facebookresearch/detectron2/blob/master/datasets/README.md),
 then run:
 ```
-python tools/train_net.py --num-gpus 8 \
-  --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
+cd tools/
+./train_net.py --num-gpus 8 \
+  --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
 ```
 
-The configs are made for 8-GPU training. To train on 1 GPU, change the batch size with:
+The configs are made for 8-GPU training.
+To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677), e.g.:
 ```
-python tools/train_net.py \
-  --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+./train_net.py \
+  --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
   SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
 ```
 
 For most models, CPU training is not supported.
 
-(Note that we applied the [linear learning rate scaling rule](https://arxiv.org/abs/1706.02677)
-when changing the batch size.)
-
 To evaluate a model's performance, use
 ```
-python tools/train_net.py \
-  --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+./train_net.py \
+  --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
   --eval-only MODEL.WEIGHTS /path/to/checkpoint_file
 ```
-For more options, see `python tools/train_net.py -h`.
+For more options, see `./train_net.py -h`.
 
 ### Use Detectron2 APIs in Your Code
diff --git a/INSTALL.md b/INSTALL.md
index 703dae90e6..de46bfec90 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -94,6 +94,8 @@ python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(
 ```
 print valid outputs at the time you build detectron2.
+
+Most models can run inference (but not training) without GPU support. To use CPUs, set `MODEL.DEVICE='cpu'` in the config.
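
Note (illustration, not part of the patch): the `SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025` override above follows the linear scaling rule referenced by the new link (the provided configs default to 16 images per batch and a base LR of 0.02, and 0.02 × 2/16 = 0.0025), and the INSTALL.md hunk documents CPU-only inference via `MODEL.DEVICE='cpu'`. A minimal sketch of the same CPU override through the Python API is below; the config path and `input1.jpg` are placeholders.

```python
# Minimal sketch (not part of this patch): CPU-only inference through the Python API,
# mirroring the MODEL.DEVICE note added to INSTALL.md above.
# The config path and "input1.jpg" are placeholders; adjust them to your checkout.
import cv2

from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

cfg = get_cfg()
cfg.merge_from_file("configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.WEIGHTS = (
    "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
)
cfg.MODEL.DEVICE = "cpu"  # inference works on CPU for most models; training generally does not

predictor = DefaultPredictor(cfg)              # handles weight loading and preprocessing
outputs = predictor(cv2.imread("input1.jpg"))  # expects a BGR image of shape (H, W, C)
print(outputs["instances"].pred_classes)       # predicted class ids for detected instances
```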
diff --git a/demo/demo.py b/demo/demo.py
old mode 100644
new mode 100755
diff --git a/detectron2/evaluation/cityscapes_evaluation.py b/detectron2/evaluation/cityscapes_evaluation.py
index cdb6fc40b9..6bba332b1b 100644
--- a/detectron2/evaluation/cityscapes_evaluation.py
+++ b/detectron2/evaluation/cityscapes_evaluation.py
@@ -20,6 +20,7 @@ class CityscapesEvaluator(DatasetEvaluator):
     Note:
     * It does not work in multi-machine distributed training.
     * It contains a synchronization, therefore has to be used on all ranks.
+    * Only the main process runs evaluation.
     """
 
     def __init__(self, dataset_name):
diff --git a/detectron2/evaluation/coco_evaluation.py b/detectron2/evaluation/coco_evaluation.py
index b6ce9a3897..08f9a55647 100644
--- a/detectron2/evaluation/coco_evaluation.py
+++ b/detectron2/evaluation/coco_evaluation.py
@@ -42,7 +42,8 @@ def __init__(self, dataset_name, cfg, distributed, output_dir=None):
                 Or it must be in detectron2's standard dataset format
                 so it can be converted to COCO format automatically.
             cfg (CfgNode): config instance
-            distributed (True): if True, will collect results from all ranks for evaluation.
+            distributed (True): if True, will collect results from all ranks and run evaluation
+                in the main process.
                 Otherwise, will evaluate the results in the current process.
             output_dir (str): optional, an output directory to dump all
                 results predicted on the dataset. The dump contains two files:
diff --git a/detectron2/export/api.py b/detectron2/export/api.py
index 0977dff8d0..796425e7b3 100644
--- a/detectron2/export/api.py
+++ b/detectron2/export/api.py
@@ -62,7 +62,8 @@ def export_onnx_model(cfg, model, inputs):
     """
     Export a detectron2 model to ONNX format.
     Note that the exported model contains custom ops only available in caffe2, therefore it
-    cannot be directly executed by other runtime.
+    cannot be directly executed by other runtime. Post-processing or transformation passes
+    may be applied on the model to accommodate different runtimes.
 
     Args:
         cfg (CfgNode): a detectron2 config, with extra export-related options
diff --git a/detectron2/layers/mask_ops.py b/detectron2/layers/mask_ops.py
index 0a2875d355..6ef2053050 100644
--- a/detectron2/layers/mask_ops.py
+++ b/detectron2/layers/mask_ops.py
@@ -70,6 +70,11 @@ def paste_masks_in_image(masks, boxes, image_shape, threshold=0.5):
     The location, height, and width for pasting each mask is determined by their
     corresponding bounding boxes in boxes.
 
+    Note:
+        This is a complicated but more accurate implementation. In actual deployment, it is
+        often enough to use a faster but less accurate implementation.
+        See :func:`paste_mask_in_image_old` in this file for an alternative implementation.
+
     Args:
         masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
             detected object instances in the image and Hmask, Wmask are the mask width and mask
@@ -85,6 +90,7 @@ def paste_masks_in_image(masks, boxes, image_shape, threshold=0.5):
             number of detected object instances and Himage, Wimage are the image width and height.
             img_masks[i] is a binary mask for object instance i.
     """
+    assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
     N = len(masks)
     if N == 0:
diff --git a/detectron2/utils/events.py b/detectron2/utils/events.py
index 5db0e729bb..6ce6a48323 100644
--- a/detectron2/utils/events.py
+++ b/detectron2/utils/events.py
@@ -3,6 +3,7 @@
 import json
 import logging
 import os
+import time
 from collections import defaultdict
 from contextlib import contextmanager
 import torch
@@ -156,21 +157,35 @@ def __init__(self, max_iter):
         """
         self.logger = logging.getLogger(__name__)
         self._max_iter = max_iter
+        self._last_write = None
 
     def write(self):
         storage = get_event_storage()
         iteration = storage.iter
 
-        data_time, time = None, None
-        eta_string = "N/A"
         try:
             data_time = storage.history("data_time").avg(20)
-            time = storage.history("time").global_avg()
+        except KeyError:
+            # they may not exist in the first few iterations (due to warmup)
+            # or when SimpleTrainer is not used
+            data_time = None
+
+        eta_string = "N/A"
+        try:
+            iter_time = storage.history("time").global_avg()
             eta_seconds = storage.history("time").median(1000) * (self._max_iter - iteration)
             storage.put_scalar("eta_seconds", eta_seconds, smoothing_hint=False)
             eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
-        except KeyError:  # they may not exist in the first few iterations (due to warmup)
-            pass
+        except KeyError:
+            iter_time = None
+            # estimate eta on our own - more noisy
+            if self._last_write is not None:
+                estimate_iter_time = (time.perf_counter() - self._last_write[1]) / (
+                    iteration - self._last_write[0]
+                )
+                eta_seconds = estimate_iter_time * (self._max_iter - iteration)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+            self._last_write = (iteration, time.perf_counter())
 
         try:
             lr = "{:.6f}".format(storage.history("lr").latest())
@@ -184,7 +199,7 @@ def write(self):
 
         # NOTE: max_mem is parsed by grep in "dev/parse_results.sh"
         self.logger.info(
-            " eta: {eta} iter: {iter} {losses} {time} {data_time} lr: {lr} {memory}".format(
+            " eta: {eta} iter: {iter} {losses} {time}{data_time}lr: {lr} {memory}".format(
                 eta=eta_string,
                 iter=iteration,
                 losses=" ".join(
@@ -194,8 +209,8 @@ def write(self):
                     if "loss" in k
                 ]
             ),
-                time="time: {:.4f}".format(time) if time is not None else "",
-                data_time="data_time: {:.4f}".format(data_time) if data_time is not None else "",
+                time="time: {:.4f} ".format(iter_time) if iter_time is not None else "",
+                data_time="data_time: {:.4f} ".format(data_time) if data_time is not None else "",
                 lr=lr,
                 memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "",
             )
diff --git a/docs/tutorials/deployment.md b/docs/tutorials/deployment.md
index d37313086a..1a4ffcb6fb 100644
--- a/docs/tutorials/deployment.md
+++ b/docs/tutorials/deployment.md
@@ -10,7 +10,7 @@ Caffe2 conversion requires PyTorch ≥ 1.4 and ONNX ≥ 1.6.
 ### Coverage
 
 It supports 3 most common meta architectures: `GeneralizedRCNN`, `RetinaNet`, `PanopticFPN`,
-and almost all official models under these 3 meta architectures.
+and most official models under these 3 meta architectures.
 
 Users' custom extensions under these architectures (added through registration) are supported
 as long as they do not contain control flow or operators not available in Caffe2 (e.g. deformable convolution).
@@ -25,7 +25,7 @@ these APIs to convert a standard model.
 To convert an official Mask R-CNN trained on COCO, first
 [prepare the COCO dataset](../../datasets/), then pick the model from [Model Zoo](../../MODEL_ZOO.md), and run:
 ```
-python tools/caffe2_converter.py --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+cd tools/ && ./caffe2_converter.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
   --output ./caffe2_model --run-eval \
   MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl \
   MODEL.DEVICE cpu
@@ -50,7 +50,7 @@ You can also load `model.pb` to tools such as [netron](https://github.com/lutzro
 
 ### Inputs & Outputs
 
-All converted models take two input tensors:
+All converted models (the .pb file) take two input tensors:
 "data" which is an NCHW image, and "im_info" which is a Nx3 tensor of (height, width, unused legacy parameter) for each image
 (the shape of "data" might be larger than that in "im_info" due to padding).
@@ -60,6 +60,8 @@ The models only produce raw outputs from the final layers that are
 not post-processed, because in actual deployment, an application often needs its custom
 lightweight post-processing (e.g. full-image masks for every detected object is often not necessary).
 
-Due to different inputs & outputs formats, the `Caffe2Model.__call__` method includes
-pre/post-processing code in order to match the formats of original detectron2 models.
-They can serve as a reference for pre/post-processing in actual deployment.
+Due to different inputs & outputs formats,
+we provide a wrapper around the converted model, in the [Caffe2Model.__call__](../modules/export.html#detectron2.export.Caffe2Model.__call__) method.
+It has an interface that's identical to the [format of pytorch versions of models](models.html),
+and it internally applies pre/post-processing code to match the formats.
+They can serve as a reference for pre/post-processing in actual deployment.
\ No newline at end of file
diff --git a/tools/benchmark.py b/tools/benchmark.py
index 8d30d5e359..a8090d5561 100755
--- a/tools/benchmark.py
+++ b/tools/benchmark.py
@@ -100,8 +100,9 @@ def benchmark_train(args):
     dummy_data = list(itertools.islice(data_loader, 100))
 
     def f():
+        data = DatasetFromList(dummy_data, copy=False)
         while True:
-            yield from DatasetFromList(dummy_data, copy=False)
+            yield from data
 
     max_iter = 400
     trainer = SimpleTrainer(model, f(), optimizer)
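
Note (illustration, not part of the patch): the deployment.md hunk above describes the `Caffe2Model.__call__` wrapper around the converted model. Below is a hedged sketch of loading the conversion output back and calling it with pytorch-style inputs; it assumes the caffe2 runtime shipped with PyTorch is available, `./caffe2_model` is the `--output` directory from the conversion command, and `input1.jpg` is a placeholder path.

```python
# Illustrative sketch (not part of this patch): run a converted model through the
# Caffe2Model.__call__ wrapper documented in deployment.md above.
import cv2
import torch

from detectron2.export import Caffe2Model

# "./caffe2_model" is the directory written by caffe2_converter.py --output
model = Caffe2Model.load_protobuf("./caffe2_model")
image = cv2.imread("input1.jpg")  # BGR, HWC, uint8

# Same input format as the pytorch versions of models: a list of dicts with a CHW "image" tensor.
inputs = [{"image": torch.as_tensor(image.transpose(2, 0, 1).copy())}]
outputs = model(inputs)  # pre/post-processing is applied internally by the wrapper
print(outputs[0]["instances"])
```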