diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..c286ad0c1 --- /dev/null +++ b/.flake8 @@ -0,0 +1,8 @@ +# This is an example .flake8 config, used when developing *Black* itself. +# Keep in sync with setup.cfg which is used for source packages. + +[flake8] +ignore = E203, E266, E501, W503 +max-line-length = 80 +max-complexity = 18 +select = B,C,E,F,W,T4,B9 diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..eb070dd8c --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +# compilation and distribution +__pycache__ +_ext +*.pyc +*.so +torch_detectron.egg-info/ +torch_detectron/legacy/ +build/ +dist/ + +# pytorch/python/numpy formats +*.pth +*.pkl +*.npy + +# ipython/jupyter notebooks +*.ipynb + +# Editor temporaries +*.swn +*.swo +*.swp +*~ + +# project dirs +/datasets +/models diff --git a/ABSTRACTIONS.md b/ABSTRACTIONS.md new file mode 100644 index 000000000..36947bc59 --- /dev/null +++ b/ABSTRACTIONS.md @@ -0,0 +1,65 @@ +## Abstractions +The main abstractions introduced by `maskrcnn_benchmark` that are useful to +have in mind are the following: + +### ImageList +In PyTorch, the first dimension of the input to the network generally represents +the batch dimension, and thus all elements of the same batch have the same +height / width. +In order to support images with different sizes and aspect ratios in the same +batch, we created the `ImageList` class, which holds internally a batch of +images (of possibly different sizes). The images are padded with zeros such that +they have the same final size and batched over the first dimension. The original +sizes of the images before padding are stored in the `image_sizes` attribute, +and the batched tensor in `tensors`. +We provide a convenience function `to_image_list` that accepts a few different +input types, including a list of tensors, and returns an `ImageList` object. 
+ +```python +from maskrcnn_benchmark.structures.image_list import to_image_list + +images = [torch.rand(3, 100, 200), torch.rand(3, 150, 170)] +batched_images = to_image_list(images) + +# it is also possible to make the final batched image be a multiple of a number +batched_images_32 = to_image_list(images, size_divisible=32) +``` + +### BoxList +The `BoxList` class holds a set of bounding boxes (represented as a `Nx4` tensor) for +a specific image, as well as the size of the image as a `(width, height)` tuple. +It also contains a set of methods that allow to perform geometric +transformations to the bounding boxes (such as cropping, scaling and flipping). +The class accepts bounding boxes from two different input formats: +- `xyxy`, where each box is encoded as `x1`, `y1`, `x2` and `y2` coordinates +- `xywh`, where each box is encoded as `x1`, `y1`, `w` and `h`. + +Additionally, each `BoxList` instance can also hold arbitrary additional information +for each bounding box, such as labels, visibility, probability scores etc. 
+ +Here is an example of how to create a `BoxList` from a list of coordinates: +```python +from maskrcnn_benchmark.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT + +width = 100 +height = 200 +boxes = [ + [0, 10, 50, 50], + [50, 20, 90, 60], + [10, 10, 50, 50] +] +# create a BoxList with 3 boxes +bbox = BoxList(boxes, size=(width, height), mode='xyxy') + +# perform some box transformations, has similar API as PIL.Image +bbox_scaled = bbox.resize((width * 2, height * 3)) +bbox_flipped = bbox.transpose(FLIP_LEFT_RIGHT) + +# add labels for each bbox +labels = torch.tensor([0, 10, 1]) +bbox.add_field('labels', labels) + +# bbox also supports a few operations, like indexing +# here, selects boxes 0 and 2 +bbox_subset = bbox[[0, 2]] +``` diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..0f7ad8bfc --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..fc14cd3c7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,39 @@ +# Contributing to Mask-RCNN Benchmark +We want to make contributing to this project as easy and transparent as +possible. + +## Our Development Process +Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. 
If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: <https://code.facebook.com/cla> + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style +* 4 spaces for indentation rather than tabs +* 80 character line length +* PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) + +## License +By contributing to Mask-RCNN Benchmark, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 000000000..f8f3414d9 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,47 @@ +## Installation + +### Requirements: +- PyTorch 1.0 from a nightly release. 
Installation instructions can be found in https://pytorch.org/get-started/locally/ +- torchvision from master +- cocoapi +- yacs +- (optional) OpenCV for the webcam demo + + +### Step-by-step installation + +```bash +# maskrcnn_benchmark and coco api dependencies +pip install ninja yacs cython + +# follow PyTorch installation in https://pytorch.org/get-started/locally/ +# we give the instructions for CUDA 9.0 +conda install pytorch-nightly -c pytorch + +# install torchvision +cd ~/github +git clone git@github.com:pytorch/vision.git +cd vision +python setup.py install + +# install pycocotools +cd ~/github +git clone git@github.com:cocodataset/cocoapi.git +cd cocoapi/PythonAPI +python setup.py build_ext install + +# install PyTorch Detection +cd ~/github +git clone git@github.com:facebookresearch/maskrcnn-benchmark.git +cd maskrcnn-benchmark +# the following will install the lib with +# symbolic links, so that you can modify +# the files if you want and won't need to +# re-build it +python setup.py build develop + +# or if you are on macOS +# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop +``` + + diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..8585e11b8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Facebook + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md new file mode 100644 index 000000000..d678ef88c --- /dev/null +++ b/MODEL_ZOO.md @@ -0,0 +1,82 @@ +## Model Zoo and Baselines + +### Hardware +- 8 NVIDIA V100 GPUs + +### Software +- PyTorch version: 1.0.0a0+dd2c487 +- CUDA 9.2 +- CUDNN 7.1 +- NCCL 2.2.13-1 + +### End-to-end Faster and Mask R-CNN baselines + +All the baselines were trained using the exact same experimental setup as in Detectron. +We initialize the detection models with ImageNet weights from Caffe2, the same as used by Detectron. + +The pre-trained models are available in the link in the model id. 
+ +backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | mask AP | model id +-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- +R-50-C4 | Fast | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.17130 | 34.8 | - | [6358800](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_C4_1x.pth) +R-50-FPN | Fast | 1x | 2 | 4.4 | 0.3530 | 8.8 | 0.12580 | 36.8 | - | [6358793](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth) +R-101-FPN | Fast | 1x | 2 | 7.1 | 0.4591 | 11.5 | 0.143149 | 39.1 | - | [6358804](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_101_FPN_1x.pth) +X-101-32x8d-FPN | Fast | 1x | 1 | 7.6 | 0.7007 | 35.0 | 0.209965 | 41.2 | - | [6358717](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth) +R-50-C4 | Mask | 1x | 1 | 5.8 | 0.4520 | 22.6 | 0.17796 + 0.028 | 35.6 | 31.5 | [6358801](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_C4_1x.pth) +R-50-FPN | Mask | 1x | 2 | 5.2 | 0.4536 | 11.3 | 0.12966 + 0.034 | 37.8 | 34.2 | [6358792](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth) +R-101-FPN | Mask | 1x | 2 | 7.9 | 0.5665 | 14.2 | 0.15384 + 0.034 | 40.1 | 36.1 | [6358805](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_101_FPN_1x.pth) +X-101-32x8d-FPN | Mask | 1x | 1 | 7.8 | 0.7562 | 37.8 | 0.21739 + 0.034 | 42.2 | 37.8 | [6358718](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_X_101_32x8d_FPN_1x.pth) + + +## Comparison with Detectron and mmdetection + +In the following section, we compare our implementation with [Detectron](https://github.com/facebookresearch/Detectron) +and [mmdetection](https://github.com/open-mmlab/mmdetection). +The same remarks from [mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#training-speed) +about different hardware applies here. 
+ +### Training speed + +The numbers here are in seconds / iteration. The lower, the better. + +type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) +-- | -- | -- | -- +Faster R-CNN R-50 C4 | 0.566 | - | 0.4036 +Faster R-CNN R-50 FPN | 0.544 | 0.554 | 0.3530 +Faster R-CNN R-101 FPN | 0.647 | - | 0.4591 +Faster R-CNN X-101-32x8d FPN | 0.799 | - | 0.7007 +Mask R-CNN R-50 C4 | 0.620 | - | 0.4520 +Mask R-CNN R-50 FPN | 0.889 | 0.690 | 0.4536 +Mask R-CNN R-101 FPN | 1.008 | - | 0.5665 +Mask R-CNN X-101-32x8d FPN | 0.961 | - | 0.7562 + +### Training memory + +The lower, the better + +type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) +-- | -- | -- | -- +Faster R-CNN R-50 C4 | 6.3 | - | 5.8 +Faster R-CNN R-50 FPN | 7.2 | 4.9 | 4.4 +Faster R-CNN R-101 FPN | 8.9 | - | 7.1 +Faster R-CNN X-101-32x8d FPN | 7.0 | - | 7.6 +Mask R-CNN R-50 C4 | 6.6 | - | 5.8 +Mask R-CNN R-50 FPN | 8.6 | 5.9 | 5.2 +Mask R-CNN R-101 FPN | 10.2 | - | 7.9 +Mask R-CNN X-101-32x8d FPN | 7.7 | - | 7.8 + +### Accuracy + +The higher, the better + +type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) +-- | -- | -- | -- +Faster R-CNN R-50 C4 | 34.8 | - | 34.8 +Faster R-CNN R-50 FPN | 36.7 | 36.7 | 36.8 +Faster R-CNN R-101 FPN | 39.4 | - | 39.1 +Faster R-CNN X-101-32x8d FPN | 41.3 | - | 41.2 +Mask R-CNN R-50 C4 | 35.8 & 31.4 | - | 35.6 & 31.5 +Mask R-CNN R-50 FPN | 37.7 & 33.9 | 37.5 & 34.4 | 37.8 & 34.2 +Mask R-CNN R-101 FPN | 40.0 & 35.9 | - | 40.1 & 36.1 +Mask R-CNN X-101-32x8d FPN | 42.1 & 37.3 | - | 42.2 & 37.8 + diff --git a/README.md b/README.md new file mode 100644 index 000000000..47868d039 --- /dev/null +++ b/README.md @@ -0,0 +1,166 @@ +# Faster R-CNN and Mask R-CNN in PyTorch 1.0 + +This project aims at providing the necessary building blocks for easily +creating detection and segmentation models using PyTorch 1.0. 
+ +![alt text](demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png "from http://cocodataset.org/#explore?id=345434") + +## Highlights +- **PyTorch 1.0:** RPN, Faster R-CNN and Mask R-CNN implementations that match or exceed Detectron accuracies +- **Very fast**: up to **2x** faster than [Detectron](https://github.com/facebookresearch/Detectron) and **30%** faster than [mmdetection](https://github.com/open-mmlab/mmdetection) during training. See [MODEL_ZOO.md](MODEL_ZOO.md) for more details. +- **Memory efficient:** uses roughly 500MB less GPU memory than mmdetection during training +- **Multi-GPU training and inference** +- **Batched inference:** can perform inference using multiple images per batch per GPU +- **CPU support for inference:** runs on CPU at inference time. See our [webcam demo](demo) for an example +- Provides pre-trained models for almost all reference Mask R-CNN and Faster R-CNN configurations with 1x schedule. + +## Webcam and Jupyter notebook demo + +We provide a simple webcam demo that illustrates how you can use `maskrcnn_benchmark` for inference: +```bash +cd demo +# by default, it runs on the GPU +# for best results, use min-image-size 800 +python webcam.py --min-image-size 800 +# can also run it on the CPU +python webcam.py --min-image-size 300 MODEL.DEVICE cpu +# or change the model that you want to use +python webcam.py --config-file ../configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu +# in order to see the probability heatmaps, pass --show-mask-heatmaps +python webcam.py --min-image-size 300 --show-mask-heatmaps MODEL.DEVICE cpu +``` + +A notebook with the demo can be found in [demo/Mask_R-CNN_demo.ipynb](demo/Mask_R-CNN_demo.ipynb). + +## Installation + +Check [INSTALL.md](INSTALL.md) for installation instructions. 
+ + +## Model Zoo and Baselines + +Pre-trained models, baselines and comparison with Detectron and mmdetection +can be found in [MODEL_ZOO.md](MODEL_ZOO.md) + +## Inference in a few lines +We provide a helper class to simplify writing inference pipelines using pre-trained models. +Here is how we would do it. Run this from the `demo` folder: +```python +from maskrcnn_benchmark.config import cfg +from predictor import COCODemo + +config_file = "../configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml" + +# update the config options with the config file +cfg.merge_from_file(config_file) +# manual override some options +cfg.merge_from_list(["MODEL.DEVICE", "cpu"]) + +coco_demo = COCODemo( + cfg, + min_image_size=800, + confidence_threshold=0.7, +) +# load image and then run prediction +image = ... +predictions = coco_demo.run_on_opencv_image(image) +``` + +## Perform training on COCO dataset + +For the following examples to work, you need to first install `maskrcnn_benchmark`. + +You will also need to download the COCO dataset. +We recommend to symlink the path to the coco dataset to `datasets/` as follows + +We use `minival` and `valminusminival` sets from [Detectron](https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/data/README.md#coco-minival-annotations) + +```bash +# symlink the coco dataset +cd ~/github/maskrcnn-benchmark +mkdir -p datasets/coco +ln -s /path_to_coco_dataset/annotations datasets/coco/annotations +ln -s /path_to_coco_dataset/train2014 datasets/coco/train2014 +ln -s /path_to_coco_dataset/test2014 datasets/coco/test2014 +ln -s /path_to_coco_dataset/val2014 datasets/coco/val2014 +``` + +You can also configure your own paths to the datasets. +For that, all you need to do is to modify `maskrcnn_benchmark/config/paths_catalog.py` to +point to the location where your dataset is stored. 
+You can also create a new `paths_catalog.py` file which implements the same two classes, +and pass it as a config argument `PATHS_CATALOG` during training. + +### Single GPU training + +```bash +python /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "/path/to/config/file.yaml" +``` + +### Multi-GPU training +We use internally `torch.distributed.launch` in order to launch +multi-gpu training. This utility function from PyTorch spawns as many +Python processes as the number of GPUs we want to use, and each Python +process will only use a single GPU. + +```bash +export NGPUS=8 +python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "path/to/config/file.yaml" +``` + +## Abstractions +For more information on some of the main abstractions in our implementation, see [ABSTRACTIONS.md](ABSTRACTIONS.md). + +## Adding your own dataset + +This implementation adds support for COCO-style datasets. +But adding support for training on a new dataset can be done as follows: +```python +from maskrcnn_benchmark.structures.bounding_box import BoxList + +class MyDataset(object): + def __init__(self, ...): + # as you would do normally + + def __getitem__(self, idx): + # load the image as a PIL Image + image = ... + + # load the bounding boxes as a list of list of boxes + # in this case, for illustrative purposes, we use + # x1, y1, x2, y2 order. + boxes = [[0, 0, 10, 10], [10, 20, 50, 50]] + # and labels + labels = torch.tensor([10, 20]) + + # create a BoxList from the boxes + boxlist = BoxList(boxes, size=image.size, mode="xyxy") + # add the labels to the boxlist + boxlist.add_field("labels", labels) + + if self.transforms: + image, boxlist = self.transforms(image, boxlist) + + # return the image, the boxlist and the idx in your dataset + return image, boxlist, idx + + def get_img_info(self, idx): + # get img_height and img_width. 
This is used if + # we want to split the batches according to the aspect ratio + # of the image, as it can be more efficient than loading the + # image from disk + return {"height": img_height, "width": img_width} +``` +That's it. You can also add extra fields to the boxlist, such as segmentation masks +(using `structures.segmentation_mask.SegmentationMask`), or even your own instance type. + +For a full example of how the `COCODataset` is implemented, check [`maskrcnn_benchmark/data/datasets/coco.py`](maskrcnn_benchmark/data/datasets/coco.py). + +### Note: +While the aforementioned example should work for training, we leverage the +cocoApi for computing the accuracies during testing. Thus, test datasets +should currently follow the cocoApi for now. + +## License + +maskrcnn-benchmark is released under the MIT license. See [LICENSE](LICENSE) for additional details. diff --git a/configs/caffe2/e2e_faster_rcnn_R_101_FPN_1x_caffe2.yaml b/configs/caffe2/e2e_faster_rcnn_R_101_FPN_1x_caffe2.yaml new file mode 100644 index 000000000..e129ac885 --- /dev/null +++ b/configs/caffe2/e2e_faster_rcnn_R_101_FPN_1x_caffe2.yaml @@ -0,0 +1,25 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/35857890/e2e_faster_rcnn_R-101-FPN_1x" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 diff --git a/configs/caffe2/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml b/configs/caffe2/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml new file mode 100644 index 000000000..393defe7f --- /dev/null +++ 
b/configs/caffe2/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml @@ -0,0 +1,5 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/35857197/e2e_faster_rcnn_R-50-C4_1x" +DATASETS: + TEST: ("coco_2014_minival",) diff --git a/configs/caffe2/e2e_faster_rcnn_R_50_FPN_1x_caffe2.yaml b/configs/caffe2/e2e_faster_rcnn_R_50_FPN_1x_caffe2.yaml new file mode 100644 index 000000000..180d737a6 --- /dev/null +++ b/configs/caffe2/e2e_faster_rcnn_R_50_FPN_1x_caffe2.yaml @@ -0,0 +1,25 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/35857345/e2e_faster_rcnn_R-50-FPN_1x" + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 diff --git a/configs/caffe2/e2e_faster_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml b/configs/caffe2/e2e_faster_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml new file mode 100644 index 000000000..166a2ea0e --- /dev/null +++ b/configs/caffe2/e2e_faster_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml @@ -0,0 +1,29 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" 
+ PREDICTOR: "FPNPredictor" + RESNETS: + STRIDE_IN_1X1: False + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +DATASETS: + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 diff --git a/configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml b/configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml new file mode 100644 index 000000000..57da8e8cc --- /dev/null +++ b/configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml @@ -0,0 +1,34 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/35861795/e2e_mask_rcnn_R-101-FPN_1x" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 diff --git a/configs/caffe2/e2e_mask_rcnn_R_50_C4_1x_caffe2.yaml b/configs/caffe2/e2e_mask_rcnn_R_50_C4_1x_caffe2.yaml new file mode 100644 index 000000000..d1d0572f8 --- /dev/null +++ b/configs/caffe2/e2e_mask_rcnn_R_50_C4_1x_caffe2.yaml @@ -0,0 +1,9 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/35858791/e2e_mask_rcnn_R-50-C4_1x" + ROI_MASK_HEAD: + PREDICTOR: "MaskRCNNC4Predictor" + SHARE_BOX_FEATURE_EXTRACTOR: True + MASK_ON: True +DATASETS: + TEST: ("coco_2014_minival",) diff --git a/configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml 
b/configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml new file mode 100644 index 000000000..f0e675df5 --- /dev/null +++ b/configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml @@ -0,0 +1,34 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/35858933/e2e_mask_rcnn_R-50-FPN_1x" + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 diff --git a/configs/caffe2/e2e_mask_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml b/configs/caffe2/e2e_mask_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml new file mode 100644 index 000000000..c97b94073 --- /dev/null +++ b/configs/caffe2/e2e_mask_rcnn_X_101_32x8d_FPN_1x_caffe2.yaml @@ -0,0 +1,38 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://Caffe2Detectron/COCO/36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + 
POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + RESNETS: + STRIDE_IN_1X1: False + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + MASK_ON: True +DATASETS: + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x.yaml new file mode 100644 index 000000000..45b07e06d --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x.yaml @@ -0,0 +1,31 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 diff --git a/configs/e2e_faster_rcnn_R_50_C4_1x.yaml b/configs/e2e_faster_rcnn_R_50_C4_1x.yaml new file mode 100644 index 000000000..5cec224a0 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_50_C4_1x.yaml @@ -0,0 +1,15 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN: + PRE_NMS_TOP_N_TEST: 6000 + POST_NMS_TOP_N_TEST: 1000 +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 
diff --git a/configs/e2e_faster_rcnn_R_50_FPN_1x.yaml b/configs/e2e_faster_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 000000000..267a12c13 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,31 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 diff --git a/configs/e2e_faster_rcnn_X_101_32x8d_FPN_1x.yaml b/configs/e2e_faster_rcnn_X_101_32x8d_FPN_1x.yaml new file mode 100644 index 000000000..9338c8767 --- /dev/null +++ b/configs/e2e_faster_rcnn_X_101_32x8d_FPN_1x.yaml @@ -0,0 +1,36 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + RESNETS: + STRIDE_IN_1X1: False + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + 
SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 diff --git a/configs/e2e_mask_rcnn_R_101_FPN_1x.yaml b/configs/e2e_mask_rcnn_R_101_FPN_1x.yaml new file mode 100644 index 000000000..c2da8f377 --- /dev/null +++ b/configs/e2e_mask_rcnn_R_101_FPN_1x.yaml @@ -0,0 +1,40 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 diff --git a/configs/e2e_mask_rcnn_R_50_C4_1x.yaml b/configs/e2e_mask_rcnn_R_50_C4_1x.yaml new file mode 100644 index 000000000..bfcd25866 --- /dev/null +++ b/configs/e2e_mask_rcnn_R_50_C4_1x.yaml @@ -0,0 +1,19 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN: + PRE_NMS_TOP_N_TEST: 6000 + POST_NMS_TOP_N_TEST: 1000 + ROI_MASK_HEAD: + PREDICTOR: "MaskRCNNC4Predictor" + SHARE_BOX_FEATURE_EXTRACTOR: True + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +SOLVER: + BASE_LR: 
0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 diff --git a/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml b/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 000000000..176e66069 --- /dev/null +++ b/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,40 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 diff --git a/configs/e2e_mask_rcnn_X_101_32x8d_FPN_1x.yaml b/configs/e2e_mask_rcnn_X_101_32x8d_FPN_1x.yaml new file mode 100644 index 000000000..4204419be --- /dev/null +++ b/configs/e2e_mask_rcnn_X_101_32x8d_FPN_1x.yaml @@ -0,0 +1,45 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + 
POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + RESNETS: + STRIDE_IN_1X1: False + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 diff --git a/configs/quick_schedules/e2e_faster_rcnn_R_50_C4_quick.yaml b/configs/quick_schedules/e2e_faster_rcnn_R_50_C4_quick.yaml new file mode 100644 index 000000000..d5eae4457 --- /dev/null +++ b/configs/quick_schedules/e2e_faster_rcnn_R_50_C4_quick.yaml @@ -0,0 +1,24 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN: + PRE_NMS_TOP_N_TEST: 6000 + POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 256 +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + IMS_PER_BATCH: 2 +TEST: + IMS_PER_BATCH: 2 diff --git a/configs/quick_schedules/e2e_faster_rcnn_R_50_FPN_quick.yaml b/configs/quick_schedules/e2e_faster_rcnn_R_50_FPN_quick.yaml new file mode 100644 index 000000000..f69d029f3 --- /dev/null +++ b/configs/quick_schedules/e2e_faster_rcnn_R_50_FPN_quick.yaml @@ -0,0 +1,40 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + 
ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + IMS_PER_BATCH: 4 +TEST: + IMS_PER_BATCH: 2 diff --git a/configs/quick_schedules/e2e_faster_rcnn_X_101_32x8d_FPN_quick.yaml b/configs/quick_schedules/e2e_faster_rcnn_X_101_32x8d_FPN_quick.yaml new file mode 100644 index 000000000..d36ef53ad --- /dev/null +++ b/configs/quick_schedules/e2e_faster_rcnn_X_101_32x8d_FPN_quick.yaml @@ -0,0 +1,44 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + RESNETS: + STRIDE_IN_1X1: False + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + 
IMS_PER_BATCH: 2 +TEST: + IMS_PER_BATCH: 2 diff --git a/configs/quick_schedules/e2e_mask_rcnn_R_50_C4_quick.yaml b/configs/quick_schedules/e2e_mask_rcnn_R_50_C4_quick.yaml new file mode 100644 index 000000000..621dd0f68 --- /dev/null +++ b/configs/quick_schedules/e2e_mask_rcnn_R_50_C4_quick.yaml @@ -0,0 +1,28 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN: + PRE_NMS_TOP_N_TEST: 6000 + POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 256 + ROI_MASK_HEAD: + PREDICTOR: "MaskRCNNC4Predictor" + SHARE_BOX_FEATURE_EXTRACTOR: True + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + IMS_PER_BATCH: 4 +TEST: + IMS_PER_BATCH: 2 diff --git a/configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml b/configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml new file mode 100644 index 000000000..28760d8f9 --- /dev/null +++ b/configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml @@ -0,0 +1,49 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + 
SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + IMS_PER_BATCH: 4 +TEST: + IMS_PER_BATCH: 2 diff --git a/configs/quick_schedules/e2e_mask_rcnn_X_101_32x8d_FPN_quick.yaml b/configs/quick_schedules/e2e_mask_rcnn_X_101_32x8d_FPN_quick.yaml new file mode 100644 index 000000000..a6f1283a3 --- /dev/null +++ b/configs/quick_schedules/e2e_mask_rcnn_X_101_32x8d_FPN_quick.yaml @@ -0,0 +1,53 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + RESNETS: + STRIDE_IN_1X1: False + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + IMS_PER_BATCH: 2 +TEST: + IMS_PER_BATCH: 2 diff --git 
a/configs/quick_schedules/rpn_R_50_C4_quick.yaml b/configs/quick_schedules/rpn_R_50_C4_quick.yaml new file mode 100644 index 000000000..ecf1e8766 --- /dev/null +++ b/configs/quick_schedules/rpn_R_50_C4_quick.yaml @@ -0,0 +1,23 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + RPN: + PRE_NMS_TOP_N_TEST: 12000 + POST_NMS_TOP_N_TEST: 2000 +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + IMS_PER_BATCH: 4 +TEST: + IMS_PER_BATCH: 2 diff --git a/configs/quick_schedules/rpn_R_50_FPN_quick.yaml b/configs/quick_schedules/rpn_R_50_FPN_quick.yaml new file mode 100644 index 000000000..d762b4f9d --- /dev/null +++ b/configs/quick_schedules/rpn_R_50_FPN_quick.yaml @@ -0,0 +1,31 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 2000 + FPN_POST_NMS_TOP_N_TEST: 2000 +DATASETS: + TRAIN: ("coco_2014_minival",) + TEST: ("coco_2014_minival",) +INPUT: + MIN_SIZE_TRAIN: 600 + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (1500,) + MAX_ITER: 2000 + IMS_PER_BATCH: 4 +TEST: + IMS_PER_BATCH: 2 diff --git a/demo/Mask_R-CNN_demo.ipynb b/demo/Mask_R-CNN_demo.ipynb new file mode 100644 index 000000000..0d975eab7 --- /dev/null +++ b/demo/Mask_R-CNN_demo.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mask R-CNN demo\n", + "\n", + "This notebook illustrates one possible way of using `maskrcnn_benchmark` for computing predictions 
on images from an arbitrary URL.\n", + "\n", + "Let's start with a few standard imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.pylab as pylab\n", + "\n", + "import requests\n", + "from io import BytesIO\n", + "from PIL import Image\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# this makes our figures bigger\n", + "pylab.rcParams['figure.figsize'] = 20, 12" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Those are the relevant imports for the detection model" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from maskrcnn_benchmark.config import cfg\n", + "from predictor import COCODemo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We provide a helper class `COCODemo`, which loads a model from the config file, and performs pre-processing, model prediction and post-processing for us.\n", + "\n", + "We can configure several model options by overriding the config options.\n", + "In here, we make the model run on the CPU" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "config_file = \"../configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml\"\n", + "\n", + "# update the config options with the config file\n", + "cfg.merge_from_file(config_file)\n", + "# manual override some options\n", + "cfg.merge_from_list([\"MODEL.DEVICE\", \"cpu\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we create the `COCODemo` object. It contains a few extra options for conveniency, such as the confidence threshold for detections to be shown." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "coco_demo = COCODemo(\n", + " cfg,\n", + " min_image_size=800,\n", + " confidence_threshold=0.7,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define a few helper functions for loading images from a URL" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def load(url):\n", + " \"\"\"\n", + " Given an url of an image, downloads the image and\n", + " returns a PIL image\n", + " \"\"\"\n", + " response = requests.get(url)\n", + " pil_image = Image.open(BytesIO(response.content)).convert(\"RGB\")\n", + " # convert to BGR format\n", + " image = np.array(pil_image)[:, :, [2, 1, 0]]\n", + " return image\n", + "\n", + "def imshow(img):\n", + " plt.imshow(img[:, :, [2, 1, 0]])\n", + " plt.axis(\"off\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now load an image from the COCO dataset. Its reference is in the comment" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# from http://cocodataset.org/#explore?id=345434\n", + "image = load(\"http://farm3.staticflickr.com/2469/3915380994_2e611b1779_z.jpg\")\n", + "imshow(image)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computing the predictions\n", + "\n", + "We provide a `run_on_opencv_image` function, which takes an image as it was loaded by OpenCV (in `BGR` format), and computes the predictions on it, returning an image with the predictions overlaid on the image." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# compute predictions\n", + "predictions = coco_demo.run_on_opencv_image(image)\n", + "imshow(predictions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/demo/README.md b/demo/README.md new file mode 100644 index 000000000..bcd089206 --- /dev/null +++ b/demo/README.md @@ -0,0 +1,16 @@ +## Webcam and Jupyter notebook demo + +This folder contains a simple webcam demo that illustrates how you can use `maskrcnn_benchmark` for inference. 
+ +You can start it by running it from this folder, using one of the following commands: +```bash +# by default, it runs on the GPU +# for best results, use min-image-size 800 +python webcam.py --min-image-size 800 +# can also run it on the CPU +python webcam.py --min-image-size 300 MODEL.DEVICE cpu +# or change the model that you want to use +python webcam.py --config-file ../configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu +# in order to see the probability heatmaps, pass --show-mask-heatmaps +python webcam.py --min-image-size 300 --show-mask-heatmaps MODEL.DEVICE cpu +``` diff --git a/demo/demo_e2e_mask_rcnn_R_50_FPN_1x.png b/demo/demo_e2e_mask_rcnn_R_50_FPN_1x.png new file mode 100644 index 000000000..406351186 Binary files /dev/null and b/demo/demo_e2e_mask_rcnn_R_50_FPN_1x.png differ diff --git a/demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png b/demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png new file mode 100644 index 000000000..3e679f54b Binary files /dev/null and b/demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png differ diff --git a/demo/predictor.py b/demo/predictor.py new file mode 100644 index 000000000..9dc16dbd5 --- /dev/null +++ b/demo/predictor.py @@ -0,0 +1,355 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
class COCODemo(object):
    """
    Helper that builds a detection model from a config, runs it on a single
    OpenCV (BGR) image and draws the selected detections on top of the image.

    Arguments:
        cfg: the config node used to build the model, load its weights
            (cfg.MODEL.WEIGHT) and define the input pre-processing
        confidence_threshold (float): only detections whose `scores` are
            strictly greater than this value are shown
        show_mask_heatmaps (bool): if True, `run_on_opencv_image` returns a
            montage of mask heatmaps instead of the annotated image
        masks_per_dim (int): the montage is a grid of
            masks_per_dim x masks_per_dim masks
        min_image_size (int): size passed to `T.Resize` before the image is
            fed to the model
    """

    # COCO categories for pretty print. The position in the list is the label
    # id predicted by the model, with index 0 being the background.
    CATEGORIES = [
        "__background",
        "person",
        "bicycle",
        "car",
        "motorcycle",
        "airplane",
        "bus",
        "train",
        "truck",
        "boat",
        "traffic light",
        "fire hydrant",
        "stop sign",
        "parking meter",
        "bench",
        "bird",
        "cat",
        "dog",
        "horse",
        "sheep",
        "cow",
        "elephant",
        "bear",
        "zebra",
        "giraffe",
        "backpack",
        "umbrella",
        "handbag",
        "tie",
        "suitcase",
        "frisbee",
        "skis",
        "snowboard",
        "sports ball",
        "kite",
        "baseball bat",
        "baseball glove",
        "skateboard",
        "surfboard",
        "tennis racket",
        "bottle",
        "wine glass",
        "cup",
        "fork",
        "knife",
        "spoon",
        "bowl",
        "banana",
        "apple",
        "sandwich",
        "orange",
        "broccoli",
        "carrot",
        "hot dog",
        "pizza",
        "donut",
        "cake",
        "chair",
        "couch",
        "potted plant",
        "bed",
        "dining table",
        "toilet",
        "tv",
        "laptop",
        "mouse",
        "remote",
        "keyboard",
        "cell phone",
        "microwave",
        "oven",
        "toaster",
        "sink",
        "refrigerator",
        "book",
        "clock",
        "vase",
        "scissors",
        "teddy bear",
        "hair drier",
        "toothbrush",
    ]

    def __init__(
        self,
        cfg,
        confidence_threshold=0.7,
        show_mask_heatmaps=False,
        masks_per_dim=2,
        min_image_size=224,
    ):
        # keep a private copy so that later changes to the caller's cfg do
        # not affect this object
        self.cfg = cfg.clone()
        self.model = build_detection_model(cfg)
        self.model.eval()
        self.device = torch.device(cfg.MODEL.DEVICE)
        self.model.to(self.device)
        self.min_image_size = min_image_size

        checkpointer = DetectronCheckpointer(cfg, self.model)
        _ = checkpointer.load(cfg.MODEL.WEIGHT)

        self.transforms = self.build_transform()

        # a threshold of -1 is passed to the Masker when heatmaps are
        # requested, 0.5 otherwise
        mask_threshold = -1 if show_mask_heatmaps else 0.5
        self.masker = Masker(threshold=mask_threshold, padding=1)

        # used to make colors for each class
        self.palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])

        self.cpu_device = torch.device("cpu")
        self.confidence_threshold = confidence_threshold
        self.show_mask_heatmaps = show_mask_heatmaps
        self.masks_per_dim = masks_per_dim

    def build_transform(self):
        """
        Creates a basic transformation that was used to train the models

        Returns:
            transform (T.Compose): converts an OpenCV image into a resized
                and normalized tensor
        """
        cfg = self.cfg

        # we are loading images with OpenCV, so we don't need to convert them
        # to BGR, they are already! So all we need to do is to multiply
        # by 255 if we want to convert to BGR255 format, or flip the channels
        # if we want it to be in RGB in [0-1] range.
        if cfg.INPUT.TO_BGR255:
            to_bgr_transform = T.Lambda(lambda x: x * 255)
        else:
            to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]])

        normalize_transform = T.Normalize(
            mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD
        )

        transform = T.Compose(
            [
                T.ToPILImage(),
                T.Resize(self.min_image_size),
                T.ToTensor(),
                to_bgr_transform,
                normalize_transform,
            ]
        )
        return transform

    def run_on_opencv_image(self, image):
        """
        Computes the predictions for an image and draws them on a copy of it.

        Arguments:
            image (np.ndarray): an image as returned by OpenCV

        Returns:
            result (np.ndarray): a copy of the image with the boxes, the mask
                contours (if cfg.MODEL.MASK_ON) and the class names drawn on
                top of it. If `show_mask_heatmaps` is set, the montage created
                by `create_mask_montage` is returned instead.
        """
        predictions = self.compute_prediction(image)
        top_predictions = self.select_top_predictions(predictions)

        # draw on a copy so that the caller's image is left untouched
        result = image.copy()
        if self.show_mask_heatmaps:
            return self.create_mask_montage(result, top_predictions)
        result = self.overlay_boxes(result, top_predictions)
        if self.cfg.MODEL.MASK_ON:
            result = self.overlay_mask(result, top_predictions)
        result = self.overlay_class_names(result, top_predictions)

        return result

    def compute_prediction(self, original_image):
        """
        Arguments:
            original_image (np.ndarray): an image as returned by OpenCV

        Returns:
            prediction (BoxList): the detected objects. Additional information
                of the detection properties can be found in the fields of
                the BoxList via `prediction.fields()`
        """
        # apply pre-processing to image
        image = self.transforms(original_image)
        # convert to an ImageList, padded so that it is divisible by
        # cfg.DATALOADER.SIZE_DIVISIBILITY
        image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY)
        image_list = image_list.to(self.device)
        # compute predictions
        with torch.no_grad():
            predictions = self.model(image_list)
        predictions = [o.to(self.cpu_device) for o in predictions]

        # always single image is passed at a time
        prediction = predictions[0]

        # reshape prediction (a BoxList) into the original image size
        height, width = original_image.shape[:2]
        prediction = prediction.resize((width, height))

        if prediction.has_field("mask"):
            # if we have masks, paste the masks in the right position
            # in the image, as defined by the bounding boxes
            masks = prediction.get_field("mask")
            masks = self.masker(masks, prediction)
            prediction.add_field("mask", masks)
        return prediction

    def select_top_predictions(self, predictions):
        """
        Select only predictions which have a `score` > self.confidence_threshold,
        and returns the predictions in descending order of score

        Arguments:
            predictions (BoxList): the result of the computation by the model.
                It should contain the field `scores`.

        Returns:
            prediction (BoxList): the detected objects. Additional information
                of the detection properties can be found in the fields of
                the BoxList via `prediction.fields()`
        """
        scores = predictions.get_field("scores")
        keep = torch.nonzero(scores > self.confidence_threshold).squeeze(1)
        predictions = predictions[keep]
        scores = predictions.get_field("scores")
        _, idx = scores.sort(0, descending=True)
        return predictions[idx]

    def compute_colors_for_labels(self, labels):
        """
        Simple function that adds fixed colors depending on the class

        Arguments:
            labels (Tensor): one integer label per detection

        Returns:
            colors (np.ndarray): a uint8 array with one row of three color
                values per label
        """
        # every label is multiplied by each of the three palette constants;
        # the modulo keeps the values in the uint8 range
        colors = labels[:, None] * self.palette
        colors = (colors % 255).numpy().astype("uint8")
        return colors

    def overlay_boxes(self, image, predictions):
        """
        Adds the predicted boxes on top of the image

        Arguments:
            image (np.ndarray): an image as returned by OpenCV
            predictions (BoxList): the result of the computation by the model.
                It should contain the field `labels`.

        Returns:
            image (np.ndarray): the image with the boxes drawn on it
        """
        labels = predictions.get_field("labels")
        boxes = predictions.bbox

        colors = self.compute_colors_for_labels(labels).tolist()

        for box, color in zip(boxes, colors):
            # OpenCV expects integer pixel coordinates
            box = box.to(torch.int64)
            top_left, bottom_right = box[:2].tolist(), box[2:].tolist()
            image = cv2.rectangle(
                image, tuple(top_left), tuple(bottom_right), tuple(color), 1
            )

        return image

    def overlay_mask(self, image, predictions):
        """
        Adds the instances contours for each predicted object.
        Each label has a different color.

        Arguments:
            image (np.ndarray): an image as returned by OpenCV
            predictions (BoxList): the result of the computation by the model.
                It should contain the field `mask` and `labels`.

        Returns:
            composite (np.ndarray): the image with the contours drawn on it
        """
        masks = predictions.get_field("mask").numpy()
        labels = predictions.get_field("labels")

        colors = self.compute_colors_for_labels(labels).tolist()

        for mask, color in zip(masks, colors):
            thresh = mask[0, :, :, None]
            # OpenCV 3.x returns (image, contours, hierarchy) while the other
            # versions return (contours, hierarchy): the contours are always
            # the second to last element
            contours = cv2.findContours(
                thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
            )[-2]
            image = cv2.drawContours(image, contours, -1, color, 3)

        composite = image

        return composite

    def create_mask_montage(self, image, predictions):
        """
        Create a montage showing the probability heatmaps for each one of the
        detected objects

        Arguments:
            image (np.ndarray): an image as returned by OpenCV
            predictions (BoxList): the result of the computation by the model.
                It should contain the field `mask`.

        Returns:
            montage (np.ndarray): a grid of masks_per_dim x masks_per_dim
                masks, colored with the JET color map
        """
        masks = predictions.get_field("mask")
        masks_per_dim = self.masks_per_dim
        # shrink every mask so that the montage has roughly the size of
        # one original mask
        masks = torch.nn.functional.interpolate(
            masks.float(), scale_factor=1 / masks_per_dim
        ).byte()
        height, width = masks.shape[-2:]
        max_masks = masks_per_dim ** 2
        masks = masks[:max_masks]
        # handle case where we have less detections than max_masks
        if len(masks) < max_masks:
            masks_padded = torch.zeros(max_masks, 1, height, width, dtype=torch.uint8)
            masks_padded[: len(masks)] = masks
            masks = masks_padded
        masks = masks.reshape(masks_per_dim, masks_per_dim, height, width)
        result = torch.zeros(
            (masks_per_dim * height, masks_per_dim * width), dtype=torch.uint8
        )
        # copy every mask into its cell of the grid
        for y in range(masks_per_dim):
            start_y = y * height
            end_y = (y + 1) * height
            for x in range(masks_per_dim):
                start_x = x * width
                end_x = (x + 1) * width
                result[start_y:end_y, start_x:end_x] = masks[y, x]
        return cv2.applyColorMap(result.numpy(), cv2.COLORMAP_JET)

    def overlay_class_names(self, image, predictions):
        """
        Adds detected class names and scores in the positions defined by the
        top-left corner of the predicted bounding box

        Arguments:
            image (np.ndarray): an image as returned by OpenCV
            predictions (BoxList): the result of the computation by the model.
                It should contain the field `scores` and `labels`.

        Returns:
            image (np.ndarray): the image with the text drawn on it
        """
        scores = predictions.get_field("scores").tolist()
        labels = predictions.get_field("labels").tolist()
        labels = [self.CATEGORIES[i] for i in labels]
        boxes = predictions.bbox

        template = "{}: {:.2f}"
        for box, score, label in zip(boxes, scores, labels):
            # OpenCV expects the text origin as plain integers, not as
            # 0-dim tensors (same conversion as in overlay_boxes)
            x, y = box[:2].to(torch.int64).tolist()
            s = template.format(label, score)
            cv2.putText(
                image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
            )

        return image
" + "Model was trained with 800, which gives best results", + ) + parser.add_argument( + "--show-mask-heatmaps", + dest="show_mask_heatmaps", + help="Show a heatmap probability for the top masks-per-dim masks", + action="store_true", + ) + parser.add_argument( + "--masks-per-dim", + type=int, + default=2, + help="Number of heatmaps per dimension to show", + ) + parser.add_argument( + "opts", + help="Modify model config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + # load config from file and command-line arguments + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + # prepare object that handles inference plus adds predictions on top of image + coco_demo = COCODemo( + cfg, + confidence_threshold=args.confidence_threshold, + show_mask_heatmaps=args.show_mask_heatmaps, + masks_per_dim=args.masks_per_dim, + min_image_size=args.min_image_size, + ) + + cam = cv2.VideoCapture(0) + while True: + start_time = time.time() + ret_val, img = cam.read() + composite = coco_demo.run_on_opencv_image(img) + print("Time: {:.2f} s / img".format(time.time() - start_time)) + cv2.imshow("COCO detections", composite) + if cv2.waitKey(1) == 27: + break # esc to quit + cv2.destroyAllWindows() + + +if __name__ == "__main__": + main() diff --git a/maskrcnn_benchmark/config/__init__.py b/maskrcnn_benchmark/config/__init__.py new file mode 100644 index 000000000..22a15023b --- /dev/null +++ b/maskrcnn_benchmark/config/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .defaults import _C as cfg diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py new file mode 100644 index 000000000..12b8eb5d6 --- /dev/null +++ b/maskrcnn_benchmark/config/defaults.py @@ -0,0 +1,269 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import os + +from yacs.config import CfgNode as CN + + +# ----------------------------------------------------------------------------- +# Convention about Training / Test specific parameters +# ----------------------------------------------------------------------------- +# Whenever an argument can be either used for training or for testing, the +# corresponding name will be post-fixed by a _TRAIN for a training parameter, +# or _TEST for a test-specific parameter. +# For example, the number of images during training will be +# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be +# IMAGES_PER_BATCH_TEST + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- + +_C = CN() + +_C.MODEL = CN() +_C.MODEL.RPN_ONLY = False +_C.MODEL.MASK_ON = False +_C.MODEL.DEVICE = "cuda" +_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" + +# If the WEIGHT starts with a catalog://, like :R-50, the code will look for +# the path in paths_catalog. Else, it will use it as the specified absolute +# path +_C.MODEL.WEIGHT = "" + + +# ----------------------------------------------------------------------------- +# INPUT +# ----------------------------------------------------------------------------- +_C.INPUT = CN() +# Size of the smallest side of the image during training +_C.INPUT.MIN_SIZE_TRAIN = 800 # (800,) +# Maximum size of the side of the image during training +_C.INPUT.MAX_SIZE_TRAIN = 1333 +# Size of the smallest side of the image during testing +_C.INPUT.MIN_SIZE_TEST = 800 +# Maximum size of the side of the image during testing +_C.INPUT.MAX_SIZE_TEST = 1333 +# Values to be used for image normalization +_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717] +# Values to be used for image normalization +_C.INPUT.PIXEL_STD = [1., 1., 1.] 
+# Convert image to BGR format (for Caffe2 models), in range 0-255 +_C.INPUT.TO_BGR255 = True + + +# ----------------------------------------------------------------------------- +# Dataset +# ----------------------------------------------------------------------------- +_C.DATASETS = CN() +# List of the dataset names for training, as present in paths_catalog.py +_C.DATASETS.TRAIN = () +# List of the dataset names for testing, as present in paths_catalog.py +_C.DATASETS.TEST = () + +# ----------------------------------------------------------------------------- +# DataLoader +# ----------------------------------------------------------------------------- +_C.DATALOADER = CN() +# Number of data loading threads +_C.DATALOADER.NUM_WORKERS = 4 +# If > 0, this enforces that each collated batch should have a size divisible +# by SIZE_DIVISIBILITY +_C.DATALOADER.SIZE_DIVISIBILITY = 0 +# If True, each batch should contain only images for which the aspect ratio +# is compatible. This groups portrait images together, and landscape images +# are not batched with portrait images. 
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True + +# ---------------------------------------------------------------------------- # +# Backbone options +# ---------------------------------------------------------------------------- # +_C.MODEL.BACKBONE = CN() + +# The backbone conv body to use +# The string must match a function that is imported in modeling.model_builder +# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN +# backbone) +_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4" + +# Add StopGrad at a specified stage so the bottom layers are frozen +_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2 +_C.MODEL.BACKBONE.OUT_CHANNELS = 256 * 4 + + +# ---------------------------------------------------------------------------- # +# RPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.RPN = CN() +_C.MODEL.RPN.USE_FPN = False +# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input +_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) +# Stride of the feature map that RPN is attached. +# For FPN, number of strides should match number of scales +_C.MODEL.RPN.ANCHOR_STRIDE = (16,) +# RPN anchor aspect ratios +_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0) +# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels +# Set to -1 or a large value, e.g. 
100000, to disable pruning anchors +_C.MODEL.RPN.STRADDLE_THRESH = 0 +# Minimum overlap required between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD +# ==> positive RPN example) +_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7 +# Maximum overlap allowed between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD +# ==> negative RPN example) +_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3 +# Total number of RPN examples per image +_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 +# Target fraction of foreground (positive) examples per RPN minibatch +_C.MODEL.RPN.POSITIVE_FRACTION = 0.5 +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000 +_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000 +# Number of top scoring RPN proposals to keep after applying NMS +_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000 +# NMS threshold used on RPN proposals +_C.MODEL.RPN.NMS_THRESH = 0.7 +# Proposal height and width both need to be greater than RPN_MIN_SIZE +# (at the scale used during training or inference) +_C.MODEL.RPN.MIN_SIZE = 0 +# Number of top scoring RPN proposals to keep after combining proposals from +# all FPN levels +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000 + + +# ---------------------------------------------------------------------------- # +# ROI HEADS options +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_HEADS = CN() +_C.MODEL.ROI_HEADS.USE_FPN = False +# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) +_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5 +# Overlap threshold for an RoI to be considered background +# (class = 0 if overlap in [0, BG_IOU_THRESHOLD)) +_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5 +# Default
weights on (dx, dy, dw, dh) for normalizing bbox regression targets +# These are empirically chosen to approximately lead to unit variance targets +_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.) +# RoI minibatch size *per image* (number of regions of interest [ROIs]) +# Total number of RoIs per training minibatch = +# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS +# E.g., a common configuration is: 512 * 2 * 8 = 8192 +_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 +# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0) +_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 + +# Only used on test mode + +# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to +# balance obtaining high recall with not having too many low precision +# detections that will slow down inference post processing steps (like NMS) +_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05 +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +_C.MODEL.ROI_HEADS.NMS = 0.5 +# Maximum number of detections to return per image (100 is based on the limit +# established for the COCO dataset) +_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100 + + +_C.MODEL.ROI_BOX_HEAD = CN() +_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor" +_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81 +# Hidden layer dimension when using an MLP for the RoI box head +_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024 + + +_C.MODEL.ROI_MASK_HEAD = CN() +_C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor" +_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,) 
+_C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024 +_C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256) +_C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14 +_C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True + +# ---------------------------------------------------------------------------- # +# ResNe[X]t options (ResNets = {ResNet, ResNeXt} +# Note that parts of a resnet may be used for both the backbone and the head +# These options apply to both +# ---------------------------------------------------------------------------- # +_C.MODEL.RESNETS = CN() + +# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt +_C.MODEL.RESNETS.NUM_GROUPS = 1 + +# Baseline width of each group +_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 + +# Place the stride 2 conv on the 1x1 filter +# Use True only for the original MSRA ResNet; use False for C2 and Torch models +_C.MODEL.RESNETS.STRIDE_IN_1X1 = True + +# Residual transformation function +_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm" +# ResNet's stem function (conv1 and pool1) +_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm" + +# Apply dilation in stage "res5" +_C.MODEL.RESNETS.RES5_DILATION = 1 + +_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 +_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 + +# ---------------------------------------------------------------------------- # +# Solver +# ---------------------------------------------------------------------------- # +_C.SOLVER = CN() +_C.SOLVER.MAX_ITER = 40000 + +_C.SOLVER.BASE_LR = 0.001 +_C.SOLVER.BIAS_LR_FACTOR = 2 + +_C.SOLVER.MOMENTUM = 0.9 + +_C.SOLVER.WEIGHT_DECAY = 0.0005 +_C.SOLVER.WEIGHT_DECAY_BIAS = 0 + +_C.SOLVER.GAMMA = 0.1 +_C.SOLVER.STEPS = (30000,) + +_C.SOLVER.WARMUP_FACTOR = 1.0 / 3 +_C.SOLVER.WARMUP_ITERS = 500 +_C.SOLVER.WARMUP_METHOD = "linear" + +_C.SOLVER.CHECKPOINT_PERIOD = 2500 + +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.SOLVER.IMS_PER_BATCH = 16 + +# 
---------------------------------------------------------------------------- # +# Specific test options +# ---------------------------------------------------------------------------- # +_C.TEST = CN() +_C.TEST.EXPECTED_RESULTS = [] +_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4 +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.TEST.IMS_PER_BATCH = 8 + + +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # +_C.OUTPUT_DIR = "." + +_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py") diff --git a/maskrcnn_benchmark/config/paths_catalog.py b/maskrcnn_benchmark/config/paths_catalog.py new file mode 100644 index 000000000..67231baef --- /dev/null +++ b/maskrcnn_benchmark/config/paths_catalog.py @@ -0,0 +1,94 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+"""Centralized catalog of paths.""" + +import os + + +class DatasetCatalog(object): + DATA_DIR = "datasets" + + DATASETS = { + "coco_2014_train": ( + "coco/train2014", + "coco/annotations/instances_train2014.json", + ), + "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"), + "coco_2014_minival": ( + "coco/val2014", + "coco/annotations/instances_minival2014.json", + ), + "coco_2014_valminusminival": ( + "coco/val2014", + "coco/annotations/instances_valminusminival2014.json", + ), + } + + @staticmethod + def get(name): + if "coco" in name: + data_dir = DatasetCatalog.DATA_DIR + attrs = DatasetCatalog.DATASETS[name] + args = dict( + root=os.path.join(data_dir, attrs[0]), + ann_file=os.path.join(data_dir, attrs[1]), + ) + return dict( + factory="COCODataset", + args=args, + ) + raise RuntimeError("Dataset not available: {}".format(name)) + + +class ModelCatalog(object): + S3_C2_DETECTRON_URL = "https://s3-us-west-2.amazonaws.com/detectron" + C2_IMAGENET_MODELS = { + "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", + "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", + "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", + } + + C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl" + C2_DETECTRON_MODELS = { + "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW", + "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I", + "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7", + "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ", + "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB", + "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC", + "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT", + "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI", + } + + @staticmethod + def get(name): + if name.startswith("Caffe2Detectron/COCO"): + return ModelCatalog.get_c2_detectron_12_2017_baselines(name) + if 
name.startswith("ImageNetPretrained"): + return ModelCatalog.get_c2_imagenet_pretrained(name) + raise RuntimeError("model not present in the catalog {}".format(name)) + + @staticmethod + def get_c2_imagenet_pretrained(name): + prefix = ModelCatalog.S3_C2_DETECTRON_URL + name = name[len("ImageNetPretrained/"):] + name = ModelCatalog.C2_IMAGENET_MODELS[name] + url = "/".join([prefix, name]) + return url + + @staticmethod + def get_c2_detectron_12_2017_baselines(name): + # Detectron C2 models are stored following the structure + # prefix/{model_id}/12_2017_baselines/{model_name}.yaml.{signature}/suffix + # we use as identifiers in the catalog Caffe2Detectron/COCO/{model_id}/{model_name} + prefix = ModelCatalog.S3_C2_DETECTRON_URL + suffix = ModelCatalog.C2_DETECTRON_SUFFIX + # remove identification prefix + name = name[len("Caffe2Detectron/COCO/"):] + # split in {model_id} and {model_name} + model_id, model_name = name.split("/") + # parsing to make it match the url address from the Caffe2 models + model_name = "{}.yaml".format(model_name) + signature = ModelCatalog.C2_DETECTRON_MODELS[name] + unique_name = ".".join([model_name, signature]) + url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix]) + return url diff --git a/maskrcnn_benchmark/csrc/ROIAlign.h b/maskrcnn_benchmark/csrc/ROIAlign.h new file mode 100644 index 000000000..3907deab2 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIAlign.h @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor ROIAlign_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +} + +at::Tensor ROIAlign_backward(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + diff --git a/maskrcnn_benchmark/csrc/ROIPool.h b/maskrcnn_benchmark/csrc/ROIPool.h new file mode 100644 index 000000000..200fd7390 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIPool.h @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +std::tuple ROIPool_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor ROIPool_backward(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + + diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp new file mode 100644 index 000000000..d35aedf27 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp @@ -0,0 +1,257 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#include "cpu/vision.h" + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward_cpu_kernel( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + //int roi_cols, + T* top_data) { + //AT_ASSERT(roi_cols == 4 || roi_cols == 5); + int roi_cols = 5; + + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = offset_bottom_rois[0]; + offset_bottom_rois++; + } + + // Do not use rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); + T roi_height = std::max(roi_end_h - roi_start_h,
(T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width,
+ const int sampling_ratio) { + AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); + AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + + if (output.numel() == 0) { + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + ROIAlignForward_cpu_kernel( + output_size, + input.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.data(), + output.data()); + }); + return output; +} diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp new file mode 100644 index 000000000..1153dea04 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#include "cpu/vision.h" + + +template +at::Tensor nms_cpu_kernel(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); + AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + } + + auto x1_t = dets.select(1, 0).contiguous(); + auto y1_t = dets.select(1, 1).contiguous(); + auto x2_t = dets.select(1, 2).contiguous(); + auto y2_t = dets.select(1, 3).contiguous(); + + at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); + + auto suppressed = suppressed_t.data(); + auto order = order_t.data(); + auto x1 = x1_t.data(); + auto y1 = y1_t.data(); + auto x2 = x2_t.data(); + auto y2 = y2_t.data(); + auto areas = areas_t.data(); + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) + continue; + auto ix1 = x1[i]; + auto iy1 = y1[i]; + auto ix2 = x2[i]; + auto iy2 = y2[i]; + auto iarea = areas[i]; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) + continue; + auto xx1 = std::max(ix1, x1[j]); + auto yy1 = std::max(iy1, y1[j]); + auto xx2 = std::min(ix2, x2[j]); + auto yy2 = std::min(iy2, y2[j]); + + auto w = std::max(static_cast(0), xx2 - xx1 + 1); + auto h = std::max(static_cast(0), yy2 - yy1 + 1); + auto inter = w * h; + auto ovr = inter / (iarea + areas[j] - inter); + if (ovr >= threshold) + suppressed[j] = 1; + } + } + return at::nonzero(suppressed_t == 0).squeeze(1); +} + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + at::Tensor result; 
+ AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { + result = nms_cpu_kernel(dets, scores, threshold); + }); + return result; +} diff --git a/maskrcnn_benchmark/csrc/cpu/vision.h b/maskrcnn_benchmark/csrc/cpu/vision.h new file mode 100644 index 000000000..926112536 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/vision.h @@ -0,0 +1,16 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include + + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold); diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu new file mode 100644 index 000000000..5fe97ca90 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu @@ -0,0 +1,346 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#include +#include + +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__device__ T bilinear_interpolate(const T* bottom_data, + const int height, const int width, + T y, T x, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + return 0; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int) y; + int x_low = (int) x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + // do bilinear interpolation + T v1 = bottom_data[y_low * width + x_low]; + T v2 = bottom_data[y_low * width + x_high]; + T v3 = bottom_data[y_high * width + x_low]; + T v4 = bottom_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__global__ void RoIAlignForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, T* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = 
offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + + +template +__device__ void bilinear_interpolate_gradient( + const int height, const int width, + T y, T x, + T & w1, T & w2, T & w3, T & w4, + int & x_low, int & x_high, int & y_low, int & y_high, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int) y; + x_low = (int) x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + + // reference in forward + // T v1 = bottom_data[y_low * width + x_low]; + // T v2 = bottom_data[y_low * width + x_high]; + // T v3 = bottom_data[y_high * width + x_low]; + // T v4 = bottom_data[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff, + const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height 
* width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, + w1, w2, w3, w4, + x_low, x_high, y_low, y_high, + index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) + { + atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); + atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); + atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); + atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + + +at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.type().is_cuda(), "input must be a 
CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + RoIAlignForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.contiguous().data(), + output.data()); + }); + THCudaCheck(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { + RoIAlignBackwardFeature<<>>( + grad.numel(), + grad.contiguous().data(), + num_rois, + 
spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu new file mode 100644 index 000000000..b826dd9bc --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu @@ -0,0 +1,202 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include +#include + +#include +#include +#include + + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T* bottom_rois, T* top_data, int* argmax_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int hstart = 
static_cast(floor(static_cast(ph) + * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) + * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) + * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) + * bin_size_w)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + T maxval = is_empty ? 0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxidx = -1; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + if (offset_bottom_data[bottom_index] > maxval) { + maxval = offset_bottom_data[bottom_index]; + maxidx = bottom_index; + } + } + } + top_data[index] = maxval; + argmax_data[index] = maxidx; + } +} + +template +__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, + const int* argmax_data, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int bottom_offset = (roi_batch_ind * channels + c) * height * width; + int top_offset = (n * channels + c) * pooled_height * 
pooled_width; + const T* offset_top_diff = top_diff + top_offset; + T* offset_bottom_diff = bottom_diff + bottom_offset; + const int* offset_argmax_data = argmax_data + top_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + atomicAdd( + offset_bottom_diff + argmax, + static_cast(offset_top_diff[ph * pooled_width + pw])); + + } + } +} + +std::tuple ROIPool_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt)); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { + RoIPoolFForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + rois.contiguous().data(), + output.data(), + argmax.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const 
int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + // TODO add more checks + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] { + RoIPoolFBackward<<>>( + grad.numel(), + grad.contiguous().data(), + argmax.data(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/nms.cu b/maskrcnn_benchmark/csrc/cuda/nms.cu new file mode 100644 index 000000000..d7ccf79b0 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/nms.cu @@ -0,0 +1,128 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
// Number of boxes covered by one 64-bit word of the suppression mask; also
// the CUDA block size used by nms_kernel.
int const threadsPerBlock = sizeof(unsigned long long) * 8;

// Intersection-over-union of two boxes laid out as (x1, y1, x2, y2, ...).
// Widths and heights use the inclusive "+ 1" pixel convention.
__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

// For every box in row-block blockIdx.y, computes a bitmask of the boxes in
// column-block blockIdx.x whose IoU with it exceeds nms_overlap_thresh.
//   dev_boxes: n_boxes x 5 floats (x1, y1, x2, y2, score)
//   dev_mask:  n_boxes x ceil(n_boxes / threadsPerBlock) 64-bit words
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  // the last block along each axis may be only partially filled
  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // stage the column block once so every thread reads it from shared memory
  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      // on the diagonal block only compare against boxes that come later
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

// Non-maximum suppression on the GPU.
//   boxes: N x 5 CUDA float tensor, each row (x1, y1, x2, y2, score)
//   nms_overlap_thresh: boxes whose IoU with an already kept box exceeds
//     this value are suppressed
// Returns the indices (into `boxes`) of the kept boxes, sorted ascending.
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");

  const int boxes_num = boxes.size(0);
  // Nothing to suppress: avoid a zero-sized kernel launch and indexing into
  // empty host vectors below.
  if (boxes_num == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  // process boxes from highest to lowest score
  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

  unsigned long long* mask_dev = (unsigned long long*) THCudaMalloc(
      state, boxes_num * col_blocks * sizeof(unsigned long long));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);
  // surface launch failures here, as the other CUDA entry points do
  THCudaCheck(cudaGetLastError());

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  THCudaCheck(cudaMemcpy(mask_host.data(),
                         mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));

  // remv[j] holds the "already suppressed" bits for column block j
  std::vector<unsigned long long> remv(col_blocks, 0ULL);

  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  // greedy sweep in score order: keep a box unless an earlier kept box
  // already suppressed it, then merge its mask row into remv
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = mask_host.data() + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  THCudaFree(state, mask_dev);
  // TODO improve this part
  // map positions in the sorted order back to original indices, ascending
  return std::get<0>(order_t.index({keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)}).sort(0, false));
}
+#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +at::Tensor nms(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + + if (dets.type().is_cuda()) { +#ifdef WITH_CUDA + // TODO raise error if not compiled with CUDA + if (dets.numel() == 0) + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + auto b = at::cat({dets, scores.unsqueeze(1)}, 1); + return nms_cuda(b, threshold); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + at::Tensor result = nms_cpu(dets, scores, threshold); + return result; +} diff --git a/maskrcnn_benchmark/csrc/vision.cpp b/maskrcnn_benchmark/csrc/vision.cpp new file mode 100644 index 000000000..ff002584c --- /dev/null +++ b/maskrcnn_benchmark/csrc/vision.cpp @@ -0,0 +1,13 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "nms.h" +#include "ROIAlign.h" +#include "ROIPool.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("nms", &nms, "non-maximum suppression"); + m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); + m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); + m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); + m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); +} diff --git a/maskrcnn_benchmark/data/__init__.py b/maskrcnn_benchmark/data/__init__.py new file mode 100644 index 000000000..2ba1e5247 --- /dev/null +++ b/maskrcnn_benchmark/data/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .build import make_data_loader diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py new file mode 100644 index 000000000..86d3829e2 --- /dev/null +++ b/maskrcnn_benchmark/data/build.py @@ -0,0 +1,168 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import bisect +import logging + +import torch.utils.data +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.imports import import_file + +from . import datasets as D +from . import samplers + +from .collate_batch import BatchCollator +from .transforms import build_transforms + + +def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True): + """ + Arguments: + dataset_list (list[str]): Contains the names of the datasets, i.e., + coco_2014_trian, coco_2014_val, etc + transforms (callable): transforms to apply to each (image, target) sample + dataset_catalog (DatasetCatalog): contains the information on how to + construct a dataset. + is_train (bool): whether to setup the dataset for training or testing + """ + if not isinstance(dataset_list, (list, tuple)): + raise RuntimeError( + "dataset_list should be a list of strings, got {}".format(dataset_list)) + datasets = [] + for dataset_name in dataset_list: + data = dataset_catalog.get(dataset_name) + factory = getattr(D, data["factory"]) + args = data["args"] + # for COCODataset, we want to remove images without annotations + # during training + if data["factory"] == "COCODataset": + args["remove_images_without_annotations"] = is_train + args["transforms"] = transforms + # make dataset from factory + dataset = factory(**args) + datasets.append(dataset) + + # for testing, return a list of datasets + if not is_train: + return datasets + + # for training, concatenate all datasets into a single one + dataset = datasets[0] + if len(datasets) > 1: + dataset = D.ConcatDataset(datasets) + + return [dataset] + + +def make_data_sampler(dataset, shuffle, distributed): + if distributed: + return samplers.DistributedSampler(dataset, shuffle=shuffle) + if shuffle: + sampler = torch.utils.data.sampler.RandomSampler(dataset) + else: + sampler = torch.utils.data.sampler.SequentialSampler(dataset) + return sampler + + +def _quantize(x, bins): + bins = sorted(bins.copy()) + quantized 
= list(map(lambda y: bisect.bisect_right(bins, y), x)) + return quantized + + +def _compute_aspect_ratios(dataset): + aspect_ratios = [] + for i in range(len(dataset)): + img_info = dataset.get_img_info(i) + aspect_ratio = float(img_info["height"]) / float(img_info["width"]) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0 +): + if aspect_grouping: + if not isinstance(aspect_grouping, (list, tuple)): + aspect_grouping = [aspect_grouping] + aspect_ratios = _compute_aspect_ratios(dataset) + group_ids = _quantize(aspect_ratios, aspect_grouping) + batch_sampler = samplers.GroupedBatchSampler( + sampler, group_ids, images_per_batch, drop_uneven=False + ) + else: + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, images_per_batch, drop_last=False + ) + if num_iters is not None: + batch_sampler = samplers.IterationBasedBatchSampler(batch_sampler, num_iters, start_iter) + return batch_sampler + + +def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): + num_gpus = get_world_size() + if is_train: + images_per_batch = cfg.SOLVER.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = images_per_batch // num_gpus + shuffle = True + num_iters = cfg.SOLVER.MAX_ITER + else: + images_per_batch = cfg.TEST.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = images_per_batch // num_gpus + shuffle = False if not is_distributed else True + num_iters = None + start_iter = 0 + + if images_per_gpu > 1: + logger = logging.getLogger(__name__) + logger.warning( + "When using more than one image per GPU you may encounter " + "an 
out-of-memory (OOM) error if your GPU does not have " + "sufficient memory. If this happens, you can reduce " + "SOLVER.IMS_PER_BATCH (for training) or " + "TEST.IMS_PER_BATCH (for inference). For training, you must " + "also adjust the learning rate and schedule length according " + "to the linear scaling rule. See for example: " + "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" + ) + + # group images which have similar aspect ratio. In this case, we only + # group in two cases: those with width / height > 1, and the other way around, + # but the code supports more general grouping strategy + aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] + + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True + ) + DatasetCatalog = paths_catalog.DatasetCatalog + dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST + + transforms = build_transforms(cfg, is_train) + datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) + + data_loaders = [] + for dataset in datasets: + sampler = make_data_sampler(dataset, shuffle, is_distributed) + batch_sampler = make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter + ) + collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) + num_workers = cfg.DATALOADER.NUM_WORKERS + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=collator, + ) + data_loaders.append(data_loader) + if is_train: + # during training, a single (possibly concatenated) data_loader is returned + assert len(data_loaders) == 1 + return data_loaders[0] + return data_loaders diff --git a/maskrcnn_benchmark/data/collate_batch.py b/maskrcnn_benchmark/data/collate_batch.py new file mode 100644 index 000000000..a7f034167 --- /dev/null +++ 
class BatchCollator(object):
    """
    Collate function for the DataLoader: turns a list of dataset samples
    into batched images plus the per-sample targets and image ids.
    """

    def __init__(self, size_divisible=0):
        # forwarded unchanged to to_image_list on every call
        self.size_divisible = size_divisible

    def __call__(self, batch):
        # regroup [(img, target, id), ...] into (imgs, targets, ids)
        columns = tuple(zip(*batch))
        batched_images = to_image_list(columns[0], self.size_divisible)
        return batched_images, columns[1], columns[2]
class COCODataset(torchvision.datasets.coco.CocoDetection):
    """
    CocoDetection variant whose samples are ``(image, target, index)``.

    The target carries the non-crowd boxes converted to ``xyxy`` together
    with a ``labels`` field (contiguous category ids starting at 1) and a
    ``masks`` field.
    """

    def __init__(
        self, ann_file, root, remove_images_without_annotations, transforms=None
    ):
        super(COCODataset, self).__init__(root, ann_file)

        # sort indices for reproducible results
        self.ids = sorted(self.ids)

        # filter images without detection annotations
        if remove_images_without_annotations:
            annotated_ids = []
            for image_id in self.ids:
                if len(self.coco.getAnnIds(imgIds=image_id, iscrowd=None)) > 0:
                    annotated_ids.append(image_id)
            self.ids = annotated_ids

        # json category ids -> contiguous ids starting at 1, and the inverse
        forward_map = {}
        for position, json_id in enumerate(self.coco.getCatIds()):
            forward_map[json_id] = position + 1
        self.json_category_id_to_contiguous_id = forward_map
        self.contiguous_category_id_to_json_id = {
            contiguous: json_id for json_id, contiguous in forward_map.items()
        }
        # dataset index -> image id
        self.id_to_img_map = dict(enumerate(self.ids))
        self.transforms = transforms

    def __getitem__(self, idx):
        img, annotations = super(COCODataset, self).__getitem__(idx)

        # filter crowd annotations
        # TODO might be better to add an extra field
        objects = [entry for entry in annotations if entry["iscrowd"] == 0]

        xywh = torch.as_tensor([entry["bbox"] for entry in objects])
        xywh = xywh.reshape(-1, 4)  # guard against no boxes
        target = BoxList(xywh, img.size, mode="xywh").convert("xyxy")

        to_contiguous = self.json_category_id_to_contiguous_id
        labels = [to_contiguous[entry["category_id"]] for entry in objects]
        target.add_field("labels", torch.tensor(labels))

        segmentations = [entry["segmentation"] for entry in objects]
        target.add_field("masks", SegmentationMask(segmentations, img.size))

        target = target.clip_to_image(remove_empty=True)

        if self.transforms is None:
            return img, target, idx

        img, target = self.transforms(img, target)
        return img, target, idx

    def get_img_info(self, index):
        """Return the COCO image record for dataset position ``index``."""
        return self.coco.imgs[self.id_to_img_map[index]]
class ConcatDataset(_ConcatDataset):
    """
    torch.utils.data.dataset.ConcatDataset plus helpers that translate a
    global index into (dataset, local index) and fetch per-image info.
    """

    def get_idxs(self, idx):
        """Return ``(dataset_idx, sample_idx)`` for the global index ``idx``."""
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        # samples held by all preceding datasets
        preceding = self.cumulative_sizes[dataset_idx - 1] if dataset_idx != 0 else 0
        return dataset_idx, idx - preceding

    def get_img_info(self, idx):
        """Delegate to ``get_img_info`` of the dataset that owns ``idx``."""
        which, local = self.get_idxs(idx)
        owner = self.datasets[which]
        return owner.get_img_info(local)
class ListDataset(object):
    """
    Dataset over a plain list of image paths.

    Each sample is the RGB image together with a dummy target: one box
    spanning the whole image.
    """

    def __init__(self, image_list, transforms=None):
        # image_list: sequence of image file paths
        self.image_lists = image_list
        # transforms: optional callable applied as transforms(img, target)
        self.transforms = transforms

    def __getitem__(self, item):
        # NOTE(review): returns a 2-tuple, while BatchCollator reads a third
        # element (the image id) from every sample -- confirm before using
        # this dataset with that collator.
        img = Image.open(self.image_lists[item]).convert("RGB")

        # dummy target
        w, h = img.size
        target = BoxList([[0, 0, w, h]], img.size, mode="xyxy")

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.image_lists)

    def get_img_info(self, item):
        """
        Return the image dimensions for the image, without
        loading and pre-processing it

        Returns:
            dict with the "height" and "width" of the image, the two keys
            the aspect-ratio grouping code reads.
        """
        # only .size is read from the opened file; the image is not converted
        # or transformed here
        with Image.open(self.image_lists[item]) as img:
            width, height = img.size
        return {"height": height, "width": width}
class DistributedSampler(Sampler):
    """Sampler that restricts data loading to a subset of the dataset.
    It is especially useful in conjunction with
    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
    process can pass a DistributedSampler instance as a DataLoader sampler,
    and load a subset of the original dataset that is exclusive to it.
    .. note::
        Dataset is assumed to be of constant size.
    Arguments:
        dataset: Dataset used for sampling.
        num_replicas (optional): Number of processes participating in
            distributed training.
        rank (optional): Rank of the current process within num_replicas.
        shuffle (optional): If ``True`` (default) the indices are shuffled
            deterministically per epoch; if ``False`` they are kept in
            dataset order.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        # every replica draws the same number of samples (rounded up)
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
        # honour the caller's choice instead of always shuffling
        self.shuffle = shuffle

    def __iter__(self):
        if self.shuffle:
            # deterministically shuffle based on epoch
            g = torch.Generator()
            g.manual_seed(self.epoch)
            indices = torch.randperm(len(self.dataset), generator=g).tolist()
        else:
            indices = list(range(len(self.dataset)))

        # add extra samples to make it evenly divisible
        indices += indices[: (self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # subsample: each rank takes its own contiguous slice
        offset = self.num_samples * self.rank
        indices = indices[offset : offset + self.num_samples]
        assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        """Set the epoch used to seed the shuffle in ``__iter__``."""
        self.epoch = epoch
+ drop_uneven (bool): If ``True``, the sampler will drop the batches whose + size is less than ``batch_size`` + + """ + + def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): + if not isinstance(sampler, Sampler): + raise ValueError( + "sampler should be an instance of " + "torch.utils.data.Sampler, but got sampler={}".format(sampler) + ) + self.sampler = sampler + self.group_ids = torch.as_tensor(group_ids) + assert self.group_ids.dim() == 1 + self.batch_size = batch_size + self.drop_uneven = drop_uneven + + self.groups = torch.unique(self.group_ids).sort(0)[0] + + self._can_reuse_batches = False + + def _prepare_batches(self): + dataset_size = len(self.group_ids) + # get the sampled indices from the sampler + sampled_ids = torch.as_tensor(list(self.sampler)) + # potentially not all elements of the dataset were sampled + # by the sampler (e.g., DistributedSampler). + # construct a tensor which contains -1 if the element was + # not sampled, and a non-negative number indicating the + # order where the element was sampled. + # for example. 
if sampled_ids = [3, 1] and dataset_size = 5, + # the order is [-1, 1, -1, 0, -1] + order = torch.full((dataset_size,), -1, dtype=torch.int64) + order[sampled_ids] = torch.arange(len(sampled_ids)) + + # get a mask with the elements that were sampled + mask = order >= 0 + + # find the elements that belong to each individual cluster + clusters = [(self.group_ids == i) & mask for i in self.groups] + # get relative order of the elements inside each cluster + # that follows the order from the sampler + relative_order = [order[cluster] for cluster in clusters] + # with the relative order, find the absolute order in the + # sampled space + permutation_ids = [s[s.sort()[1]] for s in relative_order] + # permute each cluster so that they follow the order from + # the sampler + permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] + + # splits each cluster in batch_size, and merge as a list of tensors + splits = [c.split(self.batch_size) for c in permuted_clusters] + merged = tuple(itertools.chain.from_iterable(splits)) + + # now each batch internally has the right order, but + # they are grouped by clusters. Find the permutation between + # different batches that brings them as close as possible to + # the order that we have in the sampler. 
For that, we will consider the + # ordering as coming from the first element of each batch, and sort + # correspondingly + first_element_of_batch = [t[0].item() for t in merged] + # get and inverse mapping from sampled indices and the position where + # they occur (as returned by the sampler) + inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} + # from the first element in each batch, get a relative ordering + first_index_of_batch = torch.as_tensor( + [inv_sampled_ids_map[s] for s in first_element_of_batch] + ) + + # permute the batches so that they approximately follow the order + # from the sampler + permutation_order = first_index_of_batch.sort(0)[1].tolist() + # finally, permute the batches + batches = [merged[i].tolist() for i in permutation_order] + + if self.drop_uneven: + kept = [] + for batch in batches: + if len(batch) == self.batch_size: + kept.append(batch) + batches = kept + return batches + + def __iter__(self): + if self._can_reuse_batches: + batches = self._batches + self._can_reuse_batches = False + else: + batches = self._prepare_batches() + self._batches = batches + return iter(batches) + + def __len__(self): + if not hasattr(self, "_batches"): + self._batches = self._prepare_batches() + self._can_reuse_batches = True + return len(self._batches) diff --git a/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py new file mode 100644 index 000000000..93452b646 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
class IterationBasedBatchSampler(BatchSampler):
    """
    Wraps a BatchSampler, resampling from it until
    a specified number of iterations have been sampled

    Arguments:
        batch_sampler: wrapped batch sampler; it must expose a ``sampler``
            attribute and be re-iterable.
        num_iterations (int): iteration count at which sampling stops.
        start_iter (int): iteration to resume from; ``num_iterations -
            start_iter`` batches are yielded.
    """

    def __init__(self, batch_sampler, num_iterations, start_iter=0):
        self.batch_sampler = batch_sampler
        self.num_iterations = num_iterations
        self.start_iter = start_iter

    def __iter__(self):
        iteration = self.start_iter
        # Strict comparison: once ``num_iterations`` is reached there is
        # nothing left to yield, so do not start another (possibly expensive)
        # pass over the wrapped sampler just to break out of it immediately.
        while iteration < self.num_iterations:
            # if the underlying sampler has a set_epoch method, like
            # DistributedSampler, used for making each process see
            # a different split of the dataset, then set it
            if hasattr(self.batch_sampler.sampler, "set_epoch"):
                self.batch_sampler.sampler.set_epoch(iteration)
            produced = False
            for batch in self.batch_sampler:
                produced = True
                iteration += 1
                if iteration > self.num_iterations:
                    break
                yield batch
            if not produced:
                # an empty wrapped sampler would otherwise spin forever
                raise ValueError(
                    "batch_sampler produced no batches; cannot reach "
                    "num_iterations={}".format(self.num_iterations)
                )

    def __len__(self):
        return self.num_iterations
# ---- maskrcnn_benchmark/data/transforms/build.py ----
def build_transforms(cfg, is_train=True):
    """Assemble the image/target preprocessing pipeline described by ``cfg``.

    Training uses the ``*_TRAIN`` size limits and a flip probability of 0.5
    (hard-coded; ``cfg.INPUT.FLIP_PROB_TRAIN`` is not read). Testing uses the
    ``*_TEST`` limits and never flips.
    """
    settings = cfg.INPUT
    if is_train:
        min_size = settings.MIN_SIZE_TRAIN
        max_size = settings.MAX_SIZE_TRAIN
        flip_prob = 0.5  # cfg.INPUT.FLIP_PROB_TRAIN
    else:
        min_size = settings.MIN_SIZE_TEST
        max_size = settings.MAX_SIZE_TEST
        flip_prob = 0

    steps = [
        T.Resize(min_size, max_size),
        T.RandomHorizontalFlip(flip_prob),
        T.ToTensor(),
        T.Normalize(
            mean=settings.PIXEL_MEAN,
            std=settings.PIXEL_STD,
            to_bgr255=settings.TO_BGR255,
        ),
    ]
    return T.Compose(steps)


# ---- maskrcnn_benchmark/data/transforms/transforms.py ----
class Compose(object):
    """Apply a sequence of ``(image, target)`` transforms in order."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for step in self.transforms:
            image, target = step(image, target)
        return image, target

    def __repr__(self):
        # NOTE(review): the indent inside this literal is reproduced from a
        # whitespace-collapsed source; confirm its width against the real file.
        rows = [self.__class__.__name__ + "("]
        rows.extend(" {0}".format(step) for step in self.transforms)
        return "\n".join(rows) + "\n)"


class Resize(object):
    """Resize so the short side equals ``min_size`` without the long side
    exceeding ``max_size``."""

    def __init__(self, min_size, max_size):
        self.min_size = min_size
        self.max_size = max_size

    # modified from torchvision to add support for max size
    def get_size(self, image_size):
        """Map a ``(w, h)`` image size to the ``(h, w)`` size to resize to."""
        width, height = image_size
        target = self.min_size
        limit = self.max_size
        if limit is not None:
            short_side = float(min((width, height)))
            long_side = float(max((width, height)))
            # shrink the target when the long side would overshoot the limit
            if long_side / short_side * target > limit:
                target = int(round(limit * short_side / long_side))

        # short side already has the requested length: keep the size
        if (width <= height and width == target) or (
            height <= width and height == target
        ):
            return (height, width)

        if width < height:
            return (int(target * height / width), target)
        return (target, int(target * width / height))

    def __call__(self, image, target):
        resized = F.resize(image, self.get_size(image.size))
        return resized, target.resize(resized.size)


class RandomHorizontalFlip(object):
    """Flip image and target horizontally with probability ``prob``."""

    def __init__(self, prob=0.5):
        self.prob = prob

    def __call__(self, image, target):
        if not random.random() < self.prob:
            return image, target
        return F.hflip(image), target.transpose(0)


class ToTensor(object):
    """Convert the image with ``F.to_tensor``; the target passes through."""

    def __call__(self, image, target):
        tensor = F.to_tensor(image)
        return tensor, target


class Normalize(object):
    """Optionally reorder channels and scale by 255, then normalize with
    ``mean`` / ``std``."""

    def __init__(self, mean, std, to_bgr255=True):
        self.mean = mean
        self.std = std
        self.to_bgr255 = to_bgr255

    def __call__(self, image, target):
        pixels = image[[2, 1, 0]] * 255 if self.to_bgr255 else image
        return F.normalize(pixels, mean=self.mean, std=self.std), target
# ---- maskrcnn_benchmark/engine/inference.py ----
def compute_on_dataset(model, data_loader, device):
    """Run ``model`` in eval mode over ``data_loader``.

    Returns a dict mapping each image id yielded by the loader to the
    corresponding model output moved to the CPU.
    """
    model.eval()
    results_dict = {}
    cpu_device = torch.device("cpu")
    for _, batch in tqdm(enumerate(data_loader)):
        images, targets, image_ids = batch
        images = images.to(device)
        with torch.no_grad():
            output = model(images)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {img_id: result for img_id, result in zip(image_ids, output)}
        )
    return results_dict


def prepare_for_coco_detection(predictions, dataset):
    """Convert per-image box predictions into a list of COCO result dicts."""
    # assert isinstance(dataset, COCODataset)
    coco_results = []
    for image_id, prediction in enumerate(predictions):
        original_id = dataset.id_to_img_map[image_id]
        if len(prediction) == 0:
            continue

        # TODO replace with get_img_info?
        image_width = dataset.coco.imgs[original_id]["width"]
        image_height = dataset.coco.imgs[original_id]["height"]
        prediction = prediction.resize((image_width, image_height))
        prediction = prediction.convert("xywh")

        boxes = prediction.bbox.tolist()
        scores = prediction.get_field("scores").tolist()
        labels = prediction.get_field("labels").tolist()

        mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels]

        coco_results.extend(
            [
                {
                    "image_id": original_id,
                    "category_id": mapped_labels[k],
                    "bbox": box,
                    "score": scores[k],
                }
                for k, box in enumerate(boxes)
            ]
        )
    return coco_results


def prepare_for_coco_segmentation(predictions, dataset):
    """Convert per-image mask predictions into RLE-encoded COCO result dicts."""
    import pycocotools.mask as mask_util
    import numpy as np

    masker = Masker(threshold=0.5, padding=1)
    # assert isinstance(dataset, COCODataset)
    coco_results = []
    for image_id, prediction in tqdm(enumerate(predictions)):
        original_id = dataset.id_to_img_map[image_id]
        if len(prediction) == 0:
            continue

        # TODO replace with get_img_info?
        image_width = dataset.coco.imgs[original_id]["width"]
        image_height = dataset.coco.imgs[original_id]["height"]
        prediction = prediction.resize((image_width, image_height))
        masks = prediction.get_field("mask")
        masks = masker(masks, prediction)

        scores = prediction.get_field("scores").tolist()
        labels = prediction.get_field("labels").tolist()

        rles = [
            mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
            for mask in masks
        ]
        # the encoder returns bytes; decode so the result can go through json
        for rle in rles:
            rle["counts"] = rle["counts"].decode("utf-8")

        mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels]

        coco_results.extend(
            [
                {
                    "image_id": original_id,
                    "category_id": mapped_labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                }
                for k, rle in enumerate(rles)
            ]
        )
    return coco_results


# inspired from Detectron
def evaluate_box_proposals(
    predictions, dataset, thresholds=None, area="all", limit=None
):
    """Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official COCO API recall evaluation code. However,
    it produces slightly different results.

    Returns a dict with keys ``ar``, ``recalls``, ``thresholds``,
    ``gt_overlaps`` and ``num_pos``.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],  # all
        [0 ** 2, 32 ** 2],  # small
        [32 ** 2, 96 ** 2],  # medium
        [96 ** 2, 1e5 ** 2],  # large
        [96 ** 2, 128 ** 2],  # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],  # 512-inf
    ]
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for image_id, prediction in enumerate(predictions):
        original_id = dataset.id_to_img_map[image_id]

        # TODO replace with get_img_info?
        image_width = dataset.coco.imgs[original_id]["width"]
        image_height = dataset.coco.imgs[original_id]["height"]
        prediction = prediction.resize((image_width, image_height))

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = prediction.get_field("objectness").sort(descending=True)[1]
        prediction = prediction[inds]

        ann_ids = dataset.coco.getAnnIds(imgIds=original_id)
        anno = dataset.coco.loadAnns(ann_ids)
        gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert(
            "xyxy"
        )
        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])

        if len(gt_boxes) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        if len(prediction) == 0:
            continue

        if limit is not None and len(prediction) > limit:
            prediction = prediction[:limit]

        overlaps = boxlist_iou(prediction, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(prediction), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)

    if gt_overlaps:
        gt_overlaps = torch.cat(gt_overlaps, dim=0)
    else:
        # no image contributed any overlap; torch.cat rejects an empty list
        gt_overlaps = torch.zeros(0)
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }


def evaluate_predictions_on_coco(
    coco_gt, coco_results, json_result_file, iou_type="bbox"
):
    """Dump ``coco_results`` to ``json_result_file`` and run ``COCOeval`` on it.

    Returns the ``COCOeval`` object after evaluate / accumulate / summarize.
    """
    import json

    with open(json_result_file, "w") as f:
        json.dump(coco_results, f)

    from pycocotools.cocoeval import COCOeval

    coco_dt = coco_gt.loadRes(str(json_result_file))
    # coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval


def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    """Gather the per-process prediction dicts and merge them into one list.

    Returns ``None`` on non-main processes; on the main process, the
    predictions ordered by image id.
    """
    all_predictions = scatter_gather(predictions_per_gpu)
    if not is_main_process():
        return
    # merge the list of dicts
    predictions = {}
    for p in all_predictions:
        predictions.update(p)
    # convert a dict where the key is the index in a list
    image_ids = list(sorted(predictions.keys()))
    # an empty gather has no last id to inspect, and nothing is missing
    if image_ids and len(image_ids) != image_ids[-1] + 1:
        logger = logging.getLogger("maskrcnn_benchmark.inference")
        logger.warning(
            "Number of images that were gathered from multiple processes is not "
            "a contiguous set. Some images might be missing from the evaluation"
        )

    # convert to a list
    predictions = [predictions[i] for i in image_ids]
    return predictions


class COCOResults(object):
    """Ordered container of COCO metrics, one ``OrderedDict`` per iou type."""

    METRICS = {
        "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
        "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
        "box_proposal": [
            "AR@100",
            "ARs@100",
            "ARm@100",
            "ARl@100",
            "AR@1000",
            "ARs@1000",
            "ARm@1000",
            "ARl@1000",
        ],
        "keypoint": ["AP", "AP50", "AP75", "APm", "APl"],
    }

    def __init__(self, *iou_types):
        allowed_types = ("box_proposal", "bbox", "segm")
        assert all(iou_type in allowed_types for iou_type in iou_types)
        results = OrderedDict()
        for iou_type in iou_types:
            # -1 marks a metric that has not been filled in yet
            results[iou_type] = OrderedDict(
                [(metric, -1) for metric in COCOResults.METRICS[iou_type]]
            )
        self.results = results

    def update(self, coco_eval):
        """Copy the leading ``coco_eval.stats`` values into the matching
        metrics; a ``None`` argument is ignored."""
        if coco_eval is None:
            return
        from pycocotools.cocoeval import COCOeval

        assert isinstance(coco_eval, COCOeval)
        s = coco_eval.stats
        iou_type = coco_eval.params.iouType
        res = self.results[iou_type]
        metrics = COCOResults.METRICS[iou_type]
        for idx, metric in enumerate(metrics):
            res[metric] = s[idx]

    def __repr__(self):
        # TODO make it pretty
        return repr(self.results)


def check_expected_results(results, expected_results, sigma_tol):
    """Log PASS/FAIL for every ``(task, metric, (mean, std))`` expectation.

    A value passes when it lies strictly within ``mean +/- sigma_tol * std``.
    Nothing is raised; failures are logged at error level.
    """
    if not expected_results:
        return

    logger = logging.getLogger("maskrcnn_benchmark.inference")
    for task, metric, (mean, std) in expected_results:
        actual_val = results.results[task][metric]
        lo = mean - sigma_tol * std
        hi = mean + sigma_tol * std
        ok = (lo < actual_val) and (actual_val < hi)
        msg = (
            "{} > {} sanity check (actual vs. expected): "
            "{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})"
        ).format(task, metric, actual_val, mean, std, lo, hi)
        if not ok:
            msg = "FAIL: " + msg
            logger.error(msg)
        else:
            msg = "PASS: " + msg
            logger.info(msg)


def inference(
    model,
    data_loader,
    iou_types=("bbox",),
    box_only=False,
    device="cuda",
    expected_results=(),
    expected_results_sigma_tol=4,
    output_folder=None,
):
    """Run the model over ``data_loader`` and evaluate it in COCO style.

    Only the main process evaluates; the others return after the gather.
    With ``box_only`` the proposal recall is evaluated, otherwise one
    ``COCOeval`` is run per entry of ``iou_types``. When ``output_folder`` is
    given, predictions and results are saved there.
    """
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = (
        torch.distributed.deprecated.get_world_size()
        if torch.distributed.deprecated.is_initialized()
        else 1
    )
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} images".format(len(dataset)))
    start_time = time.time()
    predictions = compute_on_dataset(model, data_loader, device)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    if box_only:
        logger.info("Evaluating bbox proposals")
        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
        res = COCOResults("box_proposal")
        for limit in [100, 1000]:
            for area, suffix in areas.items():
                stats = evaluate_box_proposals(
                    predictions, dataset, area=area, limit=limit
                )
                key = "AR{}@{:d}".format(suffix, limit)
                res.results["box_proposal"][key] = stats["ar"].item()
        logger.info(res)
        check_expected_results(res, expected_results, expected_results_sigma_tol)
        if output_folder:
            torch.save(res, os.path.join(output_folder, "box_proposals.pth"))
        return
    logger.info("Preparing results for COCO format")
    coco_results = {}
    if "bbox" in iou_types:
        logger.info("Preparing bbox results")
        coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset)
    if "segm" in iou_types:
        logger.info("Preparing segm results")
        coco_results["segm"] = prepare_for_coco_segmentation(predictions, dataset)

    results = COCOResults(*iou_types)
    logger.info("Evaluating predictions")
    for iou_type in iou_types:
        # the temporary file is only used when no output folder was given
        with tempfile.NamedTemporaryFile() as f:
            file_path = f.name
            if output_folder:
                file_path = os.path.join(output_folder, iou_type + ".json")
            res = evaluate_predictions_on_coco(
                dataset.coco, coco_results[iou_type], file_path, iou_type
            )
            results.update(res)
    logger.info(results)
    check_expected_results(results, expected_results, expected_results_sigma_tol)
    if output_folder:
        torch.save(results, os.path.join(output_folder, "coco_results.pth"))


# ---- maskrcnn_benchmark/engine/trainer.py ----
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        # Stack in sorted-key order so every process lines its losses up in
        # the same positions before the element-wise reduce.
        loss_names = sorted(loss_dict.keys())
        all_losses = torch.stack([loss_dict[k] for k in loss_names], dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses


def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    """Run the training loop over ``data_loader``.

    ``arguments["iteration"]`` gives the starting iteration and is updated in
    place. A checkpoint is saved every ``checkpoint_period`` iterations and
    once more after the loop.
    """
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    # NOTE(review): delimiter reproduced from a whitespace-collapsed source;
    # confirm its width against the real file.
    meters = MetricLogger(delimiter=" ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    # keep ``iteration`` defined for the final checkpoint even when the
    # loader yields nothing
    iteration = start_iter
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == (max_iter - 1):
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0 and iteration > 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            # avoid dividing by zero for an empty loader
            total_time_str, total_training_time / max(max_iter, 1)
        )
    )
def _load_C_extensions():
    """Build and load the C++/CUDA layer extension from the ``csrc`` folder.

    ``csrc`` is looked up next to the parent directory of this file. The
    top-level and ``cpu`` ``*.cpp`` files are always compiled; the ``cuda``
    ``*.cu`` files and the ``-DWITH_CUDA`` flag are added only when CUDA is
    available and ``CUDA_HOME`` is set. Returns whatever ``load_ext`` returns.
    """
    package_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    csrc_dir = os.path.join(package_dir, "csrc")

    sources = glob.glob(os.path.join(csrc_dir, "*.cpp"))
    sources += glob.glob(os.path.join(csrc_dir, "cpu", "*.cpp"))

    cflags = []
    if torch.cuda.is_available() and CUDA_HOME is not None:
        sources.extend(glob.glob(os.path.join(csrc_dir, "cuda", "*.cu")))
        cflags = ["-DWITH_CUDA"]

    # same join as before; a no-op for paths that are already absolute
    sources = [os.path.join(csrc_dir, path) for path in sources]
    return load_ext(
        "torchvision",
        sources,
        extra_cflags=cflags,
        extra_include_paths=[csrc_dir],
    )
class FrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters
    are fixed

    All four tensors are registered as buffers (not parameters); each has
    ``n`` entries.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        # (buffer name, initializer) in registration order
        buffers = (
            ("weight", torch.ones),
            ("bias", torch.zeros),
            ("running_mean", torch.zeros),
            ("running_var", torch.ones),
        )
        for name, make in buffers:
            self.register_buffer(name, make(n))

    def forward(self, x):
        # fold the statistics and the affine terms into one scale and shift
        scale = self.weight * self.running_var.rsqrt()
        shift = self.bias - self.running_mean * scale
        # reshape to (1, n, 1, 1) so both broadcast against x
        return x * scale.reshape(1, -1, 1, 1) + shift.reshape(1, -1, 1, 1)
# ---- maskrcnn_benchmark/layers/misc.py ----
class _NewEmptyTensorOp(torch.autograd.Function):
    """Autograd function producing an uninitialized tensor of a given shape;
    the backward pass returns an empty tensor shaped like the input."""

    @staticmethod
    def forward(ctx, x, new_shape):
        # remember the input shape so backward can produce a matching gradient
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        return _NewEmptyTensorOp.apply(grad, shape), None


class Conv2d(torch.nn.Conv2d):
    """``torch.nn.Conv2d`` that also accepts inputs with zero elements."""

    def forward(self, x):
        if x.numel() > 0:
            return super(Conv2d, self).forward(x)
        # empty input: compute the output shape by hand
        output_shape = [
            (i + 2 * p - (di * (k - 1) + 1)) // d + 1
            for i, p, di, k, d in zip(
                x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
            )
        ]
        output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)


class ConvTranspose2d(torch.nn.ConvTranspose2d):
    """``torch.nn.ConvTranspose2d`` that also accepts inputs with zero
    elements."""

    def forward(self, x):
        if x.numel() > 0:
            return super(ConvTranspose2d, self).forward(x)
        # empty input: compute the output shape by hand
        output_shape = [
            (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op
            for i, p, di, k, d, op in zip(
                x.shape[-2:],
                self.padding,
                self.dilation,
                self.kernel_size,
                self.stride,
                self.output_padding,
            )
        ]
        # use out_channels rather than the bias length: bias is None when the
        # layer is built with bias=False
        output_shape = [x.shape[0], self.out_channels] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)


def interpolate(
    input, size=None, scale_factor=None, mode="nearest", align_corners=None
):
    """``torch.nn.functional.interpolate`` that also accepts inputs with zero
    elements.

    Non-empty inputs are forwarded unchanged. For an empty input the output
    shape is derived from ``size`` or ``scale_factor`` over the last two
    dimensions.

    Raises:
        ValueError: for an empty input, when neither or both of ``size`` and
            ``scale_factor`` are given, or when a tuple ``scale_factor`` does
            not have two entries.
    """
    if input.numel() > 0:
        return torch.nn.functional.interpolate(
            input, size, scale_factor, mode, align_corners
        )

    def _check_size_scale_factor(dim):
        if size is None and scale_factor is None:
            raise ValueError("either size or scale_factor should be defined")
        if size is not None and scale_factor is not None:
            raise ValueError("only one of size or scale_factor should be defined")
        if (
            scale_factor is not None
            and isinstance(scale_factor, tuple)
            and len(scale_factor) != dim
        ):
            raise ValueError(
                "scale_factor shape must match input shape. "
                "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor))
            )

    def _output_size(dim):
        _check_size_scale_factor(dim)
        if size is not None:
            # a single int means the same size on every spatial dimension
            return _ntuple(dim)(size)
        scale_factors = _ntuple(dim)(scale_factor)
        # math.floor might return float in py2.7
        return [
            int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim)
        ]

    output_shape = tuple(_output_size(2))
    output_shape = input.shape[:-2] + output_shape
    return _NewEmptyTensorOp.apply(input, output_shape)
class _ROIAlign(Function):
    """Autograd wrapper around the compiled ``_C.roi_align_forward`` / ``_C.roi_align_backward``."""

    @staticmethod
    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
        # Normalise once so that both an int and an (h, w) pair are accepted;
        # the raw argument was previously indexed directly, which fails for
        # an int.
        output_size = _pair(output_size)
        ctx.save_for_backward(roi)
        ctx.output_size = output_size
        ctx.spatial_scale = spatial_scale
        ctx.sampling_ratio = sampling_ratio
        ctx.input_shape = input.size()
        output = _C.roi_align_forward(
            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
        )
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        rois, = ctx.saved_tensors
        output_size = ctx.output_size
        spatial_scale = ctx.spatial_scale
        sampling_ratio = ctx.sampling_ratio
        bs, ch, h, w = ctx.input_shape
        grad_input = _C.roi_align_backward(
            grad_output,
            rois,
            spatial_scale,
            output_size[0],
            output_size[1],
            bs,
            ch,
            h,
            w,
            sampling_ratio,
        )
        # Only ``input`` receives a gradient; the other forward arguments
        # get None.
        return grad_input, None, None, None, None


roi_align = _ROIAlign.apply


class ROIAlign(nn.Module):
    """Module form of ``roi_align`` that stores the pooling hyper-parameters."""

    def __init__(self, output_size, spatial_scale, sampling_ratio):
        super(ROIAlign, self).__init__()
        self.output_size = output_size
        self.spatial_scale = spatial_scale
        self.sampling_ratio = sampling_ratio

    def forward(self, input, rois):
        return roi_align(
            input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
        )

    def __repr__(self):
        tmpstr = self.__class__.__name__ + "("
        tmpstr += "output_size=" + str(self.output_size)
        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
        tmpstr += ")"
        return tmpstr
# TODO maybe push this to nn?
def smooth_l1_loss(input, target, beta=1. / 9, size_average=True):
    """
    very similar to the smooth_l1_loss from pytorch, but with
    the extra beta parameter

    Arguments:
        input (Tensor): predictions
        target (Tensor): regression targets, broadcastable against ``input``
        beta (float): absolute error below which the quadratic branch is used
        size_average (bool): return the mean if True, otherwise the sum

    Returns:
        Tensor: scalar loss
    """
    n = torch.abs(input - target)
    if beta < 1e-5:
        # With a (near-)zero beta the quadratic branch is never selected, but
        # evaluating ``n ** 2 / beta`` inside torch.where would still divide
        # by zero and poison the gradient with NaNs. The loss is plain L1.
        loss = n
    else:
        cond = n < beta
        loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
    if size_average:
        return loss.mean()
    return loss.sum()
class FPN(nn.Module):
    """
    Module that adds FPN on top of a list of feature maps.
    The feature maps are currently supposed to be in increasing depth
    order, and must be consecutive
    """

    def __init__(self, in_channels_list, out_channels, top_blocks=None):
        """
        Arguments:
            in_channels_list (list[int]): number of channels for each feature map that
                will be fed
            out_channels (int): number of channels of the FPN representation
            top_blocks (nn.Module or None): if provided, an extra operation will
                be performed on the output of the last (smallest resolution)
                FPN output, and the result will extend the result list
        """
        super(FPN, self).__init__()
        # Only the attribute *names* are kept in these lists; the modules
        # themselves are registered through add_module below.
        self.inner_blocks = []
        self.layer_blocks = []
        for idx, in_channels in enumerate(in_channels_list, 1):
            inner_block = "fpn_inner{}".format(idx)
            layer_block = "fpn_layer{}".format(idx)
            inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
            layer_block_module = nn.Conv2d(out_channels, out_channels, 3, 1, 1)
            for module in [inner_block_module, layer_block_module]:
                # Caffe2 implementation uses XavierFill, which in fact
                # corresponds to kaiming_uniform_ in PyTorch
                nn.init.kaiming_uniform_(module.weight, a=1)
                nn.init.constant_(module.bias, 0)
            self.add_module(inner_block, inner_block_module)
            self.add_module(layer_block, layer_block_module)
            self.inner_blocks.append(inner_block)
            self.layer_blocks.append(layer_block)
        self.top_blocks = top_blocks

    def forward(self, x):
        """
        Arguments:
            x (list[Tensor]): feature maps for each feature level.
        Returns:
            results (tuple[Tensor]): feature maps after FPN layers.
                They are ordered from highest resolution first.
        """
        last_inner = getattr(self, self.inner_blocks[-1])(x[-1])
        results = [getattr(self, self.layer_blocks[-1])(last_inner)]
        # Walk from the second-coarsest level up to the finest one.
        for feature, inner_block, layer_block in zip(
            x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1]
        ):
            inner_lateral = getattr(self, inner_block)(feature)
            # Upsample to the lateral map's own spatial size instead of a
            # fixed factor of 2, so odd-sized maps (e.g. 9 -> 5 after a
            # stride-2 stage) still line up. For exact 2x ratios the result
            # is the same as scale_factor=2.
            inner_top_down = F.interpolate(
                last_inner, size=inner_lateral.shape[-2:], mode="nearest"
            )
            last_inner = inner_lateral + inner_top_down
            results.insert(0, getattr(self, layer_block)(last_inner))

        if self.top_blocks is not None:
            # top_blocks returns a list of extra levels computed from the
            # coarsest FPN output.
            last_results = self.top_blocks(results[-1])
            results.extend(last_results)

        return tuple(results)


class LastLevelMaxPool(nn.Module):
    """Extra FPN level obtained by subsampling the input with stride 2."""

    def forward(self, x):
        # kernel_size=1, stride=2, padding=0: keeps every other position.
        return [F.max_pool2d(x, 1, 2, 0)]
# ResNet stage specification
StageSpec = namedtuple(
    "StageSpec",
    [
        "index",  # Index of the stage, eg 1, 2, ..,. 5
        "block_count",  # Number of residual blocks in the stage
        "return_features",  # True => return the last feature map from this stage
    ],
)

# -----------------------------------------------------------------------------
# Standard ResNet models
# -----------------------------------------------------------------------------
# NOTE: the specs are materialised as tuples. A bare generator expression can
# be iterated only once, so a second model built from the same module-level
# spec would silently get zero stages.
# ResNet-50 (including all stages)
ResNet50StagesTo5 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True))
)
# ResNet-50 up to stage 4 (excludes stage 5)
ResNet50StagesTo4 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, True))
)
# ResNet-50-FPN (including all stages)
ResNet50FPNStagesTo5 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True))
)
# ResNet-101-FPN (including all stages)
ResNet101FPNStagesTo5 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True))
)
self.return_features = {} + for stage_spec in stage_specs: + name = "layer" + str(stage_spec.index) + stage2_relative_factor = 2 ** (stage_spec.index - 1) + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + out_channels = stage2_out_channels * stage2_relative_factor + module = _make_stage( + transformation_module, + in_channels, + bottleneck_channels, + out_channels, + stage_spec.block_count, + num_groups, + cfg.MODEL.RESNETS.STRIDE_IN_1X1, + first_stride=int(stage_spec.index > 1) + 1, + ) + in_channels = out_channels + self.add_module(name, module) + self.stages.append(name) + self.return_features[name] = stage_spec.return_features + + # Optionally freeze (requires_grad=False) parts of the backbone + self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) + + def _freeze_backbone(self, freeze_at): + for stage_index in range(freeze_at): + if stage_index == 0: + m = self.stem # stage 0 is the stem + else: + m = getattr(self, "layer" + str(stage_index)) + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x): + outputs = [] + x = self.stem(x) + for stage_name in self.stages: + x = getattr(self, stage_name)(x) + if self.return_features[stage_name]: + outputs.append(x) + return outputs + + +class ResNetHead(nn.Module): + def __init__( + self, + block_module, + stages, + num_groups=1, + width_per_group=64, + stride_in_1x1=True, + stride_init=None, + res2_out_channels=256, + ): + super(ResNetHead, self).__init__() + + stage2_relative_factor = 2 ** (stages[0].index - 1) + stage2_bottleneck_channels = num_groups * width_per_group + out_channels = res2_out_channels * stage2_relative_factor + in_channels = out_channels // 2 + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + + block_module = _TRANSFORMATION_MODULES[block_module] + + self.stages = [] + stride = stride_init + for stage in stages: + name = "layer" + str(stage.index) + if not stride: + stride = int(stage.index > 1) + 1 + module = 
_make_stage( + block_module, + in_channels, + bottleneck_channels, + out_channels, + stage.block_count, + num_groups, + stride_in_1x1, + first_stride=stride, + ) + stride = None + self.add_module(name, module) + self.stages.append(name) + + def forward(self, x): + for stage in self.stages: + x = getattr(self, stage)(x) + return x + + +def _make_stage( + transformation_module, + in_channels, + bottleneck_channels, + out_channels, + block_count, + num_groups, + stride_in_1x1, + first_stride, +): + blocks = [] + stride = first_stride + for _ in range(block_count): + blocks.append( + transformation_module( + in_channels, + bottleneck_channels, + out_channels, + num_groups, + stride_in_1x1, + stride, + ) + ) + stride = 1 + in_channels = out_channels + return nn.Sequential(*blocks) + + +class BottleneckWithFixedBatchNorm(nn.Module): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + ): + super(BottleneckWithFixedBatchNorm, self).__init__() + + self.downsample = None + if in_channels != out_channels: + self.downsample = nn.Sequential( + Conv2d( + in_channels, out_channels, kernel_size=1, stride=stride, bias=False + ), + FrozenBatchNorm2d(out_channels), + ) + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + ) + self.bn1 = FrozenBatchNorm2d(bottleneck_channels) + # TODO: specify init for the above + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=1, + bias=False, + groups=num_groups, + ) + self.bn2 = FrozenBatchNorm2d(bottleneck_channels) + + self.conv3 = Conv2d( + bottleneck_channels, out_channels, kernel_size=1, 
bias=False + ) + self.bn3 = FrozenBatchNorm2d(out_channels) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = F.relu_(out) + + out = self.conv2(out) + out = self.bn2(out) + out = F.relu_(out) + + out0 = self.conv3(out) + out = self.bn3(out0) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = F.relu_(out) + + return out + + +class StemWithFixedBatchNorm(nn.Module): + def __init__(self, cfg): + super(StemWithFixedBatchNorm, self).__init__() + + out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + + self.conv1 = Conv2d( + 3, out_channels, kernel_size=7, stride=2, padding=3, bias=False + ) + self.bn1 = FrozenBatchNorm2d(out_channels) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu_(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + + +_TRANSFORMATION_MODULES = {"BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm} + +_STEM_MODULES = {"StemWithFixedBatchNorm": StemWithFixedBatchNorm} + +_STAGE_SPECS = { + "R-50-C4": ResNet50StagesTo4, + "R-50-C5": ResNet50StagesTo5, + "R-50-FPN": ResNet50FPNStagesTo5, + "R-101-FPN": ResNet101FPNStagesTo5, +} + + +def register_transformation_module(module_name, module): + _register_generic(_TRANSFORMATION_MODULES, module_name, module) + + +def register_stem_module(module_name, module): + _register_generic(_STEM_MODULES, module_name, module) + + +def register_stage_spec(stage_spec_name, stage_spec): + _register_generic(_STAGE_SPECS, stage_spec_name, stage_spec) + + +def _register_generic(module_dict, module_name, module): + assert module_name not in module_dict + module_dict[module_name] = module diff --git a/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py new file mode 100644 index 000000000..1c9953f14 --- /dev/null +++ b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py @@ -0,0 +1,68 @@ +# 
class BalancedPositiveNegativeSampler(object):
    """
    This class samples batches, ensuring that they contain a fixed proportion of positives
    """

    def __init__(self, batch_size_per_image, positive_fraction):
        """
        Arguments:
            batch_size_per_image (int): number of elements to be selected per image
            positive_fraction (float): percentage of positive elements per batch
        """
        self.batch_size_per_image = batch_size_per_image
        self.positive_fraction = positive_fraction

    def __call__(self, matched_idxs):
        """
        Arguments:
            matched idxs: list of tensors containing -1, 0 or positive values.
                Each tensor corresponds to a specific image.
                -1 values are ignored, 0 are considered as negatives and > 0 as
                positives.

        Returns:
            pos_idx (list[tensor])
            neg_idx (list[tensor])

        Returns two lists of binary masks for each image.
        The first list contains the positive elements that were selected,
        and the second list the negative example.
        """
        pos_idx = []
        neg_idx = []
        for matched_idxs_per_image in matched_idxs:
            positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
            negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)

            num_pos = int(self.batch_size_per_image * self.positive_fraction)
            # protect against not enough positive examples
            num_pos = min(positive.numel(), num_pos)
            # negatives fill whatever part of the batch the positives left
            num_neg = self.batch_size_per_image - num_pos
            # protect against not enough negative examples
            num_neg = min(negative.numel(), num_neg)

            # randomly select positive and negative examples; build the
            # permutations on the device of the tensors they index
            perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
            perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]

            pos_idx_per_image = positive[perm1]
            neg_idx_per_image = negative[perm2]

            # create binary mask from indices
            pos_idx_per_image_mask = torch.zeros_like(
                matched_idxs_per_image, dtype=torch.uint8
            )
            neg_idx_per_image_mask = torch.zeros_like(
                matched_idxs_per_image, dtype=torch.uint8
            )
            pos_idx_per_image_mask[pos_idx_per_image] = 1
            neg_idx_per_image_mask[neg_idx_per_image] = 1

            pos_idx.append(pos_idx_per_image_mask)
            neg_idx.append(neg_idx_per_image_mask)

        return pos_idx, neg_idx
/ 16)): + """ + Arguments: + weights (4-element tuple) + bbox_xform_clip (float) + """ + self.weights = weights + self.bbox_xform_clip = bbox_xform_clip + + def encode(self, reference_boxes, proposals): + """ + Encode a set of proposals with respect to some + reference boxes + + Arguments: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + """ + + TO_REMOVE = 1 # TODO remove + ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE + ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE + ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths + ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights + + gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE + gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE + gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = self.weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * torch.log(gt_widths / ex_widths) + targets_dh = wh * torch.log(gt_heights / ex_heights) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + def decode(self, rel_codes, boxes): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes + boxes (Tensor): reference boxes. 
+ """ + + boxes = boxes.to(rel_codes.dtype) + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.weights + dx = rel_codes[:, 0::4] / wx + dy = rel_codes[:, 1::4] / wy + dw = rel_codes[:, 2::4] / ww + dh = rel_codes[:, 3::4] / wh + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=self.bbox_xform_clip) + dh = torch.clamp(dh, max=self.bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + pred_boxes = torch.zeros_like(rel_codes) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + return pred_boxes diff --git a/maskrcnn_benchmark/modeling/detector/__init__.py b/maskrcnn_benchmark/modeling/detector/__init__.py new file mode 100644 index 000000000..ff421e281 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .detectors import build_detection_model diff --git a/maskrcnn_benchmark/modeling/detector/detectors.py b/maskrcnn_benchmark/modeling/detector/detectors.py new file mode 100644 index 000000000..af2100cac --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/detectors.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+from .generalized_rcnn import GeneralizedRCNN + + +_DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN} + + +def build_detection_model(cfg): + meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] + return meta_arch(cfg) diff --git a/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py new file mode 100644 index 000000000..63b5868f1 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py @@ -0,0 +1,65 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Implements the Generalized R-CNN framework +""" + +import torch +from torch import nn + +from maskrcnn_benchmark.structures.image_list import to_image_list + +from ..backbone import build_backbone +from ..rpn.rpn import build_rpn +from ..roi_heads.roi_heads import build_roi_heads + + +class GeneralizedRCNN(nn.Module): + """ + Main class for Generalized R-CNN. Currently supports boxes and masks. + It consists of three main parts: + - backbone + = rpn + - heads: takes the features + the proposals from the RPN and computes + detections / masks from it. + """ + + def __init__(self, cfg): + super(GeneralizedRCNN, self).__init__() + + self.backbone = build_backbone(cfg) + self.rpn = build_rpn(cfg) + self.roi_heads = build_roi_heads(cfg) + + def forward(self, images, targets=None): + """ + Arguments: + images (list[Tensor] or ImageList): images to be processed + targets (list[BoxList]): ground-truth boxes present in the image (optional) + + Returns: + result (list[BoxList] or dict[Tensor]): the output from the model. + During training, it returns a dict[Tensor] which contains the losses. + During testing, it returns list[BoxList] contains additional fields + like `scores`, `labels` and `mask` (for Mask R-CNN models). 
class Matcher(object):
    """
    This class assigns to each predicted "element" (e.g., a box) a ground-truth
    element. Each predicted element will have exactly zero or one matches; each
    ground-truth element may be assigned to zero or more predicted elements.

    Matching is based on the MxN match_quality_matrix, that characterizes how well
    each (ground-truth, predicted)-pair match. For example, if the elements are
    boxes, the matrix may contain box IoU overlap values.

    The matcher returns a tensor of size N containing the index of the ground-truth
    element m that matches to prediction n. If there is no match, a negative value
    is returned.
    """

    BELOW_LOW_THRESHOLD = -1
    BETWEEN_THRESHOLDS = -2

    def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
        """
        Args:
            high_threshold (float): quality values greater than or equal to
                this value are candidate matches.
            low_threshold (float): a lower quality threshold used to stratify
                matches into three levels:
                1) matches >= high_threshold
                2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold)
                3) BELOW_LOW_THRESHOLD matches in [0, low_threshold)
            allow_low_quality_matches (bool): if True, produce additional matches
                for predictions that have only low-quality match candidates. See
                set_low_quality_matches_ for more details.
        """
        assert low_threshold <= high_threshold
        self.high_threshold = high_threshold
        self.low_threshold = low_threshold
        self.allow_low_quality_matches = allow_low_quality_matches

    def __call__(self, match_quality_matrix):
        """
        Args:
            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
            pairwise quality between M ground-truth elements and N predicted elements.

        Returns:
            matches (Tensor[int64]): an N tensor where N[i] is a matched gt in
            [0, M - 1] or a negative value indicating that prediction i could not
            be matched.
        """
        if match_quality_matrix.numel() == 0:
            # handle empty case: there is nothing to take a max over, but the
            # result must still have one entry per prediction. With no
            # ground-truth (M == 0, N > 0) every prediction is unmatched;
            # with no predictions (N == 0) the result is empty.
            device = match_quality_matrix.device
            num_preds = (
                match_quality_matrix.shape[1] if match_quality_matrix.dim() > 1 else 0
            )
            return torch.full(
                (num_preds,),
                Matcher.BELOW_LOW_THRESHOLD,
                dtype=torch.int64,
                device=device,
            )

        # match_quality_matrix is M (gt) x N (predicted)
        # Max over gt elements (dim 0) to find best gt candidate for each prediction
        matched_vals, matches = match_quality_matrix.max(dim=0)
        if self.allow_low_quality_matches:
            # keep the unthresholded assignment for set_low_quality_matches_
            all_matches = matches.clone()

        # Assign candidate matches with low quality to negative (unassigned) values
        below_low_threshold = matched_vals < self.low_threshold
        between_thresholds = (matched_vals >= self.low_threshold) & (
            matched_vals < self.high_threshold
        )
        matches[below_low_threshold] = Matcher.BELOW_LOW_THRESHOLD
        matches[between_thresholds] = Matcher.BETWEEN_THRESHOLDS

        if self.allow_low_quality_matches:
            self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)

        return matches

    def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix):
        """
        Produce additional matches for predictions that have only low-quality matches.
        Specifically, for each ground-truth find the set of predictions that have
        maximum overlap with it (including ties); for each prediction in that set, if
        it is unmatched, then match it to the ground-truth with which it has the highest
        quality value.

        ``matches`` is modified in place.
        """
        # For each gt, find the prediction with which it has highest quality
        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
        # Find highest quality match available, even if it is low, including ties
        gt_pred_pairs_of_highest_quality = torch.nonzero(
            match_quality_matrix == highest_quality_foreach_gt[:, None]
        )
        # Example gt_pred_pairs_of_highest_quality:
        #   tensor([[    0, 39796],
        #           [    1, 32055],
        #           [    1, 32070],
        #           [    2, 39190],
        #           [    2, 40255],
        #           [    3, 40390],
        #           [    3, 41455],
        #           [    4, 45470],
        #           [    5, 45325],
        #           [    5, 46390]])
        # Each row is a (gt index, prediction index)
        # Note how gt items 1, 2, 3, and 5 each have two ties

        pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1]
        # Restore the pre-threshold assignment for those predictions.
        matches[pred_inds_to_update] = all_matches[pred_inds_to_update]
class Pooler(nn.Module):
    """
    Pooler for Detection with or without FPN.
    It currently hard-code ROIAlign in the implementation,
    but that can be made more generic later on.
    Also, the requirement of passing the scales is not strictly necessary, as they
    can be inferred from the size of the feature map / size of original image,
    which is available thanks to the BoxList.
    """

    def __init__(self, output_size, scales, sampling_ratio):
        """
        Arguments:
            output_size (int or tuple[int, int]): output size for the pooled region
            scales (list[float]): scales for each Pooler
            sampling_ratio (int): sampling ratio for ROIAlign
        """
        super(Pooler, self).__init__()
        poolers = []
        for scale in scales:
            poolers.append(
                ROIAlign(
                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio
                )
            )
        self.poolers = nn.ModuleList(poolers)
        self.output_size = output_size
        # get the levels in the feature map by leveraging the fact that the network always
        # downsamples by a factor of 2 at each level.
        lvl_min = -math.log2(scales[0])
        lvl_max = -math.log2(scales[-1])
        self.map_levels = LevelMapper(lvl_min, lvl_max)

    def convert_to_roi_format(self, boxes):
        """Concatenate the boxes of all images into rows of (image index, box coordinates)."""
        concat_boxes = cat([b.bbox for b in boxes], dim=0)
        device, dtype = concat_boxes.device, concat_boxes.dtype
        # One column holding, for every box, the position of its image in ``boxes``.
        ids = cat(
            [
                torch.full((len(b), 1), i, dtype=dtype, device=device)
                for i, b in enumerate(boxes)
            ],
            dim=0,
        )
        rois = torch.cat([ids, concat_boxes], dim=1)
        return rois

    def forward(self, x, boxes):
        """
        Arguments:
            x (list[Tensor]): feature maps for each level
            boxes (list[BoxList]): boxes to be used to perform the pooling operation.
        Returns:
            result (Tensor)
        """
        num_levels = len(self.poolers)
        rois = self.convert_to_roi_format(boxes)
        if num_levels == 1:
            return self.poolers[0](x[0], rois)

        levels = self.map_levels(boxes)

        num_rois = len(rois)
        num_channels = x[0].shape[1]
        # Accept both an int and an (h, w) pair, and honour non-square sizes;
        # the buffer previously used output_size[0] for both dimensions.
        if isinstance(self.output_size, int):
            output_h = output_w = self.output_size
        else:
            output_h, output_w = self.output_size

        dtype, device = x[0].dtype, x[0].device
        result = torch.zeros(
            (num_rois, num_channels, output_h, output_w),
            dtype=dtype,
            device=device,
        )
        # Pool each RoI from the level it was mapped to and scatter the
        # outputs back into their original RoI order.
        for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)):
            idx_in_level = torch.nonzero(levels == level).squeeze(1)
            rois_per_level = rois[idx_in_level]
            result[idx_in_level] = pooler(per_level_feature, rois_per_level)

        return result
Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + +from .roi_box_feature_extractors import make_roi_box_feature_extractor +from .roi_box_predictors import make_roi_box_predictor +from .inference import make_roi_box_post_processor +from .loss import make_roi_box_loss_evaluator + + +class ROIBoxHead(torch.nn.Module): + """ + Generic Box Head class. + """ + + def __init__(self, cfg): + super(ROIBoxHead, self).__init__() + self.feature_extractor = make_roi_box_feature_extractor(cfg) + self.predictor = make_roi_box_predictor(cfg) + self.post_processor = make_roi_box_post_processor(cfg) + self.loss_evaluator = make_roi_box_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the subsampled proposals + are returned. During testing, the predicted boxlists are returned + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. + """ + + if self.training: + # Faster R-CNN subsamples during training the proposals with a fixed + # positive / negative ratio + with torch.no_grad(): + proposals = self.loss_evaluator.subsample(proposals, targets) + + # extract features that will be fed to the final classifier. 
The + # feature_extractor generally corresponds to the pooler + heads + x = self.feature_extractor(features, proposals) + # final classifier that converts the features into predictions + class_logits, box_regression = self.predictor(x) + + if not self.training: + result = self.post_processor((class_logits, box_regression), proposals) + return x, result, {} + + loss_classifier, loss_box_reg = self.loss_evaluator( + [class_logits], [box_regression] + ) + return ( + x, + proposals, + dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), + ) + + +def build_roi_box_head(cfg): + """ + Constructs a new box head. + By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class + and make it a parameter in the config + """ + return ROIBoxHead(cfg) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py new file mode 100644 index 000000000..196892550 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py @@ -0,0 +1,152 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class PostProcessor(nn.Module): + """ + From a set of classification scores, box regression and proposals, + computes the post-processed boxes, and applies NMS to obtain the + final results + """ + + def __init__( + self, score_thresh=0.05, nms=0.5, detections_per_img=100, box_coder=None + ): + """ + Arguments: + score_thresh (float) + nms (float) + detections_per_img (int) + box_coder (BoxCoder) + """ + super(PostProcessor, self).__init__() + self.score_thresh = score_thresh + self.nms = nms + self.detections_per_img = detections_per_img + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + + def forward(self, x, boxes): + """ + Arguments: + x (tuple[tensor, tensor]): x contains the class logits + and the box_regression from the model. 
+ boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra fields labels and scores + """ + class_logits, box_regression = x + class_prob = F.softmax(class_logits, -1) + + # TODO think about a representation of batch of boxes + image_shapes = [box.size for box in boxes] + boxes_per_image = [len(box) for box in boxes] + concat_boxes = torch.cat([a.bbox for a in boxes], dim=0) + + proposals = self.box_coder.decode( + box_regression.view(sum(boxes_per_image), -1), concat_boxes + ) + + num_classes = class_prob.shape[1] + + proposals = proposals.split(boxes_per_image, dim=0) + class_prob = class_prob.split(boxes_per_image, dim=0) + + results = [] + for prob, boxes_per_img, image_shape in zip( + class_prob, proposals, image_shapes + ): + boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = self.filter_results(boxlist, num_classes) + results.append(boxlist) + return results + + def prepare_boxlist(self, boxes, scores, image_shape): + """ + Returns BoxList from `boxes` and adds probability scores information + as an extra field + `boxes` has shape (#detections, 4 * #classes), where each row represents + a list of predicted bounding boxes for each of the object classes in the + dataset (including the background class). The detections in each row + originate from the same object proposal. + `scores` has shape (#detection, #classes), where each row represents a list + of object detection confidence scores for each of the object classes in the + dataset (including the background class). `scores[i, j]`` corresponds to the + box at `boxes[i, j * 4:(j + 1) * 4]`. 
+ """ + boxes = boxes.reshape(-1, 4) + scores = scores.reshape(-1) + boxlist = BoxList(boxes, image_shape, mode="xyxy") + boxlist.add_field("scores", scores) + return boxlist + + def filter_results(self, boxlist, num_classes): + """Returns bounding-box detection results by thresholding on scores and + applying non-maximum suppression (NMS). + """ + # unwrap the boxlist to avoid additional overhead. + # if we had multi-class NMS, we could perform this directly on the boxlist + boxes = boxlist.bbox.reshape(-1, num_classes * 4) + scores = boxlist.get_field("scores").reshape(-1, num_classes) + + device = scores.device + result = [] + # Apply threshold on detection probabilities and apply NMS + # Skip j = 0, because it's the background class + inds_all = scores > self.score_thresh + for j in range(1, num_classes): + inds = inds_all[:, j].nonzero().squeeze(1) + scores_j = scores[inds, j] + boxes_j = boxes[inds, j * 4 : (j + 1) * 4] + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms, score_field="scores" + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, dtype=torch.int64, device=device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.detections_per_img > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), number_of_detections - self.detections_per_img + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + return result + + +def make_roi_box_post_processor(cfg): + use_fpn = cfg.MODEL.ROI_HEADS.USE_FPN + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + 
score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH + nms_thresh = cfg.MODEL.ROI_HEADS.NMS + detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG + + postprocessor = PostProcessor( + score_thresh, nms_thresh, detections_per_img, box_coder + ) + return postprocessor diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py new file mode 100644 index 000000000..2c21f6cdb --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py @@ -0,0 +1,175 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( + BalancedPositiveNegativeSampler +) +from maskrcnn_benchmark.modeling.utils import cat + + +class FastRCNNLossComputation(object): + """ + Computes the loss for Faster R-CNN. 
+ Also supports FPN + """ + + def __init__(self, proposal_matcher, fg_bg_sampler, box_coder): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.box_coder = box_coder + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Fast RCNN only need "labels" field for selecting the targets + target = target.copy_with_fields("labels") + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + regression_targets = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # Label background (below the low threshold) + bg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_inds] = 0 + + # Label ignore proposals (between low and high thresholds) + ignore_inds = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[ignore_inds] = -1 # -1 is ignored by sampler + + # compute regression targets + regression_targets_per_image = self.box_coder.encode( + matched_targets.bbox, proposals_per_image.bbox + ) + + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + def 
subsample(self, proposals, targets): + """ + This method performs the positive/negative sampling, and return + the sampled proposals. + Note: this function keeps a state. + + Arguments: + proposals (list[BoxList]) + targets (list[BoxList]) + """ + + labels, regression_targets = self.prepare_targets(proposals, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + + proposals = list(proposals) + # add corresponding label and regression_targets information to the bounding boxes + for labels_per_image, regression_targets_per_image, proposals_per_image in zip( + labels, regression_targets, proposals + ): + proposals_per_image.add_field("labels", labels_per_image) + proposals_per_image.add_field( + "regression_targets", regression_targets_per_image + ) + + # distributed sampled proposals, that were obtained on all feature maps + # concatenated via the fg_bg_sampler, into individual feature map levels + for img_idx, (pos_inds_img, neg_inds_img) in enumerate( + zip(sampled_pos_inds, sampled_neg_inds) + ): + img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) + proposals_per_image = proposals[img_idx][img_sampled_inds] + proposals[img_idx] = proposals_per_image + + self._proposals = proposals + return proposals + + def __call__(self, class_logits, box_regression): + """ + Computes the loss for Faster R-CNN. + This requires that the subsample method has been called beforehand. 
+ + Arguments: + class_logits (list[Tensor]) + box_regression (list[Tensor]) + + Returns: + classification_loss (Tensor) + box_loss (Tensor) + """ + + class_logits = cat(class_logits, dim=0) + box_regression = cat(box_regression, dim=0) + device = class_logits.device + + if not hasattr(self, "_proposals"): + raise RuntimeError("subsample needs to be called before") + + proposals = self._proposals + + labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) + regression_targets = cat( + [proposal.get_field("regression_targets") for proposal in proposals], dim=0 + ) + + classification_loss = F.cross_entropy(class_logits, labels) + + # get indices that correspond to the regression targets for + # the corresponding ground truth labels, to be used with + # advanced indexing + sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) + labels_pos = labels[sampled_pos_inds_subset] + map_inds = 4 * labels_pos[:, None] + torch.tensor([0, 1, 2, 3], device=device) + + box_loss = smooth_l1_loss( + box_regression[sampled_pos_inds_subset[:, None], map_inds], + regression_targets[sampled_pos_inds_subset], + size_average=False, + beta=1, + ) + box_loss = box_loss / labels.numel() + + return classification_loss, box_loss + + +def make_roi_box_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ) + + loss_evaluator = FastRCNNLossComputation(matcher, fg_bg_sampler, box_coder) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py new file mode 100644 index 
000000000..9194eafb3 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.modeling.backbone import resnet +from maskrcnn_benchmark.modeling.poolers import Pooler + + +class ResNet50Conv5ROIFeatureExtractor(nn.Module): + def __init__(self, config): + super(ResNet50Conv5ROIFeatureExtractor, self).__init__() + + resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + + stage = resnet.StageSpec(index=4, block_count=3, return_features=False) + head = resnet.ResNetHead( + block_module=config.MODEL.RESNETS.TRANS_FUNC, + stages=(stage,), + num_groups=config.MODEL.RESNETS.NUM_GROUPS, + width_per_group=config.MODEL.RESNETS.WIDTH_PER_GROUP, + stride_in_1x1=config.MODEL.RESNETS.STRIDE_IN_1X1, + stride_init=None, + res2_out_channels=config.MODEL.RESNETS.RES2_OUT_CHANNELS, + ) + + self.pooler = pooler + self.head = head + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.head(x) + return x + + +class FPN2MLPFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg): + super(FPN2MLPFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + input_size = cfg.MODEL.BACKBONE.OUT_CHANNELS * resolution ** 2 + representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM + self.pooler = pooler + self.fc6 = 
nn.Linear(input_size, representation_size) + self.fc7 = nn.Linear(representation_size, representation_size) + + for l in [self.fc6, self.fc7]: + # Caffe2 implementation uses XavierFill, which in fact + # corresponds to kaiming_uniform_ in PyTorch + nn.init.kaiming_uniform_(l.weight, a=1) + nn.init.constant_(l.bias, 0) + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = x.view(x.size(0), -1) + + x = F.relu(self.fc6(x)) + x = F.relu(self.fc7(x)) + + return x + + +_ROI_BOX_FEATURE_EXTRACTORS = { + "ResNet50Conv5ROIFeatureExtractor": ResNet50Conv5ROIFeatureExtractor, + "FPN2MLPFeatureExtractor": FPN2MLPFeatureExtractor, +} + + +def make_roi_box_feature_extractor(cfg): + func = _ROI_BOX_FEATURE_EXTRACTORS[cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR] + return func(cfg) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py new file mode 100644 index 000000000..79eb9ac25 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
from torch import nn


class FastRCNNPredictor(nn.Module):
    """
    Box head predictor that works on a convolutional feature map.

    The incoming feature map is average-pooled with a 7x7 window, flattened,
    and fed to two linear layers: one producing class logits and one
    producing four box-regression values per class.
    """

    def __init__(self, config, pretrained=None):
        """
        Arguments:
            config: configuration object; reads
                `MODEL.RESNETS.RES2_OUT_CHANNELS` and
                `MODEL.ROI_BOX_HEAD.NUM_CLASSES`
            pretrained: not used by this class; kept so that existing
                callers passing it keep working
        """
        super(FastRCNNPredictor, self).__init__()

        # the input width of the linear layers is derived from the res2 width:
        # res2_out_channels * 2 ** (stage_index - 1)
        stage_index = 4
        stage2_relative_factor = 2 ** (stage_index - 1)
        res2_out_channels = config.MODEL.RESNETS.RES2_OUT_CHANNELS
        num_inputs = res2_out_channels * stage2_relative_factor

        num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES
        self.avgpool = nn.AvgPool2d(kernel_size=7, stride=7)
        self.cls_score = nn.Linear(num_inputs, num_classes)
        self.bbox_pred = nn.Linear(num_inputs, num_classes * 4)

        # Weights are drawn from a narrow normal distribution and the biases
        # start at zero. The constant fill has to target `.bias`: applying it
        # to `.weight` would wipe out the normal initialisation just drawn
        # and leave the biases at nn.Linear's default values.
        nn.init.normal_(self.cls_score.weight, mean=0, std=0.01)
        nn.init.constant_(self.cls_score.bias, 0)

        nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001)
        nn.init.constant_(self.bbox_pred.bias, 0)

    def forward(self, x):
        """
        Arguments:
            x (Tensor): feature map; after the 7x7 average pooling it is
                flattened to (x.size(0), -1) before the linear layers

        Returns:
            cls_logit (Tensor): output of the classification layer
            bbox_pred (Tensor): output of the box-regression layer
        """
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        cls_logit = self.cls_score(x)
        bbox_pred = self.bbox_pred(x)
        return cls_logit, bbox_pred


class FPNPredictor(nn.Module):
    """
    Box head predictor that works on an already flattened representation
    of width `MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM`.
    """

    def __init__(self, cfg):
        """
        Arguments:
            cfg: configuration object; reads `MODEL.ROI_BOX_HEAD.NUM_CLASSES`
                and `MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM`
        """
        super(FPNPredictor, self).__init__()
        num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM

        self.cls_score = nn.Linear(representation_size, num_classes)
        self.bbox_pred = nn.Linear(representation_size, num_classes * 4)

        # normal-initialised weights, zero biases
        nn.init.normal_(self.cls_score.weight, std=0.01)
        nn.init.normal_(self.bbox_pred.weight, std=0.001)
        for l in [self.cls_score, self.bbox_pred]:
            nn.init.constant_(l.bias, 0)

    def forward(self, x):
        """
        Arguments:
            x (Tensor): input to the two linear layers

        Returns:
            scores (Tensor): output of the classification layer
            bbox_deltas (Tensor): output of the box-regression layer
        """
        scores = self.cls_score(x)
        bbox_deltas = self.bbox_pred(x)

        return scores, bbox_deltas


# maps the value of cfg.MODEL.ROI_BOX_HEAD.PREDICTOR to the class to build
_ROI_BOX_PREDICTOR = {
    "FastRCNNPredictor": FastRCNNPredictor,
    "FPNPredictor": FPNPredictor,
}


def make_roi_box_predictor(cfg):
    """
    Builds the predictor named by `cfg.MODEL.ROI_BOX_HEAD.PREDICTOR`.
    A name that is not registered in `_ROI_BOX_PREDICTOR` raises KeyError.
    """
    func = _ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR]
    return func(cfg)


# (patch metadata carried on the same physical line, kept verbatim)
# diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py new file mode 100644 index
# (patch metadata carried on the same physical lines, kept verbatim)
# 000000000..e69de29bb
# diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py new file mode 100644 index 000000000..b56ea7ebf --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py @@ -0,0 +1,189 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import numpy as np
import torch
from PIL import Image
from torch import nn

from maskrcnn_benchmark.structures.bounding_box import BoxList


# TODO check if want to return a single BoxList or a composite
# object
class MaskPostProcessor(nn.Module):
    """
    From the results of the CNN, post process the masks
    by taking the mask corresponding to the class with max
    probability (which are of fixed size and directly output
    by the CNN) and return the masks in the mask field of the BoxList.

    If a masker object is passed, it will additionally
    project the masks in the image according to the locations in boxes.
    """

    def __init__(self, masker=None):
        """
        Arguments:
            masker (callable, optional): called as `masker(mask_prob, boxes)`
                on the selected mask probabilities when it is truthy
        """
        super(MaskPostProcessor, self).__init__()
        self.masker = masker

    def forward(self, x, boxes):
        """
        Arguments:
            x (Tensor): the mask logits
            boxes (list[BoxList]): bounding boxes that are used as
                reference, one for each image

        Returns:
            results (list[BoxList]): one BoxList for each image, containing
                the extra field mask
        """
        mask_prob = x.sigmoid()

        # select masks corresponding to the predicted classes: row i keeps
        # only the channel given by labels[i]
        # (presumably x is (num_masks, num_classes, H, W) -- TODO confirm)
        num_masks = x.shape[0]
        labels = [bbox.get_field("labels") for bbox in boxes]
        labels = torch.cat(labels)
        index = torch.arange(num_masks, device=labels.device)
        mask_prob = mask_prob[index, labels][:, None]

        if self.masker:
            mask_prob = self.masker(mask_prob, boxes)

        # hand every image its own slice of the batch-wide mask tensor
        boxes_per_image = [len(box) for box in boxes]
        mask_prob = mask_prob.split(boxes_per_image, dim=0)

        results = []
        for prob, box in zip(mask_prob, boxes):
            # build a fresh BoxList carrying over every existing field,
            # then attach the masks
            bbox = BoxList(box.bbox, box.size, mode="xyxy")
            for field in box.fields():
                bbox.add_field(field, box.get_field(field))
            bbox.add_field("mask", prob)
            results.append(bbox)

        return results


class MaskPostProcessorCOCOFormat(MaskPostProcessor):
    """
    From the results of the CNN, post process the results
    so that the masks are pasted in the image, and
    additionally convert the results to COCO format.
    """

    def forward(self, x, boxes):
        """
        Runs `MaskPostProcessor.forward` and replaces the `mask` field of
        every result with a list of run-length encodings produced by
        `pycocotools.mask.encode`, whose `counts` entry is decoded to str.
        """
        # imported here so that pycocotools is only needed when this class
        # is actually used
        import pycocotools.mask as mask_util

        results = super(MaskPostProcessorCOCOFormat, self).forward(x, boxes)
        for result in results:
            masks = result.get_field("mask").cpu()
            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")
            result.add_field("mask", rles)
        return results


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    """
    Scales every box around its own centre.

    Arguments:
        boxes (Tensor): Nx4 boxes given as x1, y1, x2, y2
        scale (float): factor applied to the half-width and half-height

    Returns:
        Tensor with the same shape as `boxes` holding the expanded boxes
    """
    w_half = (boxes[:, 2] - boxes[:, 0]) * .5
    h_half = (boxes[:, 3] - boxes[:, 1]) * .5
    x_c = (boxes[:, 2] + boxes[:, 0]) * .5
    y_c = (boxes[:, 3] + boxes[:, 1]) * .5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp


def expand_masks(mask, padding):
    """
    Surrounds the masks with a border of `padding` zeros on every side.

    Arguments:
        mask (Tensor): masks whose first dimension is N and whose last
            dimension is M (the masks are treated as M x M)
        padding (int): border width; 0 leaves the content unchanged

    Returns:
        padded_mask (Tensor): N x 1 x (M + 2 * padding) x (M + 2 * padding)
        scale (float): (M + 2 * padding) / M, the factor by which the
            matching box has to grow
    """
    N = mask.shape[0]
    M = mask.shape[-1]
    pad2 = 2 * padding
    scale = float(M + pad2) / M
    padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2))
    # explicit end index: `padding:-padding` would be the empty slice `0:0`
    # for padding == 0 and make the assignment fail
    padded_mask[:, :, padding : padding + M, padding : padding + M] = mask
    return padded_mask, scale


def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1):
    """
    Resizes a single mask to the size of its (padded) box and pastes it into
    an otherwise empty image-sized canvas.

    Arguments:
        mask (Tensor): a single mask
        box (Tensor): the box x1, y1, x2, y2 the mask belongs to
        im_h (int): height of the output canvas
        im_w (int): width of the output canvas
        thresh (float): the resized mask is binarised with `mask > thresh`;
            a negative value skips binarisation and returns the resized
            mask scaled by 255 instead
        padding (int): zero border added around the mask before resizing

    Returns:
        im_mask (Tensor): uint8 tensor of shape (im_h, im_w)
    """
    padded_mask, scale = expand_masks(mask[None], padding=padding)
    mask = padded_mask[0, 0]
    box = expand_boxes(box[None], scale)[0]
    # Tensor.numpy() only works on CPU tensors, so move the box to host
    # memory first, exactly like the mask below
    box = box.cpu().numpy().astype(np.int32)

    TO_REMOVE = 1
    w = box[2] - box[0] + TO_REMOVE
    h = box[3] - box[1] + TO_REMOVE
    w = max(w, 1)
    h = max(h, 1)

    mask = Image.fromarray(mask.cpu().numpy())
    mask = mask.resize((w, h), resample=Image.BILINEAR)
    # np.asarray copies only when needed; np.array(..., copy=False) raises on
    # NumPy >= 2 whenever a copy cannot be avoided
    mask = np.asarray(mask)

    if thresh >= 0:
        mask = np.array(mask > thresh, dtype=np.uint8)
        mask = torch.from_numpy(mask)
    else:
        # for visualization and debugging, we also
        # allow it to return an unmodified mask
        mask = torch.from_numpy(mask * 255).to(torch.uint8)

    im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8)
    # clip the paste region to the canvas
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    # the source window is shifted by the box origin so that only the part
    # of the mask that falls inside the canvas is copied
    im_mask[y_0:y_1, x_0:x_1] = mask[
        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
    ]
    return im_mask


class Masker(object):
    """
    Projects a set of masks in an image on the locations
    specified by the bounding boxes
    """

    def __init__(self, threshold=0.5, padding=1):
        """
        Arguments:
            threshold (float): forwarded to `paste_mask_in_image` as `thresh`
            padding (int): forwarded to `paste_mask_in_image` as `padding`
        """
        self.threshold = threshold
        self.padding = padding

    def forward_single_image(self, masks, boxes):
        """
        Pastes every mask of one image at the location of its box.

        Arguments:
            masks (Tensor): masks of one image, one entry per box
            boxes (BoxList): the boxes of that image

        Returns:
            Tensor with one pasted mask per box and a singleton second
            dimension; when there are no masks, an empty tensor with 0
            entries that keeps the last two dimensions of `masks`
        """
        boxes = boxes.convert("xyxy")
        # BoxList.size is (width, height)
        im_w, im_h = boxes.size
        res = [
            paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding)
            for mask, box in zip(masks, boxes.bbox)
        ]
        if len(res) > 0:
            res = torch.stack(res, dim=0)[:, None]
        else:
            res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1]))
        return res

    def __call__(self, masks, boxes):
        """
        Arguments:
            masks (Tensor): masks for a single image
            boxes (BoxList or list[BoxList]): boxes of that image; a list
                must contain exactly one BoxList
        """
        # TODO do this properly
        if isinstance(boxes, BoxList):
            boxes = [boxes]
        assert len(boxes) == 1, "Only single image batch supported"
        result = self.forward_single_image(masks, boxes[0])
        return result


def make_roi_mask_post_processor(cfg):
    """
    Builds the mask post-processor. No masker is attached, and `cfg` is
    not read.
    """
    masker = None
    mask_post_processor = MaskPostProcessor(masker)
    return mask_post_processor


# (patch metadata carried on the same physical line, kept verbatim)
# diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py new file mode 100644 index 000000000..36dcaa325 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py @@ -0,0 +1,144 @@
# Copyright
(c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.utils import cat + + +def project_masks_on_boxes(segmentation_masks, proposals, discretization_size): + """ + Given segmentation masks and the bounding boxes corresponding + to the location of the masks in the image, this function + crops and resizes the masks in the position defined by the + boxes. This prepares the masks for them to be fed to the + loss computation as the targets. + + Arguments: + segmentation_masks: an instance of SegmentationMask + proposals: an instance of BoxList + """ + masks = [] + M = discretization_size + device = proposals.bbox.device + proposals = proposals.convert("xyxy") + assert segmentation_masks.size == proposals.size, "{}, {}".format( + segmentation_masks, proposals + ) + # TODO put the proposals on the CPU, as the representation for the + # masks is not efficient GPU-wise (possibly several small tensors for + # representing a single instance mask) + proposals = proposals.bbox.to(torch.device("cpu")) + for segmentation_mask, proposal in zip(segmentation_masks, proposals): + # crop the masks, resize them to the desired resolution and + # then convert them to the tensor representation, + # instead of the list representation that was used + cropped_mask = segmentation_mask.crop(proposal) + scaled_mask = cropped_mask.resize((M, M)) + mask = scaled_mask.convert(mode="mask") + masks.append(mask) + if len(masks) == 0: + return torch.empty(0, dtype=torch.float32, device=device) + return torch.stack(masks, dim=0).to(device, dtype=torch.float32) + + +class MaskRCNNLossComputation(object): + def __init__(self, proposal_matcher, discretization_size): + """ + Arguments: + proposal_matcher (Matcher) + discretization_size 
(int) + """ + self.proposal_matcher = proposal_matcher + self.discretization_size = discretization_size + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Mask RCNN needs "labels" and "masks "fields for creating the targets + target = target.copy_with_fields(["labels", "masks"]) + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + masks = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # this can probably be removed, but is left here for clarity + # and completeness + neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[neg_inds] = 0 + + # mask scores are only computed on positive samples + positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1) + + segmentation_masks = matched_targets.get_field("masks") + segmentation_masks = segmentation_masks[positive_inds] + + positive_proposals = proposals_per_image[positive_inds] + + masks_per_image = project_masks_on_boxes( + segmentation_masks, positive_proposals, self.discretization_size + ) + + labels.append(labels_per_image) + masks.append(masks_per_image) + + return labels, masks + + def __call__(self, proposals, mask_logits, targets): + """ + Arguments: + proposals (list[BoxList]) + mask_logits (Tensor) + targets 
def keep_only_positive_boxes(boxes):
    """
    Given a set of BoxList containing the `labels` field,
    return a set of BoxList for which `labels > 0`.

    Arguments:
        boxes (list of BoxList)

    Returns:
        positive_boxes (list[BoxList]): per image, the boxes whose label is > 0
        positive_inds (list[Tensor]): per image, the `labels > 0` mask over
            the original boxes
    """
    assert isinstance(boxes, (list, tuple))
    # An empty batch has nothing to filter; return early instead of failing
    # on boxes[0] below.
    if not boxes:
        return [], []
    assert isinstance(boxes[0], BoxList)
    assert boxes[0].has_field("labels")
    positive_boxes = []
    positive_inds = []
    for boxes_per_image in boxes:
        labels = boxes_per_image.get_field("labels")
        inds_mask = labels > 0
        inds = inds_mask.nonzero().squeeze(1)
        positive_boxes.append(boxes_per_image[inds])
        positive_inds.append(inds_mask)
    return positive_boxes, positive_inds
+ """ + + if self.training: + # during training, only focus on positive boxes + all_proposals = proposals + proposals, positive_inds = keep_only_positive_boxes(proposals) + if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: + x = features + x = x[torch.cat(positive_inds, dim=0)] + else: + x = self.feature_extractor(features, proposals) + mask_logits = self.predictor(x) + + if not self.training: + result = self.post_processor(mask_logits, proposals) + return x, result, {} + + loss_mask = self.loss_evaluator(proposals, mask_logits, targets) + + return x, all_proposals, dict(loss_mask=loss_mask) + + +def build_roi_mask_head(cfg): + return ROIMaskHead(cfg) diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py new file mode 100644 index 000000000..66f2c2665 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py @@ -0,0 +1,67 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
class MaskRCNNFPNFeatureExtractor(nn.Module):
    """
    Feature extractor for the mask head: pools the proposals from the
    feature maps, then applies a stack of conv + ReLU layers.
    """

    def __init__(self, cfg):
        """
        Arguments:
            cfg: configuration node. Reads MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION,
                POOLER_SCALES, POOLER_SAMPLING_RATIO and CONV_LAYERS, plus
                MODEL.BACKBONE.OUT_CHANNELS for the input channel count.
        """
        super(MaskRCNNFPNFeatureExtractor, self).__init__()

        resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )
        input_size = cfg.MODEL.BACKBONE.OUT_CHANNELS
        self.pooler = pooler

        layers = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS

        next_feature = input_size
        # names of the registered conv modules, in application order
        self.blocks = []
        for layer_idx, layer_features in enumerate(layers, 1):
            layer_name = "mask_fcn{}".format(layer_idx)
            module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1)
            # Caffe2 implementation uses MSRAFill, which in fact
            # corresponds to kaiming_normal_ in PyTorch
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            nn.init.constant_(module.bias, 0)
            self.add_module(layer_name, module)
            next_feature = layer_features
            self.blocks.append(layer_name)

    def forward(self, x, proposals):
        """
        Pool `x` for `proposals`, then run every registered conv layer
        followed by ReLU, in registration order.
        """
        x = self.pooler(x, proposals)

        for layer_name in self.blocks:
            x = F.relu(getattr(self, layer_name)(x))

        return x
class MaskRCNNC4Predictor(nn.Module):
    """
    Mask predictor: a stride-2 transposed convolution followed by a 1x1
    convolution that emits one logit map per class.
    """

    def __init__(self, cfg):
        super(MaskRCNNC4Predictor, self).__init__()
        class_count = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES
        hidden_channels = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS[-1]

        if not cfg.MODEL.ROI_HEADS.USE_FPN:
            # stage 4 relative to res2: channel count grows by 2 ** (4 - 1)
            stage_index = 4
            in_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 2 ** (stage_index - 1)
        else:
            in_channels = hidden_channels

        self.conv5_mask = ConvTranspose2d(in_channels, hidden_channels, 2, 2, 0)
        self.mask_fcn_logits = Conv2d(hidden_channels, class_count, 1, 1, 0)

        for param_name, tensor in self.named_parameters():
            if "bias" in param_name:
                nn.init.constant_(tensor, 0)
                continue
            if "weight" in param_name:
                # Caffe2 implementation uses MSRAFill, which in fact
                # corresponds to kaiming_normal_ in PyTorch
                nn.init.kaiming_normal_(tensor, mode="fan_out", nonlinearity="relu")

    def forward(self, x):
        upsampled = F.relu(self.conv5_mask(x))
        return self.mask_fcn_logits(upsampled)
class CombinedROIHeads(torch.nn.ModuleDict):
    """
    Combines a set of individual heads (for box prediction or masks) into a single
    head.
    """

    def __init__(self, cfg, heads):
        """
        Arguments:
            cfg: configuration node; a clone is stored on `self.cfg`
            heads: (name, module) pairs forwarded to torch.nn.ModuleDict
        """
        super(CombinedROIHeads, self).__init__(heads)
        self.cfg = cfg.clone()
        # when sharing is requested the mask head reuses the box head's
        # feature extractor module instead of its own
        if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
            self.mask.feature_extractor = self.box.feature_extractor

    def forward(self, features, proposals, targets=None):
        """
        Run the box head and, if MODEL.MASK_ON is set, the mask head.

        Returns:
            x: features returned by the last head that ran
            detections: detections returned by the last head that ran
            losses (dict): union of the loss dicts of all heads that ran
        """
        losses = {}
        # TODO rename x to roi_box_features, if it doesn't increase memory consumption
        x, detections, loss_box = self.box(features, proposals, targets)
        losses.update(loss_box)
        if self.cfg.MODEL.MASK_ON:
            mask_features = features
            # optimization: during training, if we share the feature extractor between
            # the box and the mask heads, then we can reuse the features already computed
            if (
                self.training
                and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR
            ):
                mask_features = x
            # During training, self.box() will return the unaltered proposals as "detections"
            # this makes the API consistent during training and testing
            x, detections, loss_mask = self.mask(mask_features, detections, targets)
            losses.update(loss_mask)
        return x, detections, losses


def build_roi_heads(cfg):
    """
    Create the heads enabled by `cfg` and wrap them in a CombinedROIHeads.

    Returns an empty list when no head is enabled.
    """
    # individually create the heads, that will be combined together
    # afterwards
    roi_heads = []
    if not cfg.MODEL.RPN_ONLY:
        roi_heads.append(("box", build_roi_box_head(cfg)))
    if cfg.MODEL.MASK_ON:
        roi_heads.append(("mask", build_roi_mask_head(cfg)))

    # combine individual heads in a single module
    if roi_heads:
        roi_heads = CombinedROIHeads(cfg, roi_heads)

    return roi_heads
class BufferList(nn.Module):
    """
    Similar to nn.ParameterList, but for buffers
    """

    def __init__(self, buffers=None):
        super(BufferList, self).__init__()
        if buffers is not None:
            self.extend(buffers)

    def extend(self, buffers):
        """Register every tensor in `buffers` under the next free integer name."""
        offset = len(self)
        for i, buffer in enumerate(buffers):
            self.register_buffer(str(offset + i), buffer)
        return self

    def __len__(self):
        return len(self._buffers)

    def __iter__(self):
        return iter(self._buffers.values())


class AnchorGenerator(nn.Module):
    """
    For a set of image sizes and feature maps, computes a set
    of anchors
    """

    def __init__(
        self,
        sizes=(128, 256, 512),
        aspect_ratios=(0.5, 1.0, 2.0),
        anchor_strides=(8, 16, 32),
        straddle_thresh=0,
    ):
        """
        Arguments:
            sizes: anchor sizes; with several strides, one size per stride
            aspect_ratios: aspect ratios enumerated for every size
            anchor_strides: one stride per feature map
            straddle_thresh: tolerance used by `add_visibility_to`; a
                negative value marks every anchor as visible

        Raises:
            RuntimeError: several strides are given and their number differs
                from the number of sizes
        """
        super(AnchorGenerator, self).__init__()

        if len(anchor_strides) == 1:
            anchor_stride = anchor_strides[0]
            cell_anchors = [
                generate_anchors(anchor_stride, sizes, aspect_ratios).float()
            ]
        else:
            if len(anchor_strides) != len(sizes):
                raise RuntimeError("FPN should have #anchor_strides == #sizes")
            cell_anchors = [
                generate_anchors(anchor_stride, (size,), aspect_ratios).float()
                for anchor_stride, size in zip(anchor_strides, sizes)
            ]
        self.strides = anchor_strides
        self.cell_anchors = BufferList(cell_anchors)
        self.straddle_thresh = straddle_thresh

    def num_anchors_per_location(self):
        """Return, per feature map, the number of base anchors."""
        return [len(cell_anchors) for cell_anchors in self.cell_anchors]

    def grid_anchors(self, grid_sizes):
        """
        Shift the base anchors of each feature map over its grid.

        Arguments:
            grid_sizes: one (height, width) pair per feature map

        Returns:
            list[Tensor]: per feature map, a (height * width * A, 4) tensor
        """
        anchors = []
        for size, stride, base_anchors in zip(
            grid_sizes, self.strides, self.cell_anchors
        ):
            grid_height, grid_width = size
            device = base_anchors.device
            shifts_x = torch.arange(
                0, grid_width * stride, step=stride, dtype=torch.float32, device=device
            )
            shifts_y = torch.arange(
                0, grid_height * stride, step=stride, dtype=torch.float32, device=device
            )
            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
            shift_x = shift_x.reshape(-1)
            shift_y = shift_y.reshape(-1)
            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)

            # broadcast: every grid shift is added to every base anchor
            anchors.append(
                (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
            )

        return anchors

    def add_visibility_to(self, boxlist):
        """
        Add a boolean "visibility" field to `boxlist`: True for anchors that
        lie inside the image enlarged by `straddle_thresh` on every side.
        """
        image_width, image_height = boxlist.size
        anchors = boxlist.bbox
        if self.straddle_thresh >= 0:
            inds_inside = (
                (anchors[..., 0] >= -self.straddle_thresh)
                & (anchors[..., 1] >= -self.straddle_thresh)
                & (anchors[..., 2] < image_width + self.straddle_thresh)
                & (anchors[..., 3] < image_height + self.straddle_thresh)
            )
        else:
            device = anchors.device
            # bool, to match the dtype produced by the comparisons above
            inds_inside = torch.ones(anchors.shape[0], dtype=torch.bool, device=device)
        boxlist.add_field("visibility", inds_inside)

    def forward(self, image_list, feature_maps):
        """
        Arguments:
            image_list: object exposing `image_sizes`, a sequence of
                (height, width) pairs
            feature_maps (list[Tensor]): only their last two dims are read

        Returns:
            list[list[BoxList]]: per image, one BoxList of anchors per
                feature map, each carrying a "visibility" field
        """
        grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
        anchors_over_all_feature_maps = self.grid_anchors(grid_sizes)
        anchors = []
        for image_height, image_width in image_list.image_sizes:
            anchors_in_image = []
            for anchors_per_feature_map in anchors_over_all_feature_maps:
                boxlist = BoxList(
                    anchors_per_feature_map, (image_width, image_height), mode="xyxy"
                )
                self.add_visibility_to(boxlist)
                anchors_in_image.append(boxlist)
            anchors.append(anchors_in_image)
        return anchors


def make_anchor_generator(config):
    """Build an AnchorGenerator from the MODEL.RPN section of `config`."""
    anchor_sizes = config.MODEL.RPN.ANCHOR_SIZES
    aspect_ratios = config.MODEL.RPN.ASPECT_RATIOS
    anchor_stride = config.MODEL.RPN.ANCHOR_STRIDE
    straddle_thresh = config.MODEL.RPN.STRADDLE_THRESH

    if config.MODEL.RPN.USE_FPN:
        assert len(anchor_stride) == len(
            anchor_sizes
        ), "FPN should have len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)"
    else:
        assert len(anchor_stride) == 1, "Non-FPN should have a single ANCHOR_STRIDE"
    anchor_generator = AnchorGenerator(
        anchor_sizes, aspect_ratios, anchor_stride, straddle_thresh
    )
    return anchor_generator


# Copyright (c) 2017-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
#
# Based on:
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------


# Reference anchors from Shaoqing's matlab implementation:
#
#    >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
#    >> anchors
#
#    anchors =
#
#       -83   -39   100    56
#      -175   -87   192   104
#      -359  -183   376   200
#       -55   -55    72    72
#      -119  -119   136   136
#      -247  -247   264   264
#       -35   -79    52    96
#       -79  -167    96   184
#      -167  -343   184   360
#
# NOTE: those values are 1-based. generate_anchors(16, (128, 256, 512),
# (0.5, 1, 2)) returns the same boxes in 0-based coordinates, i.e. every
# entry is smaller by one (first row: -84, -40, 99, 55).


def generate_anchors(
    stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)
):
    """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors
    are centered on stride / 2, have (approximate) sqrt areas of the specified
    sizes, and aspect ratios as given.

    Returns a float64 tensor with len(aspect_ratios) * len(sizes) rows,
    grouped by aspect ratio.
    """
    # np.float64 rather than the np.float alias, which NumPy >= 1.24 removed
    return _generate_anchors(
        stride,
        np.array(sizes, dtype=np.float64) / stride,
        np.array(aspect_ratios, dtype=np.float64),
    )


def _generate_anchors(base_size, scales, aspect_ratios):
    """Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.
    """
    anchor = np.array([1, 1, base_size, base_size], dtype=np.float64) - 1
    anchors = _ratio_enum(anchor, aspect_ratios)
    anchors = np.vstack(
        [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])]
    )
    return torch.from_numpy(anchors)


def _whctrs(anchor):
    """Return width, height, x center, and y center for an anchor (window)."""
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows).
    """
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack(
        (
            x_ctr - 0.5 * (ws - 1),
            y_ctr - 0.5 * (hs - 1),
            x_ctr + 0.5 * (ws - 1),
            y_ctr + 0.5 * (hs - 1),
        )
    )
    return anchors


def _ratio_enum(anchor, ratios):
    """Enumerate a set of anchors for each aspect ratio wrt an anchor."""
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    # widths and heights are rounded to whole pixels
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


def _scale_enum(anchor, scales):
    """Enumerate a set of anchors for each scale wrt an anchor."""
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
class RPNPostProcessor(torch.nn.Module):
    """
    Performs post-processing on the outputs of the RPN boxes, before feeding the
    proposals to the heads
    """

    def __init__(
        self,
        pre_nms_top_n,
        post_nms_top_n,
        nms_thresh,
        min_size,
        box_coder=None,
        fpn_post_nms_top_n=None,
    ):
        """
        Arguments:
            pre_nms_top_n (int)
            post_nms_top_n (int)
            nms_thresh (float)
            min_size (int)
            box_coder (BoxCoder): defaults to BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
            fpn_post_nms_top_n (int): defaults to `post_nms_top_n`
        """
        super(RPNPostProcessor, self).__init__()
        self.pre_nms_top_n = pre_nms_top_n
        self.post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.min_size = min_size

        if box_coder is None:
            box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
        self.box_coder = box_coder

        if fpn_post_nms_top_n is None:
            fpn_post_nms_top_n = post_nms_top_n
        self.fpn_post_nms_top_n = fpn_post_nms_top_n

    def add_gt_proposals(self, proposals, targets):
        """
        Append the ground-truth boxes of every image to its proposals.

        Arguments:
            proposals: list[BoxList]
            targets: list[BoxList]
        """
        # Get the device we're operating on
        device = proposals[0].bbox.device

        gt_boxes = [target.copy_with_fields([]) for target in targets]

        # later cat of bbox requires all fields to be present for all bbox
        # so we need to add a dummy for objectness that's missing
        for gt_box in gt_boxes:
            gt_box.add_field("objectness", torch.ones(len(gt_box), device=device))

        proposals = [
            cat_boxlist((proposal, gt_box))
            for proposal, gt_box in zip(proposals, gt_boxes)
        ]

        return proposals

    def forward_for_single_feature_map(self, anchors, objectness, box_regression):
        """
        Decode, clip, filter and NMS the top-scoring anchors of one feature map.

        Arguments:
            anchors: list[BoxList]
            objectness: tensor of size N, A, H, W
            box_regression: tensor of size N, A * 4, H, W

        Returns:
            list[BoxList]: one BoxList per image, with an "objectness" field
        """
        device = objectness.device
        N, A, H, W = objectness.shape

        # put in the same format as anchors
        objectness = objectness.permute(0, 2, 3, 1).reshape(N, -1)
        objectness = objectness.sigmoid()
        box_regression = box_regression.view(N, -1, 4, H, W).permute(0, 3, 4, 1, 2)
        box_regression = box_regression.reshape(N, -1, 4)

        num_anchors = A * H * W

        pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)
        objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True)

        # (N, 1) index that broadcasts against topk_idx to gather per image
        batch_idx = torch.arange(N, device=device)[:, None]
        box_regression = box_regression[batch_idx, topk_idx]

        image_shapes = [box.size for box in anchors]
        concat_anchors = torch.cat([a.bbox for a in anchors], dim=0)
        concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx]

        proposals = self.box_coder.decode(
            box_regression.view(-1, 4), concat_anchors.view(-1, 4)
        )

        proposals = proposals.view(N, -1, 4)

        result = []
        for proposal, score, im_shape in zip(proposals, objectness, image_shapes):
            boxlist = BoxList(proposal, im_shape, mode="xyxy")
            boxlist.add_field("objectness", score)
            boxlist = boxlist.clip_to_image(remove_empty=False)
            boxlist = remove_small_boxes(boxlist, self.min_size)
            boxlist = boxlist_nms(
                boxlist,
                self.nms_thresh,
                max_proposals=self.post_nms_top_n,
                score_field="objectness",
            )
            result.append(boxlist)
        return result

    def forward(self, anchors, objectness, box_regression, targets=None):
        """
        Arguments:
            anchors: list[list[BoxList]]
            objectness: list[tensor]
            box_regression: list[tensor]
            targets: list[BoxList], optional; in training mode its boxes are
                appended to the proposals

        Returns:
            boxlists (list[BoxList]): the post-processed anchors, after
                applying box decoding and NMS
        """
        sampled_boxes = []
        num_levels = len(objectness)
        # regroup from per-image lists to per-level tuples
        anchors = list(zip(*anchors))
        for a, o, b in zip(anchors, objectness, box_regression):
            sampled_boxes.append(self.forward_for_single_feature_map(a, o, b))

        boxlists = list(zip(*sampled_boxes))
        boxlists = [cat_boxlist(boxlist) for boxlist in boxlists]

        if num_levels > 1:
            boxlists = self.select_over_all_levels(boxlists)

        # append ground-truth bboxes to proposals
        if self.training and targets is not None:
            boxlists = self.add_gt_proposals(boxlists, targets)

        return boxlists

    def select_over_all_levels(self, boxlists):
        """
        Keep at most `fpn_post_nms_top_n` proposals by objectness.

        `boxlists` is modified in place and returned. Proposals kept in
        training mode retain their original order; in testing mode they are
        returned sorted by decreasing objectness.
        """
        num_images = len(boxlists)
        # different behavior during training and during testing:
        # during training, post_nms_top_n is over *all* the proposals combined, while
        # during testing, it is over the proposals for each image
        # TODO resolve this difference and make it consistent. It should be per image,
        # and not per batch
        if self.training:
            objectness = torch.cat(
                [boxlist.get_field("objectness") for boxlist in boxlists], dim=0
            )
            box_sizes = [len(boxlist) for boxlist in boxlists]
            post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness))
            _, inds_sorted = torch.topk(objectness, post_nms_top_n, dim=0, sorted=True)
            # bool mask: indexing with uint8 masks is deprecated in PyTorch
            inds_mask = torch.zeros_like(objectness, dtype=torch.bool)
            inds_mask[inds_sorted] = 1
            inds_mask = inds_mask.split(box_sizes)
            for i in range(num_images):
                boxlists[i] = boxlists[i][inds_mask[i]]
        else:
            for i in range(num_images):
                objectness = boxlists[i].get_field("objectness")
                post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness))
                _, inds_sorted = torch.topk(
                    objectness, post_nms_top_n, dim=0, sorted=True
                )
                boxlists[i] = boxlists[i][inds_sorted]
        return boxlists
class RPNLossComputation(object):
    """
    This class computes the RPN loss.
    """

    def __init__(self, proposal_matcher, fg_bg_sampler, box_coder):
        """
        Arguments:
            proposal_matcher (Matcher)
            fg_bg_sampler (BalancedPositiveNegativeSampler)
            box_coder (BoxCoder)
        """
        # self.target_preparator = target_preparator
        self.proposal_matcher = proposal_matcher
        self.fg_bg_sampler = fg_bg_sampler
        self.box_coder = box_coder

    def match_targets_to_anchors(self, anchor, target):
        """
        Pair every anchor with one ground-truth box chosen by the matcher.

        Returns the indexed targets (all fields dropped) with the raw
        matcher output stored in the extra field "matched_idxs".
        """
        match_quality_matrix = boxlist_iou(target, anchor)
        matched_idxs = self.proposal_matcher(match_quality_matrix)
        # RPN doesn't need any fields from target
        # for creating the labels, so clear them all
        target = target.copy_with_fields([])
        # get the targets corresponding GT for each anchor
        # NB: need to clamp the indices because we can have a single
        # GT in the image, and matched_idxs can be -2, which goes
        # out of bounds
        matched_targets = target[matched_idxs.clamp(min=0)]
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    def prepare_targets(self, anchors, targets):
        """
        Build per-image objectness labels and box regression targets.

        Returns:
            labels (list[Tensor]): float32 per anchor; 1 where the matcher
                returned an index >= 0, 0 otherwise, and -1 for anchors that
                are not visible or fall between the matcher thresholds
            regression_targets (list[Tensor]): `box_coder.encode` of the
                matched boxes against the anchors
        """
        labels = []
        regression_targets = []
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            matched_targets = self.match_targets_to_anchors(
                anchors_per_image, targets_per_image
            )

            matched_idxs = matched_targets.get_field("matched_idxs")
            labels_per_image = matched_idxs >= 0
            labels_per_image = labels_per_image.to(dtype=torch.float32)
            # discard anchors that go out of the boundaries of the image
            # NOTE(review): `~` is a logical negation only if "visibility"
            # is a boolean mask -- confirm against the anchor generator
            labels_per_image[~anchors_per_image.get_field("visibility")] = -1

            # discard indices that are between thresholds
            inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS
            labels_per_image[inds_to_discard] = -1

            # compute regression targets
            regression_targets_per_image = self.box_coder.encode(
                matched_targets.bbox, anchors_per_image.bbox
            )

            labels.append(labels_per_image)
            regression_targets.append(regression_targets_per_image)

        return labels, regression_targets

    def __call__(self, anchors, objectness, box_regression, targets):
        """
        Arguments:
            anchors (list[BoxList])
            objectness (list[Tensor])
            box_regression (list[Tensor])
            targets (list[BoxList])

        Returns:
            objectness_loss (Tensor)
            box_loss (Tensor)
        """
        anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]
        labels, regression_targets = self.prepare_targets(anchors, targets)
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        # turn the per-image masks into flat indices over the whole batch
        sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
        sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)

        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

        objectness_flattened = []
        box_regression_flattened = []
        # for each feature level, permute the outputs to make them be in the
        # same format as the labels. Note that the labels are computed for
        # all feature levels concatenated, so we keep the same representation
        # for the objectness and the box_regression
        for objectness_per_level, box_regression_per_level in zip(
            objectness, box_regression
        ):
            N, A, H, W = objectness_per_level.shape
            objectness_per_level = objectness_per_level.permute(0, 2, 3, 1).reshape(
                N, -1
            )
            box_regression_per_level = box_regression_per_level.view(N, -1, 4, H, W)
            box_regression_per_level = box_regression_per_level.permute(0, 3, 4, 1, 2)
            box_regression_per_level = box_regression_per_level.reshape(N, -1, 4)
            objectness_flattened.append(objectness_per_level)
            box_regression_flattened.append(box_regression_per_level)
        # concatenate on the first dimension (representing the feature levels), to
        # take into account the way the labels were generated (with all feature maps
        # being concatenated as well)
        objectness = cat(objectness_flattened, dim=1).reshape(-1)
        box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4)

        labels = torch.cat(labels, dim=0)
        regression_targets = torch.cat(regression_targets, dim=0)

        # summed over the positive samples, then divided by the number of
        # all sampled (positive + negative) anchors
        box_loss = smooth_l1_loss(
            box_regression[sampled_pos_inds],
            regression_targets[sampled_pos_inds],
            beta=1.0 / 9,
            size_average=False,
        ) / (sampled_inds.numel())

        objectness_loss = F.binary_cross_entropy_with_logits(
            objectness[sampled_inds], labels[sampled_inds]
        )

        return objectness_loss, box_loss
class RPNModule(torch.nn.Module):
    """
    Module for RPN computation. Takes feature maps from the backbone and
    outputs RPN proposals and losses. Works for both FPN and non-FPN.
    """

    def __init__(self, cfg):
        """
        Arguments:
            cfg: configuration node; a clone is stored on `self.cfg`
        """
        super(RPNModule, self).__init__()

        self.cfg = cfg.clone()

        anchor_generator = make_anchor_generator(cfg)

        in_channels = cfg.MODEL.BACKBONE.OUT_CHANNELS
        # the head is sized from the anchor count of the first feature level
        head = RPNHead(in_channels, anchor_generator.num_anchors_per_location()[0])

        rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True)
        box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False)

        loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder)

        self.anchor_generator = anchor_generator
        self.head = head
        self.box_selector_train = box_selector_train
        self.box_selector_test = box_selector_test
        self.loss_evaluator = loss_evaluator

    def forward(self, images, features, targets=None):
        """
        Arguments:
            images (ImageList): images for which we want to compute the predictions
            features (list[Tensor]): features computed from the images that are
                used for computing the predictions. Each tensor in the list
                correspond to different feature levels
            targets (list[BoxList]): ground-truth boxes present in the image (optional)

        Returns:
            boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per
                image.
            losses (dict[Tensor]): the losses for the model during training. During
                testing, it is an empty dict.
        """
        objectness, rpn_box_regression = self.head(features)
        anchors = self.anchor_generator(images, features)

        if self.training:
            return self._forward_train(anchors, objectness, rpn_box_regression, targets)
        else:
            return self._forward_test(anchors, objectness, rpn_box_regression)

    def _forward_train(self, anchors, objectness, rpn_box_regression, targets):
        """
        Training path: returns (boxes, losses), where `losses` holds
        "loss_objectness" and "loss_rpn_box_reg".
        """
        if self.cfg.MODEL.RPN_ONLY:
            # When training an RPN-only model, the loss is determined by the
            # predicted objectness and rpn_box_regression values and there is
            # no need to transform the anchors into predicted boxes; this is an
            # optimization that avoids the unnecessary transformation.
            boxes = anchors
        else:
            # For end-to-end models, anchors must be transformed into boxes and
            # sampled into a training batch.
            with torch.no_grad():
                boxes = self.box_selector_train(
                    anchors, objectness, rpn_box_regression, targets
                )
        loss_objectness, loss_rpn_box_reg = self.loss_evaluator(
            anchors, objectness, rpn_box_regression, targets
        )
        losses = {
            "loss_objectness": loss_objectness,
            "loss_rpn_box_reg": loss_rpn_box_reg,
        }
        return boxes, losses

    def _forward_test(self, anchors, objectness, rpn_box_regression):
        """Testing path: returns (boxes, {}) -- no losses are computed."""
        boxes = self.box_selector_test(anchors, objectness, rpn_box_regression)
        if self.cfg.MODEL.RPN_ONLY:
            # For end-to-end models, the RPN proposals are an intermediate state
            # and don't bother to sort them in decreasing score order. For RPN-only
            # models, the proposals are the final output and we return them in
            # high-to-low confidence order.
            inds = [
                box.get_field("objectness").sort(descending=True)[1] for box in boxes
            ]
            boxes = [box[ind] for box, ind in zip(boxes, inds)]
        return boxes, {}
Not super important because it doesn't change as much + """ + return RPNModule(cfg) diff --git a/maskrcnn_benchmark/modeling/utils.py b/maskrcnn_benchmark/modeling/utils.py new file mode 100644 index 000000000..5b1d79a81 --- /dev/null +++ b/maskrcnn_benchmark/modeling/utils.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Miscellaneous utility functions +""" + +import torch + + +def cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) diff --git a/maskrcnn_benchmark/solver/__init__.py b/maskrcnn_benchmark/solver/__init__.py new file mode 100644 index 000000000..75f40530c --- /dev/null +++ b/maskrcnn_benchmark/solver/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .build import make_optimizer +from .build import make_lr_scheduler +from .lr_scheduler import WarmupMultiStepLR diff --git a/maskrcnn_benchmark/solver/build.py b/maskrcnn_benchmark/solver/build.py new file mode 100644 index 000000000..865a4ec8d --- /dev/null +++ b/maskrcnn_benchmark/solver/build.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import torch + +from .lr_scheduler import WarmupMultiStepLR + + +def make_optimizer(cfg, model): + params = [] + for key, value in model.named_parameters(): + if not value.requires_grad: + continue + lr = cfg.SOLVER.BASE_LR + weight_decay = cfg.SOLVER.WEIGHT_DECAY + if "bias" in key: + lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR + weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS + params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] + + optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) + return optimizer + + +def make_lr_scheduler(cfg, optimizer): + return WarmupMultiStepLR( + optimizer, + cfg.SOLVER.STEPS, + cfg.SOLVER.GAMMA, + warmup_factor=cfg.SOLVER.WARMUP_FACTOR, + warmup_iters=cfg.SOLVER.WARMUP_ITERS, + warmup_method=cfg.SOLVER.WARMUP_METHOD, + ) diff --git a/maskrcnn_benchmark/solver/lr_scheduler.py b/maskrcnn_benchmark/solver/lr_scheduler.py new file mode 100644 index 000000000..fc7e9d7cd --- /dev/null +++ b/maskrcnn_benchmark/solver/lr_scheduler.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from bisect import bisect_right + +import torch + + +# FIXME ideally this would be achieved with a CombinedLRScheduler, +# separating MultiStepLR with WarmupLR +# but the current LRScheduler design doesn't allow it +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer, + milestones, + gamma=0.1, + warmup_factor=1.0 / 3, + warmup_iters=500, + warmup_method="linear", + last_epoch=-1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + "Milestones should be a list of" " increasing integers. 
Got {}", + milestones, + ) + + if warmup_method not in ("constant", "linear"): + raise ValueError( + "Only 'constant' or 'linear' warmup_method accepted" + "got {}".format(warmup_method) + ) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + warmup_factor = 1 + if self.last_epoch < self.warmup_iters: + if self.warmup_method == "constant": + warmup_factor = self.warmup_factor + elif self.warmup_method == "linear": + alpha = self.last_epoch / self.warmup_iters + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + return [ + base_lr + * warmup_factor + * self.gamma ** bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] diff --git a/maskrcnn_benchmark/structures/__init__.py b/maskrcnn_benchmark/structures/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/maskrcnn_benchmark/structures/bounding_box.py b/maskrcnn_benchmark/structures/bounding_box.py new file mode 100644 index 000000000..bcdd6d0b2 --- /dev/null +++ b/maskrcnn_benchmark/structures/bounding_box.py @@ -0,0 +1,257 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +class BoxList(object): + """ + This class represents a set of bounding boxes. + The bounding boxes are represented as a Nx4 Tensor. + In order ot uniquely determine the bounding boxes with respect + to an image, we also store the corresponding image dimensions. + They can contain extra information that is specific to each bounding box, such as + labels. 
+ """ + + def __init__(self, bbox, image_size, mode="xyxy"): + device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device("cpu") + bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device) + if bbox.ndimension() != 2: + raise ValueError( + "bbox should have 2 dimensions, got {}".format(bbox.ndimension()) + ) + if bbox.size(-1) != 4: + raise ValueError( + "last dimenion of bbox should have a " + "size of 4, got {}".format(bbox.size(-1)) + ) + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + + self.bbox = bbox + self.size = image_size # (image_width, image_height) + self.mode = mode + self.extra_fields = {} + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def has_field(self, field): + return field in self.extra_fields + + def fields(self): + return list(self.extra_fields.keys()) + + def _copy_extra_fields(self, bbox): + for k, v in bbox.extra_fields.items(): + self.extra_fields[k] = v + + def convert(self, mode): + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + if mode == self.mode: + return self + # we only have two modes, so don't need to check + # self.mode + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if mode == "xyxy": + bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1) + bbox = BoxList(bbox, self.size, mode=mode) + else: + TO_REMOVE = 1 + bbox = torch.cat( + (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 + ) + bbox = BoxList(bbox, self.size, mode=mode) + bbox._copy_extra_fields(self) + return bbox + + def _split_into_xyxy(self): + if self.mode == "xyxy": + xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1) + return xmin, ymin, xmax, ymax + elif self.mode == "xywh": + TO_REMOVE = 1 + xmin, ymin, w, h = self.bbox.split(1, dim=-1) + return ( + xmin, + ymin, + xmin + (w - TO_REMOVE).clamp(min=0), + ymin + (h - 
TO_REMOVE).clamp(min=0), + ) + else: + raise RuntimeError("Should not be here") + + def resize(self, size, *args, **kwargs): + """ + Returns a resized copy of this bounding box + + :param size: The requested size in pixels, as a 2-tuple: + (width, height). + """ + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_box = self.bbox * ratio + bbox = BoxList(scaled_box, size, mode=self.mode) + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + return bbox + + ratio_width, ratio_height = ratios + xmin, ymin, xmax, ymax = self._split_into_xyxy() + scaled_xmin = xmin * ratio_width + scaled_xmax = xmax * ratio_width + scaled_ymin = ymin * ratio_height + scaled_ymax = ymax * ratio_height + scaled_box = torch.cat( + (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1 + ) + bbox = BoxList(scaled_box, size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + + return bbox.convert(self.mode) + + def transpose(self, method): + """ + Transpose bounding box (flip or rotate in 90 degree steps) + :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`, + :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`, + :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`, + :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`. 
+ """ + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + image_width, image_height = self.size + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if method == FLIP_LEFT_RIGHT: + TO_REMOVE = 1 + transposed_xmin = image_width - xmax - TO_REMOVE + transposed_xmax = image_width - xmin - TO_REMOVE + transposed_ymin = ymin + transposed_ymax = ymax + elif method == FLIP_TOP_BOTTOM: + transposed_xmin = xmin + transposed_xmax = xmax + transposed_ymin = image_height - ymax + transposed_ymax = image_height - ymin + + transposed_boxes = torch.cat( + (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1 + ) + bbox = BoxList(transposed_boxes, self.size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.transpose(method) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + def crop(self, box): + """ + Cropss a rectangular region from this bounding box. The box is a + 4-tuple defining the left, upper, right, and lower pixel + coordinate. + """ + xmin, ymin, xmax, ymax = self._split_into_xyxy() + w, h = box[2] - box[0], box[3] - box[1] + cropped_xmin = (xmin - box[0]).clamp(min=0, max=w) + cropped_ymin = (ymin - box[1]).clamp(min=0, max=h) + cropped_xmax = (xmax - box[0]).clamp(min=0, max=w) + cropped_ymax = (ymax - box[1]).clamp(min=0, max=h) + + # TODO should I filter empty boxes here? 
+ if False: + is_empty = (cropped_xmin == cropped_xmax) | (cropped_ymin == cropped_ymax) + + cropped_box = torch.cat( + (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 + ) + bbox = BoxList(cropped_box, (w, h), mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.crop(box) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + # Tensor-like methods + + def to(self, device): + bbox = BoxList(self.bbox.to(device), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, "to"): + v = v.to(device) + bbox.add_field(k, v) + return bbox + + def __getitem__(self, item): + bbox = BoxList(self.bbox[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + bbox.add_field(k, v[item]) + return bbox + + def __len__(self): + return self.bbox.shape[0] + + def clip_to_image(self, remove_empty=True): + TO_REMOVE = 1 + self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) + self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE) + if remove_empty: + box = self.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + return self[keep] + return self + + def area(self): + TO_REMOVE = 1 + box = self.bbox + area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) + return area + + def copy_with_fields(self, fields): + bbox = BoxList(self.bbox, self.size, self.mode) + if not isinstance(fields, (list, tuple)): + fields = [fields] + for field in fields: + bbox.add_field(field, self.get_field(field)) + return bbox + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_boxes={}, ".format(len(self)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s + + +if __name__ == "__main__": + bbox = 
BoxList([[0, 0, 10, 10], [0, 0, 5, 5]], (10, 10)) + s_bbox = bbox.resize((5, 5)) + print(s_bbox) + print(s_bbox.bbox) + + t_bbox = bbox.transpose(0) + print(t_bbox) + print(t_bbox.bbox) diff --git a/maskrcnn_benchmark/structures/boxlist_ops.py b/maskrcnn_benchmark/structures/boxlist_ops.py new file mode 100644 index 000000000..45160f9ab --- /dev/null +++ b/maskrcnn_benchmark/structures/boxlist_ops.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .bounding_box import BoxList + +from maskrcnn_benchmark.layers import nms as _box_nms + + +def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="score"): + """ + Performs non-maximum suppression on a boxlist, with scores specified + in a boxlist field via score_field. + + Arguments: + boxlist(BoxList) + nms_thresh (float) + max_proposals (int): if > 0, then only the top max_proposals are kept + after non-maxium suppression + score_field (str) + """ + if nms_thresh <= 0: + return boxlist + mode = boxlist.mode + boxlist = boxlist.convert("xyxy") + boxes = boxlist.bbox + score = boxlist.get_field(score_field) + keep = _box_nms(boxes, score, nms_thresh) + if max_proposals > 0: + keep = keep[: max_proposals] + boxlist = boxlist[keep] + return boxlist.convert(mode) + + +def remove_small_boxes(boxlist, min_size): + """ + Only keep boxes with both sides >= min_size + + Arguments: + boxlist (Boxlist) + min_size (int) + """ + # TODO maybe add an API for querying the ws / hs + xywh_boxes = boxlist.convert("xywh").bbox + _, _, ws, hs = xywh_boxes.unbind(dim=1) + keep = ( + (ws >= min_size) & (hs >= min_size) + ).nonzero().squeeze(1) + return boxlist[keep] + + +# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py +# with slight modifications +def boxlist_iou(boxlist1, boxlist2): + """Compute the intersection over union of two set of boxes. + The box order must be (xmin, ymin, xmax, ymax). 
+ + Arguments: + box1: (BoxList) bounding boxes, sized [N,4]. + box2: (BoxList) bounding boxes, sized [M,4]. + + Returns: + (tensor) iou, sized [N,M]. + + Reference: + https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py + """ + if boxlist1.size != boxlist2.size: + raise RuntimeError( + "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) + + N = len(boxlist1) + M = len(boxlist2) + + area1 = boxlist1.area() + area2 = boxlist2.area() + + box1, box2 = boxlist1.bbox, boxlist2.bbox + + lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] + rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] + + TO_REMOVE = 1 + + wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + iou = inter / (area1[:, None] + area2 - inter) + return iou + + +# TODO redundant, remove +def _cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) + + +def cat_boxlist(bboxes): + """ + Concatenates a list of BoxList (having the same image size) into a + single BoxList + + Arguments: + bboxes (list[BoxList]) + """ + assert isinstance(bboxes, (list, tuple)) + assert all(isinstance(bbox, BoxList) for bbox in bboxes) + + size = bboxes[0].size + assert all(bbox.size == size for bbox in bboxes) + + mode = bboxes[0].mode + assert all(bbox.mode == mode for bbox in bboxes) + + fields = set(bboxes[0].fields()) + assert all(set(bbox.fields()) == fields for bbox in bboxes) + + cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) + + for field in fields: + data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) + cat_boxes.add_field(field, data) + + return cat_boxes diff --git a/maskrcnn_benchmark/structures/image_list.py b/maskrcnn_benchmark/structures/image_list.py new file mode 100644 
index 000000000..c45c1f039 --- /dev/null +++ b/maskrcnn_benchmark/structures/image_list.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +class ImageList(object): + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size, + and storing in a field the original sizes of each image + """ + + def __init__(self, tensors, image_sizes): + """ + Arguments: + tensors (tensor) + image_sizes (list[tuple[int, int]]) + """ + self.tensors = tensors + self.image_sizes = image_sizes + + def to(self, *args, **kwargs): + cast_tensor = self.tensors.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + +def to_image_list(tensors, size_divisible=0): + """ + tensors can be an ImageList, a torch.Tensor or + an iterable of Tensors. It can't be a numpy array. + When tensors is an iterable of Tensors, it pads + the Tensors with zeros so that they have the same + shape + """ + if isinstance(tensors, torch.Tensor) and size_divisible > 0: + tensors = [tensors] + + if isinstance(tensors, ImageList): + return tensors + elif isinstance(tensors, torch.Tensor): + # single tensor shape can be inferred + assert tensors.dim() == 4 + image_sizes = [tensor.shape[-2:] for tensor in tensors] + return ImageList(tensors, image_sizes) + elif isinstance(tensors, (tuple, list)): + max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) + + # TODO Ideally, just remove this and let me model handle arbitrary + # input sizs + if size_divisible > 0: + import math + + stride = size_divisible + max_size = list(max_size) + max_size[1] = int(math.ceil(max_size[1] / stride) * stride) + max_size[2] = int(math.ceil(max_size[2] / stride) * stride) + max_size = tuple(max_size) + + batch_shape = (len(tensors),) + max_size + batched_imgs = tensors[0].new(*batch_shape).zero_() + for img, pad_img in zip(tensors, batched_imgs): + pad_img[: 
img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + + image_sizes = [im.shape[-2:] for im in tensors] + + return ImageList(batched_imgs, image_sizes) + else: + raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) diff --git a/maskrcnn_benchmark/structures/segmentation_mask.py b/maskrcnn_benchmark/structures/segmentation_mask.py new file mode 100644 index 000000000..ba1290b91 --- /dev/null +++ b/maskrcnn_benchmark/structures/segmentation_mask.py @@ -0,0 +1,214 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +import pycocotools.mask as mask_utils + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +class Mask(object): + """ + This class is unfinished and not meant for use yet + It is supposed to contain the mask for an object as + a 2d tensor + """ + + def __init__(self, masks, size, mode): + self.masks = masks + self.size = size + self.mode = mode + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + width, height = self.size + if method == FLIP_LEFT_RIGHT: + dim = width + idx = 2 + elif method == FLIP_TOP_BOTTOM: + dim = height + idx = 1 + + flip_idx = list(range(dim)[::-1]) + flipped_masks = self.masks.index_select(dim, flip_idx) + return Mask(flipped_masks, self.size, self.mode) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + + cropped_masks = self.masks[:, box[1] : box[3], box[0] : box[2]] + return Mask(cropped_masks, size=(w, h), mode=self.mode) + + def resize(self, size, *args, **kwargs): + pass + + +class Polygons(object): + """ + This class holds a set of polygons that represents a single instance + of an object mask. 
The object can be represented as a set of + polygons + """ + + def __init__(self, polygons, size, mode): + # assert isinstance(polygons, list), '{}'.format(polygons) + if isinstance(polygons, list): + polygons = [torch.as_tensor(p, dtype=torch.float32) for p in polygons] + elif isinstance(polygons, Polygons): + polygons = polygons.polygons + + self.polygons = polygons + self.size = size + self.mode = mode + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped_polygons = [] + width, height = self.size + if method == FLIP_LEFT_RIGHT: + dim = width + idx = 0 + elif method == FLIP_TOP_BOTTOM: + dim = height + idx = 1 + + for poly in self.polygons: + p = poly.clone() + TO_REMOVE = 1 + p[idx::2] = dim - poly[idx::2] - TO_REMOVE + flipped_polygons.append(p) + + return Polygons(flipped_polygons, size=self.size, mode=self.mode) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + + # TODO chck if necessary + w = max(w, 1) + h = max(h, 1) + + cropped_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] = p[0::2] - box[0] # .clamp(min=0, max=w) + p[1::2] = p[1::2] - box[1] # .clamp(min=0, max=h) + cropped_polygons.append(p) + + return Polygons(cropped_polygons, size=(w, h), mode=self.mode) + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_polys = [p * ratio for p in self.polygons] + return Polygons(scaled_polys, size, mode=self.mode) + + ratio_w, ratio_h = ratios + scaled_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] *= ratio_w + p[1::2] *= ratio_h + scaled_polygons.append(p) + + return Polygons(scaled_polygons, size=size, mode=self.mode) + + def convert(self, mode): + width, height = self.size + if mode == "mask": + rles = mask_utils.frPyObjects( + 
[p.numpy() for p in self.polygons], height, width + ) + rle = mask_utils.merge(rles) + mask = mask_utils.decode(rle) + mask = torch.from_numpy(mask) + # TODO add squeeze? + return mask + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_polygons={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s + + +class SegmentationMask(object): + """ + This class stores the segmentations for all objects in the image + """ + + def __init__(self, polygons, size, mode=None): + """ + Arguments: + polygons: a list of list of lists of numbers. The first + level of the list correspond to individual instances, + the second level to all the polygons that compose the + object, and the third level to the polygon coordinates. + """ + assert isinstance(polygons, list) + + self.polygons = [Polygons(p, size, mode) for p in polygons] + self.size = size + self.mode = mode + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped = [] + for polygon in self.polygons: + flipped.append(polygon.transpose(method)) + return SegmentationMask(flipped, size=self.size, mode=self.mode) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + cropped = [] + for polygon in self.polygons: + cropped.append(polygon.crop(box)) + return SegmentationMask(cropped, size=(w, h), mode=self.mode) + + def resize(self, size, *args, **kwargs): + scaled = [] + for polygon in self.polygons: + scaled.append(polygon.resize(size, *args, **kwargs)) + return SegmentationMask(scaled, size=size, mode=self.mode) + + def to(self, *args, **kwargs): + return self + + def __getitem__(self, item): + if isinstance(item, (int, slice)): + selected_polygons = [self.polygons[item]] + else: + # advanced indexing on a single dimension + selected_polygons = [] + 
if isinstance(item, torch.Tensor) and item.dtype == torch.uint8: + item = item.nonzero() + item = item.squeeze(1) if item.numel() > 0 else item + item = item.tolist() + for i in item: + selected_polygons.append(self.polygons[i]) + return SegmentationMask(selected_polygons, size=self.size, mode=self.mode) + + def __iter__(self): + return iter(self.polygons) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={})".format(self.size[1]) + return s diff --git a/maskrcnn_benchmark/utils/README.md b/maskrcnn_benchmark/utils/README.md new file mode 100644 index 000000000..9765b24a7 --- /dev/null +++ b/maskrcnn_benchmark/utils/README.md @@ -0,0 +1,5 @@ +# Utility functions + +This folder contain utility functions that are not used in the +core library, but are useful for building models or training +code using the config system. diff --git a/maskrcnn_benchmark/utils/__init__.py b/maskrcnn_benchmark/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/maskrcnn_benchmark/utils/c2_model_loading.py b/maskrcnn_benchmark/utils/c2_model_loading.py new file mode 100644 index 000000000..3057a04fb --- /dev/null +++ b/maskrcnn_benchmark/utils/c2_model_loading.py @@ -0,0 +1,142 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import logging +import pickle +from collections import OrderedDict + +import torch + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict + + +def _rename_basic_resnet_weights(layer_keys): + layer_keys = [k.replace("_", ".") for k in layer_keys] + layer_keys = [k.replace(".w", ".weight") for k in layer_keys] + layer_keys = [k.replace(".bn", "_bn") for k in layer_keys] + layer_keys = [k.replace(".b", ".bias") for k in layer_keys] + layer_keys = [k.replace("_bn.s", "_bn.scale") for k in layer_keys] + layer_keys = [k.replace(".biasranch", ".branch") for k in layer_keys] + layer_keys = [k.replace("bbox.pred", "bbox_pred") for k in layer_keys] + layer_keys = [k.replace("cls.score", "cls_score") for k in layer_keys] + layer_keys = [k.replace("res.conv1_", "conv1_") for k in layer_keys] + + # RPN / Faster RCNN + layer_keys = [k.replace(".biasbox", ".bbox") for k in layer_keys] + layer_keys = [k.replace("conv.rpn", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox.pred", "rpn.bbox_pred") for k in layer_keys] + layer_keys = [k.replace("rpn.cls.logits", "rpn.cls_logits") for k in layer_keys] + + # Affine-Channel -> BatchNorm enaming + layer_keys = [k.replace("_bn.scale", "_bn.weight") for k in layer_keys] + + # Make torchvision-compatible + layer_keys = [k.replace("conv1_bn.", "bn1.") for k in layer_keys] + + layer_keys = [k.replace("res2.", "layer1.") for k in layer_keys] + layer_keys = [k.replace("res3.", "layer2.") for k in layer_keys] + layer_keys = [k.replace("res4.", "layer3.") for k in layer_keys] + layer_keys = [k.replace("res5.", "layer4.") for k in layer_keys] + + layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] + layer_keys = [k.replace(".branch2a_bn.", ".bn1.") for k in layer_keys] + layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] + layer_keys = [k.replace(".branch2b_bn.", ".bn2.") for k in layer_keys] + layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] + 
layer_keys = [k.replace(".branch2c_bn.", ".bn3.") for k in layer_keys] + + layer_keys = [k.replace(".branch1.", ".downsample.0.") for k in layer_keys] + layer_keys = [k.replace(".branch1_bn.", ".downsample.1.") for k in layer_keys] + + return layer_keys + +def _rename_fpn_weights(layer_keys, stage_names): + for mapped_idx, stage_name in enumerate(stage_names, 1): + suffix = "" + if mapped_idx < 4: + suffix = ".lateral" + layer_keys = [ + k.replace("fpn.inner.layer{}.sum{}".format(stage_name, suffix), "fpn_inner{}".format(mapped_idx)) for k in layer_keys + ] + layer_keys = [k.replace("fpn.layer{}.sum".format(stage_name), "fpn_layer{}".format(mapped_idx)) for k in layer_keys] + + + layer_keys = [k.replace("rpn.conv.fpn2", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox_pred.fpn2", "rpn.bbox_pred") for k in layer_keys] + layer_keys = [ + k.replace("rpn.cls_logits.fpn2", "rpn.cls_logits") for k in layer_keys + ] + + return layer_keys + + +def _rename_weights_for_resnet(weights, stage_names): + original_keys = sorted(weights.keys()) + layer_keys = sorted(weights.keys()) + + # for X-101, rename output to fc1000 to avoid conflicts afterwards + layer_keys = [k if k != "pred_b" else "fc1000_b" for k in layer_keys] + layer_keys = [k if k != "pred_w" else "fc1000_w" for k in layer_keys] + + # performs basic renaming: _ -> . 
, etc + layer_keys = _rename_basic_resnet_weights(layer_keys) + + # FPN + layer_keys = _rename_fpn_weights(layer_keys, stage_names) + + # Mask R-CNN + layer_keys = [k.replace("mask.fcn.logits", "mask_fcn_logits") for k in layer_keys] + layer_keys = [k.replace(".[mask].fcn", "mask_fcn") for k in layer_keys] + layer_keys = [k.replace("conv5.mask", "conv5_mask") for k in layer_keys] + + # Keypoint R-CNN + layer_keys = [k.replace("kps.score.lowres", "kps_score_lowres") for k in layer_keys] + layer_keys = [k.replace("kps.score", "kps_score") for k in layer_keys] + layer_keys = [k.replace("conv.fcn", "conv_fcn") for k in layer_keys] + + # Rename for our RPN structure + layer_keys = [k.replace("rpn.", "rpn.head.") for k in layer_keys] + + key_map = {k: v for k, v in zip(original_keys, layer_keys)} + + logger = logging.getLogger(__name__) + logger.info("Remapping C2 weights") + max_c2_key_size = max([len(k) for k in original_keys if "_momentum" not in k]) + + new_weights = OrderedDict() + for k in original_keys: + v = weights[k] + if "_momentum" in k: + continue + # if 'fc1000' in k: + # continue + w = torch.from_numpy(v) + # if "bn" in k: + # w = w.view(1, -1, 1, 1) + logger.info("C2 name: {: <{}} mapped name: {}".format(k, max_c2_key_size, key_map[k])) + new_weights[key_map[k]] = w + + return new_weights + + +def _load_c2_pickled_weights(file_path): + with open(file_path, "rb") as f: + data = pickle.load(f, encoding="latin1") + if "blobs" in data: + weights = data["blobs"] + else: + weights = data + return weights + + +_C2_STAGE_NAMES = { + "R-50": ["1.2", "2.3", "3.5", "4.2"], + "R-101": ["1.2", "2.3", "3.22", "4.2"], +} + +def load_c2_format(cfg, f): + # TODO make it support other architectures + state_dict = _load_c2_pickled_weights(f) + conv_body = cfg.MODEL.BACKBONE.CONV_BODY + arch = conv_body.replace("-C4", "").replace("-FPN", "") + stages = _C2_STAGE_NAMES[arch] + state_dict = _rename_weights_for_resnet(state_dict, stages) + return dict(model=state_dict) diff 
class Checkpointer(object):
    """Save and restore a model together with optimizer / scheduler state.

    Checkpoints are written to ``<save_dir>/<name>.pth``. The path of the most
    recent one is recorded in the text file ``<save_dir>/last_checkpoint`` and
    takes precedence over the file passed to ``load``.

    Arguments:
        model: object exposing ``state_dict()``.
        optimizer (optional): saved / restored when not None.
        scheduler (optional): saved / restored when not None.
        save_dir (str): directory that holds the checkpoints.
        save_to_disk: checkpoints are only written when this is truthy.
        logger (optional): defaults to the logger of this module.
    """

    def __init__(
        self,
        model,
        optimizer=None,
        scheduler=None,
        save_dir="",
        save_to_disk=None,
        logger=None,
    ):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.save_dir = save_dir
        self.save_to_disk = save_to_disk
        if logger is None:
            logger = logging.getLogger(__name__)
        self.logger = logger

    def save(self, name, **kwargs):
        """Write ``<save_dir>/<name>.pth``; extra ``kwargs`` are stored alongside.

        Does nothing when ``save_dir`` is empty or ``save_to_disk`` is falsy.
        """
        if not self.save_dir:
            return

        if not self.save_to_disk:
            return

        data = {}
        data["model"] = self.model.state_dict()
        if self.optimizer is not None:
            data["optimizer"] = self.optimizer.state_dict()
        if self.scheduler is not None:
            data["scheduler"] = self.scheduler.state_dict()
        data.update(kwargs)

        save_file = os.path.join(self.save_dir, "{}.pth".format(name))
        self.logger.info("Saving checkpoint to {}".format(save_file))
        torch.save(data, save_file)
        self.tag_last_checkpoint(save_file)

    def load(self, f=None):
        """Restore state and return whatever else the checkpoint contained.

        Arguments:
            f (str, optional): checkpoint to load when ``save_dir`` has no
                ``last_checkpoint`` record.

        Returns:
            dict: the remaining checkpoint entries, or ``{}`` when there is
            nothing to load.
        """
        if self.has_checkpoint():
            # override argument with existing checkpoint
            f = self.get_checkpoint_file()
        if not f:
            # no checkpoint could be found
            self.logger.info("No checkpoint found. Initializing model from scratch")
            return {}
        self.logger.info("Loading checkpoint from {}".format(f))
        checkpoint = self._load_file(f)
        self._load_model(checkpoint)
        if "optimizer" in checkpoint and self.optimizer:
            self.logger.info("Loading optimizer from {}".format(f))
            self.optimizer.load_state_dict(checkpoint.pop("optimizer"))
        if "scheduler" in checkpoint and self.scheduler:
            self.logger.info("Loading scheduler from {}".format(f))
            self.scheduler.load_state_dict(checkpoint.pop("scheduler"))

        # return any further checkpoint data
        return checkpoint

    def has_checkpoint(self):
        """Return True when ``<save_dir>/last_checkpoint`` exists."""
        save_file = os.path.join(self.save_dir, "last_checkpoint")
        return os.path.exists(save_file)

    def get_checkpoint_file(self):
        """Return the recorded path of the last checkpoint, or "" if unreadable."""
        save_file = os.path.join(self.save_dir, "last_checkpoint")
        try:
            with open(save_file, "r") as f:
                # strip surrounding whitespace: a trailing newline (e.g. after
                # the file was edited by hand) would otherwise become part of
                # the path and make the checkpoint impossible to open
                last_saved = f.read().strip()
        except IOError:
            # if file doesn't exist, maybe because it has just been
            # deleted by a separate process
            last_saved = ""
        return last_saved

    def tag_last_checkpoint(self, last_filename):
        """Record ``last_filename`` as the most recent checkpoint."""
        save_file = os.path.join(self.save_dir, "last_checkpoint")
        with open(save_file, "w") as f:
            f.write(last_filename)

    def _load_file(self, f):
        # always deserialize to CPU
        return torch.load(f, map_location=torch.device("cpu"))

    def _load_model(self, checkpoint):
        # removes the "model" entry from the checkpoint dict
        load_state_dict(self.model, checkpoint.pop("model"))
def get_pil_version():
    """Return the installed Pillow version as an extra line for the env report."""
    version = PIL.__version__
    return "\n Pillow ({})".format(version)


def collect_env_info():
    """Return PyTorch's environment report followed by the Pillow version line."""
    return get_pretty_env_info() + get_pil_version()
def _dist():
    """Return the distributed API namespace to use.

    This module was written against ``torch.distributed.deprecated``. PyTorch
    builds that do not provide that namespace get ``torch.distributed``
    instead, which offers the same ``is_initialized`` / ``get_world_size`` /
    ``get_rank`` / ``broadcast`` entry points.
    """
    return getattr(torch.distributed, "deprecated", torch.distributed)


def _dist_is_initialized():
    """Return True when a process group has been initialized."""
    is_initialized = getattr(_dist(), "is_initialized", None)
    return is_initialized is not None and bool(is_initialized())


def get_world_size():
    """Return the number of processes, or 1 outside distributed mode."""
    if not _dist_is_initialized():
        return 1
    return _dist().get_world_size()


def is_main_process():
    """Return True for rank 0, and always outside distributed mode."""
    if not _dist_is_initialized():
        return True
    return _dist().get_rank() == 0


def synchronize():
    """
    Helper function to synchronize between multiple processes when
    using distributed training
    """
    if not _dist_is_initialized():
        return
    dist = _dist()
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size == 1:
        return

    def _send_and_wait(r):
        # rank r broadcasts a 0; every other rank starts from a 1 and is
        # released once the broadcast overwrote it
        if rank == r:
            tensor = torch.tensor(0, device="cuda")
        else:
            tensor = torch.tensor(1, device="cuda")
        dist.broadcast(tensor, r)
        while tensor.item() == 1:
            time.sleep(1)

    _send_and_wait(0)
    # now sync on the main process
    _send_and_wait(1)


def _encode(encoded_data, data):
    """Pickle ``data`` into ``encoded_data``: one size byte, then the payload."""
    # gets a byte representation for the data
    encoded_bytes = pickle.dumps(data)
    # convert this byte string into a byte tensor
    storage = torch.ByteStorage.from_buffer(encoded_bytes)
    tensor = torch.ByteTensor(storage).to("cuda")
    # encoding: first byte is the size and then rest is the data
    s = tensor.numel()
    assert s <= 255, "Can't encode data greater than 255 bytes"
    # put the encoded data in encoded_data
    encoded_data[0] = s
    encoded_data[1 : (s + 1)] = tensor


def _decode(encoded_data):
    """Inverse of ``_encode``: return the object pickled into ``encoded_data``."""
    # read the size as a plain int: doing "size + 1" on a uint8 tensor element
    # wraps around to 0 for a 255-byte payload and yields an empty slice
    size = int(encoded_data[0])
    encoded_tensor = encoded_data[1 : (size + 1)].to("cpu")
    return pickle.loads(bytearray(encoded_tensor.tolist()))
+ + This function is useful for retrieving data from multiple processes, + when launching the code with torch.distributed.launch + + Note: this function is slow and should not be used in tight loops, i.e., + do not use it in the training loop. + + Arguments: + data: the object to be gathered from multiple processes. + It must be serializable + + Returns: + result (list): a list with as many elements as there are processes, + where each element i in the list corresponds to the data that was + gathered from the process of rank i. + """ + # strategy: the main process creates a temporary directory, and communicates + # the location of the temporary directory to all other processes. + # each process will then serialize the data to the folder defined by + # the main process, and then the main process reads all of the serialized + # files and returns them in a list + if not torch.distributed.deprecated.is_initialized(): + return [data] + synchronize() + # get rank of the current process + rank = torch.distributed.deprecated.get_rank() + + # the data to communicate should be small + data_to_communicate = torch.empty(256, dtype=torch.uint8, device="cuda") + if rank == 0: + # manually creates a temporary directory, that needs to be cleaned + # afterwards + tmp_dir = tempfile.mkdtemp() + _encode(data_to_communicate, tmp_dir) + + synchronize() + # the main process (rank=0) communicates the data to all processes + torch.distributed.deprecated.broadcast(data_to_communicate, 0) + + # get the data that was communicated + tmp_dir = _decode(data_to_communicate) + + # each process serializes to a different file + file_template = "file{}.pth" + tmp_file = os.path.join(tmp_dir, file_template.format(rank)) + torch.save(data, tmp_file) + + # synchronize before loading the data + synchronize() + + # only the master process returns the data + if rank == 0: + data_list = [] + world_size = torch.distributed.deprecated.get_world_size() + for r in range(world_size): + file_path = 
def setup_environment():
    """Perform environment setup work. The default setup is a no-op, but this
    function allows the user to specify a Python source file that performs
    custom setup work that may be necessary to their computing environment.

    The file is taken from the ``TORCH_DETECTRON_ENV_MODULE`` environment
    variable; nothing happens when it is unset or empty.
    """
    custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE")
    if custom_module_path:
        setup_custom_environment(custom_module_path)


def setup_custom_environment(custom_module_path):
    """Load custom environment setup from a Python source file and run the setup
    function.

    Arguments:
        custom_module_path (str): path of the Python file to import.

    Raises:
        AssertionError: if the module has no callable ``setup_environment``.
    """
    module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path)
    hook = getattr(module, "setup_environment", None)
    # raised explicitly instead of using ``assert`` so that the check is not
    # stripped when Python runs with -O
    if not callable(hook):
        raise AssertionError(
            (
                "Custom environment module defined in {} does not have the "
                "required callable attribute 'setup_environment'."
            ).format(custom_module_path)
        )
    hook()
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20):
        # the most recent ``window_size`` values
        self.deque = deque(maxlen=window_size)
        # every value ever seen
        self.series = []
        self.total = 0.0
        self.count = 0

    def update(self, value):
        """Record one more value."""
        self.deque.append(value)
        self.series.append(value)
        self.count += 1
        self.total += value

    @property
    def median(self):
        """Median of the values in the current window."""
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        """Mean of the values in the current window."""
        d = torch.tensor(list(self.deque))
        return d.mean().item()

    @property
    def global_avg(self):
        """Mean of all values seen so far."""
        return self.total / self.count


class MetricLogger(object):
    """Collection of named ``SmoothedValue`` meters.

    Meters are created on first use by ``update`` and can be read back as
    attributes, e.g. ``logger.loss``.

    Arguments:
        delimiter (str): separator placed between meters by ``str()``.
    """

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Feed one value per keyword into the meter of the same name.

        Tensors are converted with ``.item()``; values must be int or float.
        """
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # Only called when normal attribute lookup fails. Go through __dict__
        # so that an instance whose ``meters`` is not set yet (e.g. while being
        # copied or unpickled) raises AttributeError instead of recursing.
        meters = self.__dict__.get("meters")
        if meters is not None and attr in meters:
            return meters[attr]
        raise AttributeError(
            "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
        )

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            # window median first, global average in parentheses
            loss_str.append(
                "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg)
            )
        return self.delimiter.join(loss_str)
def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
    """
    Strategy: suppose that the models that we will create will have prefixes appended
    to each of its keys, for example due to an extra level of nesting that the original
    pre-trained weights from ImageNet won't contain. For example, model.state_dict()
    might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains
    res2.conv1.weight. We thus want to match both parameters together.
    For that, we look for each model weight, look among all loaded keys if there is one
    that is a suffix of the current weight name, and use it if that's the case.
    If multiple matches exist, take the one with longest size
    of the corresponding name. For example, for the same model as before, the pretrained
    weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case,
    we want to match backbone[0].body.conv1.weight to conv1.weight, and
    backbone[0].body.res2.conv1.weight to res2.conv1.weight.

    ``model_state_dict`` is updated in place; entries without a match are left
    untouched. Nothing is returned.
    """
    current_keys = sorted(model_state_dict.keys())
    loaded_keys = sorted(loaded_state_dict.keys())
    # nothing to align (also avoids reducing over an empty set of candidates)
    if not current_keys or not loaded_keys:
        return

    # used for logging
    max_size = max(len(key) for key in current_keys)
    max_size_loaded = max(len(key) for key in loaded_keys)
    log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
    logger = logging.getLogger(__name__)
    for key in current_keys:
        # loaded names that are a (non-empty) suffix of the model name
        candidates = [k for k in loaded_keys if k and key.endswith(k)]
        if not candidates:
            continue
        # the longest suffix is the most specific match
        key_old = max(candidates, key=len)
        model_state_dict[key] = loaded_state_dict[key_old]
        logger.info(
            log_str_template.format(
                key,
                max_size,
                key_old,
                max_size_loaded,
                tuple(loaded_state_dict[key_old].shape),
            )
        )


def strip_prefix_if_present(state_dict, prefix):
    """Return ``state_dict`` with ``prefix`` removed from the start of every key.

    The input is returned unchanged unless every key starts with ``prefix``.
    Only the leading occurrence is removed, so a name such as
    ``module.block.module.weight`` keeps its inner ``module.``.
    """
    keys = sorted(state_dict.keys())
    if not all(key.startswith(prefix) for key in keys):
        return state_dict
    stripped_state_dict = OrderedDict()
    for key, value in state_dict.items():
        stripped_state_dict[key[len(prefix):]] = value
    return stripped_state_dict
def cache_url(url, model_dir=None, progress=True):
    r"""Download the file at the given URL into a local cache and return its path.

    If the file is already present in `model_dir`, it is not downloaded again.
    The filename part of the URL should follow the naming convention
    ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
    digits of the SHA256 hash of the contents of the file. The hash is used to
    ensure unique names and to verify the contents of the file.
    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.
    Only the main process downloads; all processes synchronize before returning.
    Args:
        url (string): URL of the object to download
        model_dir (string, optional): directory in which to save the object
        progress (bool, optional): whether or not to display a progress bar to stderr
    Returns:
        string: path of the cached file
    Example:
        >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    """
    if model_dir is None:
        torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
        model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models'))
    # every process runs this line; exist_ok avoids the check-then-create race
    # in which a second process fails because the first just made the directory
    os.makedirs(model_dir, exist_ok=True)
    parts = urlparse(url)
    filename = os.path.basename(parts.path)
    if filename == "model_final.pkl":
        # workaround as pre-trained Caffe2 models from Detectron have all the same filename
        # so make the full path the filename by replacing / with _
        filename = parts.path.replace("/", "_")
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file) and is_main_process():
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = HASH_REGEX.search(filename)
        if hash_prefix is not None:
            hash_prefix = hash_prefix.group(1)
            # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
            # which matches the hash PyTorch uses. So we skip the hash matching
            # if the hash_prefix is less than 6 characters
            if len(hash_prefix) < 6:
                hash_prefix = None
        _download_url_to_file(url, cached_file, hash_prefix, progress=progress)
    # non-main processes wait here until the download has finished
    synchronize()
    return cached_file
def get_extensions():
    """Describe the ``maskrcnn_benchmark._C`` extension for setuptools.

    Sources are collected from ``maskrcnn_benchmark/csrc`` next to this file:
    the top-level and ``cpu/`` ``*.cpp`` files always, the ``cuda/`` ``*.cu``
    files only when CUDA is available and ``CUDA_HOME`` is set.

    Returns:
        list: a single extension object (CUDA or plain C++).
    """
    here = os.path.dirname(os.path.abspath(__file__))
    csrc_dir = os.path.join(here, "maskrcnn_benchmark", "csrc")

    top_level_sources = glob.glob(os.path.join(csrc_dir, "*.cpp"))
    cpu_sources = glob.glob(os.path.join(csrc_dir, "cpu", "*.cpp"))
    cuda_sources = glob.glob(os.path.join(csrc_dir, "cuda", "*.cu"))

    source_files = top_level_sources + cpu_sources
    extension_cls = CppExtension
    compile_args = {"cxx": []}
    macros = []

    with_cuda = torch.cuda.is_available() and CUDA_HOME is not None
    if with_cuda:
        extension_cls = CUDAExtension
        source_files += cuda_sources
        macros += [("WITH_CUDA", None)]
        compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]

    source_files = [os.path.join(csrc_dir, path) for path in source_files]

    return [
        extension_cls(
            "maskrcnn_benchmark._C",
            source_files,
            include_dirs=[csrc_dir],
            define_macros=macros,
            extra_compile_args=compile_args,
        )
    ]
class TestCheckpointer(unittest.TestCase):
    """Round-trip tests for ``Checkpointer`` and for ``load_state_dict``."""

    def create_model(self):
        layers = [nn.Linear(2, 3), nn.Linear(3, 1)]
        return nn.Sequential(*layers)

    def create_complex_model(self):
        # module tree whose parameter names match the stored keys by suffix
        model = nn.Module()
        model.block1 = nn.Module()
        model.block1.layer1 = nn.Linear(2, 3)
        model.layer2 = nn.Linear(3, 2)
        model.res = nn.Module()
        model.res.layer2 = nn.Linear(3, 2)

        stored = OrderedDict()
        for name, shape in [
            ("layer1.weight", (3, 2)),
            ("layer1.bias", (3,)),
            ("layer2.weight", (2, 3)),
            ("layer2.bias", (2,)),
            ("res.layer2.weight", (2, 3)),
            ("res.layer2.bias", (2,)),
        ]:
            stored[name] = torch.rand(*shape)

        return model, stored

    def _model_pairs(self):
        # (trained, fresh) combinations with and without the DataParallel prefix
        return [
            (self.create_model(), self.create_model()),
            (nn.DataParallel(self.create_model()), self.create_model()),
            (self.create_model(), nn.DataParallel(self.create_model())),
            (
                nn.DataParallel(self.create_model()),
                nn.DataParallel(self.create_model()),
            ),
        ]

    def _assert_copied(self, expected_tensors, actual_tensors):
        for expected, actual in zip(expected_tensors, actual_tensors):
            # different tensor references
            self.assertFalse(id(expected) == id(actual))
            # same content
            self.assertTrue(expected.equal(actual))

    def test_from_last_checkpoint_model(self):
        # test that loading works even if they differ by a prefix
        for trained_model, fresh_model in self._model_pairs():
            with TemporaryDirectory() as f:
                checkpointer = Checkpointer(
                    trained_model, save_dir=f, save_to_disk=True
                )
                checkpointer.save("checkpoint_file")

                # in the same folder
                fresh_checkpointer = Checkpointer(fresh_model, save_dir=f)
                self.assertTrue(fresh_checkpointer.has_checkpoint())
                self.assertEqual(
                    fresh_checkpointer.get_checkpoint_file(),
                    os.path.join(f, "checkpoint_file.pth"),
                )
                _ = fresh_checkpointer.load()

            self._assert_copied(trained_model.parameters(), fresh_model.parameters())

    def test_from_name_file_model(self):
        # test that loading works even if they differ by a prefix
        for trained_model, fresh_model in self._model_pairs():
            with TemporaryDirectory() as f:
                checkpointer = Checkpointer(
                    trained_model, save_dir=f, save_to_disk=True
                )
                checkpointer.save("checkpoint_file")

                # on different folders
                with TemporaryDirectory() as g:
                    fresh_checkpointer = Checkpointer(fresh_model, save_dir=g)
                    self.assertFalse(fresh_checkpointer.has_checkpoint())
                    self.assertEqual(fresh_checkpointer.get_checkpoint_file(), "")
                    _ = fresh_checkpointer.load(os.path.join(f, "checkpoint_file.pth"))

            self._assert_copied(trained_model.parameters(), fresh_model.parameters())

    def test_complex_model_loaded(self):
        for add_data_parallel in [False, True]:
            model, state_dict = self.create_complex_model()
            if add_data_parallel:
                model = nn.DataParallel(model)

            load_state_dict(model, state_dict)
            self._assert_copied(model.state_dict().values(), state_dict.values())
class TestGroupedBatchSampler(unittest.TestCase):
    """Checks batch composition, ordering and length of ``GroupedBatchSampler``."""

    # group assignment shared by the ten-element fixtures below
    GROUP_IDS = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0]

    def _batches(self, sampler, batch_size, drop_uneven, group_ids=None):
        if group_ids is None:
            group_ids = list(self.GROUP_IDS)
        return list(GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven))

    def test_respect_order_simple(self):
        dataset = list(range(40))
        group_ids = [i // 10 for i in dataset]
        sampler = SequentialSampler(dataset)
        for batch_size in [1, 3, 5, 6]:
            result = self._batches(sampler, batch_size, False, group_ids)
            merged_result = list(itertools.chain.from_iterable(result))
            self.assertEqual(merged_result, dataset)

    def test_respect_order(self):
        sampler = SequentialSampler(list(range(10)))
        expected_by_batch_size = [
            (1, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]),
            (3, [[0, 1, 3], [2, 4, 5], [6, 9], [7, 8]]),
            (4, [[0, 1, 3, 6], [2, 4, 5, 7], [8], [9]]),
        ]
        for batch_size, expected in expected_by_batch_size:
            self.assertEqual(self._batches(sampler, batch_size, False), expected)

    def test_respect_order_drop_uneven(self):
        sampler = SequentialSampler(list(range(10)))
        result = self._batches(sampler, 3, True)
        self.assertEqual(result, [[0, 1, 3], [2, 4, 5]])

    def test_subset_sampler(self):
        sampler = SubsetSampler([0, 3, 5, 6, 7, 8])
        result = self._batches(sampler, 3, False)
        self.assertEqual(result, [[0, 3, 6], [5, 7, 8]])

    def test_permute_subset_sampler(self):
        sampler = SubsetSampler([5, 0, 6, 1, 3, 8])
        result = self._batches(sampler, 3, False)
        self.assertEqual(result, [[5, 8], [0, 6, 1], [3]])

    def test_permute_subset_sampler_drop_uneven(self):
        sampler = SubsetSampler([5, 0, 6, 1, 3, 8])
        result = self._batches(sampler, 3, True)
        self.assertEqual(result, [[0, 6, 1]])

    def test_len(self):
        batch_size = 3
        drop_uneven = True
        dataset = list(range(10))
        group_ids = [random.randint(0, 1) for _ in dataset]
        sampler = RandomSampler(dataset)

        # length queried only after iterating
        batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven)
        result = list(batch_sampler)
        self.assertEqual(len(result), len(batch_sampler))
        self.assertEqual(len(result), len(batch_sampler))

        # length queried before iterating, and again afterwards
        batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven)
        batch_sampler_len = len(batch_sampler)
        result = list(batch_sampler)
        self.assertEqual(len(result), batch_sampler_len)
        self.assertEqual(len(result), len(batch_sampler))
def main():
    """Run inference on every dataset in ``cfg.DATASETS.TEST``.

    Parses the command line, initializes distributed mode when ``WORLD_SIZE``
    is greater than 1, builds the model, loads ``cfg.MODEL.WEIGHT`` and
    evaluates each test dataset in turn.
    """
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.deprecated.init_process_group(
            backend="nccl", init_method="env://"
        )

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # empty save_dir: log to stdout only
    save_dir = ""
    logger = setup_logger("maskrcnn_benchmark", save_dir, args.local_rank)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)

    checkpointer = DetectronCheckpointer(cfg, model)
    _ = checkpointer.load(cfg.MODEL.WEIGHT)

    iou_types = ("bbox", "segm") if cfg.MODEL.MASK_ON else ("bbox",)

    # one output folder per test dataset, or None when no OUTPUT_DIR is set
    output_folders = []
    for dataset_name in cfg.DATASETS.TEST:
        folder = None
        if cfg.OUTPUT_DIR:
            folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
            mkdir(folder)
        output_folders.append(folder)

    data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
    for output_folder, data_loader_val in zip(output_folders, data_loaders_val):
        inference(
            model,
            data_loader_val,
            iou_types=iou_types,
            box_only=cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
        )
        synchronize()
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
r"""
Basic training script for PyTorch
"""

# The custom environment must be configured before nearly anything else is
# imported.
# NOTE: keep this as the very first import (do not reorder)
from maskrcnn_benchmark.utils.env import setup_environment  # noqa F401 isort:skip

import argparse
import os

import torch
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.solver import make_lr_scheduler
from maskrcnn_benchmark.solver import make_optimizer
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.engine.trainer import do_train
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.collect_env import collect_env_info
from maskrcnn_benchmark.utils.comm import synchronize
from maskrcnn_benchmark.utils.imports import import_file
from maskrcnn_benchmark.utils.logging import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir


def train(cfg, local_rank, distributed):
    """Build the detection model, restore its checkpoint state and train it.

    Args:
        cfg: configuration object; this function reads ``MODEL.DEVICE``,
            ``MODEL.WEIGHT``, ``OUTPUT_DIR`` and ``SOLVER.CHECKPOINT_PERIOD``
            and forwards the object to the builder helpers.
        local_rank: rank used as the device id when wrapping the model for
            distributed training; checkpoints are written to disk only when
            it equals 0.
        distributed: when true, the model is wrapped in
            ``DistributedDataParallel`` and the data loader is created in
            distributed mode.

    Returns:
        The model that was passed to ``do_train`` (the distributed wrapper
        when ``distributed`` is true).
    """
    device = torch.device(cfg.MODEL.DEVICE)
    model = build_detection_model(cfg)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    write_checkpoints = local_rank == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, cfg.OUTPUT_DIR, write_checkpoints
    )

    # Start counting at iteration 0; anything returned by the checkpoint load
    # is merged on top and may override that starting value.
    arguments = {"iteration": 0}
    arguments.update(checkpointer.load(cfg.MODEL.WEIGHT))

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        cfg.SOLVER.CHECKPOINT_PERIOD,
        arguments,
    )

    return model


def test(cfg, model, distributed):
    """Run ``inference`` on every data loader built for ``cfg.DATASETS.TEST``.

    Args:
        cfg: configuration object; reads ``MODEL.MASK_ON``, ``MODEL.RPN_ONLY``,
            ``MODEL.DEVICE``, ``DATASETS.TEST``, ``OUTPUT_DIR`` and the
            ``TEST.EXPECTED_RESULTS*`` entries.
        model: the trained model; when ``distributed`` is true its ``.module``
            attribute is evaluated instead of the wrapper itself.
        distributed: whether the run is distributed.
    """
    eval_model = model.module if distributed else model
    torch.cuda.empty_cache()  # TODO check if it helps

    iou_types = ("bbox", "segm") if cfg.MODEL.MASK_ON else ("bbox",)

    # One output folder per test dataset, or None for each of them when no
    # output directory is configured.
    test_datasets = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        output_folders = []
        for name in test_datasets:
            folder = os.path.join(cfg.OUTPUT_DIR, "inference", name)
            mkdir(folder)
            output_folders.append(folder)
    else:
        output_folders = [None] * len(test_datasets)

    val_loaders = make_data_loader(cfg, is_train=False, is_distributed=distributed)
    for folder, loader in zip(output_folders, val_loaders):
        inference(
            eval_model,
            loader,
            iou_types=iou_types,
            box_only=cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=folder,
        )
        # Called after every dataset (presumably a cross-process barrier;
        # confirm in maskrcnn_benchmark.utils.comm).
        synchronize()


def main():
    """Command-line entry point: configure, train and optionally test.

    Parses the command line, merges the config file and the trailing ``opts``
    overrides into the global ``cfg`` and freezes it, logs the environment and
    the configuration, trains the model and, unless ``--skip-test`` is given,
    evaluates it afterwards.

    The process is treated as distributed when the ``WORLD_SIZE`` environment
    variable is set to a value greater than 1.
    """
    arg_parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training"
    )
    arg_parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    arg_parser.add_argument("--local_rank", type=int, default=0)
    arg_parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    # Everything left over on the command line is collected into ``opts``.
    arg_parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = arg_parser.parse_args()

    # WORLD_SIZE is read from the environment; a missing variable means 1.
    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    # Stored on ``args`` so the flag shows up when ``args`` is logged below.
    args.distributed = num_gpus > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.deprecated.init_process_group(
            backend="nccl", init_method="env://"
        )

    # File values first, then command-line overrides, then lock the config.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, args.local_rank)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    # Log the raw text of the config file, then the fully merged config.
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as config_file:
        raw_config = "\n" + config_file.read()
        logger.info(raw_config)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    if args.skip_test:
        return
    test(cfg, model, args.distributed)


if __name__ == "__main__":
    main()