[Refactor] Support CID (open-mmlab#1907)

mornfairy · Feb 17, 2023 · 77a52a0 · 77a52a0
1 parent 6dc6106
commit 77a52a0
Show file tree

Hide file tree

Showing 25 changed files with 2,089 additions and 140 deletions.
diff --git a/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py b/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py
@@ -0,0 +1,162 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=140, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+    type='Adam',
+    lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=140,
+        milestones=[90, 120],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=160)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+    type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128))
+
+# model settings
+model = dict(
+    type='BottomupPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='HRNet',
+        in_channels=3,
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256),
+                multiscale_output=True)),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='https://download.openmmlab.com/mmpose/'
+            'pretrain_models/hrnet_w32-36af842e.pth'),
+    ),
+    head=dict(
+        type='CIDHead',
+        in_channels=(32, 64, 128, 256),
+        num_keypoints=17,
+        gfd_channels=32,
+        input_transform='resize_concat',
+        input_index=(0, 1, 2, 3),
+        coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0),
+        decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0),
+        contrastive_loss=dict(
+            type='InfoNCELoss', temperature=0.05, loss_weight=1.0),
+        decoder=codec,
+    ),
+    train_cfg=dict(max_train_instances=200),
+    test_cfg=dict(
+        multiscale_test=False,
+        flip_test=True,
+        shift_heatmap=False,
+        align_corners=False))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='BottomupGetHeatmapMask'),
+    dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(
+        type='BottomupResize',
+        input_size=codec['input_size'],
+        size_factor=64,
+        resize_mode='expand'),
+    dict(
+        type='PackPoseInputs',
+        meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+                   'img_shape', 'input_size', 'input_center', 'input_scale',
+                   'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+                   'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=20,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+    nms_thr=0.8,
+    score_mode='keypoint',
+)
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py b/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py
@@ -0,0 +1,162 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=140, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+    type='Adam',
+    lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=140,
+        milestones=[90, 120],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=160)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+    type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128))
+
+# model settings
+model = dict(
+    type='BottomupPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='HRNet',
+        in_channels=3,
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(48, 96)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(48, 96, 192)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(48, 96, 192, 384),
+                multiscale_output=True)),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='https://download.openmmlab.com/mmpose/'
+            'pretrain_models/hrnet_w48-8ef0771d.pth'),
+    ),
+    head=dict(
+        type='CIDHead',
+        in_channels=(48, 96, 192, 384),
+        num_keypoints=17,
+        gfd_channels=48,
+        input_transform='resize_concat',
+        input_index=(0, 1, 2, 3),
+        coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0),
+        decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0),
+        contrastive_loss=dict(
+            type='InfoNCELoss', temperature=0.05, loss_weight=1.0),
+        decoder=codec,
+    ),
+    train_cfg=dict(max_train_instances=200),
+    test_cfg=dict(
+        multiscale_test=False,
+        flip_test=True,
+        shift_heatmap=False,
+        align_corners=False))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='BottomupGetHeatmapMask'),
+    dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(
+        type='BottomupResize',
+        input_size=codec['input_size'],
+        size_factor=64,
+        resize_mode='expand'),
+    dict(
+        type='PackPoseInputs',
+        meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+                   'img_shape', 'input_size', 'input_center', 'input_scale',
+                   'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+                   'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=20,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+    nms_thr=0.8,
+    score_mode='keypoint',
+)
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/cid/coco/hrnet_coco.md b/configs/body_2d_keypoint/cid/coco/hrnet_coco.md
@@ -0,0 +1,42 @@
+<!-- [ALGORITHM] -->
+
+<details>
+<summary align="right"><a href="https://openaccess.thecvf.com/content/CVPR2022/html/Wang_Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_CVPR_2022_paper.html">CID (CVPR'2022)</a></summary>
+
+```bibtex
+@InProceedings{Wang_2022_CVPR,
+    author    = {Wang, Dongkai and Zhang, Shiliang},
+    title     = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation},
+    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+    month     = {June},
+    year      = {2022},
+    pages     = {11060-11068}
+}
+```
+
+</details>
+
+<!-- [DATASET] -->
+
+<details>
+<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
+
+```bibtex
+@inproceedings{lin2014microsoft,
+  title={Microsoft coco: Common objects in context},
+  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+  booktitle={European conference on computer vision},
+  pages={740--755},
+  year={2014},
+  organization={Springer}
+}
+```
+
+</details>
+
+Results on COCO val2017 without multi-scale test
+
+| Arch                                          | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |                     ckpt                      |                      log                      |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py) |  512x512   | 0.704 |      0.894      |      0.775      | 0.753 |      0.928      | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_20230207.json) |
+| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py) |  512x512   | 0.715 |      0.900      |      0.782      | 0.765 |      0.935      | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_20230207.json) |
diff --git a/mmpose/codecs/__init__.py b/mmpose/codecs/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .associative_embedding import AssociativeEmbedding
+from .decoupled_heatmap import DecoupledHeatmap
 from .integral_regression_label import IntegralRegressionLabel
 from .megvii_heatmap import MegviiHeatmap
 from .msra_heatmap import MSRAHeatmap
@@ -10,5 +11,6 @@
 
 __all__ = [
     'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel',
-    'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR'
+    'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR',
+    'DecoupledHeatmap'
 ]
diff --git a/mmpose/codecs/base.py b/mmpose/codecs/base.py
@@ -14,6 +14,10 @@ class BaseKeypointCodec(metaclass=ABCMeta):
     the methods :meth:`encode` and :meth:`decode`.
     """
 
+    # pass additional encoding arguments to the `encode` method, beyond the
+    # mandatory `keypoints` and `keypoints_visible` arguments.
+    auxiliary_encode_keys = set()
+
     @abstractmethod
     def encode(self,
                keypoints: np.ndarray,