Skip to content

Commit

Permalink
[Feature] add check isfinite hook (#5674)
Browse files Browse the repository at this point in the history
* [Feature] add check isfinite hook

* add isfinite hook

* add isfinite hook

* fix error

* change qq group QRcode

* change qq group QRcode

* fix name

* delete qrcode

* fix

* add check invalid loss hook

* add unit test

* fix logic
  • Loading branch information
BIGWangYuDong authored Aug 18, 2021
1 parent 682f03d commit 3cef22d
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 7 deletions.
4 changes: 4 additions & 0 deletions configs/ssd/ssd300_coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,7 @@
# optimizer
optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
# `_delete_=True` drops the optimizer_config inherited from the base config
# (presumably to disable its grad_clip settings -- confirm against the base).
optimizer_config = dict(_delete_=True)
custom_hooks = [
    # NOTE(review): NumClassCheckHook is defined elsewhere -- presumably a
    # dataset/head num_classes consistency check; confirm in mmdet.
    dict(type='NumClassCheckHook'),
    # Every 50 iterations, CheckInvalidLossHook asserts the training loss
    # is finite, aborting the run early on NaN/Inf.
    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
]
4 changes: 4 additions & 0 deletions configs/ssd/ssd512_coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,7 @@
# optimizer
optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
# `_delete_=True` drops the optimizer_config inherited from the base config
# (presumably to disable its grad_clip settings -- confirm against the base).
optimizer_config = dict(_delete_=True)
custom_hooks = [
    # NOTE(review): NumClassCheckHook is defined elsewhere -- presumably a
    # dataset/head num_classes consistency check; confirm in mmdet.
    dict(type='NumClassCheckHook'),
    # Every 50 iterations, CheckInvalidLossHook asserts the training loss
    # is finite, aborting the run early on NaN/Inf.
    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
]
4 changes: 4 additions & 0 deletions configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,7 @@
# Avoid evaluation and saving weights too frequently
evaluation = dict(interval=5, metric='bbox')
checkpoint_config = dict(interval=5)
custom_hooks = [
    # NOTE(review): NumClassCheckHook is defined elsewhere -- presumably a
    # dataset/head num_classes consistency check; confirm in mmdet.
    dict(type='NumClassCheckHook'),
    # Every 50 iterations, CheckInvalidLossHook asserts the training loss
    # is finite, aborting the run early on NaN/Inf.
    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
]
4 changes: 3 additions & 1 deletion mmdet/core/hook/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .checkloss_hook import CheckInvalidLossHook
from .ema import ExpMomentumEMAHook, LinearMomentumEMAHook
from .sync_norm_hook import SyncNormHook
from .sync_random_size_hook import SyncRandomSizeHook
Expand All @@ -6,5 +7,6 @@

__all__ = [
'SyncRandomSizeHook', 'YOLOXModeSwitchHook', 'SyncNormHook',
'ExpMomentumEMAHook', 'LinearMomentumEMAHook', 'YOLOXLrUpdaterHook'
'ExpMomentumEMAHook', 'LinearMomentumEMAHook', 'YOLOXLrUpdaterHook',
'CheckInvalidLossHook'
]
23 changes: 23 additions & 0 deletions mmdet/core/hook/checkloss_hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import torch
from mmcv.runner.hooks import HOOKS, Hook


@HOOKS.register_module()
class CheckInvalidLossHook(Hook):
    """Check invalid loss hook.

    This hook regularly checks whether the training loss is finite during
    training and aborts the run with ``AssertionError`` when the loss
    becomes NaN or Inf.

    Args:
        interval (int): Checking interval (every k iterations).
            Default: 50.
    """

    def __init__(self, interval=50):
        self.interval = interval

    def after_train_iter(self, runner):
        """Raise ``AssertionError`` if the current loss is NaN or Inf.

        Args:
            runner: The training runner; ``runner.outputs['loss']`` is the
                loss tensor produced by the last ``train_step``.
        """
        if self.every_n_iters(runner, self.interval):
            loss = runner.outputs['loss']
            # .all() also covers the (unusual) case of a non-scalar loss.
            if not torch.isfinite(loss).all():
                # Log first so the message reaches the training log files,
                # then raise explicitly.  The original code passed
                # ``runner.logger.info(...)`` (which returns None) as the
                # assert message, so the AssertionError had no message; a
                # plain ``assert`` is also stripped under ``python -O``,
                # which would silently disable this safety check.
                runner.logger.info('loss become infinite or NaN!')
                raise AssertionError('loss become infinite or NaN!')
6 changes: 0 additions & 6 deletions mmdet/models/dense_heads/ssd_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,12 +326,6 @@ def loss(self,
for i in range(num_images):
all_anchors.append(torch.cat(anchor_list[i]))

# check NaN and Inf
assert torch.isfinite(all_cls_scores).all().item(), \
'classification scores become infinite or NaN!'
assert torch.isfinite(all_bbox_preds).all().item(), \
'bbox predications become infinite or NaN!'

losses_cls, losses_bbox = multi_apply(
self.loss_single,
all_cls_scores,
Expand Down
45 changes: 45 additions & 0 deletions tests/test_utils/test_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,48 @@ def test_sync_random_size_hook():
runner.register_hook_from_cfg(dict(type='SyncRandomSizeHook'))
runner.run([loader, loader], [('train', 1), ('val', 1)])
shutil.rmtree(runner.work_dir)


@pytest.mark.parametrize('set_loss', [
    dict(set_loss_nan=False, set_loss_inf=False),
    dict(set_loss_nan=True, set_loss_inf=False),
    dict(set_loss_nan=False, set_loss_inf=True)
])
def test_check_invalid_loss_hook(set_loss):
    """CheckInvalidLossHook must raise AssertionError on NaN/Inf loss and
    let training finish normally when the loss is finite."""

    class DemoModel(nn.Module):
        """Toy model whose train_step can be forced to emit a NaN/Inf loss."""

        def __init__(self, set_loss_nan=False, set_loss_inf=False):
            super().__init__()
            self.set_loss_nan = set_loss_nan
            self.set_loss_inf = set_loss_inf
            self.linear = nn.Linear(2, 1)

        def forward(self, x):
            return self.linear(x)

        def train_step(self, x, optimizer, **kwargs):
            # Emit the requested pathological loss, otherwise a real
            # forward pass through the linear layer.
            if self.set_loss_nan:
                loss = torch.tensor(float('nan'))
            elif self.set_loss_inf:
                loss = torch.tensor(float('inf'))
            else:
                loss = self(x)
            return dict(loss=loss)

    loader = DataLoader(torch.ones((5, 2)))
    runner = _build_demo_runner()
    runner.model = DemoModel(**set_loss)
    # interval=1 so the hook checks the loss on every iteration.
    runner.register_hook_from_cfg(
        dict(type='CheckInvalidLossHook', interval=1))

    loss_is_invalid = set_loss['set_loss_nan'] or set_loss['set_loss_inf']
    if loss_is_invalid:
        # An invalid loss must trip the hook's assertion.
        with pytest.raises(AssertionError):
            runner.run([loader], [('train', 1)])
    else:
        # A finite loss lets the run complete without error.
        runner.run([loader], [('train', 1)])
    shutil.rmtree(runner.work_dir)

0 comments on commit 3cef22d

Please sign in to comment.