Commit

Best checkpoint save after pruning + cleanup
amitklinger committed Dec 19, 2023
1 parent 8c9aaac commit e1627ad
Showing 4 changed files with 7 additions and 94 deletions.
3 changes: 2 additions & 1 deletion configs/fcn/fcn_hailo_10classes.py
@@ -30,7 +30,8 @@
param_scheduler=dict(type='ParamSchedulerHook'),

# save checkpoint every 5 epochs.
-checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=7440, save_best='mIoU', rule='greater', max_keep_ckpts=5),
+checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=7440, save_best='mIoU', rule='greater',
+                max_keep_ckpts=5),
)

# tensorboard vis ('LocalVisBackend' might be redundant) save_dir='./tf_dir/<exp_name>'
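
(Aside, not part of the commit: a quick sanity check of the iteration arithmetic these configs rely on. The sketch below assumes 1488 training iterations per epoch, which val_interval=1488 in fcn_hailo_prune.py further down implies.)

# Sketch (assumption: 1488 iterations == 1 epoch, per fcn_hailo_prune.py below).
ITERS_PER_EPOCH = 1488

def epochs_to_iters(epochs, iters_per_epoch=ITERS_PER_EPOCH):
    """Convert an epoch count to the iteration counts used in these configs."""
    return epochs * iters_per_epoch

assert epochs_to_iters(5) == 7440      # checkpoint interval above
assert epochs_to_iters(50) == 74400    # load_from checkpoint of the full training
assert epochs_to_iters(110) == 163680  # save_begin in fcn_hailo_prune.py
assert epochs_to_iters(120) == 178560  # max_iters in fcn_hailo_prune.py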
80 changes: 0 additions & 80 deletions configs/fcn/fcn_hailo_10classes_epoch.py

This file was deleted.

15 changes: 5 additions & 10 deletions configs/fcn/fcn_hailo_prune.py
@@ -4,16 +4,16 @@
]

resume = True
-# load_from='./work_dirs/fcn_hailo_eta1e5/iter_68448.pth' # best checkpoint path of full training (fcn_hailo_10classes). Start of pruning procedure
-load_from='./work_dirs/fcn_hailo_eta1e5_eve/iter_74400.pth'
+# best checkpoint path of full training (fcn_hailo_10classes). Start of pruning procedure:
+load_from = './work_dirs/fcn_hailo_eta1e5_eve/iter_74400.pth'

# optimizer
optimizer = dict(type='Adam', lr=0.0001, weight_decay=1e-5)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)


# runtime settings
-train_cfg = dict(type='IterBasedTrainLoop', max_iters=178560, val_interval=1488) # 74400 (50 epochs), 89280 (60 epochs), 104160 (70 epochs), 89280 (80 epochs), 173760
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=178560, val_interval=1488) # 74400 (50 epochs), 178560 (120)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

@@ -27,17 +27,12 @@
param_scheduler=dict(type='ParamSchedulerHook'),
)

-# # save checkpoint every 1 epoch.
-# checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=1488, save_best='mIoU', rule='greater',
-# max_keep_ckpts=5, save_begin=163680), # 2976 (2Epoches), 7440 (5 Epoches)
-# )

# learning policy: taken from the recipe
# custom hooks
sparseml_hook = dict(type='SparseMLHook', priority='NORMAL')
-# sparseml_hook = dict(type='SparseMLHook', interval=10, priority='NORMAL')
+# Saving best checkpoint starts after pruning hits final ratio
ext_checkpoint_hook = dict(type='ExtCheckpointHook', by_epoch=False, interval=1488, save_best='mIoU', rule='greater',
-                           max_keep_ckpts=5, save_begin=163680) # 2976 (2Epoches), 7440 (5 Epoches), 80352 (54), 83328 (56), 163680
+                           max_keep_ckpts=5, save_begin=163680) # 163680 (110 epochs)
custom_hooks = [sparseml_hook, ext_checkpoint_hook]

# tensorboard vis ('LocalVisBackend' might be redundant) save_dir='./tf_dir/<exp_name>'
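
(Aside, not part of the commit: with the numbers above, best-checkpoint tracking is only live for the last ten epochs of the pruning run. A hedged sketch, assuming validation fires at every val_interval-th iteration:)

# Sketch of the best-checkpoint window implied by the config above.
VAL_INTERVAL = 1488   # one epoch
SAVE_BEGIN = 163680   # epoch 110: pruning has reached its final ratio
MAX_ITERS = 178560    # epoch 120

eval_points = [it for it in range(VAL_INTERVAL, MAX_ITERS + 1, VAL_INTERVAL)
               if it >= SAVE_BEGIN]
# 11 validation points, epochs 110 through 120, compete for best mIoU.
print(len(eval_points), eval_points[0], eval_points[-1])  # 11 163680 178560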
3 changes: 0 additions & 3 deletions mmseg/engine/hooks/checkpoint_hook.py
@@ -4,16 +4,13 @@

@HOOKS.register_module()
class ExtCheckpointHook(CheckpointHook):
-    # def __init__(self):
-    #     self.by_epoch = False

    def after_val_epoch(self, runner, metrics):
        if runner.iter == self.save_begin:
            runner.logger.info('Resetting best_score to 0.0')
            runner.message_hub.update_info('best_score', 0.0)
            runner.message_hub.pop_info('best_ckpt', None)
        if (runner.iter + 1 >= self.save_begin):
-            runner.logger.info('ExtCheckpointHook ExtCheckpointHook ExtCheckpointHook')
            runner.logger.info(
                f'Saving checkpoint at iter {runner.iter}')
            super().after_val_epoch(runner, metrics)
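
(Aside, not part of the commit: the reset exists because the dense model's mIoU before pruning typically beats a freshly pruned model's, so a stale best_score would block any post-pruning "best" save. The toy class below is a stand-in for illustration only, not MMEngine's MessageHub API; only update_info and pop_info mirror the calls used above.)

# Toy illustration of the reset performed at iter == save_begin above.
class ToyMessageHub:
    """Minimal stand-in for runner.message_hub (illustration only)."""
    def __init__(self):
        self._info = {}
    def update_info(self, key, value):
        self._info[key] = value
    def pop_info(self, key, default=None):
        return self._info.pop(key, default)
    def get_info(self, key, default=None):
        return self._info.get(key, default)

hub = ToyMessageHub()
hub.update_info('best_score', 0.72)             # best mIoU of the dense phase (made-up number)
hub.update_info('best_ckpt', 'iter_74400.pth')

# At save_begin the hook wipes the record, as in after_val_epoch above:
hub.update_info('best_score', 0.0)
hub.pop_info('best_ckpt', None)
assert hub.get_info('best_score') == 0.0        # the pruned model can now win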
