From e1627ade66389b38d24b406148265e44aef05e4f Mon Sep 17 00:00:00 2001 From: Amit Klinger Date: Tue, 19 Dec 2023 07:14:34 +0200 Subject: [PATCH] Best checkpoint save after pruning + cleanup0 --- configs/fcn/fcn_hailo_10classes.py | 3 +- configs/fcn/fcn_hailo_10classes_epoch.py | 80 ------------------------ configs/fcn/fcn_hailo_prune.py | 15 ++--- mmseg/engine/hooks/checkpoint_hook.py | 3 - 4 files changed, 7 insertions(+), 94 deletions(-) delete mode 100644 configs/fcn/fcn_hailo_10classes_epoch.py diff --git a/configs/fcn/fcn_hailo_10classes.py b/configs/fcn/fcn_hailo_10classes.py index 663fc1356a..9105395ae4 100644 --- a/configs/fcn/fcn_hailo_10classes.py +++ b/configs/fcn/fcn_hailo_10classes.py @@ -30,7 +30,8 @@ param_scheduler=dict(type='ParamSchedulerHook'), # save checkpoint every 5 epochs. - checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=7440, save_best='mIoU', rule='greater', max_keep_ckpts=5), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=7440, save_best='mIoU', rule='greater', + max_keep_ckpts=5), ) # tensorboard vis ('LocalVisBackend' might be redundant) save_dir='./tf_dir/' diff --git a/configs/fcn/fcn_hailo_10classes_epoch.py b/configs/fcn/fcn_hailo_10classes_epoch.py deleted file mode 100644 index b4d212e6f3..0000000000 --- a/configs/fcn/fcn_hailo_10classes_epoch.py +++ /dev/null @@ -1,80 +0,0 @@ -# model settings -_base_ = [ - '../_base_/datasets/cityscapes10classes.py', '../_base_/default_runtime.py', -] - -# optimizer -optimizer = dict(type='Adam', lr=0.001, weight_decay=1e-5) -optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) - -# learning policy -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.2, begin=0, end=1), - dict( - type='CosineAnnealingLR', begin=1, end=5, eta_min=0.00001) -] - -# runtime settings -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=5, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# default hooks - logger & checkpoint configs -default_hooks = dict( - - # print log every 100 iterations. - logger=dict(type='LoggerHook', interval=1), #, log_metric_by_epoch=False), - - # enable the parameter scheduler. - param_scheduler=dict(type='ParamSchedulerHook'), - - # save checkpoint every 5 epochs. - checkpoint=dict(type='CheckpointHook', - interval=1, - save_best='mIoU', - rule='greater', - max_keep_ckpts=5), -) - -# tensorboard vis ('LocalVisBackend' might be redundant) save_dir='./tf_dir/' -visualizer = dict(type='SegLocalVisualizer', - vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], - name='visualizer') - -# data preprocessing -norm_cfg = dict(type='SyncBN', requires_grad=True) -crop_size = (512, 1024) -data_preprocessor = dict( - type='SegDataPreProcessor', - mean=[0.0, 0.0, 0.0], - std=[1.0, 1.0, 1.0], - bgr_to_rgb=True, - pad_val=0, - seg_pad_val=255, - size=crop_size) - -model = dict( - type='EncoderDecoder', - backbone=dict( - type='hailoFPN', - depth=0.33, - width=0.125, - bb_channels_list=[128, 256, 512, 1024], - bb_num_repeats_list=[9, 15, 21, 12], - neck_channels_list=[256, 128, 128, 256, 256, 512], - neck_num_repeats_list=[9, 12, 12, 9]), - decode_head=dict( - type='ConvHead', - in_channels=16, - channels=128, - num_convs=1, - num_classes=10, - norm_cfg=norm_cfg, - align_corners=True, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole'), - infer_wo_softmax=True) diff --git a/configs/fcn/fcn_hailo_prune.py b/configs/fcn/fcn_hailo_prune.py index 97a6bd5ac6..523b223fbc 100644 --- a/configs/fcn/fcn_hailo_prune.py +++ b/configs/fcn/fcn_hailo_prune.py @@ -4,8 +4,8 @@ ] resume = True -# load_from='./work_dirs/fcn_hailo_eta1e5/iter_68448.pth' # best checkpoint path of full training (fcn_hailo_10classes). Start of pruning procedure -load_from='./work_dirs/fcn_hailo_eta1e5_eve/iter_74400.pth' +# best checkpoint path of full training (fcn_hailo_10classes). Start of pruning procedure: +load_from = './work_dirs/fcn_hailo_eta1e5_eve/iter_74400.pth' # optimizer optimizer = dict(type='Adam', lr=0.0001, weight_decay=1e-5) @@ -13,7 +13,7 @@ # runtime settings -train_cfg = dict(type='IterBasedTrainLoop', max_iters=178560, val_interval=1488) # 74400 (50 epochs), 89280 (60 epochs), 104160 (70 epochs), 89280 (80 epochs), 173760 +train_cfg = dict(type='IterBasedTrainLoop', max_iters=178560, val_interval=1488) # 74400 (50 epochs), 178560 (120) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -27,17 +27,12 @@ param_scheduler=dict(type='ParamSchedulerHook'), ) - # # save checkpoint every 1 epoch. - # checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=1488, save_best='mIoU', rule='greater', - # max_keep_ckpts=5, save_begin=163680), # 2976 (2Epoches), 7440 (5 Epoches) - # ) - # learning policy: taken from the recipe # custom hooks sparseml_hook = dict(type='SparseMLHook', priority='NORMAL') -# sparseml_hook = dict(type='SparseMLHook', interval=10, priority='NORMAL') +# Saving best checkpoint starts after pruning hits final ratio ext_checkpoint_hook = dict(type='ExtCheckpointHook', by_epoch=False, interval=1488, save_best='mIoU', rule='greater', - max_keep_ckpts=5, save_begin=163680) # 2976 (2Epoches), 7440 (5 Epoches), 80352 (54), 83328 (56), 163680 + max_keep_ckpts=5, save_begin=163680) # 163680 (110 epochs) custom_hooks = [sparseml_hook, ext_checkpoint_hook] # tensorboard vis ('LocalVisBackend' might be redundant) save_dir='./tf_dir/' diff --git a/mmseg/engine/hooks/checkpoint_hook.py b/mmseg/engine/hooks/checkpoint_hook.py index b7b820759e..d752fd839e 100644 --- a/mmseg/engine/hooks/checkpoint_hook.py +++ b/mmseg/engine/hooks/checkpoint_hook.py @@ -4,8 +4,6 @@ @HOOKS.register_module() class ExtCheckpointHook(CheckpointHook): - # def __init__(self): - # self.by_epoch = False def after_val_epoch(self, runner, metrics): if runner.iter == self.save_begin: @@ -13,7 +11,6 @@ def after_val_epoch(self, runner, metrics): runner.message_hub.update_info('best_score', 0.0) runner.message_hub.pop_info('best_ckpt', None) if (runner.iter + 1 >= self.save_begin): - runner.logger.info('ExtCheckpointHook ExtCheckpointHook ExtCheckpointHook') runner.logger.info( f'Saving checkpoint at iter {runner.iter}') super().after_val_epoch(runner, metrics)