Commit

Best checkpoint save after pruning + cleanup
amitklinger committed Dec 19, 2023
1 parent 8c9aaac commit e1627ad
Showing 4 changed files with 7 additions and 94 deletions.
3 changes: 2 additions & 1 deletion configs/fcn/fcn_hailo_10classes.py
@@ -30,7 +30,8 @@
param_scheduler=dict(type='ParamSchedulerHook'),

# save checkpoint every 5 epochs.
-checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=7440, save_best='mIoU', rule='greater', max_keep_ckpts=5),
+checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=7440, save_best='mIoU', rule='greater',
+                max_keep_ckpts=5),
)

# tensorboard vis ('LocalVisBackend' might be redundant) save_dir='./tf_dir/<exp_name>'
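
(Aside, not part of the commit: a quick sanity check of the iteration arithmetic these configs rely on. The sketch below assumes 1488 training iterations per epoch, which val_interval=1488 in fcn_hailo_prune.py further down implies.)

# Sketch (assumption: 1488 iterations == 1 epoch, per fcn_hailo_prune.py below).
ITERS_PER_EPOCH = 1488

def epochs_to_iters(epochs, iters_per_epoch=ITERS_PER_EPOCH):
    """Convert an epoch count to the iteration counts used in these configs."""
    return epochs * iters_per_epoch

assert epochs_to_iters(5) == 7440      # checkpoint interval above
assert epochs_to_iters(50) == 74400    # load_from checkpoint of the full training
assert epochs_to_iters(110) == 163680  # save_begin in fcn_hailo_prune.py
assert epochs_to_iters(120) == 178560  # max_iters in fcn_hailo_prune.py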
80 changes: 0 additions & 80 deletions configs/fcn/fcn_hailo_10classes_epoch.py

This file was deleted.

15 changes: 5 additions & 10 deletions configs/fcn/fcn_hailo_prune.py
@@ -4,16 +4,16 @@
]

resume = True
-# load_from='./work_dirs/fcn_hailo_eta1e5/iter_68448.pth' # best checkpoint path of full training (fcn_hailo_10classes). Start of pruning procedure
-load_from='./work_dirs/fcn_hailo_eta1e5_eve/iter_74400.pth'
+# best checkpoint path of full training (fcn_hailo_10classes). Start of pruning procedure:
+load_from = './work_dirs/fcn_hailo_eta1e5_eve/iter_74400.pth'

# optimizer
optimizer = dict(type='Adam', lr=0.0001, weight_decay=1e-5)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)


# runtime settings
-train_cfg = dict(type='IterBasedTrainLoop', max_iters=178560, val_interval=1488) # 74400 (50 epochs), 89280 (60 epochs), 104160 (70 epochs), 89280 (80 epochs), 173760
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=178560, val_interval=1488) # 74400 (50 epochs), 178560 (120)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

@@ -27,17 +27,12 @@
param_scheduler=dict(type='ParamSchedulerHook'),
)

-# # save checkpoint every 1 epoch.
-# checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=1488, save_best='mIoU', rule='greater',
-# max_keep_ckpts=5, save_begin=163680), # 2976 (2Epoches), 7440 (5 Epoches)
-# )

# learning policy: taken from the recipe
# custom hooks
sparseml_hook = dict(type='SparseMLHook', priority='NORMAL')
-# sparseml_hook = dict(type='SparseMLHook', interval=10, priority='NORMAL')
+# Saving best checkpoint starts after pruning hits final ratio
ext_checkpoint_hook = dict(type='ExtCheckpointHook', by_epoch=False, interval=1488, save_best='mIoU', rule='greater',
-                           max_keep_ckpts=5, save_begin=163680) # 2976 (2Epoches), 7440 (5 Epoches), 80352 (54), 83328 (56), 163680
+                           max_keep_ckpts=5, save_begin=163680) # 163680 (110 epochs)
custom_hooks = [sparseml_hook, ext_checkpoint_hook]

# tensorboard vis ('LocalVisBackend' might be redundant) save_dir='./tf_dir/<exp_name>'
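
(Aside, not part of the commit: with the numbers above, best-checkpoint tracking is only live for the last ten epochs of the pruning run. A hedged sketch, assuming validation fires at every val_interval-th iteration:)

# Sketch of the best-checkpoint window implied by the config above.
VAL_INTERVAL = 1488   # one epoch
SAVE_BEGIN = 163680   # epoch 110: pruning has reached its final ratio
MAX_ITERS = 178560    # epoch 120

eval_points = [it for it in range(VAL_INTERVAL, MAX_ITERS + 1, VAL_INTERVAL)
               if it >= SAVE_BEGIN]
# 11 validation points, epochs 110 through 120, compete for best mIoU.
print(len(eval_points), eval_points[0], eval_points[-1])  # 11 163680 178560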
3 changes: 0 additions & 3 deletions mmseg/engine/hooks/checkpoint_hook.py
@@ -4,16 +4,13 @@

@HOOKS.register_module()
class ExtCheckpointHook(CheckpointHook):
-    # def __init__(self):
-    #     self.by_epoch = False

    def after_val_epoch(self, runner, metrics):
        if runner.iter == self.save_begin:
            runner.logger.info('Resetting best_score to 0.0')
            runner.message_hub.update_info('best_score', 0.0)
            runner.message_hub.pop_info('best_ckpt', None)
        if (runner.iter + 1 >= self.save_begin):
-            runner.logger.info('ExtCheckpointHook ExtCheckpointHook ExtCheckpointHook')
            runner.logger.info(
                f'Saving checkpoint at iter {runner.iter}')
            super().after_val_epoch(runner, metrics)
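
(Aside, not part of the commit: the reset exists because the dense model's mIoU before pruning typically beats a freshly pruned model's, so a stale best_score would block any post-pruning "best" save. The toy class below is a stand-in for illustration only, not MMEngine's MessageHub API; only update_info and pop_info mirror the calls used above.)

# Toy illustration of the reset performed at iter == save_begin above.
class ToyMessageHub:
    """Minimal stand-in for runner.message_hub (illustration only)."""
    def __init__(self):
        self._info = {}
    def update_info(self, key, value):
        self._info[key] = value
    def pop_info(self, key, default=None):
        return self._info.pop(key, default)
    def get_info(self, key, default=None):
        return self._info.get(key, default)

hub = ToyMessageHub()
hub.update_info('best_score', 0.72)             # best mIoU of the dense phase (made-up number)
hub.update_info('best_ckpt', 'iter_74400.pth')

# At save_begin the hook wipes the record, as in after_val_epoch above:
hub.update_info('best_score', 0.0)
hub.pop_info('best_ckpt', None)
assert hub.get_info('best_score') == 0.0        # the pruned model can now win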
