From fc989b8844b34489fa8c71aa048f2529ce5e397a Mon Sep 17 00:00:00 2001
From: Polisetty V R K Jyothendra Varma
Date: Wed, 17 Jul 2024 20:21:02 +0300
Subject: [PATCH] add kill switch file support to gracefully exit training at
 runtime (#412)

* Update arguments.py
* Update training.py
* Update utils.py
* add copyrights
* add copyrights
* add copyrights
* Update arguments.py help
* Update arguments.py
* Update training.py
* Update utils.py
* Update arguments.py
---
 megatron/arguments.py |  3 +++
 megatron/training.py  | 19 +++++++++++++++++--
 megatron/utils.py     |  8 ++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/megatron/arguments.py b/megatron/arguments.py
index 30c3d669d7..af39f0b0e3 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -680,6 +680,9 @@ def _add_network_size_args(parser):
                        help='Untie embeddings and output weights.'),
     group.add_argument('--embedding-weights-in-fp32', action='store_true',
                        help='Cast word embedding weights to fp32 before embedding fwd.'),
+    group.add_argument('--kill-switch-file', type=str, default=None,
+                       help='Location of kill switch file. '
+                       'If found will automatically exit the program at runtime.')
 
     return parser
 
diff --git a/megatron/training.py b/megatron/training.py
index 697d62f7b8..6ba26f3944 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -43,7 +43,7 @@
 from megatron.optimizer_param_scheduler import OptimizerParamScheduler
 from megatron.model import DistributedDataParallel as LocalDDP
 from megatron.utils import check_adlr_autoresume_termination
-from megatron.utils import unwrap_model
+from megatron.utils import unwrap_model, found_kill_switch
 from megatron.data.data_samplers import build_pretraining_data_loader
 from megatron.utils import calc_params_l2_norm
 from megatron.core.pipeline_parallel import get_forward_backward_func
@@ -128,6 +128,13 @@ def pretrain(train_valid_test_dataset_provider,
     # Initalize and get arguments, timers, and Tensorboard writer.
     initialize_megatron(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults, external_args=external_args)
+
+    args = get_args()
+
+    if found_kill_switch():
+        print_datetime(f"Detected kill switch at {args.kill_switch_file}. Exiting")
+        sys.exit()
+
     # Set pytorch JIT layer fusion options and warmup JIT functions.
     if get_accelerator().device_name() == 'cuda':
         set_jit_fusion_options()
@@ -144,7 +151,6 @@ def pretrain(train_valid_test_dataset_provider,
         time.time() - _TRAIN_START_TIME))
     print_datetime('after megatron is initialized')
 
-    args = get_args()
     timers = get_timers()
 
     if args.deepspeed:
@@ -1358,6 +1364,15 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
             sys.exit()
 
         trigger(on_step_end)
+        # Exiting based on kill switch file
+        if found_kill_switch():
+            if args.save and not saved_checkpoint:
+                save_checkpoint_and_time(iteration, model, optimizer,
+                                         opt_param_scheduler)
+            torch.distributed.barrier()
+            print_datetime(f"Detected kill switch at {args.kill_switch_file}, "
+                           f"iteration={iteration}. Exiting")
+            sys.exit()
 
     return iteration
 
diff --git a/megatron/utils.py b/megatron/utils.py
index 9033d6402a..cbb7aa6426 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -381,3 +381,11 @@ def dump_weights(preamble, iteration, model, optimizer, tensor=None):
 
         p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param
         fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param {p.shape}\n")
+
+def found_kill_switch():
+    args = get_args()
+    if args.kill_switch_file is not None and os.path.exists(args.kill_switch_file):
+        return True
+    else:
+        return False
+