From 4271cd081b2d4ad93a0d1ce9a3e77328bc3724b8 Mon Sep 17 00:00:00 2001 From: owen Date: Wed, 18 Sep 2024 16:59:56 +0800 Subject: [PATCH] revert lr scheduler first decay to 1.0 and add more comments for lr and lr_scheduler and its related test func --- tests/test_deep_hydro.py | 147 +++++++++++++++++----------- torchhydro/configs/config.py | 12 ++- torchhydro/trainers/deep_hydro.py | 4 +- torchhydro/trainers/train_logger.py | 4 +- 4 files changed, 102 insertions(+), 65 deletions(-) diff --git a/tests/test_deep_hydro.py b/tests/test_deep_hydro.py index f519eea..5cb290d 100644 --- a/tests/test_deep_hydro.py +++ b/tests/test_deep_hydro.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2024-05-01 10:34:15 -LastEditTime: 2024-05-04 11:31:52 +LastEditTime: 2024-09-18 16:47:26 LastEditors: Wenyu Ouyang Description: Unit tests for the DeepHydro class FilePath: \torchhydro\tests\test_deep_hydro.py @@ -16,7 +16,8 @@ from torchhydro.datasets.data_dict import datasets_dict from torchhydro.trainers.train_logger import TrainLogger import torch -import tempfile +from torch.optim import Adam +from torch.optim.lr_scheduler import LambdaLR, ExponentialLR, ReduceLROnPlateau # Mock dataset class using random data @@ -32,61 +33,36 @@ def ngrid(self): @property def nt(self): - return 200 + return 100 def __len__(self): - return self.ngrid * (self.nt - self.data_cfgs["forecast_history"] + 1) + return self.ngrid * (self.nt - self.data_cfgs["forecast_length"] + 1) def __getitem__(self, idx): # Use the stored configurations to generate mock data - # rho = self.data_cfgs["forecast_history"] - # x = torch.randn(rho, self.data_cfgs["input_features"]) - # y = torch.randn(rho, self.data_cfgs["output_features"]) - mode = self.data_cfgs["model_mode"] - if mode == "single": - src1 = torch.rand( - self.data_cfgs["forecast_history"], - self.data_cfgs["input_features"] - 1, - ) - src2 = torch.rand( - self.data_cfgs["forecast_history"], - self.data_cfgs["cnn_size"], - ) - else: - src1 = torch.rand( - self.data_cfgs["forecast_history"], - self.data_cfgs["input_features"], - ) - src2 = torch.rand( - self.data_cfgs["forecast_history"], - self.data_cfgs["input_size_encoder2"] - ) - src3 = torch.rand(1, self.data_cfgs["output_features"]) # start_token - out = torch.rand(self.data_cfgs["forecast_length"], self.data_cfgs["output_features"]) - return [src1, src2, src3], out + rho = self.data_cfgs["forecast_length"] + x = torch.randn(rho, self.data_cfgs["input_features"]) + y = torch.randn(rho, self.data_cfgs["output_features"]) + return x, y @pytest.fixture() def dummy_data_cfgs(): - test_path = "results/test_seq2seq_single2/" + test_path = "results/test/" if not os.path.exists(test_path): os.makedirs(test_path) return { "dataset": "MockDataset", - "input_features": 4, + "input_features": 10, "output_features": 1, # "t_range_valid": ["2010-01-01", "2010-12-31"], "t_range_valid": None, "test_path": test_path, "sampler": "KuaiSampler", - # "sampler": "HydroSampler", "batch_size": 5, - "forecast_history": 5, - "forecast_length": 2, - "warmup_length": 10, - "cnn_size" : 120, - "input_size_encoder2": 1, - "model_mode": "single", + "forecast_history": 0, + "forecast_length": 30, + "warmup_length": 0, } @@ -97,14 +73,21 @@ def test_using_mock_dataset(dummy_data_cfgs): dataset = datasets_dict[dataset_name](dummy_data_cfgs, is_tra_val_te) - assert len(dataset) == 330 - sample_x, sample_y= dataset[0] + assert len(dataset) == 710 + sample_x, sample_y = dataset[0] print(sample_x[0].shape) print(sample_x[1].shape) print(sample_x[2].shape) 
print(sample_y.shape) - # assert sample_x.shape == (dummy_data_cfgs['forecast_history'], dummy_data_cfgs["input_features"]) - # assert sample_y.shape == (dummy_data_cfgs['forecast_history'], dummy_data_cfgs["output_features"]) + assert sample_x.shape == ( + dummy_data_cfgs["forecast_length"], + dummy_data_cfgs["input_features"], + ) + assert sample_y.shape == ( + dummy_data_cfgs["forecast_length"], + dummy_data_cfgs["output_features"], + ) + @pytest.fixture() def dummy_train_cfgs(dummy_data_cfgs): @@ -112,34 +95,28 @@ def dummy_train_cfgs(dummy_data_cfgs): "training_cfgs": { "early_stopping": False, "patience": 4, - "epochs": 2, + "epochs": 12, "start_epoch": 1, "which_first_tensor": "batch", "device": -1, # Assuming CPU device "train_mode": True, "criterion": "RMSE", "optimizer": "Adam", - "optim_params": {}, - "lr_scheduler": {"lr": 0.001}, + "optim_params": {"lr": 0.01}, + # "optim_params": {}, + "lr_scheduler": {0: 0.5, 10: 0.1}, "batch_size": 5, "save_epoch": 1, }, "data_cfgs": dummy_data_cfgs, "model_cfgs": { "model_type": "Normal", - # "model_name": "CpuLSTM", - "model_name": "Seq2Seq", + "model_name": "CpuLSTM", "weight_path": None, "model_hyperparam": { - # "n_input_features": 10, - # "n_output_features": 1, - # "n_hidden_states": 64, - "input_size": 4, - "output_size": 1, - "hidden_size": 256, - "forecast_length": 2, - "cnn_size": 120, - "model_mode": "single", + "n_input_features": 10, + "n_output_features": 1, + "n_hidden_states": 64, }, }, "evaluation_cfgs": { @@ -149,6 +126,7 @@ def dummy_train_cfgs(dummy_data_cfgs): }, } + @pytest.fixture() def deep_hydro(dummy_train_cfgs): datasets_dict["MockDataset"] = MockDataset @@ -163,8 +141,63 @@ def test_model_train(deep_hydro): # Add assertions to check the expected behavior of the method assert deep_hydro.model.state_dict() is not None + def test_plot_model_structure(deep_hydro, dummy_train_cfgs): opt = torch.optim.SGD(deep_hydro.model.parameters(), lr=0.01) model_filepath = dummy_train_cfgs["data_cfgs"]["test_path"] train_logger = TrainLogger(model_filepath, dummy_train_cfgs, opt) - train_logger.plot_model_structure(deep_hydro.model) \ No newline at end of file + train_logger.plot_model_structure(deep_hydro.model) + + +def test_get_scheduler_lambda_lr(deep_hydro, dummy_train_cfgs): + dummy_train_cfgs["training_cfgs"]["lr_scheduler"] = {"lr": 0.001} + opt = Adam(deep_hydro.model.parameters()) + scheduler = deep_hydro._get_scheduler(dummy_train_cfgs["training_cfgs"], opt) + assert isinstance(scheduler, LambdaLR) + + +def test_get_scheduler_lambda_lr_with_epochs(deep_hydro, dummy_train_cfgs): + dummy_train_cfgs["training_cfgs"]["lr_scheduler"] = {0: 1.0, 10: 0.1} + opt = Adam(deep_hydro.model.parameters()) + scheduler = deep_hydro._get_scheduler(dummy_train_cfgs["training_cfgs"], opt) + assert isinstance(scheduler, LambdaLR) + + +def test_get_scheduler_exponential_lr(deep_hydro, dummy_train_cfgs): + dummy_train_cfgs["training_cfgs"]["lr_scheduler"] = {"lr_factor": 0.9} + opt = Adam(deep_hydro.model.parameters()) + scheduler = deep_hydro._get_scheduler(dummy_train_cfgs["training_cfgs"], opt) + assert isinstance(scheduler, ExponentialLR) + + +def test_get_scheduler_reduce_lr_on_plateau(deep_hydro, dummy_train_cfgs): + dummy_train_cfgs["training_cfgs"]["lr_scheduler"] = { + "lr_factor": 0.9, + "lr_patience": 5, + } + opt = Adam(deep_hydro.model.parameters()) + scheduler = deep_hydro._get_scheduler(dummy_train_cfgs["training_cfgs"], opt) + assert isinstance(scheduler, ReduceLROnPlateau) + + +def 
test_get_scheduler_invalid_config(deep_hydro, dummy_train_cfgs):
+    dummy_train_cfgs["training_cfgs"]["lr_scheduler"] = {"invalid_key": 0.9}
+    opt = Adam(deep_hydro.model.parameters())
+    with pytest.raises(ValueError, match="Invalid lr_scheduler configuration"):
+        deep_hydro._get_scheduler(dummy_train_cfgs["training_cfgs"], opt)
+
+
+# a test func for LambdaLR that shows the lr change in each epoch
+def test_get_scheduler_lambda_lr_with_epochs_show_lr(deep_hydro, dummy_train_cfgs):
+    # NOTE: for the scheduler, the start epoch is 0, but scheduler.step() is still called after each epoch
+    dummy_train_cfgs["training_cfgs"]["lr_scheduler"] = {1: 0.5, 10: 0.1}
+    opt = Adam(deep_hydro.model.parameters())
+    scheduler = deep_hydro._get_scheduler(dummy_train_cfgs["training_cfgs"], opt)
+    for epoch in range(1, 15):
+        # We start from epoch 1 rather than 0 to make it easier for humans to follow
+        # NOTE: scheduler.step() is called at the end of each epoch,
+        # so in the first epoch the lr is still the initial lr;
+        # one has to set the initial lr directly in the optimizer for the first epoch
+        print(f"epoch:{epoch}, lr:{opt.param_groups[0]['lr']}")
+        scheduler.step()
+    assert isinstance(scheduler, LambdaLR)
diff --git a/torchhydro/configs/config.py b/torchhydro/configs/config.py
index 7aa58ba..d605645 100644
--- a/torchhydro/configs/config.py
+++ b/torchhydro/configs/config.py
@@ -1,7 +1,7 @@
 """
 Author: Wenyu Ouyang
 Date: 2021-12-31 11:08:29
-LastEditTime: 2024-09-15 10:04:38
+LastEditTime: 2024-09-18 16:50:16
 LastEditors: Wenyu Ouyang
 Description: Config for hydroDL
 FilePath: \torchhydro\torchhydro\configs\config.py
@@ -221,16 +221,20 @@ def default_config_file():
             "criterion_params": None,
             # "weight_decay": None, a regularization term in loss func
             "optimizer": "Adam",
+            # "optim_params": {"lr": 0.001} means the initial learning rate is 0.001
             "optim_params": {},
             "lr_scheduler": {
-                # 1st opt config, all epochs use this lr
+                # 1st opt config, all epochs use this lr;
+                # this setting overrides the lr setting in "optim_params"
                 "lr": 0.001,
                 # 2nd opt config, diff epoch uses diff lr, key is epoch,
                 # start from 0, each value means the decay rate
+                # if the initial lr is 0.001, then 0: 0.5 means the lr at epoch 0 is 0.001*0.5=0.0005
                 # "lr_scheduler": {0: 1, 1: 0.5, 2: 0.2},
-                # 3rd opt config, lr as a initial value, and lr_factor as an exponential decay factor
+                # 3rd opt config, lr as an initial value (overrides the lr setting in "optim_params"),
+                # lr_factor as an exponential decay factor
                 # "lr": 0.001, "lr_factor": 0.1,
-                # 4th opt config, lr as a initial value,
+                # 4th opt config, lr as an initial value; it overrides the lr setting in "optim_params"
                 # lr_patience represent how many epochs without opt (we watch val_loss) could be tolerated
                 # if lr_patience is satisfied, then lr will be decayed by lr_factor by a linear way
                 # "lr": 0.001, "lr_factor": 0.1, "lr_patience": 1,
diff --git a/torchhydro/trainers/deep_hydro.py b/torchhydro/trainers/deep_hydro.py
index 3b41fbd..23e1b1e 100644
--- a/torchhydro/trainers/deep_hydro.py
+++ b/torchhydro/trainers/deep_hydro.py
@@ -1,7 +1,7 @@
 """
 Author: Wenyu Ouyang
 Date: 2024-04-08 18:15:48
-LastEditTime: 2024-09-16 10:19:34
+LastEditTime: 2024-09-18 11:18:15
 LastEditors: Wenyu Ouyang
 Description: HydroDL model class
 FilePath: \torchhydro\torchhydro\trainers\deep_hydro.py
@@ -297,7 +297,7 @@ def _get_scheduler(self, training_cfgs, opt):
             isinstance(epoch, int) for epoch in lr_scheduler_cfg
         ):
             scheduler = LambdaLR(
-                opt, lr_lambda=lambda epoch: lr_scheduler_cfg.get(epoch, 0.5)# 
initially start from 1 (forced); after epoch 10, start from 0.1
+                opt, lr_lambda=lambda epoch: lr_scheduler_cfg.get(epoch, 1.0)
             )
         elif "lr_factor" in lr_scheduler_cfg and "lr_patience" not in lr_scheduler_cfg:
             scheduler = ExponentialLR(opt, gamma=lr_scheduler_cfg["lr_factor"])
diff --git a/torchhydro/trainers/train_logger.py b/torchhydro/trainers/train_logger.py
index 23df44d..82c9193 100644
--- a/torchhydro/trainers/train_logger.py
+++ b/torchhydro/trainers/train_logger.py
@@ -1,7 +1,7 @@
 """
 Author: Wenyu Ouyang
 Date: 2021-12-31 11:08:29
-LastEditTime: 2024-05-04 11:30:00
+LastEditTime: 2024-09-18 15:40:10
 LastEditors: Wenyu Ouyang
 Description: Training function for DL models
 FilePath: \torchhydro\torchhydro\trainers\train_logger.py
@@ -73,7 +73,7 @@ def save_session_param(
     def log_epoch_train(self, epoch):
         start_time = time.time()
         logs = {}
-        # here content in the with block will be performed
+        # here the content in the 'with' block will be executed after the yield
        yield logs
         total_loss = logs["train_loss"]
         elapsed_time = time.time() - start_time
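
For context on the dict-style "lr_scheduler" option and the default multiplier this patch reverts to 1.0, below is a minimal standalone sketch (plain PyTorch, independent of torchhydro; the toy Linear model and the lr values are illustrative assumptions). It shows that LambdaLR recomputes the multiplier on every step, so epochs missing from the dict fall back to the initial lr rather than to 0.5:

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

# toy model and optimizer; the initial lr plays the role of "optim_params": {"lr": 0.01}
model = torch.nn.Linear(10, 1)
opt = Adam(model.parameters(), lr=0.01)

# epoch -> decay multiplier applied to the initial lr; epochs not listed fall back to 1.0
lr_scheduler_cfg = {1: 0.5, 10: 0.1}
scheduler = LambdaLR(opt, lr_lambda=lambda epoch: lr_scheduler_cfg.get(epoch, 1.0))

for epoch in range(12):
    # LambdaLR sets lr = initial_lr * lr_lambda(last_epoch) on every step, so the
    # printed lr is 0.01 for unlisted epochs, 0.005 at epoch 1, and 0.001 at epoch 10
    print(f"epoch:{epoch}, lr:{opt.param_groups[0]['lr']}")
    scheduler.step()  # optimizer.step() is omitted in this sketch, which only triggers a warning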
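
The new scheduler tests exercise a dispatch from the "lr_scheduler" config to a torch scheduler class. The following is a simplified sketch of that dispatch based on the visible deep_hydro.py hunk and the test expectations, not the actual library code; the branch ordering and the handling of the plain "lr" key are assumptions:

from torch.optim.lr_scheduler import ExponentialLR, LambdaLR, ReduceLROnPlateau


def pick_scheduler(lr_scheduler_cfg, opt):
    """Map an "lr_scheduler" config dict to a torch scheduler (sketch, assumed logic)."""
    if "lr" in lr_scheduler_cfg and len(lr_scheduler_cfg) == 1:
        # 1st option: one fixed lr for all epochs
        return LambdaLR(opt, lr_lambda=lambda epoch: 1.0)
    if lr_scheduler_cfg and all(isinstance(k, int) for k in lr_scheduler_cfg):
        # 2nd option: epoch-indexed decay factors; unlisted epochs keep the initial lr
        return LambdaLR(opt, lr_lambda=lambda epoch: lr_scheduler_cfg.get(epoch, 1.0))
    if "lr_factor" in lr_scheduler_cfg and "lr_patience" not in lr_scheduler_cfg:
        # 3rd option: exponential decay by lr_factor each epoch
        return ExponentialLR(opt, gamma=lr_scheduler_cfg["lr_factor"])
    if "lr_factor" in lr_scheduler_cfg and "lr_patience" in lr_scheduler_cfg:
        # 4th option: decay by lr_factor when val_loss stops improving for lr_patience epochs
        return ReduceLROnPlateau(
            opt,
            factor=lr_scheduler_cfg["lr_factor"],
            patience=lr_scheduler_cfg["lr_patience"],
        )
    raise ValueError("Invalid lr_scheduler configuration")

For example, pick_scheduler({"lr_factor": 0.9}, opt) returns an ExponentialLR, matching test_get_scheduler_exponential_lr, while an unrecognized key raises the same ValueError that test_get_scheduler_invalid_config expects.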