From 34909cbbd42335c6a7b8258b43cabad0d260b12f Mon Sep 17 00:00:00 2001 From: Simon W Date: Wed, 13 Dec 2023 09:37:22 -0800 Subject: [PATCH 001/128] minimal pytest --- tests/test_integration.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_integration.py b/tests/test_integration.py index e60b3a871..438f75c5c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1705,3 +1705,14 @@ def test_unused_future_regressors(): m.add_future_regressor("price") m.add_lagged_regressor("cost") m.fit(df, freq="D") + +def test_on_the_fly_sampling(): + start_date = "2019-01-01" + end_date = "2019-03-01" + date_range = pd.date_range(start=start_date, end=end_date, freq="H") + y = np.random.randint(0, 1000, size=(len(date_range),)) + df = pd.DataFrame({"ds": date_range, "y": y}) + + m = NeuralProphet(epochs=1) + m.fit(df, freq='H') + m.predict(df) From 687c08559ee282da4eec06ee89613db2a117d51f Mon Sep 17 00:00:00 2001 From: Simon W Date: Wed, 13 Dec 2023 10:02:49 -0800 Subject: [PATCH 002/128] move_func_getitem --- neuralprophet/time_dataset.py | 86 +++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index dca97da79..7a889508d 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -63,10 +63,54 @@ def __init__(self, df, name, **kwargs): "events", "regressors", ] - inputs, targets, drop_missing = tabularize_univariate_datetime(df, **kwargs) + + self.df = df + self.kwargs = kwargs + #inputs, targets, drop_missing = tabularize_univariate_datetime(df, **kwargs) + #self.init_after_tabularized(inputs, targets) + #self.filter_samples_after_init(kwargs["prediction_frequency"]) + #self.drop_nan_after_init(df, kwargs["predict_steps"], drop_missing) + + def __getitem__(self, index): + """Overrides parent class method to get an item at index. 
+ Parameters + ---------- + index : int + Sample location in dataset + Returns + ------- + OrderedDict + Model inputs, each of len(df) but with varying dimensions + Note + ---- + Contains the following data: + Model Inputs + * ``time`` (np.array, float), dims: (num_samples, 1) + * ``seasonalities`` (OrderedDict), named seasonalities + each with features (np.array, float) - dims: (num_samples, n_features[name]) + * ``lags`` (np.array, float), dims: (num_samples, n_lags) + * ``covariates`` (OrderedDict), named covariates, + each with features (np.array, float) of dims: (num_samples, n_lags) + * ``events`` (OrderedDict), events, + each with features (np.array, float) of dims: (num_samples, n_lags) + * ``regressors`` (OrderedDict), regressors, + each with features (np.array, float) of dims: (num_samples, n_lags) + np.array, float + Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) + """ + inputs, targets, drop_missing = tabularize_univariate_datetime(self.df, **self.kwargs) self.init_after_tabularized(inputs, targets) - self.filter_samples_after_init(kwargs["prediction_frequency"]) - self.drop_nan_after_init(df, kwargs["predict_steps"], drop_missing) + self.filter_samples_after_init(self.kwargs["prediction_frequency"]) + self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], drop_missing) + + sample = self.samples[index] + targets = self.targets[index] + meta = self.meta + return sample, targets, meta + + def __len__(self): + """Overrides Parent class method to get data length.""" + return self.length def drop_nan_after_init(self, df, predict_steps, drop_missing): """Checks if inputs/targets contain any NaN values and drops them, if user opts to. @@ -223,42 +267,6 @@ def filter_samples_after_init( sample.pop("timestamps") self.length = len(self.samples) - def __getitem__(self, index): - """Overrides parent class method to get an item at index. 
- Parameters - ---------- - index : int - Sample location in dataset - Returns - ------- - OrderedDict - Model inputs, each of len(df) but with varying dimensions - Note - ---- - Contains the following data: - Model Inputs - * ``time`` (np.array, float), dims: (num_samples, 1) - * ``seasonalities`` (OrderedDict), named seasonalities - each with features (np.array, float) - dims: (num_samples, n_features[name]) - * ``lags`` (np.array, float), dims: (num_samples, n_lags) - * ``covariates`` (OrderedDict), named covariates, - each with features (np.array, float) of dims: (num_samples, n_lags) - * ``events`` (OrderedDict), events, - each with features (np.array, float) of dims: (num_samples, n_lags) - * ``regressors`` (OrderedDict), regressors, - each with features (np.array, float) of dims: (num_samples, n_lags) - np.array, float - Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) - """ - sample = self.samples[index] - targets = self.targets[index] - meta = self.meta - return sample, targets, meta - - def __len__(self): - """Overrides Parent class method to get data length.""" - return self.length - def tabularize_univariate_datetime( df, From 5215340aa51f2c8ef5d71f7e1ce07db9d7b30433 Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 14 Dec 2023 19:26:26 -0800 Subject: [PATCH 003/128] slicing --- neuralprophet/time_dataset.py | 8 +++++++- tests/test_integration.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 7a889508d..afa69beab 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -98,7 +98,13 @@ def __getitem__(self, index): np.array, float Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ - inputs, targets, drop_missing = tabularize_univariate_datetime(self.df, **self.kwargs) + start_idx = index + #end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') - 1 #correct? 
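        # (A hedged aside: a complete training sample spans n_lags input steps
        # plus n_forecasts target steps, i.e. the commented-out formula above;
        # a later commit in this series ("lr-finder") adopts exactly
        #   end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts')
        # while the single-row slice below is an interim placeholder.)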
+ end_idx = start_idx + 1 + df_slice = self.df.iloc[start_idx:end_idx] + + # Functions + inputs, targets, drop_missing = tabularize_univariate_datetime(df_slice, **self.kwargs) self.init_after_tabularized(inputs, targets) self.filter_samples_after_init(self.kwargs["prediction_frequency"]) self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], drop_missing) diff --git a/tests/test_integration.py b/tests/test_integration.py index 438f75c5c..601b9dff9 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1708,7 +1708,7 @@ def test_unused_future_regressors(): def test_on_the_fly_sampling(): start_date = "2019-01-01" - end_date = "2019-03-01" + end_date = "2019-01-04" date_range = pd.date_range(start=start_date, end=end_date, freq="H") y = np.random.randint(0, 1000, size=(len(date_range),)) df = pd.DataFrame({"ds": date_range, "y": y}) From c70fae292623001d66ae1a0efddc7ff96162ab9a Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 14 Dec 2023 19:48:03 -0800 Subject: [PATCH 004/128] predict_mode --- neuralprophet/time_dataset.py | 11 +++++++---- tests/test_integration.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index afa69beab..885a06165 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -98,10 +98,13 @@ def __getitem__(self, index): np.array, float Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ - start_idx = index - #end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') - 1 #correct? - end_idx = start_idx + 1 - df_slice = self.df.iloc[start_idx:end_idx] + if self.kwargs['predict_mode']: + df_slice = self.df + else: + start_idx = index + #end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') - 1 #correct? 
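        # (Since the predict_mode branch above passes the full frame through,
        # this slice only runs during training; how long the training window
        # must be is still the open question flagged in the comment above.)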
+ end_idx = start_idx + 1 + df_slice = self.df.iloc[start_idx:end_idx] # Functions inputs, targets, drop_missing = tabularize_univariate_datetime(df_slice, **self.kwargs) diff --git a/tests/test_integration.py b/tests/test_integration.py index 601b9dff9..76517b084 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1713,6 +1713,6 @@ def test_on_the_fly_sampling(): y = np.random.randint(0, 1000, size=(len(date_range),)) df = pd.DataFrame({"ds": date_range, "y": y}) - m = NeuralProphet(epochs=1) + m = NeuralProphet(epochs=1, learning_rate=0.01) m.fit(df, freq='H') m.predict(df) From b78d5e021552d95daa800b36572d62b56ab47244 Mon Sep 17 00:00:00 2001 From: Simon W Date: Mon, 18 Dec 2023 13:20:43 -0800 Subject: [PATCH 005/128] typos --- neuralprophet/forecaster.py | 4 ++-- neuralprophet/time_dataset.py | 2 ++ tests/test_integration.py | 15 ++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index 852fc297b..d81712388 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -2684,7 +2684,7 @@ def _train( val_dataloaders=val_loader, **self.config_train.lr_finder_args, ) - # Estimate the optimat learning rate from the loss curve + # Estimate the optimal learning rate from the loss curve assert lr_finder is not None _, _, lr_suggestion = utils.smooth_loss_and_suggest(lr_finder.results) self.model.learning_rate = lr_suggestion @@ -2706,7 +2706,7 @@ def _train( **self.config_train.lr_finder_args, ) assert lr_finder is not None - # Estimate the optimat learning rate from the loss curve + # Estimate the optimal learning rate from the loss curve _, _, lr_suggestion = utils.smooth_loss_and_suggest(lr_finder.results) self.model.learning_rate = lr_suggestion start = time.time() diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 885a06165..ef1d7baa5 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -106,6 +106,8 @@ def __getitem__(self, index): end_idx = start_idx + 1 df_slice = self.df.iloc[start_idx:end_idx] + #df_slice = self.df + # Functions inputs, targets, drop_missing = tabularize_univariate_datetime(df_slice, **self.kwargs) self.init_after_tabularized(inputs, targets) diff --git a/tests/test_integration.py b/tests/test_integration.py index 76517b084..ead9a17a7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1710,9 +1710,14 @@ def test_on_the_fly_sampling(): start_date = "2019-01-01" end_date = "2019-01-04" date_range = pd.date_range(start=start_date, end=end_date, freq="H") - y = np.random.randint(0, 1000, size=(len(date_range),)) - df = pd.DataFrame({"ds": date_range, "y": y}) - - m = NeuralProphet(epochs=1, learning_rate=0.01) + #y = np.random.randint(0, 1000, size=(len(date_range),)) + #df = pd.DataFrame({"ds": date_range, "y": y}) + df = pd.DataFrame( + { + "ds": {0: "2022-10-16 00:00:00", 1: "2022-10-17 00:00:00", 2: "2022-10-18 00:00:00", 3: "2022-10-19 00:00:00", 4: "2022-10-20 00:00:00",}, + "y": {0: 17, 1: 18, 2: 10, 3: 8, 4: 5}, + } + ) + m = NeuralProphet(epochs=1) #, learning_rate=0.01) m.fit(df, freq='H') - m.predict(df) + metrics = m.predict(df) From beae5bb21ce8d870ff0fd212f70ac2c462c7ee2d Mon Sep 17 00:00:00 2001 From: Simon W Date: Mon, 18 Dec 2023 17:56:49 -0800 Subject: [PATCH 006/128] lr-finder --- neuralprophet/data/process.py | 1 + neuralprophet/time_dataset.py | 12 +++++++----- tests/test_integration.py | 19 +++++++------------ 3 files changed, 15 insertions(+), 
17 deletions(-) diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py index 9f8861016..c9190f21a 100644 --- a/neuralprophet/data/process.py +++ b/neuralprophet/data/process.py @@ -623,4 +623,5 @@ def _create_dataset(model, df, predict_mode, prediction_frequency=None): config_regressors=model.config_regressors, config_missing=model.config_missing, prediction_frequency=prediction_frequency, + config_train=model.config_train ) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index ef1d7baa5..fdfaf7503 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -98,16 +98,17 @@ def __getitem__(self, index): np.array, float Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ - if self.kwargs['predict_mode']: + learning_rate = self.kwargs['config_train'].learning_rate + # TODO: Drop config_train from self! + + if self.kwargs['predict_mode'] or (learning_rate is None): df_slice = self.df else: start_idx = index - #end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') - 1 #correct? - end_idx = start_idx + 1 + end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') + #end_idx = start_idx + 1 df_slice = self.df.iloc[start_idx:end_idx] - #df_slice = self.df - # Functions inputs, targets, drop_missing = tabularize_univariate_datetime(df_slice, **self.kwargs) self.init_after_tabularized(inputs, targets) @@ -291,6 +292,7 @@ def tabularize_univariate_datetime( config_lagged_regressors: Optional[configure.ConfigLaggedRegressors] = None, config_regressors: Optional[configure.ConfigFutureRegressors] = None, config_missing=None, + config_train=None, prediction_frequency=None, ): """Create a tabular dataset from univariate timeseries for supervised forecasting. 
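The config_train handle threaded through above is what lets the dataset decide between eager and on-the-fly tabularization: NeuralProphet runs its learning-rate finder when no learning_rate is supplied, and that sweep needs the samples materialized up front. A minimal sketch of the gating, with an illustrative helper name that is not part of the patch:

    def needs_precomputed_samples(predict_mode: bool, config_train) -> bool:
        # Prediction iterates every window at once, and the LR finder (run
        # when learning_rate is None) consumes the dataset before training
        # starts, so both cases require eagerly tabularized samples.
        return predict_mode or config_train.learning_rate is None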
diff --git a/tests/test_integration.py b/tests/test_integration.py index ead9a17a7..5a9cf80b9 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1707,17 +1707,12 @@ def test_unused_future_regressors(): m.fit(df, freq="D") def test_on_the_fly_sampling(): - start_date = "2019-01-01" - end_date = "2019-01-04" - date_range = pd.date_range(start=start_date, end=end_date, freq="H") - #y = np.random.randint(0, 1000, size=(len(date_range),)) - #df = pd.DataFrame({"ds": date_range, "y": y}) - df = pd.DataFrame( - { - "ds": {0: "2022-10-16 00:00:00", 1: "2022-10-17 00:00:00", 2: "2022-10-18 00:00:00", 3: "2022-10-19 00:00:00", 4: "2022-10-20 00:00:00",}, - "y": {0: 17, 1: 18, 2: 10, 3: 8, 4: 5}, - } - ) - m = NeuralProphet(epochs=1) #, learning_rate=0.01) + start_date = "2022-10-16 00:00:00" + end_date = "2022-12-30 00:00:00" + date_range = pd.date_range(start=start_date, end=end_date, freq="D") + y = np.random.randint(0, 20, size=(len(date_range),)) + df = pd.DataFrame({"ds": date_range, "y": y}) + + m = NeuralProphet(epochs=1, learning_rate=0.01) m.fit(df, freq='H') metrics = m.predict(df) From 8427ffc46da4aad65f017d937eff801b0f7f1642 Mon Sep 17 00:00:00 2001 From: Simon W Date: Tue, 19 Dec 2023 12:05:35 -0800 Subject: [PATCH 007/128] drop_missing --- neuralprophet/time_dataset.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index fdfaf7503..5f090400c 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -106,14 +106,13 @@ def __getitem__(self, index): else: start_idx = index end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') - #end_idx = start_idx + 1 df_slice = self.df.iloc[start_idx:end_idx] # Functions - inputs, targets, drop_missing = tabularize_univariate_datetime(df_slice, **self.kwargs) + inputs, targets = tabularize_univariate_datetime(df_slice, **self.kwargs) self.init_after_tabularized(inputs, targets) self.filter_samples_after_init(self.kwargs["prediction_frequency"]) - self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], drop_missing) + self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) sample = self.samples[index] targets = self.targets[index] @@ -502,7 +501,7 @@ def _stride_timestamps_for_forecasts(x): tabularized_input_shapes_str += f" {key} {value.shape} \n" log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") - return inputs, targets, config_missing.drop_missing + return inputs, targets def fourier_series(dates, period, series_order): From ff05b2a179f672f3b8cf86db6defe3211b0a814c Mon Sep 17 00:00:00 2001 From: Simon W Date: Tue, 19 Dec 2023 13:54:23 -0800 Subject: [PATCH 008/128] predict_v2 --- neuralprophet/forecaster.py | 1 + neuralprophet/time_dataset.py | 52 +++++++++++++++++++++++------------ tests/test_integration.py | 1 + 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index d81712388..72640cbe0 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -1774,6 +1774,7 @@ def predict_seasonal_components(self, df: pd.DataFrame, quantile: float = 0.5): predict_mode=True, config_missing=self.config_missing, prediction_frequency=self.prediction_frequency, + config_train=self.config_train, ) loader = DataLoader(dataset, batch_size=min(4096, len(df)), shuffle=False, drop_last=False) predicted = {} diff --git 
a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 5f090400c..0a7910c40 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -63,13 +63,16 @@ def __init__(self, df, name, **kwargs): "events", "regressors", ] - - self.df = df self.kwargs = kwargs - #inputs, targets, drop_missing = tabularize_univariate_datetime(df, **kwargs) - #self.init_after_tabularized(inputs, targets) - #self.filter_samples_after_init(kwargs["prediction_frequency"]) - #self.drop_nan_after_init(df, kwargs["predict_steps"], drop_missing) + + learning_rate = kwargs['config_train'].learning_rate + if kwargs['predict_mode'] or (learning_rate is None): + inputs, targets = tabularize_univariate_datetime(df, **kwargs) + self.init_after_tabularized(inputs, targets) + self.filter_samples_after_init(kwargs["prediction_frequency"]) + self.drop_nan_after_init(df, kwargs["predict_steps"], kwargs["config_missing"].drop_missing) + else: + self.df = df def __getitem__(self, index): """Overrides parent class method to get an item at index. @@ -98,31 +101,44 @@ def __getitem__(self, index): np.array, float Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ - learning_rate = self.kwargs['config_train'].learning_rate # TODO: Drop config_train from self! - + learning_rate = self.kwargs['config_train'].learning_rate if self.kwargs['predict_mode'] or (learning_rate is None): - df_slice = self.df + sample = self.samples[index] + targets = self.targets[index] + meta = self.meta + return sample, targets, meta else: start_idx = index end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') df_slice = self.df.iloc[start_idx:end_idx] - # Functions - inputs, targets = tabularize_univariate_datetime(df_slice, **self.kwargs) - self.init_after_tabularized(inputs, targets) - self.filter_samples_after_init(self.kwargs["prediction_frequency"]) - self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + # Functions + inputs, targets = tabularize_univariate_datetime(df_slice, **self.kwargs) + self.init_after_tabularized(inputs, targets) + self.filter_samples_after_init(self.kwargs["prediction_frequency"]) + self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - sample = self.samples[index] - targets = self.targets[index] - meta = self.meta - return sample, targets, meta + sample = self.samples[index] + targets = self.targets[index] + meta = self.meta + return sample, targets, meta def __len__(self): """Overrides Parent class method to get data length.""" return self.length + def drop_nan_init(self, drop_missing): + """Checks if inputs/targets contain any NaN values and drops them, if user opts to. + Parameters + ---------- + drop_missing : bool + whether to automatically drop missing samples from the data + predict_steps : int + number of steps to predict + """ + + def drop_nan_after_init(self, df, predict_steps, drop_missing): """Checks if inputs/targets contain any NaN values and drops them, if user opts to. 
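        If ``drop_missing`` is False and NaN values remain after imputation,
        a ValueError is raised instead of silently dropping the samples.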
Parameters diff --git a/tests/test_integration.py b/tests/test_integration.py index 5a9cf80b9..6d1799f64 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1712,6 +1712,7 @@ def test_on_the_fly_sampling(): date_range = pd.date_range(start=start_date, end=end_date, freq="D") y = np.random.randint(0, 20, size=(len(date_range),)) df = pd.DataFrame({"ds": date_range, "y": y}) + df.loc[3, "y"] = np.nan m = NeuralProphet(epochs=1, learning_rate=0.01) m.fit(df, freq='H') From c408e950095b83c7711c807eedec57df8a65bb1e Mon Sep 17 00:00:00 2001 From: Simon W Date: Tue, 19 Dec 2023 15:16:14 -0800 Subject: [PATCH 009/128] predict_v3 --- tests/test_unit.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_unit.py b/tests/test_unit.py index 7600f8c3d..c07c98527 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -82,7 +82,7 @@ def test_time_dataset(): local_data_params, global_data_params = df_utils.init_data_params(df=df, normalize="minmax") df = df.drop("ID", axis=1) df = df_utils.normalize(df, global_data_params) - inputs, targets, _ = time_dataset.tabularize_univariate_datetime( + inputs, targets = time_dataset.tabularize_univariate_datetime( df, n_lags=n_lags, n_forecasts=n_forecasts, config_missing=config_missing ) log.debug( @@ -806,6 +806,13 @@ def test_too_many_NaN(): config_missing = configure.MissingDataHandling( impute_missing=True, impute_linear=5, impute_rolling=5, drop_missing=False ) + config_train = configure.Train( + learning_rate=LR, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + loss_func="SmoothL1Loss", + optimizer="AdamW", + ) length = 100 days = pd.date_range(start="2017-01-01", periods=length) y = np.ones(length) @@ -825,7 +832,7 @@ def test_too_many_NaN(): df["ID"] = "__df__" # Check if ValueError is thrown, if NaN values remain after auto-imputing with pytest.raises(ValueError): - time_dataset.TimeDataset(df, "name", config_missing=config_missing, predict_steps=1, prediction_frequency=None) + time_dataset.TimeDataset(df, "name", predict_mode=False, config_missing=config_missing, config_train=config_train, predict_steps=1, prediction_frequency=None) def test_future_df_with_nan(): From df29f33fbbbd32815d3405acca9e9c59cf99cb29 Mon Sep 17 00:00:00 2001 From: Simon W Date: Wed, 20 Dec 2023 11:01:41 -0800 Subject: [PATCH 010/128] samples --- neuralprophet/time_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 0a7910c40..db5727448 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -361,6 +361,11 @@ def tabularize_univariate_datetime( """ max_lags = get_max_num_lags(config_lagged_regressors, n_lags) n_samples = len(df) - max_lags + 1 - n_forecasts + #TODO + #n_samples = max_lags + n_forecasts + #if n_samples < 0: + # n_samples = max_lags + n_forecasts + # data is stored in OrderedDict inputs = OrderedDict({}) From 29fe999148ff1c6e8a23c701f0248ed859314e7e Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 21 Dec 2023 11:10:22 -0800 Subject: [PATCH 011/128] lagged regressor n_lags --- neuralprophet/time_dataset.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index db5727448..8ea20ebd2 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -66,7 +66,7 @@ def __init__(self, df, name, **kwargs): self.kwargs = kwargs learning_rate = kwargs['config_train'].learning_rate - if 
kwargs['predict_mode'] or (learning_rate is None): + if kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors']: inputs, targets = tabularize_univariate_datetime(df, **kwargs) self.init_after_tabularized(inputs, targets) self.filter_samples_after_init(kwargs["prediction_frequency"]) @@ -103,7 +103,7 @@ def __getitem__(self, index): """ # TODO: Drop config_train from self! learning_rate = self.kwargs['config_train'].learning_rate - if self.kwargs['predict_mode'] or (learning_rate is None): + if self.kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors']: sample = self.samples[index] targets = self.targets[index] meta = self.meta @@ -111,6 +111,7 @@ def __getitem__(self, index): else: start_idx = index end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') + df_slice = self.df.iloc[start_idx:end_idx] # Functions @@ -360,11 +361,13 @@ def tabularize_univariate_datetime( Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ max_lags = get_max_num_lags(config_lagged_regressors, n_lags) - n_samples = len(df) - max_lags + 1 - n_forecasts + #n_samples = len(df) - max_lags + 1 - n_forecasts #TODO - #n_samples = max_lags + n_forecasts - #if n_samples < 0: - # n_samples = max_lags + n_forecasts + learning_rate = config_train.learning_rate + if predict_mode or (learning_rate is None): + n_samples = len(df) - max_lags + 1 - n_forecasts + else: + n_samples=1 # data is stored in OrderedDict inputs = OrderedDict({}) From 2f584c23a66c99ed83ea33f7fce73ccda3b8dc7a Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 21 Dec 2023 15:31:58 -0800 Subject: [PATCH 012/128] preliminary: events, holidays --- neuralprophet/time_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 8ea20ebd2..26b822990 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -66,7 +66,7 @@ def __init__(self, df, name, **kwargs): self.kwargs = kwargs learning_rate = kwargs['config_train'].learning_rate - if kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors']: + if kwargs['predict_mode'] or (learning_rate is None) or kwargs['config_lagged_regressors'] or kwargs['config_country_holidays'] or kwargs['config_events']: inputs, targets = tabularize_univariate_datetime(df, **kwargs) self.init_after_tabularized(inputs, targets) self.filter_samples_after_init(kwargs["prediction_frequency"]) @@ -103,7 +103,7 @@ def __getitem__(self, index): """ # TODO: Drop config_train from self! 
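        # (Each lagged regressor may declare its own n_lags, so a slice of
        # length n_lags + n_forecasts taken at one index can be too short for
        # it; until a max-lag window is computed later in this series, lagged
        # regressors stay on the precomputed path.)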
learning_rate = self.kwargs['config_train'].learning_rate - if self.kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors']: + if self.kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors'] or self.kwargs['config_country_holidays'] or self.kwargs['config_events']: sample = self.samples[index] targets = self.targets[index] meta = self.meta @@ -364,7 +364,7 @@ def tabularize_univariate_datetime( #n_samples = len(df) - max_lags + 1 - n_forecasts #TODO learning_rate = config_train.learning_rate - if predict_mode or (learning_rate is None): + if predict_mode or (learning_rate is None) or config_lagged_regressors or config_country_holidays or config_events: n_samples = len(df) - max_lags + 1 - n_forecasts else: n_samples=1 From fca7adff3d035ab5e47e44f90faa4e3bbd83ef3e Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 21 Dec 2023 15:47:15 -0800 Subject: [PATCH 013/128] adjustes pytests --- tests/test_unit.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_unit.py b/tests/test_unit.py index c07c98527..b8d6a26d8 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -76,6 +76,13 @@ def test_time_dataset(): n_forecasts = 1 valid_p = 0.2 config_missing = configure.MissingDataHandling() + config_train = configure.Train( + learning_rate=LR, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + loss_func="SmoothL1Loss", + optimizer="AdamW", + ) df_train, df_val = df_utils.split_df(df_in, n_lags, n_forecasts, valid_p) # create a tabularized dataset from time series df, _, _ = df_utils.check_dataframe(df_train) @@ -83,7 +90,7 @@ def test_time_dataset(): df = df.drop("ID", axis=1) df = df_utils.normalize(df, global_data_params) inputs, targets = time_dataset.tabularize_univariate_datetime( - df, n_lags=n_lags, n_forecasts=n_forecasts, config_missing=config_missing + df, n_lags=n_lags, n_forecasts=n_forecasts, config_missing=config_missing, config_train=config_train ) log.debug( "tabularized inputs: {}".format( From 139a97f908564175c73cbab6ecd5e9d6787afdbd Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 21 Dec 2023 16:12:28 -0800 Subject: [PATCH 014/128] selective forecasting --- neuralprophet/time_dataset.py | 6 +++--- tests/test_unit.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 26b822990..7642eb06f 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -66,7 +66,7 @@ def __init__(self, df, name, **kwargs): self.kwargs = kwargs learning_rate = kwargs['config_train'].learning_rate - if kwargs['predict_mode'] or (learning_rate is None) or kwargs['config_lagged_regressors'] or kwargs['config_country_holidays'] or kwargs['config_events']: + if kwargs['predict_mode'] or (learning_rate is None) or kwargs['config_lagged_regressors'] or kwargs['config_country_holidays'] or kwargs['config_events'] or kwargs['prediction_frequency']: inputs, targets = tabularize_univariate_datetime(df, **kwargs) self.init_after_tabularized(inputs, targets) self.filter_samples_after_init(kwargs["prediction_frequency"]) @@ -103,7 +103,7 @@ def __getitem__(self, index): """ # TODO: Drop config_train from self! 
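        # (Events and country holidays are expanded over the full date range
        # rather than per window, so they likewise fall back to the
        # precomputed path for now.)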
learning_rate = self.kwargs['config_train'].learning_rate - if self.kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors'] or self.kwargs['config_country_holidays'] or self.kwargs['config_events']: + if self.kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors'] or self.kwargs['config_country_holidays'] or self.kwargs['config_events'] or self.kwargs['prediction_frequency']: sample = self.samples[index] targets = self.targets[index] meta = self.meta @@ -364,7 +364,7 @@ def tabularize_univariate_datetime( #n_samples = len(df) - max_lags + 1 - n_forecasts #TODO learning_rate = config_train.learning_rate - if predict_mode or (learning_rate is None) or config_lagged_regressors or config_country_holidays or config_events: + if predict_mode or (learning_rate is None) or config_lagged_regressors or config_country_holidays or config_events or prediction_frequency: n_samples = len(df) - max_lags + 1 - n_forecasts else: n_samples=1 diff --git a/tests/test_unit.py b/tests/test_unit.py index b8d6a26d8..6a3df35bb 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -814,7 +814,7 @@ def test_too_many_NaN(): impute_missing=True, impute_linear=5, impute_rolling=5, drop_missing=False ) config_train = configure.Train( - learning_rate=LR, + learning_rate=None, epochs=EPOCHS, batch_size=BATCH_SIZE, loss_func="SmoothL1Loss", @@ -839,7 +839,7 @@ def test_too_many_NaN(): df["ID"] = "__df__" # Check if ValueError is thrown, if NaN values remain after auto-imputing with pytest.raises(ValueError): - time_dataset.TimeDataset(df, "name", predict_mode=False, config_missing=config_missing, config_train=config_train, predict_steps=1, prediction_frequency=None) + time_dataset.TimeDataset(df, "name", predict_mode=False, config_missing=config_missing, config_lagged_regressors=None, config_country_holidays=None, config_events=None, config_train=config_train, predict_steps=1, prediction_frequency=None) def test_future_df_with_nan(): From 30aa303449de2dae12d8e44d884157010d252777 Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 21 Dec 2023 16:26:19 -0800 Subject: [PATCH 015/128] black --- neuralprophet/data/process.py | 2 +- neuralprophet/time_dataset.py | 40 ++++++++++++++++++++++++++--------- tests/test_integration.py | 3 ++- tests/test_unit.py | 13 +++++++++++- 4 files changed, 45 insertions(+), 13 deletions(-) diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py index c9190f21a..f3e44f9bb 100644 --- a/neuralprophet/data/process.py +++ b/neuralprophet/data/process.py @@ -623,5 +623,5 @@ def _create_dataset(model, df, predict_mode, prediction_frequency=None): config_regressors=model.config_regressors, config_missing=model.config_missing, prediction_frequency=prediction_frequency, - config_train=model.config_train + config_train=model.config_train, ) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 7642eb06f..f93e4e7a3 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -65,8 +65,15 @@ def __init__(self, df, name, **kwargs): ] self.kwargs = kwargs - learning_rate = kwargs['config_train'].learning_rate - if kwargs['predict_mode'] or (learning_rate is None) or kwargs['config_lagged_regressors'] or kwargs['config_country_holidays'] or kwargs['config_events'] or kwargs['prediction_frequency']: + learning_rate = kwargs["config_train"].learning_rate + if ( + kwargs["predict_mode"] + or (learning_rate is None) + or kwargs["config_lagged_regressors"] + or 
kwargs["config_country_holidays"] + or kwargs["config_events"] + or kwargs["prediction_frequency"] + ): inputs, targets = tabularize_univariate_datetime(df, **kwargs) self.init_after_tabularized(inputs, targets) self.filter_samples_after_init(kwargs["prediction_frequency"]) @@ -102,15 +109,22 @@ def __getitem__(self, index): Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ # TODO: Drop config_train from self! - learning_rate = self.kwargs['config_train'].learning_rate - if self.kwargs['predict_mode'] or (learning_rate is None) or self.kwargs['config_lagged_regressors'] or self.kwargs['config_country_holidays'] or self.kwargs['config_events'] or self.kwargs['prediction_frequency']: + learning_rate = self.kwargs["config_train"].learning_rate + if ( + self.kwargs["predict_mode"] + or (learning_rate is None) + or self.kwargs["config_lagged_regressors"] + or self.kwargs["config_country_holidays"] + or self.kwargs["config_events"] + or self.kwargs["prediction_frequency"] + ): sample = self.samples[index] targets = self.targets[index] meta = self.meta return sample, targets, meta else: start_idx = index - end_idx = start_idx + self.kwargs.get('n_lags') + self.kwargs.get('n_forecasts') + end_idx = start_idx + self.kwargs.get("n_lags") + self.kwargs.get("n_forecasts") df_slice = self.df.iloc[start_idx:end_idx] @@ -139,7 +153,6 @@ def drop_nan_init(self, drop_missing): number of steps to predict """ - def drop_nan_after_init(self, df, predict_steps, drop_missing): """Checks if inputs/targets contain any NaN values and drops them, if user opts to. Parameters @@ -361,13 +374,20 @@ def tabularize_univariate_datetime( Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ max_lags = get_max_num_lags(config_lagged_regressors, n_lags) - #n_samples = len(df) - max_lags + 1 - n_forecasts - #TODO + # n_samples = len(df) - max_lags + 1 - n_forecasts + # TODO learning_rate = config_train.learning_rate - if predict_mode or (learning_rate is None) or config_lagged_regressors or config_country_holidays or config_events or prediction_frequency: + if ( + predict_mode + or (learning_rate is None) + or config_lagged_regressors + or config_country_holidays + or config_events + or prediction_frequency + ): n_samples = len(df) - max_lags + 1 - n_forecasts else: - n_samples=1 + n_samples = 1 # data is stored in OrderedDict inputs = OrderedDict({}) diff --git a/tests/test_integration.py b/tests/test_integration.py index 6d1799f64..cdb3eebda 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1706,6 +1706,7 @@ def test_unused_future_regressors(): m.add_lagged_regressor("cost") m.fit(df, freq="D") + def test_on_the_fly_sampling(): start_date = "2022-10-16 00:00:00" end_date = "2022-12-30 00:00:00" @@ -1715,5 +1716,5 @@ def test_on_the_fly_sampling(): df.loc[3, "y"] = np.nan m = NeuralProphet(epochs=1, learning_rate=0.01) - m.fit(df, freq='H') + m.fit(df, freq="H") metrics = m.predict(df) diff --git a/tests/test_unit.py b/tests/test_unit.py index 6a3df35bb..be4d7d55a 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -839,7 +839,18 @@ def test_too_many_NaN(): df["ID"] = "__df__" # Check if ValueError is thrown, if NaN values remain after auto-imputing with pytest.raises(ValueError): - time_dataset.TimeDataset(df, "name", predict_mode=False, config_missing=config_missing, config_lagged_regressors=None, config_country_holidays=None, config_events=None, config_train=config_train, 
predict_steps=1, prediction_frequency=None) + time_dataset.TimeDataset( + df, + "name", + predict_mode=False, + config_missing=config_missing, + config_lagged_regressors=None, + config_country_holidays=None, + config_events=None, + config_train=config_train, + predict_steps=1, + prediction_frequency=None, + ) def test_future_df_with_nan(): From 381c9129d1ac3b57857e8b5cf10f9857e0f7e897 Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 21 Dec 2023 16:28:30 -0800 Subject: [PATCH 016/128] ruff --- tests/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index cdb3eebda..4876f502a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1717,4 +1717,4 @@ def test_on_the_fly_sampling(): m = NeuralProphet(epochs=1, learning_rate=0.01) m.fit(df, freq="H") - metrics = m.predict(df) + _ = m.predict(df) From 660934c0696806a81ea9da73fd44c2d5840b9161 Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 4 Jan 2024 12:29:37 +0100 Subject: [PATCH 017/128] lagged_regressors --- neuralprophet/time_dataset.py | 14 +++++++++++++- tests/test_integration.py | 1 - 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index f93e4e7a3..333bc5d9c 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -124,7 +124,19 @@ def __getitem__(self, index): return sample, targets, meta else: start_idx = index - end_idx = start_idx + self.kwargs.get("n_lags") + self.kwargs.get("n_forecasts") + + # Lagged Regressors + if self.kwargs["config_lagged_regressors"]: + n_lagged_regressor_list = [] + for dict_name, nested_dict in self.kwargs["config_lagged_regressors"].items(): + name_of_nested_dict = dict_name + n_lagged_regressor = self.kwargs["config_lagged_regressors"][name_of_nested_dict].n_lags + n_lagged_regressor_list.append(n_lagged_regressor) + max_lag = max(self.kwargs["n_lags"], *n_lagged_regressor_list) + end_idx = start_idx + max_lag + self.kwargs.get("n_forecasts") + + else: + end_idx = start_idx + self.kwargs.get("n_lags") + self.kwargs.get("n_forecasts") df_slice = self.df.iloc[start_idx:end_idx] diff --git a/tests/test_integration.py b/tests/test_integration.py index 4876f502a..730493828 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1713,7 +1713,6 @@ def test_on_the_fly_sampling(): date_range = pd.date_range(start=start_date, end=end_date, freq="D") y = np.random.randint(0, 20, size=(len(date_range),)) df = pd.DataFrame({"ds": date_range, "y": y}) - df.loc[3, "y"] = np.nan m = NeuralProphet(epochs=1, learning_rate=0.01) m.fit(df, freq="H") From 51fa0a65c982923596c74192a8644071899a8f56 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 12 Jan 2024 11:58:38 -0800 Subject: [PATCH 018/128] Note down df path to TimeDataset --- neuralprophet/time_dataset.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 333bc5d9c..25493d4c7 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -52,7 +52,34 @@ def __init__(self, df, name, **kwargs): **kwargs : dict Identical to :meth:`tabularize_univariate_datetime` """ + self.df_original = df self.name = name + + # Currently done to df before it arrives here: + # - fit calls prep_or_copy_df, _check_dataframe, and _handle_missing_data, passes to _train + # - _train calls prep_or_copy_df, then passes to init_train_loader, which returns 
the train_loader + # - init_train_loader calls prep_or_copy_df, _normalize, _create_dataset (returns TimeDataset), returns dataset wrapped in DataLoader + # _create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset + + # Filter missing samples and prediction frequency (does not actually drop, but index) + # filter samples + # drop nan + + # Create index mapping of sample index to df index + + # Preprocessing of features (added to df_original) + # events and holidays + + # TBD + # meta + + # Outcome after a call to init: + # + + # Things that will not be done in init, but on the fly: + # tabularize all features for each sample, return as input, targets + + #### OLD self.length = None self.inputs = OrderedDict({}) self.targets = None From da74f87fc4646f03d51b4788dbce0a2725987eb2 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 12 Jan 2024 12:13:38 -0800 Subject: [PATCH 019/128] complete notes on TimeDataset, move meta --- neuralprophet/time_dataset.py | 43 ++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 25493d4c7..6b493421b 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -52,38 +52,46 @@ def __init__(self, df, name, **kwargs): **kwargs : dict Identical to :meth:`tabularize_univariate_datetime` """ - self.df_original = df + self.df = df self.name = name + self.meta = OrderedDict({}) + self.meta["df_name"] = self.name # Currently done to df before it arrives here: - # - fit calls prep_or_copy_df, _check_dataframe, and _handle_missing_data, passes to _train - # - _train calls prep_or_copy_df, then passes to init_train_loader, which returns the train_loader - # - init_train_loader calls prep_or_copy_df, _normalize, _create_dataset (returns TimeDataset), returns dataset wrapped in DataLoader - # _create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset - - # Filter missing samples and prediction frequency (does not actually drop, but index) - # filter samples - # drop nan + # -> fit calls prep_or_copy_df, _check_dataframe, and _handle_missing_data, passes to _train + # -> _train calls prep_or_copy_df, then passes to init_train_loader, which returns the train_loader + # -> init_train_loader calls prep_or_copy_df, _normalize, _create_dataset (returns TimeDataset), returns dataset wrapped in DataLoader + # ->_create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset # Create index mapping of sample index to df index + # - Filter missing samples and prediction frequency (does not actually drop, but index) + # -- filter samples + # -- drop nan + # - Indexing: + # -- Note, outer indexing connected to self.length - # Preprocessing of features (added to df_original) - # events and holidays + # Preprocessing of features (added to df) + # - events and holidays - # TBD - # meta + # TODO: + # - init_after_tabularized: What must happen here, others in __getitem__? + # - define what happens in __getitem__ - # Outcome after a call to init: - # + # Future TBD + # - integration of preprocessing steps happening outside? 
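        # (A hedged sketch of the mapping these notes describe, as it is
        # implemented further on in this series:
        #   valid = prediction_frequency_mask & target_start_end_mask & nan_mask
        #   self.sample2index_map = np.arange(len(df))[valid]
        #   self.length = len(self.sample2index_map)
        # )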
- # Things that will not be done in init, but on the fly: + # Outcome after a call to init (summary): + # - add events and holidays columns to df + # - calculated the number of usable samples (accounting for nan and filters) + # - creates mapping of sample index to df index + + # Done later on the fly when calling __getitem__: # tabularize all features for each sample, return as input, targets #### OLD self.length = None self.inputs = OrderedDict({}) self.targets = None - self.meta = OrderedDict({}) self.two_level_inputs = [ "seasonalities", "covariates", @@ -300,7 +308,6 @@ def init_after_tabularized(self, inputs, targets=None): else: self.inputs[key] = torch.from_numpy(data).type(inputs_dtype[key]) self.targets = torch.from_numpy(targets).type(targets_dtype).unsqueeze(dim=2) - self.meta["df_name"] = self.name self.samples = self._split_nested_dict(self.inputs) def filter_samples_after_init( From 97fbe0799bf9d50989aab6874034660563d1d745 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Mon, 15 Jan 2024 16:57:49 -0800 Subject: [PATCH 020/128] Big rewrite with real and pseudocode --- neuralprophet/time_dataset.py | 228 ++++++++++++++++------------------ 1 file changed, 105 insertions(+), 123 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 6b493421b..960a4fca2 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -52,69 +52,36 @@ def __init__(self, df, name, **kwargs): **kwargs : dict Identical to :meth:`tabularize_univariate_datetime` """ - self.df = df - self.name = name - self.meta = OrderedDict({}) - self.meta["df_name"] = self.name + ## Outcome after a call to init (summary): + # - add events and holidays columns to df + # - calculated the number of usable samples (accounting for nan and filters) + # - creates mapping of sample index to df index + ## Context Notes # Currently done to df before it arrives here: # -> fit calls prep_or_copy_df, _check_dataframe, and _handle_missing_data, passes to _train # -> _train calls prep_or_copy_df, then passes to init_train_loader, which returns the train_loader # -> init_train_loader calls prep_or_copy_df, _normalize, _create_dataset (returns TimeDataset), returns dataset wrapped in DataLoader # ->_create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset + # Future TODO: integrate these preprocessing steps happening outside? - # Create index mapping of sample index to df index - # - Filter missing samples and prediction frequency (does not actually drop, but index) - # -- filter samples - # -- drop nan - # - Indexing: - # -- Note, outer indexing connected to self.length - - # Preprocessing of features (added to df) - # - events and holidays - - # TODO: - # - init_after_tabularized: What must happen here, others in __getitem__? - # - define what happens in __getitem__ - - # Future TBD - # - integration of preprocessing steps happening outside? 
- - # Outcome after a call to init (summary): - # - add events and holidays columns to df - # - calculated the number of usable samples (accounting for nan and filters) - # - creates mapping of sample index to df index + self.df = df + self.name = name + self.meta = OrderedDict({}) + self.meta["df_name"] = self.name - # Done later on the fly when calling __getitem__: - # tabularize all features for each sample, return as input, targets + # TODO: Preprocessing of features (added to self.df) + # - events and holidays: convert date-time occurence dictionary to a column of values in the self.df + # - These will then be later tabularized in __get_item___ - #### OLD - self.length = None - self.inputs = OrderedDict({}) - self.targets = None - self.two_level_inputs = [ - "seasonalities", - "covariates", - "events", - "regressors", - ] - self.kwargs = kwargs - - learning_rate = kwargs["config_train"].learning_rate - if ( - kwargs["predict_mode"] - or (learning_rate is None) - or kwargs["config_lagged_regressors"] - or kwargs["config_country_holidays"] - or kwargs["config_events"] - or kwargs["prediction_frequency"] - ): - inputs, targets = tabularize_univariate_datetime(df, **kwargs) - self.init_after_tabularized(inputs, targets) - self.filter_samples_after_init(kwargs["prediction_frequency"]) - self.drop_nan_after_init(df, kwargs["predict_steps"], kwargs["config_missing"].drop_missing) - else: - self.df = df + ## TODO Create index mapping of sample index to df index + # - Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) + # -- filter samples + # analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` + # -- drop nan + # analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + # save the created mapping to self.sample2index_map (used by self.sample2index_map) + self.sample2index_map, self.length = self.create_sample2index_map(df) def __getitem__(self, index): """Overrides parent class method to get an item at index. @@ -142,54 +109,61 @@ def __getitem__(self, index): each with features (np.array, float) of dims: (num_samples, n_lags) np.array, float Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) + OrderedDict + Meta information: static information about the local dataset """ - # TODO: Drop config_train from self! 
- learning_rate = self.kwargs["config_train"].learning_rate - if ( - self.kwargs["predict_mode"] - or (learning_rate is None) - or self.kwargs["config_lagged_regressors"] - or self.kwargs["config_country_holidays"] - or self.kwargs["config_events"] - or self.kwargs["prediction_frequency"] - ): - sample = self.samples[index] - targets = self.targets[index] - meta = self.meta - return sample, targets, meta - else: - start_idx = index - - # Lagged Regressors - if self.kwargs["config_lagged_regressors"]: - n_lagged_regressor_list = [] - for dict_name, nested_dict in self.kwargs["config_lagged_regressors"].items(): - name_of_nested_dict = dict_name - n_lagged_regressor = self.kwargs["config_lagged_regressors"][name_of_nested_dict].n_lags - n_lagged_regressor_list.append(n_lagged_regressor) - max_lag = max(self.kwargs["n_lags"], *n_lagged_regressor_list) - end_idx = start_idx + max_lag + self.kwargs.get("n_forecasts") - - else: - end_idx = start_idx + self.kwargs.get("n_lags") + self.kwargs.get("n_forecasts") - - df_slice = self.df.iloc[start_idx:end_idx] - - # Functions - inputs, targets = tabularize_univariate_datetime(df_slice, **self.kwargs) - self.init_after_tabularized(inputs, targets) - self.filter_samples_after_init(self.kwargs["prediction_frequency"]) - self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + # Convert dataset sample index to valid dataframe positional index + # - sample index is any index up to len(dataset) + # - dataframe positional index is given by position of first target in dataframe for given sample index + df_index = self.sample_index_to_df_index(index) - sample = self.samples[index] - targets = self.targets[index] - meta = self.meta - return sample, targets, meta + # Tabularize - extract features from dataframe at given target index position + inputs, target = tabularize_univariate_datetime_single_index(self.df, target_index=df_index, **self.kwargs) + sample, target = self.format_sample(inputs, target) + return sample, target, self.meta def __len__(self): """Overrides Parent class method to get data length.""" return self.length + def create_sample2index_map(self, df): + """creates mapping of sample index to df index. 
+ Create index mapping of sample index to df index + Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) + -- filter samples + analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` + -- drop nan + analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + save the created mapping to sample2index_map + """ + # Prediction Frequency + prediction_frequency_mask = self.create_prediction_frequency_filter_mask( + self, self.kwargs["prediction_frequency"] + ) + + # TODO: limit start end range + # Pseudo code: concat[np.zeros(n_lags), np.ones(n_samples - n_lags -n_forecasts +1),np.zeros(n_forecasts-1)] + start_end_target_mask = np.ones(len(df)) + + # TODO Create index mapping of sample index to df index + # - Filter missing samples (does not actually drop, but creates indexmapping) + # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + nan_mask = self.create_nan_mask(df) # vector of all ones, except nans are zeros + + # TODO: Combine + # Psedocode: valid_sample = elementwise_and_operator(prediction_frequency_mask & start_end_target_mask & nan_mask) + # num_samples = sum(valid_sample) + # sample2index_map = convert valid_sample to list of the positinal index of all true/one entries + # e.g. [0,0,1,1,0,1,0] -> [2,3,5] + sample2index_map = np.ones(len(df)) + + return sample2index_map, num_samples + + def sample_index_to_df_index(self, sample_index): + """Translates a single outer sample to dataframe index""" + # Will need more sophisticated mapping for GlobalTimeDataset + return self.sample2index_map[sample_index] + def drop_nan_init(self, drop_missing): """Checks if inputs/targets contain any NaN values and drops them, if user opts to. Parameters @@ -200,7 +174,7 @@ def drop_nan_init(self, drop_missing): number of steps to predict """ - def drop_nan_after_init(self, df, predict_steps, drop_missing): + def create_nan_mask(self, df, predict_steps, drop_missing): """Checks if inputs/targets contain any NaN values and drops them, if user opts to. Parameters ---------- @@ -209,6 +183,8 @@ def drop_nan_after_init(self, df, predict_steps, drop_missing): predict_steps : int number of steps to predict """ + # TODO: rewrite to return mask instead of filtering df. + nan_idx = [] # NaNs in inputs for key, data in self.inputs.items(): @@ -266,8 +242,8 @@ def split_dict(inputs, index): length = next(iter(inputs.values())).shape[0] return [split_dict(inputs, i) for i in range(length)] - def init_after_tabularized(self, inputs, targets=None): - """Create Timedataset with data. + def format_sample(self, inputs, targets=None): + """Convert tabularizes sample to correct formats. 
Parameters ---------- inputs : ordered dict @@ -275,6 +251,7 @@ def init_after_tabularized(self, inputs, targets=None): targets : np.array, float Identical to returns from :meth:`tabularize_univariate_datetime` """ + sample_input = OrderedDict({}) inputs_dtype = { "time": torch.float, "timestamps": np.datetime64, @@ -285,11 +262,12 @@ def init_after_tabularized(self, inputs, targets=None): "regressors": torch.float, } targets_dtype = torch.float - self.length = inputs["time"].shape[0] + + sample_target = torch.from_numpy(targets).type(targets_dtype).unsqueeze(dim=2) for key, data in inputs.items(): if key in self.two_level_inputs: - self.inputs[key] = OrderedDict({}) + sample_input[key] = OrderedDict({}) for name, features in data.items(): if features.dtype != np.float32: features = features.astype(np.float32, copy=False) @@ -297,24 +275,29 @@ def init_after_tabularized(self, inputs, targets=None): tensor = torch.from_numpy(features) if tensor.dtype != inputs_dtype[key]: - self.inputs[key][name] = tensor.to( + sample_input[key][name] = tensor.to( dtype=inputs_dtype[key] ) # this can probably be removed, but was included in the previous code else: - self.inputs[key][name] = tensor + sample_input[key][name] = tensor else: if key == "timestamps": - self.inputs[key] = data + sample_input[key] = data else: - self.inputs[key] = torch.from_numpy(data).type(inputs_dtype[key]) - self.targets = torch.from_numpy(targets).type(targets_dtype).unsqueeze(dim=2) - self.samples = self._split_nested_dict(self.inputs) + sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) + sample_input = self._split_nested_dict(sample_input) + + ## Not sure if this needs be done here anymore? + # Exact timestamps are not needed anymore + sample_input.pop("timestamps") - def filter_samples_after_init( + return sample_input, sample_target + + def create_prediction_frequency_filter_mask( self, prediction_frequency=None, ): - """Filters samples from the dataset based on the forecast frequency. + """Filters prediction target index from df based on the forecast frequency setting. Parameters ---------- prediction_frequency : int @@ -323,40 +306,37 @@ def filter_samples_after_init( ---- E.g. if prediction_frequency=7, forecasts are only made on every 7th step (once in a week in case of daily resolution). + + Returns mask where prediction target start indexes to be included are ones, and the rest zeros. 
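        Example
        -------
        With daily data and ``prediction_frequency={"weekly-day": 3}``, only
        positions whose first target timestamp falls on a Thursday (pandas
        ``dayofweek == 3``) are kept; all other positions are masked out.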
""" if prediction_frequency is None or prediction_frequency == 1: return # Only the first target timestamp is of interest for filtering - timestamps = pd.to_datetime([sample["timestamps"][0] for sample in self.samples]) + timestamps = pd.to_datetime([x["timestamps"][0] for x in self.df]) # This may need adjusting masks = [] for key, value in prediction_frequency.items(): if key == "daily-hour": - mask = timestamps.hour == value + 1 # because prediction starts one step after origin + mask = timestamps.hour == value elif key == "weekly-day": - mask = timestamps.dayofweek == value + 1 + mask = timestamps.dayofweek == value elif key == "monthly-day": - mask = timestamps.day == value + 1 + mask = timestamps.day == value elif key == "yearly-month": - mask = timestamps.month == value + 1 + mask = timestamps.month == value elif key == "hourly-minute": - mask = timestamps.minute == value + 1 + mask = timestamps.minute == value else: raise ValueError(f"Invalid prediction frequency: {key}") masks.append(mask) mask = np.ones((len(timestamps),), dtype=bool) for m in masks: mask = mask & m - self.samples = [self.samples[i] for i in range(len(self.samples)) if mask[i]] - - # Exact timestamps are not needed anymore - self.inputs.pop("timestamps") - for sample in self.samples: - sample.pop("timestamps") - self.length = len(self.samples) + return mask -def tabularize_univariate_datetime( +def tabularize_univariate_datetime_single_index( df, + target_index, predict_mode=False, n_lags=0, n_forecasts=1, @@ -421,7 +401,9 @@ def tabularize_univariate_datetime( """ max_lags = get_max_num_lags(config_lagged_regressors, n_lags) # n_samples = len(df) - max_lags + 1 - n_forecasts - # TODO + + # TODO convert to single sample version + learning_rate = config_train.learning_rate if ( predict_mode From bdf529c56f11fcf0dcf92c9cad9e72e1fb711a1b Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 17 Jan 2024 15:54:05 -0800 Subject: [PATCH 021/128] create_target_start_end_mask --- neuralprophet/time_dataset.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 960a4fca2..58249913a 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -69,6 +69,7 @@ def __init__(self, df, name, **kwargs): self.name = name self.meta = OrderedDict({}) self.meta["df_name"] = self.name + self.config_args = kwargs # TODO: Preprocessing of features (added to self.df) # - events and holidays: convert date-time occurence dictionary to a column of values in the self.df @@ -140,10 +141,8 @@ def create_sample2index_map(self, df): prediction_frequency_mask = self.create_prediction_frequency_filter_mask( self, self.kwargs["prediction_frequency"] ) - - # TODO: limit start end range - # Pseudo code: concat[np.zeros(n_lags), np.ones(n_samples - n_lags -n_forecasts +1),np.zeros(n_forecasts-1)] - start_end_target_mask = np.ones(len(df)) + # Limit target range due to input lags and number of forecasts + target_start_end_mask = self.create_target_start_end_mask() # TODO Create index mapping of sample index to df index # - Filter missing samples (does not actually drop, but creates indexmapping) @@ -151,7 +150,7 @@ def create_sample2index_map(self, df): nan_mask = self.create_nan_mask(df) # vector of all ones, except nans are zeros # TODO: Combine - # Psedocode: valid_sample = elementwise_and_operator(prediction_frequency_mask & start_end_target_mask & nan_mask) + # Psedocode: valid_sample = 
elementwise_and_operator(prediction_frequency_mask & target_start_end_mask & nan_mask) # num_samples = sum(valid_sample) # sample2index_map = convert valid_sample to list of the positinal index of all true/one entries # e.g. [0,0,1,1,0,1,0] -> [2,3,5] @@ -174,6 +173,17 @@ def drop_nan_init(self, drop_missing): number of steps to predict """ + def create_target_start_end_mask(self, df): + """Creates binary mask for valid targets based on limiting input lags and forecast targets.""" + max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"]) + n_forecasts = self.config_args["n_forecasts"] + length = len(df) + start_pad = np.zeros(max_lags) + valid_targets = np.ones(length - max_lags - n_forecasts + 1) + end_pad = np.zeros(n_forecasts - 1) + target_start_end_mask = np.concatenate((start_pad, valid_targets, end_pad), axis=None) + return target_start_end_mask + def create_nan_mask(self, df, predict_steps, drop_missing): """Checks if inputs/targets contain any NaN values and drops them, if user opts to. Parameters From c814115da5c60d8467d0cfa999ac9bbe90e77f58 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 17 Jan 2024 16:02:14 -0800 Subject: [PATCH 022/128] boolean mask --- neuralprophet/time_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 58249913a..7fc2ac47d 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -174,13 +174,13 @@ def drop_nan_init(self, drop_missing): """ def create_target_start_end_mask(self, df): - """Creates binary mask for valid targets based on limiting input lags and forecast targets.""" + """Creates a boolean mask for valid targets based on limiting input lags and forecast targets.""" max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"]) n_forecasts = self.config_args["n_forecasts"] length = len(df) - start_pad = np.zeros(max_lags) - valid_targets = np.ones(length - max_lags - n_forecasts + 1) - end_pad = np.zeros(n_forecasts - 1) + start_pad = np.zeros(max_lags, dtype=bool) + valid_targets = np.ones(length - max_lags - n_forecasts + 1, dtype=bool) + end_pad = np.zeros(n_forecasts - 1, dtype=bool) target_start_end_mask = np.concatenate((start_pad, valid_targets, end_pad), axis=None) return target_start_end_mask From 711941992442ae6ebf6d26a28529952a6dea0eab Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 17 Jan 2024 16:18:56 -0800 Subject: [PATCH 023/128] combine masks into map --- neuralprophet/time_dataset.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 7fc2ac47d..00e33c363 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -139,22 +139,31 @@ def create_sample2index_map(self, df): """ # Prediction Frequency prediction_frequency_mask = self.create_prediction_frequency_filter_mask( - self, self.kwargs["prediction_frequency"] + self, self.config_args["prediction_frequency"] ) # Limit target range due to input lags and number of forecasts - target_start_end_mask = self.create_target_start_end_mask() + df_length = len(df) + max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"]) + n_forecasts = self.config_args["n_forecasts"] + target_start_end_mask = self.create_target_start_end_mask( + df_length=df_length, max_lags=max_lags, 
n_forecasts=n_forecasts + ) # TODO Create index mapping of sample index to df index # - Filter missing samples (does not actually drop, but creates indexmapping) # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) nan_mask = self.create_nan_mask(df) # vector of all ones, except nans are zeros - # TODO: Combine - # Psedocode: valid_sample = elementwise_and_operator(prediction_frequency_mask & target_start_end_mask & nan_mask) - # num_samples = sum(valid_sample) - # sample2index_map = convert valid_sample to list of the positinal index of all true/one entries + # Combine masks + mask = np.logical_and(prediction_frequency_mask, target_start_end_mask) + valid_sample_mask = np.logical_and(mask, nan_mask) + # Convert boolean valid_sample to list of the positinal index of all true/one entries # e.g. [0,0,1,1,0,1,0] -> [2,3,5] - sample2index_map = np.ones(len(df)) + index_range = np.arange(0, df_length) + sample2index_map = index_range[valid_sample_mask] + + num_samples = np.sum(valid_sample_mask) + assert len(sample2index_map) == num_samples return sample2index_map, num_samples @@ -173,13 +182,10 @@ def drop_nan_init(self, drop_missing): number of steps to predict """ - def create_target_start_end_mask(self, df): + def create_target_start_end_mask(self, df_length, max_lags, n_forecasts): """Creates a boolean mask for valid targets based on limiting input lags and forecast targets.""" - max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"]) - n_forecasts = self.config_args["n_forecasts"] - length = len(df) start_pad = np.zeros(max_lags, dtype=bool) - valid_targets = np.ones(length - max_lags - n_forecasts + 1, dtype=bool) + valid_targets = np.ones(df_length - max_lags - n_forecasts + 1, dtype=bool) end_pad = np.zeros(n_forecasts - 1, dtype=bool) target_start_end_mask = np.concatenate((start_pad, valid_targets, end_pad), axis=None) return target_start_end_mask @@ -317,7 +323,7 @@ def create_prediction_frequency_filter_mask( E.g. if prediction_frequency=7, forecasts are only made on every 7th step (once in a week in case of daily resolution). - Returns mask where prediction target start indexes to be included are ones, and the rest zeros. + Returns boolean mask where prediction target start indexes to be included are True, and the rest False. 
""" if prediction_frequency is None or prediction_frequency == 1: return From 66bb911c4c2570bf2ac9cee5db7c4652660764da Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 17 Jan 2024 16:27:05 -0800 Subject: [PATCH 024/128] notes for nan check --- neuralprophet/time_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 00e33c363..b238432a4 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -141,6 +141,7 @@ def create_sample2index_map(self, df): prediction_frequency_mask = self.create_prediction_frequency_filter_mask( self, self.config_args["prediction_frequency"] ) + # Limit target range due to input lags and number of forecasts df_length = len(df) max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"]) @@ -152,7 +153,10 @@ def create_sample2index_map(self, df): # TODO Create index mapping of sample index to df index # - Filter missing samples (does not actually drop, but creates indexmapping) # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - nan_mask = self.create_nan_mask(df) # vector of all ones, except nans are zeros + # Note: needs to also account for NANs in lagged inputs or in n_forecasts, not just first target. + # Implement a convolutional filter for targets and each lagged regressor. + # Also account for future regressors and events. + nan_mask = self.create_nan_mask(df) # boolean array where NAN are False # Combine masks mask = np.logical_and(prediction_frequency_mask, target_start_end_mask) From fe382c1f722e90bec27a25b7a51c01976ea58027 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 17 Jan 2024 16:30:54 -0800 Subject: [PATCH 025/128] bypass NAN filter --- neuralprophet/time_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index b238432a4..7478f4a04 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -203,8 +203,10 @@ def create_nan_mask(self, df, predict_steps, drop_missing): predict_steps : int number of steps to predict """ - # TODO: rewrite to return mask instead of filtering df. + # TODO implement actual filtering + return np.ones(len(df), dtype=bool) + # TODO: rewrite to return mask instead of filtering df. nan_idx = [] # NaNs in inputs for key, data in self.inputs.items(): From 8ec4f9f43eeb2d5670f5bd2c1a8b00f6b55c3a55 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 18 Jan 2024 16:19:04 -0800 Subject: [PATCH 026/128] rework index to point at prediction origin, not first forecast. --- neuralprophet/time_dataset.py | 291 ++++++++++++++++++---------------- 1 file changed, 150 insertions(+), 141 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 7478f4a04..9abc38a4a 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -63,7 +63,11 @@ def __init__(self, df, name, **kwargs): # -> _train calls prep_or_copy_df, then passes to init_train_loader, which returns the train_loader # -> init_train_loader calls prep_or_copy_df, _normalize, _create_dataset (returns TimeDataset), returns dataset wrapped in DataLoader # ->_create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset - # Future TODO: integrate these preprocessing steps happening outside? 
+ # Future TODO: integrate some of these preprocessing steps happening outside? + + # TODO: Preprocessing of features (added to self.df) + # - events and holidays: convert date-time occurence dictionary to a column of values in the self.df + # - These will then be later tabularized in __get_item___ self.df = df self.name = name @@ -71,17 +75,6 @@ def __init__(self, df, name, **kwargs): self.meta["df_name"] = self.name self.config_args = kwargs - # TODO: Preprocessing of features (added to self.df) - # - events and holidays: convert date-time occurence dictionary to a column of values in the self.df - # - These will then be later tabularized in __get_item___ - - ## TODO Create index mapping of sample index to df index - # - Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) - # -- filter samples - # analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` - # -- drop nan - # analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - # save the created mapping to self.sample2index_map (used by self.sample2index_map) self.sample2index_map, self.length = self.create_sample2index_map(df) def __getitem__(self, index): @@ -119,7 +112,7 @@ def __getitem__(self, index): df_index = self.sample_index_to_df_index(index) # Tabularize - extract features from dataframe at given target index position - inputs, target = tabularize_univariate_datetime_single_index(self.df, target_index=df_index, **self.kwargs) + inputs, target = tabularize_univariate_datetime_single_index(self.df, target_index=df_index, **self.config_args) sample, target = self.format_sample(inputs, target) return sample, target, self.meta @@ -127,75 +120,121 @@ def __len__(self): """Overrides Parent class method to get data length.""" return self.length + def sample_index_to_df_index(self, sample_index): + """Translates a single outer sample to dataframe index""" + # Will need more sophisticated mapping for GlobalTimeDataset + return self.sample2index_map[sample_index] + def create_sample2index_map(self, df): - """creates mapping of sample index to df index. - Create index mapping of sample index to df index - Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) - -- filter samples - analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` - -- drop nan - analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - save the created mapping to sample2index_map + """creates mapping of sample index to corresponding df index at prediction origin. + (prediction origin: last observation before forecast / future period starts). + return created mapping to sample2index_map and number of samples. 
""" - # Prediction Frequency - prediction_frequency_mask = self.create_prediction_frequency_filter_mask( - self, self.config_args["prediction_frequency"] - ) # Limit target range due to input lags and number of forecasts df_length = len(df) max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"]) n_forecasts = self.config_args["n_forecasts"] - target_start_end_mask = self.create_target_start_end_mask( + origin_start_end_mask = self.create_origin_start_end_mask( df_length=df_length, max_lags=max_lags, n_forecasts=n_forecasts ) + # Prediction Frequency + # Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) + # analogous to `self.filter_samples_after_init( + # self.kwargs["prediction_frequency"])` + prediction_frequency_mask = self.create_prediction_frequency_filter_mask( + self, df, self.config_args["prediction_frequency"] + ) + # TODO Create index mapping of sample index to df index - # - Filter missing samples (does not actually drop, but creates indexmapping) - # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - # Note: needs to also account for NANs in lagged inputs or in n_forecasts, not just first target. - # Implement a convolutional filter for targets and each lagged regressor. - # Also account for future regressors and events. + # Drop nan analogous to `self.drop_nan_after_init( + # self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) nan_mask = self.create_nan_mask(df) # boolean array where NAN are False # Combine masks - mask = np.logical_and(prediction_frequency_mask, target_start_end_mask) + mask = np.logical_and(prediction_frequency_mask, origin_start_end_mask) valid_sample_mask = np.logical_and(mask, nan_mask) # Convert boolean valid_sample to list of the positinal index of all true/one entries # e.g. [0,0,1,1,0,1,0] -> [2,3,5] index_range = np.arange(0, df_length) - sample2index_map = index_range[valid_sample_mask] + sample_index_2_df_origin_index = index_range[valid_sample_mask] num_samples = np.sum(valid_sample_mask) - assert len(sample2index_map) == num_samples + assert len(sample_index_2_df_origin_index) == num_samples + + return sample_index_2_df_origin_index, num_samples + + def create_origin_start_end_mask(self, df_length, max_lags, n_forecasts): + """Creates a boolean mask for valid prediction origin positions. + (based on limiting input lags and forecast targets at start and end of df)""" + if max_lags >= 1: + start_pad = np.zeros(max_lags - 1, dtype=bool) + valid_targets = np.ones(df_length - max_lags - n_forecasts + 1, dtype=bool) + end_pad = np.zeros(n_forecasts, dtype=bool) + target_start_end_mask = np.concatenate((start_pad, valid_targets, end_pad), axis=None) + elif max_lags == 0 and n_forecasts == 1: + # without lags, forecast targets and origins are identical + target_start_end_mask = np.ones(df_length, dtype=bool) + else: + raise ValueError(f"max_lags value of {max_lags} not supported for n_forecasts {n_forecasts}.") + return target_start_end_mask - return sample2index_map, num_samples + def create_prediction_frequency_filter_mask( + self, + df: pd.DataFrame, + prediction_frequency=None, + ): + """Filters prediction origin index from df based on the forecast frequency setting. 
- def sample_index_to_df_index(self, sample_index): - """Translates a single outer sample to dataframe index""" - # Will need more sophisticated mapping for GlobalTimeDataset - return self.sample2index_map[sample_index] + Filter based on timestamp last lag before targets start - def drop_nan_init(self, drop_missing): - """Checks if inputs/targets contain any NaN values and drops them, if user opts to. Parameters ---------- - drop_missing : bool - whether to automatically drop missing samples from the data - predict_steps : int - number of steps to predict + prediction_frequency : int + periodic interval in which forecasts should be made. + Note + ---- + E.g. if prediction_frequency=7, forecasts are only made on every 7th step (once in a week in case of daily + resolution). + + Returns boolean mask where prediction origin indexes to be included are True, and the rest False. """ + # !! IMPORTANT + # TODO: Adjust top level documentation to specify that the filter is applied to prediction ORIGIN, not targets start. + # !! IMPORTANT - def create_target_start_end_mask(self, df_length, max_lags, n_forecasts): - """Creates a boolean mask for valid targets based on limiting input lags and forecast targets.""" - start_pad = np.zeros(max_lags, dtype=bool) - valid_targets = np.ones(df_length - max_lags - n_forecasts + 1, dtype=bool) - end_pad = np.zeros(n_forecasts - 1, dtype=bool) - target_start_end_mask = np.concatenate((start_pad, valid_targets, end_pad), axis=None) - return target_start_end_mask + mask = np.ones((len(df),), dtype=bool) + + # Basic case: no filter + if prediction_frequency is None or prediction_frequency == 1: + return mask + + # originally: timestamps = pd.to_datetime([x["timestamps"][0] for x in df]) + timestamps = df["timestamps"].apply(lambda x: pd.to_datetime(x[0])) + filter_masks = [] + for key, value in prediction_frequency.items(): + if key == "daily-hour": + mask = timestamps.hour == value + elif key == "weekly-day": + mask = timestamps.dayofweek == value + elif key == "monthly-day": + mask = timestamps.day == value + elif key == "yearly-month": + mask = timestamps.month == value + elif key == "hourly-minute": + mask = timestamps.minute == value + else: + raise ValueError(f"Invalid prediction frequency: {key}") + filter_masks.append(mask) + for m in filter_masks: + mask = np.logical_and(mask, m) + return mask def create_nan_mask(self, df, predict_steps, drop_missing): - """Checks if inputs/targets contain any NaN values and drops them, if user opts to. + """Creates mask for each prediction origin, + accounting for corresponding input lags / forecast targets containing any NaN values. + Parameters ---------- drop_missing : bool @@ -206,7 +245,14 @@ def create_nan_mask(self, df, predict_steps, drop_missing): # TODO implement actual filtering return np.ones(len(df), dtype=bool) - # TODO: rewrite to return mask instead of filtering df. + # Create index mapping of sample index to df index + # - Filter missing samples (does not actually drop, but creates indexmapping) + # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + # Note: needs to also account for NANs in lagged inputs or in n_forecasts, not just first target. + # Implement a convolutional filter for targets and each lagged regressor. + # Also account for future regressors and events. 
+ + # Rewrite to return mask instead of filtering df: nan_idx = [] # NaNs in inputs for key, data in self.inputs.items(): @@ -245,25 +291,6 @@ def create_nan_mask(self, df, predict_steps, drop_missing): "Please either adjust imputation parameters, or set 'drop_missing' to True to drop those samples." ) - @staticmethod - def _split_nested_dict(inputs): - """Split nested dict into list of dicts. - Parameters - ---------- - inputs : ordered dict - Nested dict to be split. - Returns - ------- - list of dicts - List of dicts with same keys as inputs. - """ - - def split_dict(inputs, index): - return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in inputs.items()} - - length = next(iter(inputs.values())).shape[0] - return [split_dict(inputs, i) for i in range(length)] - def format_sample(self, inputs, targets=None): """Convert tabularizes sample to correct formats. Parameters @@ -309,60 +336,29 @@ def format_sample(self, inputs, targets=None): sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) sample_input = self._split_nested_dict(sample_input) + # TODO Can this be skipped for a single sample? + # TODO Can this be optimized? + # Split nested dict into list of dicts with same keys as sample_input. + def split_dict(sample_input, index): + return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in sample_input.items()} + + length = next(iter(sample_input.values())).shape[0] + sample_input = [split_dict(sample_input, i) for i in range(length)] + ## Not sure if this needs be done here anymore? # Exact timestamps are not needed anymore sample_input.pop("timestamps") return sample_input, sample_target - def create_prediction_frequency_filter_mask( - self, - prediction_frequency=None, - ): - """Filters prediction target index from df based on the forecast frequency setting. - Parameters - ---------- - prediction_frequency : int - periodic interval in which forecasts should be made. - Note - ---- - E.g. if prediction_frequency=7, forecasts are only made on every 7th step (once in a week in case of daily - resolution). - - Returns boolean mask where prediction target start indexes to be included are True, and the rest False. 
- """ - if prediction_frequency is None or prediction_frequency == 1: - return - # Only the first target timestamp is of interest for filtering - timestamps = pd.to_datetime([x["timestamps"][0] for x in self.df]) # This may need adjusting - masks = [] - for key, value in prediction_frequency.items(): - if key == "daily-hour": - mask = timestamps.hour == value - elif key == "weekly-day": - mask = timestamps.dayofweek == value - elif key == "monthly-day": - mask = timestamps.day == value - elif key == "yearly-month": - mask = timestamps.month == value - elif key == "hourly-minute": - mask = timestamps.minute == value - else: - raise ValueError(f"Invalid prediction frequency: {key}") - masks.append(mask) - mask = np.ones((len(timestamps),), dtype=bool) - for m in masks: - mask = mask & m - return mask - def tabularize_univariate_datetime_single_index( - df, - target_index, - predict_mode=False, - n_lags=0, - n_forecasts=1, - predict_steps=1, + df: pd.DataFrame, + target_index: int, + predict_mode: bool = False, + n_lags: int = 0, + n_forecasts: int = 1, + predict_steps: int = 1, config_seasonality: Optional[configure.ConfigSeasonality] = None, config_events: Optional[configure.ConfigEvents] = None, config_country_holidays=None, @@ -372,15 +368,15 @@ def tabularize_univariate_datetime_single_index( config_train=None, prediction_frequency=None, ): - """Create a tabular dataset from univariate timeseries for supervised forecasting. + """Create a tabular data sample from timeseries dataframe, used for mini-batch creation. Note ---- - Data must have no gaps. - If data contains missing values, they are ignored for the creation of the dataset. - Parameters + Data must have no gaps for sample extracted at given index position. ---------- df : pd.DataFrame Sequence of observations with original ``ds``, ``y`` and normalized ``t``, ``y_scaled`` columns + target_index: int: + dataframe index position of first prediction target. 
config_seasonality : configure.ConfigSeasonality Configuration for seasonalities n_lags : int @@ -422,23 +418,22 @@ def tabularize_univariate_datetime_single_index( Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ max_lags = get_max_num_lags(config_lagged_regressors, n_lags) - # n_samples = len(df) - max_lags + 1 - n_forecasts + n_samples = 1 + + # previous workaround + # learning_rate = config_train.learning_rate + # if ( + # predict_mode + # or (learning_rate is None) + # or config_lagged_regressors + # or config_country_holidays + # or config_events + # or prediction_frequency + # ): + # n_samples = len(df) - max_lags + 1 - n_forecasts # TODO convert to single sample version - learning_rate = config_train.learning_rate - if ( - predict_mode - or (learning_rate is None) - or config_lagged_regressors - or config_country_holidays - or config_events - or prediction_frequency - ): - n_samples = len(df) - max_lags + 1 - n_forecasts - else: - n_samples = 1 - # data is stored in OrderedDict inputs = OrderedDict({}) @@ -476,13 +471,27 @@ def _stride_timestamps_for_forecasts(x): return np.array([x[i + max_lags : i + max_lags + n_forecasts] for i in range(n_samples)], dtype=dtype) # time is the time at each forecast step - t = df.loc[:, "t"].values if max_lags == 0: assert n_forecasts == 1 - time = np.expand_dims(t, 1) + time = np.expand_dims(df.loc[target_index, "t"].values, 1) else: - time = _stride_time_features_for_forecasts(t) - inputs["time"] = time # contains n_lags + n_forecasts + ## time = _stride_time_features_for_forecasts(df.loc[:, "t"].values) + x = df.loc[:, "t"].values + window_size = n_lags + n_forecasts + + if x.ndim == 1: + shape = (n_samples, window_size) + else: + shape = (n_samples, window_size) + x.shape[1:] + + stride = x.strides[0] + strides = (stride, stride) + x.strides[1:] + start_index = max_lags - n_lags + time = np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + t = df.loc[:, "t"].values + # extract timestamps of n_lags steps before target_index and n_forecasts steps starting at target_index + time = t[target_index - n_lags : target_index + n_forecasts] + inputs["time"] = time if prediction_frequency is not None: ds = df.loc[:, "ds"].values From 23d6100c18bed7483484a14ec99fede80e2b0335 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 18 Jan 2024 17:16:09 -0800 Subject: [PATCH 027/128] tabularize: converted time and lags to single sample extraction --- neuralprophet/time_dataset.py | 106 +++++++++++++++------------------- 1 file changed, 46 insertions(+), 60 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 9abc38a4a..71123cdf0 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -112,7 +112,7 @@ def __getitem__(self, index): df_index = self.sample_index_to_df_index(index) # Tabularize - extract features from dataframe at given target index position - inputs, target = tabularize_univariate_datetime_single_index(self.df, target_index=df_index, **self.config_args) + inputs, target = tabularize_univariate_datetime_single_index(self.df, origin_index=df_index, **self.config_args) sample, target = self.format_sample(inputs, target) return sample, target, self.meta @@ -210,8 +210,12 @@ def create_prediction_frequency_filter_mask( if prediction_frequency is None or prediction_frequency == 1: return mask - # originally: timestamps = pd.to_datetime([x["timestamps"][0] for x in df]) - timestamps = 
df["timestamps"].apply(lambda x: pd.to_datetime(x[0])) + # OLD: timestamps were created from "ds" column in tabularization and then re-converted here + # timestamps = pd.to_datetime([x["timestamps"][0] for x in df]) + # OR + # timestamps = df["timestamps"].apply(lambda x: pd.to_datetime(x[0])) + + timestamps = pd.to_datetime(df.loc[:, "ds"].values) filter_masks = [] for key, value in prediction_frequency.items(): if key == "daily-hour": @@ -303,7 +307,7 @@ def format_sample(self, inputs, targets=None): sample_input = OrderedDict({}) inputs_dtype = { "time": torch.float, - "timestamps": np.datetime64, + # "timestamps": np.datetime64, "seasonalities": torch.float, "events": torch.float, "lags": torch.float, @@ -330,10 +334,9 @@ def format_sample(self, inputs, targets=None): else: sample_input[key][name] = tensor else: - if key == "timestamps": - sample_input[key] = data - else: - sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) + # if key == "timestamps": sample_input[key] = data + # else: sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) + sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) sample_input = self._split_nested_dict(sample_input) # TODO Can this be skipped for a single sample? @@ -345,16 +348,15 @@ def split_dict(sample_input, index): length = next(iter(sample_input.values())).shape[0] sample_input = [split_dict(sample_input, i) for i in range(length)] - ## Not sure if this needs be done here anymore? - # Exact timestamps are not needed anymore - sample_input.pop("timestamps") + ## timestamps should no longer be present here? + # sample_input.pop("timestamps") # Exact timestamps are not needed anymore return sample_input, sample_target def tabularize_univariate_datetime_single_index( df: pd.DataFrame, - target_index: int, + origin_index: int, predict_mode: bool = False, n_lags: int = 0, n_forecasts: int = 1, @@ -375,8 +377,8 @@ def tabularize_univariate_datetime_single_index( ---------- df : pd.DataFrame Sequence of observations with original ``ds``, ``y`` and normalized ``t``, ``y_scaled`` columns - target_index: int: - dataframe index position of first prediction target. + origin_index: int: + dataframe index position of last observed lag before forecast starts. 
config_seasonality : configure.ConfigSeasonality Configuration for seasonalities n_lags : int @@ -420,7 +422,7 @@ def tabularize_univariate_datetime_single_index( max_lags = get_max_num_lags(config_lagged_regressors, n_lags) n_samples = 1 - # previous workaround + # OLD: previous workaround # learning_rate = config_train.learning_rate # if ( # predict_mode @@ -432,11 +434,37 @@ def tabularize_univariate_datetime_single_index( # ): # n_samples = len(df) - max_lags + 1 - n_forecasts - # TODO convert to single sample version - # data is stored in OrderedDict inputs = OrderedDict({}) + # time is the time at each sample's lags and forecasts + if max_lags == 0: + assert n_forecasts == 1 + # OLD: time = np.expand_dims(df.loc[origin_index, "t"].values, 1) + inputs["time"] = df.loc[origin_index, "t"].values + else: + # extract time value of n_lags steps before origin_index and n_forecasts steps starting at origin_index + ## OLD: inputs["time"] = _stride_time_features_for_forecasts(df.loc[:, "t"].values) + inputs["time"] = df[origin_index - n_lags : origin_index + n_forecasts, "t"].values + + if n_lags >= 1 and "y" in df.columns: + # OLD + # def _stride_lagged_features(df_col_name, feature_dims): + # # only for case where max_lags > 0 + # assert feature_dims >= 1 + # series = df.loc[:, df_col_name].values + # # Added dtype=np.float64 to solve the problem with np.isnan for ubuntu test + # return np.array( + # [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32 + # ) + # inputs["lags"] = _stride_lagged_features(df_col_name="y_scaled", feature_dims=n_lags) + + # Extract n_lags steps up to and including origin_index + # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) + inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values + + # ----------- TODO convert to single sample version ---------------------- + def _stride_time_features_for_forecasts(x): window_size = n_lags + n_forecasts @@ -462,45 +490,6 @@ def _stride_lagged_features(df_col_name, feature_dims): [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32 ) - def _stride_timestamps_for_forecasts(x): - # only for case where n_lags > 0 - if x.dtype != np.float64: - dtype = np.datetime64 - else: - dtype = np.float64 - return np.array([x[i + max_lags : i + max_lags + n_forecasts] for i in range(n_samples)], dtype=dtype) - - # time is the time at each forecast step - if max_lags == 0: - assert n_forecasts == 1 - time = np.expand_dims(df.loc[target_index, "t"].values, 1) - else: - ## time = _stride_time_features_for_forecasts(df.loc[:, "t"].values) - x = df.loc[:, "t"].values - window_size = n_lags + n_forecasts - - if x.ndim == 1: - shape = (n_samples, window_size) - else: - shape = (n_samples, window_size) + x.shape[1:] - - stride = x.strides[0] - strides = (stride, stride) + x.strides[1:] - start_index = max_lags - n_lags - time = np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - t = df.loc[:, "t"].values - # extract timestamps of n_lags steps before target_index and n_forecasts steps starting at target_index - time = t[target_index - n_lags : target_index + n_forecasts] - inputs["time"] = time - - if prediction_frequency is not None: - ds = df.loc[:, "ds"].values - if max_lags == 0: # is it rather n_lags? 
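For intuition on why the strided batch construction above can be retired: one row of the old window matrix equals a plain slice at the corresponding prediction origin. A small sanity check with illustrative values, assuming max_lags == n_lags (sliding_window_view used as the safe modern stand-in for as_strided):

import numpy as np

t = np.arange(10.0)
n_lags, n_forecasts = 3, 2
max_lags = n_lags
window_size = n_lags + n_forecasts

# batch of windows, one row per sample, as the old code built in one go
n_samples = len(t) - max_lags + 1 - n_forecasts
windows = np.lib.stride_tricks.sliding_window_view(t, window_size)[:n_samples]

# single-sample extraction by prediction origin, as in the rework
origin_index = 4
single = t[origin_index - n_lags + 1 : origin_index + n_forecasts + 1]
assert np.array_equal(single, windows[origin_index - n_lags + 1])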
- timestamps = np.expand_dims(ds, 1) - else: - timestamps = _stride_timestamps_for_forecasts(ds) - inputs["timestamps"] = timestamps - if config_seasonality is not None: seasonalities = seasonal_features_from_dates(df, config_seasonality) for name, features in seasonalities.items(): @@ -511,9 +500,6 @@ def _stride_timestamps_for_forecasts(x): seasonalities[name] = _stride_time_features_for_forecasts(features) inputs["seasonalities"] = seasonalities - if n_lags > 0 and "y" in df.columns: - inputs["lags"] = _stride_lagged_features(df_col_name="y_scaled", feature_dims=n_lags) - if config_lagged_regressors is not None and max_lags > 0: covariates = OrderedDict({}) for covar in df.columns: @@ -615,7 +601,7 @@ def fourier_series(dates, period, series_order): Parameters ---------- dates : pd.Series - Containing timestamps + Containing time stamps period : float Number of days of the period series_order : int From 49af45be1d7190971ee33fd46cd4689efa164eba Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 23 Jan 2024 15:01:53 -0800 Subject: [PATCH 028/128] convert lagged regressors --- neuralprophet/time_dataset.py | 55 +++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 71123cdf0..d2e3b49ee 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -437,7 +437,7 @@ def tabularize_univariate_datetime_single_index( # data is stored in OrderedDict inputs = OrderedDict({}) - # time is the time at each sample's lags and forecasts + # TIME: the time at each sample's lags and forecasts if max_lags == 0: assert n_forecasts == 1 # OLD: time = np.expand_dims(df.loc[origin_index, "t"].values, 1) @@ -447,7 +447,11 @@ def tabularize_univariate_datetime_single_index( ## OLD: inputs["time"] = _stride_time_features_for_forecasts(df.loc[:, "t"].values) inputs["time"] = df[origin_index - n_lags : origin_index + n_forecasts, "t"].values + # LAGS: From y-series, extract preceeding n_lags steps up to and including origin_index if n_lags >= 1 and "y" in df.columns: + # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) + inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values + # OLD # def _stride_lagged_features(df_col_name, feature_dims): # # only for case where max_lags > 0 @@ -459,9 +463,34 @@ def tabularize_univariate_datetime_single_index( # ) # inputs["lags"] = _stride_lagged_features(df_col_name="y_scaled", feature_dims=n_lags) - # Extract n_lags steps up to and including origin_index - # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) - inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values + # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS + if config_lagged_regressors is not None and max_lags > 0: + lagged_regressors = OrderedDict({}) + # TODO: optimize this computation for many lagged_regressors + for lagged_reg in df.columns: + if lagged_reg in config_lagged_regressors: + assert config_lagged_regressors[lagged_reg].n_lags > 0 + covar_lags = config_lagged_regressors[lagged_reg].n_lags + lagged_regressors[lagged_reg] = df.loc[ + origin_index - covar_lags + 1 : origin_index + 1, lagged_reg + ].values + inputs["covariates"] = lagged_regressors + + # OLD + # def _stride_lagged_features(df_col_name, feature_dims): + # # only for case where max_lags 
> 0 + # assert feature_dims >= 1 + # series = df.loc[:, df_col_name].values + # # Added dtype=np.float64 to solve the problem with np.isnan for ubuntu test + # return np.array( + # [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32 + # ) + # for covar in df.columns: + # if covar in config_lagged_regressors: + # assert config_lagged_regressors[covar].n_lags > 0 + # window = config_lagged_regressors[covar].n_lags + # covariates[covar] = _stride_lagged_features(df_col_name=covar, feature_dims=window) + # inputs["covariates"] = covariates # ----------- TODO convert to single sample version ---------------------- @@ -481,15 +510,6 @@ def _stride_time_features_for_forecasts(x): def _stride_future_time_features_for_forecasts(x): return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) - def _stride_lagged_features(df_col_name, feature_dims): - # only for case where max_lags > 0 - assert feature_dims >= 1 - series = df.loc[:, df_col_name].values - # Added dtype=np.float64 to solve the problem with np.isnan for ubuntu test - return np.array( - [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32 - ) - if config_seasonality is not None: seasonalities = seasonal_features_from_dates(df, config_seasonality) for name, features in seasonalities.items(): @@ -500,15 +520,6 @@ def _stride_lagged_features(df_col_name, feature_dims): seasonalities[name] = _stride_time_features_for_forecasts(features) inputs["seasonalities"] = seasonalities - if config_lagged_regressors is not None and max_lags > 0: - covariates = OrderedDict({}) - for covar in df.columns: - if covar in config_lagged_regressors: - assert config_lagged_regressors[covar].n_lags > 0 - window = config_lagged_regressors[covar].n_lags - covariates[covar] = _stride_lagged_features(df_col_name=covar, feature_dims=window) - inputs["covariates"] = covariates - # get the regressors features if config_regressors is not None: additive_regressors, multiplicative_regressors = make_regressors_features(df, config_regressors) From a35a1b83a35f79063d90497563ab33cbeec688f3 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 23 Jan 2024 15:26:40 -0800 Subject: [PATCH 029/128] consolidate seasonality computation in one script --- neuralprophet/time_dataset.py | 231 +++++++++++++++++++++++++++------- 1 file changed, 187 insertions(+), 44 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index d2e3b49ee..27fdb9190 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -466,7 +466,7 @@ def tabularize_univariate_datetime_single_index( # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS if config_lagged_regressors is not None and max_lags > 0: lagged_regressors = OrderedDict({}) - # TODO: optimize this computation for many lagged_regressors + # Future TODO: optimize this computation for many lagged_regressors for lagged_reg in df.columns: if lagged_reg in config_lagged_regressors: assert config_lagged_regressors[lagged_reg].n_lags > 0 @@ -492,6 +492,159 @@ def tabularize_univariate_datetime_single_index( # covariates[covar] = _stride_lagged_features(df_col_name=covar, feature_dims=window) # inputs["covariates"] = covariates + # SEASONALITIES + if config_seasonality is not None: + dates = df["ds"] + assert len(dates.shape) == 1 + seasonalities = OrderedDict({}) + # Seasonality features + for name, period in config_seasonality.periods.items(): + 
if period.resolution > 0: + if config_seasonality.computation == "fourier": + # convert to days since epoch + t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) + # Provides Fourier series components with the specified frequency and order. + # features: Matrix with dims (2*resolution, length len(dates)) + features = np.column_stack( + [ + fun((2.0 * (i + 1) * np.pi * t / period.period)) + for i in range(period.resolution) + for fun in (np.sin, np.cos) + ] + ) + + else: + raise NotImplementedError + if period.condition_name is not None: + # multiply seasonality features with condition mask/values + features = features * df[period.condition_name].values[:, np.newaxis] + seasonalities[name] = features + for name, features in seasonalities.items(): + if max_lags == 0: + seasonalities[name] = np.expand_dims(features, axis=1) + else: + + def _stride_time_features_for_seasonality(x): + window_size = n_lags + n_forecasts + + if x.ndim == 1: + shape = (n_samples, window_size) + else: + shape = (n_samples, window_size) + x.shape[1:] + + stride = x.strides[0] + strides = (stride, stride) + x.strides[1:] + start_index = max_lags - n_lags + return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + + # stride into num_forecast at dim=1 for each sample, just like we did with time + seasonalities[name] = _stride_time_features_for_seasonality(features) + inputs["seasonalities"] = seasonalities + + ## OLD + # def fourier_series_t(t, period, series_order): + # """Provides Fourier series components with the specified frequency and order. + # Note + # ---- + # This function is identical to Meta AI's Prophet Library + # Parameters + # ---------- + # t : pd.Series, float + # Containing time as floating point number of days + # period : float + # Number of days of the period + # series_order : int + # Number of fourier components + # Returns + # ------- + # np.array + # Matrix with seasonality features + # """ + # features = np.column_stack( + # [fun((2.0 * (i + 1) * np.pi * t / period)) for i in range(series_order) for fun in (np.sin, np.cos)] + # ) + # return features + + # def fourier_series(dates, period, series_order): + # """Provides Fourier series components with the specified frequency and order. + # Note + # ---- + # Identical to OG Prophet. + # Parameters + # ---------- + # dates : pd.Series + # Containing time stamps + # period : float + # Number of days of the period + # series_order : int + # Number of fourier components + # Returns + # ------- + # np.array + # Matrix with seasonality features + # """ + # # convert to days since epoch + # t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) + # return fourier_series_t(t, period, series_order) + + # def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): + # """Dataframe with seasonality features. + # Includes seasonality features + # Parameters + # ---------- + # df : pd.DataFrame + # Dataframe with all values + # config_seasonality : configure.ConfigSeasonality + # Configuration for seasonalities + # Returns + # ------- + # OrderedDict + # Dictionary with keys for each period name containing an np.array + # with the respective regression features. 
each with dims: (len(dates), 2*fourier_order) + # """ + # dates = df["ds"] + # assert len(dates.shape) == 1 + # seasonalities = OrderedDict({}) + # # Seasonality features + # for name, period in config_seasonality.periods.items(): + # if period.resolution > 0: + # if config_seasonality.computation == "fourier": + # # features: Matrix with dims (2*resolution, length len(dates)) + # features = fourier_series( + # dates=dates, + # period=period.period, + # series_order=period.resolution, + # ) + # else: + # raise NotImplementedError + # if period.condition_name is not None + # # multiply seasonality features with condition mask/values: + # features = features * df[period.condition_name].values[:, np.newaxis] + # seasonalities[name] = features + # return seasonalities + + # def _stride_time_features_for_seasonality(x): + # window_size = n_lags + n_forecasts + + # if x.ndim == 1: + # shape = (n_samples, window_size) + # else: + # shape = (n_samples, window_size) + x.shape[1:] + + # stride = x.strides[0] + # strides = (stride, stride) + x.strides[1:] + # start_index = max_lags - n_lags + # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + + # seasonalities = seasonal_features_from_dates(df, config_seasonality) + # for name, features in seasonalities.items(): + # if max_lags == 0: + # seasonalities[name] = np.expand_dims(features, axis=1) + # else: + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # seasonalities[name] = _stride_time_features_for_seasonality(features) + # inputs["seasonalities"] = seasonalities + # ----------- TODO convert to single sample version ---------------------- def _stride_time_features_for_forecasts(x): @@ -510,16 +663,6 @@ def _stride_time_features_for_forecasts(x): def _stride_future_time_features_for_forecasts(x): return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) - if config_seasonality is not None: - seasonalities = seasonal_features_from_dates(df, config_seasonality) - for name, features in seasonalities.items(): - if max_lags == 0: - seasonalities[name] = np.expand_dims(features, axis=1) - else: - # stride into num_forecast at dim=1 for each sample, just like we did with time - seasonalities[name] = _stride_time_features_for_forecasts(features) - inputs["seasonalities"] = seasonalities - # get the regressors features if config_regressors is not None: additive_regressors, multiplicative_regressors = make_regressors_features(df, config_regressors) @@ -808,36 +951,36 @@ def make_regressors_features(df, config_regressors): return additive_regressors, multiplicative_regressors -def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): - """Dataframe with seasonality features. - Includes seasonality features, holiday features, and added regressors. - Parameters - ---------- - df : pd.DataFrame - Dataframe with all values - config_seasonality : configure.ConfigSeasonality - Configuration for seasonalities - Returns - ------- - OrderedDict - Dictionary with keys for each period name containing an np.array - with the respective regression features. 
each with dims: (len(dates), 2*fourier_order) - """ - dates = df["ds"] - assert len(dates.shape) == 1 - seasonalities = OrderedDict({}) - # Seasonality features - for name, period in config_seasonality.periods.items(): - if period.resolution > 0: - if config_seasonality.computation == "fourier": - features = fourier_series( - dates=dates, - period=period.period, - series_order=period.resolution, - ) - else: - raise NotImplementedError - if period.condition_name is not None: - features = features * df[period.condition_name].values[:, np.newaxis] - seasonalities[name] = features - return seasonalities +# def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): +# """Dataframe with seasonality features. +# Includes seasonality features +# Parameters +# ---------- +# df : pd.DataFrame +# Dataframe with all values +# config_seasonality : configure.ConfigSeasonality +# Configuration for seasonalities +# Returns +# ------- +# OrderedDict +# Dictionary with keys for each period name containing an np.array +# with the respective regression features. each with dims: (len(dates), 2*fourier_order) +# """ +# dates = df["ds"] +# assert len(dates.shape) == 1 +# seasonalities = OrderedDict({}) +# # Seasonality features +# for name, period in config_seasonality.periods.items(): +# if period.resolution > 0: +# if config_seasonality.computation == "fourier": +# features = fourier_series( +# dates=dates, +# period=period.period, +# series_order=period.resolution, +# ) +# else: +# raise NotImplementedError +# if period.condition_name is not None: +# features = features * df[period.condition_name].values[:, np.newaxis] +# seasonalities[name] = features +# return seasonalities From c1c9b1bda653b22f93fdbd6fa78b554a08d7e569 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 23 Jan 2024 16:18:23 -0800 Subject: [PATCH 030/128] finish Seasonlity conversion --- neuralprophet/time_dataset.py | 86 +++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 27fdb9190..8a626217b 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -440,19 +440,32 @@ def tabularize_univariate_datetime_single_index( # TIME: the time at each sample's lags and forecasts if max_lags == 0: assert n_forecasts == 1 - # OLD: time = np.expand_dims(df.loc[origin_index, "t"].values, 1) inputs["time"] = df.loc[origin_index, "t"].values + # TODO: Possibly need extra dim? 
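On the extra-dim question in the comment above, a quick illustration of what np.expand_dims would do to a 1-D time window (hypothetical shapes; the axis choice is exactly the open question):

import numpy as np

w = np.arange(5, dtype=np.float32)  # a window of n_lags + n_forecasts time values
assert np.expand_dims(w, 0).shape == (1, 5)  # leading sample axis, as in the old batched layout
assert np.expand_dims(w, 1).shape == (5, 1)  # trailing feature axis instead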
+ # inputs["time"] = np.expand_dims(inputs["time"], 1) else: - # extract time value of n_lags steps before origin_index and n_forecasts steps starting at origin_index - ## OLD: inputs["time"] = _stride_time_features_for_forecasts(df.loc[:, "t"].values) - inputs["time"] = df[origin_index - n_lags : origin_index + n_forecasts, "t"].values + # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index + inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"].values + ## OLD: Time + # def _stride_time_features_for_forecasts(x): + # window_size = n_lags + n_forecasts + + # if x.ndim == 1: + # shape = (n_samples, window_size) + # else: + # shape = (n_samples, window_size) + x.shape[1:] + + # stride = x.strides[0] + # strides = (stride, stride) + x.strides[1:] + # start_index = max_lags - n_lags + # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + # inputs["time"] = _stride_time_features_for_forecasts(df.loc[:, "t"].values) # LAGS: From y-series, extract preceeding n_lags steps up to and including origin_index if n_lags >= 1 and "y" in df.columns: # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values - - # OLD + # OLD Lags # def _stride_lagged_features(df_col_name, feature_dims): # # only for case where max_lags > 0 # assert feature_dims >= 1 @@ -475,8 +488,7 @@ def tabularize_univariate_datetime_single_index( origin_index - covar_lags + 1 : origin_index + 1, lagged_reg ].values inputs["covariates"] = lagged_regressors - - # OLD + # OLD Covariates # def _stride_lagged_features(df_col_name, feature_dims): # # only for case where max_lags > 0 # assert feature_dims >= 1 @@ -494,54 +506,45 @@ def tabularize_univariate_datetime_single_index( # SEASONALITIES if config_seasonality is not None: - dates = df["ds"] - assert len(dates.shape) == 1 seasonalities = OrderedDict({}) + if max_lags == 0: + assert n_forecasts == 1 + dates = df.loc[origin_index, "ds"] + else: + dates = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"] + assert len(dates.shape) == 1 # Seasonality features for name, period in config_seasonality.periods.items(): if period.resolution > 0: if config_seasonality.computation == "fourier": + # Compute Fourier series components with the specified frequency and order. # convert to days since epoch t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) - # Provides Fourier series components with the specified frequency and order. 
- # features: Matrix with dims (2*resolution, length len(dates)) + # features: Matrix with dims (length len(dates), 2*resolution) features = np.column_stack( - [ - fun((2.0 * (i + 1) * np.pi * t / period.period)) - for i in range(period.resolution) - for fun in (np.sin, np.cos) - ] + [np.sin((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] + + [np.cos((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] ) - + # Single nested loop version: + # features = np.column_stack( + # [ + # fun((2.0 * (i + 1) * np.pi * t / period.period)) + # for i in range(period.resolution) + # for fun in (np.sin, np.cos) + # ] + # ) else: raise NotImplementedError if period.condition_name is not None: # multiply seasonality features with condition mask/values features = features * df[period.condition_name].values[:, np.newaxis] - seasonalities[name] = features - for name, features in seasonalities.items(): - if max_lags == 0: - seasonalities[name] = np.expand_dims(features, axis=1) - else: - - def _stride_time_features_for_seasonality(x): - window_size = n_lags + n_forecasts - - if x.ndim == 1: - shape = (n_samples, window_size) - else: - shape = (n_samples, window_size) + x.shape[1:] - - stride = x.strides[0] - strides = (stride, stride) + x.strides[1:] - start_index = max_lags - n_lags - return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - # stride into num_forecast at dim=1 for each sample, just like we did with time - seasonalities[name] = _stride_time_features_for_seasonality(features) + seasonalities[name] = features + # TODO: Possibly need extra dim? + # seasonalities[name] = np.expand_dims(seasonalities[name], 1) inputs["seasonalities"] = seasonalities - ## OLD + ## OLD Seasonality # def fourier_series_t(t, period, series_order): # """Provides Fourier series components with the specified frequency and order. 
# Note @@ -609,7 +612,7 @@ def _stride_time_features_for_seasonality(x): # for name, period in config_seasonality.periods.items(): # if period.resolution > 0: # if config_seasonality.computation == "fourier": - # # features: Matrix with dims (2*resolution, length len(dates)) + # # features: Matrix with dims (length len(dates), 2*resolution) # features = fourier_series( # dates=dates, # period=period.period, @@ -646,6 +649,9 @@ def _stride_time_features_for_seasonality(x): # inputs["seasonalities"] = seasonalities # ----------- TODO convert to single sample version ---------------------- + # TODO: Future Regressors + # TODO: Events + # TODO: Postprocessing def _stride_time_features_for_forecasts(x): window_size = n_lags + n_forecasts From 8271a5ed051dce4056355dc191c9720c976a5118 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 23 Jan 2024 16:19:26 -0800 Subject: [PATCH 031/128] update todos --- neuralprophet/time_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 8a626217b..9a8f33657 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -649,6 +649,7 @@ def tabularize_univariate_datetime_single_index( # inputs["seasonalities"] = seasonalities # ----------- TODO convert to single sample version ---------------------- + # TODO: Targets # TODO: Future Regressors # TODO: Events # TODO: Postprocessing From 1aad054bae5f15af5d62d30c359d581e4f6657b6 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 24 Jan 2024 12:51:30 -0800 Subject: [PATCH 032/128] complete targets and future regressors --- neuralprophet/time_dataset.py | 254 ++++++++++++++++++++++------------ 1 file changed, 162 insertions(+), 92 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 9a8f33657..f814ec184 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -246,6 +246,7 @@ def create_nan_mask(self, df, predict_steps, drop_missing): predict_steps : int number of steps to predict """ + # IMPORTANT !! # TODO implement actual filtering return np.ones(len(df), dtype=bool) @@ -296,7 +297,7 @@ def create_nan_mask(self, df, predict_steps, drop_missing): ) def format_sample(self, inputs, targets=None): - """Convert tabularizes sample to correct formats. + """Convert tabularized sample to correct formats. 
Parameters ---------- inputs : ordered dict @@ -421,6 +422,8 @@ def tabularize_univariate_datetime_single_index( """ max_lags = get_max_num_lags(config_lagged_regressors, n_lags) n_samples = 1 + if max_lags == 0: + assert n_forecasts == 1 # OLD: previous workaround # learning_rate = config_train.learning_rate @@ -434,12 +437,41 @@ def tabularize_univariate_datetime_single_index( # ): # n_samples = len(df) - max_lags + 1 - n_forecasts + if predict_mode: + targets = np.zeros((1, n_forecasts)) + ## OLD + # # time is the time at each forecast step + # t = df.loc[:, "t"].values + # if max_lags == 0: + # time = np.expand_dims(t, 1) + # else: + # time = _stride_time_features_for_forecasts(t) + # inputs["time"] = time # contains n_lags + n_forecasts + # targets = np.empty_like(time[:, n_lags:]) + # targets = np.nan_to_num(targets) + else: + targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"].values + targets = np.expand_dims(targets, axis=1) + ## Alternative + # x = df["y_scaled"].values + # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype) + ## OLD + # # time is the time at each forecast step + # t = df.loc[:, "t"].values + # if max_lags == 0: + # time = np.expand_dims(t, 1) + # else: + # time = _stride_time_features_for_forecasts(t) + # inputs["time"] = time # contains n_lags + n_forecasts + # def _stride_future_time_features_for_forecasts(x): + # return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) + # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values) + # data is stored in OrderedDict inputs = OrderedDict({}) # TIME: the time at each sample's lags and forecasts if max_lags == 0: - assert n_forecasts == 1 inputs["time"] = df.loc[origin_index, "t"].values # TODO: Possibly need extra dim? 
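One pandas detail worth keeping in mind for the .loc target and window slices in this patch: on a default integer index, label-based .loc slicing includes both endpoints, unlike positional slicing. A quick self-contained check:

import pandas as pd

s = pd.Series(range(10))
n_lags, n_forecasts, origin_index = 3, 2, 4

assert list(s.loc[2:4]) == [2, 3, 4]  # three values, not two
assert list(s.iloc[2:4]) == [2, 3]    # positional slicing stays half-open

# exactly n_lags values up to and including the origin:
assert len(s.loc[origin_index - n_lags + 1 : origin_index]) == n_lags
# exactly n_forecasts values after the origin:
assert len(s.loc[origin_index + 1 : origin_index + n_forecasts]) == n_forecasts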
# inputs["time"] = np.expand_dims(inputs["time"], 1) @@ -508,7 +540,6 @@ def tabularize_univariate_datetime_single_index( if config_seasonality is not None: seasonalities = OrderedDict({}) if max_lags == 0: - assert n_forecasts == 1 dates = df.loc[origin_index, "ds"] else: dates = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"] @@ -648,58 +679,103 @@ def tabularize_univariate_datetime_single_index( # seasonalities[name] = _stride_time_features_for_seasonality(features) # inputs["seasonalities"] = seasonalities - # ----------- TODO convert to single sample version ---------------------- - # TODO: Targets - # TODO: Future Regressors - # TODO: Events - # TODO: Postprocessing - - def _stride_time_features_for_forecasts(x): - window_size = n_lags + n_forecasts - - if x.ndim == 1: - shape = (n_samples, window_size) - else: - shape = (n_samples, window_size) + x.shape[1:] - - stride = x.strides[0] - strides = (stride, stride) + x.strides[1:] - start_index = max_lags - n_lags - return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - - def _stride_future_time_features_for_forecasts(x): - return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) - - # get the regressors features + # FUTURE REGRESSORS: get the future regressors features if config_regressors is not None: - additive_regressors, multiplicative_regressors = make_regressors_features(df, config_regressors) + # sort and divide regressors into multiplicative and additive + additive_regressors_names = [] + multiplicative_regressors_names = [] + for reg in sorted(df.columns.tolist()): + if reg in config_regressors: + mode = config_regressors[reg].mode + if mode == "additive": + additive_regressors_names.append(reg) + else: + multiplicative_regressors_names.append(reg) + # create numpy array of values of additive and multiplicative regressors, at correct indexes + # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) regressors = OrderedDict({}) + regressors["additive"] = None + regressors["multiplicative"] = None if max_lags == 0: - if additive_regressors is not None: - regressors["additive"] = np.expand_dims(additive_regressors, axis=1) - if multiplicative_regressors is not None: - regressors["multiplicative"] = np.expand_dims(multiplicative_regressors, axis=1) + if len(additive_regressors_names) > 0: + regressors["additive"] = np.expand_dims(df.loc[origin_index, additive_regressors_names].values, axis=0) + if len(multiplicative_regressors_names) > 0: + regressors["multiplicative"] = np.expand_dims( + df.loc[origin_index, multiplicative_regressors_names].values, axis=0 + ) else: - if additive_regressors is not None: - additive_regressor_feature_windows = [] - # additive_regressor_feature_windows_lagged = [] - for i in range(0, additive_regressors.shape[1]): - # stride into num_forecast at dim=1 for each sample, just like we did with time - stride = _stride_time_features_for_forecasts(additive_regressors[:, i]) - additive_regressor_feature_windows.append(stride) - additive_regressors = np.dstack(additive_regressor_feature_windows) - regressors["additive"] = additive_regressors - - if multiplicative_regressors is not None: - multiplicative_regressor_feature_windows = [] - for i in range(0, multiplicative_regressors.shape[1]): - stride = _stride_time_features_for_forecasts(multiplicative_regressors[:, i]) - multiplicative_regressor_feature_windows.append(stride) - multiplicative_regressors = 
np.dstack(multiplicative_regressor_feature_windows) - regressors["multiplicative"] = multiplicative_regressors + if len(additive_regressors_names) > 0: + regressors_add_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, additive_regressors_names + ].values + regressors["additive"] = np.expand_dims(regressors_add_future_window, axis=0) + ## OLD + # additive_regressor_feature_windows = [] + # # additive_regressor_feature_windows_lagged = [] + # for i in range(0, len(additive_regressors_names)): + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # x = additive_regressors[:, i] + # window_size = n_lags + n_forecasts + + # if x.ndim == 1: + # shape = (n_samples, window_size) + # else: + # shape = (n_samples, window_size) + x.shape[1:] + + # stride = x.strides[0] + # strides = (stride, stride) + x.strides[1:] + # start_index = max_lags - n_lags + # stride = np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + # additive_regressor_feature_windows.append(stride) + # additive_regressors = np.dstack(additive_regressor_feature_windows) + # regressors["additive"] = additive_regressors + if len(multiplicative_regressors_names) > 0: + regressors_mul_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, multiplicative_regressors_names + ].values + regressors["multiplicative"] = np.expand_dims(regressors_mul_future_window, axis=0) inputs["regressors"] = regressors + ## OLD Future regressors + # additive_regressors, multiplicative_regressors = make_regressors_features(df, config_regressors) + # for max_lags == 0, see code before merge + # if max_lags > 0: + # def _stride_time_features_for_forecasts(x):additive_regressors + # window_size = n_lags + n_forecasts + + # if x.ndim == 1: + # shape = (n_samples, window_size) + # else: + # shape = (n_samples, window_size) + x.shape[1:] + + # stride = x.strides[0] + # strides = (stride, stride) + x.strides[1:] + # start_index = max_lags - n_lags + # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + # if additive_regressors is not None: + # additive_regressor_feature_windows = [] + # # additive_regressor_feature_windows_lagged = [] + # for i in range(0, additive_regressors.shape[1]): + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # stride = _stride_time_features_for_forecasts(additive_regressors[:, i]) + # additive_regressor_feature_windows.append(stride) + # additive_regressors = np.dstack(additive_regressor_feature_windows) + # regressors["additive"] = additive_regressors + + # if multiplicative_regressors is not None: + # multiplicative_regressor_feature_windows = [] + # for i in range(0, multiplicative_regressors.shape[1]): + # stride = _stride_time_features_for_forecasts(multiplicative_regressors[:, i]) + # multiplicative_regressor_feature_windows.append(stride) + # multiplicative_regressors = np.dstack(multiplicative_regressor_feature_windows) + # regressors["multiplicative"] = multiplicative_regressors + # inputs["regressors"] = regressors + + # ----------- TODO convert to single sample version ---------------------- + # TODO: Events + # TODO: Postprocessing & Formatting + # get the events features if config_events is not None or config_country_holidays is not None: additive_events, multiplicative_events = make_events_features(df, config_events, config_country_holidays) @@ -731,12 +807,6 @@ def _stride_future_time_features_for_forecasts(x): events["multiplicative"] = 
multiplicative_events inputs["events"] = events - if predict_mode: - targets = np.empty_like(time[:, n_lags:]) - targets = np.nan_to_num(targets) - else: - targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values) - tabularized_input_shapes_str = "" for key, value in inputs.items(): if key in [ @@ -918,44 +988,44 @@ def make_events_features(df, config_events: Optional[configure.ConfigEvents] = N return additive_events, multiplicative_events -def make_regressors_features(df, config_regressors): - """Construct arrays of all scalar regressor features - Parameters - ---------- - df : pd.DataFrame - Dataframe with all values including the user specified regressors - config_regressors : configure.ConfigFutureRegressors - User specified regressors config - Returns - ------- - np.array - All additive regressor features - np.array - All multiplicative regressor features - """ - additive_regressors = pd.DataFrame() - multiplicative_regressors = pd.DataFrame() - - for reg in df.columns: - if reg in config_regressors: - mode = config_regressors[reg].mode - if mode == "additive": - additive_regressors[reg] = df[reg] - else: - multiplicative_regressors[reg] = df[reg] - - if not additive_regressors.empty: - additive_regressors = additive_regressors[sorted(additive_regressors.columns.tolist())] - additive_regressors = additive_regressors.values - else: - additive_regressors = None - if not multiplicative_regressors.empty: - multiplicative_regressors = multiplicative_regressors[sorted(multiplicative_regressors.columns.tolist())] - multiplicative_regressors = multiplicative_regressors.values - else: - multiplicative_regressors = None - - return additive_regressors, multiplicative_regressors +# def make_regressors_features(df, config_regressors): +# """Construct arrays of all scalar regressor features +# Parameters +# ---------- +# df : pd.DataFrame +# Dataframe with all values including the user specified regressors +# config_regressors : configure.ConfigFutureRegressors +# User specified regressors config +# Returns +# ------- +# np.array +# All additive regressor features +# np.array +# All multiplicative regressor features +# """ +# additive_regressors = pd.DataFrame() +# multiplicative_regressors = pd.DataFrame() + +# for reg in df.columns: +# if reg in config_regressors: +# mode = config_regressors[reg].mode +# if mode == "additive": +# additive_regressors[reg] = df[reg] +# else: +# multiplicative_regressors[reg] = df[reg] + +# if not additive_regressors.empty: +# additive_regressors = additive_regressors[sorted(additive_regressors.columns.tolist())] +# additive_regressors = additive_regressors.values +# else: +# additive_regressors = None +# if not multiplicative_regressors.empty: +# multiplicative_regressors = multiplicative_regressors[sorted(multiplicative_regressors.columns.tolist())] +# multiplicative_regressors = multiplicative_regressors.values +# else: +# multiplicative_regressors = None + +# return additive_regressors, multiplicative_regressors # def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): From a41138e5150687ed3fdf3341667d0527cee19c8e Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 24 Jan 2024 15:06:45 -0800 Subject: [PATCH 033/128] convert events --- neuralprophet/time_dataset.py | 1330 ++++++++++++++++++--------------- 1 file changed, 730 insertions(+), 600 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index f814ec184..5a82a56f6 100644 --- a/neuralprophet/time_dataset.py +++ 
b/neuralprophet/time_dataset.py
@@ -65,16 +65,22 @@ def __init__(self, df, name, **kwargs):
         # ->_create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset
         # Future TODO: integrate some of these preprocessing steps happening outside?
 
-        # TODO: Preprocessing of features (added to self.df)
-        # - events and holidays: convert date-time occurence dictionary to a column of values in the self.df
-        # - These will then be later tabularized in __get_item___
-
         self.df = df
         self.name = name
         self.meta = OrderedDict({})
         self.meta["df_name"] = self.name
         self.config_args = kwargs
 
+        # TODO: Preprocessing of features (added to self.df)
+        # - events and holidays: convert date-time occurrence dictionary to a column of values in the self.df
+        # - These will then be later tabularized in __getitem__
+        # add events based on configuration to df
+        self.df = self.df.reset_index(drop=True)
+        (
+            self.df,
+            self.additive_event_and_holiday_names,
+            self.multiplicative_event_and_holiday_names,
+        ) = add_event_features_to_df(self.df, self.config_args["config_events"], self.config_args["config_country_holidays"])
         self.sample2index_map, self.length = self.create_sample2index_map(df)
 
     def __getitem__(self, index):
@@ -135,7 +141,7 @@ def create_sample2index_map(self, df):
         df_length = len(df)
         max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"])
         n_forecasts = self.config_args["n_forecasts"]
-        origin_start_end_mask = self.create_origin_start_end_mask(
+        origin_start_end_mask = create_origin_start_end_mask(
             df_length=df_length, max_lags=max_lags, n_forecasts=n_forecasts
         )
 
@@ -143,14 +149,14 @@ def create_sample2index_map(self, df):
         # Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping)
         # analogous to `self.filter_samples_after_init(
         #     self.kwargs["prediction_frequency"])`
-        prediction_frequency_mask = self.create_prediction_frequency_filter_mask(
-            self, df, self.config_args["prediction_frequency"]
+        prediction_frequency_mask = create_prediction_frequency_filter_mask(
+            df, self.config_args["prediction_frequency"]
         )
 
-        # TODO Create index mapping of sample index to df index
-        # Drop nan analogous to `self.drop_nan_after_init(
+        # TODO Create NAN-free index mapping of sample index to df index
+        # analogous to `self.drop_nan_after_init(
         #     self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing)
-        nan_mask = self.create_nan_mask(df)  # boolean array where NAN are False
+        nan_mask = create_nan_mask(df)  # boolean array where NAN are False
 
         # Combine masks
         mask = np.logical_and(prediction_frequency_mask, origin_start_end_mask)
@@ -165,137 +171,6 @@
 
         return sample_index_2_df_origin_index, num_samples
 
-    def create_origin_start_end_mask(self, df_length, max_lags, n_forecasts):
-        """Creates a boolean mask for valid prediction origin positions. 
- (based on limiting input lags and forecast targets at start and end of df)""" - if max_lags >= 1: - start_pad = np.zeros(max_lags - 1, dtype=bool) - valid_targets = np.ones(df_length - max_lags - n_forecasts + 1, dtype=bool) - end_pad = np.zeros(n_forecasts, dtype=bool) - target_start_end_mask = np.concatenate((start_pad, valid_targets, end_pad), axis=None) - elif max_lags == 0 and n_forecasts == 1: - # without lags, forecast targets and origins are identical - target_start_end_mask = np.ones(df_length, dtype=bool) - else: - raise ValueError(f"max_lags value of {max_lags} not supported for n_forecasts {n_forecasts}.") - return target_start_end_mask - - def create_prediction_frequency_filter_mask( - self, - df: pd.DataFrame, - prediction_frequency=None, - ): - """Filters prediction origin index from df based on the forecast frequency setting. - - Filter based on timestamp last lag before targets start - - Parameters - ---------- - prediction_frequency : int - periodic interval in which forecasts should be made. - Note - ---- - E.g. if prediction_frequency=7, forecasts are only made on every 7th step (once in a week in case of daily - resolution). - - Returns boolean mask where prediction origin indexes to be included are True, and the rest False. - """ - # !! IMPORTANT - # TODO: Adjust top level documentation to specify that the filter is applied to prediction ORIGIN, not targets start. - # !! IMPORTANT - - mask = np.ones((len(df),), dtype=bool) - - # Basic case: no filter - if prediction_frequency is None or prediction_frequency == 1: - return mask - - # OLD: timestamps were created from "ds" column in tabularization and then re-converted here - # timestamps = pd.to_datetime([x["timestamps"][0] for x in df]) - # OR - # timestamps = df["timestamps"].apply(lambda x: pd.to_datetime(x[0])) - - timestamps = pd.to_datetime(df.loc[:, "ds"].values) - filter_masks = [] - for key, value in prediction_frequency.items(): - if key == "daily-hour": - mask = timestamps.hour == value - elif key == "weekly-day": - mask = timestamps.dayofweek == value - elif key == "monthly-day": - mask = timestamps.day == value - elif key == "yearly-month": - mask = timestamps.month == value - elif key == "hourly-minute": - mask = timestamps.minute == value - else: - raise ValueError(f"Invalid prediction frequency: {key}") - filter_masks.append(mask) - for m in filter_masks: - mask = np.logical_and(mask, m) - return mask - - def create_nan_mask(self, df, predict_steps, drop_missing): - """Creates mask for each prediction origin, - accounting for corresponding input lags / forecast targets containing any NaN values. - - Parameters - ---------- - drop_missing : bool - whether to automatically drop missing samples from the data - predict_steps : int - number of steps to predict - """ - # IMPORTANT !! - # TODO implement actual filtering - return np.ones(len(df), dtype=bool) - - # Create index mapping of sample index to df index - # - Filter missing samples (does not actually drop, but creates indexmapping) - # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - # Note: needs to also account for NANs in lagged inputs or in n_forecasts, not just first target. - # Implement a convolutional filter for targets and each lagged regressor. - # Also account for future regressors and events. 
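The comments above leave the actual NaN filtering as a TODO ("implement a convolutional filter for targets and each lagged regressor"), as does the surviving `create_nan_mask` stub. A sketch of how such a vectorized origin mask could look, under simplifying assumptions (a single `y` series and `n_lags >= 1`; the helper name is hypothetical, not part of the patch):

import numpy as np

def nan_free_origin_mask(y: np.ndarray, n_lags: int, n_forecasts: int) -> np.ndarray:
    # True at origin i iff y[i - n_lags + 1 : i + 1] (lag inputs) and
    # y[i + 1 : i + 1 + n_forecasts] (forecast targets) contain no NaN.
    nan_flags = np.isnan(y).astype(int)
    window = n_lags + n_forecasts
    # NaN count in every sliding window of length n_lags + n_forecasts
    counts = np.convolve(nan_flags, np.ones(window, dtype=int), mode="valid")
    mask = np.zeros(len(y), dtype=bool)
    # the window starting at position j belongs to origin i = j + n_lags - 1
    mask[n_lags - 1 : n_lags - 1 + len(counts)] = counts == 0
    return mask

y = np.array([1.0, np.nan, 3.0, 4.0, 5.0, 6.0])
# origin 3 still sees the NaN in its lag window, origin 4 no longer does
assert not nan_free_origin_mask(y, n_lags=3, n_forecasts=1)[3]
assert nan_free_origin_mask(y, n_lags=3, n_forecasts=1)[4]

Lagged regressors with their own lag counts could be handled the same way, one convolution per column, combined with np.logical_and.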
- - # Rewrite to return mask instead of filtering df: - nan_idx = [] - # NaNs in inputs - for key, data in self.inputs.items(): - if isinstance(data, torch.Tensor): - nans = torch.where(torch.isnan(data))[0].tolist() - if len(nans) > 0: - nan_idx += nans - elif isinstance(data, dict): - for subkey, subdata in data.items(): - nans = torch.where(torch.isnan(subdata))[0].tolist() - if len(nans) > 0: - nan_idx += nans - - # NaNs in targets that are not inserted for prediction at the end - nans = torch.where(torch.isnan(self.targets))[0].tolist() - if len(nans) > 0: - for idx in nans: - if idx not in nan_idx and idx < len(self) - predict_steps: - nan_idx.append(idx) - - nan_idx = list(set(nan_idx)) - nan_idx.sort() - if drop_missing and len(nan_idx) > 0: - log.warning(f"{len(nan_idx)} samples with missing values were dropped from the data. ") - for key, data in self.inputs.items(): - if key not in ["time", "lags"]: # "time_lagged" - for name, features in data.items(): - self.inputs[key][name] = np.delete(self.inputs[key][name], nan_idx, 0) - else: - self.inputs[key] = np.delete(self.inputs[key], nan_idx, 0) - self.targets = np.delete(self.targets, nan_idx, 0) - self.length = self.inputs["time"].shape[0] - if not drop_missing and len(nan_idx) > 0: - raise ValueError( - "Inputs/targets with missing values detected. " - "Please either adjust imputation parameters, or set 'drop_missing' to True to drop those samples." - ) - def format_sample(self, inputs, targets=None): """Convert tabularized sample to correct formats. Parameters @@ -354,474 +229,506 @@ def split_dict(sample_input, index): return sample_input, sample_target - -def tabularize_univariate_datetime_single_index( - df: pd.DataFrame, - origin_index: int, - predict_mode: bool = False, - n_lags: int = 0, - n_forecasts: int = 1, - predict_steps: int = 1, - config_seasonality: Optional[configure.ConfigSeasonality] = None, - config_events: Optional[configure.ConfigEvents] = None, - config_country_holidays=None, - config_lagged_regressors: Optional[configure.ConfigLaggedRegressors] = None, - config_regressors: Optional[configure.ConfigFutureRegressors] = None, - config_missing=None, - config_train=None, - prediction_frequency=None, -): - """Create a tabular data sample from timeseries dataframe, used for mini-batch creation. - Note - ---- - Data must have no gaps for sample extracted at given index position. - ---------- - df : pd.DataFrame - Sequence of observations with original ``ds``, ``y`` and normalized ``t``, ``y_scaled`` columns - origin_index: int: - dataframe index position of last observed lag before forecast starts. 
- config_seasonality : configure.ConfigSeasonality - Configuration for seasonalities - n_lags : int - Number of lagged values of series to include as model inputs (aka AR-order) - n_forecasts : int - Number of steps to forecast into future - config_events : configure.ConfigEvents - User specified events, each with their upper, lower windows (int) and regularization - config_country_holidays : configure.ConfigCountryHolidays - Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays - config_lagged_regressors : configure.ConfigLaggedRegressors - Configurations for lagged regressors - config_regressors : configure.ConfigFutureRegressors - Configuration for regressors - predict_mode : bool - Chooses the prediction mode - Options - * (default) ``False``: Includes target values - * ``True``: Does not include targets but includes entire dataset as input - Returns - ------- - OrderedDict - Model inputs, each of len(df) but with varying dimensions - Note - ---- - Contains the following data: - Model Inputs - * ``time`` (np.array, float), dims: (num_samples, 1) - * ``seasonalities`` (OrderedDict), named seasonalities - each with features (np.array, float) - dims: (num_samples, n_features[name]) - * ``lags`` (np.array, float), dims: (num_samples, n_lags) - * ``covariates`` (OrderedDict), named covariates, - each with features (np.array, float) of dims: (num_samples, n_lags) - * ``events`` (OrderedDict), events, - each with features (np.array, float) of dims: (num_samples, n_lags) - * ``regressors`` (OrderedDict), regressors, - each with features (np.array, float) of dims: (num_samples, n_lags) - np.array, float - Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) - """ - max_lags = get_max_num_lags(config_lagged_regressors, n_lags) - n_samples = 1 - if max_lags == 0: - assert n_forecasts == 1 - - # OLD: previous workaround - # learning_rate = config_train.learning_rate - # if ( - # predict_mode - # or (learning_rate is None) - # or config_lagged_regressors - # or config_country_holidays - # or config_events - # or prediction_frequency - # ): - # n_samples = len(df) - max_lags + 1 - n_forecasts - - if predict_mode: - targets = np.zeros((1, n_forecasts)) - ## OLD - # # time is the time at each forecast step - # t = df.loc[:, "t"].values - # if max_lags == 0: - # time = np.expand_dims(t, 1) - # else: - # time = _stride_time_features_for_forecasts(t) - # inputs["time"] = time # contains n_lags + n_forecasts - # targets = np.empty_like(time[:, n_lags:]) - # targets = np.nan_to_num(targets) - else: - targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"].values - targets = np.expand_dims(targets, axis=1) - ## Alternative - # x = df["y_scaled"].values - # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype) - ## OLD - # # time is the time at each forecast step - # t = df.loc[:, "t"].values - # if max_lags == 0: - # time = np.expand_dims(t, 1) - # else: - # time = _stride_time_features_for_forecasts(t) - # inputs["time"] = time # contains n_lags + n_forecasts - # def _stride_future_time_features_for_forecasts(x): - # return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) - # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values) - - # data is stored in OrderedDict - inputs = OrderedDict({}) - - # TIME: the time at each sample's lags and forecasts - if max_lags == 0: - inputs["time"] = 
df.loc[origin_index, "t"].values - # TODO: Possibly need extra dim? - # inputs["time"] = np.expand_dims(inputs["time"], 1) - else: - # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index - inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"].values - ## OLD: Time - # def _stride_time_features_for_forecasts(x): - # window_size = n_lags + n_forecasts - - # if x.ndim == 1: - # shape = (n_samples, window_size) - # else: - # shape = (n_samples, window_size) + x.shape[1:] - - # stride = x.strides[0] - # strides = (stride, stride) + x.strides[1:] - # start_index = max_lags - n_lags - # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - # inputs["time"] = _stride_time_features_for_forecasts(df.loc[:, "t"].values) - - # LAGS: From y-series, extract preceeding n_lags steps up to and including origin_index - if n_lags >= 1 and "y" in df.columns: - # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) - inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values - # OLD Lags - # def _stride_lagged_features(df_col_name, feature_dims): - # # only for case where max_lags > 0 - # assert feature_dims >= 1 - # series = df.loc[:, df_col_name].values - # # Added dtype=np.float64 to solve the problem with np.isnan for ubuntu test - # return np.array( - # [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32 - # ) - # inputs["lags"] = _stride_lagged_features(df_col_name="y_scaled", feature_dims=n_lags) - - # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS - if config_lagged_regressors is not None and max_lags > 0: - lagged_regressors = OrderedDict({}) - # Future TODO: optimize this computation for many lagged_regressors - for lagged_reg in df.columns: - if lagged_reg in config_lagged_regressors: - assert config_lagged_regressors[lagged_reg].n_lags > 0 - covar_lags = config_lagged_regressors[lagged_reg].n_lags - lagged_regressors[lagged_reg] = df.loc[ - origin_index - covar_lags + 1 : origin_index + 1, lagged_reg - ].values - inputs["covariates"] = lagged_regressors - # OLD Covariates - # def _stride_lagged_features(df_col_name, feature_dims): - # # only for case where max_lags > 0 - # assert feature_dims >= 1 - # series = df.loc[:, df_col_name].values - # # Added dtype=np.float64 to solve the problem with np.isnan for ubuntu test - # return np.array( - # [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32 - # ) - # for covar in df.columns: - # if covar in config_lagged_regressors: - # assert config_lagged_regressors[covar].n_lags > 0 - # window = config_lagged_regressors[covar].n_lags - # covariates[covar] = _stride_lagged_features(df_col_name=covar, feature_dims=window) - # inputs["covariates"] = covariates - - # SEASONALITIES - if config_seasonality is not None: - seasonalities = OrderedDict({}) + def tabularize_univariate_datetime_single_index( + self, + df: pd.DataFrame, + origin_index: int, + predict_mode: bool = False, + n_lags: int = 0, + n_forecasts: int = 1, + predict_steps: int = 1, + config_seasonality: Optional[configure.ConfigSeasonality] = None, + config_events: Optional[configure.ConfigEvents] = None, + config_country_holidays=None, + config_lagged_regressors: Optional[configure.ConfigLaggedRegressors] = None, + config_regressors: 
Optional[configure.ConfigFutureRegressors] = None, + config_missing=None, + config_train=None, + prediction_frequency=None, + ): + """Create a tabular data sample from timeseries dataframe, used for mini-batch creation. + Note + ---- + Data must have no gaps for sample extracted at given index position. + ---------- + df : pd.DataFrame + Sequence of observations with original ``ds``, ``y`` and normalized ``t``, ``y_scaled`` columns + origin_index: int: + dataframe index position of last observed lag before forecast starts. + config_seasonality : configure.ConfigSeasonality + Configuration for seasonalities + n_lags : int + Number of lagged values of series to include as model inputs (aka AR-order) + n_forecasts : int + Number of steps to forecast into future + config_events : configure.ConfigEvents + User specified events, each with their upper, lower windows (int) and regularization + config_country_holidays : configure.ConfigCountryHolidays + Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays + config_lagged_regressors : configure.ConfigLaggedRegressors + Configurations for lagged regressors + config_regressors : configure.ConfigFutureRegressors + Configuration for regressors + predict_mode : bool + Chooses the prediction mode + Options + * (default) ``False``: Includes target values + * ``True``: Does not include targets but includes entire dataset as input + Returns + ------- + OrderedDict + Model inputs, each of len(df) but with varying dimensions + Note + ---- + Contains the following data: + Model Inputs + * ``time`` (np.array, float), dims: (num_samples, 1) + * ``seasonalities`` (OrderedDict), named seasonalities + each with features (np.array, float) - dims: (num_samples, n_features[name]) + * ``lags`` (np.array, float), dims: (num_samples, n_lags) + * ``covariates`` (OrderedDict), named covariates, + each with features (np.array, float) of dims: (num_samples, n_lags) + * ``events`` (OrderedDict), events, + each with features (np.array, float) of dims: (num_samples, n_lags) + * ``regressors`` (OrderedDict), regressors, + each with features (np.array, float) of dims: (num_samples, n_lags) + np.array, float + Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) + """ + max_lags = get_max_num_lags(config_lagged_regressors, n_lags) + n_samples = 1 if max_lags == 0: - dates = df.loc[origin_index, "ds"] + assert n_forecasts == 1 + + # OLD: previous workaround + # learning_rate = config_train.learning_rate + # if ( + # predict_mode + # or (learning_rate is None) + # or config_lagged_regressors + # or config_country_holidays + # or config_events + # or prediction_frequency + # ): + # n_samples = len(df) - max_lags + 1 - n_forecasts + + if predict_mode: + targets = np.zeros((1, n_forecasts)) + ## OLD + # # time is the time at each forecast step + # t = df.loc[:, "t"].values + # if max_lags == 0: + # time = np.expand_dims(t, 1) + # else: + # time = _stride_time_features_for_forecasts(t) + # inputs["time"] = time # contains n_lags + n_forecasts + # targets = np.empty_like(time[:, n_lags:]) + # targets = np.nan_to_num(targets) else: - dates = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"] - assert len(dates.shape) == 1 - # Seasonality features - for name, period in config_seasonality.periods.items(): - if period.resolution > 0: - if config_seasonality.computation == "fourier": - # Compute Fourier series components with the specified frequency and order. 
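For orientation while this block is being moved: each Fourier order `i` contributes one sine and one cosine column, so the seasonality feature matrix has shape `(len(dates), 2 * resolution)`, matching the corrected dims comment. A standalone sketch of the same computation with illustrative values:

import numpy as np
import pandas as pd
from datetime import datetime

dates = pd.Series(pd.date_range("2024-01-01", periods=48, freq="H"))
period, resolution = 7.0, 3  # weekly seasonality, Fourier order 3

# days since epoch, as in the patch
t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0)
features = np.column_stack(
    [np.sin(2.0 * (i + 1) * np.pi * t / period) for i in range(resolution)]
    + [np.cos(2.0 * (i + 1) * np.pi * t / period) for i in range(resolution)]
)
assert features.shape == (len(dates), 2 * resolution)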
-                    # convert to days since epoch
-                    t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0)
-                    # features: Matrix with dims (length len(dates), 2*resolution)
-                    features = np.column_stack(
+            targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"].values
+            targets = np.expand_dims(targets, axis=1)
+            ## Alternative
+            # x = df["y_scaled"].values
+            # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype)
+            ## OLD
+            # # time is the time at each forecast step
+            # t = df.loc[:, "t"].values
+            # if max_lags == 0:
+            #     time = np.expand_dims(t, 1)
+            # else:
+            #     time = _stride_time_features_for_forecasts(t)
+            # inputs["time"] = time  # contains n_lags + n_forecasts
+            # def _stride_future_time_features_for_forecasts(x):
+            #     return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype)
+            # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values)
+
+        # data is stored in OrderedDict
+        inputs = OrderedDict({})
+
+        # TIME: the time at each sample's lags and forecasts
+        if max_lags == 0:
+            inputs["time"] = df.loc[origin_index, "t"].values
+            # TODO: Possibly need extra dim?
+            # inputs["time"] = np.expand_dims(inputs["time"], 1)
+        else:
+            # extract time value of n_lags steps before and including origin_index and n_forecasts steps after origin_index
+            inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"].values
+            ## OLD: Time
+            # def _stride_time_features_for_forecasts(x):
+            #     window_size = n_lags + n_forecasts

+            #     if x.ndim == 1:
+            #         shape = (n_samples, window_size)
+            #     else:
+            #         shape = (n_samples, window_size) + x.shape[1:]

+            #     stride = x.strides[0]
+            #     strides = (stride, stride) + x.strides[1:]
+            #     start_index = max_lags - n_lags
+            #     return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides)
+            # inputs["time"] = _stride_time_features_for_forecasts(df.loc[:, "t"].values)
+
+        # LAGS: From y-series, extract preceding n_lags steps up to and including origin_index
+        if n_lags >= 1 and "y" in df.columns:
+            # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32)
+            inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values
+            # OLD Lags
+            # def _stride_lagged_features(df_col_name, feature_dims):
+            #     # only for case where max_lags > 0
+            #     assert feature_dims >= 1
+            #     series = df.loc[:, df_col_name].values
+            #     # Added dtype=np.float64 to solve the problem with np.isnan for ubuntu test
+            #     return np.array(
+            #         [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32
+            #     )
+            # inputs["lags"] = _stride_lagged_features(df_col_name="y_scaled", feature_dims=n_lags)
+
+        # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS
+        if config_lagged_regressors is not None and max_lags > 0:
+            lagged_regressors = OrderedDict({})
+            # Future TODO: optimize this computation for many lagged_regressors
+            for lagged_reg in df.columns:
+                if lagged_reg in config_lagged_regressors:
+                    assert config_lagged_regressors[lagged_reg].n_lags > 0
+                    covar_lags = config_lagged_regressors[lagged_reg].n_lags
+                    lagged_regressors[lagged_reg] = df.loc[
+                        origin_index - covar_lags + 1 : origin_index + 1, 
lagged_reg + ].values + inputs["covariates"] = lagged_regressors + # OLD Covariates + # def _stride_lagged_features(df_col_name, feature_dims): + # # only for case where max_lags > 0 + # assert feature_dims >= 1 + # series = df.loc[:, df_col_name].values + # # Added dtype=np.float64 to solve the problem with np.isnan for ubuntu test + # return np.array( + # [series[i + max_lags - feature_dims : i + max_lags] for i in range(n_samples)], dtype=np.float32 + # ) + # for covar in df.columns: + # if covar in config_lagged_regressors: + # assert config_lagged_regressors[covar].n_lags > 0 + # window = config_lagged_regressors[covar].n_lags + # covariates[covar] = _stride_lagged_features(df_col_name=covar, feature_dims=window) + # inputs["covariates"] = covariates + + # SEASONALITIES + if config_seasonality is not None: + seasonalities = OrderedDict({}) + if max_lags == 0: + dates = df.loc[origin_index, "ds"] + else: + dates = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"] + assert len(dates.shape) == 1 + # Seasonality features + for name, period in config_seasonality.periods.items(): + if period.resolution > 0: + if config_seasonality.computation == "fourier": + # Compute Fourier series components with the specified frequency and order. + # convert to days since epoch + t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / ( + 3600 * 24.0 + ) + # features: Matrix with dims (length len(dates), 2*resolution) + features = np.column_stack( + [np.sin((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] + + [np.cos((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] + ) + # Single nested loop version: + # features = np.column_stack( + # [ + # fun((2.0 * (i + 1) * np.pi * t / period.period)) + # for i in range(period.resolution) + # for fun in (np.sin, np.cos) + # ] + # ) + else: + raise NotImplementedError + if period.condition_name is not None: + # multiply seasonality features with condition mask/values + features = features * df[period.condition_name].values[:, np.newaxis] + + seasonalities[name] = features + # TODO: Possibly need extra dim? + # seasonalities[name] = np.expand_dims(seasonalities[name], 1) + inputs["seasonalities"] = seasonalities + + ## OLD Seasonality + # def fourier_series_t(t, period, series_order): + # """Provides Fourier series components with the specified frequency and order. + # Note + # ---- + # This function is identical to Meta AI's Prophet Library + # Parameters + # ---------- + # t : pd.Series, float + # Containing time as floating point number of days + # period : float + # Number of days of the period + # series_order : int + # Number of fourier components + # Returns + # ------- + # np.array + # Matrix with seasonality features + # """ + # features = np.column_stack( + # [fun((2.0 * (i + 1) * np.pi * t / period)) for i in range(series_order) for fun in (np.sin, np.cos)] + # ) + # return features + + # def fourier_series(dates, period, series_order): + # """Provides Fourier series components with the specified frequency and order. + # Note + # ---- + # Identical to OG Prophet. 
+ # Parameters + # ---------- + # dates : pd.Series + # Containing time stamps + # period : float + # Number of days of the period + # series_order : int + # Number of fourier components + # Returns + # ------- + # np.array + # Matrix with seasonality features + # """ + # # convert to days since epoch + # t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) + # return fourier_series_t(t, period, series_order) + + # def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): + # """Dataframe with seasonality features. + # Includes seasonality features + # Parameters + # ---------- + # df : pd.DataFrame + # Dataframe with all values + # config_seasonality : configure.ConfigSeasonality + # Configuration for seasonalities + # Returns + # ------- + # OrderedDict + # Dictionary with keys for each period name containing an np.array + # with the respective regression features. each with dims: (len(dates), 2*fourier_order) + # """ + # dates = df["ds"] + # assert len(dates.shape) == 1 + # seasonalities = OrderedDict({}) + # # Seasonality features + # for name, period in config_seasonality.periods.items(): + # if period.resolution > 0: + # if config_seasonality.computation == "fourier": + # # features: Matrix with dims (length len(dates), 2*resolution) + # features = fourier_series( + # dates=dates, + # period=period.period, + # series_order=period.resolution, + # ) + # else: + # raise NotImplementedError + # if period.condition_name is not None + # # multiply seasonality features with condition mask/values: + # features = features * df[period.condition_name].values[:, np.newaxis] + # seasonalities[name] = features + # return seasonalities + + # def _stride_time_features_for_seasonality(x): + # window_size = n_lags + n_forecasts + + # if x.ndim == 1: + # shape = (n_samples, window_size) + # else: + # shape = (n_samples, window_size) + x.shape[1:] + + # stride = x.strides[0] + # strides = (stride, stride) + x.strides[1:] + # start_index = max_lags - n_lags + # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + + # seasonalities = seasonal_features_from_dates(df, config_seasonality) + # for name, features in seasonalities.items(): + # if max_lags == 0: + # seasonalities[name] = np.expand_dims(features, axis=1) + # else: + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # seasonalities[name] = _stride_time_features_for_seasonality(features) + # inputs["seasonalities"] = seasonalities + + # FUTURE REGRESSORS: get the future regressors features + if config_regressors is not None: + # sort and divide regressors into multiplicative and additive + additive_regressors_names = [] + multiplicative_regressors_names = [] + for reg in sorted(df.columns.tolist()): + if reg in config_regressors: + mode = config_regressors[reg].mode + if mode == "additive": + additive_regressors_names.append(reg) + else: + multiplicative_regressors_names.append(reg) + + # create numpy array of values of additive and multiplicative regressors, at correct indexes + # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) + regressors = OrderedDict({}) + regressors["additive"] = None + regressors["multiplicative"] = None + if max_lags == 0: + if len(additive_regressors_names) > 0: + regressors["additive"] = np.expand_dims( + df.loc[origin_index, additive_regressors_names].values, axis=0 ) - # Single nested loop version: - # features = np.column_stack( - # [ - # fun((2.0 * (i + 
1) * np.pi * t / period.period)) - # for i in range(period.resolution) - # for fun in (np.sin, np.cos) - # ] - # ) - else: - raise NotImplementedError - if period.condition_name is not None: - # multiply seasonality features with condition mask/values - features = features * df[period.condition_name].values[:, np.newaxis] - - seasonalities[name] = features - # TODO: Possibly need extra dim? - # seasonalities[name] = np.expand_dims(seasonalities[name], 1) - inputs["seasonalities"] = seasonalities - - ## OLD Seasonality - # def fourier_series_t(t, period, series_order): - # """Provides Fourier series components with the specified frequency and order. - # Note - # ---- - # This function is identical to Meta AI's Prophet Library - # Parameters - # ---------- - # t : pd.Series, float - # Containing time as floating point number of days - # period : float - # Number of days of the period - # series_order : int - # Number of fourier components - # Returns - # ------- - # np.array - # Matrix with seasonality features - # """ - # features = np.column_stack( - # [fun((2.0 * (i + 1) * np.pi * t / period)) for i in range(series_order) for fun in (np.sin, np.cos)] - # ) - # return features - - # def fourier_series(dates, period, series_order): - # """Provides Fourier series components with the specified frequency and order. - # Note - # ---- - # Identical to OG Prophet. - # Parameters - # ---------- - # dates : pd.Series - # Containing time stamps - # period : float - # Number of days of the period - # series_order : int - # Number of fourier components - # Returns - # ------- - # np.array - # Matrix with seasonality features - # """ - # # convert to days since epoch - # t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) - # return fourier_series_t(t, period, series_order) - - # def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): - # """Dataframe with seasonality features. - # Includes seasonality features - # Parameters - # ---------- - # df : pd.DataFrame - # Dataframe with all values - # config_seasonality : configure.ConfigSeasonality - # Configuration for seasonalities - # Returns - # ------- - # OrderedDict - # Dictionary with keys for each period name containing an np.array - # with the respective regression features. 
each with dims: (len(dates), 2*fourier_order) - # """ - # dates = df["ds"] - # assert len(dates.shape) == 1 - # seasonalities = OrderedDict({}) - # # Seasonality features - # for name, period in config_seasonality.periods.items(): - # if period.resolution > 0: - # if config_seasonality.computation == "fourier": - # # features: Matrix with dims (length len(dates), 2*resolution) - # features = fourier_series( - # dates=dates, - # period=period.period, - # series_order=period.resolution, - # ) - # else: - # raise NotImplementedError - # if period.condition_name is not None - # # multiply seasonality features with condition mask/values: - # features = features * df[period.condition_name].values[:, np.newaxis] - # seasonalities[name] = features - # return seasonalities - - # def _stride_time_features_for_seasonality(x): - # window_size = n_lags + n_forecasts - - # if x.ndim == 1: - # shape = (n_samples, window_size) - # else: - # shape = (n_samples, window_size) + x.shape[1:] - - # stride = x.strides[0] - # strides = (stride, stride) + x.strides[1:] - # start_index = max_lags - n_lags - # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - - # seasonalities = seasonal_features_from_dates(df, config_seasonality) - # for name, features in seasonalities.items(): - # if max_lags == 0: - # seasonalities[name] = np.expand_dims(features, axis=1) - # else: - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # seasonalities[name] = _stride_time_features_for_seasonality(features) - # inputs["seasonalities"] = seasonalities - - # FUTURE REGRESSORS: get the future regressors features - if config_regressors is not None: - # sort and divide regressors into multiplicative and additive - additive_regressors_names = [] - multiplicative_regressors_names = [] - for reg in sorted(df.columns.tolist()): - if reg in config_regressors: - mode = config_regressors[reg].mode - if mode == "additive": - additive_regressors_names.append(reg) - else: - multiplicative_regressors_names.append(reg) - - # create numpy array of values of additive and multiplicative regressors, at correct indexes - # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) - regressors = OrderedDict({}) - regressors["additive"] = None - regressors["multiplicative"] = None + if len(multiplicative_regressors_names) > 0: + regressors["multiplicative"] = np.expand_dims( + df.loc[origin_index, multiplicative_regressors_names].values, axis=0 + ) + else: + if len(additive_regressors_names) > 0: + regressors_add_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, additive_regressors_names + ].values + regressors["additive"] = np.expand_dims(regressors_add_future_window, axis=0) + ## OLD + # additive_regressor_feature_windows = [] + # # additive_regressor_feature_windows_lagged = [] + # for i in range(0, len(additive_regressors_names)): + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # x = additive_regressors[:, i] + # window_size = n_lags + n_forecasts + + # if x.ndim == 1: + # shape = (n_samples, window_size) + # else: + # shape = (n_samples, window_size) + x.shape[1:] + + # stride = x.strides[0] + # strides = (stride, stride) + x.strides[1:] + # start_index = max_lags - n_lags + # stride = np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + # additive_regressor_feature_windows.append(stride) + # additive_regressors = np.dstack(additive_regressor_feature_windows) + # 
regressors["additive"] = additive_regressors + if len(multiplicative_regressors_names) > 0: + regressors_mul_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, multiplicative_regressors_names + ].values + regressors["multiplicative"] = np.expand_dims(regressors_mul_future_window, axis=0) + inputs["regressors"] = regressors + + ## OLD Future regressors + # additive_regressors, multiplicative_regressors = make_regressors_features(df, config_regressors) + # for max_lags == 0, see code before merge + # if max_lags > 0: + # def _stride_time_features_for_forecasts(x):additive_regressors + # window_size = n_lags + n_forecasts + + # if x.ndim == 1: + # shape = (n_samples, window_size) + # else: + # shape = (n_samples, window_size) + x.shape[1:] + + # stride = x.strides[0] + # strides = (stride, stride) + x.strides[1:] + # start_index = max_lags - n_lags + # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) + # if additive_regressors is not None: + # additive_regressor_feature_windows = [] + # # additive_regressor_feature_windows_lagged = [] + # for i in range(0, additive_regressors.shape[1]): + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # stride = _stride_time_features_for_forecasts(additive_regressors[:, i]) + # additive_regressor_feature_windows.append(stride) + # additive_regressors = np.dstack(additive_regressor_feature_windows) + # regressors["additive"] = additive_regressors + + # if multiplicative_regressors is not None: + # multiplicative_regressor_feature_windows = [] + # for i in range(0, multiplicative_regressors.shape[1]): + # stride = _stride_time_features_for_forecasts(multiplicative_regressors[:, i]) + # multiplicative_regressor_feature_windows.append(stride) + # multiplicative_regressors = np.dstack(multiplicative_regressor_feature_windows) + # regressors["multiplicative"] = multiplicative_regressors + # inputs["regressors"] = regressors + + # FUTURE EVENTS: get the events features + # create numpy array of values of additive and multiplicative events, at correct indexes + # features dims: (n_samples/batch, n_forecasts, n_features/n_events) + events = OrderedDict({}) + events["additive"] = None + events["multiplicative"] = None if max_lags == 0: - if len(additive_regressors_names) > 0: - regressors["additive"] = np.expand_dims(df.loc[origin_index, additive_regressors_names].values, axis=0) - if len(multiplicative_regressors_names) > 0: - regressors["multiplicative"] = np.expand_dims( - df.loc[origin_index, multiplicative_regressors_names].values, axis=0 + if len(self.additive_event_and_holiday_names) > 0: + events["additive"] = np.expand_dims( + df.loc[origin_index, self.additive_event_and_holiday_names].values, axis=0 + ) + if len(self.multiplicative_event_and_holiday_names) > 0: + events["multiplicative"] = np.expand_dims( + df.loc[origin_index, self.multiplicative_event_and_holiday_names].values, axis=0 ) else: - if len(additive_regressors_names) > 0: - regressors_add_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, additive_regressors_names + if len(self.additive_event_and_holiday_names) > 0: + events_add_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, self.additive_event_and_holiday_names ].values - regressors["additive"] = np.expand_dims(regressors_add_future_window, axis=0) - ## OLD - # additive_regressor_feature_windows = [] - # # additive_regressor_feature_windows_lagged = [] - # for i in range(0, 
len(additive_regressors_names)): - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # x = additive_regressors[:, i] - # window_size = n_lags + n_forecasts - - # if x.ndim == 1: - # shape = (n_samples, window_size) - # else: - # shape = (n_samples, window_size) + x.shape[1:] - - # stride = x.strides[0] - # strides = (stride, stride) + x.strides[1:] - # start_index = max_lags - n_lags - # stride = np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - # additive_regressor_feature_windows.append(stride) - # additive_regressors = np.dstack(additive_regressor_feature_windows) - # regressors["additive"] = additive_regressors - if len(multiplicative_regressors_names) > 0: - regressors_mul_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, multiplicative_regressors_names + events["additive"] = np.expand_dims(events_add_future_window, axis=0) + if len(self.multiplicative_event_and_holiday_names) > 0: + events_mul_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, self.multiplicative_event_and_holiday_names ].values - regressors["multiplicative"] = np.expand_dims(regressors_mul_future_window, axis=0) - inputs["regressors"] = regressors - - ## OLD Future regressors - # additive_regressors, multiplicative_regressors = make_regressors_features(df, config_regressors) - # for max_lags == 0, see code before merge - # if max_lags > 0: - # def _stride_time_features_for_forecasts(x):additive_regressors - # window_size = n_lags + n_forecasts - - # if x.ndim == 1: - # shape = (n_samples, window_size) - # else: - # shape = (n_samples, window_size) + x.shape[1:] - - # stride = x.strides[0] - # strides = (stride, stride) + x.strides[1:] - # start_index = max_lags - n_lags - # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - # if additive_regressors is not None: - # additive_regressor_feature_windows = [] - # # additive_regressor_feature_windows_lagged = [] - # for i in range(0, additive_regressors.shape[1]): - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # stride = _stride_time_features_for_forecasts(additive_regressors[:, i]) - # additive_regressor_feature_windows.append(stride) - # additive_regressors = np.dstack(additive_regressor_feature_windows) - # regressors["additive"] = additive_regressors - - # if multiplicative_regressors is not None: - # multiplicative_regressor_feature_windows = [] - # for i in range(0, multiplicative_regressors.shape[1]): - # stride = _stride_time_features_for_forecasts(multiplicative_regressors[:, i]) - # multiplicative_regressor_feature_windows.append(stride) - # multiplicative_regressors = np.dstack(multiplicative_regressor_feature_windows) - # regressors["multiplicative"] = multiplicative_regressors - # inputs["regressors"] = regressors - - # ----------- TODO convert to single sample version ---------------------- - # TODO: Events - # TODO: Postprocessing & Formatting - - # get the events features - if config_events is not None or config_country_holidays is not None: - additive_events, multiplicative_events = make_events_features(df, config_events, config_country_holidays) - - events = OrderedDict({}) - if max_lags == 0: - if additive_events is not None: - events["additive"] = np.expand_dims(additive_events, axis=1) - if multiplicative_events is not None: - events["multiplicative"] = np.expand_dims(multiplicative_events, axis=1) - else: - if additive_events is not None: - 
additive_event_feature_windows = [] - for i in range(0, additive_events.shape[1]): - # stride into num_forecast at dim=1 for each sample, just like we did with time - additive_event_feature_windows.append(_stride_time_features_for_forecasts(additive_events[:, i])) - additive_events = np.dstack(additive_event_feature_windows) - events["additive"] = additive_events - - if multiplicative_events is not None: - multiplicative_event_feature_windows = [] - # multiplicative_event_feature_windows_lagged = [] - for i in range(0, multiplicative_events.shape[1]): - # stride into num_forecast at dim=1 for each sample, just like we did with time - multiplicative_event_feature_windows.append( - _stride_time_features_for_forecasts(multiplicative_events[:, i]) - ) - multiplicative_events = np.dstack(multiplicative_event_feature_windows) - events["multiplicative"] = multiplicative_events + events["multiplicative"] = np.expand_dims(events_mul_future_window, axis=0) inputs["events"] = events - tabularized_input_shapes_str = "" - for key, value in inputs.items(): - if key in [ - "seasonalities", - "covariates", - "events", - "regressors", - ]: - for name, period_features in value.items(): - tabularized_input_shapes_str += f" {name} {key} {period_features}\n" - else: - tabularized_input_shapes_str += f" {key} {value.shape} \n" - log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") + ## OLD + # # get the events features + # if config_events is not None or config_country_holidays is not None: + # additive_events, multiplicative_events = make_events_features(df, config_events, config_country_holidays) - return inputs, targets + # events = OrderedDict({}) + # if max_lags == 0: + # if additive_events is not None: + # events["additive"] = np.expand_dims(additive_events, axis=1) + # if multiplicative_events is not None: + # events["multiplicative"] = np.expand_dims(multiplicative_events, axis=1) + # else: + # if additive_events is not None: + # additive_event_feature_windows = [] + # for i in range(0, additive_events.shape[1]): + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # additive_event_feature_windows.append(_stride_time_features_for_forecasts(additive_events[:, i])) + # additive_events = np.dstack(additive_event_feature_windows) + # events["additive"] = additive_events + + # if multiplicative_events is not None: + # multiplicative_event_feature_windows = [] + # # multiplicative_event_feature_windows_lagged = [] + # for i in range(0, multiplicative_events.shape[1]): + # # stride into num_forecast at dim=1 for each sample, just like we did with time + # multiplicative_event_feature_windows.append( + # _stride_time_features_for_forecasts(multiplicative_events[:, i]) + # ) + # multiplicative_events = np.dstack(multiplicative_event_feature_windows) + # events["multiplicative"] = multiplicative_events + # inputs["events"] = events + + # ----------- TODO convert to single sample version ---------------------- + # TODO: Postprocessing & Formatting + + tabularized_input_shapes_str = "" + for key, value in inputs.items(): + if key in [ + "seasonalities", + "covariates", + "events", + "regressors", + ]: + for name, period_features in value.items(): + tabularized_input_shapes_str += f" {name} {key} {period_features}\n" + else: + tabularized_input_shapes_str += f" {key} {value.shape} \n" + log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") + + return inputs, targets def fourier_series(dates, period, series_order): @@ -871,7 +778,7 @@ def 
fourier_series_t(t, period, series_order):
     return features
 
 
-def make_country_specific_holidays_df(year_list, country):
+def make_country_specific_holidays_dict(year_list, country):
     """
     Make dict of country specific holidays for given years and countries
     Parameters
@@ -900,6 +807,32 @@ def make_country_specific_holidays_df(year_list, country):
     return country_specific_holidays_dict
 
 
+def get_event_offset_features(event, config, feature):
+    """
+    Create event offset features for the given event, config and feature
+    Parameters
+    ----------
+    event : str
+        Name of the event
+    config : configure.ConfigEvents
+        User specified events, holidays, and country specific holidays
+    feature : pd.Series
+        Feature for the event
+    Returns
+    -------
+    pd.DataFrame
+        Event features, one shifted column per offset in the event window
+    """
+    events = pd.DataFrame({})
+    lw = config.lower_window
+    uw = config.upper_window
+    for offset in range(lw, uw + 1):
+        key = utils.create_event_names_for_offsets(event, offset)
+        offset_feature = feature.shift(periods=offset, fill_value=0.0)
+        events[key] = offset_feature
+    return events
+
+
 def _create_event_offset_features(event, config, feature, additive_events, multiplicative_events):
     """
     Create event offset features for the given event, config and feature
@@ -932,6 +865,73 @@ def _create_event_offset_features(event, config, feature, additive_events, multi
         multiplicative_events[key] = offset_feature
 
 
+def add_event_features_to_df(
+    df,
+    config_events: Optional[configure.ConfigEvents] = None,
+    config_country_holidays: Optional[configure.ConfigCountryHolidays] = None,
+):
+    """
+    Construct columns containing the features of each event, added to df.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with all values including the user specified events (provided by user)
+    config_events : configure.ConfigEvents
+        User specified events, each with their upper, lower windows (int), regularization
+    config_country_holidays : configure.ConfigCountryHolidays
+        Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays
+    Returns
+    -------
+    pd.DataFrame
+        Input df with columns added for all event features (both user specified and country specific)
+    list of str
+        Names of all additive event and holiday features
+    list of str
+        Names of all multiplicative event and holiday features
+    """
+    # create all additional user specified offset events
+    additive_events_names = []
+    multiplicative_events_names = []
+    if config_events is not None:
+        for event in sorted(list(config_events.keys())):
+            feature = df[event]
+            config = config_events[event]
+            mode = config.mode
+            for offset in range(config.lower_window, config.upper_window + 1):
+                event_offset_name = utils.create_event_names_for_offsets(event, offset)
+                df[event_offset_name] = feature.shift(periods=offset, fill_value=0.0)
+                if mode == "additive":
+                    additive_events_names.append(event_offset_name)
+                else:
+                    multiplicative_events_names.append(event_offset_name)
+
+    # create all country specific holidays and their offsets.
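# As an aside, a minimal runnable sketch of the offset expansion performed in the
# loop above (and repeated for the holidays below): each 0/1 event column is turned
# into one shifted column per offset in [lower_window, upper_window]. The toy frame
# and the "promo" event name are assumptions for illustration only, and the column
# naming is simplified relative to utils.create_event_names_for_offsets.
import pandas as pd

toy = pd.DataFrame({"promo": [0.0, 0.0, 1.0, 0.0, 0.0]})
lower_window, upper_window = -1, 1
for offset in range(lower_window, upper_window + 1):
    toy[f"promo_{offset:+d}"] = toy["promo"].shift(periods=offset, fill_value=0.0)
# The event at row 2 now also marks row 1 (offset -1) and row 3 (offset +1).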
+    additive_holiday_names = []
+    multiplicative_holiday_names = []
+    if config_country_holidays is not None:
+        year_list = list({x.year for x in df.ds})
+        country_holidays_dict = make_country_specific_holidays_dict(year_list, config_country_holidays.country)
+        config = config_country_holidays
+        mode = config.mode
+        for holiday in config_country_holidays.holiday_names:
+            # feature = pd.Series([0.0] * df.shape[0])
+            feature = pd.Series(np.zeros(df.shape[0], dtype=np.float32))
+            if holiday in country_holidays_dict.keys():
+                dates = country_holidays_dict[holiday]
+                feature[df.ds.isin(dates)] = 1.0
+            else:
+                raise ValueError(f"Holiday {holiday} not found in country holidays")
+            for offset in range(config.lower_window, config.upper_window + 1):
+                holiday_offset_name = utils.create_event_names_for_offsets(holiday, offset)
+                df[holiday_offset_name] = feature.shift(periods=offset, fill_value=0.0)
+                if mode == "additive":
+                    additive_holiday_names.append(holiday_offset_name)
+                else:
+                    multiplicative_holiday_names.append(holiday_offset_name)
+    # Future TODO: possibly undo merge of events and holidays.
+    additive_event_and_holiday_names = sorted(additive_events_names + additive_holiday_names)
+    multiplicative_event_and_holiday_names = sorted(multiplicative_events_names + multiplicative_holiday_names)
+    return df, additive_event_and_holiday_names, multiplicative_event_and_holiday_names
+
+
 def make_events_features(df, config_events: Optional[configure.ConfigEvents] = None, config_country_holidays=None):
     """
     Construct arrays of all event features
@@ -963,7 +963,7 @@ def make_events_features(df, config_events: Optional[configure.ConfigEvents] = N
     # create all country specific holidays
     if config_country_holidays is not None:
         year_list = list({x.year for x in df.ds})
-        country_holidays_dict = make_country_specific_holidays_df(year_list, config_country_holidays.country)
+        country_holidays_dict = make_country_specific_holidays_dict(year_list, config_country_holidays.country)
         for holiday in config_country_holidays.holiday_names:
             feature = pd.Series([0.0] * df.shape[0])
             if holiday in country_holidays_dict.keys():
@@ -1061,3 +1061,133 @@ def make_events_features(df, config_events: Optional[configure.ConfigEvents] = N
 #             features = features * df[period.condition_name].values[:, np.newaxis]
 #         seasonalities[name] = features
 #     return seasonalities
+
+
+def create_origin_start_end_mask(df_length, max_lags, n_forecasts):
+    """Creates a boolean mask for valid prediction origin positions.
+    (based on limiting input lags and forecast targets at start and end of df)"""
+    if max_lags >= 1:
+        start_pad = np.zeros(max_lags - 1, dtype=bool)
+        valid_targets = np.ones(df_length - max_lags - n_forecasts + 1, dtype=bool)
+        end_pad = np.zeros(n_forecasts, dtype=bool)
+        target_start_end_mask = np.concatenate((start_pad, valid_targets, end_pad), axis=None)
+    elif max_lags == 0 and n_forecasts == 1:
+        # without lags, forecast targets and origins are identical
+        target_start_end_mask = np.ones(df_length, dtype=bool)
+    else:
+        raise ValueError(f"max_lags value of {max_lags} not supported for n_forecasts {n_forecasts}.")
+    return target_start_end_mask
+
+
+def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequency=None):
+    """Filters prediction origin index from df based on the forecast frequency setting.
+
+    Filters based on the timestamp of the last lag, right before the targets start.
+
+    Parameters
+    ----------
+    prediction_frequency : dict
+        periodic interval in which forecasts should be made.
+    Note
+    ----
+    E.g.
if prediction_frequency=7, forecasts are only made on every 7th step (once in a week in case of daily + resolution). + + Returns boolean mask where prediction origin indexes to be included are True, and the rest False. + """ + # !! IMPORTANT + # TODO: Adjust top level documentation to specify that the filter is applied to prediction ORIGIN, not targets start. + # !! IMPORTANT + + mask = np.ones((len(df),), dtype=bool) + + # Basic case: no filter + if prediction_frequency is None or prediction_frequency == 1: + return mask + + # OLD: timestamps were created from "ds" column in tabularization and then re-converted here + # timestamps = pd.to_datetime([x["timestamps"][0] for x in df]) + # OR + # timestamps = df["timestamps"].apply(lambda x: pd.to_datetime(x[0])) + + timestamps = pd.to_datetime(df.loc[:, "ds"].values) + filter_masks = [] + for key, value in prediction_frequency.items(): + if key == "daily-hour": + mask = timestamps.hour == value + elif key == "weekly-day": + mask = timestamps.dayofweek == value + elif key == "monthly-day": + mask = timestamps.day == value + elif key == "yearly-month": + mask = timestamps.month == value + elif key == "hourly-minute": + mask = timestamps.minute == value + else: + raise ValueError(f"Invalid prediction frequency: {key}") + filter_masks.append(mask) + for m in filter_masks: + mask = np.logical_and(mask, m) + return mask + + +def create_nan_mask(df, predict_steps, drop_missing): + """Creates mask for each prediction origin, + accounting for corresponding input lags / forecast targets containing any NaN values. + + Parameters + ---------- + drop_missing : bool + whether to automatically drop missing samples from the data + predict_steps : int + number of steps to predict + """ + # IMPORTANT !! + # TODO implement actual filtering + return np.ones(len(df), dtype=bool) + + # Create index mapping of sample index to df index + # - Filter missing samples (does not actually drop, but creates indexmapping) + # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + # Note: needs to also account for NANs in lagged inputs or in n_forecasts, not just first target. + # Implement a convolutional filter for targets and each lagged regressor. + # Also account for future regressors and events. + + # Rewrite to return mask instead of filtering df: + nan_idx = [] + # NaNs in inputs + for key, data in self.inputs.items(): + if isinstance(data, torch.Tensor): + nans = torch.where(torch.isnan(data))[0].tolist() + if len(nans) > 0: + nan_idx += nans + elif isinstance(data, dict): + for subkey, subdata in data.items(): + nans = torch.where(torch.isnan(subdata))[0].tolist() + if len(nans) > 0: + nan_idx += nans + + # NaNs in targets that are not inserted for prediction at the end + nans = torch.where(torch.isnan(self.targets))[0].tolist() + if len(nans) > 0: + for idx in nans: + if idx not in nan_idx and idx < len(self) - predict_steps: + nan_idx.append(idx) + + nan_idx = list(set(nan_idx)) + nan_idx.sort() + if drop_missing and len(nan_idx) > 0: + log.warning(f"{len(nan_idx)} samples with missing values were dropped from the data. 
") + for key, data in self.inputs.items(): + if key not in ["time", "lags"]: # "time_lagged" + for name, features in data.items(): + self.inputs[key][name] = np.delete(self.inputs[key][name], nan_idx, 0) + else: + self.inputs[key] = np.delete(self.inputs[key], nan_idx, 0) + self.targets = np.delete(self.targets, nan_idx, 0) + self.length = self.inputs["time"].shape[0] + if not drop_missing and len(nan_idx) > 0: + raise ValueError( + "Inputs/targets with missing values detected. " + "Please either adjust imputation parameters, or set 'drop_missing' to True to drop those samples." + ) From dfc60063ada5d5d01d03baa90816ce3904f6ffcb Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 24 Jan 2024 15:24:35 -0800 Subject: [PATCH 034/128] finish events and holidays conversion --- neuralprophet/time_dataset.py | 83 ++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 5a82a56f6..5b3451e30 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -66,21 +66,24 @@ def __init__(self, df, name, **kwargs): # Future TODO: integrate some of these preprocessing steps happening outside? self.df = df + self.df = self.df.reset_index(drop=True) # Future TODO: Is this still necessary post restructuring? self.name = name self.meta = OrderedDict({}) self.meta["df_name"] = self.name self.config_args = kwargs - # TODO: Preprocessing of features (added to self.df) - # - events and holidays: convert date-time occurence dictionary to a column of values in the self.df - # - These will then be later tabularized in __get_item___ - # add events based on configuration to df - self.df = self.df.reset_index(drop=True) + # Preprocessing of events and holidays features (added to self.df) ( self.df, self.additive_event_and_holiday_names, self.multiplicative_event_and_holiday_names, ) = add_event_features_to_df(self.df, self.config_args.config_events, self.config_args.config_country_holidays) + # pre-sort additive/multiplicative regressors + self.additive_regressors_names, self.multiplicative_regressors_names = sort_regressor_names( + self.config_args.config_regressors + ) + + # Construct index map self.sample2index_map, self.length = self.create_sample2index_map(df) def __getitem__(self, index): @@ -118,8 +121,13 @@ def __getitem__(self, index): df_index = self.sample_index_to_df_index(index) # Tabularize - extract features from dataframe at given target index position - inputs, target = tabularize_univariate_datetime_single_index(self.df, origin_index=df_index, **self.config_args) + inputs, target = self.tabularize_univariate_datetime_single_index( + self, self.df, origin_index=df_index, **self.config_args + ) + # ------------------ + # Important! 
TODO: integrate format_sample into tabularize_univariate_datetime_single_index sample, target = self.format_sample(inputs, target) + # -------------------------- return sample, target, self.meta def __len__(self): @@ -557,23 +565,14 @@ def tabularize_univariate_datetime_single_index( # inputs["seasonalities"] = seasonalities # FUTURE REGRESSORS: get the future regressors features + # create numpy array of values of additive and multiplicative regressors, at correct indexes + # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) if config_regressors is not None: - # sort and divide regressors into multiplicative and additive - additive_regressors_names = [] - multiplicative_regressors_names = [] - for reg in sorted(df.columns.tolist()): - if reg in config_regressors: - mode = config_regressors[reg].mode - if mode == "additive": - additive_regressors_names.append(reg) - else: - multiplicative_regressors_names.append(reg) - - # create numpy array of values of additive and multiplicative regressors, at correct indexes - # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) regressors = OrderedDict({}) regressors["additive"] = None regressors["multiplicative"] = None + additive_regressors_names = self.additive_regressors_names + multiplicative_regressors_names = self.multiplicative_regressors_names if max_lags == 0: if len(additive_regressors_names) > 0: regressors["additive"] = np.expand_dims( @@ -711,22 +710,20 @@ def tabularize_univariate_datetime_single_index( # events["multiplicative"] = multiplicative_events # inputs["events"] = events - # ----------- TODO convert to single sample version ---------------------- - # TODO: Postprocessing & Formatting - - tabularized_input_shapes_str = "" - for key, value in inputs.items(): - if key in [ - "seasonalities", - "covariates", - "events", - "regressors", - ]: - for name, period_features in value.items(): - tabularized_input_shapes_str += f" {name} {key} {period_features}\n" - else: - tabularized_input_shapes_str += f" {key} {value.shape} \n" - log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") + # ONLY FOR DEBUGGING + # tabularized_input_shapes_str = "" + # for key, value in inputs.items(): + # if key in [ + # "seasonalities", + # "covariates", + # "events", + # "regressors", + # ]: + # for name, period_features in value.items(): + # tabularized_input_shapes_str += f" {name} {key} {period_features}\n" + # else: + # tabularized_input_shapes_str += f" {key} {value.shape} \n" + # log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") return inputs, targets @@ -1191,3 +1188,19 @@ def create_nan_mask(df, predict_steps, drop_missing): "Inputs/targets with missing values detected. " "Please either adjust imputation parameters, or set 'drop_missing' to True to drop those samples." 
) + + +def sort_regressor_names(config): + additive_regressors_names = [] + multiplicative_regressors_names = [] + if config is not None: + # sort and divide regressors into multiplicative and additive + additive_regressors_names = [] + multiplicative_regressors_names = [] + for reg in sorted(list(config.keys())): + mode = config[reg].mode + if mode == "additive": + additive_regressors_names.append(reg) + else: + multiplicative_regressors_names.append(reg) + return additive_regressors_names, multiplicative_regressors_names From 62c4818497b5941882b83ff78c3bf925c7c1f7ce Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 15:52:58 -0800 Subject: [PATCH 035/128] debug timedataset --- neuralprophet/time_dataset.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 5b3451e30..8927abc45 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -77,10 +77,12 @@ def __init__(self, df, name, **kwargs): self.df, self.additive_event_and_holiday_names, self.multiplicative_event_and_holiday_names, - ) = add_event_features_to_df(self.df, self.config_args.config_events, self.config_args.config_country_holidays) + ) = add_event_features_to_df( + self.df, self.config_args["config_events"], self.config_args["config_country_holidays"] + ) # pre-sort additive/multiplicative regressors self.additive_regressors_names, self.multiplicative_regressors_names = sort_regressor_names( - self.config_args.config_regressors + self.config_args["config_regressors"] ) # Construct index map @@ -122,7 +124,7 @@ def __getitem__(self, index): # Tabularize - extract features from dataframe at given target index position inputs, target = self.tabularize_univariate_datetime_single_index( - self, self.df, origin_index=df_index, **self.config_args + df=self.df, origin_index=df_index, **self.config_args ) # ------------------ # Important! 
TODO: integrate format_sample into tabularize_univariate_datetime_single_index @@ -164,7 +166,9 @@ def create_sample2index_map(self, df): # TODO Create NAN-free index mapping of sample index to df index # analogous to `self.drop_nan_after_init( # self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - nan_mask = create_nan_mask(df) # boolean array where NAN are False + nan_mask = create_nan_mask( + df, self.config_args["predict_steps"], self.config_args["config_missing"].drop_missing + ) # boolean array where NAN are False # Combine masks mask = np.logical_and(prediction_frequency_mask, origin_start_end_mask) From e7b8f0c076e48d54517b361adb38a044d281198d Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 16:22:39 -0800 Subject: [PATCH 036/128] debugging --- neuralprophet/time_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 8927abc45..72b49d12c 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -72,6 +72,13 @@ def __init__(self, df, name, **kwargs): self.meta["df_name"] = self.name self.config_args = kwargs + self.two_level_inputs = [ + "seasonalities", + "covariates", + "events", + "regressors", + ] + # Preprocessing of events and holidays features (added to self.df) ( self.df, From 235eea8043da2e46faad188bea4043f6c834abfd Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 16:26:30 -0800 Subject: [PATCH 037/128] make_country_specific_holidays_df --- neuralprophet/time_dataset.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 72b49d12c..a1eaf1ce1 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -1215,3 +1215,33 @@ def sort_regressor_names(config): else: multiplicative_regressors_names.append(reg) return additive_regressors_names, multiplicative_regressors_names + + +## TODO: move - used elsewhere, not in this file. 
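# For reference, the shape of the mapping this helper builds: holiday name -> list
# of dates. A small sketch using the `holidays` package directly (assuming, as a
# simplification, that get_country_holidays wraps it; "US" and 2024 are arbitrary):
from collections import defaultdict
import pandas as pd
import holidays

by_name = defaultdict(list)
for date, name in holidays.country_holidays("US", years=[2024]).items():
    by_name[name].append(pd.to_datetime(date))
# e.g. by_name["Christmas Day"] == [Timestamp("2024-12-25")]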
+def make_country_specific_holidays_df(year_list, country): + """ + Make dataframe of country specific holidays for given years and countries + Parameters + ---------- + year_list : list + List of years + country : str, list + List of country names + Returns + ------- + pd.DataFrame + Containing country specific holidays df with columns 'ds' and 'holiday' + """ + # iterate over countries and get holidays for each country + # convert to list if not already + if isinstance(country, str): + country = [country] + country_specific_holidays = {} + for single_country in country: + single_country_specific_holidays = get_country_holidays(single_country, year_list) + # only add holiday if it is not already in the dict + country_specific_holidays.update(single_country_specific_holidays) + country_specific_holidays_dict = defaultdict(list) + for date, holiday in country_specific_holidays.items(): + country_specific_holidays_dict[holiday].append(pd.to_datetime(date)) + return country_specific_holidays_dict From 02ff9bb33bf7c49d1f02d7ed9b2bfb46ba4c88c4 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 16:35:48 -0800 Subject: [PATCH 038/128] remove uses of df.loc[...].values --- neuralprophet/time_dataset.py | 63 +++++++++-------------------------- 1 file changed, 16 insertions(+), 47 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index a1eaf1ce1..f8902e331 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -344,7 +344,7 @@ def tabularize_univariate_datetime_single_index( # targets = np.empty_like(time[:, n_lags:]) # targets = np.nan_to_num(targets) else: - targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"].values + targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"] targets = np.expand_dims(targets, axis=1) ## Alternative # x = df["y_scaled"].values @@ -366,12 +366,12 @@ def tabularize_univariate_datetime_single_index( # TIME: the time at each sample's lags and forecasts if max_lags == 0: - inputs["time"] = df.loc[origin_index, "t"].values + inputs["time"] = df.loc[origin_index, "t"] # TODO: Possibly need extra dim? 
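# The "extra dim" question above concerns scalar vs. vector shapes: with
# max_lags == 0, a single origin yields a scalar t, whereas batched downstream
# code may expect at least one axis. A quick illustration of what np.expand_dims
# would do here (values are arbitrary):
import numpy as np

t_scalar = np.float32(0.5)
print(np.expand_dims(t_scalar, 0).shape)         # (1,)   scalar promoted to 1-d
print(np.expand_dims(np.array([0.5]), 1).shape)  # (1, 1) extra trailing axis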
# inputs["time"] = np.expand_dims(inputs["time"], 1) else: # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index - inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"].values + inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"] ## OLD: Time # def _stride_time_features_for_forecasts(x): # window_size = n_lags + n_forecasts @@ -390,7 +390,7 @@ def tabularize_univariate_datetime_single_index( # LAGS: From y-series, extract preceeding n_lags steps up to and including origin_index if n_lags >= 1 and "y" in df.columns: # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) - inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values + inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"] # OLD Lags # def _stride_lagged_features(df_col_name, feature_dims): # # only for case where max_lags > 0 @@ -410,9 +410,7 @@ def tabularize_univariate_datetime_single_index( if lagged_reg in config_lagged_regressors: assert config_lagged_regressors[lagged_reg].n_lags > 0 covar_lags = config_lagged_regressors[lagged_reg].n_lags - lagged_regressors[lagged_reg] = df.loc[ - origin_index - covar_lags + 1 : origin_index + 1, lagged_reg - ].values + lagged_regressors[lagged_reg] = df.loc[origin_index - covar_lags + 1 : origin_index + 1, lagged_reg] inputs["covariates"] = lagged_regressors # OLD Covariates # def _stride_lagged_features(df_col_name, feature_dims): @@ -586,18 +584,16 @@ def tabularize_univariate_datetime_single_index( multiplicative_regressors_names = self.multiplicative_regressors_names if max_lags == 0: if len(additive_regressors_names) > 0: - regressors["additive"] = np.expand_dims( - df.loc[origin_index, additive_regressors_names].values, axis=0 - ) + regressors["additive"] = np.expand_dims(df.loc[origin_index, additive_regressors_names], axis=0) if len(multiplicative_regressors_names) > 0: regressors["multiplicative"] = np.expand_dims( - df.loc[origin_index, multiplicative_regressors_names].values, axis=0 + df.loc[origin_index, multiplicative_regressors_names], axis=0 ) else: if len(additive_regressors_names) > 0: regressors_add_future_window = df.loc[ origin_index + 1 : origin_index + 1 + n_forecasts, additive_regressors_names - ].values + ] regressors["additive"] = np.expand_dims(regressors_add_future_window, axis=0) ## OLD # additive_regressor_feature_windows = [] @@ -622,7 +618,7 @@ def tabularize_univariate_datetime_single_index( if len(multiplicative_regressors_names) > 0: regressors_mul_future_window = df.loc[ origin_index + 1 : origin_index + 1 + n_forecasts, multiplicative_regressors_names - ].values + ] regressors["multiplicative"] = np.expand_dims(regressors_mul_future_window, axis=0) inputs["regressors"] = regressors @@ -669,23 +665,21 @@ def tabularize_univariate_datetime_single_index( events["multiplicative"] = None if max_lags == 0: if len(self.additive_event_and_holiday_names) > 0: - events["additive"] = np.expand_dims( - df.loc[origin_index, self.additive_event_and_holiday_names].values, axis=0 - ) + events["additive"] = np.expand_dims(df.loc[origin_index, self.additive_event_and_holiday_names], axis=0) if len(self.multiplicative_event_and_holiday_names) > 0: events["multiplicative"] = np.expand_dims( - df.loc[origin_index, self.multiplicative_event_and_holiday_names].values, axis=0 + df.loc[origin_index, 
self.multiplicative_event_and_holiday_names], axis=0 ) else: if len(self.additive_event_and_holiday_names) > 0: events_add_future_window = df.loc[ origin_index + 1 : origin_index + 1 + n_forecasts, self.additive_event_and_holiday_names - ].values + ] events["additive"] = np.expand_dims(events_add_future_window, axis=0) if len(self.multiplicative_event_and_holiday_names) > 0: events_mul_future_window = df.loc[ origin_index + 1 : origin_index + 1 + n_forecasts, self.multiplicative_event_and_holiday_names - ].values + ] events["multiplicative"] = np.expand_dims(events_mul_future_window, axis=0) inputs["events"] = events @@ -1118,7 +1112,7 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen # OR # timestamps = df["timestamps"].apply(lambda x: pd.to_datetime(x[0])) - timestamps = pd.to_datetime(df.loc[:, "ds"].values) + timestamps = pd.to_datetime(df.loc[:, "ds"]) filter_masks = [] for key, value in prediction_frequency.items(): if key == "daily-hour": @@ -1217,31 +1211,6 @@ def sort_regressor_names(config): return additive_regressors_names, multiplicative_regressors_names -## TODO: move - used elsewhere, not in this file. +## TODO: rename - used elsewhere, not in this file. def make_country_specific_holidays_df(year_list, country): - """ - Make dataframe of country specific holidays for given years and countries - Parameters - ---------- - year_list : list - List of years - country : str, list - List of country names - Returns - ------- - pd.DataFrame - Containing country specific holidays df with columns 'ds' and 'holiday' - """ - # iterate over countries and get holidays for each country - # convert to list if not already - if isinstance(country, str): - country = [country] - country_specific_holidays = {} - for single_country in country: - single_country_specific_holidays = get_country_holidays(single_country, year_list) - # only add holiday if it is not already in the dict - country_specific_holidays.update(single_country_specific_holidays) - country_specific_holidays_dict = defaultdict(list) - for date, holiday in country_specific_holidays.items(): - country_specific_holidays_dict[holiday].append(pd.to_datetime(date)) - return country_specific_holidays_dict + return make_country_specific_holidays_dict(year_list, country) From 7fda18d55a42d795a67840b8b2b5027ea1d6c93a Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 16:41:40 -0800 Subject: [PATCH 039/128] debug time --- neuralprophet/time_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index f8902e331..723426405 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -66,7 +66,7 @@ def __init__(self, df, name, **kwargs): # Future TODO: integrate some of these preprocessing steps happening outside? self.df = df - self.df = self.df.reset_index(drop=True) # Future TODO: Is this still necessary post restructuring? 
+ self.df = self.df.reset_index(drop=True) # Needed for index based operations in __get_item__ self.name = name self.meta = OrderedDict({}) self.meta["df_name"] = self.name @@ -435,7 +435,6 @@ def tabularize_univariate_datetime_single_index( dates = df.loc[origin_index, "ds"] else: dates = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"] - assert len(dates.shape) == 1 # Seasonality features for name, period in config_seasonality.periods.items(): if period.resolution > 0: From 621e701ce4a27c655cd16904864cabbf95c72ef3 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 16:46:40 -0800 Subject: [PATCH 040/128] debugging types --- neuralprophet/time_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 723426405..3c9b99556 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -344,7 +344,7 @@ def tabularize_univariate_datetime_single_index( # targets = np.empty_like(time[:, n_lags:]) # targets = np.nan_to_num(targets) else: - targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"] + targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"].values targets = np.expand_dims(targets, axis=1) ## Alternative # x = df["y_scaled"].values From c62f3320e8e5a572aa2a0c6f902baafca61f1df3 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 21:16:47 -0800 Subject: [PATCH 041/128] debug timedata --- .gitignore | 1 + neuralprophet/time_dataset.py | 57 ++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 0cb6e7b98..bab9645aa 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ tests/metrics/*.svg .vscode/launch.json .vscode/settings.json source/ +debug* # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 3c9b99556..455aa9dde 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -366,9 +366,8 @@ def tabularize_univariate_datetime_single_index( # TIME: the time at each sample's lags and forecasts if max_lags == 0: - inputs["time"] = df.loc[origin_index, "t"] - # TODO: Possibly need extra dim? - # inputs["time"] = np.expand_dims(inputs["time"], 1) + # inputs["time"] = df.loc[origin_index, "t"] + inputs["time"] = np.expand_dims(df.loc[origin_index, "t"], 0) else: # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"] @@ -441,9 +440,7 @@ def tabularize_univariate_datetime_single_index( if config_seasonality.computation == "fourier": # Compute Fourier series components with the specified frequency and order. 
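# Self-contained sketch of this Fourier feature construction: for a period p (in
# days) and resolution k, stack sin and cos terms of increasing frequency over t
# expressed in days. The sample values and the yearly period/resolution are
# arbitrary examples; the exact column ordering in the real code may differ.
import numpy as np

t = np.array([0.0, 10.0, 100.0, 200.0])  # days since a reference epoch
period, resolution = 365.25, 3
features = np.column_stack(
    [np.sin(2.0 * (i + 1) * np.pi * t / period) for i in range(resolution)]
    + [np.cos(2.0 * (i + 1) * np.pi * t / period) for i in range(resolution)]
)
print(features.shape)  # (4, 6): 2 * resolution feature columns per timestamp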
# convert to days since epoch - t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / ( - 3600 * 24.0 - ) + t = np.array((dates - datetime(1900, 1, 1)).total_seconds()) / (3600 * 24.0) # features: Matrix with dims (length len(dates), 2*resolution) features = np.column_stack( [np.sin((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] @@ -659,28 +656,32 @@ def tabularize_univariate_datetime_single_index( # FUTURE EVENTS: get the events features # create numpy array of values of additive and multiplicative events, at correct indexes # features dims: (n_samples/batch, n_forecasts, n_features/n_events) - events = OrderedDict({}) - events["additive"] = None - events["multiplicative"] = None - if max_lags == 0: - if len(self.additive_event_and_holiday_names) > 0: - events["additive"] = np.expand_dims(df.loc[origin_index, self.additive_event_and_holiday_names], axis=0) - if len(self.multiplicative_event_and_holiday_names) > 0: - events["multiplicative"] = np.expand_dims( - df.loc[origin_index, self.multiplicative_event_and_holiday_names], axis=0 - ) - else: - if len(self.additive_event_and_holiday_names) > 0: - events_add_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, self.additive_event_and_holiday_names - ] - events["additive"] = np.expand_dims(events_add_future_window, axis=0) - if len(self.multiplicative_event_and_holiday_names) > 0: - events_mul_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, self.multiplicative_event_and_holiday_names - ] - events["multiplicative"] = np.expand_dims(events_mul_future_window, axis=0) - inputs["events"] = events + any_events = 0 < len(self.additive_event_and_holiday_names + self.multiplicative_event_and_holiday_names) + if any_events: + events = OrderedDict({}) + events["additive"] = None + events["multiplicative"] = None + if max_lags == 0: + if len(self.additive_event_and_holiday_names) > 0: + events["additive"] = np.expand_dims( + df.loc[origin_index, self.additive_event_and_holiday_names], axis=0 + ) + if len(self.multiplicative_event_and_holiday_names) > 0: + events["multiplicative"] = np.expand_dims( + df.loc[origin_index, self.multiplicative_event_and_holiday_names], axis=0 + ) + else: + if len(self.additive_event_and_holiday_names) > 0: + events_add_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, self.additive_event_and_holiday_names + ] + events["additive"] = np.expand_dims(events_add_future_window, axis=0) + if len(self.multiplicative_event_and_holiday_names) > 0: + events_mul_future_window = df.loc[ + origin_index + 1 : origin_index + 1 + n_forecasts, self.multiplicative_event_and_holiday_names + ] + events["multiplicative"] = np.expand_dims(events_mul_future_window, axis=0) + inputs["events"] = events ## OLD # # get the events features From 54edbf491ff8a4ab600101c3e6aab431eb6d74cd Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 25 Jan 2024 21:50:42 -0800 Subject: [PATCH 042/128] debugging time_dataset variable shapes --- neuralprophet/time_dataset.py | 90 +++++++++++++++++++++------------ tests/test_model_performance.py | 7 ++- 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 455aa9dde..1c54ab606 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -232,16 +232,14 @@ def format_sample(self, inputs, targets=None): # if key == "timestamps": sample_input[key] = data # else: sample_input[key] 
= torch.from_numpy(data).type(inputs_dtype[key]) sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) - sample_input = self._split_nested_dict(sample_input) # TODO Can this be skipped for a single sample? - # TODO Can this be optimized? + # Alternatively, Can this be optimized? # Split nested dict into list of dicts with same keys as sample_input. - def split_dict(sample_input, index): - return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in sample_input.items()} - - length = next(iter(sample_input.values())).shape[0] - sample_input = [split_dict(sample_input, i) for i in range(length)] + # def split_dict(sample_input, index): + # return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in sample_input.items()} + # length = next(iter(sample_input.values())).shape[0] + # sample_input = [split_dict(sample_input, i) for i in range(length)] ## timestamps should no longer be present here? # sample_input.pop("timestamps") # Exact timestamps are not needed anymore @@ -332,7 +330,7 @@ def tabularize_univariate_datetime_single_index( # n_samples = len(df) - max_lags + 1 - n_forecasts if predict_mode: - targets = np.zeros((1, n_forecasts)) + targets = np.zeros((1, n_forecasts), dtype=np.float32) ## OLD # # time is the time at each forecast step # t = df.loc[:, "t"].values @@ -345,7 +343,7 @@ def tabularize_univariate_datetime_single_index( # targets = np.nan_to_num(targets) else: targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"].values - targets = np.expand_dims(targets, axis=1) + targets = np.expand_dims(np.array(targets, dtype=np.float32), axis=0) ## Alternative # x = df["y_scaled"].values # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype) @@ -370,7 +368,7 @@ def tabularize_univariate_datetime_single_index( inputs["time"] = np.expand_dims(df.loc[origin_index, "t"], 0) else: # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index - inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"] + inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"].values ## OLD: Time # def _stride_time_features_for_forecasts(x): # window_size = n_lags + n_forecasts @@ -389,7 +387,9 @@ def tabularize_univariate_datetime_single_index( # LAGS: From y-series, extract preceeding n_lags steps up to and including origin_index if n_lags >= 1 and "y" in df.columns: # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) - inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"] + inputs["lags"] = np.array( + df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32 + ) # OLD Lags # def _stride_lagged_features(df_col_name, feature_dims): # # only for case where max_lags > 0 @@ -409,7 +409,9 @@ def tabularize_univariate_datetime_single_index( if lagged_reg in config_lagged_regressors: assert config_lagged_regressors[lagged_reg].n_lags > 0 covar_lags = config_lagged_regressors[lagged_reg].n_lags - lagged_regressors[lagged_reg] = df.loc[origin_index - covar_lags + 1 : origin_index + 1, lagged_reg] + lagged_regressors[lagged_reg] = df.loc[ + origin_index - covar_lags + 1 : origin_index + 1, lagged_reg + ].values inputs["covariates"] = lagged_regressors # OLD Covariates # def _stride_lagged_features(df_col_name, feature_dims): @@ -431,16 +433,18 @@ def 
tabularize_univariate_datetime_single_index( if config_seasonality is not None: seasonalities = OrderedDict({}) if max_lags == 0: - dates = df.loc[origin_index, "ds"] + dates = pd.Series(df.loc[origin_index, "ds"]) else: - dates = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"] + dates = pd.Series(df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"]) # Seasonality features for name, period in config_seasonality.periods.items(): if period.resolution > 0: if config_seasonality.computation == "fourier": # Compute Fourier series components with the specified frequency and order. # convert to days since epoch - t = np.array((dates - datetime(1900, 1, 1)).total_seconds()) / (3600 * 24.0) + t = np.array((dates - datetime(1900, 1, 1)).dt.total_seconds().astype(np.float32)) / ( + 3600 * 24.0 + ) # features: Matrix with dims (length len(dates), 2*resolution) features = np.column_stack( [np.sin((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] @@ -572,23 +576,24 @@ def tabularize_univariate_datetime_single_index( # FUTURE REGRESSORS: get the future regressors features # create numpy array of values of additive and multiplicative regressors, at correct indexes # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) - if config_regressors is not None: + any_future_regressors = 0 < len(self.additive_regressors_names + self.multiplicative_regressors_names) + if any_future_regressors: # if config_regressors is not None: regressors = OrderedDict({}) - regressors["additive"] = None - regressors["multiplicative"] = None - additive_regressors_names = self.additive_regressors_names - multiplicative_regressors_names = self.multiplicative_regressors_names + # regressors["additive"] = None + # regressors["multiplicative"] = None if max_lags == 0: - if len(additive_regressors_names) > 0: - regressors["additive"] = np.expand_dims(df.loc[origin_index, additive_regressors_names], axis=0) - if len(multiplicative_regressors_names) > 0: + if len(self.additive_regressors_names) > 0: + regressors["additive"] = np.expand_dims( + df.loc[origin_index, self.additive_regressors_names], axis=0 + ) + if len(self.multiplicative_regressors_names) > 0: regressors["multiplicative"] = np.expand_dims( - df.loc[origin_index, multiplicative_regressors_names], axis=0 + df.loc[origin_index, self.multiplicative_regressors_names], axis=0 ) else: - if len(additive_regressors_names) > 0: + if len(self.additive_regressors_names) > 0: regressors_add_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, additive_regressors_names + origin_index + 1 : origin_index + 1 + n_forecasts, self.additive_regressors_names ] regressors["additive"] = np.expand_dims(regressors_add_future_window, axis=0) ## OLD @@ -611,9 +616,9 @@ def tabularize_univariate_datetime_single_index( # additive_regressor_feature_windows.append(stride) # additive_regressors = np.dstack(additive_regressor_feature_windows) # regressors["additive"] = additive_regressors - if len(multiplicative_regressors_names) > 0: + if len(self.multiplicative_regressors_names) > 0: regressors_mul_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, multiplicative_regressors_names + origin_index + 1 : origin_index + 1 + n_forecasts, self.multiplicative_regressors_names ] regressors["multiplicative"] = np.expand_dims(regressors_mul_future_window, axis=0) inputs["regressors"] = regressors @@ -659,8 +664,8 @@ def tabularize_univariate_datetime_single_index( any_events 
= 0 < len(self.additive_event_and_holiday_names + self.multiplicative_event_and_holiday_names) if any_events: events = OrderedDict({}) - events["additive"] = None - events["multiplicative"] = None + # events["additive"] = None + # events["multiplicative"] = None if max_lags == 0: if len(self.additive_event_and_holiday_names) > 0: events["additive"] = np.expand_dims( @@ -1211,6 +1216,25 @@ def sort_regressor_names(config): return additive_regressors_names, multiplicative_regressors_names -## TODO: rename - used elsewhere, not in this file. -def make_country_specific_holidays_df(year_list, country): - return make_country_specific_holidays_dict(year_list, country) +# ## TODO: rename - used elsewhere, not in this file. +# def make_country_specific_holidays_df(year_list, country): +# return make_country_specific_holidays_dict(year_list, country) + + +# def split_nested_dict(inputs): +# """Split nested dict into list of dicts. +# Parameters +# ---------- +# inputs : ordered dict +# Nested dict to be split. +# Returns +# ------- +# list of dicts +# List of dicts with same keys as inputs. +# """ + +# def split_dict(inputs, index): +# return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in inputs.items()} + +# length = next(iter(inputs.values())).shape[0] +# return [split_dict(inputs, i) for i in range(length)] diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 6a519fe03..37d623c23 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -146,6 +146,7 @@ def test_PeytonManning(): def test_YosemiteTemps(): df = pd.read_csv(YOS_FILE) m = NeuralProphet( + learning_rate=0.01, n_lags=36, n_forecasts=12, changepoints_range=0.9, @@ -171,7 +172,10 @@ def test_YosemiteTemps(): def test_AirPassengers(): df = pd.read_csv(AIR_FILE) - m = NeuralProphet(seasonality_mode="multiplicative") + m = NeuralProphet( + learning_rate=0.01, + seasonality_mode="multiplicative", + ) df_train, df_test = m.split_df(df=df, freq="MS", valid_p=0.1) system_speed, std = get_system_speed() @@ -194,6 +198,7 @@ def test_EnergyPriceDaily(): df["temp"] = df["temperature"] m = NeuralProphet( + learning_rate=0.01, n_forecasts=7, n_changepoints=0, yearly_seasonality=True, From 4629bf41d72a47039bdc15bde6443e48b7cab4e2 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 26 Jan 2024 15:44:53 -0800 Subject: [PATCH 043/128] address indexing and slicing issues, .loc --- neuralprophet/configure.py | 7 +- neuralprophet/data/process.py | 7 +- neuralprophet/df_utils.py | 6 +- neuralprophet/time_dataset.py | 148 +++++++++++++++++++--------------- neuralprophet/time_net.py | 2 +- 5 files changed, 96 insertions(+), 74 deletions(-) diff --git a/neuralprophet/configure.py b/neuralprophet/configure.py index 57ef0c301..0c9c6458e 100644 --- a/neuralprophet/configure.py +++ b/neuralprophet/configure.py @@ -41,10 +41,9 @@ def init_data_params( config_events: Optional[ConfigEvents] = None, config_seasonality: Optional[ConfigSeasonality] = None, ): - if len(df["ID"].unique()) == 1: - if not self.global_normalization: - log.info("Setting normalization to global as only one dataframe provided for training.") - self.global_normalization = True + if len(df["ID"].unique()) == 1 and not self.global_normalization: + log.info("Setting normalization to global as only one dataframe provided for training.") + self.global_normalization = True self.local_data_params, self.global_data_params = df_utils.init_data_params( df=df, normalize=self.normalize, diff --git 
a/neuralprophet/data/process.py b/neuralprophet/data/process.py index f3e44f9bb..6899496fc 100644 --- a/neuralprophet/data/process.py +++ b/neuralprophet/data/process.py @@ -345,6 +345,11 @@ def _validate_column_name( "extra_regressors_multiplicative", "multiplicative_terms", "ID", + "y_scaled", + "ds", + "t", + "y", + "index", ] rn_l = [n + "_lower" for n in reserved_names] rn_u = [n + "_upper" for n in reserved_names] @@ -495,7 +500,7 @@ def _handle_missing_data( df_grouped = df.groupby("ID").apply(lambda x: x.set_index("ds").resample(freq).asfreq()).drop(columns=["ID"]) n_missing_dates = len(df_grouped) - len(df) if n_missing_dates > 0: - df = df_grouped.reset_index() + df = df_grouped.reset_index(drop=True) log.info(f"Added {n_missing_dates} missing dates.") if config_regressors is not None: diff --git a/neuralprophet/df_utils.py b/neuralprophet/df_utils.py index 8a4c4dcb5..fcd12d1f4 100644 --- a/neuralprophet/df_utils.py +++ b/neuralprophet/df_utils.py @@ -1053,7 +1053,7 @@ def add_missing_dates_nan(df, freq): df_resampled = df.resample(freq).asfreq() if "ID" in df.columns: df_resampled["ID"].fillna(df["ID"].iloc[0], inplace=True) - df_resampled.reset_index(inplace=True) + df_resampled.reset_index(drop=True, inplace=True) num_added = len(df_resampled) - len(df) return df_resampled, num_added @@ -1534,10 +1534,10 @@ def drop_missing_from_df(df, drop_missing, predict_steps, n_lags): if all_nan_idx[i + 1] - all_nan_idx[i] > 1: break # drop NaN window - df = df.drop(df.index[window[0] : window[-1] + 1]).reset_index().drop("index", axis=1) + df = df.drop(df.index[window[0] : window[-1] + 1]).reset_index(drop=True) # drop lagged values if window does not occur at the beginning of df if window[0] - (n_lags - 1) >= 0: - df = df.drop(df.index[(window[0] - (n_lags - 1)) : window[0]]).reset_index().drop("index", axis=1) + df = df.drop(df.index[(window[0] - (n_lags - 1)) : window[0]]).reset_index(drop=True) return df diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 1c54ab606..46ce99b5f 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -15,29 +15,6 @@ log = logging.getLogger("NP.time_dataset") -class GlobalTimeDataset(Dataset): - def __init__(self, df, **kwargs): - """Initialize Timedataset from time-series df. - Parameters - ---------- - df : pd.DataFrame - dataframe containing column ``ds``, ``y``, and optionally``ID`` and - normalized columns normalized columns ``ds``, ``y``, ``t``, ``y_scaled`` - **kwargs : dict - Identical to :meth:`tabularize_univariate_datetime` - """ - # # TODO (future): vectorize - timedatasets = [TimeDataset(df_i, df_name, **kwargs) for df_name, df_i in df.groupby("ID")] - self.combined_timedataset = [item for timedataset in timedatasets for item in timedataset] - self.length = sum(timedataset.length for timedataset in timedatasets) - - def __len__(self): - return self.length - - def __getitem__(self, idx): - return self.combined_timedataset[idx] - - class TimeDataset(Dataset): """Create a PyTorch dataset of a tabularized time-series""" @@ -65,11 +42,11 @@ def __init__(self, df, name, **kwargs): # ->_create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset # Future TODO: integrate some of these preprocessing steps happening outside? 
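# Why the reset_index in the change below matters: __getitem__ addresses rows via
# an integer origin_index through df.loc / df.at, which select by *label*. A
# filtered DataFrame keeps its old labels, so label- and position-based access
# diverge unless the index is reset to a clean RangeIndex. A toy illustration
# (data and threshold are arbitrary):
import pandas as pd

df_toy = pd.DataFrame({"y": [10, 11, 12, 13]})
filtered = df_toy[df_toy["y"] > 11]   # keeps original labels 2 and 3
# filtered.loc[0] would raise a KeyError -- label 0 no longer exists
clean = filtered.reset_index(drop=True)
print(clean.loc[0, "y"])              # 12 -- labels align with positions again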
- self.df = df - self.df = self.df.reset_index(drop=True) # Needed for index based operations in __get_item__ - self.name = name + self.df = df.reset_index(drop=True) # Needed for index based operations in __get_item__ + if "index" in list(self.df.columns): # should not be the case + self.df = self.df.drop("index", axis=1) self.meta = OrderedDict({}) - self.meta["df_name"] = self.name + self.meta["df_name"] = name self.config_args = kwargs self.two_level_inputs = [ @@ -211,7 +188,7 @@ def format_sample(self, inputs, targets=None): } targets_dtype = torch.float - sample_target = torch.from_numpy(targets).type(targets_dtype).unsqueeze(dim=2) + sample_target = torch.from_numpy(targets).type(targets_dtype) for key, data in inputs.items(): if key in self.two_level_inputs: @@ -330,7 +307,9 @@ def tabularize_univariate_datetime_single_index( # n_samples = len(df) - max_lags + 1 - n_forecasts if predict_mode: - targets = np.zeros((1, n_forecasts), dtype=np.float32) + # targets = np.zeros((1, n_forecasts), dtype=np.float32) + targets = np.zeros(n_forecasts, dtype=np.float32) + ## OLD # # time is the time at each forecast step # t = df.loc[:, "t"].values @@ -342,42 +321,52 @@ def tabularize_univariate_datetime_single_index( # targets = np.empty_like(time[:, n_lags:]) # targets = np.nan_to_num(targets) else: - targets = df.loc[origin_index + 1 : origin_index + 1 + n_forecasts, "y_scaled"].values - targets = np.expand_dims(np.array(targets, dtype=np.float32), axis=0) - ## Alternative - # x = df["y_scaled"].values - # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype) - ## OLD - # # time is the time at each forecast step - # t = df.loc[:, "t"].values - # if max_lags == 0: - # time = np.expand_dims(t, 1) - # else: - # time = _stride_time_features_for_forecasts(t) - # inputs["time"] = time # contains n_lags + n_forecasts - # def _stride_future_time_features_for_forecasts(x): - # return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) - # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values) + if n_forecasts == 1: + if max_lags == 0: + targets = df.at[origin_index, "y_scaled"] + if max_lags > 0: + targets = df.at[origin_index + 1, "y_scaled"] + else: + # Note: df.loc is inclusive of slice end, while df.iloc is not. 
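# The inclusivity note above, made concrete on a RangeIndex (toy data):
import pandas as pd

frame = pd.DataFrame({"y_scaled": range(10)})
print(len(frame.loc[2:5, "y_scaled"]))  # 4 rows: a label slice includes its end
print(len(frame.iloc[2:5]))             # 3 rows: a positional slice excludes it
# Hence .loc[origin_index + 1 : origin_index + n_forecasts] selects exactly
# n_forecasts rows, where .iloc would need an end of origin_index + n_forecasts + 1.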
+ targets = df.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values + # targets = np.array(targets, dtype=np.float32) # optional + + ## Alternative 1 + # targets = df.loc[:, "y_scaled"].iloc[origin_index + 1 : origin_index + 1 + n_forecasts].values + # targets = np.expand_dims(np.array(targets, dtype=np.float32), axis=0) + ## Alternative 2 + # x = df["y_scaled"].values + # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype) + ## OLD + # # time is the time at each forecast step + # t = df.loc[:, "t"].values + # if max_lags == 0: + # time = np.expand_dims(t, 1) + # else: + # time = _stride_time_features_for_forecasts(t) + # inputs["time"] = time # contains n_lags + n_forecasts + # def _stride_future_time_features_for_forecasts(x): + # return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) + # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values) # data is stored in OrderedDict inputs = OrderedDict({}) # TIME: the time at each sample's lags and forecasts if max_lags == 0: - # inputs["time"] = df.loc[origin_index, "t"] - inputs["time"] = np.expand_dims(df.loc[origin_index, "t"], 0) + # inputs["time"] = np.expand_dims(df.at[origin_index, "t"], 0) + inputs["time"] = df.at[origin_index, "t"] else: # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index - inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "t"].values + # Note: df.loc is inclusive of slice end, while df.iloc is not. + inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values ## OLD: Time # def _stride_time_features_for_forecasts(x): # window_size = n_lags + n_forecasts - # if x.ndim == 1: # shape = (n_samples, window_size) # else: # shape = (n_samples, window_size) + x.shape[1:] - # stride = x.strides[0] # strides = (stride, stride) + x.strides[1:] # start_index = max_lags - n_lags @@ -385,11 +374,10 @@ def tabularize_univariate_datetime_single_index( # inputs["time"] = _stride_time_features_for_forecasts(df.loc[:, "t"].values) # LAGS: From y-series, extract preceeding n_lags steps up to and including origin_index - if n_lags >= 1 and "y" in df.columns: - # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32) - inputs["lags"] = np.array( - df.loc[origin_index - n_lags + 1 : origin_index + 1, "y_scaled"].values, dtype=np.float32 - ) + if n_lags >= 1 and "y_scaled" in df.columns: + # Note: df.loc is inclusive of slice end, while df.iloc is not. + # inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values, dtype=np.float32) + inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values # OLD Lags # def _stride_lagged_features(df_col_name, feature_dims): # # only for case where max_lags > 0 @@ -407,10 +395,11 @@ def tabularize_univariate_datetime_single_index( # Future TODO: optimize this computation for many lagged_regressors for lagged_reg in df.columns: if lagged_reg in config_lagged_regressors: - assert config_lagged_regressors[lagged_reg].n_lags > 0 covar_lags = config_lagged_regressors[lagged_reg].n_lags + assert covar_lags > 0 + # Note: df.loc is inclusive of slice end, while df.iloc is not. 
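# Index arithmetic for one prediction origin, shared by the lag and covariate
# windows here (n_lags=3, n_forecasts=2, origin_index=10 are arbitrary examples):
n_lags, n_forecasts, origin_index = 3, 2, 10
lag_rows = list(range(origin_index - n_lags + 1, origin_index + 1))           # [8, 9, 10]
target_rows = list(range(origin_index + 1, origin_index + n_forecasts + 1))   # [11, 12]
# Lags end at and include the origin; targets start right after it, so the two
# inclusive .loc slices never overlap.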
lagged_regressors[lagged_reg] = df.loc[ - origin_index - covar_lags + 1 : origin_index + 1, lagged_reg + origin_index - covar_lags + 1 : origin_index, lagged_reg ].values inputs["covariates"] = lagged_regressors # OLD Covariates @@ -433,9 +422,10 @@ def tabularize_univariate_datetime_single_index( if config_seasonality is not None: seasonalities = OrderedDict({}) if max_lags == 0: - dates = pd.Series(df.loc[origin_index, "ds"]) + dates = pd.Series(df.at[origin_index, "ds"]) else: - dates = pd.Series(df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts + 1, "ds"]) + # Note: df.loc is inclusive of slice end, while df.iloc is not. + dates = pd.Series(df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "ds"].values) # Seasonality features for name, period in config_seasonality.periods.items(): if period.resolution > 0: @@ -463,10 +453,9 @@ def tabularize_univariate_datetime_single_index( if period.condition_name is not None: # multiply seasonality features with condition mask/values features = features * df[period.condition_name].values[:, np.newaxis] - seasonalities[name] = features # TODO: Possibly need extra dim? - # seasonalities[name] = np.expand_dims(seasonalities[name], 1) + # seasonalities[name] = np.expand_dims(seasonalities[name], 0) inputs["seasonalities"] = seasonalities ## OLD Seasonality @@ -593,7 +582,7 @@ def tabularize_univariate_datetime_single_index( else: if len(self.additive_regressors_names) > 0: regressors_add_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, self.additive_regressors_names + origin_index + 1 : origin_index + n_forecasts, self.additive_regressors_names ] regressors["additive"] = np.expand_dims(regressors_add_future_window, axis=0) ## OLD @@ -618,7 +607,7 @@ def tabularize_univariate_datetime_single_index( # regressors["additive"] = additive_regressors if len(self.multiplicative_regressors_names) > 0: regressors_mul_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, self.multiplicative_regressors_names + origin_index + 1 : origin_index + n_forecasts, self.multiplicative_regressors_names ] regressors["multiplicative"] = np.expand_dims(regressors_mul_future_window, axis=0) inputs["regressors"] = regressors @@ -678,12 +667,12 @@ def tabularize_univariate_datetime_single_index( else: if len(self.additive_event_and_holiday_names) > 0: events_add_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, self.additive_event_and_holiday_names + origin_index + 1 : origin_index + n_forecasts, self.additive_event_and_holiday_names ] events["additive"] = np.expand_dims(events_add_future_window, axis=0) if len(self.multiplicative_event_and_holiday_names) > 0: events_mul_future_window = df.loc[ - origin_index + 1 : origin_index + 1 + n_forecasts, self.multiplicative_event_and_holiday_names + origin_index + 1 : origin_index + n_forecasts, self.multiplicative_event_and_holiday_names ] events["multiplicative"] = np.expand_dims(events_mul_future_window, axis=0) inputs["events"] = events @@ -738,6 +727,35 @@ def tabularize_univariate_datetime_single_index( return inputs, targets +class GlobalTimeDataset(TimeDataset): + def __init__(self, df, **kwargs): + """Initialize Timedataset from time-series df. 
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe containing column ``ds``, ``y``, and optionally ``ID``, plus
+            normalized columns ``ds``, ``y``, ``t``, ``y_scaled``
+        **kwargs : dict
+            Identical to :meth:`tabularize_univariate_datetime`
+        """
+        df_names = list(np.unique(df.loc[:, "ID"].values))
+        if len(df_names) == 1:
+            super().__init__(df, df_names[0], **kwargs)
+        else:
+            raise NotImplementedError
+            # TODO: re-implement with JIT sample computation in TimeDataset
+            # # TODO (future): vectorize
+            # timedatasets = [TimeDataset(df_i, df_name, **kwargs) for df_name, df_i in df.groupby("ID")]
+            # self.combined_timedataset = [item for timedataset in timedatasets for item in timedataset]
+            # self.length = sum(timedataset.length for timedataset in timedatasets)
+
+    # def __len__(self):
+    #     return self.length
+
+    # def __getitem__(self, idx):
+    #     return self.combined_timedataset[idx]
+
+
 def fourier_series(dates, period, series_order):
     """Provides Fourier series components with the specified frequency and order.
     Note
diff --git a/neuralprophet/time_net.py b/neuralprophet/time_net.py
index f2fcbeb80..0379844cf 100644
--- a/neuralprophet/time_net.py
+++ b/neuralprophet/time_net.py
@@ -801,7 +801,7 @@ def training_step(self, batch, batch_idx):
         # Metrics
         if self.metrics_enabled:
             predicted_denorm = self.denormalize(predicted[:, :, 0])
-            target_denorm = self.denormalize(targets.squeeze(dim=2))
+            target_denorm = self.denormalize(targets)
             self.log_dict(self.metrics_train(predicted_denorm, target_denorm), **self.log_args)
             self.log("Loss", loss, **self.log_args)
             self.log("RegLoss", reg_loss, **self.log_args)

From b2f89ed602a592681b9e8f0e41cbba1908e64a7f Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Fri, 26 Jan 2024 16:13:59 -0800
Subject: [PATCH 044/128] fix dimensions except nonstationary components

---
 neuralprophet/time_dataset.py | 47 +++++++++++++++++------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 46ce99b5f..f1673c0b0 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -572,19 +572,18 @@ def tabularize_univariate_datetime_single_index(
         #     regressors["multiplicative"] = None
         if max_lags == 0:
             if len(self.additive_regressors_names) > 0:
-                regressors["additive"] = np.expand_dims(
-                    df.loc[origin_index, self.additive_regressors_names], axis=0
-                )
+                regressors["additive"] = df.loc[origin_index, self.additive_regressors_names].values
+                # regressors["additive"] = np.expand_dims(regressors["additive"], axis=0)
             if len(self.multiplicative_regressors_names) > 0:
-                regressors["multiplicative"] = np.expand_dims(
-                    df.loc[origin_index, self.multiplicative_regressors_names], axis=0
-                )
+                regressors["multiplicative"] = df.loc[origin_index, self.multiplicative_regressors_names].values
+                # regressors["multiplicative"] = np.expand_dims(regressors["multiplicative"], axis=0)
         else:
             if len(self.additive_regressors_names) > 0:
-                regressors_add_future_window = df.loc[
+                regressors["additive"] = df.loc[
                     origin_index + 1 : origin_index + n_forecasts, self.additive_regressors_names
-                ]
-                regressors["additive"] = np.expand_dims(regressors_add_future_window, axis=0)
+                ].values
+                # regressors["additive"] = np.expand_dims(regressors["additive"], axis=0)
+
             ## OLD
             # additive_regressor_feature_windows = []
             # # additive_regressor_feature_windows_lagged = []
             # for i in range(0, len(additive_regressors_names)):
             #     # stride into num_forecast at dim=1 for each sample, just like we did with time
             #     x = additive_regressors[:, i]
             #     window_size = n_lags + n_forecasts

             #     if x.ndim == 1:
             #         shape = (n_samples, window_size)
             #     else:
             #         shape = (n_samples, window_size) + x.shape[1:]

             #     stride = x.strides[0]
             #     strides = (stride, stride) + x.strides[1:]
             #     start_index = max_lags - n_lags
             #     stride = np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides)
             #     additive_regressor_feature_windows.append(stride)
             #
additive_regressors = np.dstack(additive_regressor_feature_windows) # regressors["additive"] = additive_regressors + if len(self.multiplicative_regressors_names) > 0: - regressors_mul_future_window = df.loc[ + regressors["multiplicative"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, self.multiplicative_regressors_names - ] - regressors["multiplicative"] = np.expand_dims(regressors_mul_future_window, axis=0) + ].values + # regressors["multiplicative"] = np.expand_dims(regressors["multiplicative"], axis=0) + inputs["regressors"] = regressors ## OLD Future regressors @@ -657,24 +658,22 @@ def tabularize_univariate_datetime_single_index( # events["multiplicative"] = None if max_lags == 0: if len(self.additive_event_and_holiday_names) > 0: - events["additive"] = np.expand_dims( - df.loc[origin_index, self.additive_event_and_holiday_names], axis=0 - ) + events["additive"] = df.loc[origin_index, self.additive_event_and_holiday_names].values + # events["additive"] = np.expand_dims( events["additive"], axis=0) if len(self.multiplicative_event_and_holiday_names) > 0: - events["multiplicative"] = np.expand_dims( - df.loc[origin_index, self.multiplicative_event_and_holiday_names], axis=0 - ) + events["multiplicative"] = df.loc[origin_index, self.multiplicative_event_and_holiday_names].values + # events["multiplicative"] = np.expand_dims(events["multiplicative"], axis=0) else: if len(self.additive_event_and_holiday_names) > 0: - events_add_future_window = df.loc[ + events["additive"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, self.additive_event_and_holiday_names - ] - events["additive"] = np.expand_dims(events_add_future_window, axis=0) + ].values + # events["additive"] = np.expand_dims(events["additive"], axis=0) if len(self.multiplicative_event_and_holiday_names) > 0: - events_mul_future_window = df.loc[ + events["multiplicative"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, self.multiplicative_event_and_holiday_names - ] - events["multiplicative"] = np.expand_dims(events_mul_future_window, axis=0) + ].values + # events["multiplicative"] = np.expand_dims(events["multiplicative"], axis=0) inputs["events"] = events ## OLD From c65a10701fe018adbf00cefbf296f9b1e256858d Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 26 Jan 2024 16:51:52 -0800 Subject: [PATCH 045/128] integrate torch formatting into tabularize --- neuralprophet/time_dataset.py | 145 +++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 63 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index f1673c0b0..7967927f3 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -111,10 +111,10 @@ def __getitem__(self, index): df=self.df, origin_index=df_index, **self.config_args ) # ------------------ - # Important! TODO: integrate format_sample into tabularize_univariate_datetime_single_index - sample, target = self.format_sample(inputs, target) + # DONE: integrate format_sample into tabularize_univariate_datetime_single_index + # sample, target = self.format_sample(inputs, target) # -------------------------- - return sample, target, self.meta + return inputs, target, self.meta def __len__(self): """Overrides Parent class method to get data length.""" @@ -167,61 +167,64 @@ def create_sample2index_map(self, df): return sample_index_2_df_origin_index, num_samples - def format_sample(self, inputs, targets=None): - """Convert tabularized sample to correct formats. 
- Parameters - ---------- - inputs : ordered dict - Identical to returns from :meth:`tabularize_univariate_datetime` - targets : np.array, float - Identical to returns from :meth:`tabularize_univariate_datetime` - """ - sample_input = OrderedDict({}) - inputs_dtype = { - "time": torch.float, - # "timestamps": np.datetime64, - "seasonalities": torch.float, - "events": torch.float, - "lags": torch.float, - "covariates": torch.float, - "regressors": torch.float, - } - targets_dtype = torch.float - - sample_target = torch.from_numpy(targets).type(targets_dtype) - - for key, data in inputs.items(): - if key in self.two_level_inputs: - sample_input[key] = OrderedDict({}) - for name, features in data.items(): - if features.dtype != np.float32: - features = features.astype(np.float32, copy=False) - - tensor = torch.from_numpy(features) - - if tensor.dtype != inputs_dtype[key]: - sample_input[key][name] = tensor.to( - dtype=inputs_dtype[key] - ) # this can probably be removed, but was included in the previous code - else: - sample_input[key][name] = tensor - else: - # if key == "timestamps": sample_input[key] = data - # else: sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) - sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) - - # TODO Can this be skipped for a single sample? - # Alternatively, Can this be optimized? - # Split nested dict into list of dicts with same keys as sample_input. - # def split_dict(sample_input, index): - # return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in sample_input.items()} - # length = next(iter(sample_input.values())).shape[0] - # sample_input = [split_dict(sample_input, i) for i in range(length)] - - ## timestamps should no longer be present here? - # sample_input.pop("timestamps") # Exact timestamps are not needed anymore - - return sample_input, sample_target + # def format_sample(self, inputs, targets=None): + # """Convert tabularized sample to correct formats. + # Parameters + # ---------- + # inputs : ordered dict + # Identical to returns from :meth:`tabularize_univariate_datetime` + # targets : np.array, float + # Identical to returns from :meth:`tabularize_univariate_datetime` + # """ + # sample_input = OrderedDict({}) + # sample_input["time"] = inputs["time"] + # if "lags" in inputs.keys(): + # sample_input["lags"] = inputs["lags"] + # inputs_dtype = { + # # "time": torch.float, + # # "timestamps": np.datetime64, + # # "lags": torch.float, + # "seasonalities": torch.float, + # "events": torch.float, + # "covariates": torch.float, + # "regressors": torch.float, + # } + + # for key, data in inputs.items(): + # if key in self.two_level_inputs: + # sample_input[key] = OrderedDict({}) + # for name, features in data.items(): + # if features.dtype != np.float32: + # features = features.astype(np.float32, copy=False) + + # tensor = torch.from_numpy(features) + + # if tensor.dtype != inputs_dtype[key]: + # sample_input[key][name] = tensor.to( + # dtype=inputs_dtype[key] + # ) # this can probably be removed, but was included in the previous code + # else: + # sample_input[key][name] = tensor + + # # No longer needed as - now directly casting to torch in tabularize + # # else: # single_level items + # # sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) + # # ## OLD + # # # if key == "timestamps": sample_input[key] = data + # # # else: sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) + + # # TODO Can this be skipped for a single sample? 
+ # # Alternatively, Can this be optimized? + # # Split nested dict into list of dicts with same keys as sample_input. + # # def split_dict(sample_input, index): + # # return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in sample_input.items()} + # # length = next(iter(sample_input.values())).shape[0] + # # sample_input = [split_dict(sample_input, i) for i in range(length)] + + # ## timestamps should no longer be present here? + # # sample_input.pop("timestamps") # Exact timestamps are not needed anymore + + # return sample_input, targets def tabularize_univariate_datetime_single_index( self, @@ -289,6 +292,9 @@ def tabularize_univariate_datetime_single_index( np.array, float Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) """ + # data is stored in OrderedDict + inputs = OrderedDict({}) + max_lags = get_max_num_lags(config_lagged_regressors, n_lags) n_samples = 1 if max_lags == 0: @@ -308,7 +314,8 @@ def tabularize_univariate_datetime_single_index( if predict_mode: # targets = np.zeros((1, n_forecasts), dtype=np.float32) - targets = np.zeros(n_forecasts, dtype=np.float32) + # targets = np.zeros(n_forecasts, dtype=np.float32) + targets = torch.zeros(n_forecasts, dtype=torch.float32) ## OLD # # time is the time at each forecast step @@ -326,10 +333,12 @@ def tabularize_univariate_datetime_single_index( targets = df.at[origin_index, "y_scaled"] if max_lags > 0: targets = df.at[origin_index + 1, "y_scaled"] + targets = torch.tensor(targets, dtype=torch.float32) else: # Note: df.loc is inclusive of slice end, while df.iloc is not. targets = df.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values # targets = np.array(targets, dtype=np.float32) # optional + targets = torch.as_tensor(targets, dtype=torch.float32) ## Alternative 1 # targets = df.loc[:, "y_scaled"].iloc[origin_index + 1 : origin_index + 1 + n_forecasts].values @@ -349,17 +358,16 @@ def tabularize_univariate_datetime_single_index( # return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype) # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values) - # data is stored in OrderedDict - inputs = OrderedDict({}) - # TIME: the time at each sample's lags and forecasts if max_lags == 0: # inputs["time"] = np.expand_dims(df.at[origin_index, "t"], 0) inputs["time"] = df.at[origin_index, "t"] + inputs["time"] = torch.tensor(inputs["time"], dtype=torch.float32) else: # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index # Note: df.loc is inclusive of slice end, while df.iloc is not. inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values + inputs["time"] = torch.as_tensor(inputs["time"], dtype=torch.float32) ## OLD: Time # def _stride_time_features_for_forecasts(x): # window_size = n_lags + n_forecasts @@ -378,6 +386,7 @@ def tabularize_univariate_datetime_single_index( # Note: df.loc is inclusive of slice end, while df.iloc is not. 
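
# Aside (toy sketch, assuming standard NumPy/PyTorch semantics): torch.as_tensor,
# as used throughout this patch, reuses the NumPy buffer when dtype and device
# already match, whereas torch.tensor always copies the data.
import numpy as np
import torch

arr = np.ones(3, dtype=np.float32)
shared = torch.as_tensor(arr)  # no copy: views the same memory
copied = torch.tensor(arr)     # independent copy
arr[0] = 5.0
assert shared[0].item() == 5.0 and copied[0].item() == 1.0
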
# inputs["lags"] = np.array(df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values, dtype=np.float32) inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values + inputs["lags"] = torch.as_tensor(inputs["lags"], dtype=torch.float32) # OLD Lags # def _stride_lagged_features(df_col_name, feature_dims): # # only for case where max_lags > 0 @@ -401,6 +410,7 @@ def tabularize_univariate_datetime_single_index( lagged_regressors[lagged_reg] = df.loc[ origin_index - covar_lags + 1 : origin_index, lagged_reg ].values + lagged_regressors[lagged_reg] = torch.as_tensor(lagged_regressors[lagged_reg], dtype=torch.float32) inputs["covariates"] = lagged_regressors # OLD Covariates # def _stride_lagged_features(df_col_name, feature_dims): @@ -453,7 +463,7 @@ def tabularize_univariate_datetime_single_index( if period.condition_name is not None: # multiply seasonality features with condition mask/values features = features * df[period.condition_name].values[:, np.newaxis] - seasonalities[name] = features + seasonalities[name] = torch.as_tensor(features, dtype=torch.float32) # TODO: Possibly need extra dim? # seasonalities[name] = np.expand_dims(seasonalities[name], 0) inputs["seasonalities"] = seasonalities @@ -574,15 +584,18 @@ def tabularize_univariate_datetime_single_index( if len(self.additive_regressors_names) > 0: regressors["additive"] = df.loc[origin_index, self.additive_regressors_names].values # regressors["additive"] = np.expand_dims(regressors["additive"], axis=0) + regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) if len(self.multiplicative_regressors_names) > 0: regressors["multiplicative"] = df.loc[origin_index, self.multiplicative_regressors_names].values # regressors["multiplicative"] = np.expand_dims(regressors["multiplicative"], axis=0) + regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) else: if len(self.additive_regressors_names) > 0: regressors["additive"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, self.additive_regressors_names ].values # regressors["additive"] = np.expand_dims(regressors["additive"], axis=0) + regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) ## OLD # additive_regressor_feature_windows = [] @@ -610,6 +623,7 @@ def tabularize_univariate_datetime_single_index( origin_index + 1 : origin_index + n_forecasts, self.multiplicative_regressors_names ].values # regressors["multiplicative"] = np.expand_dims(regressors["multiplicative"], axis=0) + regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) inputs["regressors"] = regressors @@ -660,20 +674,25 @@ def tabularize_univariate_datetime_single_index( if len(self.additive_event_and_holiday_names) > 0: events["additive"] = df.loc[origin_index, self.additive_event_and_holiday_names].values # events["additive"] = np.expand_dims( events["additive"], axis=0) + events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) if len(self.multiplicative_event_and_holiday_names) > 0: events["multiplicative"] = df.loc[origin_index, self.multiplicative_event_and_holiday_names].values # events["multiplicative"] = np.expand_dims(events["multiplicative"], axis=0) + events["multiplicative"] = torch.as_tensor(events["multiplicative"], dtype=torch.float32) else: if len(self.additive_event_and_holiday_names) > 0: events["additive"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, 
self.additive_event_and_holiday_names ].values # events["additive"] = np.expand_dims(events["additive"], axis=0) + events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) + if len(self.multiplicative_event_and_holiday_names) > 0: events["multiplicative"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, self.multiplicative_event_and_holiday_names ].values # events["multiplicative"] = np.expand_dims(events["multiplicative"], axis=0) + events["multiplicative"] = torch.as_tensor(events["multiplicative"], dtype=torch.float32) inputs["events"] = events ## OLD From af5524ae4db463ae490ae25c9ea7abdf4b5b00ba Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 26 Jan 2024 19:07:03 -0800 Subject: [PATCH 046/128] check shapes --- neuralprophet/time_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 7967927f3..36e21a671 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -42,7 +42,7 @@ def __init__(self, df, name, **kwargs): # ->_create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset # Future TODO: integrate some of these preprocessing steps happening outside? - self.df = df.reset_index(drop=True) # Needed for index based operations in __get_item__ + self.df = df.reset_index(drop=True) # Needed for index based operations in __getitem__ if "index" in list(self.df.columns): # should not be the case self.df = self.df.drop("index", axis=1) self.meta = OrderedDict({}) From 404e3072f8df67ff1d13efc8bb0c67ecccf48cc6 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 26 Jan 2024 20:58:49 -0800 Subject: [PATCH 047/128] AirPassengers test working! --- neuralprophet/time_dataset.py | 51 ++++++++++++++++++----------------- neuralprophet/time_net.py | 2 +- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 36e21a671..e811ae5d4 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -313,9 +313,8 @@ def tabularize_univariate_datetime_single_index( # n_samples = len(df) - max_lags + 1 - n_forecasts if predict_mode: - # targets = np.zeros((1, n_forecasts), dtype=np.float32) - # targets = np.zeros(n_forecasts, dtype=np.float32) - targets = torch.zeros(n_forecasts, dtype=torch.float32) + targets = torch.zeros((1, 1, n_forecasts), dtype=torch.float32) + # targets = torch.zeros(n_forecasts, dtype=torch.float32) ## OLD # # time is the time at each forecast step @@ -333,40 +332,42 @@ def tabularize_univariate_datetime_single_index( targets = df.at[origin_index, "y_scaled"] if max_lags > 0: targets = df.at[origin_index + 1, "y_scaled"] - targets = torch.tensor(targets, dtype=torch.float32) else: # Note: df.loc is inclusive of slice end, while df.iloc is not. 
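
# Aside (toy shapes, assumed): the double expand_dims used below turns an
# n_forecasts vector into shape (1, 1, n_forecasts), matching the predict-mode
# placeholder torch.zeros((1, 1, n_forecasts)) introduced above.
import numpy as np

vec = np.zeros(12, dtype=np.float32)             # e.g. n_forecasts = 12
out = np.expand_dims(np.expand_dims(vec, 0), 0)
assert out.shape == (1, 1, 12)
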
targets = df.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values
-            # targets = np.array(targets, dtype=np.float32)  # optional
-            targets = torch.as_tensor(targets, dtype=torch.float32)
-
-        ## Alternative 1
-        # targets = df.loc[:, "y_scaled"].iloc[origin_index + 1 : origin_index + 1 + n_forecasts].values
-        # targets = np.expand_dims(np.array(targets, dtype=np.float32), axis=0)
-        ## Alternative 2
-        # x = df["y_scaled"].values
-        # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype)
-        ## OLD
-        # # time is the time at each forecast step
-        # t = df.loc[:, "t"].values
-        # if max_lags == 0:
-        #     time = np.expand_dims(t, 1)
-        # else:
-        #     time = _stride_time_features_for_forecasts(t)
-        # inputs["time"] = time  # contains n_lags + n_forecasts
-        # def _stride_future_time_features_for_forecasts(x):
-        #     return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype)
-        # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values)
+            targets = np.expand_dims(np.expand_dims(targets, 0), 0)
+            targets = torch.as_tensor(targets, dtype=torch.float32)
+
+            ## Alternative 1
+            # targets = df.loc[:, "y_scaled"].iloc[origin_index + 1 : origin_index + 1 + n_forecasts].values
+            # targets = np.expand_dims(np.array(targets, dtype=np.float32), axis=0)
+            ## Alternative 2
+            # x = df["y_scaled"].values
+            # targets = np.array([x[origin_index + 1 : origin_index + 1 + n_forecasts]], dtype=x.dtype)
+            ## OLD
+            # # time is the time at each forecast step
+            # t = df.loc[:, "t"].values
+            # if max_lags == 0:
+            #     time = np.expand_dims(t, 1)
+            # else:
+            #     time = _stride_time_features_for_forecasts(t)
+            # inputs["time"] = time  # contains n_lags + n_forecasts
+            # def _stride_future_time_features_for_forecasts(x):
+            #     return np.array([x[max_lags + i : max_lags + i + n_forecasts] for i in range(n_samples)], dtype=x.dtype)
+            # targets = _stride_future_time_features_for_forecasts(df["y_scaled"].values)

     # TIME: the time at each sample's lags and forecasts
     if max_lags == 0:
-        # inputs["time"] = np.expand_dims(df.at[origin_index, "t"], 0)
         inputs["time"] = df.at[origin_index, "t"]
+        inputs["time"] = np.expand_dims(inputs["time"], 0)
         inputs["time"] = torch.tensor(inputs["time"], dtype=torch.float32)
+
     else:
         # extract time value of n_lags steps before and including origin_index and n_forecasts steps after origin_index
         # Note: df.loc is inclusive of slice end, while df.iloc is not.
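
# Aside (toy frame, assumed): .at, used above, is the scalar counterpart of .loc;
# it fetches a single cell by label and is faster than .loc for one value.
import pandas as pd

df_toy = pd.DataFrame({"t": [0.0, 0.5, 1.0]})
assert df_toy.at[1, "t"] == 0.5
assert df_toy.loc[1, "t"] == df_toy.at[1, "t"]
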
inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values + if n_forecasts == 1: + inputs["time"] = np.expand_dims(inputs["time"], 0) inputs["time"] = torch.as_tensor(inputs["time"], dtype=torch.float32) ## OLD: Time # def _stride_time_features_for_forecasts(x): diff --git a/neuralprophet/time_net.py b/neuralprophet/time_net.py index 0379844cf..f2fcbeb80 100644 --- a/neuralprophet/time_net.py +++ b/neuralprophet/time_net.py @@ -801,7 +801,7 @@ def training_step(self, batch, batch_idx): # Metrics if self.metrics_enabled: predicted_denorm = self.denormalize(predicted[:, :, 0]) - target_denorm = self.denormalize(targets) + target_denorm = self.denormalize(targets.squeeze(dim=2)) self.log_dict(self.metrics_train(predicted_denorm, target_denorm), **self.log_args) self.log("Loss", loss, **self.log_args) self.log("RegLoss", reg_loss, **self.log_args) From 6075074e8dfdb48322e7be6915f5863a127d1995 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 26 Jan 2024 21:03:10 -0800 Subject: [PATCH 048/128] fix dataset generator --- tests/utils/dataset_generators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils/dataset_generators.py b/tests/utils/dataset_generators.py index 4a0440e12..065b91162 100644 --- a/tests/utils/dataset_generators.py +++ b/tests/utils/dataset_generators.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from neuralprophet.time_dataset import make_country_specific_holidays_df +from neuralprophet.time_dataset import make_country_specific_holidays_dict def generate_holiday_dataset(country="US", years=[2022], y_default=1, y_holiday=100, y_holidays_override={}): @@ -11,7 +11,7 @@ def generate_holiday_dataset(country="US", years=[2022], y_default=1, y_holiday= dates = pd.date_range("%i-01-01" % (years[0]), periods=periods, freq="D") df = pd.DataFrame({"ds": dates, "y": y_default}, index=dates) - holidays = make_country_specific_holidays_df(years, country) + holidays = make_country_specific_holidays_dict(years, country) for holiday_name, timestamps in holidays.items(): df.loc[timestamps[0], "y"] = y_holidays_override.get(holiday_name, y_holiday) From d6242a28722cee139b43adcba5e0812454631234 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Mon, 29 Jan 2024 16:56:26 -0800 Subject: [PATCH 049/128] fixed all performance tests but Energy due to nonstationary components --- neuralprophet/time_dataset.py | 11 ++++++----- tests/test_model_performance.py | 14 ++++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index e811ae5d4..dd62e3a0b 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -290,7 +290,7 @@ def tabularize_univariate_datetime_single_index( * ``regressors`` (OrderedDict), regressors, each with features (np.array, float) of dims: (num_samples, n_lags) np.array, float - Targets to be predicted of same length as each of the model inputs, dims: (num_samples, n_forecasts) + Targets to be predicted of same length as each of the model inputs, dims: (n_forecasts, 1) """ # data is stored in OrderedDict inputs = OrderedDict({}) @@ -313,7 +313,7 @@ def tabularize_univariate_datetime_single_index( # n_samples = len(df) - max_lags + 1 - n_forecasts if predict_mode: - targets = torch.zeros((1, 1, n_forecasts), dtype=torch.float32) + targets = torch.zeros((n_forecasts, 1), dtype=torch.float32) # targets = torch.zeros(n_forecasts, dtype=torch.float32) ## OLD @@ -332,10 +332,12 @@ def 
tabularize_univariate_datetime_single_index( targets = df.at[origin_index, "y_scaled"] if max_lags > 0: targets = df.at[origin_index + 1, "y_scaled"] + targets = np.expand_dims(targets, 0) + targets = np.expand_dims(targets, 1) # extra dimension at end for quantiles:median else: # Note: df.loc is inclusive of slice end, while df.iloc is not. targets = df.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values - targets = np.expand_dims(np.expand_dims(targets, 0), 0) + targets = np.expand_dims(targets, 1) # extra dimension at end for quantiles:median targets = torch.as_tensor(targets, dtype=torch.float32) ## Alternative 1 @@ -366,8 +368,6 @@ def tabularize_univariate_datetime_single_index( # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index # Note: df.loc is inclusive of slice end, while df.iloc is not. inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values - if n_forecasts == 1: - inputs["time"] = np.expand_dims(inputs["time"], 0) inputs["time"] = torch.as_tensor(inputs["time"], dtype=torch.float32) ## OLD: Time # def _stride_time_features_for_forecasts(x): @@ -431,6 +431,7 @@ def tabularize_univariate_datetime_single_index( # SEASONALITIES if config_seasonality is not None: + # TODO: precompute and save fourier features and only tabularize / slide windows when calling __getitem__ seasonalities = OrderedDict({}) if max_lags == 0: dates = pd.Series(df.at[origin_index, "ds"]) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 37d623c23..3c097d2a3 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -125,7 +125,10 @@ def create_metrics_plot(metrics): def test_PeytonManning(): df = pd.read_csv(PEYTON_FILE) - m = NeuralProphet() + m = NeuralProphet( + # learning_rate=0.01, + # epochs=3, + ) df_train, df_test = m.split_df(df=df, freq="D", valid_p=0.1) system_speed, std = get_system_speed() @@ -146,7 +149,8 @@ def test_PeytonManning(): def test_YosemiteTemps(): df = pd.read_csv(YOS_FILE) m = NeuralProphet( - learning_rate=0.01, + # learning_rate=0.01, + # epochs=3, n_lags=36, n_forecasts=12, changepoints_range=0.9, @@ -173,7 +177,8 @@ def test_YosemiteTemps(): def test_AirPassengers(): df = pd.read_csv(AIR_FILE) m = NeuralProphet( - learning_rate=0.01, + # learning_rate=0.01, + # epochs=3, seasonality_mode="multiplicative", ) df_train, df_test = m.split_df(df=df, freq="MS", valid_p=0.1) @@ -198,7 +203,8 @@ def test_EnergyPriceDaily(): df["temp"] = df["temperature"] m = NeuralProphet( - learning_rate=0.01, + # learning_rate=0.01, + # epochs=3, n_forecasts=7, n_changepoints=0, yearly_seasonality=True, From a5ebff9b8440bce435ecfe8a134b1a103f38a20c Mon Sep 17 00:00:00 2001 From: ourownstory Date: Mon, 29 Jan 2024 17:07:47 -0800 Subject: [PATCH 050/128] fixed nonstationary issue. 
all performance tests running

---
 neuralprophet/time_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index dd62e3a0b..766a44387 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -594,7 +594,7 @@ def tabularize_univariate_datetime_single_index(
         else:
             if len(self.additive_regressors_names) > 0:
                 regressors["additive"] = df.loc[
-                    origin_index + 1 : origin_index + n_forecasts, self.additive_regressors_names
+                    origin_index + 1 - n_lags : origin_index + n_forecasts, self.additive_regressors_names
                 ].values
@@ -622,7 +622,7 @@ def tabularize_univariate_datetime_single_index(

         if len(self.multiplicative_regressors_names) > 0:
             regressors["multiplicative"] = df.loc[
-                origin_index + 1 : origin_index + n_forecasts, self.multiplicative_regressors_names
+                origin_index + 1 - n_lags : origin_index + n_forecasts, self.multiplicative_regressors_names
             ].values

From a4152e6248e3b3a315f36c07d891c92422e5a64b Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Tue, 30 Jan 2024 11:53:05 -0800
Subject: [PATCH 051/128] refactor tabularize function

---
 neuralprophet/time_dataset.py | 759 +++++++++++-----------------------
 1 file changed, 235 insertions(+), 524 deletions(-)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 766a44387..d3dbffd6e 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -47,6 +47,11 @@ def __init__(self, df, name, **kwargs):
         self.df = self.df.drop("index", axis=1)
         self.meta = OrderedDict({})
         self.meta["df_name"] = name
+
+        self.predict_mode = kwargs["predict_mode"]
+        self.n_lags = kwargs["n_lags"]
+        self.n_forecasts = kwargs["n_forecasts"]
+        self.max_lags = get_max_num_lags(kwargs["config_lagged_regressors"], self.n_lags)
         self.config_args = kwargs
         self.two_level_inputs = [
@@ -62,7 +67,9 @@ def __init__(self, df, name, **kwargs):
             self.additive_event_and_holiday_names,
             self.multiplicative_event_and_holiday_names,
         ) = add_event_features_to_df(
-            self.df, self.config_args["config_events"], self.config_args["config_country_holidays"]
+            self.df,
+            self.config_args["config_events"],
+            self.config_args["config_country_holidays"],
         )
         # pre-sort additive/multiplicative regressors
         self.additive_regressors_names, self.multiplicative_regressors_names = sort_regressor_names(
@@ -107,13 +114,20 @@ def __getitem__(self, index):
         df_index = self.sample_index_to_df_index(index)

         # Tabularize - extract features from dataframe at given target index position
-        inputs, target = self.tabularize_univariate_datetime_single_index(
-            df=self.df, origin_index=df_index, **self.config_args
+        inputs, target = tabularize_univariate_datetime_single_index(
+            df=self.df,
+            origin_index=df_index,
+            predict_mode=self.predict_mode,
+            n_lags=self.n_lags,
+            max_lags=self.max_lags,
+            n_forecasts=self.n_forecasts,
+            config_seasonality=self.config_args["config_seasonality"],
+            config_lagged_regressors=self.config_args["config_lagged_regressors"],
+            additive_event_and_holiday_names=self.additive_event_and_holiday_names,
+            multiplicative_event_and_holiday_names=self.multiplicative_event_and_holiday_names,
+
additive_regressors_names=self.additive_regressors_names, + multiplicative_regressors_names=self.multiplicative_regressors_names, ) - # ------------------ - # DONE: integrate format_sample into tabularize_univariate_datetime_single_index - # sample, target = self.format_sample(inputs, target) - # -------------------------- return inputs, target, self.meta def __len__(self): @@ -133,16 +147,14 @@ def create_sample2index_map(self, df): # Limit target range due to input lags and number of forecasts df_length = len(df) - max_lags = get_max_num_lags(self.config_args["config_lagged_regressors"], self.config_args["n_lags"]) n_forecasts = self.config_args["n_forecasts"] origin_start_end_mask = create_origin_start_end_mask( - df_length=df_length, max_lags=max_lags, n_forecasts=n_forecasts + df_length=df_length, max_lags=self.max_lags, n_forecasts=n_forecasts ) # Prediction Frequency # Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) - # analogous to `self.filter_samples_after_init( - # self.kwargs["prediction_frequency"])` + # analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` prediction_frequency_mask = create_prediction_frequency_filter_mask( df, self.config_args["prediction_frequency"] ) @@ -226,525 +238,224 @@ def create_sample2index_map(self, df): # return sample_input, targets - def tabularize_univariate_datetime_single_index( - self, - df: pd.DataFrame, - origin_index: int, - predict_mode: bool = False, - n_lags: int = 0, - n_forecasts: int = 1, - predict_steps: int = 1, - config_seasonality: Optional[configure.ConfigSeasonality] = None, - config_events: Optional[configure.ConfigEvents] = None, - config_country_holidays=None, - config_lagged_regressors: Optional[configure.ConfigLaggedRegressors] = None, - config_regressors: Optional[configure.ConfigFutureRegressors] = None, - config_missing=None, - config_train=None, - prediction_frequency=None, - ): - """Create a tabular data sample from timeseries dataframe, used for mini-batch creation. - Note - ---- - Data must have no gaps for sample extracted at given index position. - ---------- - df : pd.DataFrame - Sequence of observations with original ``ds``, ``y`` and normalized ``t``, ``y_scaled`` columns - origin_index: int: - dataframe index position of last observed lag before forecast starts. 
- config_seasonality : configure.ConfigSeasonality - Configuration for seasonalities - n_lags : int - Number of lagged values of series to include as model inputs (aka AR-order) - n_forecasts : int - Number of steps to forecast into future - config_events : configure.ConfigEvents - User specified events, each with their upper, lower windows (int) and regularization - config_country_holidays : configure.ConfigCountryHolidays - Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays - config_lagged_regressors : configure.ConfigLaggedRegressors - Configurations for lagged regressors - config_regressors : configure.ConfigFutureRegressors - Configuration for regressors - predict_mode : bool - Chooses the prediction mode - Options - * (default) ``False``: Includes target values - * ``True``: Does not include targets but includes entire dataset as input - Returns - ------- - OrderedDict - Model inputs, each of len(df) but with varying dimensions - Note - ---- - Contains the following data: - Model Inputs - * ``time`` (np.array, float), dims: (num_samples, 1) - * ``seasonalities`` (OrderedDict), named seasonalities - each with features (np.array, float) - dims: (num_samples, n_features[name]) - * ``lags`` (np.array, float), dims: (num_samples, n_lags) - * ``covariates`` (OrderedDict), named covariates, - each with features (np.array, float) of dims: (num_samples, n_lags) - * ``events`` (OrderedDict), events, - each with features (np.array, float) of dims: (num_samples, n_lags) - * ``regressors`` (OrderedDict), regressors, - each with features (np.array, float) of dims: (num_samples, n_lags) - np.array, float - Targets to be predicted of same length as each of the model inputs, dims: (n_forecasts, 1) - """ - # data is stored in OrderedDict - inputs = OrderedDict({}) - max_lags = get_max_num_lags(config_lagged_regressors, n_lags) - n_samples = 1 - if max_lags == 0: - assert n_forecasts == 1 - - # OLD: previous workaround - # learning_rate = config_train.learning_rate - # if ( - # predict_mode - # or (learning_rate is None) - # or config_lagged_regressors - # or config_country_holidays - # or config_events - # or prediction_frequency - # ): - # n_samples = len(df) - max_lags + 1 - n_forecasts - - if predict_mode: - targets = torch.zeros((n_forecasts, 1), dtype=torch.float32) - # targets = torch.zeros(n_forecasts, dtype=torch.float32) - - ## OLD - # # time is the time at each forecast step - # t = df.loc[:, "t"].values - # if max_lags == 0: - # time = np.expand_dims(t, 1) - # else: - # time = _stride_time_features_for_forecasts(t) - # inputs["time"] = time # contains n_lags + n_forecasts - # targets = np.empty_like(time[:, n_lags:]) - # targets = np.nan_to_num(targets) +def tabularize_univariate_datetime_single_index( + df: pd.DataFrame, + origin_index: int, + predict_mode: bool = False, + n_lags: int = 0, + max_lags: int = 0, + n_forecasts: int = 1, + config_seasonality: Optional[configure.ConfigSeasonality] = None, + config_lagged_regressors: Optional[configure.ConfigLaggedRegressors] = None, + # config_events: Optional[configure.ConfigEvents] = None, + # config_country_holidays=None, + additive_event_and_holiday_names: list[str] = [], + multiplicative_event_and_holiday_names: list[str] = [], + # config_regressors: Optional[configure.ConfigFutureRegressors] = None, + additive_regressors_names: list[str] = [], + multiplicative_regressors_names: list[str] = [], +): + """Create a tabular data sample from timeseries dataframe, used for mini-batch creation. 
+    Note
+    ----
+    Data must have no gaps for sample extracted at given index position.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Sequence of observations with original ``ds``, ``y`` and normalized ``t``, ``y_scaled`` columns
+    origin_index : int
+        dataframe index position of last observed lag before forecast starts.
+    n_forecasts : int
+        Number of steps to forecast into future
+    n_lags : int
+        Number of lagged values of series to include as model inputs (aka AR-order)
+    config_seasonality : configure.ConfigSeasonality
+        Configuration for seasonalities
+    config_lagged_regressors : configure.ConfigLaggedRegressors
+        Configurations for lagged regressors
+    config_events : configure.ConfigEvents
+        User specified events, each with their upper, lower windows (int) and regularization
+    config_country_holidays : configure.ConfigCountryHolidays
+        Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays
+    config_regressors : configure.ConfigFutureRegressors
+        Configuration for regressors
+    predict_mode : bool
+        Chooses the prediction mode
+        Options
+            * (default) ``False``: Includes target values
+            * ``True``: Does not include targets but includes entire dataset as input
+    Returns
+    -------
+    OrderedDict
+        Model inputs, each of len(df) but with varying dimensions
+        Note
+        ----
+        Contains the following data:
+        Model Inputs
+            * ``time`` (np.array, float), dims: (num_samples, 1)
+            * ``seasonalities`` (OrderedDict), named seasonalities
+            each with features (np.array, float) - dims: (num_samples, n_features[name])
+            * ``lags`` (np.array, float), dims: (num_samples, n_lags)
+            * ``covariates`` (OrderedDict), named covariates,
+            each with features (np.array, float) of dims: (num_samples, n_lags)
+            * ``events`` (OrderedDict), events,
+            each with features (np.array, float) of dims: (num_samples, n_lags)
+            * ``regressors`` (OrderedDict), regressors,
+            each with features (np.array, float) of dims: (num_samples, n_lags)
+    np.array, float
+        Targets to be predicted of same length as each of the model inputs, dims: (n_forecasts, 1)
+    """
+    # sample features are stored and returned in OrderedDict
+    inputs = OrderedDict({})
+
+    if max_lags == 0:
+        assert n_forecasts == 1
+
+    if predict_mode:
+        targets = torch.zeros((n_forecasts, 1), dtype=torch.float32)
+    else:
+        if n_forecasts == 1:
+            if max_lags == 0:
+                targets = df.at[origin_index, "y_scaled"]
+            if max_lags > 0:
+                targets = df.at[origin_index + 1, "y_scaled"]
+            targets = np.expand_dims(targets, 0)
+            targets = np.expand_dims(targets, 1)  # extra dimension at end for quantiles:median
+        else:
+            # Note: df.loc is inclusive of slice end, while df.iloc is not.
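
# Aside (toy frame, assumed): the label slice below picks exactly the
# n_forecasts values that follow origin_index, i.e. labels
# origin_index + 1 .. origin_index + n_forecasts, because .loc is end-inclusive.
import pandas as pd

df_toy = pd.DataFrame({"y_scaled": [0.1, 0.2, 0.3, 0.4, 0.5]})
origin_index, n_forecasts = 1, 2
future = df_toy.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values
assert future.tolist() == [0.3, 0.4]
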
+            targets = df.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values
+            targets = np.expand_dims(targets, 1)  # extra dimension at end for quantiles:median
+        targets = torch.as_tensor(targets, dtype=torch.float32)
+
+    # TIME: the time at each sample's lags and forecasts
+    if max_lags == 0:
+        inputs["time"] = df.at[origin_index, "t"]
+        inputs["time"] = np.expand_dims(inputs["time"], 0)
+        inputs["time"] = torch.tensor(inputs["time"], dtype=torch.float32)
+
+    else:
+        # extract time value of n_lags steps before and including origin_index and
n_forecasts steps after origin_index
+        # Note: df.loc is inclusive of slice end, while df.iloc is not.
+        inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values
+        inputs["time"] = torch.as_tensor(inputs["time"], dtype=torch.float32)
+
+    # LAGS: From y-series, extract preceding n_lags steps up to and including origin_index
+    if n_lags >= 1 and "y_scaled" in df.columns:
+        # Note: df.loc is inclusive of slice end, while df.iloc is not.
+        inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values
+        inputs["lags"] = torch.as_tensor(inputs["lags"], dtype=torch.float32)
+
+    # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS
+    if config_lagged_regressors is not None and max_lags > 0:
+        lagged_regressors = OrderedDict({})
+        # Future TODO: optimize this computation for many lagged_regressors
+        for lagged_reg in df.columns:
+            if lagged_reg in config_lagged_regressors:
+                covar_lags = config_lagged_regressors[lagged_reg].n_lags
+                assert covar_lags > 0
+                # Note: df.loc is inclusive of slice end, while df.iloc is not.
+                lagged_regressors[lagged_reg] = df.loc[origin_index - covar_lags + 1 : origin_index, lagged_reg].values
+                lagged_regressors[lagged_reg] = torch.as_tensor(lagged_regressors[lagged_reg], dtype=torch.float32)
+        inputs["covariates"] = lagged_regressors
+
+    # SEASONALITIES
+    # TODO: precompute and save fourier features and only tabularize / slide windows when calling __getitem__
+    if config_seasonality is not None:
+        seasonalities = OrderedDict({})
+        if max_lags == 0:
+            dates = pd.Series(df.at[origin_index, "ds"])
+        else:
+            # Note: df.loc is inclusive of slice end, while df.iloc is not.
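
# Aside (toy window, assumed): the dates pulled below span the same
# n_lags + n_forecasts steps as inputs["time"], so each seasonality feature row
# lines up with exactly one lag or forecast position of the sample.
import pandas as pd

dates = pd.date_range("2024-01-01", periods=10, freq="D")
n_lags, n_forecasts, origin_index = 3, 2, 5
window = dates[origin_index - n_lags + 1 : origin_index + n_forecasts + 1]
assert len(window) == n_lags + n_forecasts
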
- dates = pd.Series(df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "ds"].values) - # Seasonality features - for name, period in config_seasonality.periods.items(): - if period.resolution > 0: - if config_seasonality.computation == "fourier": - # Compute Fourier series components with the specified frequency and order. - # convert to days since epoch - t = np.array((dates - datetime(1900, 1, 1)).dt.total_seconds().astype(np.float32)) / ( - 3600 * 24.0 - ) - # features: Matrix with dims (length len(dates), 2*resolution) - features = np.column_stack( - [np.sin((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] - + [np.cos((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] - ) - # Single nested loop version: - # features = np.column_stack( - # [ - # fun((2.0 * (i + 1) * np.pi * t / period.period)) - # for i in range(period.resolution) - # for fun in (np.sin, np.cos) - # ] - # ) - else: - raise NotImplementedError - if period.condition_name is not None: - # multiply seasonality features with condition mask/values - features = features * df[period.condition_name].values[:, np.newaxis] - seasonalities[name] = torch.as_tensor(features, dtype=torch.float32) - # TODO: Possibly need extra dim? - # seasonalities[name] = np.expand_dims(seasonalities[name], 0) - inputs["seasonalities"] = seasonalities - - ## OLD Seasonality - # def fourier_series_t(t, period, series_order): - # """Provides Fourier series components with the specified frequency and order. - # Note - # ---- - # This function is identical to Meta AI's Prophet Library - # Parameters - # ---------- - # t : pd.Series, float - # Containing time as floating point number of days - # period : float - # Number of days of the period - # series_order : int - # Number of fourier components - # Returns - # ------- - # np.array - # Matrix with seasonality features - # """ - # features = np.column_stack( - # [fun((2.0 * (i + 1) * np.pi * t / period)) for i in range(series_order) for fun in (np.sin, np.cos)] - # ) - # return features - - # def fourier_series(dates, period, series_order): - # """Provides Fourier series components with the specified frequency and order. - # Note - # ---- - # Identical to OG Prophet. - # Parameters - # ---------- - # dates : pd.Series - # Containing time stamps - # period : float - # Number of days of the period - # series_order : int - # Number of fourier components - # Returns - # ------- - # np.array - # Matrix with seasonality features - # """ - # # convert to days since epoch - # t = np.array((dates - datetime(1970, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) - # return fourier_series_t(t, period, series_order) - - # def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): - # """Dataframe with seasonality features. - # Includes seasonality features - # Parameters - # ---------- - # df : pd.DataFrame - # Dataframe with all values - # config_seasonality : configure.ConfigSeasonality - # Configuration for seasonalities - # Returns - # ------- - # OrderedDict - # Dictionary with keys for each period name containing an np.array - # with the respective regression features. 
each with dims: (len(dates), 2*fourier_order) - # """ - # dates = df["ds"] - # assert len(dates.shape) == 1 - # seasonalities = OrderedDict({}) - # # Seasonality features - # for name, period in config_seasonality.periods.items(): - # if period.resolution > 0: - # if config_seasonality.computation == "fourier": - # # features: Matrix with dims (length len(dates), 2*resolution) - # features = fourier_series( - # dates=dates, - # period=period.period, - # series_order=period.resolution, - # ) - # else: - # raise NotImplementedError - # if period.condition_name is not None - # # multiply seasonality features with condition mask/values: - # features = features * df[period.condition_name].values[:, np.newaxis] - # seasonalities[name] = features - # return seasonalities - - # def _stride_time_features_for_seasonality(x): - # window_size = n_lags + n_forecasts - - # if x.ndim == 1: - # shape = (n_samples, window_size) - # else: - # shape = (n_samples, window_size) + x.shape[1:] - - # stride = x.strides[0] - # strides = (stride, stride) + x.strides[1:] - # start_index = max_lags - n_lags - # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - - # seasonalities = seasonal_features_from_dates(df, config_seasonality) - # for name, features in seasonalities.items(): - # if max_lags == 0: - # seasonalities[name] = np.expand_dims(features, axis=1) - # else: - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # seasonalities[name] = _stride_time_features_for_seasonality(features) - # inputs["seasonalities"] = seasonalities - - # FUTURE REGRESSORS: get the future regressors features - # create numpy array of values of additive and multiplicative regressors, at correct indexes - # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) - any_future_regressors = 0 < len(self.additive_regressors_names + self.multiplicative_regressors_names) - if any_future_regressors: # if config_regressors is not None: - regressors = OrderedDict({}) - # regressors["additive"] = None - # regressors["multiplicative"] = None - if max_lags == 0: - if len(self.additive_regressors_names) > 0: - regressors["additive"] = df.loc[origin_index, self.additive_regressors_names].values - # regressors["additive"] = np.expand_dims(regressors["additive"], axis=0) - regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) - if len(self.multiplicative_regressors_names) > 0: - regressors["multiplicative"] = df.loc[origin_index, self.multiplicative_regressors_names].values - # regressors["multiplicative"] = np.expand_dims(regressors["multiplicative"], axis=0) - regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) - else: - if len(self.additive_regressors_names) > 0: - regressors["additive"] = df.loc[ - origin_index + 1 - n_lags : origin_index + n_forecasts, self.additive_regressors_names - ].values - # regressors["additive"] = np.expand_dims(regressors["additive"], axis=0) - regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) - - ## OLD - # additive_regressor_feature_windows = [] - # # additive_regressor_feature_windows_lagged = [] - # for i in range(0, len(additive_regressors_names)): - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # x = additive_regressors[:, i] - # window_size = n_lags + n_forecasts - - # if x.ndim == 1: - # shape = (n_samples, window_size) - # else: - # shape = (n_samples, window_size) + 
x.shape[1:] - - # stride = x.strides[0] - # strides = (stride, stride) + x.strides[1:] - # start_index = max_lags - n_lags - # stride = np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - # additive_regressor_feature_windows.append(stride) - # additive_regressors = np.dstack(additive_regressor_feature_windows) - # regressors["additive"] = additive_regressors - - if len(self.multiplicative_regressors_names) > 0: - regressors["multiplicative"] = df.loc[ - origin_index + 1 - n_lags : origin_index + n_forecasts, self.multiplicative_regressors_names - ].values - # regressors["multiplicative"] = np.expand_dims(regressors["multiplicative"], axis=0) - regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) - - inputs["regressors"] = regressors - - ## OLD Future regressors - # additive_regressors, multiplicative_regressors = make_regressors_features(df, config_regressors) - # for max_lags == 0, see code before merge - # if max_lags > 0: - # def _stride_time_features_for_forecasts(x):additive_regressors - # window_size = n_lags + n_forecasts - - # if x.ndim == 1: - # shape = (n_samples, window_size) - # else: - # shape = (n_samples, window_size) + x.shape[1:] - - # stride = x.strides[0] - # strides = (stride, stride) + x.strides[1:] - # start_index = max_lags - n_lags - # return np.lib.stride_tricks.as_strided(x[start_index:], shape=shape, strides=strides) - # if additive_regressors is not None: - # additive_regressor_feature_windows = [] - # # additive_regressor_feature_windows_lagged = [] - # for i in range(0, additive_regressors.shape[1]): - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # stride = _stride_time_features_for_forecasts(additive_regressors[:, i]) - # additive_regressor_feature_windows.append(stride) - # additive_regressors = np.dstack(additive_regressor_feature_windows) - # regressors["additive"] = additive_regressors - - # if multiplicative_regressors is not None: - # multiplicative_regressor_feature_windows = [] - # for i in range(0, multiplicative_regressors.shape[1]): - # stride = _stride_time_features_for_forecasts(multiplicative_regressors[:, i]) - # multiplicative_regressor_feature_windows.append(stride) - # multiplicative_regressors = np.dstack(multiplicative_regressor_feature_windows) - # regressors["multiplicative"] = multiplicative_regressors - # inputs["regressors"] = regressors - - # FUTURE EVENTS: get the events features - # create numpy array of values of additive and multiplicative events, at correct indexes - # features dims: (n_samples/batch, n_forecasts, n_features/n_events) - any_events = 0 < len(self.additive_event_and_holiday_names + self.multiplicative_event_and_holiday_names) - if any_events: - events = OrderedDict({}) - # events["additive"] = None - # events["multiplicative"] = None - if max_lags == 0: - if len(self.additive_event_and_holiday_names) > 0: - events["additive"] = df.loc[origin_index, self.additive_event_and_holiday_names].values - # events["additive"] = np.expand_dims( events["additive"], axis=0) - events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) - if len(self.multiplicative_event_and_holiday_names) > 0: - events["multiplicative"] = df.loc[origin_index, self.multiplicative_event_and_holiday_names].values - # events["multiplicative"] = np.expand_dims(events["multiplicative"], axis=0) - events["multiplicative"] = torch.as_tensor(events["multiplicative"], dtype=torch.float32) - else: - if 
len(self.additive_event_and_holiday_names) > 0: - events["additive"] = df.loc[ - origin_index + 1 : origin_index + n_forecasts, self.additive_event_and_holiday_names - ].values - # events["additive"] = np.expand_dims(events["additive"], axis=0) - events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) - - if len(self.multiplicative_event_and_holiday_names) > 0: - events["multiplicative"] = df.loc[ - origin_index + 1 : origin_index + n_forecasts, self.multiplicative_event_and_holiday_names - ].values - # events["multiplicative"] = np.expand_dims(events["multiplicative"], axis=0) - events["multiplicative"] = torch.as_tensor(events["multiplicative"], dtype=torch.float32) - inputs["events"] = events - - ## OLD - # # get the events features - # if config_events is not None or config_country_holidays is not None: - # additive_events, multiplicative_events = make_events_features(df, config_events, config_country_holidays) - - # events = OrderedDict({}) - # if max_lags == 0: - # if additive_events is not None: - # events["additive"] = np.expand_dims(additive_events, axis=1) - # if multiplicative_events is not None: - # events["multiplicative"] = np.expand_dims(multiplicative_events, axis=1) - # else: - # if additive_events is not None: - # additive_event_feature_windows = [] - # for i in range(0, additive_events.shape[1]): - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # additive_event_feature_windows.append(_stride_time_features_for_forecasts(additive_events[:, i])) - # additive_events = np.dstack(additive_event_feature_windows) - # events["additive"] = additive_events - - # if multiplicative_events is not None: - # multiplicative_event_feature_windows = [] - # # multiplicative_event_feature_windows_lagged = [] - # for i in range(0, multiplicative_events.shape[1]): - # # stride into num_forecast at dim=1 for each sample, just like we did with time - # multiplicative_event_feature_windows.append( - # _stride_time_features_for_forecasts(multiplicative_events[:, i]) - # ) - # multiplicative_events = np.dstack(multiplicative_event_feature_windows) - # events["multiplicative"] = multiplicative_events - # inputs["events"] = events - - # ONLY FOR DEBUGGING - # tabularized_input_shapes_str = "" - # for key, value in inputs.items(): - # if key in [ - # "seasonalities", - # "covariates", - # "events", - # "regressors", - # ]: - # for name, period_features in value.items(): - # tabularized_input_shapes_str += f" {name} {key} {period_features}\n" - # else: - # tabularized_input_shapes_str += f" {key} {value.shape} \n" - # log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") - - return inputs, targets + dates = pd.Series(df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "ds"].values) + # Seasonality features + for name, period in config_seasonality.periods.items(): + if period.resolution > 0: + if config_seasonality.computation == "fourier": + # Compute Fourier series components with the specified frequency and order. 
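+                    # A hedged worked example of the two steps below (values are illustrative,
+                    # not from the data): for a single date one day after the 1900-01-01
+                    # reference, t = 1.0; with period=1 and resolution=2 the feature row is
+                    # [sin(2*pi*t), sin(4*pi*t), cos(2*pi*t), cos(4*pi*t)] = [0, 0, 1, 1],
+                    # matching the documented dims (len(dates), 2*resolution).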
+ # convert to days since epoch + t = np.array((dates - datetime(1900, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) + # features: Matrix with dims (length len(dates), 2*resolution) + features = np.column_stack( + [np.sin((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] + + [np.cos((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] + ) + else: + raise NotImplementedError + if period.condition_name is not None: + # multiply seasonality features with condition mask/values + features = features * df[period.condition_name].values[:, np.newaxis] + seasonalities[name] = torch.as_tensor(features, dtype=torch.float32) + inputs["seasonalities"] = seasonalities + + # FUTURE REGRESSORS: get the future regressors features + # create numpy array of values of additive and multiplicative regressors, at correct indexes + # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) + any_future_regressors = 0 < len(additive_regressors_names + multiplicative_regressors_names) + if any_future_regressors: # if config_regressors is not None: + regressors = OrderedDict({}) + if max_lags == 0: + if len(additive_regressors_names) > 0: + regressors["additive"] = df.loc[origin_index, additive_regressors_names].values + regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) + if len(multiplicative_regressors_names) > 0: + regressors["multiplicative"] = df.loc[origin_index, multiplicative_regressors_names].values + regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) + else: + if len(additive_regressors_names) > 0: + regressors["additive"] = df.loc[ + origin_index + 1 - n_lags : origin_index + n_forecasts, additive_regressors_names + ].values + regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) + + if len(multiplicative_regressors_names) > 0: + regressors["multiplicative"] = df.loc[ + origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_regressors_names + ].values + regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) + inputs["regressors"] = regressors + + # FUTURE EVENTS: get the events features + # create numpy array of values of additive and multiplicative events, at correct indexes + # features dims: (n_samples/batch, n_forecasts, n_features/n_events) + any_events = 0 < len(additive_event_and_holiday_names + multiplicative_event_and_holiday_names) + if any_events: + events = OrderedDict({}) + if max_lags == 0: + if len(additive_event_and_holiday_names) > 0: + events["additive"] = df.loc[origin_index, additive_event_and_holiday_names].values + events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) + if len(multiplicative_event_and_holiday_names) > 0: + events["multiplicative"] = df.loc[origin_index, multiplicative_event_and_holiday_names].values + events["multiplicative"] = torch.as_tensor(events["multiplicative"], dtype=torch.float32) + else: + if len(additive_event_and_holiday_names) > 0: + events["additive"] = df.loc[ + origin_index + 1 : origin_index + n_forecasts, additive_event_and_holiday_names + ].values + events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) + + if len(multiplicative_event_and_holiday_names) > 0: + events["multiplicative"] = df.loc[ + origin_index + 1 : origin_index + n_forecasts, multiplicative_event_and_holiday_names + ].values + events["multiplicative"] = 
torch.as_tensor(events["multiplicative"], dtype=torch.float32) + inputs["events"] = events + + # ONLY FOR DEBUGGING + # tabularized_input_shapes_str = "" + # for key, value in inputs.items(): + # if key in [ + # "seasonalities", + # "covariates", + # "events", + # "regressors", + # ]: + # for name, period_features in value.items(): + # tabularized_input_shapes_str += f" {name} {key} {period_features}\n" + # else: + # tabularized_input_shapes_str += f" {key} {value.shape} \n" + # log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") + return inputs, targets class GlobalTimeDataset(TimeDataset): From fba0d0db14fbce9f14c5b008ffadd522505ea36d Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 30 Jan 2024 14:10:46 -0800 Subject: [PATCH 052/128] fix bug --- neuralprophet/df_utils.py | 15 +++++++-------- neuralprophet/forecaster.py | 4 +++- neuralprophet/time_dataset.py | 10 ++++++---- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/neuralprophet/df_utils.py b/neuralprophet/df_utils.py index fcd12d1f4..7d569af98 100644 --- a/neuralprophet/df_utils.py +++ b/neuralprophet/df_utils.py @@ -88,15 +88,15 @@ def return_df_in_original_format(df, received_ID_col=False, received_single_time return new_df -def get_max_num_lags(config_lagged_regressors: Optional[ConfigLaggedRegressors], n_lags: int) -> int: +def get_max_num_lags(n_lags: int, config_lagged_regressors: Optional[ConfigLaggedRegressors]) -> int: """Get the greatest number of lags between the autoregression lags and the covariates lags. Parameters ---------- - config_lagged_regressors : configure.ConfigLaggedRegressors - Configurations for lagged regressors n_lags : int number of lagged values of series to include as model inputs + config_lagged_regressors : configure.ConfigLaggedRegressors + Configurations for lagged regressors Returns ------- @@ -104,12 +104,11 @@ def get_max_num_lags(config_lagged_regressors: Optional[ConfigLaggedRegressors], Maximum number of lags between the autoregression lags and the covariates lags. """ if config_lagged_regressors is not None: - log.debug("config_lagged_regressors exists") - max_n_lags = max([n_lags] + [val.n_lags for key, val in config_lagged_regressors.items()]) + # log.debug("config_lagged_regressors exists") + return max([n_lags] + [val.n_lags for key, val in config_lagged_regressors.items()]) else: - log.debug("config_lagged_regressors does not exist") - max_n_lags = n_lags - return max_n_lags + # log.debug("config_lagged_regressors does not exist") + return n_lags def merge_dataframes(df: pd.DataFrame) -> pd.DataFrame: diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index 58137abe4..47897221f 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -951,7 +951,9 @@ def fit( if self.fitted is True and not continue_training: log.error("Model has already been fitted. 
Re-fitting may break or produce different results.") - self.max_lags = df_utils.get_max_num_lags(self.config_lagged_regressors, self.n_lags) + self.max_lags = df_utils.get_max_num_lags( + n_lags=self.n_lags, config_lagged_regressors=self.config_lagged_regressors + ) if self.max_lags == 0 and self.n_forecasts > 1: self.n_forecasts = 1 self.predict_steps = 1 diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index d3dbffd6e..e9c299e47 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -48,10 +48,12 @@ def __init__(self, df, name, **kwargs): self.meta = OrderedDict({}) self.meta["df_name"] = name - self.predict_mode = (kwargs["predict_mode"],) - self.n_lags = (kwargs["n_lags"],) - self.n_forecasts = (kwargs["n_forecasts"],) - self.max_lags = get_max_num_lags(kwargs["config_lagged_regressors"], self.n_lags) + self.predict_mode = kwargs["predict_mode"] + self.n_lags = kwargs["n_lags"] + self.n_forecasts = kwargs["n_forecasts"] + self.max_lags = get_max_num_lags( + n_lags=self.n_lags, config_lagged_regressors=kwargs["config_lagged_regressors"] + ) self.config_args = kwargs self.two_level_inputs = [ From 3493d8abab63136f8eab1197021169ae51061ea2 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 30 Jan 2024 14:59:21 -0800 Subject: [PATCH 053/128] initial build of GlobalTimeDataset --- neuralprophet/data/process.py | 2 +- neuralprophet/time_dataset.py | 41 +++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py index 6899496fc..1b1be0b1c 100644 --- a/neuralprophet/data/process.py +++ b/neuralprophet/data/process.py @@ -500,7 +500,7 @@ def _handle_missing_data( df_grouped = df.groupby("ID").apply(lambda x: x.set_index("ds").resample(freq).asfreq()).drop(columns=["ID"]) n_missing_dates = len(df_grouped) - len(df) if n_missing_dates > 0: - df = df_grouped.reset_index(drop=True) + df = df_grouped.reset_index() log.info(f"Added {n_missing_dates} missing dates.") if config_regressors is not None: diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index e9c299e47..f3d55308e 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -86,7 +86,7 @@ def __getitem__(self, index): Parameters ---------- index : int - Sample location in dataset + Sample location in dataset, starting at 0, maximum at length-1 Returns ------- OrderedDict @@ -471,22 +471,45 @@ def __init__(self, df, **kwargs): **kwargs : dict Identical to :meth:`tabularize_univariate_datetime` """ - df_names = list(np.unique(df.loc[:, "ID"].values)) - if len(df_names) == 1: - super().__init__(df, df_names[0], **kwargs) + self.df_names = sorted(list(np.unique(df.loc[:, "ID"].values))) + if len(self.df_names) == 1: + super().__init__(df, self.df_names[0], **kwargs) else: - raise NotImplementedError + self.datasets = OrderedDict({}) + for df_name in self.df_names: + self.datasets[df_name] = TimeDataset(df[df["ID"] == df_name], df_name, **kwargs) + self.length = sum(dataset.length for (name, dataset) in self.datasets.items()) + self.global_sample_to_local_ID = np.full(shape=self.length, fill_value="__df__", dtype=str) + self.global_sample_to_local_sample = np.full(shape=self.length, fill_value=0, dtype=int) + global_position = 0 + for name, dataset in self.datasets.items(): + local_length = dataset.length + self.global_sample_to_local_ID[global_position : global_position + local_length] = name + self.global_sample_to_local_sample[global_position 
: global_position + local_length] = np.arange( + local_length, dtype=int + ) + global_position += local_length + + # raise NotImplementedError # TODO: re-implement with JIT sample computation in TimeDatase # # TODO (future): vectorize # timedatasets = [TimeDataset(df_i, df_name, **kwargs) for df_name, df_i in df.groupby("ID")] # self.combined_timedataset = [item for timedataset in timedatasets for item in timedataset] # self.length = sum(timedataset.length for timedataset in timedatasets) - # def __len__(self): - # return self.length + def __len__(self): + return self.length - # def __getitem__(self, idx): - # return self.combined_timedataset[idx] + def __getitem__(self, idx): + """Overrides parent class method to get an item at index. + Parameters + ---------- + index : int + Sample location in dataset, starting at 0 + """ + df_name = self.global_sample_to_local_ID[idx] + local_pos = self.global_sample_to_local_sample[idx] + return self.datasets[df_name].__getitem__(local_pos) def fourier_series(dates, period, series_order): From dbec862a53cbae96f8dea29a5df069f750765419 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 30 Jan 2024 15:44:29 -0800 Subject: [PATCH 054/128] refactor TimeDataset not to use kwargs passthrough --- neuralprophet/forecaster.py | 4 +- neuralprophet/time_dataset.py | 218 ++++++++++++++-------------------- 2 files changed, 92 insertions(+), 130 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index 47897221f..8af79b2f2 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -1770,8 +1770,8 @@ def predict_seasonal_components(self, df: pd.DataFrame, quantile: float = 0.5): df_i, name=df_name, config_seasonality=self.config_seasonality, - # n_lags=0, - # n_forecasts=1, + n_lags=0, + n_forecasts=1, predict_steps=self.predict_steps, predict_mode=True, config_missing=self.config_missing, diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index f3d55308e..afe354be6 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -18,7 +18,22 @@ class TimeDataset(Dataset): """Create a PyTorch dataset of a tabularized time-series""" - def __init__(self, df, name, **kwargs): + def __init__( + self, + df, + name, + predict_mode, + n_lags, + n_forecasts, + prediction_frequency, + predict_steps, + config_seasonality, + config_events, + config_country_holidays, + config_regressors, + config_lagged_regressors, + config_missing, + ): """Initialize Timedataset from time-series df. 
Parameters ---------- @@ -48,13 +63,27 @@ def __init__(self, df, name, **kwargs): self.meta = OrderedDict({}) self.meta["df_name"] = name - self.predict_mode = kwargs["predict_mode"] - self.n_lags = kwargs["n_lags"] - self.n_forecasts = kwargs["n_forecasts"] - self.max_lags = get_max_num_lags( - n_lags=self.n_lags, config_lagged_regressors=kwargs["config_lagged_regressors"] - ) - self.config_args = kwargs + self.predict_mode = predict_mode + self.n_lags = n_lags + self.n_forecasts = n_forecasts + self.prediction_frequency = prediction_frequency + self.predict_steps = predict_steps + self.config_seasonality = config_seasonality + self.config_events = config_events + self.config_country_holidays = config_country_holidays + self.config_regressors = config_regressors + self.config_lagged_regressors = config_lagged_regressors + self.config_missing = config_missing + + # self.config_args = kwargs + # self.predict_mode = kwargs["predict_mode"] + # self.n_lags = kwargs["n_lags"] + # self.n_forecasts = kwargs["n_forecasts"] + # self.config_events = kwargs["config_events"] + # self.config_country_holidays = kwargs["config_country_holidays"] + # self.config_lagged_regressors = kwargs["config_lagged_regressors"] + + self.max_lags = get_max_num_lags(n_lags=self.n_lags, config_lagged_regressors=self.config_lagged_regressors) self.two_level_inputs = [ "seasonalities", @@ -70,12 +99,12 @@ def __init__(self, df, name, **kwargs): self.multiplicative_event_and_holiday_names, ) = add_event_features_to_df( self.df, - self.config_args["config_events"], - self.config_args["config_country_holidays"], + self.config_events, + self.config_country_holidays, ) # pre-sort additive/multiplicative regressors self.additive_regressors_names, self.multiplicative_regressors_names = sort_regressor_names( - self.config_args["config_regressors"] + self.config_regressors ) # Construct index map @@ -123,8 +152,8 @@ def __getitem__(self, index): n_lags=self.n_lags, max_lags=self.max_lags, n_forecasts=self.n_forecasts, - config_seasonality=self.config_args["config_seasonality"], - config_lagged_regressors=self.config_args["config_lagged_regressors"], + config_seasonality=self.config_seasonality, + config_lagged_regressors=self.config_lagged_regressors, additive_event_and_holiday_names=self.additive_event_and_holiday_names, multiplicative_event_and_holiday_names=self.multiplicative_event_and_holiday_names, additive_regressors_names=self.additive_regressors_names, @@ -149,7 +178,7 @@ def create_sample2index_map(self, df): # Limit target range due to input lags and number of forecasts df_length = len(df) - n_forecasts = self.config_args["n_forecasts"] + n_forecasts = selfn_forecasts origin_start_end_mask = create_origin_start_end_mask( df_length=df_length, max_lags=self.max_lags, n_forecasts=n_forecasts ) @@ -157,15 +186,13 @@ def create_sample2index_map(self, df): # Prediction Frequency # Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) # analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` - prediction_frequency_mask = create_prediction_frequency_filter_mask( - df, self.config_args["prediction_frequency"] - ) + prediction_frequency_mask = create_prediction_frequency_filter_mask(df, self.prediction_frequency) # TODO Create NAN-free index mapping of sample index to df index # analogous to `self.drop_nan_after_init( # self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) nan_mask = create_nan_mask( - df, 
self.config_args["predict_steps"], self.config_args["config_missing"].drop_missing + df, self.predict_steps, self.config_missing.drop_missing ) # boolean array where NAN are False # Combine masks @@ -181,64 +208,51 @@ def create_sample2index_map(self, df): return sample_index_2_df_origin_index, num_samples - # def format_sample(self, inputs, targets=None): - # """Convert tabularized sample to correct formats. - # Parameters - # ---------- - # inputs : ordered dict - # Identical to returns from :meth:`tabularize_univariate_datetime` - # targets : np.array, float - # Identical to returns from :meth:`tabularize_univariate_datetime` - # """ - # sample_input = OrderedDict({}) - # sample_input["time"] = inputs["time"] - # if "lags" in inputs.keys(): - # sample_input["lags"] = inputs["lags"] - # inputs_dtype = { - # # "time": torch.float, - # # "timestamps": np.datetime64, - # # "lags": torch.float, - # "seasonalities": torch.float, - # "events": torch.float, - # "covariates": torch.float, - # "regressors": torch.float, - # } - - # for key, data in inputs.items(): - # if key in self.two_level_inputs: - # sample_input[key] = OrderedDict({}) - # for name, features in data.items(): - # if features.dtype != np.float32: - # features = features.astype(np.float32, copy=False) - - # tensor = torch.from_numpy(features) - - # if tensor.dtype != inputs_dtype[key]: - # sample_input[key][name] = tensor.to( - # dtype=inputs_dtype[key] - # ) # this can probably be removed, but was included in the previous code - # else: - # sample_input[key][name] = tensor - - # # No longer needed as - now directly casting to torch in tabularize - # # else: # single_level items - # # sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) - # # ## OLD - # # # if key == "timestamps": sample_input[key] = data - # # # else: sample_input[key] = torch.from_numpy(data).type(inputs_dtype[key]) - - # # TODO Can this be skipped for a single sample? - # # Alternatively, Can this be optimized? - # # Split nested dict into list of dicts with same keys as sample_input. - # # def split_dict(sample_input, index): - # # return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in sample_input.items()} - # # length = next(iter(sample_input.values())).shape[0] - # # sample_input = [split_dict(sample_input, i) for i in range(length)] - - # ## timestamps should no longer be present here? - # # sample_input.pop("timestamps") # Exact timestamps are not needed anymore - - # return sample_input, targets + +class GlobalTimeDataset(TimeDataset): + def __init__(self, df, **kwargs): + """Initialize Timedataset from time-series df. 
+ Parameters + ---------- + df : pd.DataFrame + dataframe containing column ``ds``, ``y``, and optionally``ID`` and + normalized columns normalized columns ``ds``, ``y``, ``t``, ``y_scaled`` + **kwargs : dict + Identical to :meth:`tabularize_univariate_datetime` + """ + self.df_names = sorted(list(np.unique(df.loc[:, "ID"].values))) + # if len(self.df_names) == 1: + # super().__init__(df, self.df_names[0], **kwargs) + # else: + # raise NotImplementedError + # timedatasets = [TimeDataset(df_i, df_name, **kwargs) for df_name, df_i in df.groupby("ID")] + # self.combined_timedataset = [item for timedataset in timedatasets for item in timedataset] + # self.length = sum(timedataset.length for timedataset in timedatasets) + self.datasets = OrderedDict({}) + for df_name in self.df_names: + self.datasets[df_name] = TimeDataset(df[df["ID"] == df_name], df_name, **kwargs) + self.length = sum(dataset.length for (name, dataset) in self.datasets.items()) + global_sample_to_local_ID = [] + global_sample_to_local_sample = [] + for name, dataset in self.datasets.items(): + global_sample_to_local_ID.append(np.full(shape=dataset.length, fill_value=name)) + global_sample_to_local_sample.append(np.arange(dataset.length)) + self.global_sample_to_local_ID = np.concatenate(global_sample_to_local_ID) + self.global_sample_to_local_sample = np.concatenate(global_sample_to_local_sample) + + def __len__(self): + return self.length + + def __getitem__(self, idx): + """Overrides parent class method to get an item at index. + Parameters + ---------- + index : int + Sample location in dataset, starting at 0 + """ + df_name = self.global_sample_to_local_ID[idx] + local_pos = self.global_sample_to_local_sample[idx] + return self.datasets[df_name].__getitem__(local_pos) def tabularize_univariate_datetime_single_index( @@ -460,58 +474,6 @@ def tabularize_univariate_datetime_single_index( return inputs, targets -class GlobalTimeDataset(TimeDataset): - def __init__(self, df, **kwargs): - """Initialize Timedataset from time-series df. 
- Parameters - ---------- - df : pd.DataFrame - dataframe containing column ``ds``, ``y``, and optionally``ID`` and - normalized columns normalized columns ``ds``, ``y``, ``t``, ``y_scaled`` - **kwargs : dict - Identical to :meth:`tabularize_univariate_datetime` - """ - self.df_names = sorted(list(np.unique(df.loc[:, "ID"].values))) - if len(self.df_names) == 1: - super().__init__(df, self.df_names[0], **kwargs) - else: - self.datasets = OrderedDict({}) - for df_name in self.df_names: - self.datasets[df_name] = TimeDataset(df[df["ID"] == df_name], df_name, **kwargs) - self.length = sum(dataset.length for (name, dataset) in self.datasets.items()) - self.global_sample_to_local_ID = np.full(shape=self.length, fill_value="__df__", dtype=str) - self.global_sample_to_local_sample = np.full(shape=self.length, fill_value=0, dtype=int) - global_position = 0 - for name, dataset in self.datasets.items(): - local_length = dataset.length - self.global_sample_to_local_ID[global_position : global_position + local_length] = name - self.global_sample_to_local_sample[global_position : global_position + local_length] = np.arange( - local_length, dtype=int - ) - global_position += local_length - - # raise NotImplementedError - # TODO: re-implement with JIT sample computation in TimeDatase - # # TODO (future): vectorize - # timedatasets = [TimeDataset(df_i, df_name, **kwargs) for df_name, df_i in df.groupby("ID")] - # self.combined_timedataset = [item for timedataset in timedatasets for item in timedataset] - # self.length = sum(timedataset.length for timedataset in timedatasets) - - def __len__(self): - return self.length - - def __getitem__(self, idx): - """Overrides parent class method to get an item at index. - Parameters - ---------- - index : int - Sample location in dataset, starting at 0 - """ - df_name = self.global_sample_to_local_ID[idx] - local_pos = self.global_sample_to_local_sample[idx] - return self.datasets[df_name].__getitem__(local_pos) - - def fourier_series(dates, period, series_order): """Provides Fourier series components with the specified frequency and order. Note From 254cb236ddf7e855e422ecd47aeecf97439231f7 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 30 Jan 2024 15:52:57 -0800 Subject: [PATCH 055/128] debugged seasonal components call of TimeDataset --- neuralprophet/data/process.py | 6 +++--- neuralprophet/forecaster.py | 14 +++++++++----- neuralprophet/time_dataset.py | 3 +-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py index 1b1be0b1c..85e59d0ab 100644 --- a/neuralprophet/data/process.py +++ b/neuralprophet/data/process.py @@ -620,13 +620,13 @@ def _create_dataset(model, df, predict_mode, prediction_frequency=None): predict_mode=predict_mode, n_lags=model.n_lags, n_forecasts=model.n_forecasts, + prediction_frequency=prediction_frequency, predict_steps=model.predict_steps, config_seasonality=model.config_seasonality, config_events=model.config_events, config_country_holidays=model.config_country_holidays, - config_lagged_regressors=model.config_lagged_regressors, config_regressors=model.config_regressors, + config_lagged_regressors=model.config_lagged_regressors, config_missing=model.config_missing, - prediction_frequency=prediction_frequency, - config_train=model.config_train, + # config_train=model.config_train, # no longer needed since JIT tabularization. 
)


diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py
index 8af79b2f2..4193a9ccc 100644
--- a/neuralprophet/forecaster.py
+++ b/neuralprophet/forecaster.py
@@ -1767,16 +1767,20 @@ def predict_seasonal_components(self, df: pd.DataFrame, quantile: float = 0.5):
         df_seasonal = pd.DataFrame()
         for df_name, df_i in df.groupby("ID"):
             dataset = time_dataset.TimeDataset(
-                df_i,
+                df=df_i,
                 name=df_name,
-                config_seasonality=self.config_seasonality,
+                predict_mode=True,
                 n_lags=0,
                 n_forecasts=1,
+                prediction_frequency=self.prediction_frequency,
                 predict_steps=self.predict_steps,
-                predict_mode=True,
+                config_seasonality=self.config_seasonality,
+                config_events=self.config_events,
+                config_country_holidays=self.config_country_holidays,
+                config_regressors=self.config_regressors,
+                config_lagged_regressors=self.config_lagged_regressors,
                 config_missing=self.config_missing,
-                prediction_frequency=self.prediction_frequency,
-                config_train=self.config_train,
+                # config_train=self.config_train,  # no longer needed since JIT tabularization.
             )
         loader = DataLoader(dataset, batch_size=min(4096, len(df)), shuffle=False, drop_last=False)
         predicted = {}
diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index afe354be6..2955be88e 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -178,9 +178,8 @@ def create_sample2index_map(self, df):

         # Limit target range due to input lags and number of forecasts
         df_length = len(df)
-        n_forecasts = selfn_forecasts
         origin_start_end_mask = create_origin_start_end_mask(
-            df_length=df_length, max_lags=self.max_lags, n_forecasts=n_forecasts
+            df_length=df_length, max_lags=self.max_lags, n_forecasts=self.n_forecasts
         )

         # Prediction Frequency

From 1b6940a4b81070ff8ff502cfa0c532c101ef5be1 Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Tue, 30 Jan 2024 16:17:12 -0800
Subject: [PATCH 056/128] fix numpy object type error

---
 neuralprophet/time_dataset.py | 38 ++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 2955be88e..dfe5e18cc 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -320,6 +320,8 @@ def tabularize_univariate_datetime_single_index(
         np.array, float
             Targets to be predicted of same length as each of the model inputs, dims: (n_forecasts, 1)
     """
+    # TODO: pre-process all type conversions (e.g.
torch.float32) in __init__ + # sample features are stored and returned in OrderedDict inputs = OrderedDict({}) @@ -411,22 +413,31 @@ def tabularize_univariate_datetime_single_index( if max_lags == 0: if len(additive_regressors_names) > 0: regressors["additive"] = df.loc[origin_index, additive_regressors_names].values - regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) + regressors["additive"] = torch.as_tensor( + np.array(regressors["additive"], dtype=np.float32), dtype=torch.float32 + ) if len(multiplicative_regressors_names) > 0: regressors["multiplicative"] = df.loc[origin_index, multiplicative_regressors_names].values - regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) + regressors["multiplicative"] = torch.as_tensor( + np.array(regressors["multiplicative"], dtype=np.float32), dtype=torch.float32 + ) else: if len(additive_regressors_names) > 0: regressors["additive"] = df.loc[ origin_index + 1 - n_lags : origin_index + n_forecasts, additive_regressors_names ].values - regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) - + # regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) + regressors["additive"] = torch.as_tensor( + np.array(regressors["additive"], dtype=np.float32), dtype=torch.float32 + ) if len(multiplicative_regressors_names) > 0: regressors["multiplicative"] = df.loc[ origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_regressors_names ].values - regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) + # regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) + regressors["multiplicative"] = torch.as_tensor( + np.array(regressors["multiplicative"], dtype=np.float32), dtype=torch.float32 + ) inputs["regressors"] = regressors # FUTURE EVENTS: get the events features @@ -438,22 +449,29 @@ def tabularize_univariate_datetime_single_index( if max_lags == 0: if len(additive_event_and_holiday_names) > 0: events["additive"] = df.loc[origin_index, additive_event_and_holiday_names].values - events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) + events["additive"] = torch.as_tensor( + np.array(events["additive"], dtype=np.float32), dtype=torch.float32 + ) if len(multiplicative_event_and_holiday_names) > 0: events["multiplicative"] = df.loc[origin_index, multiplicative_event_and_holiday_names].values - events["multiplicative"] = torch.as_tensor(events["multiplicative"], dtype=torch.float32) - else: + events["multiplicative"] = torch.as_tensor( + np.array(events["multiplicative"], dtype=np.float32), dtype=torch.float32 + ) if len(additive_event_and_holiday_names) > 0: events["additive"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, additive_event_and_holiday_names ].values - events["additive"] = torch.as_tensor(events["additive"], dtype=torch.float32) + events["additive"] = torch.as_tensor( + np.array(events["additive"], dtype=np.float32), dtype=torch.float32 + ) if len(multiplicative_event_and_holiday_names) > 0: events["multiplicative"] = df.loc[ origin_index + 1 : origin_index + n_forecasts, multiplicative_event_and_holiday_names ].values - events["multiplicative"] = torch.as_tensor(events["multiplicative"], dtype=torch.float32) + events["multiplicative"] = torch.as_tensor( + np.array(events["multiplicative"], dtype=np.float32), dtype=torch.float32 + ) inputs["events"] = events # ONLY FOR DEBUGGING 
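A minimal sketch of the object-dtype failure mode that the np.array(..., dtype=np.float32)
casts in this patch guard against; the column names and values are illustrative, not from
the library's test data:

    import numpy as np
    import pandas as pd
    import torch

    df = pd.DataFrame({"y": [1.0, 2.0]})
    df["event"] = pd.Series([0.0, 1.0], dtype="object")       # feature column stored as object
    row = df.loc[0, ["y", "event"]].values                    # -> object-dtype ndarray
    # torch.as_tensor(row, dtype=torch.float32)               # raises TypeError on object arrays
    safe = torch.as_tensor(np.array(row, dtype=np.float32))   # cast to float32 first, then wrap

The same pattern applies to the event and regressor selections above, where `df.loc`
row/column selections can silently come back with dtype=object.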
From edec3443a01c79de839a08c9aa0f78924ad4c771 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 31 Jan 2024 15:30:52 -0800 Subject: [PATCH 057/128] fix seasonality condition bugs --- neuralprophet/time_dataset.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index dfe5e18cc..9e6195239 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -393,14 +393,20 @@ def tabularize_univariate_datetime_single_index( t = np.array((dates - datetime(1900, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) # features: Matrix with dims (length len(dates), 2*resolution) features = np.column_stack( - [np.sin((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] - + [np.cos((2.0 * (i + 1) * np.pi * t / period.period)) for i in range(period.resolution)] + [np.sin(2.0 * (i + 1) * np.pi * t / period.period) for i in range(period.resolution)] + + [np.cos(2.0 * (i + 1) * np.pi * t / period.period) for i in range(period.resolution)] ) else: raise NotImplementedError if period.condition_name is not None: # multiply seasonality features with condition mask/values - features = features * df[period.condition_name].values[:, np.newaxis] + if max_lags == 0: + condition_values = pd.Series(df.at[origin_index, period.condition_name]).values[:, np.newaxis] + else: + condition_values = df.loc[ + origin_index - n_lags + 1 : origin_index + n_forecasts, period.condition_name + ].values[:, np.newaxis] + features = features * condition_values seasonalities[name] = torch.as_tensor(features, dtype=torch.float32) inputs["seasonalities"] = seasonalities From 5eef5f9c266b6f9817a1f8430ba7cb1488954b7c Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 31 Jan 2024 16:39:06 -0800 Subject: [PATCH 058/128] fix events and future regressor cases --- .../components/future_regressors/linear.py | 6 ++- neuralprophet/time_dataset.py | 53 +++++++++---------- neuralprophet/time_net.py | 10 ++-- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/neuralprophet/components/future_regressors/linear.py b/neuralprophet/components/future_regressors/linear.py index e8434384c..7b7685b83 100644 --- a/neuralprophet/components/future_regressors/linear.py +++ b/neuralprophet/components/future_regressors/linear.py @@ -51,8 +51,10 @@ def scalar_features_effects(self, features, params, indices=None): if indices is not None: features = features[:, :, indices] params = params[:, indices] - - return torch.sum(features.unsqueeze(dim=2) * params.unsqueeze(dim=0).unsqueeze(dim=0), dim=-1) + # features dims: (batch, n_forecasts, n_features) -> (batch, n_forecasts, 1, n_features) + # params dims: (n_quantiles, n_features) -> (batch, 1, n_quantiles, n_features) + out = torch.sum(features.unsqueeze(dim=2) * params.unsqueeze(dim=0).unsqueeze(dim=0), dim=-1) + return out # dims (batch, n_forecasts, n_quantiles) def get_reg_weights(self, name): """ diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 9e6195239..e90f4f764 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -412,72 +412,69 @@ def tabularize_univariate_datetime_single_index( # FUTURE REGRESSORS: get the future regressors features # create numpy array of values of additive and multiplicative regressors, at correct indexes - # features dims: (n_samples/batch, n_forecasts, n_features/n_regressors) + # features dims: (n_forecasts, n_features) any_future_regressors = 0 < 
len(additive_regressors_names + multiplicative_regressors_names) if any_future_regressors: # if config_regressors is not None: regressors = OrderedDict({}) if max_lags == 0: if len(additive_regressors_names) > 0: - regressors["additive"] = df.loc[origin_index, additive_regressors_names].values + features = df.loc[origin_index, additive_regressors_names].values regressors["additive"] = torch.as_tensor( - np.array(regressors["additive"], dtype=np.float32), dtype=torch.float32 + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 ) if len(multiplicative_regressors_names) > 0: - regressors["multiplicative"] = df.loc[origin_index, multiplicative_regressors_names].values + features = df.loc[origin_index, multiplicative_regressors_names].values regressors["multiplicative"] = torch.as_tensor( - np.array(regressors["multiplicative"], dtype=np.float32), dtype=torch.float32 + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 ) else: if len(additive_regressors_names) > 0: - regressors["additive"] = df.loc[ + features = df.loc[ origin_index + 1 - n_lags : origin_index + n_forecasts, additive_regressors_names ].values - # regressors["additive"] = torch.as_tensor(regressors["additive"], dtype=torch.float32) - regressors["additive"] = torch.as_tensor( - np.array(regressors["additive"], dtype=np.float32), dtype=torch.float32 - ) + # regressors["additive"] = torch.as_tensor(features, dtype=torch.float32) + regressors["additive"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) if len(multiplicative_regressors_names) > 0: - regressors["multiplicative"] = df.loc[ + features = df.loc[ origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_regressors_names ].values - # regressors["multiplicative"] = torch.as_tensor(regressors["multiplicative"], dtype=torch.float32) + # regressors["multiplicative"] = torch.as_tensor(features, dtype=torch.float32) regressors["multiplicative"] = torch.as_tensor( - np.array(regressors["multiplicative"], dtype=np.float32), dtype=torch.float32 + np.array(features, dtype=np.float32), dtype=torch.float32 ) inputs["regressors"] = regressors # FUTURE EVENTS: get the events features # create numpy array of values of additive and multiplicative events, at correct indexes - # features dims: (n_samples/batch, n_forecasts, n_features/n_events) + # features dims: (n_forecasts, n_features) any_events = 0 < len(additive_event_and_holiday_names + multiplicative_event_and_holiday_names) if any_events: events = OrderedDict({}) if max_lags == 0: + # forecasts are at origin_index if len(additive_event_and_holiday_names) > 0: - events["additive"] = df.loc[origin_index, additive_event_and_holiday_names].values + features = df.loc[origin_index, additive_event_and_holiday_names].values events["additive"] = torch.as_tensor( - np.array(events["additive"], dtype=np.float32), dtype=torch.float32 + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 ) if len(multiplicative_event_and_holiday_names) > 0: - events["multiplicative"] = df.loc[origin_index, multiplicative_event_and_holiday_names].values + features = df.loc[origin_index, multiplicative_event_and_holiday_names].values events["multiplicative"] = torch.as_tensor( - np.array(events["multiplicative"], dtype=np.float32), dtype=torch.float32 + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 ) + else: + # forecasts are at origin_index + 1 up to origin_index + n_forecasts if 
len(additive_event_and_holiday_names) > 0: - events["additive"] = df.loc[ - origin_index + 1 : origin_index + n_forecasts, additive_event_and_holiday_names + features = df.loc[ + origin_index + 1 - n_lags : origin_index + n_forecasts, additive_event_and_holiday_names ].values - events["additive"] = torch.as_tensor( - np.array(events["additive"], dtype=np.float32), dtype=torch.float32 - ) + events["additive"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) if len(multiplicative_event_and_holiday_names) > 0: - events["multiplicative"] = df.loc[ - origin_index + 1 : origin_index + n_forecasts, multiplicative_event_and_holiday_names + features = df.loc[ + origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_event_and_holiday_names ].values - events["multiplicative"] = torch.as_tensor( - np.array(events["multiplicative"], dtype=np.float32), dtype=torch.float32 - ) + events["multiplicative"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) inputs["events"] = events # ONLY FOR DEBUGGING diff --git a/neuralprophet/time_net.py b/neuralprophet/time_net.py index f2fcbeb80..8674a6482 100644 --- a/neuralprophet/time_net.py +++ b/neuralprophet/time_net.py @@ -442,19 +442,21 @@ def scalar_features_effects(self, features: torch.Tensor, params: nn.Parameter, Features (either additive or multiplicative) related to event component dims (batch, n_forecasts, n_features) params : nn.Parameter - Params (either additive or multiplicative) related to events + Params (either additive or multiplicative) related to events dims (n_quantiles, n_features) indices : list of int Indices in the feature tensors related to a particular event Returns ------- torch.Tensor - Forecast component of dims (batch, n_forecasts) + Forecast component of dims (batch, n_forecasts, n_quantiles) """ if indices is not None: features = features[:, :, indices] params = params[:, indices] - - return torch.sum(features.unsqueeze(dim=2) * params.unsqueeze(dim=0).unsqueeze(dim=0), dim=-1) + # features dims: (batch, n_forecasts, n_features) -> (batch, n_forecasts, 1, n_features) + # params dims: (n_quantiles, n_features) -> (batch, 1, n_quantiles, n_features) + out = torch.sum(features.unsqueeze(dim=2) * params.unsqueeze(dim=0).unsqueeze(dim=0), dim=-1) + return out # dims (batch, n_forecasts, n_quantiles) def auto_regression(self, lags: Union[torch.Tensor, float]) -> torch.Tensor: """Computes auto-regessive model component AR-Net. From f88e55014293b2ed50d456430cb07e6d1d9fc286 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 31 Jan 2024 16:53:07 -0800 Subject: [PATCH 059/128] fixing prediction frequency filter --- neuralprophet/time_dataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index e90f4f764..72c892ab3 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -686,9 +686,9 @@ def add_event_features_to_df( holiday_offset_name = utils.create_event_names_for_offsets(holiday, offset) df[holiday_offset_name] = feature.shift(periods=offset, fill_value=0.0) if mode == "additive": - additive_holiday_names.append(event_offset_name) + additive_holiday_names.append(holiday_offset_name) else: - multiplicative_holiday_names.append(event_offset_name) + multiplicative_holiday_names.append(holiday_offset_name) # Future TODO: possibly undo merge of events and holidays. 
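     # Hedged illustration of the offset columns created above (the dates and the exact
     # column-name format from utils.create_event_names_for_offsets are assumptions):
     # with lower_window=-1 and upper_window=1, a holiday at index 2 of
     #   feature                                  = [0, 0, 1, 0, 0]
     # contributes three shifted 0/1 columns:
     #   feature.shift(periods=-1, fill_value=0)  = [0, 1, 0, 0, 0]   # day before
     #   feature.shift(periods=0,  fill_value=0)  = [0, 0, 1, 0, 0]   # holiday itself
     #   feature.shift(periods=1,  fill_value=0)  = [0, 0, 0, 1, 0]   # day after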
additive_event_and_holiday_names = sorted(additive_events_names + additive_holiday_names) multiplicative_event_and_holiday_names = sorted(multiplicative_events_names + multiplicative_holiday_names) @@ -877,15 +877,15 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen filter_masks = [] for key, value in prediction_frequency.items(): if key == "daily-hour": - mask = timestamps.hour == value + mask = timestamps.dt.hour == value elif key == "weekly-day": - mask = timestamps.dayofweek == value + mask = timestamps.dt.dayofweek == value elif key == "monthly-day": - mask = timestamps.day == value + mask = timestamps.dt.day == value elif key == "yearly-month": - mask = timestamps.month == value + mask = timestamps.dt.month == value elif key == "hourly-minute": - mask = timestamps.minute == value + mask = timestamps.dt.minute == value else: raise ValueError(f"Invalid prediction frequency: {key}") filter_masks.append(mask) From 61aad2a0c2e36e32322d6eb8dd4ff7ffcb6265d4 Mon Sep 17 00:00:00 2001 From: Simon W Date: Thu, 1 Feb 2024 17:14:19 -0800 Subject: [PATCH 060/128] performance_test_energy --- tests/test_model_performance.py | 92 +++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 3c097d2a3..f6c30cc5a 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -5,6 +5,7 @@ import os import pathlib import time +import torch import numpy as np import pandas as pd @@ -230,3 +231,94 @@ def test_EnergyPriceDaily(): json.dump(accuracy_metrics, outfile) create_metrics_plot(metrics).write_image(os.path.join(DIR, "tests", "metrics", "EnergyPriceDaily.svg")) + + +def test_EnergyPerformance(): + ### Temporary Test for on-the-fly sampling - very time consuming! 
+ + df = pd.read_csv(ENERGY_PRICE_DAILY_FILE) + df = df[df["ds"] < "2018-01-01"] + df["temp"] = df["temperature"] + df["ds"] = pd.to_datetime(df["ds"]) + df["y"] = pd.to_numeric(df["y"], errors="coerce") + df["ID"] = "test" + + # Conditional Seasonality + df["winter"] = np.where( + df["ds"].dt.month.isin( + [ + 10, + 11, + 12, + 1, + 2, + 3, + ] + ), + 1, + 0, + ) + df["summer"] = np.where(df["ds"].dt.month.isin([4, 5, 6, 7, 8, 9]), 1, 0) + df["winter"] = pd.to_numeric(df["winter"], errors="coerce") + df["summer"] = pd.to_numeric(df["summer"], errors="coerce") + + # Normalize Temperature + df["temp"] = (df["temp"] - 65.0) / 50.0 + + # df + df = df[["ID", "ds", "y", "temp", "winter", "summer"]] + + # Hyperparameter + tuned_params = { + "n_lags": 24 * 15, + "newer_samples_weight": 2.0, + "n_changepoints": 0, + "yearly_seasonality": 10, + "weekly_seasonality": True, + "daily_seasonality": False, # due to conditional daily seasonality + "batch_size": 128, + "ar_layers": [32, 64, 32, 16], + "lagged_reg_layers": [32, 32], + # not tuned + "n_forecasts": 33, + "learning_rate": 0.001, + "epochs": 30, + "trend_global_local": "global", + "season_global_local": "global", + "drop_missing": True, + "normalize": "standardize", + } + + # Uncertainty Quantification + confidence_lv = 0.98 + quantile_list = [round(((1 - confidence_lv) / 2), 2), round((confidence_lv + (1 - confidence_lv) / 2), 2)] + + # Check if GPU is available + use_gpu = torch.cuda.is_available() + + # Set trainer configuration + trainer_configs = { + "accelerator": "gpu" if use_gpu else "cpu", + } + print(f"Using {'GPU' if use_gpu else 'CPU'}") + + # Model + m = NeuralProphet(**tuned_params, **trainer_configs, quantiles=quantile_list) + + # Lagged Regressor + m.add_lagged_regressor(names="temp", n_lags=33, normalize="standardize") + + # Conditional Seasonality + m.add_seasonality(name="winter", period=1, fourier_order=6, condition_name="winter") + m.add_seasonality(name="summer", period=1, fourier_order=6, condition_name="summer") + + # Holidays + m.add_country_holidays(country_name="US", lower_window=-1, upper_window=1) + + # Split + df_train = df[df["ds"] < "2016-05-01"] + df_test = df[df["ds"] >= "2016-05-01"] + + # Training & Predict + _ = m.fit(df=df_train, freq="H", num_workers=4, early_stopping=True) + _ = m.predict(df_test) From 661b5b754cdba7e2a84be343f87ce50cc20db0ac Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 17:41:45 -0800 Subject: [PATCH 061/128] debug events --- docs/source/code/forecaster.rst | 2 +- docs/source/code/hdays_utils.rst | 5 - neuralprophet/configure.py | 3 +- neuralprophet/event_utils.py | 90 +++++++++++ neuralprophet/hdays_utils.py | 31 ---- neuralprophet/time_dataset.py | 141 +++++++----------- neuralprophet/utils.py | 36 ----- pyproject.toml | 2 +- ...est_hdays_utils.py => test_event_utils.py} | 8 +- tests/test_unit.py | 34 ++--- 10 files changed, 171 insertions(+), 181 deletions(-) delete mode 100644 docs/source/code/hdays_utils.rst create mode 100644 neuralprophet/event_utils.py delete mode 100644 neuralprophet/hdays_utils.py rename tests/{test_hdays_utils.py => test_event_utils.py} (62%) diff --git a/docs/source/code/forecaster.rst b/docs/source/code/forecaster.rst index 26eb0b12e..d48d700f7 100644 --- a/docs/source/code/forecaster.rst +++ b/docs/source/code/forecaster.rst @@ -7,7 +7,7 @@ Core Module Documentation configure.py df_utils.py - hdays_utils.py + event_utils.py plot_forecast_plotly.py plot_forecast_matplotlib.py plot_model_parameters_plotly.py diff --git 
a/docs/source/code/hdays_utils.rst b/docs/source/code/hdays_utils.rst deleted file mode 100644 index 0b2c83a12..000000000 --- a/docs/source/code/hdays_utils.rst +++ /dev/null @@ -1,5 +0,0 @@ -Core Module Documentation -========================== - -.. automodule:: neuralprophet.hdays_utils - :members: \ No newline at end of file diff --git a/neuralprophet/configure.py b/neuralprophet/configure.py index 0c9c6458e..52b8b3f0a 100644 --- a/neuralprophet/configure.py +++ b/neuralprophet/configure.py @@ -15,6 +15,7 @@ from neuralprophet import df_utils, np_types, utils, utils_torch from neuralprophet.custom_loss_metrics import PinballLoss +from neuralprophet.event_utils import get_holiday_names log = logging.getLogger("NP.config") @@ -429,7 +430,7 @@ class Holidays: holiday_names: set = field(init=False) def init_holidays(self, df=None): - self.holiday_names = utils.get_holidays_from_country(self.country, df) + self.holiday_names = get_holiday_names(self.country, df) ConfigCountryHolidays = Holidays diff --git a/neuralprophet/event_utils.py b/neuralprophet/event_utils.py new file mode 100644 index 000000000..1633cc16c --- /dev/null +++ b/neuralprophet/event_utils.py @@ -0,0 +1,90 @@ +from collections import defaultdict +from typing import Iterable, Optional, Union + +import numpy as np +import pandas as pd +from holidays import country_holidays + +# def get_country_holidays(country: str, years: Optional[Union[int, Iterable[int]]] = None): +# """ +# Helper function to get holidays for a country. + +# Parameters +# ---------- +# country : str +# Country name to retrieve country specific holidays +# years : int, list +# Year or list of years to retrieve holidays for + +# Returns +# ------- +# set +# All possible holiday dates and names of given country + +# """ +# # For compatibility with Turkey as "TU" cases. +# country = "TUR" if country == "TU" else country +# holiday_dict = country_holidays(country=country, years=years, expand=True, observed=False) +# return holiday_dict + + +def get_holiday_names(country: Union[str, Iterable[str]], df=None): + """ + Return all possible holiday names for a list of countries over time period in df + + Parameters + ---------- + country : str, list + List of country names to retrieve country specific holidays + df : pd.Dataframe + Dataframe from which datestamps will be retrieved from + + Returns + ------- + set + All possible holiday names of given country + """ + if df is None: + years = np.arange(1995, 2045) + else: + dates = df["ds"].copy(deep=True) + years = pd.unique(dates.apply(lambda x: x.year)) + # years = list({x.year for x in dates}) + # support multiple countries, convert to list if not already + if isinstance(country, str): + country = [country] + + all_holidays = get_all_holidays(years=years, country=country) + return set(all_holidays.keys()) + + +def get_all_holidays(years, country): + """ + Make dataframe of country specific holidays for given years and countries + Parameters + ---------- + year_list : list + List of years + country : str, list + List of country names + Returns + ------- + pd.DataFrame + Containing country specific holidays df with columns 'ds' and 'holiday' + """ + # convert to list if not already + if isinstance(country, str): + country = [country] + all_holidays = defaultdict(list) + # iterate over countries and get holidays for each country + for single_country in country: + # For compatibility with Turkey as "TU" cases. 
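+        # A hedged sketch of what the country_holidays call below returns (the example
+        # entry is illustrative): a dict-like mapping of datetime.date to holiday name,
+        # e.g. {date(2019, 7, 4): "Independence Day", ...}; the loop further below then
+        # inverts it into name -> [dates], so the same holiday aggregates across years.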
+ single_country = "TUR" if single_country == "TU" else single_country + # get dict of dates and their holiday name + single_country_specific_holidays = country_holidays( + country=single_country, years=years, expand=True, observed=False + ) + # invert order - for given holiday, store list of dates + for date, name in single_country_specific_holidays.items(): + all_holidays[name].append(pd.to_datetime(date)) + return all_holidays diff --git a/neuralprophet/hdays_utils.py b/neuralprophet/hdays_utils.py deleted file mode 100644 index f827b9237..000000000 --- a/neuralprophet/hdays_utils.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import Iterable, Optional, Union - -import holidays - - -def get_country_holidays(country: str, years: Optional[Union[int, Iterable[int]]] = None): - """ - Helper function to get holidays for a country. - - Parameters - ---------- - country : str - Country name to retrieve country specific holidays - years : int, list - Year or list of years to retrieve holidays for - - Returns - ------- - set - All possible holiday dates and names of given country - - """ - substitutions = { - "TU": "TR", # For compatibility with Turkey as "TU" cases. - } - - country = substitutions.get(country, country) - if not hasattr(holidays, country): - raise AttributeError(f"Holidays in {country} are not currently supported!") - - return getattr(holidays, country)(years=years) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 72c892ab3..ddf8405f7 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -10,7 +10,7 @@ from neuralprophet import configure, utils from neuralprophet.df_utils import get_max_num_lags -from neuralprophet.hdays_utils import get_country_holidays +from neuralprophet.event_utils import get_all_holidays log = logging.getLogger("NP.time_dataset") @@ -541,35 +541,6 @@ def fourier_series_t(t, period, series_order): return features -def make_country_specific_holidays_dict(year_list, country): - """ - Make dataframe of country specific holidays for given years and countries - Parameters - ---------- - year_list : list - List of years - country : str, list - List of country names - Returns - ------- - pd.DataFrame - Containing country specific holidays df with columns 'ds' and 'holiday' - """ - # iterate over countries and get holidays for each country - # convert to list if not already - if isinstance(country, str): - country = [country] - country_specific_holidays = {} - for single_country in country: - single_country_specific_holidays = get_country_holidays(single_country, year_list) - # only add holiday if it is not already in the dict - country_specific_holidays.update(single_country_specific_holidays) - country_specific_holidays_dict = defaultdict(list) - for date, holiday in country_specific_holidays.items(): - country_specific_holidays_dict[holiday].append(pd.to_datetime(date)) - return country_specific_holidays_dict - - def get_event_offset_features(event, config, feature): """ Create event offset features for the given event, config and feature @@ -671,7 +642,7 @@ def add_event_features_to_df( multiplicative_holiday_names = [] if config_country_holidays is not None: year_list = list({x.year for x in df.ds}) - country_holidays_dict = make_country_specific_holidays_dict(year_list, config_country_holidays.country) + country_holidays_dict = get_all_holidays(year_list, config_country_holidays.country) config = config_country_holidays mode = config.mode for holiday in config_country_holidays.holiday_names: @@ -681,7 
+652,7 @@ def add_event_features_to_df( dates = country_holidays_dict[holiday] feature[df.ds.isin(dates)] = 1.0 else: - raise ValueError(f"Holiday {holiday} not found in country holidays") + raise ValueError(f"Holiday {holiday} not found in {config_country_holidays.country} holidays") for offset in range(config.lower_window, config.upper_window + 1): holiday_offset_name = utils.create_event_names_for_offsets(holiday, offset) df[holiday_offset_name] = feature.shift(periods=offset, fill_value=0.0) @@ -695,60 +666,60 @@ def add_event_features_to_df( return df, additive_event_and_holiday_names, multiplicative_event_and_holiday_names -def make_events_features(df, config_events: Optional[configure.ConfigEvents] = None, config_country_holidays=None): - """ - Construct arrays of all event features - Parameters - ---------- - df : pd.DataFrame - Dataframe with all values including the user specified events (provided by user) - config_events : configure.ConfigEvents - User specified events, each with their upper, lower windows (int), regularization - config_country_holidays : configure.ConfigCountryHolidays - Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays - Returns - ------- - np.array - All additive event features (both user specified and country specific) - np.array - All multiplicative event features (both user specified and country specific) - """ - df = df.reset_index(drop=True) - additive_events = pd.DataFrame() - multiplicative_events = pd.DataFrame() - - # create all user specified events - if config_events is not None: - for event, configs in config_events.items(): - feature = df[event] - _create_event_offset_features(event, configs, feature, additive_events, multiplicative_events) - - # create all country specific holidays - if config_country_holidays is not None: - year_list = list({x.year for x in df.ds}) - country_holidays_dict = make_country_specific_holidays_dict(year_list, config_country_holidays.country) - for holiday in config_country_holidays.holiday_names: - feature = pd.Series([0.0] * df.shape[0]) - if holiday in country_holidays_dict.keys(): - dates = country_holidays_dict[holiday] - feature[df.ds.isin(dates)] = 1.0 - _create_event_offset_features( - holiday, config_country_holidays, feature, additive_events, multiplicative_events - ) - - # Make sure column order is consistent - if not additive_events.empty: - additive_events = additive_events[sorted(additive_events.columns.tolist())] - additive_events = additive_events.values - else: - additive_events = None - if not multiplicative_events.empty: - multiplicative_events = multiplicative_events[sorted(multiplicative_events.columns.tolist())] - multiplicative_events = multiplicative_events.values - else: - multiplicative_events = None +# def make_events_features(df, config_events: Optional[configure.ConfigEvents] = None, config_country_holidays=None): +# """ +# Construct arrays of all event features +# Parameters +# ---------- +# df : pd.DataFrame +# Dataframe with all values including the user specified events (provided by user) +# config_events : configure.ConfigEvents +# User specified events, each with their upper, lower windows (int), regularization +# config_country_holidays : configure.ConfigCountryHolidays +# Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays +# Returns +# ------- +# np.array +# All additive event features (both user specified and country specific) +# np.array +# All multiplicative event features (both user 
specified and country specific) +# """ +# df = df.reset_index(drop=True) +# additive_events = pd.DataFrame() +# multiplicative_events = pd.DataFrame() + +# # create all user specified events +# if config_events is not None: +# for event, configs in config_events.items(): +# feature = df[event] +# _create_event_offset_features(event, configs, feature, additive_events, multiplicative_events) + +# # create all country specific holidays +# if config_country_holidays is not None: +# year_list = list({x.year for x in df.ds}) +# country_holidays_dict = make_country_specific_holidays_dict(year_list, config_country_holidays.country) +# for holiday in config_country_holidays.holiday_names: +# feature = pd.Series([0.0] * df.shape[0]) +# if holiday in country_holidays_dict.keys(): +# dates = country_holidays_dict[holiday] +# feature[df.ds.isin(dates)] = 1.0 +# _create_event_offset_features( +# holiday, config_country_holidays, feature, additive_events, multiplicative_events +# ) + +# # Make sure column order is consistent +# if not additive_events.empty: +# additive_events = additive_events[sorted(additive_events.columns.tolist())] +# additive_events = additive_events.values +# else: +# additive_events = None +# if not multiplicative_events.empty: +# multiplicative_events = multiplicative_events[sorted(multiplicative_events.columns.tolist())] +# multiplicative_events = multiplicative_events.values +# else: +# multiplicative_events = None - return additive_events, multiplicative_events +# return additive_events, multiplicative_events # def make_regressors_features(df, config_regressors): diff --git a/neuralprophet/utils.py b/neuralprophet/utils.py index ea245dc7f..c6fec4568 100644 --- a/neuralprophet/utils.py +++ b/neuralprophet/utils.py @@ -13,7 +13,6 @@ import torch from neuralprophet import utils_torch -from neuralprophet.hdays_utils import get_country_holidays from neuralprophet.logger import ProgressBar if TYPE_CHECKING: @@ -321,41 +320,6 @@ def config_seasonality_to_model_dims(config_seasonality: ConfigSeasonality): return seasonal_dims -def get_holidays_from_country(country: Union[str, Iterable[str]], df=None): - """ - Return all possible holiday names of given country - - Parameters - ---------- - country : str, list - List of country names to retrieve country specific holidays - df : pd.Dataframe - Dataframe from which datestamps will be retrieved from - - Returns - ------- - set - All possible holiday names of given country - """ - if df is None: - years = np.arange(1995, 2045) - else: - dates = df["ds"].copy(deep=True) - years = list({x.year for x in dates}) - # support multiple countries - if isinstance(country, str): - country = [country] - - unique_holidays = {} - for single_country in country: - holidays_country = get_country_holidays(single_country, years) - for date, name in holidays_country.items(): - if date not in unique_holidays: - unique_holidays[date] = name - holiday_names = unique_holidays.values() - return set(holiday_names) - - def config_events_to_model_dims(config_events: Optional[ConfigEvents], config_country_holidays): """ Convert user specified events configurations along with country specific diff --git a/pyproject.toml b/pyproject.toml index 876d9194d..5e1760d84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,7 +103,7 @@ include = [ "neuralprophet/forecaster.py", "neuralprophet/configure.py", "neuralprophet/df_utils.py", - "neuralprophet/hdays_utils.py", + "neuralprophet/event_utils.py", ] [tool.ruff] diff --git a/tests/test_hdays_utils.py 
b/tests/test_event_utils.py similarity index 62% rename from tests/test_hdays_utils.py rename to tests/test_event_utils.py index 691804649..c124bafc4 100644 --- a/tests/test_hdays_utils.py +++ b/tests/test_event_utils.py @@ -3,16 +3,16 @@ import holidays import pytest -from neuralprophet import hdays_utils +from neuralprophet import event_utils def test_get_country_holidays(): - assert issubclass(hdays_utils.get_country_holidays("TU").__class__, holidays.countries.turkey.TR) is True + assert issubclass(event_utils.get_country_holidays("TU").__class__, holidays.countries.turkey.TR) is True for country in ("UnitedStates", "US", "USA"): - us_holidays = hdays_utils.get_country_holidays(country, years=2019) + us_holidays = event_utils.get_country_holidays(country, years=2019) assert issubclass(us_holidays.__class__, holidays.countries.united_states.UnitedStates) is True assert len(us_holidays) == 10 with pytest.raises(AttributeError): - hdays_utils.get_country_holidays("NotSupportedCountry") + event_utils.get_country_holidays("NotSupportedCountry") diff --git a/tests/test_unit.py b/tests/test_unit.py index be4d7d55a..fc66f48d9 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -809,16 +809,13 @@ def test_make_future(): def test_too_many_NaN(): - # n_lags, n_forecasts = 12, 1 + n_lags = 12 + n_forecasts = 1 config_missing = configure.MissingDataHandling( - impute_missing=True, impute_linear=5, impute_rolling=5, drop_missing=False - ) - config_train = configure.Train( - learning_rate=None, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - loss_func="SmoothL1Loss", - optimizer="AdamW", + impute_missing=True, + impute_linear=5, + impute_rolling=5, + drop_missing=False, ) length = 100 days = pd.date_range(start="2017-01-01", periods=length) @@ -840,16 +837,19 @@ def test_too_many_NaN(): # Check if ValueError is thrown, if NaN values remain after auto-imputing with pytest.raises(ValueError): time_dataset.TimeDataset( - df, - "name", + df=df, + name="name", predict_mode=False, - config_missing=config_missing, - config_lagged_regressors=None, - config_country_holidays=None, - config_events=None, - config_train=config_train, - predict_steps=1, + n_lags=n_lags, + n_forecasts=n_forecasts, prediction_frequency=None, + predict_steps=1, + config_seasonality=None, + config_events=None, + config_country_holidays=None, + config_regressors=None, + config_lagged_regressors=None, + config_missing=config_missing, ) From 3e5dd344deb0f0363533f126189755857cbd6566 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 18:07:47 -0800 Subject: [PATCH 062/128] convert new energytest to daily data --- tests/test_model_performance.py | 118 ++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 13 deletions(-) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index f6c30cc5a..d741153c7 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -5,11 +5,11 @@ import os import pathlib import time -import torch import numpy as np import pandas as pd import plotly.graph_objects as go +import torch from plotly.subplots import make_subplots from plotly_resampler import unregister_plotly_resampler @@ -233,7 +233,7 @@ def test_EnergyPriceDaily(): create_metrics_plot(metrics).write_image(os.path.join(DIR, "tests", "metrics", "EnergyPriceDaily.svg")) -def test_EnergyPerformance(): +def test_EnergyDailyDeep(): ### Temporary Test for on-the-fly sampling - very time consuming! 
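    # Note on this conversion to daily data: the lag window keeps the same
    # calendar span (n_lags = 24 * 15 hourly steps becomes n_lags = 15 daily
    # steps, i.e. 15 days of history either way), while the horizon changes
    # from n_forecasts = 33 (hours) to 7 (days) and the fit frequency from
    # "H" to "D", as the parameter diff below shows.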
df = pd.read_csv(ENERGY_PRICE_DAILY_FILE) @@ -270,17 +270,17 @@ def test_EnergyPerformance(): # Hyperparameter tuned_params = { - "n_lags": 24 * 15, + "n_lags": 15, "newer_samples_weight": 2.0, "n_changepoints": 0, "yearly_seasonality": 10, - "weekly_seasonality": True, - "daily_seasonality": False, # due to conditional daily seasonality - "batch_size": 128, - "ar_layers": [32, 64, 32, 16], - "lagged_reg_layers": [32, 32], + "weekly_seasonality": False, # due to conditional daily seasonality + "daily_seasonality": False, # due to data freq + "batch_size": 64, + "ar_layers": [16, 32, 16, 8], + "lagged_reg_layers": [32, 16], # not tuned - "n_forecasts": 33, + "n_forecasts": 7, "learning_rate": 0.001, "epochs": 30, "trend_global_local": "global", @@ -306,11 +306,11 @@ def test_EnergyPerformance(): m = NeuralProphet(**tuned_params, **trainer_configs, quantiles=quantile_list) # Lagged Regressor - m.add_lagged_regressor(names="temp", n_lags=33, normalize="standardize") + m.add_lagged_regressor(names="temp", n_lags=7, normalize="standardize") # Conditional Seasonality - m.add_seasonality(name="winter", period=1, fourier_order=6, condition_name="winter") - m.add_seasonality(name="summer", period=1, fourier_order=6, condition_name="summer") + m.add_seasonality(name="winter", period=7, fourier_order=6, condition_name="winter") + m.add_seasonality(name="summer", period=7, fourier_order=6, condition_name="summer") # Holidays m.add_country_holidays(country_name="US", lower_window=-1, upper_window=1) @@ -320,5 +320,97 @@ def test_EnergyPerformance(): df_test = df[df["ds"] >= "2016-05-01"] # Training & Predict - _ = m.fit(df=df_train, freq="H", num_workers=4, early_stopping=True) + _ = m.fit(df=df_train, freq="D", num_workers=4) _ = m.predict(df_test) + + +# TODO: adapt to hourly dataset with multiple IDs +# def test_EnergyPerformance(): +# ### Temporary Test for on-the-fly sampling - very time consuming! 
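# Worked example of the quantile computation used in these tests: with
# confidence_lv = 0.98, (1 - 0.98) / 2 == 0.01 and 0.98 + (1 - 0.98) / 2 == 0.99,
# so quantile_list == [0.01, 0.99], i.e. a symmetric 98% prediction interval.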
+ +# df = pd.read_csv(ENERGY_PRICE_DAILY_FILE) +# df = df[df["ds"] < "2018-01-01"] +# df["temp"] = df["temperature"] +# df["ds"] = pd.to_datetime(df["ds"]) +# df["y"] = pd.to_numeric(df["y"], errors="coerce") +# df["ID"] = "test" + +# # Conditional Seasonality +# df["winter"] = np.where( +# df["ds"].dt.month.isin( +# [ +# 10, +# 11, +# 12, +# 1, +# 2, +# 3, +# ] +# ), +# 1, +# 0, +# ) +# df["summer"] = np.where(df["ds"].dt.month.isin([4, 5, 6, 7, 8, 9]), 1, 0) +# df["winter"] = pd.to_numeric(df["winter"], errors="coerce") +# df["summer"] = pd.to_numeric(df["summer"], errors="coerce") + +# # Normalize Temperature +# df["temp"] = (df["temp"] - 65.0) / 50.0 + +# # df +# df = df[["ID", "ds", "y", "temp", "winter", "summer"]] + +# # Hyperparameter +# tuned_params = { +# "n_lags": 24 * 15, +# "newer_samples_weight": 2.0, +# "n_changepoints": 0, +# "yearly_seasonality": 10, +# "weekly_seasonality": True, +# "daily_seasonality": False, # due to conditional daily seasonality +# "batch_size": 128, +# "ar_layers": [32, 64, 32, 16], +# "lagged_reg_layers": [32, 32], +# # not tuned +# "n_forecasts": 33, +# "learning_rate": 0.001, +# "epochs": 30, +# "trend_global_local": "global", +# "season_global_local": "global", +# "drop_missing": True, +# "normalize": "standardize", +# } + +# # Uncertainty Quantification +# confidence_lv = 0.98 +# quantile_list = [round(((1 - confidence_lv) / 2), 2), round((confidence_lv + (1 - confidence_lv) / 2), 2)] + +# # Check if GPU is available +# use_gpu = torch.cuda.is_available() + +# # Set trainer configuration +# trainer_configs = { +# "accelerator": "gpu" if use_gpu else "cpu", +# } +# print(f"Using {'GPU' if use_gpu else 'CPU'}") + +# # Model +# m = NeuralProphet(**tuned_params, **trainer_configs, quantiles=quantile_list) + +# # Lagged Regressor +# m.add_lagged_regressor(names="temp", n_lags=33, normalize="standardize") + +# # Conditional Seasonality +# m.add_seasonality(name="winter", period=1, fourier_order=6, condition_name="winter") +# m.add_seasonality(name="summer", period=1, fourier_order=6, condition_name="summer") + +# # Holidays +# m.add_country_holidays(country_name="US", lower_window=-1, upper_window=1) + +# # Split +# df_train = df[df["ds"] < "2016-05-01"] +# df_test = df[df["ds"] >= "2016-05-01"] + +# # Training & Predict +# _ = m.fit(df=df_train, freq="H", num_workers=4, early_stopping=True) +# _ = m.predict(df_test) From b78477b653decfb4390c7bb024884e3a9f033b47 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 18:16:18 -0800 Subject: [PATCH 063/128] fix events util reference --- tests/utils/dataset_generators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils/dataset_generators.py b/tests/utils/dataset_generators.py index 065b91162..275fd8b69 100644 --- a/tests/utils/dataset_generators.py +++ b/tests/utils/dataset_generators.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from neuralprophet.time_dataset import make_country_specific_holidays_dict +from neuralprophet.event_utils import get_all_holidays def generate_holiday_dataset(country="US", years=[2022], y_default=1, y_holiday=100, y_holidays_override={}): @@ -11,7 +11,7 @@ def generate_holiday_dataset(country="US", years=[2022], y_default=1, y_holiday= dates = pd.date_range("%i-01-01" % (years[0]), periods=periods, freq="D") df = pd.DataFrame({"ds": dates, "y": y_default}, index=dates) - holidays = make_country_specific_holidays_dict(years, country) + holidays = get_all_holidays(years, country) for holiday_name, timestamps in 
holidays.items(): df.loc[timestamps[0], "y"] = y_holidays_override.get(holiday_name, y_holiday) From 190e3b7c3147507b8e907c329a62e367e21015e4 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 18:26:40 -0800 Subject: [PATCH 064/128] fix test_get_country_holidays --- tests/test_event_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_event_utils.py b/tests/test_event_utils.py index c124bafc4..862c11c2f 100644 --- a/tests/test_event_utils.py +++ b/tests/test_event_utils.py @@ -7,12 +7,14 @@ def test_get_country_holidays(): - assert issubclass(event_utils.get_country_holidays("TU").__class__, holidays.countries.turkey.TR) is True + # deprecated + # assert issubclass(event_utils.get_country_holidays("TU").__class__, holidays.countries.turkey.TR) is True + # new format + assert issubclass(event_utils.get_all_holidays(country=["TU", "US"], years=2025).__class__, dict) is True for country in ("UnitedStates", "US", "USA"): - us_holidays = event_utils.get_country_holidays(country, years=2019) - assert issubclass(us_holidays.__class__, holidays.countries.united_states.UnitedStates) is True + us_holidays = event_utils.get_all_holidays(country=country, years=[2019, 2020]) assert len(us_holidays) == 10 - with pytest.raises(AttributeError): - event_utils.get_country_holidays("NotSupportedCountry") + with pytest.raises(NotImplementedError): + event_utils.get_holiday_names("NotSupportedCountry") From 767ca0260b1ccf59dd8b745b52accccf995b58fe Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 18:57:51 -0800 Subject: [PATCH 065/128] fix test_timedataset_minima --- neuralprophet/data/process.py | 10 ++--- tests/test_unit.py | 71 +++++++++++++++++++++++------------ 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py index 85e59d0ab..c0fd9ae04 100644 --- a/neuralprophet/data/process.py +++ b/neuralprophet/data/process.py @@ -438,14 +438,14 @@ def _check_dataframe( def _handle_missing_data( df: pd.DataFrame, - freq: Optional[str], + freq: str, n_lags: int, n_forecasts: int, config_missing, - config_regressors: Optional[ConfigFutureRegressors], - config_lagged_regressors: Optional[ConfigLaggedRegressors], - config_events: Optional[ConfigEvents], - config_seasonality: Optional[ConfigSeasonality], + config_regressors: Optional[ConfigFutureRegressors] = None, + config_lagged_regressors: Optional[ConfigLaggedRegressors] = None, + config_events: Optional[ConfigEvents] = None, + config_seasonality: Optional[ConfigSeasonality] = None, predicting: bool = False, ) -> pd.DataFrame: """ diff --git a/tests/test_unit.py b/tests/test_unit.py index fc66f48d9..8796abd95 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -68,35 +68,58 @@ def test_impute_missing(): plt.show() -def test_time_dataset(): +def test_timedataset_minimal(): # manually load any file that stores a time series, for example: df_in = pd.read_csv(AIR_FILE, index_col=False, nrows=NROWS) log.debug(f"Infile shape: {df_in.shape}") - n_lags = 3 - n_forecasts = 1 valid_p = 0.2 - config_missing = configure.MissingDataHandling() - config_train = configure.Train( - learning_rate=LR, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - loss_func="SmoothL1Loss", - optimizer="AdamW", - ) - df_train, df_val = df_utils.split_df(df_in, n_lags, n_forecasts, valid_p) - # create a tabularized dataset from time series - df, _, _ = df_utils.check_dataframe(df_train) - local_data_params, global_data_params = 
df_utils.init_data_params(df=df, normalize="minmax") - df = df.drop("ID", axis=1) - df = df_utils.normalize(df, global_data_params) - inputs, targets = time_dataset.tabularize_univariate_datetime( - df, n_lags=n_lags, n_forecasts=n_forecasts, config_missing=config_missing, config_train=config_train - ) - log.debug( - "tabularized inputs: {}".format( - "; ".join(["{}: {}".format(inp, values.shape) for inp, values in inputs.items()]) + for n_forecasts, n_lags in [(1, 0), (1, 5), (3, 5)]: + config_missing = configure.MissingDataHandling() + # config_train = configure.Train() + df, df_val = df_utils.split_df(df_in, n_lags, n_forecasts, valid_p) + # create a tabularized dataset from time series + df, _, _, _ = df_utils.prep_or_copy_df(df) + df, _, _ = df_utils.check_dataframe(df) + df = _handle_missing_data( + df, + freq="MS", + n_lags=n_lags, + n_forecasts=n_forecasts, + config_missing=config_missing, + # config_regressors: Optional[ConfigFutureRegressors], + # config_lagged_regressors: Optional[ConfigLaggedRegressors], + # config_events: Optional[ConfigEvents], + # config_seasonality: Optional[ConfigSeasonality], + predicting=False, + ) + local_data_params, global_data_params = df_utils.init_data_params(df=df, normalize="minmax") + df = df.drop("ID", axis=1) + df = df_utils.normalize(df, global_data_params) + + dataset = time_dataset.TimeDataset( + df=df, + name="name", + predict_mode=False, + n_lags=n_lags, + n_forecasts=n_forecasts, + prediction_frequency=None, + predict_steps=1, + config_seasonality=None, + config_events=None, + config_country_holidays=None, + config_regressors=None, + config_lagged_regressors=None, + config_missing=config_missing, + ) + inputs, targets, meta = dataset.__getitem__(0) + # inputs50, targets50, meta50 = dataset.__getitem__(50) + log.debug(f"(n_forecasts {n_forecasts}, n_lags {n_lags})") + log.debug(f"tabularized targets: {targets.shape}") + log.debug( + "tabularized inputs: {}".format( + "; ".join(["{}: {}".format(inp, values.shape) for inp, values in inputs.items()]) + ) ) - ) def test_normalize(): From 7e9b29d701f6345d5077a996be3595466ba4e94a Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 19:08:33 -0800 Subject: [PATCH 066/128] fix selective forecasting --- neuralprophet/df_utils.py | 2 +- tests/test_integration.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/neuralprophet/df_utils.py b/neuralprophet/df_utils.py index 7d569af98..79c6c4ea6 100644 --- a/neuralprophet/df_utils.py +++ b/neuralprophet/df_utils.py @@ -1052,7 +1052,7 @@ def add_missing_dates_nan(df, freq): df_resampled = df.resample(freq).asfreq() if "ID" in df.columns: df_resampled["ID"].fillna(df["ID"].iloc[0], inplace=True) - df_resampled.reset_index(drop=True, inplace=True) + df_resampled.reset_index(inplace=True) num_added = len(df_resampled) - len(df) return df_resampled, num_added diff --git a/tests/test_integration.py b/tests/test_integration.py index 449c560a3..6be735def 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1601,6 +1601,23 @@ def test_selective_forecasting(): date_range = pd.date_range(start=start_date, end=end_date, freq="H") y = np.random.randint(0, 1000, size=(len(date_range),)) df = pd.DataFrame({"ds": date_range, "y": y}) + m = NeuralProphet( + n_forecasts=24, + n_lags=48, + epochs=1, + batch_size=BATCH_SIZE, + learning_rate=LR, + prediction_frequency={"daily-hour": 7}, + ) + m.fit(df, freq="H") + m.predict(df) + + log.info("testing: selective forecasting with n_forecasts < 
prediction_frequency with lags") + start_date = "2019-01-01" + end_date = "2019-03-01" + date_range = pd.date_range(start=start_date, end=end_date, freq="H") + y = np.random.randint(0, 1000, size=(len(date_range),)) + df = pd.DataFrame({"ds": date_range, "y": y}) m = NeuralProphet( n_forecasts=1, n_lags=14, From 32d2cc651dafe2bed32caa42cbfc4d041472c042 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 19:20:24 -0800 Subject: [PATCH 067/128] cleanup timedataset --- neuralprophet/time_dataset.py | 188 +--------------------------------- 1 file changed, 1 insertion(+), 187 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index ddf8405f7..d9bfa1b9c 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -75,22 +75,9 @@ def __init__( self.config_lagged_regressors = config_lagged_regressors self.config_missing = config_missing - # self.config_args = kwargs - # self.predict_mode = kwargs["predict_mode"] - # self.n_lags = kwargs["n_lags"] - # self.n_forecasts = kwargs["n_forecasts"] - # self.config_events = kwargs["config_events"] - # self.config_country_holidays = kwargs["config_country_holidays"] - # self.config_lagged_regressors = kwargs["config_lagged_regressors"] - self.max_lags = get_max_num_lags(n_lags=self.n_lags, config_lagged_regressors=self.config_lagged_regressors) - self.two_level_inputs = [ - "seasonalities", - "covariates", - "events", - "regressors", - ] + self.two_level_inputs = ["seasonalities", "covariates", "events", "regressors"] # Preprocessing of events and holidays features (added to self.df) ( @@ -220,13 +207,6 @@ def __init__(self, df, **kwargs): Identical to :meth:`tabularize_univariate_datetime` """ self.df_names = sorted(list(np.unique(df.loc[:, "ID"].values))) - # if len(self.df_names) == 1: - # super().__init__(df, self.df_names[0], **kwargs) - # else: - # raise NotImplementedError - # timedatasets = [TimeDataset(df_i, df_name, **kwargs) for df_name, df_i in df.groupby("ID")] - # self.combined_timedataset = [item for timedataset in timedatasets for item in timedataset] - # self.length = sum(timedataset.length for timedataset in timedatasets) self.datasets = OrderedDict({}) for df_name in self.df_names: self.datasets[df_name] = TimeDataset(df[df["ID"] == df_name], df_name, **kwargs) @@ -263,11 +243,8 @@ def tabularize_univariate_datetime_single_index( n_forecasts: int = 1, config_seasonality: Optional[configure.ConfigSeasonality] = None, config_lagged_regressors: Optional[configure.ConfigLaggedRegressors] = None, - # config_events: Optional[configure.ConfigEvents] = None, - # config_country_holidays=None, additive_event_and_holiday_names: list[str] = [], multiplicative_event_and_holiday_names: list[str] = [], - # config_regressors: Optional[configure.ConfigFutureRegressors] = None, additive_regressors_names: list[str] = [], multiplicative_regressors_names: list[str] = [], ): @@ -432,13 +409,11 @@ def tabularize_univariate_datetime_single_index( features = df.loc[ origin_index + 1 - n_lags : origin_index + n_forecasts, additive_regressors_names ].values - # regressors["additive"] = torch.as_tensor(features, dtype=torch.float32) regressors["additive"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) if len(multiplicative_regressors_names) > 0: features = df.loc[ origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_regressors_names ].values - # regressors["multiplicative"] = torch.as_tensor(features, dtype=torch.float32) 
regressors["multiplicative"] = torch.as_tensor( np.array(features, dtype=np.float32), dtype=torch.float32 ) @@ -646,7 +621,6 @@ def add_event_features_to_df( config = config_country_holidays mode = config.mode for holiday in config_country_holidays.holiday_names: - # feature = pd.Series([0.0] * df.shape[0]) feature = pd.Series(np.zeros(df.shape[0], dtype=np.float32)) if holiday in country_holidays_dict.keys(): dates = country_holidays_dict[holiday] @@ -666,137 +640,6 @@ def add_event_features_to_df( return df, additive_event_and_holiday_names, multiplicative_event_and_holiday_names -# def make_events_features(df, config_events: Optional[configure.ConfigEvents] = None, config_country_holidays=None): -# """ -# Construct arrays of all event features -# Parameters -# ---------- -# df : pd.DataFrame -# Dataframe with all values including the user specified events (provided by user) -# config_events : configure.ConfigEvents -# User specified events, each with their upper, lower windows (int), regularization -# config_country_holidays : configure.ConfigCountryHolidays -# Configurations (holiday_names, upper, lower windows, regularization) for country specific holidays -# Returns -# ------- -# np.array -# All additive event features (both user specified and country specific) -# np.array -# All multiplicative event features (both user specified and country specific) -# """ -# df = df.reset_index(drop=True) -# additive_events = pd.DataFrame() -# multiplicative_events = pd.DataFrame() - -# # create all user specified events -# if config_events is not None: -# for event, configs in config_events.items(): -# feature = df[event] -# _create_event_offset_features(event, configs, feature, additive_events, multiplicative_events) - -# # create all country specific holidays -# if config_country_holidays is not None: -# year_list = list({x.year for x in df.ds}) -# country_holidays_dict = make_country_specific_holidays_dict(year_list, config_country_holidays.country) -# for holiday in config_country_holidays.holiday_names: -# feature = pd.Series([0.0] * df.shape[0]) -# if holiday in country_holidays_dict.keys(): -# dates = country_holidays_dict[holiday] -# feature[df.ds.isin(dates)] = 1.0 -# _create_event_offset_features( -# holiday, config_country_holidays, feature, additive_events, multiplicative_events -# ) - -# # Make sure column order is consistent -# if not additive_events.empty: -# additive_events = additive_events[sorted(additive_events.columns.tolist())] -# additive_events = additive_events.values -# else: -# additive_events = None -# if not multiplicative_events.empty: -# multiplicative_events = multiplicative_events[sorted(multiplicative_events.columns.tolist())] -# multiplicative_events = multiplicative_events.values -# else: -# multiplicative_events = None - -# return additive_events, multiplicative_events - - -# def make_regressors_features(df, config_regressors): -# """Construct arrays of all scalar regressor features -# Parameters -# ---------- -# df : pd.DataFrame -# Dataframe with all values including the user specified regressors -# config_regressors : configure.ConfigFutureRegressors -# User specified regressors config -# Returns -# ------- -# np.array -# All additive regressor features -# np.array -# All multiplicative regressor features -# """ -# additive_regressors = pd.DataFrame() -# multiplicative_regressors = pd.DataFrame() - -# for reg in df.columns: -# if reg in config_regressors: -# mode = config_regressors[reg].mode -# if mode == "additive": -# additive_regressors[reg] = df[reg] -# 
else: -# multiplicative_regressors[reg] = df[reg] - -# if not additive_regressors.empty: -# additive_regressors = additive_regressors[sorted(additive_regressors.columns.tolist())] -# additive_regressors = additive_regressors.values -# else: -# additive_regressors = None -# if not multiplicative_regressors.empty: -# multiplicative_regressors = multiplicative_regressors[sorted(multiplicative_regressors.columns.tolist())] -# multiplicative_regressors = multiplicative_regressors.values -# else: -# multiplicative_regressors = None - -# return additive_regressors, multiplicative_regressors - - -# def seasonal_features_from_dates(df, config_seasonality: configure.ConfigSeasonality): -# """Dataframe with seasonality features. -# Includes seasonality features -# Parameters -# ---------- -# df : pd.DataFrame -# Dataframe with all values -# config_seasonality : configure.ConfigSeasonality -# Configuration for seasonalities -# Returns -# ------- -# OrderedDict -# Dictionary with keys for each period name containing an np.array -# with the respective regression features. each with dims: (len(dates), 2*fourier_order) -# """ -# dates = df["ds"] -# assert len(dates.shape) == 1 -# seasonalities = OrderedDict({}) -# # Seasonality features -# for name, period in config_seasonality.periods.items(): -# if period.resolution > 0: -# if config_seasonality.computation == "fourier": -# features = fourier_series( -# dates=dates, -# period=period.period, -# series_order=period.resolution, -# ) -# else: -# raise NotImplementedError -# if period.condition_name is not None: -# features = features * df[period.condition_name].values[:, np.newaxis] -# seasonalities[name] = features -# return seasonalities - - def create_origin_start_end_mask(df_length, max_lags, n_forecasts): """Creates a boolean mask for valid prediction origin positions. (based on limiting input lags and forecast targets at start and end of df)""" @@ -839,11 +682,6 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen if prediction_frequency is None or prediction_frequency == 1: return mask - # OLD: timestamps were created from "ds" column in tabularization and then re-converted here - # timestamps = pd.to_datetime([x["timestamps"][0] for x in df]) - # OR - # timestamps = df["timestamps"].apply(lambda x: pd.to_datetime(x[0])) - timestamps = pd.to_datetime(df.loc[:, "ds"]) filter_masks = [] for key, value in prediction_frequency.items(): @@ -941,27 +779,3 @@ def sort_regressor_names(config): else: multiplicative_regressors_names.append(reg) return additive_regressors_names, multiplicative_regressors_names - - -# ## TODO: rename - used elsewhere, not in this file. -# def make_country_specific_holidays_df(year_list, country): -# return make_country_specific_holidays_dict(year_list, country) - - -# def split_nested_dict(inputs): -# """Split nested dict into list of dicts. -# Parameters -# ---------- -# inputs : ordered dict -# Nested dict to be split. -# Returns -# ------- -# list of dicts -# List of dicts with same keys as inputs. 
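# Illustrative sketch of the prediction_frequency filtering performed by
# create_prediction_frequency_filter_mask (an assumption consistent with the
# {"daily-hour": 7} usage in the tests, not a verbatim excerpt): for hourly
# data, only timestamps matching the configured hour stay valid origins.
#   timestamps = pd.to_datetime(df.loc[:, "ds"])
#   mask = mask & (timestamps.dt.hour == 7).values  # prediction_frequency={"daily-hour": 7}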
-# """ - -# def split_dict(inputs, index): -# return {k: v[index] if not isinstance(v, dict) else split_dict(v, index) for k, v in inputs.items()} - -# length = next(iter(inputs.values())).shape[0] -# return [split_dict(inputs, i) for i in range(length)] From b709f2d1b165b83962c4d62f0a201fe542b8a6e4 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 1 Feb 2024 19:45:36 -0800 Subject: [PATCH 068/128] refactor tabularize_univariate --- neuralprophet/time_dataset.py | 324 ++++++++++++++++++++-------------- 1 file changed, 190 insertions(+), 134 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index d9bfa1b9c..9ad350ce5 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -234,6 +234,153 @@ def __getitem__(self, idx): return self.datasets[df_name].__getitem__(local_pos) +def get_sample_targets(df, origin_index, n_forecasts, max_lags, predict_mode): + if predict_mode: + return torch.zeros((n_forecasts, 1), dtype=torch.float32) + else: + if n_forecasts == 1: + if max_lags == 0: + targets = df.at[origin_index, "y_scaled"] + if max_lags > 0: + targets = df.at[origin_index + 1, "y_scaled"] + targets = np.expand_dims(targets, 0) + targets = np.expand_dims(targets, 1) # extra dimension at end for quantiles:median + else: + # Note: df.loc is inclusive of slice end, while df.iloc is not. + targets = df.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values + targets = np.expand_dims(targets, 1) # extra dimension at end for quantiles:median + return torch.as_tensor(targets, dtype=torch.float32) + + +def get_sample_lagged_regressors(df, origin_index, config_lagged_regressors): + lagged_regressors = OrderedDict({}) + # Future TODO: optimize this computation for many lagged_regressors + for lagged_reg in df.columns: + if lagged_reg in config_lagged_regressors: + covar_lags = config_lagged_regressors[lagged_reg].n_lags + assert covar_lags > 0 + # Note: df.loc is inclusive of slice end, while df.iloc is not. + lagged_regressors[lagged_reg] = df.loc[origin_index - covar_lags + 1 : origin_index, lagged_reg].values + lagged_regressors[lagged_reg] = torch.as_tensor(lagged_regressors[lagged_reg], dtype=torch.float32) + return lagged_regressors + + +def get_sample_seasonalities(df, origin_index, n_forecasts, max_lags, n_lags, config_seasonality): + # TODO: precompute and save fourier features and only tabularize / slide windows when calling __getitem_ + seasonalities = OrderedDict({}) + if max_lags == 0: + dates = pd.Series(df.at[origin_index, "ds"]) + else: + # Note: df.loc is inclusive of slice end, while df.iloc is not. + dates = pd.Series(df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "ds"].values) + # Seasonality features + for name, period in config_seasonality.periods.items(): + if period.resolution > 0: + if config_seasonality.computation == "fourier": + # Compute Fourier series components with the specified frequency and order. 
+ # convert to days since epoch + t = np.array((dates - datetime(1900, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) + # features: Matrix with dims (length len(dates), 2*resolution) + features = np.column_stack( + [np.sin(2.0 * (i + 1) * np.pi * t / period.period) for i in range(period.resolution)] + + [np.cos(2.0 * (i + 1) * np.pi * t / period.period) for i in range(period.resolution)] + ) + else: + raise NotImplementedError + if period.condition_name is not None: + # multiply seasonality features with condition mask/values + if max_lags == 0: + condition_values = pd.Series(df.at[origin_index, period.condition_name]).values[:, np.newaxis] + else: + condition_values = df.loc[ + origin_index - n_lags + 1 : origin_index + n_forecasts, period.condition_name + ].values[:, np.newaxis] + features = features * condition_values + seasonalities[name] = torch.as_tensor(features, dtype=torch.float32) + return seasonalities + + +def get_sample_future_regressors( + df, origin_index, n_forecasts, max_lags, n_lags, additive_regressors_names, multiplicative_regressors_names +): + regressors = OrderedDict({}) + if max_lags == 0: + if len(additive_regressors_names) > 0: + features = df.loc[origin_index, additive_regressors_names].values + regressors["additive"] = torch.as_tensor( + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 + ) + if len(multiplicative_regressors_names) > 0: + features = df.loc[origin_index, multiplicative_regressors_names].values + regressors["multiplicative"] = torch.as_tensor( + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 + ) + else: + if len(additive_regressors_names) > 0: + features = df.loc[origin_index + 1 - n_lags : origin_index + n_forecasts, additive_regressors_names].values + regressors["additive"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) + if len(multiplicative_regressors_names) > 0: + features = df.loc[ + origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_regressors_names + ].values + regressors["multiplicative"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) + return regressors + + +def get_sample_future_events( + df, + origin_index, + n_forecasts, + max_lags, + n_lags, + additive_event_and_holiday_names, + multiplicative_event_and_holiday_names, +): + events = OrderedDict({}) + if max_lags == 0: + # forecasts are at origin_index + if len(additive_event_and_holiday_names) > 0: + features = df.loc[origin_index, additive_event_and_holiday_names].values + events["additive"] = torch.as_tensor( + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 + ) + if len(multiplicative_event_and_holiday_names) > 0: + features = df.loc[origin_index, multiplicative_event_and_holiday_names].values + events["multiplicative"] = torch.as_tensor( + np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 + ) + else: + # forecasts are at origin_index + 1 up to origin_index + n_forecasts + if len(additive_event_and_holiday_names) > 0: + features = df.loc[ + origin_index + 1 - n_lags : origin_index + n_forecasts, additive_event_and_holiday_names + ].values + events["additive"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) + + if len(multiplicative_event_and_holiday_names) > 0: + features = df.loc[ + origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_event_and_holiday_names + ].values + events["multiplicative"] = 
torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) + return events + + +def log_input_shapes(inputs): + tabularized_input_shapes_str = "" + for key, value in inputs.items(): + if key in [ + "seasonalities", + "covariates", + "events", + "regressors", + ]: + for name, period_features in value.items(): + tabularized_input_shapes_str += f" {name} {key} {period_features}\n" + else: + tabularized_input_shapes_str += f" {key} {value.shape} \n" + log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") + + def tabularize_univariate_datetime_single_index( df: pd.DataFrame, origin_index: int, @@ -297,7 +444,7 @@ def tabularize_univariate_datetime_single_index( np.array, float Targets to be predicted of same length as each of the model inputs, dims: (n_forecasts, 1) """ - # TODO: pre-process al type conversions (e.g. torch.float32) in __init__ + # TODO: pre-process all type conversions (e.g. torch.float32) in __init__ # sample features are stored and returned in OrderedDict inputs = OrderedDict({}) @@ -305,167 +452,76 @@ def tabularize_univariate_datetime_single_index( if max_lags == 0: assert n_forecasts == 1 - if predict_mode: - targets = torch.zeros((n_forecasts, 1), dtype=torch.float32) - else: - if n_forecasts == 1: - if max_lags == 0: - targets = df.at[origin_index, "y_scaled"] - if max_lags > 0: - targets = df.at[origin_index + 1, "y_scaled"] - targets = np.expand_dims(targets, 0) - targets = np.expand_dims(targets, 1) # extra dimension at end for quantiles:median - else: - # Note: df.loc is inclusive of slice end, while df.iloc is not. - targets = df.loc[origin_index + 1 : origin_index + n_forecasts, "y_scaled"].values - targets = np.expand_dims(targets, 1) # extra dimension at end for quantiles:median - targets = torch.as_tensor(targets, dtype=torch.float32) + targets = get_sample_targets( + df=df, origin_index=origin_index, n_forecasts=n_forecasts, max_lags=max_lags, predict_mode=predict_mode + ) # TIME: the time at each sample's lags and forecasts if max_lags == 0: - inputs["time"] = df.at[origin_index, "t"] - inputs["time"] = np.expand_dims(inputs["time"], 0) - inputs["time"] = torch.tensor(inputs["time"], dtype=torch.float32) - + t = df.at[origin_index, "t"] + inputs["time"] = torch.tensor(np.expand_dims(t, 0), dtype=torch.float32) else: # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index # Note: df.loc is inclusive of slice end, while df.iloc is not. - inputs["time"] = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values - inputs["time"] = torch.as_tensor(inputs["time"], dtype=torch.float32) + t = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values + inputs["time"] = torch.as_tensor(t, dtype=torch.float32) # LAGS: From y-series, extract preceeding n_lags steps up to and including origin_index if n_lags >= 1 and "y_scaled" in df.columns: # Note: df.loc is inclusive of slice end, while df.iloc is not. 
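        # Tiny illustration of that inclusivity note, for a RangeIndex df:
        #   df.loc[2:5, "t"] -> rows 2, 3, 4, 5 (end inclusive, 4 values)
        #   df.iloc[2:5]     -> rows 2, 3, 4    (end exclusive, 3 values)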
- inputs["lags"] = df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values - inputs["lags"] = torch.as_tensor(inputs["lags"], dtype=torch.float32) + lags = df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values + inputs["lags"] = torch.as_tensor(lags, dtype=torch.float32) # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS if config_lagged_regressors is not None and max_lags > 0: - lagged_regressors = OrderedDict({}) - # Future TODO: optimize this computation for many lagged_regressors - for lagged_reg in df.columns: - if lagged_reg in config_lagged_regressors: - covar_lags = config_lagged_regressors[lagged_reg].n_lags - assert covar_lags > 0 - # Note: df.loc is inclusive of slice end, while df.iloc is not. - lagged_regressors[lagged_reg] = df.loc[origin_index - covar_lags + 1 : origin_index, lagged_reg].values - lagged_regressors[lagged_reg] = torch.as_tensor(lagged_regressors[lagged_reg], dtype=torch.float32) - inputs["covariates"] = lagged_regressors - - # SEASONALITIES - # TODO: precompute and save fourier features and only tabularize / slide windows when calling __getitem__ + inputs["covariates"] = get_sample_lagged_regressors( + df=df, origin_index=origin_index, config_lagged_regressors=config_lagged_regressors + ) + + # SEASONALITIES_ if config_seasonality is not None: - seasonalities = OrderedDict({}) - if max_lags == 0: - dates = pd.Series(df.at[origin_index, "ds"]) - else: - # Note: df.loc is inclusive of slice end, while df.iloc is not. - dates = pd.Series(df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "ds"].values) - # Seasonality features - for name, period in config_seasonality.periods.items(): - if period.resolution > 0: - if config_seasonality.computation == "fourier": - # Compute Fourier series components with the specified frequency and order. 
- # convert to days since epoch - t = np.array((dates - datetime(1900, 1, 1)).dt.total_seconds().astype(np.float32)) / (3600 * 24.0) - # features: Matrix with dims (length len(dates), 2*resolution) - features = np.column_stack( - [np.sin(2.0 * (i + 1) * np.pi * t / period.period) for i in range(period.resolution)] - + [np.cos(2.0 * (i + 1) * np.pi * t / period.period) for i in range(period.resolution)] - ) - else: - raise NotImplementedError - if period.condition_name is not None: - # multiply seasonality features with condition mask/values - if max_lags == 0: - condition_values = pd.Series(df.at[origin_index, period.condition_name]).values[:, np.newaxis] - else: - condition_values = df.loc[ - origin_index - n_lags + 1 : origin_index + n_forecasts, period.condition_name - ].values[:, np.newaxis] - features = features * condition_values - seasonalities[name] = torch.as_tensor(features, dtype=torch.float32) - inputs["seasonalities"] = seasonalities + inputs["seasonalities"] = get_sample_seasonalities( + df=df, + origin_index=origin_index, + n_forecasts=n_forecasts, + max_lags=max_lags, + n_lags=n_lags, + config_seasonality=config_seasonality, + ) # FUTURE REGRESSORS: get the future regressors features # create numpy array of values of additive and multiplicative regressors, at correct indexes # features dims: (n_forecasts, n_features) any_future_regressors = 0 < len(additive_regressors_names + multiplicative_regressors_names) if any_future_regressors: # if config_regressors is not None: - regressors = OrderedDict({}) - if max_lags == 0: - if len(additive_regressors_names) > 0: - features = df.loc[origin_index, additive_regressors_names].values - regressors["additive"] = torch.as_tensor( - np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 - ) - if len(multiplicative_regressors_names) > 0: - features = df.loc[origin_index, multiplicative_regressors_names].values - regressors["multiplicative"] = torch.as_tensor( - np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 - ) - else: - if len(additive_regressors_names) > 0: - features = df.loc[ - origin_index + 1 - n_lags : origin_index + n_forecasts, additive_regressors_names - ].values - regressors["additive"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) - if len(multiplicative_regressors_names) > 0: - features = df.loc[ - origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_regressors_names - ].values - regressors["multiplicative"] = torch.as_tensor( - np.array(features, dtype=np.float32), dtype=torch.float32 - ) - inputs["regressors"] = regressors + inputs["regressors"] = get_sample_future_regressors( + df=df, + origin_index=origin_index, + n_forecasts=n_forecasts, + max_lags=max_lags, + n_lags=n_lags, + additive_regressors_names=additive_regressors_names, + multiplicative_regressors_names=multiplicative_regressors_names, + ) # FUTURE EVENTS: get the events features # create numpy array of values of additive and multiplicative events, at correct indexes # features dims: (n_forecasts, n_features) any_events = 0 < len(additive_event_and_holiday_names + multiplicative_event_and_holiday_names) if any_events: - events = OrderedDict({}) - if max_lags == 0: - # forecasts are at origin_index - if len(additive_event_and_holiday_names) > 0: - features = df.loc[origin_index, additive_event_and_holiday_names].values - events["additive"] = torch.as_tensor( - np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 - ) - if 
len(multiplicative_event_and_holiday_names) > 0: - features = df.loc[origin_index, multiplicative_event_and_holiday_names].values - events["multiplicative"] = torch.as_tensor( - np.expand_dims(np.array(features, dtype=np.float32), axis=0), dtype=torch.float32 - ) - else: - # forecasts are at origin_index + 1 up to origin_index + n_forecasts - if len(additive_event_and_holiday_names) > 0: - features = df.loc[ - origin_index + 1 - n_lags : origin_index + n_forecasts, additive_event_and_holiday_names - ].values - events["additive"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) - - if len(multiplicative_event_and_holiday_names) > 0: - features = df.loc[ - origin_index + 1 - n_lags : origin_index + n_forecasts, multiplicative_event_and_holiday_names - ].values - events["multiplicative"] = torch.as_tensor(np.array(features, dtype=np.float32), dtype=torch.float32) - inputs["events"] = events + inputs["events"] = get_sample_future_events( + df=df, + origin_index=origin_index, + n_forecasts=n_forecasts, + max_lags=max_lags, + n_lags=n_lags, + additive_event_and_holiday_names=additive_event_and_holiday_names, + multiplicative_event_and_holiday_names=multiplicative_event_and_holiday_names, + ) # ONLY FOR DEBUGGING - # tabularized_input_shapes_str = "" - # for key, value in inputs.items(): - # if key in [ - # "seasonalities", - # "covariates", - # "events", - # "regressors", - # ]: - # for name, period_features in value.items(): - # tabularized_input_shapes_str += f" {name} {key} {period_features}\n" - # else: - # tabularized_input_shapes_str += f" {key} {value.shape} \n" - # log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") + if log.level == 0: + log_input_shapes(inputs) return inputs, targets From 9fe44c4ec801829d8c5ba31dea9c8b4354e9b39f Mon Sep 17 00:00:00 2001 From: Simon W Date: Fri, 2 Feb 2024 12:15:53 -0800 Subject: [PATCH 069/128] daily_data --- tests/test_model_performance.py | 174 ++++++++++++++++---------------- 1 file changed, 85 insertions(+), 89 deletions(-) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index d741153c7..8c68bddf8 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -325,92 +325,88 @@ def test_EnergyDailyDeep(): # TODO: adapt to hourly dataset with multiple IDs -# def test_EnergyPerformance(): -# ### Temporary Test for on-the-fly sampling - very time consuming! 
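# Illustrative debug output of log_input_shapes above (values assumed, e.g.
# n_lags=3, n_forecasts=2, so the per-sample time feature spans
# n_lags + n_forecasts = 5 steps):
#   time torch.Size([5])
#   lags torch.Size([3])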
- -# df = pd.read_csv(ENERGY_PRICE_DAILY_FILE) -# df = df[df["ds"] < "2018-01-01"] -# df["temp"] = df["temperature"] -# df["ds"] = pd.to_datetime(df["ds"]) -# df["y"] = pd.to_numeric(df["y"], errors="coerce") -# df["ID"] = "test" - -# # Conditional Seasonality -# df["winter"] = np.where( -# df["ds"].dt.month.isin( -# [ -# 10, -# 11, -# 12, -# 1, -# 2, -# 3, -# ] -# ), -# 1, -# 0, -# ) -# df["summer"] = np.where(df["ds"].dt.month.isin([4, 5, 6, 7, 8, 9]), 1, 0) -# df["winter"] = pd.to_numeric(df["winter"], errors="coerce") -# df["summer"] = pd.to_numeric(df["summer"], errors="coerce") - -# # Normalize Temperature -# df["temp"] = (df["temp"] - 65.0) / 50.0 - -# # df -# df = df[["ID", "ds", "y", "temp", "winter", "summer"]] - -# # Hyperparameter -# tuned_params = { -# "n_lags": 24 * 15, -# "newer_samples_weight": 2.0, -# "n_changepoints": 0, -# "yearly_seasonality": 10, -# "weekly_seasonality": True, -# "daily_seasonality": False, # due to conditional daily seasonality -# "batch_size": 128, -# "ar_layers": [32, 64, 32, 16], -# "lagged_reg_layers": [32, 32], -# # not tuned -# "n_forecasts": 33, -# "learning_rate": 0.001, -# "epochs": 30, -# "trend_global_local": "global", -# "season_global_local": "global", -# "drop_missing": True, -# "normalize": "standardize", -# } - -# # Uncertainty Quantification -# confidence_lv = 0.98 -# quantile_list = [round(((1 - confidence_lv) / 2), 2), round((confidence_lv + (1 - confidence_lv) / 2), 2)] - -# # Check if GPU is available -# use_gpu = torch.cuda.is_available() - -# # Set trainer configuration -# trainer_configs = { -# "accelerator": "gpu" if use_gpu else "cpu", -# } -# print(f"Using {'GPU' if use_gpu else 'CPU'}") - -# # Model -# m = NeuralProphet(**tuned_params, **trainer_configs, quantiles=quantile_list) - -# # Lagged Regressor -# m.add_lagged_regressor(names="temp", n_lags=33, normalize="standardize") - -# # Conditional Seasonality -# m.add_seasonality(name="winter", period=1, fourier_order=6, condition_name="winter") -# m.add_seasonality(name="summer", period=1, fourier_order=6, condition_name="summer") - -# # Holidays -# m.add_country_holidays(country_name="US", lower_window=-1, upper_window=1) - -# # Split -# df_train = df[df["ds"] < "2016-05-01"] -# df_test = df[df["ds"] >= "2016-05-01"] - -# # Training & Predict -# _ = m.fit(df=df_train, freq="H", num_workers=4, early_stopping=True) -# _ = m.predict(df_test) +def test_EnergyHourlyDeep(): + ### Temporary Test for on-the-fly sampling - very time consuming! 
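    # The concatenation below produces NeuralProphet's long "global" format:
    # a single DataFrame with an ID column, e.g. (illustrative values, with
    # y of "test2" being 0.3 * y of "test"):
    #      ds                     y     ID
    #      2015-01-01 00:00:00  100.0   test
    #      2015-01-01 00:00:00   30.0   test2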
+ + df = pd.read_csv(ENERGY_PRICE_DAILY_FILE) + df["temp"] = df["temperature"] + df = df.drop(columns="temperature") + df["ds"] = pd.to_datetime(df["ds"]) + df["y"] = pd.to_numeric(df["y"], errors="coerce") + + df = df.drop("ds", axis=1) + df['ds'] = pd.date_range(start="2015-01-01 00:00:00", periods=len(df), freq="H") + df["ID"] = "test" + + df_id = df[['ds', 'y', 'temp']].copy() + df_id['ID'] = "test2" + df_id['y'] = df_id['y'] * 0.3 + df_id['temp'] = df_id['temp'] * 0.4 + df = pd.concat([df, df_id], ignore_index=True) + + # Conditional Seasonality + df["winter"] = np.where(df["ds"].dt.month.isin([1]), 1, 0,) + df["summer"] = np.where(df["ds"].dt.month.isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), 1, 0) + df["winter"] = pd.to_numeric(df["winter"], errors="coerce") + df["summer"] = pd.to_numeric(df["summer"], errors="coerce") + + # Normalize Temperature + df["temp"] = (df["temp"] - 65.0) / 50.0 + + # df + df = df[["ID", "ds", "y", "temp", "winter", "summer"]] + + # Hyperparameter + tuned_params = { + "n_lags": 24 * 15, + "newer_samples_weight": 2.0, + "n_changepoints": 0, + "yearly_seasonality": 10, + "weekly_seasonality": True, + "daily_seasonality": False, # due to conditional daily seasonality + "batch_size": 128, + "ar_layers": [32, 64, 32, 16], + "lagged_reg_layers": [32, 32], + # not tuned + "n_forecasts": 33, + "learning_rate": 0.001, + "epochs": 30, + "trend_global_local": "global", + "season_global_local": "global", + "drop_missing": True, + "normalize": "standardize", + } + + # Uncertainty Quantification + confidence_lv = 0.98 + quantile_list = [round(((1 - confidence_lv) / 2), 2), round((confidence_lv + (1 - confidence_lv) / 2), 2)] + + # Check if GPU is available + use_gpu = torch.cuda.is_available() + + # Set trainer configuration + trainer_configs = { + "accelerator": "gpu" if use_gpu else "cpu", + } + print(f"Using {'GPU' if use_gpu else 'CPU'}") + + # Model + m = NeuralProphet(**tuned_params, **trainer_configs, quantiles=quantile_list) + + # Lagged Regressor + m.add_lagged_regressor(names="temp", n_lags=33, normalize="standardize") + + # Conditional Seasonality + m.add_seasonality(name="winter", period=1, fourier_order=6, condition_name="winter") + m.add_seasonality(name="summer", period=1, fourier_order=6, condition_name="summer") + + # Holidays + m.add_country_holidays(country_name="US", lower_window=-1, upper_window=1) + + # Split + df_train = df[df["ds"] < "2015-03-01"] + df_test = df[df["ds"] >= "2015-03-01"] + + # Training & Predict + _ = m.fit(df=df_train, freq="H", num_workers=4, early_stopping=True) + _ = m.predict(df_test) From 7d84b37d8d2bd0b6bf0af5f50baeb74499f7d7d1 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 6 Feb 2024 16:29:40 -0800 Subject: [PATCH 070/128] start nan check for smaple mask --- neuralprophet/time_dataset.py | 162 +++++++++++++++++++++++++--------- 1 file changed, 119 insertions(+), 43 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 9ad350ce5..614c9b4ed 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import torch +from numpy.lib.stride_tricks import sliding_window_view from torch.utils.data.dataset import Dataset from neuralprophet import configure, utils @@ -76,7 +77,8 @@ def __init__( self.config_missing = config_missing self.max_lags = get_max_num_lags(n_lags=self.n_lags, config_lagged_regressors=self.config_lagged_regressors) - + if self.max_lags == 0: + assert self.n_forecasts == 1 self.two_level_inputs 
= ["seasonalities", "covariates", "events", "regressors"] # Preprocessing of events and holidays features (added to self.df) @@ -154,7 +156,6 @@ def __len__(self): def sample_index_to_df_index(self, sample_index): """Translates a single outer sample to dataframe index""" - # Will need more sophisticated mapping for GlobalTimeDataset return self.sample2index_map[sample_index] def create_sample2index_map(self, df): @@ -174,16 +175,29 @@ def create_sample2index_map(self, df): # analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` prediction_frequency_mask = create_prediction_frequency_filter_mask(df, self.prediction_frequency) + # Combine prediction origin masks + valid_prediction_mask = np.logical_and(prediction_frequency_mask, origin_start_end_mask) + # TODO Create NAN-free index mapping of sample index to df index # analogous to `self.drop_nan_after_init( # self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) nan_mask = create_nan_mask( - df, self.predict_steps, self.config_missing.drop_missing + df=df, + predict_steps=self.predict_steps, + drop_missing=self.config_missing.drop_missing, + n_lags=self.n_lags, ) # boolean array where NAN are False - # Combine masks - mask = np.logical_and(prediction_frequency_mask, origin_start_end_mask) - valid_sample_mask = np.logical_and(mask, nan_mask) + # Filter NAN + valid_sample_mask = np.logical_and(valid_prediction_mask, nan_mask) + n_clean_data_samples = sum(valid_prediction_mask) + n_real_data_samples = sum(valid_sample_mask) + nan_samples_to_drop = n_clean_data_samples - n_real_data_samples + if nan_samples_to_drop > 0 and not self.config_missing.drop_missing: + raise ValueError( + f"NANs found. {nan_samples_to_drop} samples affected. Set `drop_missing` to `True` to drop these samples." + ) + # Convert boolean valid_sample to list of the positinal index of all true/one entries # e.g. [0,0,1,1,0,1,0] -> [2,3,5] index_range = np.arange(0, df_length) @@ -445,13 +459,11 @@ def tabularize_univariate_datetime_single_index( Targets to be predicted of same length as each of the model inputs, dims: (n_forecasts, 1) """ # TODO: pre-process all type conversions (e.g. 
torch.float32) in __init__ + # Note: if max_lags == 0, then n_forecasts == 1 # sample features are stored and returned in OrderedDict inputs = OrderedDict({}) - if max_lags == 0: - assert n_forecasts == 1 - targets = get_sample_targets( df=df, origin_index=origin_index, n_forecasts=n_forecasts, max_lags=max_lags, predict_mode=predict_mode ) @@ -598,38 +610,6 @@ def get_event_offset_features(event, config, feature): return events -def _create_event_offset_features(event, config, feature, additive_events, multiplicative_events): - """ - Create event offset features for the given event, config and feature - Parameters - ---------- - event : str - Name of the event - config : configure.ConfigEvents - User specified events, holidays, and country specific holidays - feature : pd.Series - Feature for the event - additive_events : pd.DataFrame - Dataframe of additive events - multiplicative_events : pd.DataFrame - Dataframe of multiplicative events - Returns - ------- - tuple - Tuple of additive_events and multiplicative_events - """ - lw = config.lower_window - uw = config.upper_window - mode = config.mode - for offset in range(lw, uw + 1): - key = utils.create_event_names_for_offsets(event, offset) - offset_feature = feature.shift(periods=offset, fill_value=0.0) - if mode == "additive": - additive_events[key] = offset_feature - else: - multiplicative_events[key] = offset_feature - - def add_event_features_to_df( df, config_events: Optional[configure.ConfigEvents] = None, @@ -759,7 +739,7 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen return mask -def create_nan_mask(df, predict_steps, drop_missing): +def create_nan_mask(df, predict_steps, drop_missing, predict_mode, max_lags, n_lags, n_forecasts): """Creates mask for each prediction origin, accounting for corresponding input lags / forecast targets containing any NaN values. @@ -770,9 +750,105 @@ def create_nan_mask(df, predict_steps, drop_missing): predict_steps : int number of steps to predict """ + # check y: lags: + non_nan = np.ones(len(df), dtype=bool) + df_isna = df.isna() + if n_lags > 0: + # boolean vector, starting at origin_index = n_lags -1 + y_lags_nan = sliding_window_view(df_isna["y_scaled"], window_shape=n_lags, axis=0).any(axis=-1) + # fill first n_lags -1 positions with True + y_lags_nan = np.pad(y_lags_nan, pad_width=(n_lags - 1, 0), mode="constant", constant_values=True) + y_lags_valid = np.logical_not(y_lags_nan) + non_nan = np.logical_and(non_nan, y_lags_valid) + + # Targets + if predict_mode: + targets_valid = np.ones(len(df), dtype=bool) + else: + if n_forecasts == 1: + if max_lags == 0: # y-series and origin index match + targets_valid = np.logical_not(df_isna["y_scaled"].values) + if max_lags > 0: + targets_nan = df_isna.loc[1:, "y_scaled"].values + targets_nan = np.pad(targets_nan, pad_width=(1, 0), mode="constant", constant_values=True) + targets_valid = np.logical_not(targets_nan) + else: + targets_nan = sliding_window_view(df_isna["y_scaled"], window_shape=n_forecasts, axis=0).any(axis=-1) + # first entry corresponds to origin_index -1, drop this. + targets_nan = targets_nan[1:] + # pad last n_forecasts as missing, as forecast origins will have missing forecast-targets there. 
+            targets_nan = np.pad(targets_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True)
+            targets_valid = np.logical_not(targets_nan)
+
+    non_nan = np.logical_and(non_nan, targets_valid)
+    return non_nan
+
    # TIME: the time at each sample's lags and forecasts
    if max_lags == 0:
        t = df.at[origin_index, "t"]
        inputs["time"] = torch.tensor(np.expand_dims(t, 0), dtype=torch.float32)
    else:
        # extract time value of n_lags steps before and including origin_index and n_forecasts steps after origin_index
        # Note: df.loc is inclusive of slice end, while df.iloc is not.
        t = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values
        inputs["time"] = torch.as_tensor(t, dtype=torch.float32)

    # LAGS: From y-series, extract preceding n_lags steps up to and including origin_index
    if n_lags >= 1 and "y_scaled" in df.columns:
        # Note: df.loc is inclusive of slice end, while df.iloc is not.
        lags = df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values
        inputs["lags"] = torch.as_tensor(lags, dtype=torch.float32)

    # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS
    if config_lagged_regressors is not None and max_lags > 0:
        inputs["covariates"] = get_sample_lagged_regressors(
            df=df, origin_index=origin_index, config_lagged_regressors=config_lagged_regressors
        )

    # SEASONALITIES
    if config_seasonality is not None:
        inputs["seasonalities"] = get_sample_seasonalities(
            df=df,
            origin_index=origin_index,
            n_forecasts=n_forecasts,
            max_lags=max_lags,
            n_lags=n_lags,
            config_seasonality=config_seasonality,
        )

    # FUTURE REGRESSORS: get the future regressors features
    # create numpy array of values of additive and multiplicative regressors, at correct indexes
    # features dims: (n_forecasts, n_features)
    any_future_regressors = 0 < len(additive_regressors_names + multiplicative_regressors_names)
    if any_future_regressors:  # if config_regressors is not None:
        inputs["regressors"] = get_sample_future_regressors(
            df=df,
            origin_index=origin_index,
            n_forecasts=n_forecasts,
            max_lags=max_lags,
            n_lags=n_lags,
            additive_regressors_names=additive_regressors_names,
            multiplicative_regressors_names=multiplicative_regressors_names,
        )

    # FUTURE EVENTS: get the events features
    # create numpy array of values of additive and multiplicative events, at correct indexes
    # features dims: (n_forecasts, n_features)
    any_events = 0 < len(additive_event_and_holiday_names + multiplicative_event_and_holiday_names)
    if any_events:
        inputs["events"] = get_sample_future_events(
            df=df,
            origin_index=origin_index,
            n_forecasts=n_forecasts,
            max_lags=max_lags,
            n_lags=n_lags,
            additive_event_and_holiday_names=additive_event_and_holiday_names,
            multiplicative_event_and_holiday_names=multiplicative_event_and_holiday_names,
        )

    # IMPORTANT !! 
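    # Ahead of the TODO below, a hedged sketch of the intended mask-based
    # filtering (names are illustrative, mirroring create_sample2index_map):
    #   valid = np.logical_and(np.logical_and(prediction_frequency_mask, origin_start_end_mask), nan_mask)
    #   sample2index_map = np.arange(len(df))[valid]  # e.g. [0,0,1,1,0,1,0] -> [2, 3, 5]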
# TODO implement actual filtering - return np.ones(len(df), dtype=bool) + # return np.ones(len(df), dtype=bool) # Create index mapping of sample index to df index # - Filter missing samples (does not actually drop, but creates indexmapping) From 79ad0e70710cefa8b2cb0bc4c88ca07d0e8e35e9 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Tue, 6 Feb 2024 16:38:33 -0800 Subject: [PATCH 071/128] working on time nan2 --- neuralprophet/time_dataset.py | 48 ++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 614c9b4ed..1929816b6 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -763,24 +763,38 @@ def create_nan_mask(df, predict_steps, drop_missing, predict_mode, max_lags, n_l # Targets if predict_mode: + # Targets not needed targets_valid = np.ones(len(df), dtype=bool) else: - if n_forecasts == 1: - if max_lags == 0: # y-series and origin index match - targets_valid = np.logical_not(df_isna["y_scaled"].values) - if max_lags > 0: + if max_lags == 0: # y-series and origin index match + targets_valid = np.logical_not(df_isna["y_scaled"].values) + else: + if n_forecasts == 1: targets_nan = df_isna.loc[1:, "y_scaled"].values - targets_nan = np.pad(targets_nan, pad_width=(1, 0), mode="constant", constant_values=True) + targets_nan = np.pad(targets_nan, pad_width=(0, 1), mode="constant", constant_values=True) targets_valid = np.logical_not(targets_nan) - else: - targets_nan = sliding_window_view(df_isna["y_scaled"], window_shape=n_forecasts, axis=0).any(axis=-1) - # first entry corresponds to origin_index -1, drop this. - targets_nan = targets_nan[1:] - # pad last n_forecasts as missing, as forecast origins will have missing forecast-targets there. - targets_nan = np.pad(targets_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) - targets_valid = np.logical_not(targets_nan) - - non_nan = np.logical_and(non_nan, targets_valid) + else: # This is also correct for n_forecasts == 1, but slower. + targets_nan = sliding_window_view(df_isna["y_scaled"], window_shape=n_forecasts, axis=0).any(axis=-1) + # first entry corresponds to origin_index -1, drop this. + targets_nan = targets_nan[1:] + # pad last n_forecasts as missing, as forecast origins will have missing forecast-targets there. + targets_nan = np.pad(targets_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) + targets_valid = np.logical_not(targets_nan) + non_nan = np.logical_and(non_nan, targets_valid) + + # TIME: the time at each sample's lags and forecasts + if max_lags == 0: # y-series and origin_index match + time_valid = np.logical_not(df_isna["t"].values) + else: + # TODO: sliding_window_view and pad operations. + time_valid = np.ones(len(df), dtype=bool) + ## inspiration from tabularization: + # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index + # Note: df.loc is inclusive of slice end, while df.iloc is not. 
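+        # A possible vectorized sketch for the TODO above (an assumption, not
+        # the final implementation; needs numpy>=1.20 for sliding_window_view):
+        #   window = n_lags + n_forecasts
+        #   time_nan = sliding_window_view(df_isna["t"], window_shape=window).any(axis=-1)
+        #   head = n_lags - 1 if n_lags > 0 else 0
+        #   time_nan = np.pad(time_nan, pad_width=(head, n_forecasts), mode="constant", constant_values=True)
+        #   if n_lags == 0:  # first window starts at origin_index -1, drop it
+        #       time_nan = time_nan[1:]
+        #   time_valid = np.logical_not(time_nan)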
+    # t = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values
+    # inputs["time"] = torch.as_tensor(t, dtype=torch.float32)
+    non_nan = np.logical_and(non_nan, time_valid)
+
+    return non_nan
 
     # TIME: the time at each sample's lags and forecasts
@@ -793,12 +807,6 @@ def create_nan_mask(df, predict_steps, drop_missing, predict_mode, max_lags, n_l
         t = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values
         inputs["time"] = torch.as_tensor(t, dtype=torch.float32)
 
-    # LAGS: From y-series, extract preceding n_lags steps up to and including origin_index
-    if n_lags >= 1 and "y_scaled" in df.columns:
-        # Note: df.loc is inclusive of slice end, while df.iloc is not.
-        lags = df.loc[origin_index - n_lags + 1 : origin_index, "y_scaled"].values
-        inputs["lags"] = torch.as_tensor(lags, dtype=torch.float32)
-
     # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS
     if config_lagged_regressors is not None and max_lags > 0:
         inputs["covariates"] = get_sample_lagged_regressors(

From 469b11c3d9f0170a83051441b67ef4c5de02f018 Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Wed, 7 Feb 2024 15:16:27 -0800
Subject: [PATCH 072/128] fix tests

---
 neuralprophet/time_dataset.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 1929816b6..1e9993ece 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -185,7 +185,10 @@ def create_sample2index_map(self, df):
             df=df,
             predict_steps=self.predict_steps,
             drop_missing=self.config_missing.drop_missing,
+            predict_mode=self.predict_mode,
+            max_lags=self.max_lags,
             n_lags=self.n_lags,
+            n_forecasts=self.n_forecasts,
         )  # boolean array where NaNs are False
 
         # Filter NaNs

From 38f70fad412b8f3947ef88fcc53bbf5da1500fc2 Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Wed, 7 Feb 2024 17:08:17 -0800
Subject: [PATCH 073/128] finish nan-check

---
 neuralprophet/time_dataset.py | 278 +++++++++++++++++-----------------
 1 file changed, 139 insertions(+), 139 deletions(-)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 1e9993ece..acde711ea 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -189,6 +189,9 @@ def create_sample2index_map(self, df):
             max_lags=self.max_lags,
             n_lags=self.n_lags,
             n_forecasts=self.n_forecasts,
+            config_lagged_regressors=self.config_lagged_regressors,
+            future_regressor_names=self.additive_regressors_names + self.multiplicative_regressors_names,
+            event_names=self.additive_event_and_holiday_names + self.multiplicative_event_and_holiday_names,
         )  # boolean array where NaNs are False
 
         # Filter NaNs
@@ -272,13 +275,13 @@ def get_sample_targets(df, origin_index, n_forecasts, max_lags, predict_mode):
 def get_sample_lagged_regressors(df, origin_index, config_lagged_regressors):
     lagged_regressors = OrderedDict({})
     # Future TODO: optimize this computation for many lagged_regressors
-    for lagged_reg in df.columns:
-        if lagged_reg in config_lagged_regressors:
-            covar_lags = config_lagged_regressors[lagged_reg].n_lags
+    for name in df.columns:
+        if name in config_lagged_regressors:
+            covar_lags = config_lagged_regressors[name].n_lags
             assert covar_lags > 0
            # Note: df.loc is inclusive of slice end, while df.iloc is not. 
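             # Illustrative example of the inclusive-end behavior relied on here
             # (a hedged sketch, not part of the library):
             #   s = pd.Series([10, 20, 30, 40])
             #   s.loc[1:3].values   # array([20, 30, 40]) - .loc includes the end label
             #   s.iloc[1:3].values  # array([20, 30])     - .iloc excludes the end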
- lagged_regressors[lagged_reg] = df.loc[origin_index - covar_lags + 1 : origin_index, lagged_reg].values - lagged_regressors[lagged_reg] = torch.as_tensor(lagged_regressors[lagged_reg], dtype=torch.float32) + lagged_regressors[name] = df.loc[origin_index - covar_lags + 1 : origin_index, name].values + lagged_regressors[name] = torch.as_tensor(lagged_regressors[name], dtype=torch.float32) return lagged_regressors @@ -488,7 +491,7 @@ def tabularize_univariate_datetime_single_index( inputs["lags"] = torch.as_tensor(lags, dtype=torch.float32) # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS - if config_lagged_regressors is not None and max_lags > 0: + if config_lagged_regressors is not None: # and max_lags > 0: inputs["covariates"] = get_sample_lagged_regressors( df=df, origin_index=origin_index, config_lagged_regressors=config_lagged_regressors ) @@ -742,7 +745,18 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen return mask -def create_nan_mask(df, predict_steps, drop_missing, predict_mode, max_lags, n_lags, n_forecasts): +def create_nan_mask( + df, + predict_steps, + drop_missing, + predict_mode, + max_lags, + n_lags, + n_forecasts, + config_lagged_regressors, + future_regressor_names, + event_names, +): """Creates mask for each prediction origin, accounting for corresponding input lags / forecast targets containing any NaN values. @@ -753,18 +767,10 @@ def create_nan_mask(df, predict_steps, drop_missing, predict_mode, max_lags, n_l predict_steps : int number of steps to predict """ - # check y: lags: - non_nan = np.ones(len(df), dtype=bool) + valid_origins = np.ones(len(df), dtype=bool) df_isna = df.isna() - if n_lags > 0: - # boolean vector, starting at origin_index = n_lags -1 - y_lags_nan = sliding_window_view(df_isna["y_scaled"], window_shape=n_lags, axis=0).any(axis=-1) - # fill first n_lags -1 positions with True - y_lags_nan = np.pad(y_lags_nan, pad_width=(n_lags - 1, 0), mode="constant", constant_values=True) - y_lags_valid = np.logical_not(y_lags_nan) - non_nan = np.logical_and(non_nan, y_lags_valid) - # Targets + # TARGETS if predict_mode: # Targets not needed targets_valid = np.ones(len(df), dtype=bool) @@ -773,7 +779,7 @@ def create_nan_mask(df, predict_steps, drop_missing, predict_mode, max_lags, n_l targets_valid = np.logical_not(df_isna["y_scaled"].values) else: if n_forecasts == 1: - targets_nan = df_isna.loc[1:, "y_scaled"].values + targets_nan = df_isna["y_scaled"].values[1:] targets_nan = np.pad(targets_nan, pad_width=(0, 1), mode="constant", constant_values=True) targets_valid = np.logical_not(targets_nan) else: # This is also correct for n_forecasts == 1, but slower. @@ -783,129 +789,123 @@ def create_nan_mask(df, predict_steps, drop_missing, predict_mode, max_lags, n_l # pad last n_forecasts as missing, as forecast origins will have missing forecast-targets there. targets_nan = np.pad(targets_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) targets_valid = np.logical_not(targets_nan) - non_nan = np.logical_and(non_nan, targets_valid) - - # TIME: the time at each sample's lags and forecasts - if max_lags == 0: # y-series and origin_index match - time_valid = np.logical_not(df_isna["t"].values) - else: - # TODO: sliding_window_view and pad operations. 
- time_valid = np.ones(len(df), dtype=bool) - ## inspiration from tabularization: - # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index - # Note: df.loc is inclusive of slice end, while df.iloc is not. - # t = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values - # inputs["time"] = torch.as_tensor(t, dtype=torch.float32) - non_nan = np.logical_and(non_nan, time_valid) - - return non_nan - - # TIME: the time at each sample's lags and forecasts - if max_lags == 0: - t = df.at[origin_index, "t"] - inputs["time"] = torch.tensor(np.expand_dims(t, 0), dtype=torch.float32) - else: - # extract time value of n_lags steps before and icluding origin_index and n_forecasts steps after origin_index - # Note: df.loc is inclusive of slice end, while df.iloc is not. - t = df.loc[origin_index - n_lags + 1 : origin_index + n_forecasts, "t"].values - inputs["time"] = torch.as_tensor(t, dtype=torch.float32) - - # COVARIATES / LAGGED REGRESSORS: Lagged regressor inputs: analogous to LAGS - if config_lagged_regressors is not None and max_lags > 0: - inputs["covariates"] = get_sample_lagged_regressors( - df=df, origin_index=origin_index, config_lagged_regressors=config_lagged_regressors - ) - - # SEASONALITIES_ - if config_seasonality is not None: - inputs["seasonalities"] = get_sample_seasonalities( - df=df, - origin_index=origin_index, - n_forecasts=n_forecasts, - max_lags=max_lags, - n_lags=n_lags, - config_seasonality=config_seasonality, - ) + valid_origins = np.logical_and(valid_origins, targets_valid) - # FUTURE REGRESSORS: get the future regressors features - # create numpy array of values of additive and multiplicative regressors, at correct indexes - # features dims: (n_forecasts, n_features) - any_future_regressors = 0 < len(additive_regressors_names + multiplicative_regressors_names) - if any_future_regressors: # if config_regressors is not None: - inputs["regressors"] = get_sample_future_regressors( - df=df, - origin_index=origin_index, - n_forecasts=n_forecasts, - max_lags=max_lags, - n_lags=n_lags, - additive_regressors_names=additive_regressors_names, - multiplicative_regressors_names=multiplicative_regressors_names, - ) - - # FUTURE EVENTS: get the events features - # create numpy array of values of additive and multiplicative events, at correct indexes - # features dims: (n_forecasts, n_features) - any_events = 0 < len(additive_event_and_holiday_names + multiplicative_event_and_holiday_names) - if any_events: - inputs["events"] = get_sample_future_events( - df=df, - origin_index=origin_index, - n_forecasts=n_forecasts, - max_lags=max_lags, - n_lags=n_lags, - additive_event_and_holiday_names=additive_event_and_holiday_names, - multiplicative_event_and_holiday_names=multiplicative_event_and_holiday_names, - ) - - # IMPORTANT !! - # TODO implement actual filtering - # return np.ones(len(df), dtype=bool) - - # Create index mapping of sample index to df index - # - Filter missing samples (does not actually drop, but creates indexmapping) - # -- drop nan analogous to `self.drop_nan_after_init(self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) - # Note: needs to also account for NANs in lagged inputs or in n_forecasts, not just first target. - # Implement a convolutional filter for targets and each lagged regressor. - # Also account for future regressors and events. 
- - # Rewrite to return mask instead of filtering df: - nan_idx = [] - # NaNs in inputs - for key, data in self.inputs.items(): - if isinstance(data, torch.Tensor): - nans = torch.where(torch.isnan(data))[0].tolist() - if len(nans) > 0: - nan_idx += nans - elif isinstance(data, dict): - for subkey, subdata in data.items(): - nans = torch.where(torch.isnan(subdata))[0].tolist() - if len(nans) > 0: - nan_idx += nans - - # NaNs in targets that are not inserted for prediction at the end - nans = torch.where(torch.isnan(self.targets))[0].tolist() - if len(nans) > 0: - for idx in nans: - if idx not in nan_idx and idx < len(self) - predict_steps: - nan_idx.append(idx) - - nan_idx = list(set(nan_idx)) - nan_idx.sort() - if drop_missing and len(nan_idx) > 0: - log.warning(f"{len(nan_idx)} samples with missing values were dropped from the data. ") - for key, data in self.inputs.items(): - if key not in ["time", "lags"]: # "time_lagged" - for name, features in data.items(): - self.inputs[key][name] = np.delete(self.inputs[key][name], nan_idx, 0) + # AR LAGS + if n_lags > 0: + # boolean vector, starting at origin_index = n_lags -1 + y_lags_nan = sliding_window_view(df_isna["y_scaled"], window_shape=n_lags, axis=0).any(axis=-1) + # fill first n_lags -1 positions with True + # as there are missing lags for the corresponding origin_indexes + y_lags_nan = np.pad(y_lags_nan, pad_width=(n_lags - 1, 0), mode="constant", constant_values=True) + y_lags_valid = np.logical_not(y_lags_nan) + valid_origins = np.logical_and(valid_origins, y_lags_valid) + + # LAGGED REGRESSORS + if config_lagged_regressors is not None: # and max_lags > 0: + reg_lags_valid = np.ones(len(df), dtype=bool) + for name in df.columns: + if name in config_lagged_regressors: + n_reg_lags = config_lagged_regressors[name].n_lags + if n_reg_lags > 0: + # boolean vector, starting at origin_index = n_lags -1 + reg_lags_nan = sliding_window_view(df_isna[name], window_shape=n_reg_lags, axis=0).any(axis=-1) + # fill first n_reg_lags -1 positions with True, + # as there are missing lags for the corresponding origin_indexes + reg_lags_nan = np.pad( + reg_lags_nan, pad_width=(n_reg_lags - 1, 0), mode="constant", constant_values=True + ) + reg_lags_valid_i = np.logical_not(reg_lags_nan) + reg_lags_valid = np.logical_and(reg_lags_valid, reg_lags_valid_i) + valid_origins = np.logical_and(valid_origins, reg_lags_valid) + + # TIME: TREND & SEASONALITY: the time at each sample's lags and forecasts + # FUTURE REGRESSORS + # EVENTS + for names in [["t"], future_regressor_names, event_names]: + if len(names) > 0: + valid_columns = mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts) + valid_origins = np.logical_and(valid_origins, valid_columns) + + # # TIME: TREND & SEASONALITY: the time at each sample's lags and forecasts + # if max_lags == 0: # y-series and origin_index match + # time_valid = np.logical_not(df_isna["t"].values) + # else: + # time_nan = sliding_window_view(df_isna["t"], window_shape=n_lags+n_forecasts, axis=0).any(axis=-1) + # # first sample is at origin_index = n_lags -1, + # if n_lags == 0: # first sample origin index is at -1 + # time_nan = time_nan[1:] + # else: + # time_nan = np.pad(time_nan, pad_width=(n_lags-1, 0), mode="constant", constant_values=True) + # # there are n_forecasts origin_indexes missing at end + # time_nan = np.pad(time_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) + # time_valid = np.logical_not(time_nan) + # non_nan = np.logical_and(non_nan, time_valid) + + # # 
FUTURE REGRESSORS + # if len(future_regressor_names) > 0: + # if max_lags == 0: + # fut_reg_nan = df_isna.loc[:, future_regressor_names] + # assert len(fut_reg_nan.shape) == 2 + # fut_reg_nan = fut_reg_nan.any(axis=-1) + # else: + # fut_reg_nan = sliding_window_view(df_isna.loc[:, future_regressor_names], window_shape=n_lags+n_forecasts, axis=0).any(axis=-1) + # assert len(fut_reg_nan.shape) == 2 + # fut_reg_nan = fut_reg_nan.any(axis=-1) + # # first sample is at origin_index = n_lags -1, + # if n_lags == 0: # first sample origin index is at -1 + # fut_reg_nan = fut_reg_nan[1:] + # else: + # fut_reg_nan = np.pad(fut_reg_nan, pad_width=(n_lags-1, 0), mode="constant", constant_values=True) + # # there are n_forecasts origin_indexes missing at end + # fut_reg_nan = np.pad(fut_reg_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) + # fut_reg_valid = np.logical_not(fut_reg_nan) + # non_nan = np.logical_and(non_nan, fut_reg_valid) + + # # EVENTS + # if len(event_names) > 0: + # if max_lags == 0: + # event_nan = df_isna.loc[:, event_names] + # assert len(event_nan.shape) == 2 + # event_nan = event_nan.any(axis=-1) + # else: + # event_nan = sliding_window_view(df_isna.loc[:, event_names], window_shape=n_lags+n_forecasts, axis=0).any(axis=-1) + # assert len(event_nan.shape) == 2 + # event_nan = event_nan.any(axis=-1) + # # first sample is at origin_index = n_lags -1, + # if n_lags == 0: # first sample origin index is at -1 + # event_nan = event_nan[1:] + # else: + # event_nan = np.pad(event_nan, pad_width=(n_lags-1, 0), mode="constant", constant_values=True) + # # there are n_forecasts origin_indexes missing at end + # event_nan = np.pad(event_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) + # event_valid = np.logical_not(event_nan) + # non_nan = np.logical_and(non_nan, event_valid) + + return valid_origins + + +def mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts): + # assert len(names) > 0 + contains_nan = df_isna.loc[:, names] + if len(contains_nan.shape) > 1: + assert len(contains_nan.shape) == 2 + contains_nan = contains_nan.any(axis=-1) + if max_lags > 0: + if n_lags == 0 and n_forecasts == 1: + contains_nan = contains_nan[1:] + contains_nan = np.pad(contains_nan, pad_width=(0, 1), mode="constant", constant_values=True) + else: + contains_nan = sliding_window_view(contains_nan, window_shape=n_lags + n_forecasts, axis=0).any(axis=-1) + # first sample is at origin_index = n_lags -1, + if n_lags == 0: # first sample origin index is at -1 + contains_nan = contains_nan[1:] else: - self.inputs[key] = np.delete(self.inputs[key], nan_idx, 0) - self.targets = np.delete(self.targets, nan_idx, 0) - self.length = self.inputs["time"].shape[0] - if not drop_missing and len(nan_idx) > 0: - raise ValueError( - "Inputs/targets with missing values detected. " - "Please either adjust imputation parameters, or set 'drop_missing' to True to drop those samples." 
- ) + contains_nan = np.pad(contains_nan, pad_width=(n_lags - 1, 0), mode="constant", constant_values=True) + # there are n_forecasts origin_indexes missing at end + contains_nan = np.pad(contains_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) + valid_origins = np.logical_not(contains_nan) + return valid_origins def sort_regressor_names(config): From cfb2562c921e15a01aa63747d08831526d65e1f9 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 17:12:14 -0800 Subject: [PATCH 074/128] fix dims --- neuralprophet/time_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index acde711ea..21072c5d3 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -826,6 +826,7 @@ def create_nan_mask( if len(names) > 0: valid_columns = mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts) valid_origins = np.logical_and(valid_origins, valid_columns) + return valid_origins # # TIME: TREND & SEASONALITY: the time at each sample's lags and forecasts # if max_lags == 0: # y-series and origin_index match @@ -840,7 +841,7 @@ def create_nan_mask( # # there are n_forecasts origin_indexes missing at end # time_nan = np.pad(time_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) # time_valid = np.logical_not(time_nan) - # non_nan = np.logical_and(non_nan, time_valid) + # valid_origins = np.logical_and(valid_origins, time_valid) # # FUTURE REGRESSORS # if len(future_regressor_names) > 0: @@ -860,7 +861,7 @@ def create_nan_mask( # # there are n_forecasts origin_indexes missing at end # fut_reg_nan = np.pad(fut_reg_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) # fut_reg_valid = np.logical_not(fut_reg_nan) - # non_nan = np.logical_and(non_nan, fut_reg_valid) + # valid_origins = np.logical_and(valid_origins, fut_reg_valid) # # EVENTS # if len(event_names) > 0: @@ -880,9 +881,8 @@ def create_nan_mask( # # there are n_forecasts origin_indexes missing at end # event_nan = np.pad(event_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) # event_valid = np.logical_not(event_nan) - # non_nan = np.logical_and(non_nan, event_valid) - - return valid_origins + # valid_origins = np.logical_and(valid_origins, event_valid) + # return valid_origins def mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts): @@ -890,7 +890,7 @@ def mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_fore contains_nan = df_isna.loc[:, names] if len(contains_nan.shape) > 1: assert len(contains_nan.shape) == 2 - contains_nan = contains_nan.any(axis=-1) + contains_nan = contains_nan.any(axis=1) if max_lags > 0: if n_lags == 0 and n_forecasts == 1: contains_nan = contains_nan[1:] From e320b22a8fa553417885af3902dd0a422a4d645d Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 17:26:09 -0800 Subject: [PATCH 075/128] pass self.df to indexing --- neuralprophet/time_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 21072c5d3..8e1760c76 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -97,7 +97,7 @@ def __init__( ) # Construct index map - self.sample2index_map, self.length = self.create_sample2index_map(df) + self.sample2index_map, self.length = self.create_sample2index_map(self.df) def __getitem__(self, index): 
"""Overrides parent class method to get an item at index. @@ -888,9 +888,9 @@ def create_nan_mask( def mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts): # assert len(names) > 0 contains_nan = df_isna.loc[:, names] - if len(contains_nan.shape) > 1: - assert len(contains_nan.shape) == 2 - contains_nan = contains_nan.any(axis=1) + # if len(contains_nan.shape) > 1: + # assert len(contains_nan.shape) == 2 + contains_nan = contains_nan.any(axis=1) if max_lags > 0: if n_lags == 0 and n_forecasts == 1: contains_nan = contains_nan[1:] From 7f7be5ff1900d4b08fe29b2532eb11cac31b94b2 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 17:47:54 -0800 Subject: [PATCH 076/128] fix zero dim lagged regressors --- neuralprophet/forecaster.py | 6 ++---- tests/test_integration.py | 7 ++++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index 4193a9ccc..bb5dbfc86 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -510,10 +510,8 @@ def add_lagged_regressor( lagged_reg_layers = self.config_model.lagged_reg_layers if n_lags == 0 or n_lags is None: - n_lags = 0 - log.warning( - "Please, set n_lags to a value greater than 0 or to the options 'scalar' or 'auto'. No lags will be " - + "added to regressors when n_lags = 0 or n_lags is None" + raise ValueError( + f"Received n_lags {n_lags} for lagged regressor {names}. Please set n_lags > 0 or use options 'scalar' or 'auto'." ) if n_lags == "auto": if self.n_lags is not None and self.n_lags > 0: diff --git a/tests/test_integration.py b/tests/test_integration.py index 6be735def..c4fb0a0dd 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1505,9 +1505,10 @@ def test_n_lags_for_regressors(): n_forecasts=2, n_lags=2, ) - m = m.add_lagged_regressor(names="A", n_lags=0) - m = m.add_lagged_regressor(names="B", n_lags=0) - with pytest.raises(AssertionError): + + with pytest.raises(ValueError): + m = m.add_lagged_regressor(names="A", n_lags=0) + m = m.add_lagged_regressor(names="B", n_lags=0) m.fit(df1, freq="D") From d00d5f92b8a0b256fe88c6e23d4d1977139fccc5 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 17:51:18 -0800 Subject: [PATCH 077/128] close figures in tests --- tests/test_plotting.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 6c13ad55b..1c18df09d 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -4,6 +4,7 @@ import os import pathlib +import matplotlib import pandas as pd import pytest @@ -72,6 +73,7 @@ def test_plot(plotting_backend): fig6.show() fig7.show() fig8.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -114,6 +116,7 @@ def test_plot_components(plotting_backend): fig2.show() fig3.show() fig4.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -154,6 +157,7 @@ def test_plot_parameters(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -222,6 +226,7 @@ def test_plot_global_local_parameters(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -290,6 +295,7 @@ def test_plot_events(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -356,6 +362,7 @@ def 
test_plot_events_additive(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -394,6 +401,7 @@ def test_plot_events_components(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -419,6 +427,7 @@ def test_plot_trend(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -490,6 +499,7 @@ def test_plot_seasonality(plotting_backend): fig4.show() fig5.show() fig6.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -519,6 +529,7 @@ def test_plot_daily_seasonality(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -556,6 +567,7 @@ def test_plot_lag_reg(plotting_backend): fig2.show() fig3.show() fig4.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -585,6 +597,7 @@ def test_plot_future_reg(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -643,6 +656,7 @@ def test_plot_uncertainty(plotting_backend): fig5.show() fig6.show() fig7.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -706,6 +720,7 @@ def test_plot_conformal_prediction(plotting_backend): fig3.show() fig4.show() fig5.show() + matplotlib.pyplot.close("all") def test_advanced_conformal_prediction_plots(): @@ -734,6 +749,7 @@ def test_advanced_conformal_prediction_plots(): fig0 = m.conformal_plot(forecast) if PLOT: fig0.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -763,6 +779,7 @@ def test_plot_conformal_prediction_asymmetric(plotting_backend): fig0.show() fig1.show() fig2.show() + matplotlib.pyplot.close("all") @pytest.mark.parametrize(*decorator_input) @@ -791,6 +808,7 @@ def test_plot_latest_forecast(plotting_backend): fig1.show() fig2.show() fig3.show() + matplotlib.pyplot.close("all") def test_plotting_backend_options(): @@ -842,3 +860,4 @@ def test_plotting_backend_options(): fig10.show() fig11.show() fig12.show() + matplotlib.pyplot.close("all") From df5051d9efdd684b01e64a96355a0036e9b0a223 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 17:56:58 -0800 Subject: [PATCH 078/128] fix typings --- neuralprophet/time_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 8e1760c76..e84aeea81 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -1,7 +1,7 @@ import logging from collections import OrderedDict, defaultdict from datetime import datetime -from typing import Optional +from typing import List, Optional import numpy as np import pandas as pd @@ -410,10 +410,10 @@ def tabularize_univariate_datetime_single_index( n_forecasts: int = 1, config_seasonality: Optional[configure.ConfigSeasonality] = None, config_lagged_regressors: Optional[configure.ConfigLaggedRegressors] = None, - additive_event_and_holiday_names: list[str] = [], - multiplicative_event_and_holiday_names: list[str] = [], - additive_regressors_names: list[str] = [], - multiplicative_regressors_names: list[str] = [], + additive_event_and_holiday_names: List[str] = [], + multiplicative_event_and_holiday_names: List[str] = [], + additive_regressors_names: List[str] = [], + 
multiplicative_regressors_names: List[str] = [], ): """Create a tabular data sample from timeseries dataframe, used for mini-batch creation. Note From d3bce01cb037397a5916128f23173642f5f3987c Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 18:06:27 -0800 Subject: [PATCH 079/128] black --- tests/test_model_performance.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 8c68bddf8..9eae4f812 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -335,17 +335,21 @@ def test_EnergyHourlyDeep(): df["y"] = pd.to_numeric(df["y"], errors="coerce") df = df.drop("ds", axis=1) - df['ds'] = pd.date_range(start="2015-01-01 00:00:00", periods=len(df), freq="H") + df["ds"] = pd.date_range(start="2015-01-01 00:00:00", periods=len(df), freq="H") df["ID"] = "test" - df_id = df[['ds', 'y', 'temp']].copy() - df_id['ID'] = "test2" - df_id['y'] = df_id['y'] * 0.3 - df_id['temp'] = df_id['temp'] * 0.4 + df_id = df[["ds", "y", "temp"]].copy() + df_id["ID"] = "test2" + df_id["y"] = df_id["y"] * 0.3 + df_id["temp"] = df_id["temp"] * 0.4 df = pd.concat([df, df_id], ignore_index=True) # Conditional Seasonality - df["winter"] = np.where(df["ds"].dt.month.isin([1]), 1, 0,) + df["winter"] = np.where( + df["ds"].dt.month.isin([1]), + 1, + 0, + ) df["summer"] = np.where(df["ds"].dt.month.isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), 1, 0) df["winter"] = pd.to_numeric(df["winter"], errors="coerce") df["summer"] = pd.to_numeric(df["summer"], errors="coerce") From dce2f73d49980b9515552b27e5a55ae3523e22b3 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 18:16:58 -0800 Subject: [PATCH 080/128] ruff --- neuralprophet/configure.py | 2 +- neuralprophet/event_utils.py | 2 +- neuralprophet/time_dataset.py | 2 +- neuralprophet/utils.py | 2 +- tests/test_event_utils.py | 1 - 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/neuralprophet/configure.py b/neuralprophet/configure.py index 52b8b3f0a..bb9698782 100644 --- a/neuralprophet/configure.py +++ b/neuralprophet/configure.py @@ -13,7 +13,7 @@ import pandas as pd import torch -from neuralprophet import df_utils, np_types, utils, utils_torch +from neuralprophet import df_utils, np_types, utils_torch from neuralprophet.custom_loss_metrics import PinballLoss from neuralprophet.event_utils import get_holiday_names diff --git a/neuralprophet/event_utils.py b/neuralprophet/event_utils.py index 1633cc16c..9deaa8f5d 100644 --- a/neuralprophet/event_utils.py +++ b/neuralprophet/event_utils.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Iterable, Optional, Union +from typing import Iterable, Union import numpy as np import pandas as pd diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index e84aeea81..9f2d3fcde 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -1,5 +1,5 @@ import logging -from collections import OrderedDict, defaultdict +from collections import OrderedDict from datetime import datetime from typing import List, Optional diff --git a/neuralprophet/utils.py b/neuralprophet/utils.py index c6fec4568..0cabb4e01 100644 --- a/neuralprophet/utils.py +++ b/neuralprophet/utils.py @@ -5,7 +5,7 @@ import os import sys from collections import OrderedDict -from typing import TYPE_CHECKING, Iterable, Optional, Union +from typing import TYPE_CHECKING, Optional import numpy as np import pandas as pd diff --git 
a/tests/test_event_utils.py b/tests/test_event_utils.py index 862c11c2f..8c26a2e49 100644 --- a/tests/test_event_utils.py +++ b/tests/test_event_utils.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import holidays import pytest from neuralprophet import event_utils From bedce94c26e703372c6573ddc9df4081f9782247 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 18:18:12 -0800 Subject: [PATCH 081/128] linting --- neuralprophet/time_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 9f2d3fcde..3adfbf136 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -45,12 +45,12 @@ def __init__( **kwargs : dict Identical to :meth:`tabularize_univariate_datetime` """ - ## Outcome after a call to init (summary): + # Outcome after a call to init (summary): # - add events and holidays columns to df # - calculated the number of usable samples (accounting for nan and filters) # - creates mapping of sample index to df index - ## Context Notes + # Context Notes # Currently done to df before it arrives here: # -> fit calls prep_or_copy_df, _check_dataframe, and _handle_missing_data, passes to _train # -> _train calls prep_or_copy_df, then passes to init_train_loader, which returns the train_loader From 051e1ad624bfad7a7d280be29a31390adcb7bf34 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 7 Feb 2024 18:27:02 -0800 Subject: [PATCH 082/128] linting --- neuralprophet/__main__.py | 1 + neuralprophet/df_utils.py | 4 +--- neuralprophet/forecaster.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/neuralprophet/__main__.py b/neuralprophet/__main__.py index 00cf0eaea..62aa59d01 100644 --- a/neuralprophet/__main__.py +++ b/neuralprophet/__main__.py @@ -1,6 +1,7 @@ """ Invokes neuralprophet when module is run as a script. """ + import argparse from neuralprophet._version import __version__ diff --git a/neuralprophet/df_utils.py b/neuralprophet/df_utils.py index 79c6c4ea6..8b83ad366 100644 --- a/neuralprophet/df_utils.py +++ b/neuralprophet/df_utils.py @@ -507,14 +507,12 @@ def check_dataframe( for name in columns: if name not in df: raise ValueError(f"Column {name!r} missing from dataframe") - if df.loc[df.loc[:, name].notnull()].shape[0] < 1: + if sum(df.loc[:, name].notnull().values) < 1: raise ValueError(f"Dataframe column {name!r} only has NaN rows.") if not np.issubdtype(df[name].dtype, np.number): df[name] = pd.to_numeric(df[name]) if np.isinf(df.loc[:, name].values).any(): df.loc[:, name] = df[name].replace([np.inf, -np.inf], np.nan) - if df.loc[df.loc[:, name].notnull()].shape[0] < 1: - raise ValueError(f"Dataframe column {name!r} only has NaN rows.") if future: return df, regressors_to_remove, lag_regressors_to_remove diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index bb5dbfc86..a11853595 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -1005,7 +1005,7 @@ def fit( # Only display the plot if the session is interactive, eg. 
do not show in github actions since it
             # causes an error in the Windows and MacOS environment
             if matplotlib.is_interactive():
-                fig
+                fig.show()
 
         self.fitted = True
         return metrics_df
 

From 0c9cd87846a03d59680ff12666cceec40def6fdd Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Thu, 8 Feb 2024 18:09:53 -0800
Subject: [PATCH 083/128] modify logs

---
 neuralprophet/forecaster.py   | 32 +++++++++++++++-----------------
 neuralprophet/time_dataset.py |  4 ++--
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py
index a11853595..efac5cf8e 100644
--- a/neuralprophet/forecaster.py
+++ b/neuralprophet/forecaster.py
@@ -710,8 +710,9 @@ def add_country_holidays(
         if self.fitted:
             raise Exception("Country must be specified prior to model fitting.")
         if self.config_country_holidays:
-            log.warning(
-                "Country holidays can only be added for a single country. Previous country holidays were overridden."
+            log.error(
+                "Country holidays can only be added once. Previous country holidays will be overridden. "
+                "If adding multiple countries, please add them as a list."
             )
 
         if regularization is not None:
@@ -906,18 +907,18 @@ def fit(
             ]
         )
         if reg_enabled:
-            log.warning(
+            log.info(
                 "Early stopping is enabled, but regularization only starts after half the number of configured \
                     epochs. If you see no impact of the regularization, turn off the early_stopping or reduce the \
                         number of epochs to train for."
             )
 
         if progress == "plot" and metrics is False:
-            log.warning("Progress plot requires metrics to be enabled. Enabling the default metrics.")
+            log.info("Progress plot requires metrics to be enabled. Enabling the default metrics.")
             metrics = utils_metrics.get_metrics(True)
 
         if not self.config_normalization.global_normalization:
-            log.warning("When Global modeling with local normalization, metrics are displayed in normalized scale.")
+            log.info("When using global modeling with local normalization, metrics are displayed in normalized scale.")
 
         if minimal:
             checkpointing = False
@@ -1138,7 +1139,7 @@ def test(self, df: pd.DataFrame, verbose: bool = True):
         val_metrics_df = pd.DataFrame(val_metrics)
         # TODO Check whether supported by Lightning
         if not self.config_normalization.global_normalization:
-            log.warning("Note that the metrics are displayed in normalized scale because of local normalization.")
+            log.info("Note that the metrics are displayed in normalized scale because of local normalization.")
         return val_metrics_df
 
     def split_df(self, df: pd.DataFrame, freq: str = "auto", valid_p: float = 0.2, local_split: bool = False):
@@ -2112,8 +2113,8 @@ def plot_latest_forecast(
             if df_name not in fcst["ID"].unique():
                 assert len(fcst["ID"].unique()) > 1
                 raise Exception(
-                    "Many time series are present in the pd.DataFrame (more than one ID). Please, especify ID to be \
-                        plotted."
+                    "Many time series are present in the pd.DataFrame (more than one ID). "
+                    "Please specify the ID to be plotted."
                 )
             else:
                 fcst = fcst[fcst["ID"] == df_name].copy(deep=True)
@@ -2121,7 +2122,7 @@ def plot_latest_forecast(
         if len(self.config_train.quantiles) > 1:
             log.warning(
                 "Plotting latest forecasts when uncertainty estimation enabled"
                " plots only the median quantile forecasts." 
            )
         if plot_history_data is None:
             fcst = fcst[-(include_previous_forecasts + self.n_forecasts + self.max_lags) :]
@@ -2174,10 +2175,7 @@ def plot_last_forecast(
         plotting_backend: Optional[str] = None,
     ):
         args = locals()
-        log.warning(
-            "plot_last_forecast() has been renamed to plot_latest_forecast() and is therefore deprecated. "
-            "Please use plot_latst_forecast() in the future"
-        )
+        log.error("plot_last_forecast() is deprecated. Please use plot_latest_forecast().")
         return NeuralProphet.plot_latest_forecast(**args)
 
 
@@ -2251,8 +2249,8 @@ def plot_components(
         if df_name not in fcst["ID"].unique():
             assert len(fcst["ID"].unique()) > 1
             raise Exception(
-                "Many time series are present in the pd.DataFrame (more than one ID). Please, especify ID to be \
-                    plotted."
+                "Multiple time series are present in the pd.DataFrame (more than one ID). "
+                "Please specify the ID to be plotted."
             )
         else:
             fcst = fcst[fcst["ID"] == df_name].copy(deep=True)
@@ -2278,8 +2276,8 @@ def plot_components(
         if self.model.config_seasonality is not None:
             if self.model.config_seasonality.global_local == "local" and df_name is None:
                 raise Exception(
-                    "df_name parameter is required for multiple time series and local modeling of at least one \
-                        component."
+                    "df_name parameter is required for multiple time series "
+                    "and local modeling of at least one component."
                 )
 
         # Validate components to be plotted
diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 3adfbf136..d942e1410 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -538,8 +538,8 @@ def tabularize_univariate_datetime_single_index(
     )
 
     # ONLY FOR DEBUGGING
-    if log.level == 0:
-        log_input_shapes(inputs)
+    # if log.level == 0:
+    #     log_input_shapes(inputs)
 
     return inputs, targets
 

From f44231a91cd337ddc1a4d700d3d259d13bd29bac Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Thu, 8 Feb 2024 23:50:30 -0800
Subject: [PATCH 084/128] add benchmarking script for computational time

---
 tests/utils/benchmark_time_dataset.py | 363 ++++++++++++++++++++++++++
 1 file changed, 363 insertions(+)
 create mode 100644 tests/utils/benchmark_time_dataset.py

diff --git a/tests/utils/benchmark_time_dataset.py b/tests/utils/benchmark_time_dataset.py
new file mode 100644
index 000000000..d76f33c85
--- /dev/null
+++ b/tests/utils/benchmark_time_dataset.py
@@ -0,0 +1,363 @@
+import logging
+import os
+import pathlib
+import time
+from itertools import product
+
+import pandas as pd
+import pytest
+import torch.utils.benchmark as benchmark
+
+from neuralprophet import NeuralProphet, uncertainty_evaluate
+
+log = logging.getLogger("NP.test")
+# log.setLevel("INFO")
+# log.parent.setLevel("INFO")
+# log.setLevel("WARNING")
+# log.parent.setLevel("WARNING")
+log.setLevel("ERROR")
+log.parent.setLevel("ERROR")
+
+DIR = pathlib.Path(__file__).parent.parent.parent.absolute()
+DATA_DIR = os.path.join(DIR, "tests", "test-data")
+PEYTON_FILE = os.path.join(DATA_DIR, "wp_log_peyton_manning.csv")
+AIR_FILE = os.path.join(DATA_DIR, "air_passengers.csv")
+YOS_FILE = os.path.join(DATA_DIR, "yosemite_temps.csv")
+NROWS = 256
+EPOCHS = 10
+BATCH_SIZE = 128
+LR = 1.0
+
+
+def yosemite(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season=True):
+    # log.info("testing: Uncertainty Estimation Yosemite Temps")
+    df = pd.read_csv(YOS_FILE, nrows=nrows)
+    m = NeuralProphet(
+        n_lags=12,
+        n_forecasts=6,
+        quantiles=[0.01, 0.99],
+        epochs=epochs,
+        batch_size=batch,
+        learning_rate=LR,
+        yearly_seasonality=season,
+        weekly_seasonality=season, 
daily_seasonality=season, + ) + # tic = time.perf_counter() + m.fit(df, freq="5min") + # toc = time.perf_counter() + # print(f"######## Time: {toc - tic:0.4f} for fit") + + # tic = time.perf_counter() + # future = m.make_future_dataframe(df, periods=6, n_historic_predictions=3 * 24 * 12) + # toc = time.perf_counter() + # print(f"######## Time: {toc - tic:0.4f} for make_future_dataframe") + + # tic = time.perf_counter() + # m.predict(future) + # toc = time.perf_counter() + # print(f"######## Time: {toc - tic:0.4f} for predict") + + m.highlight_nth_step_ahead_of_each_forecast(m.n_forecasts) + + +def peyton(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season=True): + # log.info("testing: Uncertainty Estimation Peyton Manning") + df = pd.read_csv(PEYTON_FILE, nrows=nrows) + playoffs = pd.DataFrame( + { + "event": "playoff", + "ds": pd.to_datetime( + [ + "2008-01-13", + "2009-01-03", + "2010-01-16", + "2010-01-24", + "2010-02-07", + "2011-01-08", + "2013-01-12", + "2014-01-12", + "2014-01-19", + "2014-02-02", + "2015-01-11", + "2016-01-17", + "2016-01-24", + "2016-02-07", + ] + ), + } + ) + superbowls = pd.DataFrame( + { + "event": "superbowl", + "ds": pd.to_datetime(["2010-02-07", "2014-02-02", "2016-02-07"]), + } + ) + events_df = pd.concat((playoffs, superbowls)) + + m = NeuralProphet( + n_forecasts=1, + loss_func="SmoothL1Loss", + quantiles=[0.01, 0.99], + epochs=epochs, + batch_size=batch, + learning_rate=LR, + yearly_seasonality=season, + weekly_seasonality=season, + # daily_seasonality=False, + ) + + # add lagged regressors + # # if m.n_lags > 0: + # df["A"] = df["y"].rolling(7, min_periods=1).mean() + # df["B"] = df["y"].rolling(30, min_periods=1).mean() + # m = m.add_lagged_regressor(name="A", n_lags=10) + # m = m.add_lagged_regressor(name="B", only_last_value=True) + + # add events + m = m.add_events(["superbowl", "playoff"], lower_window=-1, upper_window=1, regularization=0.1) + + m = m.add_country_holidays("US", mode="additive", regularization=0.1) + + df["C"] = df["y"].rolling(7, min_periods=1).mean() + df["D"] = df["y"].rolling(30, min_periods=1).mean() + + m = m.add_future_regressor(name="C", regularization=0.1) + m = m.add_future_regressor(name="D", regularization=0.1) + + history_df = m.create_df_with_events(df, events_df) + + m.fit(history_df, freq="D") + + # periods = 90 + # regressors_future_df = pd.DataFrame(data={"C": df["C"][:periods], "D": df["D"][:periods]}) + # future_df = m.make_future_dataframe( + # df=history_df, + # regressors_df=regressors_future_df, + # events_df=events_df, + # periods=periods, + # n_historic_predictions=nrows, + # ) + # m.predict(df=future_df) + + +def peyton_minus_events(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season=True): + # log.info("testing: Uncertainty Estimation Peyton Manning") + df = pd.read_csv(PEYTON_FILE, nrows=nrows) + + m = NeuralProphet( + n_forecasts=1, + loss_func="SmoothL1Loss", + quantiles=[0.01, 0.99], + epochs=epochs, + batch_size=batch, + learning_rate=LR, + yearly_seasonality=season, + weekly_seasonality=season, + # daily_seasonality=False, + ) + + # add lagged regressors + if m.n_lags > 0: + df["A"] = df["y"].rolling(7, min_periods=1).mean() + df["B"] = df["y"].rolling(30, min_periods=1).mean() + m = m.add_lagged_regressor(name="A") + m = m.add_lagged_regressor(name="B", only_last_value=True) + + df["C"] = df["y"].rolling(7, min_periods=1).mean() + df["D"] = df["y"].rolling(30, min_periods=1).mean() + + m = m.add_future_regressor(name="C", regularization=0.1) + m = m.add_future_regressor(name="D", 
regularization=0.1) + + history_df = df + + m.fit(history_df, freq="D") + + # periods = 90 + # regressors_future_df = pd.DataFrame(data={"C": df["C"][:periods], "D": df["D"][:periods]}) + # future_df = m.make_future_dataframe( + # df=history_df, + # regressors_df=regressors_future_df, + # periods=periods, + # n_historic_predictions=nrows, + # ) + # m.predict(df=future_df) + + +def peyton_minus_regressors(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season=True): + # log.info("testing: Uncertainty Estimation Peyton Manning") + df = pd.read_csv(PEYTON_FILE, nrows=nrows) + playoffs = pd.DataFrame( + { + "event": "playoff", + "ds": pd.to_datetime( + [ + "2008-01-13", + "2009-01-03", + "2010-01-16", + "2010-01-24", + "2010-02-07", + "2011-01-08", + "2013-01-12", + "2014-01-12", + "2014-01-19", + "2014-02-02", + "2015-01-11", + "2016-01-17", + "2016-01-24", + "2016-02-07", + ] + ), + } + ) + superbowls = pd.DataFrame( + { + "event": "superbowl", + "ds": pd.to_datetime(["2010-02-07", "2014-02-02", "2016-02-07"]), + } + ) + events_df = pd.concat((playoffs, superbowls)) + + m = NeuralProphet( + n_forecasts=1, + loss_func="SmoothL1Loss", + quantiles=[0.01, 0.99], + epochs=epochs, + batch_size=batch, + learning_rate=LR, + yearly_seasonality=season, + weekly_seasonality=season, + # daily_seasonality=False, + ) + # add events + m = m.add_events(["superbowl", "playoff"], lower_window=-1, upper_window=1, regularization=0.1) + + m = m.add_country_holidays("US", mode="additive", regularization=0.1) + + history_df = m.create_df_with_events(df, events_df) + + m.fit(history_df, freq="D") + + # periods = 90 + # future_df = m.make_future_dataframe( + # df=history_df, + # events_df=events_df, + # periods=periods, + # n_historic_predictions=nrows, + # ) + # m.predict(df=future_df) + + +####################################### +# tic = time.perf_counter() +# test_uncertainty_estimation_yosemite_temps() +# toc = time.perf_counter() +# print(f"#### Time: {toc - tic:0.4f} for test_uncertainty_estimation_yosemite_temps") + +# tic = time.perf_counter() +# test_uncertainty_estimation_peyton_manning() +# toc = time.perf_counter() +# print(f"#### Time: {toc - tic:0.4f} for test_uncertainty_estimation_peyton_manning") + +# tic = time.perf_counter() +# test_uncertainty_estimation_air_travel() +# toc = time.perf_counter() +# print(f"#### Time: {toc - tic:0.4f} for test_uncertainty_estimation_air_travel") + +# tic = time.perf_counter() +# test_uncertainty_estimation_multiple_quantiles() +# toc = time.perf_counter() +# print(f"#### Time: {toc - tic:0.4f} for test_uncertainty_estimation_multiple_quantiles") + +# tic = time.perf_counter() +# test_split_conformal_prediction() +# toc = time.perf_counter() +# print(f"#### Time: {toc - tic:0.4f} for test_split_conformal_prediction") + +# tic = time.perf_counter() +# test_asymmetrical_quantiles() +# toc = time.perf_counter() +# print(f"#### Time: {toc - tic:0.4f} for test_asymmetrical_quantiles") + + +############################33333 +# t0 = benchmark.Timer( +# stmt='test_uncertainty_estimation_yosemite_temps(x)', +# setup='from __main__ import test_uncertainty_estimation_yosemite_temps', +# globals={'x': x} +# ) + +# t1 = benchmark.Timer( +# stmt='test_uncertainty_estimation_peyton_manning(x)', +# setup='from __main__ import test_uncertainty_estimation_peyton_manning', +# # globals={'x': x} +# ) + +# print(t0.timeit(1)) +# print(t1.timeit(1)) + + +############################### + +# Compare takes a list of measurements which we'll save in results. 
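+# Hedged note on the torch.utils.benchmark API used below: each Timer runs
+# `stmt` (with `setup` executed once), and blocked_autorange(min_run_time=1)
+# repeats it until at least one second of measurements is collected. Compare
+# groups the resulting Measurements by `label`, rendering `sub_label` (and
+# num_threads) as rows and `description` as columns of one table.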
+results = [] + +epochs = [5] +sizes = [100, 1000] +# sizes = [100, 1000, 10000] +batches = [128] +seasons = [False, True] +for ep, nrows, b, season in product(epochs, sizes, batches, seasons): + # label and sub_label are the rows + # description is the column + label = "tests" + sub_label = f"[rows: {nrows}, epochs:{ep}, batch:{b}, season:{season}]" + for num_threads in [1]: # [1, 4, 16, 64] + results.append( + benchmark.Timer( + stmt="yosemite(nrows, epochs, batch, season)", + setup="from __main__ import yosemite", + globals={"epochs": ep, "nrows": nrows, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="yosemite", + ).blocked_autorange(min_run_time=1) + ) + results.append( + benchmark.Timer( + stmt="peyton(nrows, epochs, batch, season)", + setup="from __main__ import peyton", + globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="peyton", + ).blocked_autorange(min_run_time=1) + ) + results.append( + benchmark.Timer( + stmt="peyton_minus_events(nrows, epochs, batch, season)", + setup="from __main__ import peyton_minus_events", + globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="peyton_minus_events", + ).blocked_autorange(min_run_time=1) + ) + results.append( + benchmark.Timer( + stmt="peyton_minus_regressors(nrows, epochs, batch, season)", + setup="from __main__ import peyton_minus_regressors", + globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="peyton_minus_regressors", + ).blocked_autorange(min_run_time=1) + ) + +compare = benchmark.Compare(results) +compare.print() From 2039212abda60d7b91ab5e0da41107b356e75f0c Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 8 Feb 2024 23:56:59 -0800 Subject: [PATCH 085/128] speed up uncertainty tests --- tests/test_uncertainty.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_uncertainty.py b/tests/test_uncertainty.py index 039128cb1..75544fe6d 100644 --- a/tests/test_uncertainty.py +++ b/tests/test_uncertainty.py @@ -10,7 +10,7 @@ from neuralprophet import NeuralProphet, uncertainty_evaluate log = logging.getLogger("NP.test") -log.setLevel("DEBUG") +log.setLevel("WARNING") log.parent.setLevel("WARNING") DIR = pathlib.Path(__file__).parent.parent.absolute() @@ -26,7 +26,7 @@ def test_uncertainty_estimation_peyton_manning(): log.info("testing: Uncertainty Estimation Peyton Manning") - df = pd.read_csv(PEYTON_FILE) + df = pd.read_csv(PEYTON_FILE, nrows=NROWS) playoffs = pd.DataFrame( { "event": "playoff", @@ -103,7 +103,7 @@ def test_uncertainty_estimation_peyton_manning(): def test_uncertainty_estimation_yosemite_temps(): log.info("testing: Uncertainty Estimation Yosemite Temps") - df = pd.read_csv(YOS_FILE) + df = pd.read_csv(YOS_FILE, nrows=NROWS) m = NeuralProphet( n_lags=12, n_forecasts=6, From d34700fa3cc7940bcbe19e56d1ba6e32e023bdcf Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 9 Feb 2024 00:23:24 -0800 Subject: [PATCH 086/128] fix unit test multiple country --- neuralprophet/forecaster.py | 4 ++-- tests/test_unit.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index efac5cf8e..b0de6483c 100644 --- a/neuralprophet/forecaster.py +++ 
b/neuralprophet/forecaster.py
@@ -708,9 +708,9 @@ def add_country_holidays(
             ``additive`` (default) or ``multiplicative``.
         """
         if self.fitted:
-            raise Exception("Country must be specified prior to model fitting.")
+            raise AssertionError("Country must be specified prior to model fitting.")
         if self.config_country_holidays:
-            log.error(
+            raise AssertionError(
                 "Country holidays can only be added once. "
                 "If adding multiple countries, please add as a list."
             )
diff --git a/tests/test_unit.py b/tests/test_unit.py
index 8796abd95..fe1f70c18 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -977,19 +977,18 @@ def test_handle_negative_values_replace():
 
 
 def test_add_country_holiday_multiple_calls_warning(caplog):
-    error_message = (
-        "Country holidays can only be added for a single country. Previous country holidays were overridden."
-    )
     m = NeuralProphet(
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=LR,
     )
-    m.add_country_holidays("US")
+    m.add_country_holidays(["US", "Germany"])
+    error_message = "Country holidays can only be added once."
     assert error_message not in caplog.text
-    m.add_country_holidays("Germany")
-    assert error_message in caplog.text
+    with pytest.raises(AssertionError):
+        m.add_country_holidays("Germany")
+    # assert error_message in caplog.text
 
 
 def test_multiple_countries():
From 485f5a8c9fdb843a5d501e47d561c965b91b6ff2 Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Fri, 9 Feb 2024 00:26:51 -0800
Subject: [PATCH 087/128] reduce tests log level to ERROR

---
 tests/test_wrapper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py
index 4749ae038..48b13af9e 100644
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -9,8 +9,8 @@ from neuralprophet import TorchProphet as Prophet
 
 log = logging.getLogger("NP.test")
-log.setLevel("DEBUG")
-log.parent.setLevel("WARNING")
+log.setLevel("ERROR")
+log.parent.setLevel("ERROR")
 
 DIR = pathlib.Path(__file__).parent.parent.absolute()
 DATA_DIR = os.path.join(DIR, "tests", "test-data")
From 8b863daba90ab24f032e872147b5ffc62d1cba4a Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Fri, 9 Feb 2024 00:33:17 -0800
Subject: [PATCH 088/128] reduce log level to ERROR and fix adding multiple
 countries

---
 tests/pytest.ini                |  2 +-
 tests/test_glocal.py            |  4 ++--
 tests/test_integration.py       | 12 +++++-------
 tests/test_model_performance.py |  4 ++--
 tests/test_plotting.py          | 16 ++++------------
 tests/test_regularization.py    |  4 ++--
 tests/test_uncertainty.py       |  4 ++--
 tests/test_unit.py              |  4 ++--
 tests/test_utils.py             |  4 ++--
 9 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/tests/pytest.ini b/tests/pytest.ini
index cbb9fe0c0..546920b92 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -1,6 +1,6 @@
 [pytest]
 log_cli = true
-log_cli_level = DEBUG
+log_cli_level = ERROR
 log_cli_format = %(asctime)s [%(levelname)s]: %(message)s (%(filename)s:%(lineno)s)
 log_cli_date_format = %Y-%m-%d %H:%M:%S
 filterwarnings =
diff --git a/tests/test_glocal.py b/tests/test_glocal.py
index e4cf8309e..771cc1829 100644
--- a/tests/test_glocal.py
+++ b/tests/test_glocal.py
@@ -9,8 +9,8 @@ from neuralprophet import NeuralProphet
 
 log = logging.getLogger("NP.test")
-log.setLevel("DEBUG")
-log.parent.setLevel("WARNING")
+log.setLevel("ERROR")
+log.parent.setLevel("ERROR")
 
 DIR = pathlib.Path(__file__).parent.parent.absolute()
 DATA_DIR = os.path.join(DIR, "tests", "test-data")
diff --git a/tests/test_integration.py b/tests/test_integration.py
index
c4fb0a0dd..3718e6a35 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -15,8 +15,8 @@ from neuralprophet.data.process import _handle_missing_data, _validate_column_name log = logging.getLogger("NP.test") -log.setLevel("DEBUG") -log.parent.setLevel("WARNING") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") DIR = pathlib.Path(__file__).parent.parent.absolute() DATA_DIR = os.path.join(DIR, "tests", "test-data") @@ -473,12 +473,10 @@ def test_events(): ["superbowl", "playoff"], lower_window=-1, upper_window=1, mode="multiplicative", regularization=0.5 ) # add the country specific holidays - m = m.add_country_holidays("US", mode="additive", regularization=0.5) - m.add_country_holidays("Indonesia") + m = m.add_country_holidays( + ["US", "Indonesia", "Philippines", "Pakistan", "Belarus"], mode="additive", regularization=0.5 + ) # m.add_country_holidays("Thailand") # holidays package has issue with int input for timedelta. accepts np.float64() - m.add_country_holidays("Philippines") - m.add_country_holidays("Pakistan") - m.add_country_holidays("Belarus") history_df = m.create_df_with_events(df, events_df) m.fit(history_df, freq="D") future = m.make_future_dataframe(df=history_df, events_df=events_df, periods=30, n_historic_predictions=90) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 9eae4f812..93c908d77 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -16,8 +16,8 @@ from neuralprophet import NeuralProphet, set_random_seed log = logging.getLogger("NP.test") -log.setLevel("DEBUG") -log.parent.setLevel("WARNING") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") DIR = pathlib.Path(__file__).parent.parent.absolute() DATA_DIR = os.path.join(DIR, "tests", "test-data") diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 1c18df09d..0d24f3530 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -11,8 +11,8 @@ from neuralprophet import NeuralProphet log = logging.getLogger("NP.test") -log.setLevel("DEBUG") -log.parent.setLevel("WARNING") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") DIR = pathlib.Path(__file__).parent.parent.absolute() DATA_DIR = os.path.join(DIR, "tests", "test-data") @@ -276,11 +276,7 @@ def test_plot_events(plotting_backend): ["superbowl", "playoff"], lower_window=-1, upper_window=1, mode="multiplicative", regularization=0.5 ) # add the country specific holidays - m = m.add_country_holidays("US", mode="multiplicative", regularization=0.5) - m.add_country_holidays("Indonesia") - m.add_country_holidays("Philippines") - m.add_country_holidays("Pakistan") - m.add_country_holidays("Belarus") + m = m.add_country_holidays(["US", "Indonesia", "Philippines", "Pakistan", "Belarus"], mode="multiplicative") history_df = m.create_df_with_events(df, events_df) m.fit(history_df, freq="D") future = m.make_future_dataframe(df=history_df, events_df=events_df, periods=30, n_historic_predictions=90) @@ -343,11 +339,7 @@ def test_plot_events_additive(plotting_backend): # set event windows m = m.add_events(["superbowl", "playoff"], lower_window=-1, upper_window=1, mode="additive", regularization=0.5) # add the country specific holidays - m = m.add_country_holidays("US", mode="additive", regularization=0.5) - m.add_country_holidays("Indonesia") - m.add_country_holidays("Philippines") - m.add_country_holidays("Pakistan") - m.add_country_holidays("Belarus") + m = m.add_country_holidays(["US", "Canada", "MEX"], mode="additive", regularization=0.5) history_df = 
m.create_df_with_events(df, events_df) m.fit(history_df, freq="D") future = m.make_future_dataframe(df=history_df, events_df=events_df, periods=30, n_historic_predictions=90) diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 5a56d09f6..931a8fbb5 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -17,8 +17,8 @@ ) log = logging.getLogger("NP.test") -log.setLevel("DEBUG") -log.parent.setLevel("WARNING") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") # Fix random seeds torch.manual_seed(0) diff --git a/tests/test_uncertainty.py b/tests/test_uncertainty.py index 75544fe6d..1208faa62 100644 --- a/tests/test_uncertainty.py +++ b/tests/test_uncertainty.py @@ -10,8 +10,8 @@ from neuralprophet import NeuralProphet, uncertainty_evaluate log = logging.getLogger("NP.test") -log.setLevel("WARNING") -log.parent.setLevel("WARNING") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") DIR = pathlib.Path(__file__).parent.parent.absolute() DATA_DIR = os.path.join(DIR, "tests", "test-data") diff --git a/tests/test_unit.py b/tests/test_unit.py index fe1f70c18..41e3dd358 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -15,8 +15,8 @@ from neuralprophet.data.transform import _normalize log = logging.getLogger("NP.test") -log.setLevel("DEBUG") -log.parent.setLevel("WARNING") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") DIR = pathlib.Path(__file__).parent.parent.absolute() DATA_DIR = os.path.join(DIR, "tests", "test-data") diff --git a/tests/test_utils.py b/tests/test_utils.py index 3e965e03a..88eced6bb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,8 +10,8 @@ from neuralprophet import NeuralProphet, df_utils, load, save log = logging.getLogger("NP.test") -log.setLevel("DEBUG") -log.parent.setLevel("WARNING") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") DIR = pathlib.Path(__file__).parent.parent.absolute() DATA_DIR = os.path.join(DIR, "tests", "test-data") From 3226884b87f7d800efe67bbef025898fbab266a0 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 9 Feb 2024 00:38:25 -0800 Subject: [PATCH 089/128] bypass intentional glocal test error log --- tests/test_glocal.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_glocal.py b/tests/test_glocal.py index 771cc1829..75e456940 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -187,6 +187,8 @@ def test_wrong_option_global_local_modeling(): df2_0["ID"] = "df2" df3_0 = df.iloc[256:384, :].copy(deep=True) df3_0["ID"] = "df3" + prev_level = log.getEffectiveLevel() + log.setLevel("CRITICAL") m = NeuralProphet( n_forecasts=2, n_lags=10, @@ -197,6 +199,7 @@ def test_wrong_option_global_local_modeling(): season_global_local="glocsl", trend_global_local="glocsl", ) + log.setLevel(prev_level) train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) m.fit(train_df) future = m.make_future_dataframe(test_df) From a6eceb2e173d60beffa43e12f6852d99d58d97a5 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 9 Feb 2024 00:40:26 -0800 Subject: [PATCH 090/128] fix prev --- tests/test_glocal.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_glocal.py b/tests/test_glocal.py index 75e456940..848bb68a5 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -187,8 +187,8 @@ def test_wrong_option_global_local_modeling(): df2_0["ID"] = "df2" df3_0 = df.iloc[256:384, :].copy(deep=True) df3_0["ID"] = "df3" - prev_level = log.getEffectiveLevel() - log.setLevel("CRITICAL") 
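+    # The intentional misconfiguration below is logged through the library's
+    # own "NP.*" loggers, not through this test's child logger, so the level
+    # bypass must be applied to the shared parent logger to take effect.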
+ prev_level = log.parent.getEffectiveLevel() + log.parent.setLevel("CRITICAL") m = NeuralProphet( n_forecasts=2, n_lags=10, @@ -199,7 +199,7 @@ def test_wrong_option_global_local_modeling(): season_global_local="glocsl", trend_global_local="glocsl", ) - log.setLevel(prev_level) + log.parent.setLevel(prev_level) train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) m.fit(train_df) future = m.make_future_dataframe(test_df) From 6cbf17b6abe896ba71b18e4a9aff973ed4a0d967 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 9 Feb 2024 14:45:43 -0800 Subject: [PATCH 091/128] benchmark dataloader time --- neuralprophet/forecaster.py | 4 +- neuralprophet/time_dataset.py | 2 +- tests/utils/benchmark_time_dataset.py | 242 +++++++++++++++++++------- 3 files changed, 180 insertions(+), 68 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index b0de6483c..40ceeac35 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -2544,7 +2544,7 @@ def _init_train_loader(self, df, num_workers=0): ------- torch DataLoader """ - df, _, _, _ = df_utils.prep_or_copy_df(df) + df, _, _, _ = df_utils.prep_or_copy_df(df) # TODO: Can this call be avoided? # if not self.fitted: self.config_normalization.init_data_params( df=df, @@ -2641,7 +2641,7 @@ def _train( metrics """ # Set up data the training dataloader - df, _, _, _ = df_utils.prep_or_copy_df(df) + df, _, _, _ = df_utils.prep_or_copy_df(df) # TODO: Can this call be removed? train_loader = self._init_train_loader(df, num_workers) dataset_size = len(df) # train_loader.dataset diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index d942e1410..2affa93dc 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -395,7 +395,7 @@ def log_input_shapes(inputs): "regressors", ]: for name, period_features in value.items(): - tabularized_input_shapes_str += f" {name} {key} {period_features}\n" + tabularized_input_shapes_str += f" {name} {key} {period_features.shape}\n" else: tabularized_input_shapes_str += f" {key} {value.shape} \n" log.debug(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}") diff --git a/tests/utils/benchmark_time_dataset.py b/tests/utils/benchmark_time_dataset.py index d76f33c85..c1e9e75fd 100644 --- a/tests/utils/benchmark_time_dataset.py +++ b/tests/utils/benchmark_time_dataset.py @@ -7,8 +7,13 @@ import pandas as pd import pytest import torch.utils.benchmark as benchmark +from torch.utils.data import DataLoader -from neuralprophet import NeuralProphet, uncertainty_evaluate +from neuralprophet import NeuralProphet, df_utils, utils +from neuralprophet.data.process import _check_dataframe, _create_dataset, _handle_missing_data +from neuralprophet.data.transform import _normalize + +# from neuralprophet.forecaster import log = logging.getLogger("NP.test") # log.setLevel("INFO") @@ -23,12 +28,114 @@ PEYTON_FILE = os.path.join(DATA_DIR, "wp_log_peyton_manning.csv") AIR_FILE = os.path.join(DATA_DIR, "air_passengers.csv") YOS_FILE = os.path.join(DATA_DIR, "yosemite_temps.csv") -NROWS = 256 -EPOCHS = 10 -BATCH_SIZE = 128 +NROWS = 1000 +EPOCHS = 1 +BATCH_SIZE = 10 LR = 1.0 +def print_input_shapes(inputs): + tabularized_input_shapes_str = "" + for key, value in inputs.items(): + if key in [ + "seasonalities", + "covariates", + "events", + "regressors", + ]: + for name, period_features in value.items(): + tabularized_input_shapes_str += f" {name} {key} {period_features.shape}\n" + else: + 
tabularized_input_shapes_str += f" {key} {value.shape} \n"
+    print(f"Tabularized inputs shapes: \n{tabularized_input_shapes_str}")
+
+
+def load(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season=True, iterations=1):
+    tic = time.perf_counter()
+    df = pd.read_csv(YOS_FILE, nrows=nrows)
+    freq = "5min"
+    num_workers = 0
+
+    m = NeuralProphet(
+        n_lags=12,
+        n_forecasts=6,
+        epochs=epochs,
+        batch_size=batch,
+        learning_rate=LR,
+        yearly_seasonality=season,
+        weekly_seasonality=season,
+        daily_seasonality=season,
+    )
+
+    # Mimic m.fit(df) behavior
+
+    df, _, _, m.id_list = df_utils.prep_or_copy_df(df)
+    df = _check_dataframe(m, df, check_y=True, exogenous=True)
+    m.data_freq = df_utils.infer_frequency(df, n_lags=m.max_lags, freq=freq)
+    df = _handle_missing_data(
+        df=df,
+        freq=m.data_freq,
+        n_lags=m.n_lags,
+        n_forecasts=m.n_forecasts,
+        config_missing=m.config_missing,
+        config_regressors=m.config_regressors,
+        config_lagged_regressors=m.config_lagged_regressors,
+        config_events=m.config_events,
+        config_seasonality=m.config_seasonality,
+        predicting=False,
+    )
+    # mimic _init_train_loader
+    m.config_normalization.init_data_params(
+        df=df,
+        config_lagged_regressors=m.config_lagged_regressors,
+        config_regressors=m.config_regressors,
+        config_events=m.config_events,
+        config_seasonality=m.config_seasonality,
+    )
+    df = _normalize(df=df, config_normalization=m.config_normalization)
+
+    df_merged = df_utils.merge_dataframes(df)
+    m.config_seasonality = utils.set_auto_seasonalities(df_merged, config_seasonality=m.config_seasonality)
+    if m.config_country_holidays is not None:
+        m.config_country_holidays.init_holidays(df_merged)
+
+    dataset = _create_dataset(
+        m, df, predict_mode=False, prediction_frequency=m.prediction_frequency
+    )  # needs to be called after set_auto_seasonalities
+
+    # Determine the max number of epochs
+    m.config_train.set_auto_batch_epoch(n_data=len(dataset))
+
+    loader = DataLoader(
+        dataset,
+        batch_size=m.config_train.batch_size,
+        shuffle=True,
+        num_workers=num_workers,
+    )
+    # dataset_size = len(df)
+    # print(dataset_size)
+
+    dataloader_iterator = iter(loader)
+    toc = time.perf_counter()
+    print(f"######## Time: {toc - tic:0.4f} for setup")
+    tic = time.perf_counter()
+    for i in range(iterations):
+        data, target, meta = next(dataloader_iterator)
+        # try:
+        #     data, target, meta = next(dataloader_iterator)
+        # except StopIteration:
+        #     dataloader_iterator = iter(loader)
+        #     data, target, meta = next(dataloader_iterator)
+        # do_something()
+    toc = time.perf_counter()
+    # print_input_shapes(data)
+    # print(len(meta["df_name"]))
+    print(f"######## Time: {toc - tic:0.4f} for iterating {iterations} batches of size {batch}")
+
+
+load(nrows=1010, batch=100, iterations=10)
+
+
 def yosemite(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season=True):
     # log.info("testing: Uncertainty Estimation Yosemite Temps")
     df = pd.read_csv(YOS_FILE, nrows=nrows)
@@ -300,64 +407,69 @@ def peyton_minus_regressors(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season
 ###############################
 
-# Compare takes a list of measurements which we'll save in results.
-results = [] - -epochs = [5] -sizes = [100, 1000] -# sizes = [100, 1000, 10000] -batches = [128] -seasons = [False, True] -for ep, nrows, b, season in product(epochs, sizes, batches, seasons): - # label and sub_label are the rows - # description is the column - label = "tests" - sub_label = f"[rows: {nrows}, epochs:{ep}, batch:{b}, season:{season}]" - for num_threads in [1]: # [1, 4, 16, 64] - results.append( - benchmark.Timer( - stmt="yosemite(nrows, epochs, batch, season)", - setup="from __main__ import yosemite", - globals={"epochs": ep, "nrows": nrows, "batch": b, "season": season}, - num_threads=num_threads, - label=label, - sub_label=sub_label, - description="yosemite", - ).blocked_autorange(min_run_time=1) - ) - results.append( - benchmark.Timer( - stmt="peyton(nrows, epochs, batch, season)", - setup="from __main__ import peyton", - globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, - num_threads=num_threads, - label=label, - sub_label=sub_label, - description="peyton", - ).blocked_autorange(min_run_time=1) - ) - results.append( - benchmark.Timer( - stmt="peyton_minus_events(nrows, epochs, batch, season)", - setup="from __main__ import peyton_minus_events", - globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, - num_threads=num_threads, - label=label, - sub_label=sub_label, - description="peyton_minus_events", - ).blocked_autorange(min_run_time=1) - ) - results.append( - benchmark.Timer( - stmt="peyton_minus_regressors(nrows, epochs, batch, season)", - setup="from __main__ import peyton_minus_regressors", - globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, - num_threads=num_threads, - label=label, - sub_label=sub_label, - description="peyton_minus_regressors", - ).blocked_autorange(min_run_time=1) - ) - -compare = benchmark.Compare(results) -compare.print() + +def measure_times(): + # Compare takes a list of measurements which we'll save in results. 
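+    # Wrapping the sweep in a function (it previously ran at import time)
+    # lets the quick load() timing above run without triggering this full,
+    # long-running grid; call measure_times() explicitly to execute it.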
+ results = [] + + epochs = [5] + sizes = [100, 1000] + # sizes = [100, 1000, 10000] + batches = [128] + seasons = [False, True] + for ep, nrows, b, season in product(epochs, sizes, batches, seasons): + # label and sub_label are the rows + # description is the column + label = "tests" + sub_label = f"[rows: {nrows}, epochs:{ep}, batch:{b}, season:{season}]" + for num_threads in [1]: # [1, 4, 16, 64] + results.append( + benchmark.Timer( + stmt="yosemite(nrows, epochs, batch, season)", + setup="from __main__ import yosemite", + globals={"epochs": ep, "nrows": nrows, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="yosemite", + ).blocked_autorange(min_run_time=1) + ) + results.append( + benchmark.Timer( + stmt="peyton(nrows, epochs, batch, season)", + setup="from __main__ import peyton", + globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="peyton", + ).blocked_autorange(min_run_time=1) + ) + results.append( + benchmark.Timer( + stmt="peyton_minus_events(nrows, epochs, batch, season)", + setup="from __main__ import peyton_minus_events", + globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="peyton_minus_events", + ).blocked_autorange(min_run_time=1) + ) + results.append( + benchmark.Timer( + stmt="peyton_minus_regressors(nrows, epochs, batch, season)", + setup="from __main__ import peyton_minus_regressors", + globals={"nrows": nrows, "epochs": ep, "batch": b, "season": season}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description="peyton_minus_regressors", + ).blocked_autorange(min_run_time=1) + ) + + compare = benchmark.Compare(results) + compare.print() + + +# measure_times() From 0c16eb1a233ac41b259e86ae28a62b9a73d37920 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 14 Feb 2024 16:57:22 -0800 Subject: [PATCH 092/128] remove hourly energy test --- tests/test_model_performance.py | 92 --------------------------------- 1 file changed, 92 deletions(-) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 93c908d77..3d72c186c 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -322,95 +322,3 @@ def test_EnergyDailyDeep(): # Training & Predict _ = m.fit(df=df_train, freq="D", num_workers=4) _ = m.predict(df_test) - - -# TODO: adapt to hourly dataset with multiple IDs -def test_EnergyHourlyDeep(): - ### Temporary Test for on-the-fly sampling - very time consuming! 
- - df = pd.read_csv(ENERGY_PRICE_DAILY_FILE) - df["temp"] = df["temperature"] - df = df.drop(columns="temperature") - df["ds"] = pd.to_datetime(df["ds"]) - df["y"] = pd.to_numeric(df["y"], errors="coerce") - - df = df.drop("ds", axis=1) - df["ds"] = pd.date_range(start="2015-01-01 00:00:00", periods=len(df), freq="H") - df["ID"] = "test" - - df_id = df[["ds", "y", "temp"]].copy() - df_id["ID"] = "test2" - df_id["y"] = df_id["y"] * 0.3 - df_id["temp"] = df_id["temp"] * 0.4 - df = pd.concat([df, df_id], ignore_index=True) - - # Conditional Seasonality - df["winter"] = np.where( - df["ds"].dt.month.isin([1]), - 1, - 0, - ) - df["summer"] = np.where(df["ds"].dt.month.isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), 1, 0) - df["winter"] = pd.to_numeric(df["winter"], errors="coerce") - df["summer"] = pd.to_numeric(df["summer"], errors="coerce") - - # Normalize Temperature - df["temp"] = (df["temp"] - 65.0) / 50.0 - - # df - df = df[["ID", "ds", "y", "temp", "winter", "summer"]] - - # Hyperparameter - tuned_params = { - "n_lags": 24 * 15, - "newer_samples_weight": 2.0, - "n_changepoints": 0, - "yearly_seasonality": 10, - "weekly_seasonality": True, - "daily_seasonality": False, # due to conditional daily seasonality - "batch_size": 128, - "ar_layers": [32, 64, 32, 16], - "lagged_reg_layers": [32, 32], - # not tuned - "n_forecasts": 33, - "learning_rate": 0.001, - "epochs": 30, - "trend_global_local": "global", - "season_global_local": "global", - "drop_missing": True, - "normalize": "standardize", - } - - # Uncertainty Quantification - confidence_lv = 0.98 - quantile_list = [round(((1 - confidence_lv) / 2), 2), round((confidence_lv + (1 - confidence_lv) / 2), 2)] - - # Check if GPU is available - use_gpu = torch.cuda.is_available() - - # Set trainer configuration - trainer_configs = { - "accelerator": "gpu" if use_gpu else "cpu", - } - print(f"Using {'GPU' if use_gpu else 'CPU'}") - - # Model - m = NeuralProphet(**tuned_params, **trainer_configs, quantiles=quantile_list) - - # Lagged Regressor - m.add_lagged_regressor(names="temp", n_lags=33, normalize="standardize") - - # Conditional Seasonality - m.add_seasonality(name="winter", period=1, fourier_order=6, condition_name="winter") - m.add_seasonality(name="summer", period=1, fourier_order=6, condition_name="summer") - - # Holidays - m.add_country_holidays(country_name="US", lower_window=-1, upper_window=1) - - # Split - df_train = df[df["ds"] < "2015-03-01"] - df_test = df[df["ds"] >= "2015-03-01"] - - # Training & Predict - _ = m.fit(df=df_train, freq="H", num_workers=4, early_stopping=True) - _ = m.predict(df_test) From b5845fd972e95450cd308f0110966ea259a50270 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 14 Feb 2024 16:59:11 -0800 Subject: [PATCH 093/128] add debug notebook for energy hourly --- tests/metrics/debug-energy-price-hourly.ipynb | 2529 +++++++++++++++++ 1 file changed, 2529 insertions(+) create mode 100644 tests/metrics/debug-energy-price-hourly.ipynb diff --git a/tests/metrics/debug-energy-price-hourly.ipynb b/tests/metrics/debug-energy-price-hourly.ipynb new file mode 100644 index 000000000..14a09c93e --- /dev/null +++ b/tests/metrics/debug-energy-price-hourly.ipynb @@ -0,0 +1,2529 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "import torch\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from plotly.subplots import make_subplots\n", + "from 
plotly_resampler import unregister_plotly_resampler\n", + "\n", + "from neuralprophet import NeuralProphet, set_random_seed" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def create_metrics_plot(metrics):\n", + " # Deactivate the resampler since it is not compatible with kaleido (image export)\n", + " unregister_plotly_resampler()\n", + "\n", + " # Plotly params\n", + " prediction_color = \"#2d92ff\"\n", + " actual_color = \"black\"\n", + " line_width = 2\n", + " xaxis_args = {\"showline\": True, \"mirror\": True, \"linewidth\": 1.5, \"showgrid\": False}\n", + " yaxis_args = {\n", + " \"showline\": True,\n", + " \"mirror\": True,\n", + " \"linewidth\": 1.5,\n", + " \"showgrid\": False,\n", + " \"rangemode\": \"tozero\",\n", + " \"type\": \"log\",\n", + " }\n", + " layout_args = {\n", + " \"autosize\": True,\n", + " \"template\": \"plotly_white\",\n", + " \"margin\": go.layout.Margin(l=0, r=10, b=0, t=30, pad=0),\n", + " \"font\": dict(size=10),\n", + " \"title\": dict(font=dict(size=10)),\n", + " \"width\": 1000,\n", + " \"height\": 200,\n", + " }\n", + "\n", + " metric_cols = [col for col in metrics.columns if not (\"_val\" in col or col == \"RegLoss\" or col == \"epoch\")]\n", + " fig = make_subplots(rows=1, cols=len(metric_cols), subplot_titles=metric_cols)\n", + " for i, metric in enumerate(metric_cols):\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " y=metrics[metric],\n", + " name=metric,\n", + " mode=\"lines\",\n", + " line=dict(color=prediction_color, width=line_width),\n", + " legendgroup=metric,\n", + " ),\n", + " row=1,\n", + " col=i + 1,\n", + " )\n", + " if f\"{metric}_val\" in metrics.columns:\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " y=metrics[f\"{metric}_val\"],\n", + " name=f\"{metric}_val\",\n", + " mode=\"lines\",\n", + " line=dict(color=actual_color, width=line_width),\n", + " legendgroup=metric,\n", + " ),\n", + " row=1,\n", + " col=i + 1,\n", + " )\n", + " if metric == \"Loss\":\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " y=metrics[\"RegLoss\"],\n", + " name=\"RegLoss\",\n", + " mode=\"lines\",\n", + " line=dict(color=actual_color, width=line_width),\n", + " legendgroup=metric,\n", + " ),\n", + " row=1,\n", + " col=i + 1,\n", + " )\n", + " fig.update_xaxes(xaxis_args)\n", + " fig.update_yaxes(yaxis_args)\n", + " fig.update_layout(layout_args)\n", + " return fig" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "DIR = \"~/github/neural_prophet\"\n", + "DATA_DIR = os.path.join(DIR, \"tests\", \"test-data\")\n", + "PEYTON_FILE = os.path.join(DATA_DIR, \"wp_log_peyton_manning.csv\")\n", + "AIR_FILE = os.path.join(DATA_DIR, \"air_passengers.csv\")\n", + "YOS_FILE = os.path.join(DATA_DIR, \"yosemite_temps.csv\")\n", + "ENERGY_PRICE_DAILY_FILE = os.path.join(DATA_DIR, \"tutorial04_kaggle_energy_daily_temperature.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(ENERGY_PRICE_DAILY_FILE)\n", + "df[\"temp\"] = df[\"temperature\"]\n", + "df = df.drop(columns=\"temperature\")\n", + "df[\"ds\"] = pd.to_datetime(df[\"ds\"])\n", + "df[\"y\"] = pd.to_numeric(df[\"y\"], errors=\"coerce\")\n", + "\n", + "df = df.drop(\"ds\", axis=1)\n", + "df[\"ds\"] = pd.date_range(start=\"2015-01-01 00:00:00\", periods=len(df), freq=\"H\")\n", + "df[\"ID\"] = \"test\"\n", + "\n", + "df_id = df[[\"ds\", \"y\", \"temp\"]].copy()\n", + "df_id[\"ID\"] = \"test2\"\n", + 
"df_id[\"y\"] = df_id[\"y\"] * 0.3\n", + "df_id[\"temp\"] = df_id[\"temp\"] * 0.4\n", + "df = pd.concat([df, df_id], ignore_index=True)\n", + "\n", + "# Conditional Seasonality\n", + "df[\"winter\"] = np.where(\n", + " df[\"ds\"].dt.month.isin([1]),\n", + " 1,\n", + " 0,\n", + ")\n", + "df[\"summer\"] = np.where(df[\"ds\"].dt.month.isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), 1, 0)\n", + "df[\"winter\"] = pd.to_numeric(df[\"winter\"], errors=\"coerce\")\n", + "df[\"summer\"] = pd.to_numeric(df[\"summer\"], errors=\"coerce\")\n", + "\n", + "# Normalize Temperature\n", + "df[\"temp\"] = (df[\"temp\"] - 65.0) / 50.0\n", + "\n", + "# df\n", + "df = df[[\"ID\", \"ds\", \"y\", \"temp\", \"winter\", \"summer\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using CPU\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Temporary Test for on-the-fly sampling - very time consuming!\n", + "\n", + "\n", + "# Hyperparameter\n", + "tuned_params = {\n", + " \"n_lags\": 10,\n", + " \"newer_samples_weight\": 2.0,\n", + " \"n_changepoints\": 0,\n", + " \"yearly_seasonality\": 10,\n", + " \"weekly_seasonality\": True,\n", + " \"daily_seasonality\": False, # due to conditional daily seasonality\n", + " \"batch_size\": 128,\n", + " \"ar_layers\": [8, 4],\n", + " \"lagged_reg_layers\": [8],\n", + " # not tuned\n", + " \"n_forecasts\": 5,\n", + " \"learning_rate\": 0.001,\n", + " \"epochs\": 10,\n", + " \"trend_global_local\": \"global\",\n", + " \"season_global_local\": \"global\",\n", + " \"drop_missing\": True,\n", + " \"normalize\": \"standardize\",\n", + "}\n", + "\n", + "# Uncertainty Quantification\n", + "confidence_lv = 0.98\n", + "quantile_list = [round(((1 - confidence_lv) / 2), 2), round((confidence_lv + (1 - confidence_lv) / 2), 2)]\n", + "\n", + "# Check if GPU is available\n", + "use_gpu = torch.cuda.is_available()\n", + "\n", + "# Set trainer configuration\n", + "trainer_configs = {\n", + " \"accelerator\": \"gpu\" if use_gpu else \"cpu\",\n", + "}\n", + "print(f\"Using {'GPU' if use_gpu else 'CPU'}\")\n", + "\n", + "# Model\n", + "m = NeuralProphet(**tuned_params, **trainer_configs, quantiles=quantile_list)\n", + "\n", + "# Lagged Regressor\n", + "m.add_lagged_regressor(names=\"temp\", n_lags=33, normalize=\"standardize\")\n", + "\n", + "# Conditional Seasonality\n", + "m.add_seasonality(name=\"winter\", period=1, fourier_order=6, condition_name=\"winter\")\n", + "m.add_seasonality(name=\"summer\", period=1, fourier_order=6, condition_name=\"summer\")\n", + "\n", + "# Holidays\n", + "m.add_country_holidays(country_name=\"US\", lower_window=-1, upper_window=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - (NP.forecaster.fit) - When Global modeling with local normalization, metrics are displayed in normalized scale.\n", + "INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.929% of the data.\n", + "INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H\n", + "INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.929% of the data.\n", + "INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H\n", + "INFO - (NP.utils.configure_trainer) - Using 
accelerator cpu with 1 device(s).\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "aa26aaf9191f401b9c69ebafca381bab", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8f924e854e154a2a9e9e86640d0298db", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "babd196a4ca640adaa6302bdba9682b1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4c4b3ce470a7482f83b3118343efa35e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "43e00642d3674fac82ab23cd4d56ab3c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9091c01f25ff475bb3e16f402bdcb08b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a86321ee392d4c42b7e078509adb4efd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a659c5f8d73f49d2ba37897c1d604989", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "76eb4a72f44b47048d9e23746a6baf41", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f6400786ccbb4d899b117881cae52eb6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "914fe3ff48524bf38e9f5892bd897646", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Split\n", + "df_train = df[df[\"ds\"] < \"2015-03-01\"]\n", + "df_test = df[df[\"ds\"] >= \"2015-03-01\"]\n", + "\n", + "# Training & Predict\n", + "metrics = m.fit(df=df_train, validation_df=df_test, freq=\"H\", num_workers=4, early_stopping=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + 
"plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "legendgroup": "MAE", + "line": { + "color": "#2d92ff", + "width": 2 + }, + "mode": "lines", + "name": "MAE", + "type": "scatter", + "xaxis": "x", + "y": [ + 1.6991313695907593, + 1.5541504621505737, + 1.2866111993789673, + 1.0485198497772217, + 0.9603586792945862, + 0.933108389377594, + 0.9244528412818909, + 0.9177840948104858, + 0.9132021069526672, + 0.9105463027954102 + ], + "yaxis": "y" + }, + { + "legendgroup": "MAE", + "line": { + "color": "black", + "width": 2 + }, + "mode": "lines", + "name": "MAE_val", + "type": "scatter", + "xaxis": "x", + "y": [ + 1.9174306392669678, + 2.133635997772217, + 2.1361277103424072, + 1.954904317855835, + 1.8205108642578125, + 1.7834810018539429, + 1.7635681629180908, + 1.7493915557861328, + 1.7418491840362549, + 1.7389646768569946 + ], + "yaxis": "y" + }, + { + "legendgroup": "RMSE", + "line": { + "color": "#2d92ff", + "width": 2 + }, + "mode": "lines", + "name": "RMSE", + "type": "scatter", + "xaxis": "x2", + "y": [ + 2.249849557876587, + 2.062807083129883, + 1.6801131963729858, + 1.344346523284912, + 1.2270969152450562, + 1.1934525966644287, + 1.1826142072677612, + 1.1741188764572144, + 1.169130563735962, + 1.1649360656738281 + ], + "yaxis": "y2" + }, + { + "legendgroup": "RMSE", + "line": { + "color": "black", + "width": 2 + }, + "mode": "lines", + "name": "RMSE_val", + "type": "scatter", + "xaxis": "x2", + "y": [ + 2.1282451152801514, + 2.287360668182373, + 2.3184731006622314, + 2.140346050262451, + 2.0008866786956787, + 1.962218999862671, + 1.9410110712051392, + 1.9257516860961914, + 1.9175572395324707, + 1.914405107498169 + ], + "yaxis": "y2" + }, + { + "legendgroup": "Loss", + "line": { + "color": "#2d92ff", + "width": 2 + }, + "mode": "lines", + "name": "Loss", + "type": "scatter", + "xaxis": "x3", + "y": [ + 3.4565775394439697, + 3.047083854675293, + 2.3058581352233887, + 1.710412621498108, + 1.4448997974395752, + 1.353717565536499, + 1.3267676830291748, + 1.3102833032608032, + 1.2921112775802612, + 1.2888280153274536 + ], + "yaxis": "y3" + }, + { + "legendgroup": "Loss", + "line": { + "color": "black", + "width": 2 + }, + "mode": "lines", + "name": "Loss_val", + "type": "scatter", + "xaxis": "x3", + "y": [ + 4.821254730224609, + 4.705277919769287, + 4.240411758422852, + 3.7221953868865967, + 3.4264442920684814, + 3.345188617706299, + 3.2992584705352783, + 3.2648608684539795, + 3.246990919113159, + 3.2401645183563232 + ], + "yaxis": "y3" + }, + { + "legendgroup": "Loss", + "line": { + "color": "black", + "width": 2 + }, + "mode": "lines", + "name": "RegLoss", + "type": "scatter", + "xaxis": "x3", + "y": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "yaxis": "y3" + } + ], + "layout": { + "annotations": [ + { + "font": { + "size": 16 + }, + "showarrow": false, + "text": "MAE", + "x": 0.14444444444444446, + "xanchor": "center", + "xref": "paper", + "y": 1, + "yanchor": "bottom", + "yref": "paper" + }, + { + "font": { + "size": 16 + }, + "showarrow": false, + "text": "RMSE", + "x": 0.5, + "xanchor": "center", + "xref": "paper", + "y": 1, + "yanchor": "bottom", + "yref": "paper" + }, + { + "font": { + "size": 16 + }, + "showarrow": false, + "text": "Loss", + "x": 0.8555555555555556, + "xanchor": "center", + "xref": "paper", + "y": 1, + "yanchor": "bottom", + "yref": "paper" + } + ], + "autosize": true, + "font": { + "size": 10 + }, + "height": 200, + "margin": { + "b": 0, + "l": 0, + "pad": 0, + "r": 10, + "t": 30 + }, + "template": { + "data": { + "bar": [ + { + "error_x": { 
+ "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "white", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "white", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "#C8D4E3", + "linecolor": "#C8D4E3", + "minorgridcolor": "#C8D4E3", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "#C8D4E3", + "linecolor": "#C8D4E3", + "minorgridcolor": "#C8D4E3", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 
0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + 
"sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "white", + "showlakes": true, + "showland": true, + "subunitcolor": "#C8D4E3" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "white", + "polar": { + "angularaxis": { + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "" + }, + "bgcolor": "white", + "radialaxis": { + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + }, + "yaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + }, + "zaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + }, + "baxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + }, + "bgcolor": "white", + "caxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "#EBF0F8", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "#EBF0F8", + "zerolinewidth": 2 + } + } + }, + "title": { + "font": { + "size": 10 + } + }, + "width": 1000, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 0.2888888888888889 + ], + "linewidth": 1.5, + "mirror": true, + "showgrid": false, + "showline": true + }, + "xaxis2": { + "anchor": "y2", + "domain": [ + 0.35555555555555557, + 0.6444444444444445 + ], + "linewidth": 1.5, + "mirror": true, + "showgrid": false, + "showline": true + }, + "xaxis3": { + "anchor": "y3", + "domain": [ + 0.7111111111111111, + 1 + ], + "linewidth": 1.5, + "mirror": true, + "showgrid": false, + "showline": true + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "linewidth": 1.5, + "mirror": true, + "rangemode": "tozero", + "showgrid": false, + "showline": true, + "type": "log" + }, + "yaxis2": { + "anchor": "x2", + "domain": [ + 0, + 1 + ], + "linewidth": 1.5, + "mirror": true, + "rangemode": "tozero", + "showgrid": false, + "showline": true, + "type": "log" + }, + "yaxis3": { + "anchor": "x3", + "domain": [ + 0, + 1 + ], + "linewidth": 1.5, + "mirror": true, + "rangemode": 
"tozero", + "showgrid": false, + "showline": true, + "type": "log" + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "create_metrics_plot(metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'MAE_val': 1.7389646768569946,\n", + " 'RMSE_val': 1.914405107498169,\n", + " 'Loss_val': 3.2401645183563232,\n", + " 'RegLoss_val': 0.0,\n", + " 'epoch': 9,\n", + " 'MAE': 0.9105463027954102,\n", + " 'RMSE': 1.1649360656738281,\n", + " 'Loss': 1.2888280153274536,\n", + " 'RegLoss': 0.0}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.to_dict(\"records\")[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAE_valRMSE_valLoss_valRegLoss_valepochMAERMSELossRegLoss
91.7389651.9144053.2401650.090.9105461.1649361.2888280.0
\n", + "
" + ], + "text/plain": [ + " MAE_val RMSE_val Loss_val RegLoss_val epoch MAE RMSE \\\n", + "9 1.738965 1.914405 3.240165 0.0 9 0.910546 1.164936 \n", + "\n", + " Loss RegLoss \n", + "9 1.288828 0.0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.tail(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.932% of the data.\n", + "INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H\n", + "INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.932% of the data.\n", + "INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.932% of the data.\n", + "INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H\n", + "INFO - (NP.data.processing._handle_missing_data) - Dropped 5 rows at the end with NaNs in 'y' column.\n", + "INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.932% of the data.\n", + "INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H\n", + "INFO - (NP.data.processing._handle_missing_data) - Dropped 5 rows at the end with NaNs in 'y' column.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "78600faef98442c3bcae260cf6a78232", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Predicting: 22it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a621592c80404313838c8ae9250a41c3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Predicting: 22it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "forecast = m.predict(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - (NP.forecaster.plot) - Plotting data from ID test\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4d1fc6fc6bca4a459a6484e0f9bec945", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FigureWidgetResampler({\n", + " 'data': [{'fillcolor': 'rgba(45, 146, 255, 0.2)',\n", + " 'line': {'color': 'rgba(45, 146, 255, 0.2)', 'width': 1},\n", + " 'mode': 'lines',\n", + " 'name': '[R] yhat5 1.0% ~1h',\n", + " 'type': 'scatter',\n", + " 'uid': 'aebc484d-c130-47bd-8870-268071f0b3d5',\n", + " 'x': array([datetime.datetime(2015, 1, 2, 13, 0),\n", + " datetime.datetime(2015, 1, 2, 14, 0),\n", + " datetime.datetime(2015, 1, 2, 15, 0), ...,\n", + " datetime.datetime(2015, 3, 2, 17, 0),\n", + " datetime.datetime(2015, 3, 2, 18, 0),\n", + " datetime.datetime(2015, 3, 2, 20, 0)], dtype=object),\n", + " 'y': array([62.35801 , 58.90128 , 49.21923 , ..., 50.683945, 56.553596, 58.41175 ],\n", + " dtype=float32)},\n", + " {'fill': 'tonexty',\n", + " 'fillcolor': 'rgba(45, 146, 255, 0.2)',\n", + " 'line': {'color': 'rgba(45, 146, 255, 0.2)', 'width': 1},\n", + " 'mode': 'lines',\n", + " 'name': '[R] yhat5 99.0% ~1h',\n", + " 'type': 'scatter',\n", + " 
'uid': 'c62aca2a-cbb9-4e43-915f-156387e57092',\n",
+       " ...}],\n",
+       " 'layout': {...}\n",
+       "})\n",
+       "[Figure data elided: forecast plot of '[R] Actual ~1h' and '[R] Predicted ~1h' as lines and markers over axes 'ds' and 'y', with range selector and range slider.]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "m.highlight_nth_step_ahead_of_each_forecast(m.n_forecasts)\n",
+    "m.plot(forecast, df_name=\"test\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO - (NP.forecaster.plot_components) - Plotting data from ID test\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "FigureWidgetResampler({...})\n",
+       "[Figure data elided: components plot with panels 'Trend', 'yearly seasonality', 'weekly seasonality', 'winter seasonality', 'summer seasonality', 'AR (5)-ahead', 'Lagged Regressor \"temp\" (5)-ahead', 'Additive Events', 'Uncertainty'.]"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "m.plot_components(forecast, df_name=\"test\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "FigureWidgetResampler({...})\n",
+       "[Figure data elided: parameter plot with panels 'Trend', 'Seasonality: yearly', 'Seasonality: weekly', 'Seasonality: winter', 'Seasonality: summer', 'AR weight (5)-ahead', 'Lagged Regressor \"temp\" weight (5)-ahead', 'Additive event weight'.]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "m.plot_parameters()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+ "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 712dcf0b49ca66ce95c26d1e65430dc21efe066e Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 14 Feb 2024 17:03:18 -0800 Subject: [PATCH 094/128] set to log model performance INFO --- tests/test_model_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 3d72c186c..55e2bdf5f 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -16,8 +16,8 @@ from neuralprophet import NeuralProphet, set_random_seed log = logging.getLogger("NP.test") -log.setLevel("ERROR") -log.parent.setLevel("ERROR") +log.setLevel("INFO") +log.parent.setLevel("INFO") DIR = pathlib.Path(__file__).parent.parent.absolute() DATA_DIR = os.path.join(DIR, "tests", "test-data") From c0b3cdddf502b0d53b7a711d945b3e9ee2a8f8c2 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Wed, 14 Feb 2024 17:07:05 -0800 Subject: [PATCH 095/128] address config_regressors.regressors --- neuralprophet/time_dataset.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 2affa93dc..ba58ac0a6 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -511,7 +511,7 @@ def tabularize_univariate_datetime_single_index( # create numpy array of values of additive and multiplicative regressors, at correct indexes # features dims: (n_forecasts, n_features) any_future_regressors = 0 < len(additive_regressors_names + multiplicative_regressors_names) - if any_future_regressors: # if config_regressors is not None: + if any_future_regressors: # if config_regressors.regressors is not None: inputs["regressors"] = get_sample_future_regressors( df=df, origin_index=origin_index, @@ -911,12 +911,10 @@ def mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_fore def sort_regressor_names(config): additive_regressors_names = [] multiplicative_regressors_names = [] - if config is not None: + if config is not None and config.regressors is not None: # sort and divide regressors into multiplicative and additive - additive_regressors_names = [] - multiplicative_regressors_names = [] - for reg in sorted(list(config.keys())): - mode = config[reg].mode + for reg in sorted(list(config.regressors.keys())): + mode = config.regressors[reg].mode if mode == "additive": additive_regressors_names.append(reg) else: From 88264fcb0b72a8c3ee6fd36a456550ac58aff6b5 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 15 Feb 2024 11:05:59 -0800 Subject: [PATCH 096/128] clean up create_nan_mask --- neuralprophet/time_dataset.py | 69 ++++------------------------------- 1 file changed, 8 insertions(+), 61 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index ba58ac0a6..2ed656790 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -821,68 +821,15 @@ def create_nan_mask( # TIME: TREND & SEASONALITY: the time at each sample's lags and forecasts # FUTURE REGRESSORS - # EVENTS - for names in [["t"], future_regressor_names, event_names]: - if len(names) > 0: - valid_columns = mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts) - valid_origins = np.logical_and(valid_origins, 
valid_columns) - return valid_origins - - # # TIME: TREND & SEASONALITY: the time at each sample's lags and forecasts - # if max_lags == 0: # y-series and origin_index match - # time_valid = np.logical_not(df_isna["t"].values) - # else: - # time_nan = sliding_window_view(df_isna["t"], window_shape=n_lags+n_forecasts, axis=0).any(axis=-1) - # # first sample is at origin_index = n_lags -1, - # if n_lags == 0: # first sample origin index is at -1 - # time_nan = time_nan[1:] - # else: - # time_nan = np.pad(time_nan, pad_width=(n_lags-1, 0), mode="constant", constant_values=True) - # # there are n_forecasts origin_indexes missing at end - # time_nan = np.pad(time_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) - # time_valid = np.logical_not(time_nan) - # valid_origins = np.logical_and(valid_origins, time_valid) - - # # FUTURE REGRESSORS - # if len(future_regressor_names) > 0: - # if max_lags == 0: - # fut_reg_nan = df_isna.loc[:, future_regressor_names] - # assert len(fut_reg_nan.shape) == 2 - # fut_reg_nan = fut_reg_nan.any(axis=-1) - # else: - # fut_reg_nan = sliding_window_view(df_isna.loc[:, future_regressor_names], window_shape=n_lags+n_forecasts, axis=0).any(axis=-1) - # assert len(fut_reg_nan.shape) == 2 - # fut_reg_nan = fut_reg_nan.any(axis=-1) - # # first sample is at origin_index = n_lags -1, - # if n_lags == 0: # first sample origin index is at -1 - # fut_reg_nan = fut_reg_nan[1:] - # else: - # fut_reg_nan = np.pad(fut_reg_nan, pad_width=(n_lags-1, 0), mode="constant", constant_values=True) - # # there are n_forecasts origin_indexes missing at end - # fut_reg_nan = np.pad(fut_reg_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) - # fut_reg_valid = np.logical_not(fut_reg_nan) - # valid_origins = np.logical_and(valid_origins, fut_reg_valid) - # # EVENTS - # if len(event_names) > 0: - # if max_lags == 0: - # event_nan = df_isna.loc[:, event_names] - # assert len(event_nan.shape) == 2 - # event_nan = event_nan.any(axis=-1) - # else: - # event_nan = sliding_window_view(df_isna.loc[:, event_names], window_shape=n_lags+n_forecasts, axis=0).any(axis=-1) - # assert len(event_nan.shape) == 2 - # event_nan = event_nan.any(axis=-1) - # # first sample is at origin_index = n_lags -1, - # if n_lags == 0: # first sample origin index is at -1 - # event_nan = event_nan[1:] - # else: - # event_nan = np.pad(event_nan, pad_width=(n_lags-1, 0), mode="constant", constant_values=True) - # # there are n_forecasts origin_indexes missing at end - # event_nan = np.pad(event_nan, pad_width=(0, n_forecasts), mode="constant", constant_values=True) - # event_valid = np.logical_not(event_nan) - # valid_origins = np.logical_and(valid_origins, event_valid) - # return valid_origins + names = ["t"] + future_regressor_names + event_names + valid_columns = mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts) + valid_origins = np.logical_and(valid_origins, valid_columns) + # for names in [["t"], future_regressor_names, event_names]: + # if len(names) > 0: + # valid_columns = mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts) + # valid_origins = np.logical_and(valid_origins, valid_columns) + return valid_origins def mask_origin_without_nan_for_columns(df_isna, names, max_lags, n_lags, n_forecasts): From a0b0247ab4dec7567180553eccde84dcfe804d8c Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 15 Feb 2024 11:10:08 -0800 Subject: [PATCH 097/128] clean up create_nan_mask params --- neuralprophet/time_dataset.py | 
14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 2ed656790..c644a9d7a 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -178,13 +178,9 @@ def create_sample2index_map(self, df): # Combine prediction origin masks valid_prediction_mask = np.logical_and(prediction_frequency_mask, origin_start_end_mask) - # TODO Create NAN-free index mapping of sample index to df index - # analogous to `self.drop_nan_after_init( - # self.df, self.kwargs["predict_steps"], self.kwargs["config_missing"].drop_missing) + # Create NAN-free index mapping of sample index to df index nan_mask = create_nan_mask( df=df, - predict_steps=self.predict_steps, - drop_missing=self.config_missing.drop_missing, predict_mode=self.predict_mode, max_lags=self.max_lags, n_lags=self.n_lags, @@ -747,8 +743,6 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen def create_nan_mask( df, - predict_steps, - drop_missing, predict_mode, max_lags, n_lags, @@ -760,12 +754,6 @@ def create_nan_mask( """Creates mask for each prediction origin, accounting for corresponding input lags / forecast targets containing any NaN values. - Parameters - ---------- - drop_missing : bool - whether to automatically drop missing samples from the data - predict_steps : int - number of steps to predict """ valid_origins = np.ones(len(df), dtype=bool) df_isna = df.isna() From 93f00676594ec1bd4347be1c5d8a84dcf4ce850e Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 15 Feb 2024 11:22:06 -0800 Subject: [PATCH 098/128] clean TimeDataframe --- neuralprophet/time_dataset.py | 49 ++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index c644a9d7a..80632b1bf 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -22,7 +22,6 @@ class TimeDataset(Dataset): def __init__( self, df, - name, predict_mode, n_lags, n_forecasts, @@ -40,10 +39,6 @@ def __init__( ---------- df : pd.DataFrame Time series data - name : str - Name of time-series - **kwargs : dict - Identical to :meth:`tabularize_univariate_datetime` """ # Outcome after a call to init (summary): # - add events and holidays columns to df @@ -61,14 +56,19 @@ def __init__( self.df = df.reset_index(drop=True) # Needed for index based operations in __getitem__ if "index" in list(self.df.columns): # should not be the case self.df = self.df.drop("index", axis=1) + df_names = list(np.unique(df.loc[:, "ID"].values)) + assert len(df_names) == 1 + assert df_names[0] is str + self.df_name = df_names[0] + self.meta = OrderedDict({}) - self.meta["df_name"] = name + self.meta["df_name"] = self.df_name self.predict_mode = predict_mode self.n_lags = n_lags self.n_forecasts = n_forecasts self.prediction_frequency = prediction_frequency - self.predict_steps = predict_steps + self.predict_steps = predict_steps # currently unused self.config_seasonality = config_seasonality self.config_events = config_events self.config_country_holidays = config_country_holidays @@ -172,7 +172,6 @@ def create_sample2index_map(self, df): # Prediction Frequency # Filter missing samples and prediction frequency (does not actually drop, but creates indexmapping) - # analogous to `self.filter_samples_after_init(self.kwargs["prediction_frequency"])` prediction_frequency_mask = create_prediction_frequency_filter_mask(df, self.prediction_frequency) # Combine prediction 
origin masks @@ -212,20 +211,46 @@ def create_sample2index_map(self, df): class GlobalTimeDataset(TimeDataset): - def __init__(self, df, **kwargs): + def __init__( + self, + df, + predict_mode, + n_lags, + n_forecasts, + prediction_frequency, + predict_steps, + config_seasonality, + config_events, + config_country_holidays, + config_regressors, + config_lagged_regressors, + config_missing, + ): """Initialize Timedataset from time-series df. Parameters ---------- df : pd.DataFrame dataframe containing column ``ds``, ``y``, and optionally``ID`` and normalized columns normalized columns ``ds``, ``y``, ``t``, ``y_scaled`` - **kwargs : dict - Identical to :meth:`tabularize_univariate_datetime` + """ self.df_names = sorted(list(np.unique(df.loc[:, "ID"].values))) self.datasets = OrderedDict({}) for df_name in self.df_names: - self.datasets[df_name] = TimeDataset(df[df["ID"] == df_name], df_name, **kwargs) + self.datasets[df_name] = TimeDataset( + df=df[df["ID"] == df_name], + predict_mode=predict_mode, + n_lags=n_lags, + n_forecasts=n_forecasts, + prediction_frequency=prediction_frequency, + predict_steps=predict_steps, + config_seasonality=config_seasonality, + config_events=config_events, + config_country_holidays=config_country_holidays, + config_regressors=config_regressors, + config_lagged_regressors=config_lagged_regressors, + config_missing=config_missing, + ) self.length = sum(dataset.length for (name, dataset) in self.datasets.items()) global_sample_to_local_ID = [] global_sample_to_local_sample = [] From d769a8deb3a84cb73293ec1a84bf2f42f026a0d4 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Thu, 15 Feb 2024 11:29:56 -0800 Subject: [PATCH 099/128] update prediction frequency documentation --- neuralprophet/forecaster.py | 7 ++++++- neuralprophet/time_dataset.py | 13 ++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index bdcde8f6e..27a262182 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -364,7 +364,7 @@ class NeuralProphet: trainer_config: dict Dictionary of additional trainer configuration parameters. prediction_frequency: dict - periodic interval in which forecasts should be made. + Periodic interval in which forecasts should be made. More than one item only allowed for {"daily-hour": x, "weekly-day": y"} to forecast on a specific hour of a specific day of week. @@ -379,6 +379,11 @@ class NeuralProphet: * ``'weekly-day'``: forecast once per week at a specified day * ``'monthly-day'``: forecast once per month at a specified day * ``'yearly-month'``: forecast once per year at a specified month + + Note + ---- + The forecast origin set refers to the last observation's timestamp, not the first forecast target. + In the special case where no auto-regression or lagged regressors are used, the forecast origin and forecast target are identical. """ model: time_net.TimeNet diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 80632b1bf..290283b84 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -735,15 +735,13 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen Returns boolean mask where prediction origin indexes to be included are True, and the rest False. """ - # !! IMPORTANT - # TODO: Adjust top level documentation to specify that the filter is applied to prediction ORIGIN, not targets start. - # !! 
IMPORTANT
-
     mask = np.ones((len(df),), dtype=bool)

     # Basic case: no filter
-    if prediction_frequency is None or prediction_frequency == 1:
+    if prediction_frequency is None:
         return mask
+    else:
+        assert prediction_frequency is dict

     timestamps = pd.to_datetime(df.loc[:, "ds"])
     filter_masks = []
     for key, value in prediction_frequency.items():

From 576ed1429242ffe088fa222825f1128c6a976b8f Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Thu, 15 Feb 2024 11:38:50 -0800
Subject: [PATCH 100/128] improve prediction frequency documentation

---
 neuralprophet/forecaster.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py
index 27a262182..1d928a946 100644
--- a/neuralprophet/forecaster.py
+++ b/neuralprophet/forecaster.py
@@ -365,8 +365,9 @@ class NeuralProphet:
         Dictionary of additional trainer configuration parameters.
     prediction_frequency: dict
         Periodic interval in which forecasts should be made.
-        More than one item only allowed for {"daily-hour": x, "weekly-day": y"} to forecast on a specific hour of a
-        specific day of week.
+
+        Currently, only one item in dict is supported, except for the specific combination of
+        {"daily-hour": x, "weekly-day": y} to predict at a specific hour of a specific day of week.

         Key: str
             periodicity of the predictions to be made.
         Value: int
             forecast origin of the predictions to be made, e.g. 7 for 7am in case of 'daily-hour'.

         Options
-            * ``'hourly-minute'``: forecast once per hour at a specified minute
-            * ``'daily-hour'``: forecast once per day at a specified hour
-            * ``'weekly-day'``: forecast once per week at a specified day
-            * ``'monthly-day'``: forecast once per month at a specified day
-            * ``'yearly-month'``: forecast once per year at a specified month
+            * ``'hourly-minute'``: forecast once per hour at a specified minute in range [0, 59]
+            * ``'daily-hour'``: forecast once per day at a specified hour in range [0, 23]
+            * ``'weekly-day'``: forecast once per week at a specified day in range [0, 6]
+            * ``'monthly-day'``: forecast once per month at a specified day in range [1, 31]
+            * ``'yearly-month'``: forecast once per year at a specified month in range [1, 12]

         Note
         ----

From 865645c5f81a19a4a5f7deed55e817448a4bd027 Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Thu, 15 Feb 2024 11:46:51 -0800
Subject: [PATCH 101/128] further improve prediction frequency documentation

---
 neuralprophet/forecaster.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py
index 1d928a946..131e7f3d0 100644
--- a/neuralprophet/forecaster.py
+++ b/neuralprophet/forecaster.py
@@ -364,7 +364,13 @@ class NeuralProphet:
     trainer_config: dict
         Dictionary of additional trainer configuration parameters.
     prediction_frequency: dict
-        Periodic interval in which forecasts should be made.
+        Set a periodic interval in which forecasts should be made.
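+
+        A minimal illustrative sketch (parameter values here are hypothetical, not defaults)::
+
+            m = NeuralProphet(n_lags=48, n_forecasts=24, prediction_frequency={"daily-hour": 7})
+
+        Here, every prediction origin is restricted to 7am.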
+
+        By default, a model creates predictions for all possible prediction origins in the dataset.
+        (e.g. for an hourly dataset, at each hour, each day, for all days in the dataset)
+        Setting `prediction_frequency` allows making forecasts only at a specific, periodically repeating point in time (prediction origin).
+        (e.g. {"daily-hour": 12} sets the model to predict only at noon, and no other hour)
+
         Currently, only one item in dict is supported, except for the specific combination of
         {"daily-hour": x, "weekly-day": y} to predict at a specific hour of a specific day of week.
@@ -383,7 +389,11 @@ class NeuralProphet:

         Note
         ----
-        The forecast origin set refers to the last observation's timestamp, not the first forecast target.
+        This filter is applied to both model training and prediction.
+
+        Note
+        ----
+        The forecast/prediction origin set refers to the last observation's timestamp, not the first forecast target.
         In the special case where no auto-regression or lagged regressors are used, the forecast origin and forecast target are identical.
     """

From 4c4d640e300971704d3dd6828a5d8fc886af118e Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Thu, 15 Feb 2024 11:59:50 -0800
Subject: [PATCH 102/128] fix test errors

---
 neuralprophet/time_dataset.py | 8 ++++----
 tests/test_unit.py            | 3 +--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 290283b84..dcf462caf 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -58,7 +58,7 @@ def __init__(
             self.df = self.df.drop("index", axis=1)
         df_names = list(np.unique(df.loc[:, "ID"].values))
         assert len(df_names) == 1
-        assert df_names[0] is str
+        assert type(df_names[0]) is str
         self.df_name = df_names[0]

         self.meta = OrderedDict({})
@@ -746,7 +746,9 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen
     timestamps = pd.to_datetime(df.loc[:, "ds"])
     filter_masks = []
     for key, value in prediction_frequency.items():
-        if key == "daily-hour":
+        if key == "hourly-minute":
+            mask = timestamps.dt.minute == value
+        elif key == "daily-hour":
            mask = timestamps.dt.hour == value
         elif key == "weekly-day":
             mask = timestamps.dt.dayofweek == value
@@ -754,8 +756,6 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen
             mask = timestamps.dt.day == value
         elif key == "yearly-month":
             mask = timestamps.dt.month == value
-        elif key == "hourly-minute":
-            mask = timestamps.dt.minute == value
         else:
             raise ValueError(f"Invalid prediction frequency: {key}")
         filter_masks.append(mask)

diff --git a/tests/test_unit.py b/tests/test_unit.py
index 2757800f1..05996f8b5 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -95,10 +95,10 @@ def test_timedataset_minimal():
     local_data_params, global_data_params = df_utils.init_data_params(df=df, normalize="minmax")
     df = df.drop("ID", axis=1)
     df = df_utils.normalize(df, global_data_params)
+    df["ID"] = "__df__"

     dataset = time_dataset.TimeDataset(
         df=df,
-        name="name",
         predict_mode=False,
         n_lags=n_lags,
         n_forecasts=n_forecasts,
@@ -864,7 +864,6 @@ def test_too_many_NaN():
     with pytest.raises(ValueError):
         time_dataset.TimeDataset(
             df=df,
-            name="name",
             predict_mode=False,
             n_lags=n_lags,
             n_forecasts=n_forecasts,
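A minimal sketch of the filter semantics exercised by the fixes above; the dataframe,
frequency setting, and expected count are illustrative assumptions, not part of this series:

    import pandas as pd
    from neuralprophet.time_dataset import create_prediction_frequency_filter_mask

    # 48 hourly timestamps spanning two days
    df = pd.DataFrame({"ds": pd.date_range("2024-01-01", periods=48, freq="H")})
    mask = create_prediction_frequency_filter_mask(df, {"daily-hour": 7})
    assert mask.sum() == 2  # True only at the two 7am prediction origins, one per day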
From d63ea98c774c3461c95e4d674913e7f4584ffdd6 Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Thu, 15 Feb 2024 12:03:46 -0800
Subject: [PATCH 103/128] fix df_names call

---
 neuralprophet/forecaster.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py
index 131e7f3d0..418fa6659 100644
--- a/neuralprophet/forecaster.py
+++ b/neuralprophet/forecaster.py
@@ -1880,7 +1880,6 @@ def predict_seasonal_components(self, df: pd.DataFrame, quantile: float = 0.5):
         for df_name, df_i in df.groupby("ID"):
             dataset = time_dataset.TimeDataset(
                 df=df_i,
-                name=df_name,
                 predict_mode=True,
                 n_lags=0,
                 n_forecasts=1,

From 6dfaffa858828b705703cce11af1dc630f2d0fca Mon Sep 17 00:00:00 2001
From: ourownstory
Date: Thu, 15 Feb 2024 12:14:08 -0800
Subject: [PATCH 104/128] fix selective prediction assertion

---
 neuralprophet/time_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index dcf462caf..1b3a14ced 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -741,7 +741,7 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen
     if prediction_frequency is None:
         return mask
     else:
-        assert prediction_frequency is dict
+        assert type(prediction_frequency) is dict

     timestamps = pd.to_datetime(df.loc[:, "ds"])
     filter_masks = []

From 0845d624925071f5bb7e6c95fd772623bb875f8d Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 11:28:48 -0700
Subject: [PATCH 105/128] normalize holiday names

---
 neuralprophet/time_dataset.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py
index 1b3a14ced..4876c579a 100644
--- a/neuralprophet/time_dataset.py
+++ b/neuralprophet/time_dataset.py
@@ -659,6 +659,13 @@ def add_event_features_to_df(
         np.array
             All multiplicative event features (both user specified and country specific)
     """
+
+    def normalize_holiday_name(name):
+        # Handle cases like "Independence Day (observed)" -> "Independence Day"
+        if "(observed)" in name:
+            return name.replace(" (observed)", "")
+        return name
+
     # create all additional user specified offest events
     additive_events_names = []
     multiplicative_events_names = []
@@ -685,6 +692,7 @@ def add_event_features_to_df(
         mode = config.mode
         for holiday in config_country_holidays.holiday_names:
             feature = pd.Series(np.zeros(df.shape[0], dtype=np.float32))
+            holiday = normalize_holiday_name(holiday)
             if holiday in country_holidays_dict.keys():
                 dates = country_holidays_dict[holiday]
                 feature[df.ds.isin(dates)] = 1.0

From 0982084326c92cb3f56f1dab01a84a7170cace8a Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 13:32:29 -0700
Subject: [PATCH 106/128] fix linting

---
 neuralprophet/configure.py            |   1 -
 tests/test_future_regressor_nn.py     |   4 +-
 tests/test_glocal.py                  | 183 +++-----------------------
 tests/test_model_performance.py       |   2 +-
 tests/utils/benchmark_time_dataset.py |   2 -
 5 files changed, 22 insertions(+), 170 deletions(-)

diff --git a/neuralprophet/configure.py b/neuralprophet/configure.py
index 947f95b29..d4f6df2e0 100644
--- a/neuralprophet/configure.py
+++ b/neuralprophet/configure.py
@@ -15,7 +15,6 @@
 from neuralprophet import df_utils, np_types, utils_torch
 from neuralprophet.custom_loss_metrics import PinballLoss
-from neuralprophet.event_utils import get_holiday_names
 from neuralprophet.hdays_utils import get_holidays_from_country

 log = logging.getLogger("NP.config")

diff --git a/tests/test_future_regressor_nn.py b/tests/test_future_regressor_nn.py
index d6d59e991..bc752b1a6 100644
--- a/tests/test_future_regressor_nn.py
+++ b/tests/test_future_regressor_nn.py
@@ -5,6 +5,7 @@
 import pathlib

 import pandas as pd
+from matplotlib import pyplot as plt

 from neuralprophet
import NeuralProphet @@ -141,6 +142,7 @@ def test_future_regressor_nn_2(): metrics = m.fit( df_train, validation_df=df_val, freq="H", epochs=EPOCHS, learning_rate=LR, early_stopping=True, progress=False ) + log.debug(f"Metrics: {metrics}") def test_future_regressor_nn_shared_2(): @@ -167,11 +169,11 @@ def test_future_regressor_nn_shared_2(): metrics = m.fit( df_train, validation_df=df_val, freq="H", epochs=EPOCHS, learning_rate=LR, early_stopping=True, progress=False ) + log.debug(f"Metrics: {metrics}") # def test_future_regressor_nn_shared_coef_2(): # log.info("future regressor with NN shared coef 2") - # df = pd.read_csv(TUTORIAL_FILE, nrows=NROWS) # m = NeuralProphet( diff --git a/tests/test_glocal.py b/tests/test_glocal.py index 354d69d14..e502ab213 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -207,6 +207,9 @@ def test_wrong_option_global_local_modeling(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.debug( + f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" + ) def test_different_seasonality_modeling(): @@ -235,6 +238,9 @@ def test_different_seasonality_modeling(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.debug( + f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" + ) def test_adding_new_global_seasonality(): @@ -264,6 +270,9 @@ def test_adding_new_global_seasonality(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.debug( + f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" + ) def test_adding_new_local_seasonality(): @@ -285,6 +294,9 @@ def test_adding_new_local_seasonality(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.debug( + f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" + ) def test_trend_local_reg(): @@ -315,6 +327,9 @@ def test_trend_local_reg(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.debug( + f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" + ) def test_glocal_seasonality_reg(): @@ -344,6 +359,7 @@ def test_glocal_seasonality_reg(): future = m.make_future_dataframe(test_df, n_historic_predictions=True) forecast = m.predict(future) metrics = m.test(test_df) + log.debug(f"forecast = {forecast}, metrics= {metrics}") def test_trend_local_reg_if_global(): @@ -373,169 +389,6 @@ def test_trend_local_reg_if_global(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_different_seasonality_modeling(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = 
df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - m = NeuralProphet( - n_forecasts=2, - n_lags=10, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - season_global_local="local", - yearly_seasonality_glocal_mode="global", - ) - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_adding_new_global_seasonality(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - m = NeuralProphet( - n_forecasts=2, - n_lags=10, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - season_global_local="local", - yearly_seasonality_glocal_mode="global", - ) - m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_adding_new_local_seasonality(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - m = NeuralProphet(epochs=EPOCHS, batch_size=BATCH_SIZE, season_global_local="global", trend_global_local="local") - m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="local") - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_trend_local_reg(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: - m = NeuralProphet( - n_forecasts=1, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - trend_global_local="local", - trend_local_reg=coef_i, - ) - - m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - 
metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_glocal_seasonality_reg(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: - m = NeuralProphet( - n_forecasts=1, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - season_global_local="local", - yearly_seasonality_glocal_mode="global", - seasonality_local_reg=coef_i, - ) - - m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - metrics = m.test(test_df) - - -def test_trend_local_reg_if_global(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: - m = NeuralProphet( - n_forecasts=1, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - trend_global_local="global", - trend_local_reg=3, + log.debug( + f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" ) - - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 3a7558013..481938726 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -243,7 +243,7 @@ def test_EnergyPriceDaily(): def test_EnergyDailyDeep(): - ### Temporary Test for on-the-fly sampling - very time consuming! + # Temporary Test for on-the-fly sampling - very time consuming! 
df = pd.read_csv(ENERGY_PRICE_DAILY_FILE) df = df[df["ds"] < "2018-01-01"] diff --git a/tests/utils/benchmark_time_dataset.py b/tests/utils/benchmark_time_dataset.py index c1e9e75fd..d80bd4f88 100644 --- a/tests/utils/benchmark_time_dataset.py +++ b/tests/utils/benchmark_time_dataset.py @@ -5,7 +5,6 @@ from itertools import product import pandas as pd -import pytest import torch.utils.benchmark as benchmark from torch.utils.data import DataLoader @@ -388,7 +387,6 @@ def peyton_minus_regressors(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season # print(f"#### Time: {toc - tic:0.4f} for test_asymmetrical_quantiles") -############################33333 # t0 = benchmark.Timer( # stmt='test_uncertainty_estimation_yosemite_temps(x)', # setup='from __main__ import test_uncertainty_estimation_yosemite_temps', From e89057b9591352d06d294ba61014f4ee0d03e0ee Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Fri, 21 Jun 2024 14:09:58 -0700 Subject: [PATCH 107/128] fix tests --- poetry.lock | 3 ++- tests/test_glocal.py | 3 +-- tests/test_regularization.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index dcfb37096..e7bc66106 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "absl-py" @@ -3111,6 +3111,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, diff --git a/tests/test_glocal.py b/tests/test_glocal.py index e502ab213..b2b65b05c 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -350,7 +350,6 @@ def test_glocal_seasonality_reg(): learning_rate=LR, season_global_local="local", yearly_seasonality_glocal_mode="global", - glocal_seasonality_reg=coef_i, ) m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") @@ -372,7 +371,7 @@ def test_trend_local_reg_if_global(): df2_0["ID"] = "df2" df3_0 = df.iloc[256:384, :].copy(deep=True) df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: + for _ in [-30, 0, False, True]: m = NeuralProphet( n_forecasts=1, epochs=EPOCHS, diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 34aef4a86..6631a4d43 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -82,7 +82,7 @@ def test_regularization_holidays(): to_preserve.append(weight_list[0][0][0]) # print(to_reduce) # print(to_preserve) - assert np.mean(to_reduce) < 0.1 + assert np.mean(to_reduce) 
< 0.2 assert np.mean(to_preserve) > 0.5 From 7d938bdde8ae006498ca06f0f4eadf668f0a9578 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 21 Jun 2024 14:36:37 -0700 Subject: [PATCH 108/128] update to use new holiday functions in event_utils.py --- neuralprophet/configure.py | 4 +-- neuralprophet/hdays_utils.py | 60 ++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/neuralprophet/configure.py b/neuralprophet/configure.py index d4f6df2e0..c70bf93b3 100644 --- a/neuralprophet/configure.py +++ b/neuralprophet/configure.py @@ -15,7 +15,7 @@ from neuralprophet import df_utils, np_types, utils_torch from neuralprophet.custom_loss_metrics import PinballLoss -from neuralprophet.hdays_utils import get_holidays_from_country +from neuralprophet.event_utils import get_holiday_names log = logging.getLogger("NP.config") @@ -509,7 +509,7 @@ class Holidays: holiday_names: set = field(init=False) def init_holidays(self, df=None): - self.holiday_names = get_holidays_from_country(self.country, df) + self.holiday_names = get_holiday_names(self.country, df) ConfigCountryHolidays = Holidays diff --git a/neuralprophet/hdays_utils.py b/neuralprophet/hdays_utils.py index 46dc61570..3e79a5a8d 100644 --- a/neuralprophet/hdays_utils.py +++ b/neuralprophet/hdays_utils.py @@ -83,33 +83,33 @@ def get_holidays_from_country(country: Union[str, Iterable[str], dict], df=None) return set(holiday_names) -def make_country_specific_holidays(year_list, country): - """ - Create dict of holiday names and dates for given years and countries - Parameters - ---------- - year_list : list - List of years - country : str, list, dict - List of country names and optional subdivisions - Returns - ------- - dict - holiday names as keys and dates as values - """ - # iterate over countries and get holidays for each country - - if isinstance(country, str): - country = {country: None} - elif isinstance(country, list): - country = dict(zip(country, [None] * len(country))) - - country_specific_holidays = {} - for single_country, subdivision in country.items(): - single_country_specific_holidays = get_country_holidays(single_country, year_list, subdivision) - # only add holiday if it is not already in the dict - country_specific_holidays.update(single_country_specific_holidays) - holidays_dates = defaultdict(list) - for date, holiday in country_specific_holidays.items(): - holidays_dates[holiday].append(pd.to_datetime(date)) - return holidays_dates +# def make_country_specific_holidays(year_list, country): +# """ +# Create dict of holiday names and dates for given years and countries +# Parameters +# ---------- +# year_list : list +# List of years +# country : str, list, dict +# List of country names and optional subdivisions +# Returns +# ------- +# dict +# holiday names as keys and dates as values +# """ +# # iterate over countries and get holidays for each country + +# if isinstance(country, str): +# country = {country: None} +# elif isinstance(country, list): +# country = dict(zip(country, [None] * len(country))) + +# country_specific_holidays = {} +# for single_country, subdivision in country.items(): +# single_country_specific_holidays = get_country_holidays(single_country, year_list, subdivision) +# # only add holiday if it is not already in the dict +# country_specific_holidays.update(single_country_specific_holidays) +# holidays_dates = defaultdict(list) +# for date, holiday in country_specific_holidays.items(): +# holidays_dates[holiday].append(pd.to_datetime(date)) +# return 
holidays_dates From f3ca8f3fd394a99c94dfaa4d6e7a6ef40679ded0 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 21 Jun 2024 14:55:21 -0700 Subject: [PATCH 109/128] fix seasonality_local_reg test --- tests/test_glocal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_glocal.py b/tests/test_glocal.py index b2b65b05c..cc949b1a5 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -332,7 +332,7 @@ def test_trend_local_reg(): ) -def test_glocal_seasonality_reg(): +def test_seasonality_local_reg(): # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES log.info("Global Modeling + Global Normalization") df = pd.read_csv(PEYTON_FILE, nrows=512) @@ -350,6 +350,7 @@ def test_glocal_seasonality_reg(): learning_rate=LR, season_global_local="local", yearly_seasonality_glocal_mode="global", + seasonality_local_reg=coef_i, ) m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") From 08038bd044c283a78b2defc47af3c21fecb084ab Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 21 Jun 2024 14:56:02 -0700 Subject: [PATCH 110/128] limit holidays to less than 1.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 44197aa43..8a893ae4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ pytorch-lightning = ">=2.0.0" tensorboard = ">=2.11.2" torchmetrics = ">=1.0.0" typing-extensions = ">=4.5.0" -holidays = ">=0.41" +holidays = ">=0.41,<1.0" captum = ">=0.6.0" matplotlib = ">=3.5.3" plotly = ">=5.13.1" From 1da552abd17cc92c9e869ecc1d3b751a0a2accf0 Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Fri, 21 Jun 2024 15:18:35 -0700 Subject: [PATCH 111/128] changed holidays --- neuralprophet/configure.py | 4 +- neuralprophet/event_utils.py | 36 +++-------- neuralprophet/hdays_utils.py | 115 ---------------------------------- neuralprophet/time_dataset.py | 4 +- tests/test_hdays_utils.py | 13 ++-- 5 files changed, 19 insertions(+), 153 deletions(-) delete mode 100644 neuralprophet/hdays_utils.py diff --git a/neuralprophet/configure.py b/neuralprophet/configure.py index d4f6df2e0..c70bf93b3 100644 --- a/neuralprophet/configure.py +++ b/neuralprophet/configure.py @@ -15,7 +15,7 @@ from neuralprophet import df_utils, np_types, utils_torch from neuralprophet.custom_loss_metrics import PinballLoss -from neuralprophet.hdays_utils import get_holidays_from_country +from neuralprophet.event_utils import get_holiday_names log = logging.getLogger("NP.config") @@ -509,7 +509,7 @@ class Holidays: holiday_names: set = field(init=False) def init_holidays(self, df=None): - self.holiday_names = get_holidays_from_country(self.country, df) + self.holiday_names = get_holiday_names(self.country, df) ConfigCountryHolidays = Holidays diff --git a/neuralprophet/event_utils.py b/neuralprophet/event_utils.py index 9deaa8f5d..12528e19d 100644 --- a/neuralprophet/event_utils.py +++ b/neuralprophet/event_utils.py @@ -1,32 +1,11 @@ from collections import defaultdict from typing import Iterable, Union +import holidays import numpy as np import pandas as pd from holidays import country_holidays -# def get_country_holidays(country: str, years: Optional[Union[int, Iterable[int]]] = None): -# """ -# Helper function to get holidays for a country. 
- -# Parameters -# ---------- -# country : str -# Country name to retrieve country specific holidays -# years : int, list -# Year or list of years to retrieve holidays for - -# Returns -# ------- -# set -# All possible holiday dates and names of given country - -# """ -# # For compatibility with Turkey as "TU" cases. -# country = "TUR" if country == "TU" else country -# holiday_dict = country_holidays(country=country, years=years, expand=True, observed=False) -# return holiday_dict - def get_holiday_names(country: Union[str, Iterable[str]], df=None): """ @@ -65,8 +44,8 @@ def get_all_holidays(years, country): ---------- year_list : list List of years - country : str, list - List of country names + country : str, list, dict + List of country names and optional subdivisions Returns ------- pd.DataFrame @@ -74,15 +53,18 @@ def get_all_holidays(years, country): """ # convert to list if not already if isinstance(country, str): - country = [country] + country = {country: None} + elif isinstance(country, list): + country = dict(zip(country, [None] * len(country))) + all_holidays = defaultdict(list) # iterate over countries and get holidays for each country - for single_country in country: + for single_country, subdivision in country.items(): # For compatibility with Turkey as "TU" cases. single_country = "TUR" if single_country == "TU" else single_country # get dict of dates and their holiday name single_country_specific_holidays = country_holidays( - country=single_country, years=years, expand=True, observed=False + country=single_country, subdiv=subdivision, years=years, expand=True, observed=False ) # invert order - for given holiday, store list of dates for date, name in single_country_specific_holidays.items(): diff --git a/neuralprophet/hdays_utils.py b/neuralprophet/hdays_utils.py deleted file mode 100644 index 46dc61570..000000000 --- a/neuralprophet/hdays_utils.py +++ /dev/null @@ -1,115 +0,0 @@ -from collections import defaultdict -from typing import Iterable, Optional, Union - -import holidays -import numpy as np -import pandas as pd - - -def get_country_holidays( - country: str, years: Optional[Union[int, Iterable[int]]] = None, subdivision: Optional[str] = None -): - """ - Helper function to get holidays for a country. - - Parameters - ---------- - country : str - Country name to retrieve country specific holidays - years : int, list - Year or list of years to retrieve holidays for - subdivision : str - Subdivision name to retrieve subdivision specific holidays - - Returns - ------- - set - All possible holiday dates and names of given country - - """ - substitutions = { - "TU": "TR", # For compatibility with Turkey as "TU" cases. 
- } - - country = substitutions.get(country, country) - if not hasattr(holidays, country): - raise AttributeError(f"Holidays in {country} are not currently supported!") - if subdivision: - holiday_obj = getattr(holidays, country)(years=years, subdiv=subdivision) - else: - holiday_obj = getattr(holidays, country)(years=years) - - return holiday_obj - - -def get_holidays_from_country(country: Union[str, Iterable[str], dict], df=None): - """ - Return all possible holiday names of given countries - - Parameters - ---------- - country : str, list - List of country names to retrieve country specific holidays - subdivision : str, dict - a single subdivision (e.g., province or state) as a string or - a dictionary where the key is the country name and the value is a subdivision - df : pd.Dataframe - Dataframe from which datestamps will be retrieved from - - Returns - ------- - set - All possible holiday names of given country - """ - if df is None: - years = np.arange(1995, 2045) - else: - dates = df["ds"].copy(deep=True) - years = list({x.year for x in dates}) - # support multiple countries - if isinstance(country, str): - country = {country: None} - elif isinstance(country, list): - country = dict(zip(country, [None] * len(country))) - - unique_holidays = {} - for single_country, subdivision in country.items(): - holidays_country = get_country_holidays(single_country, years, subdivision) - for date, name in holidays_country.items(): - if date not in unique_holidays: - unique_holidays[date] = name - holiday_names = unique_holidays.values() - - return set(holiday_names) - - -def make_country_specific_holidays(year_list, country): - """ - Create dict of holiday names and dates for given years and countries - Parameters - ---------- - year_list : list - List of years - country : str, list, dict - List of country names and optional subdivisions - Returns - ------- - dict - holiday names as keys and dates as values - """ - # iterate over countries and get holidays for each country - - if isinstance(country, str): - country = {country: None} - elif isinstance(country, list): - country = dict(zip(country, [None] * len(country))) - - country_specific_holidays = {} - for single_country, subdivision in country.items(): - single_country_specific_holidays = get_country_holidays(single_country, year_list, subdivision) - # only add holiday if it is not already in the dict - country_specific_holidays.update(single_country_specific_holidays) - holidays_dates = defaultdict(list) - for date, holiday in country_specific_holidays.items(): - holidays_dates[holiday].append(pd.to_datetime(date)) - return holidays_dates diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 4876c579a..5f725e370 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -58,7 +58,7 @@ def __init__( self.df = self.df.drop("index", axis=1) df_names = list(np.unique(df.loc[:, "ID"].values)) assert len(df_names) == 1 - assert type(df_names[0]) is str + assert isinstance(df_names[0], str) self.df_name = df_names[0] self.meta = OrderedDict({}) @@ -749,7 +749,7 @@ def create_prediction_frequency_filter_mask(df: pd.DataFrame, prediction_frequen if prediction_frequency is None: return mask else: - assert type(prediction_frequency) is dict + assert isinstance(prediction_frequency, dict) timestamps = pd.to_datetime(df.loc[:, "ds"]) filter_masks = [] diff --git a/tests/test_hdays_utils.py b/tests/test_hdays_utils.py index 114e84b33..eee35e8fd 100644 --- a/tests/test_hdays_utils.py +++ 
b/tests/test_hdays_utils.py @@ -2,29 +2,28 @@ import holidays import pytest - -from neuralprophet import hdays_utils +from holidays import country_holidays def test_get_country_holidays(): - assert issubclass(hdays_utils.get_country_holidays("TU").__class__, holidays.countries.turkey.TR) is True + assert issubclass(country_holidays("TU").__class__, holidays.countries.turkey.TR) is True for country in ("UnitedStates", "US", "USA"): - us_holidays = hdays_utils.get_country_holidays(country, years=2019) + us_holidays = country_holidays(country=country, years=2019) assert issubclass(us_holidays.__class__, holidays.countries.united_states.UnitedStates) is True assert len(us_holidays) == 10 with pytest.raises(AttributeError): - hdays_utils.get_country_holidays("NotSupportedCountry") + country_holidays("NotSupportedCountry") def test_get_country_holidays_with_subdivisions(): # Test US holidays with a subdivision - us_ca_holidays = hdays_utils.get_country_holidays("US", years=2019, subdivision="CA") + us_ca_holidays = country_holidays("US", years=2019, subdiv="CA") assert issubclass(us_ca_holidays.__class__, holidays.countries.united_states.UnitedStates) is True assert len(us_ca_holidays) > 0 # Assuming there are holidays specific to CA # Test Canada holidays with a subdivision - ca_on_holidays = hdays_utils.get_country_holidays("CA", years=2019, subdivision="ON") + ca_on_holidays = country_holidays("CA", years=2019, subdiv="ON") assert issubclass(ca_on_holidays.__class__, holidays.countries.canada.CA) is True assert len(ca_on_holidays) > 0 # Assuming there are holidays specific to ON From adcd8de49f2e899addf464f5777493da4f972e9d Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 21 Jun 2024 15:34:20 -0700 Subject: [PATCH 112/128] update lock --- poetry.lock | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index b4b5c8da5..246c996b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]]
 name = "absl-py"
@@ -3110,7 +3110,6 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -4230,4 +4229,4 @@ plotly-resampler = ["plotly-resampler"]

 [metadata]
 lock-version = "2.0"
-python-versions = ">=3.9,<=3.13"
-content-hash = "2918a6a6306adfdc98192da9235ddc0863ed75d38aee3c7fdf045dccd505e9ef"
+python-versions = ">=3.9,<=3.13"
+content-hash = "abda5205d48259c73f4cec09080aabdc804206de65a3a607f4fbc6e9763994d3"

From 241a407ad4bc1456214f2ad467cd5f5e4151c24f Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 16:46:09 -0700
Subject: [PATCH 113/128] changed tests

---
 tests/test_hdays_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_hdays_utils.py b/tests/test_hdays_utils.py
index eee35e8fd..cc1468a33 100644
--- a/tests/test_hdays_utils.py
+++ b/tests/test_hdays_utils.py
@@ -6,7 +6,7 @@


 def test_get_country_holidays():
-    assert issubclass(country_holidays("TU").__class__, holidays.countries.turkey.TR) is True
+    # assert issubclass(country_holidays("TU").__class__, holidays.countries.turkey.TR) is True

     for country in ("UnitedStates", "US", "USA"):
         us_holidays = country_holidays(country=country, years=2019)

From c1abbea3009e40b73b6639f5bb9bc085ff0cca1d Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 16:50:56 -0700
Subject: [PATCH 114/128] adjusted tests

---
 tests/test_hdays_utils.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/test_hdays_utils.py b/tests/test_hdays_utils.py
index cc1468a33..df345bfd6 100644
--- a/tests/test_hdays_utils.py
+++ b/tests/test_hdays_utils.py
@@ -6,16 +6,11 @@


 def test_get_country_holidays():
-    # assert issubclass(country_holidays("TU").__class__, holidays.countries.turkey.TR) is True
-
     for country in ("UnitedStates", "US", "USA"):
         us_holidays = country_holidays(country=country, years=2019)
         assert issubclass(us_holidays.__class__, holidays.countries.united_states.UnitedStates) is True
         assert len(us_holidays) == 10

-    with pytest.raises(AttributeError):
-        country_holidays("NotSupportedCountry")
-

 def test_get_country_holidays_with_subdivisions():
     # Test US holidays with a subdivision

From 40ad2987334ce9e00fe0e5722d839d1401bf0b12 Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 17:17:07 -0700
Subject: [PATCH 115/128] fix reserved names

---
 tests/test_hdays_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_hdays_utils.py b/tests/test_hdays_utils.py
index df345bfd6..9b2dd3a13 100644
--- a/tests/test_hdays_utils.py
+++ b/tests/test_hdays_utils.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3

 import holidays
-import pytest
 from holidays import country_holidays


From f7b5eb7b81807f1bea01536c9848e1a13340112e Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 17:20:25 -0700
Subject: [PATCH 116/128] fixed ruff linting

---
 neuralprophet/data/process.py | 5 -----
 neuralprophet/event_utils.py  | 1 -
 2 files changed, 6 deletions(-)

diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py
index e645a47b5..2958dde49 100644
--- a/neuralprophet/data/process.py
+++ b/neuralprophet/data/process.py
@@ -333,17 +333,12 @@ def _validate_column_name(
     """
     reserved_names = [
         "trend",
-        "additive_terms",
         "daily",
         "weekly",
         "yearly",
         "events",
         "holidays",
-        "zeros",
-        "extra_regressors_additive",
         "yhat",
-        "extra_regressors_multiplicative",
-        "multiplicative_terms",
         "ID",
         "y_scaled",
         "ds",

diff --git a/neuralprophet/event_utils.py b/neuralprophet/event_utils.py
index 12528e19d..ebff84bc9 100644
--- a/neuralprophet/event_utils.py
+++ b/neuralprophet/event_utils.py
@@ -1,7 +1,6 @@
 from collections import defaultdict
 from typing import Iterable, Union

-import holidays
 import numpy as np
 import pandas as pd
 from holidays import country_holidays

From a4362312feca763a199b5299eef52e465f8b9808 Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 17:50:43 -0700
Subject: [PATCH 117/128] changed test

---
 tests/test_unit.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_unit.py b/tests/test_unit.py
index 05996f8b5..ef06e3d51 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -1008,7 +1008,6 @@ def test_multiple_countries():
     holiday_names = m.model.config_holidays.holiday_names
     assert "Independence Day" in holiday_names
     assert "Christmas Day" in holiday_names
-    assert "Erster Weihnachtstag" not in holiday_names
     assert "Neujahr" not in holiday_names

From 60260bd99eb22c7fddd3b4f68223aabc551eae30 Mon Sep 17 00:00:00 2001
From: MaiBe-ctrl
Date: Fri, 21 Jun 2024 18:44:45 -0700
Subject: [PATCH 118/128] translate holidays to english if possible

---
 neuralprophet/event_utils.py | 2 +-
 poetry.lock                  | 4 +++-
 tests/test_unit.py           | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/neuralprophet/event_utils.py b/neuralprophet/event_utils.py
index ebff84bc9..870dbb4fa 100644
--- a/neuralprophet/event_utils.py
+++ b/neuralprophet/event_utils.py
@@ -63,7 +63,7 @@ def get_all_holidays(years, country):
         single_country = "TUR" if single_country == "TU" else single_country
         # get dict of dates and their holiday name
         single_country_specific_holidays = country_holidays(
-            country=single_country, subdiv=subdivision, years=years, expand=True, observed=False
+            country=single_country, subdiv=subdivision, years=years, expand=True, observed=False, language="en"
         )
         # invert order - for given holiday, store list of dates
         for date, name in single_country_specific_holidays.items():

diff --git a/poetry.lock b/poetry.lock
index 246c996b6..ca3232f22 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]] name = "absl-py" @@ -2417,6 +2417,7 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ + {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_aarch64.whl", hash = "sha256:004186d5ea6a57758fd6d57052a123c73a4815adf365eb8dd6a85c9eaa7535ff"}, {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, ] @@ -3110,6 +3111,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, diff --git a/tests/test_unit.py b/tests/test_unit.py index ef06e3d51..05996f8b5 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -1008,6 +1008,7 @@ def test_multiple_countries(): holiday_names = m.model.config_holidays.holiday_names assert "Independence Day" in holiday_names assert "Christmas Day" in holiday_names + assert "Erster Weihnachtstag" not in holiday_names assert "Neujahr" not in holiday_names From c54d4b763dfa95816ea85affca9230e0366e3ce8 Mon Sep 17 00:00:00 2001 From: Oskar Triebe Date: Fri, 21 Jun 2024 23:26:53 -0700 Subject: [PATCH 119/128] exclude py3.13 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e7afe97b6..4c77b97e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ Homepage = "https://github.com/ourownstory/neural_prophet" [tool.poetry.dependencies] -python = ">=3.9,<=3.13" +python = ">=3.9,<3.13" numpy = ">=1.25.0,<2.0.0" pandas = ">=2.0.0" torch = ">=2.0.0" From 0508454d14263dcd268ee3961756c0648eab6b18 Mon Sep 17 00:00:00 2001 From: ourownstory Date: Fri, 21 Jun 2024 23:37:21 -0700 Subject: [PATCH 120/128] update lock --- poetry.lock | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index ca3232f22..ce9d0bdcb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "absl-py" @@ -3111,7 +3111,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -4230,5 +4229,5 @@ plotly-resampler = ["plotly-resampler"] [metadata] lock-version = "2.0" -python-versions = ">=3.9,<=3.13" -content-hash = "abda5205d48259c73f4cec09080aabdc804206de65a3a607f4fbc6e9763994d3" +python-versions = ">=3.9,<3.13" +content-hash = "d08c423b7a0c27143741287c01f7b597d7af8f45c4c4108194af7be93f442e54" From cde3f457113f9fa753893465d8a9826d087d03e2 Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Tue, 25 Jun 2024 15:36:26 -0700 Subject: [PATCH 121/128] Merge all holidays related tests in one file --- tests/test_event_utils.py | 136 +++++++++++++++++++++++++++++++++++++- tests/test_hdays_utils.py | 23 ------- tests/test_integration.py | 62 ----------------- tests/test_unit.py | 34 ---------- 4 files changed, 135 insertions(+), 120 deletions(-) delete mode 100644 tests/test_hdays_utils.py diff --git a/tests/test_event_utils.py b/tests/test_event_utils.py index 8c26a2e49..0d0c75b96 100644 --- a/tests/test_event_utils.py +++ b/tests/test_event_utils.py @@ -1,8 +1,34 @@ #!/usr/bin/env python3 +import logging +import os +import pathlib + +import holidays +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd import pytest +from holidays import country_holidays + +from neuralprophet import NeuralProphet, event_utils + +log = logging.getLogger("NP.test") +log.setLevel("ERROR") +log.parent.setLevel("ERROR") + -from neuralprophet import event_utils +DIR = pathlib.Path(__file__).parent.parent.absolute() +DATA_DIR = os.path.join(DIR, "tests", "test-data") +PEYTON_FILE = os.path.join(DATA_DIR, "wp_log_peyton_manning.csv") +AIR_FILE = os.path.join(DATA_DIR, "air_passengers.csv") +YOS_FILE = os.path.join(DATA_DIR, "yosemite_temps.csv") +NROWS = 256 +EPOCHS = 1 +BATCH_SIZE = 128 +LR = 1.0 + +PLOT = False def test_get_country_holidays(): @@ -17,3 +43,111 @@ def test_get_country_holidays(): with pytest.raises(NotImplementedError): event_utils.get_holiday_names("NotSupportedCountry") + + +def test_get_country_holidays_with_subdivisions(): + # Test US holidays with a subdivision + us_ca_holidays = country_holidays("US", years=2019, subdiv="CA") + assert issubclass(us_ca_holidays.__class__, holidays.countries.united_states.UnitedStates) is True + assert len(us_ca_holidays) > 0 # Assuming there are holidays specific to CA + + # Test Canada holidays with a subdivision + ca_on_holidays = country_holidays("CA", years=2019, subdiv="ON") + assert issubclass(ca_on_holidays.__class__, holidays.countries.canada.CA) 
is True + assert len(ca_on_holidays) > 0 # Assuming there are holidays specific to ON + + +def test_add_country_holiday_multiple_calls_warning(caplog): + m = NeuralProphet( + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LR, + ) + m.add_country_holidays(["US", "Germany"]) + error_message = "Country holidays can only be added once." + assert error_message not in caplog.text + + with pytest.raises(AssertionError): + m.add_country_holidays("Germany") + # assert error_message in caplog.text + + +def test_multiple_countries(): + # test if multiple countries are added + df = pd.read_csv(PEYTON_FILE, nrows=NROWS) + m = NeuralProphet( + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LR, + ) + m.add_country_holidays(country_name=["US", "Germany"]) + m.fit(df, freq="D") + m.predict(df) + # get the name of holidays and compare that no holiday is repeated + holiday_names = m.model.config_holidays.holiday_names + assert "Independence Day" in holiday_names + assert "Christmas Day" in holiday_names + assert "Erster Weihnachtstag" not in holiday_names + assert "Neujahr" not in holiday_names + + +def test_events(): + log.info("testing: Events") + df = pd.read_csv(PEYTON_FILE)[-NROWS:] + playoffs = pd.DataFrame( + { + "event": "playoff", + "ds": pd.to_datetime( + [ + "2008-01-13", + "2009-01-03", + "2010-01-16", + "2010-01-24", + "2010-02-07", + "2011-01-08", + "2013-01-12", + "2014-01-12", + "2014-01-19", + "2014-02-02", + "2015-01-11", + "2016-01-17", + "2016-01-24", + "2016-02-07", + ] + ), + } + ) + superbowls = pd.DataFrame( + { + "event": "superbowl", + "ds": pd.to_datetime(["2010-02-07", "2014-02-02", "2016-02-07"]), + } + ) + events_df = pd.concat((playoffs, superbowls)) + m = NeuralProphet( + n_lags=2, + n_forecasts=30, + daily_seasonality=False, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LR, + ) + # set event windows + m = m.add_events( + ["superbowl", "playoff"], lower_window=-1, upper_window=1, mode="multiplicative", regularization=0.5 + ) + # add the country specific holidays + m = m.add_country_holidays( + ["US", "Indonesia", "Philippines", "Pakistan", "Belarus"], mode="additive", regularization=0.5 + ) + # m.add_country_holidays("Thailand") # holidays package has issue with int input for timedelta. 
accepts np.float64() + history_df = m.create_df_with_events(df, events_df) + m.fit(history_df, freq="D") + future = m.make_future_dataframe(df=history_df, events_df=events_df, periods=30, n_historic_predictions=90) + forecast = m.predict(df=future) + log.debug(f"Event Parameters:: {m.model.event_params}") + if PLOT: + m.plot_components(forecast) + m.plot(forecast) + m.plot_parameters() + plt.show() diff --git a/tests/test_hdays_utils.py b/tests/test_hdays_utils.py deleted file mode 100644 index 9b2dd3a13..000000000 --- a/tests/test_hdays_utils.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -import holidays -from holidays import country_holidays - - -def test_get_country_holidays(): - for country in ("UnitedStates", "US", "USA"): - us_holidays = country_holidays(country=country, years=2019) - assert issubclass(us_holidays.__class__, holidays.countries.united_states.UnitedStates) is True - assert len(us_holidays) == 10 - - -def test_get_country_holidays_with_subdivisions(): - # Test US holidays with a subdivision - us_ca_holidays = country_holidays("US", years=2019, subdiv="CA") - assert issubclass(us_ca_holidays.__class__, holidays.countries.united_states.UnitedStates) is True - assert len(us_ca_holidays) > 0 # Assuming there are holidays specific to CA - - # Test Canada holidays with a subdivision - ca_on_holidays = country_holidays("CA", years=2019, subdiv="ON") - assert issubclass(ca_on_holidays.__class__, holidays.countries.canada.CA) is True - assert len(ca_on_holidays) > 0 # Assuming there are holidays specific to ON diff --git a/tests/test_integration.py b/tests/test_integration.py index 002b4298c..8ef45b10a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -427,68 +427,6 @@ def test_lag_reg_deep(): plt.show() -def test_events(): - log.info("testing: Events") - df = pd.read_csv(PEYTON_FILE)[-NROWS:] - playoffs = pd.DataFrame( - { - "event": "playoff", - "ds": pd.to_datetime( - [ - "2008-01-13", - "2009-01-03", - "2010-01-16", - "2010-01-24", - "2010-02-07", - "2011-01-08", - "2013-01-12", - "2014-01-12", - "2014-01-19", - "2014-02-02", - "2015-01-11", - "2016-01-17", - "2016-01-24", - "2016-02-07", - ] - ), - } - ) - superbowls = pd.DataFrame( - { - "event": "superbowl", - "ds": pd.to_datetime(["2010-02-07", "2014-02-02", "2016-02-07"]), - } - ) - events_df = pd.concat((playoffs, superbowls)) - m = NeuralProphet( - n_lags=2, - n_forecasts=30, - daily_seasonality=False, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - ) - # set event windows - m = m.add_events( - ["superbowl", "playoff"], lower_window=-1, upper_window=1, mode="multiplicative", regularization=0.5 - ) - # add the country specific holidays - m = m.add_country_holidays( - ["US", "Indonesia", "Philippines", "Pakistan", "Belarus"], mode="additive", regularization=0.5 - ) - # m.add_country_holidays("Thailand") # holidays package has issue with int input for timedelta. 
accepts np.float64() - history_df = m.create_df_with_events(df, events_df) - m.fit(history_df, freq="D") - future = m.make_future_dataframe(df=history_df, events_df=events_df, periods=30, n_historic_predictions=90) - forecast = m.predict(df=future) - log.debug(f"Event Parameters:: {m.model.event_params}") - if PLOT: - m.plot_components(forecast) - m.plot(forecast) - m.plot_parameters() - plt.show() - - def test_future_reg(): log.info("testing: Future Regressors") df = pd.read_csv(PEYTON_FILE, nrows=NROWS + 50) diff --git a/tests/test_unit.py b/tests/test_unit.py index 05996f8b5..2032ffecb 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -978,40 +978,6 @@ def test_handle_negative_values_replace(): assert df_.loc[0, "y"] == 0.0 -def test_add_country_holiday_multiple_calls_warning(caplog): - m = NeuralProphet( - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - ) - m.add_country_holidays(["US", "Germany"]) - error_message = "Country holidays can only be added once." - assert error_message not in caplog.text - - with pytest.raises(AssertionError): - m.add_country_holidays("Germany") - # assert error_message in caplog.text - - -def test_multiple_countries(): - # test if multiple countries are added - df = pd.read_csv(PEYTON_FILE, nrows=NROWS) - m = NeuralProphet( - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - ) - m.add_country_holidays(country_name=["US", "Germany"]) - m.fit(df, freq="D") - m.predict(df) - # get the name of holidays and compare that no holiday is repeated - holiday_names = m.model.config_holidays.holiday_names - assert "Independence Day" in holiday_names - assert "Christmas Day" in holiday_names - assert "Erster Weihnachtstag" not in holiday_names - assert "Neujahr" not in holiday_names - - def test_float32_inputs(): # test if float32 inputs are forecasted as float32 outputs df = pd.read_csv(PEYTON_FILE, nrows=NROWS) From 9ae4f3c610804c673f7b2c0584d9b61dd921bb7f Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Tue, 25 Jun 2024 18:13:21 -0700 Subject: [PATCH 122/128] add deterministic flag --- neuralprophet/forecaster.py | 3 + neuralprophet/utils.py | 5 + poetry.lock | 216 ++++++++++++++++++-------------- pyproject.toml | 1 + tests/test_model_performance.py | 6 +- 5 files changed, 136 insertions(+), 95 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index d80fcef14..d258a256e 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -435,9 +435,11 @@ def __init__( accelerator: Optional[str] = None, trainer_config: dict = {}, prediction_frequency: Optional[dict] = None, + deterministic=False, ): self.config = locals() self.config.pop("self") + self.deterministic = deterministic # General self.name = "NeuralProphet" @@ -2771,6 +2773,7 @@ def _train( metrics_enabled=metrics_enabled, checkpointing_enabled=checkpointing_enabled, num_batches_per_epoch=len(train_loader), + deterministic=self.deterministic, ) # Tune hyperparams and train diff --git a/neuralprophet/utils.py b/neuralprophet/utils.py index 33f7c51e6..c00c920ce 100644 --- a/neuralprophet/utils.py +++ b/neuralprophet/utils.py @@ -11,6 +11,7 @@ import pandas as pd import pytorch_lightning as pl import torch +from lightning_fabric.utilities.seed import seed_everything from neuralprophet import utils_torch from neuralprophet.logger import ProgressBar @@ -710,6 +711,7 @@ def set_random_seed(seed: int = 0): """ np.random.seed(seed) torch.manual_seed(seed) + seed_everything(seed, workers=True) def set_logger_level(logger, log_level, 
include_handlers=False): @@ -818,6 +820,7 @@ def configure_trainer( metrics_enabled: bool = False, checkpointing_enabled: bool = False, num_batches_per_epoch: int = 100, + deterministic: bool = False, ): """ Configures the PyTorch Lightning trainer. @@ -888,6 +891,8 @@ def configure_trainer( else: config["logger"] = False + config["deterministic"] = deterministic + # Configure callbacks callbacks = [] has_custom_callbacks = True if "callbacks" in config else False diff --git a/poetry.lock b/poetry.lock index 1ff31a91e..cdea493ce 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "absl-py" @@ -305,13 +305,13 @@ files = [ [[package]] name = "bokeh" -version = "3.4.1" +version = "3.4.2" description = "Interactive plots and applications in the browser from Python" optional = true python-versions = ">=3.9" files = [ - {file = "bokeh-3.4.1-py3-none-any.whl", hash = "sha256:1e3c502a0a8205338fc74dadbfa321f8a0965441b39501e36796a47b4017b642"}, - {file = "bokeh-3.4.1.tar.gz", hash = "sha256:d824961e4265367b0750ce58b07e564ad0b83ca64b335521cd3421e9b9f10d89"}, + {file = "bokeh-3.4.2-py3-none-any.whl", hash = "sha256:931a43ee59dbf1720383ab904f8205e126b85561aac55592415b800c96f1b0eb"}, + {file = "bokeh-3.4.2.tar.gz", hash = "sha256:a16d5cc0abb93d2d270d70fc35851f3e1b9208814a985a4678e0ba5ef2d9cd42"}, ] [package.dependencies] @@ -629,63 +629,63 @@ test-no-images = ["pytest", "pytest-cov", "pytest-xdist", "wurlitzer"] [[package]] name = "coverage" -version = "7.5.3" +version = "7.5.4" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a6519d917abb15e12380406d721e37613e2a67d166f9fb7e5a8ce0375744cd45"}, - {file = "coverage-7.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aea7da970f1feccf48be7335f8b2ca64baf9b589d79e05b9397a06696ce1a1ec"}, - {file = "coverage-7.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:923b7b1c717bd0f0f92d862d1ff51d9b2b55dbbd133e05680204465f454bb286"}, - {file = "coverage-7.5.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62bda40da1e68898186f274f832ef3e759ce929da9a9fd9fcf265956de269dbc"}, - {file = "coverage-7.5.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8b7339180d00de83e930358223c617cc343dd08e1aa5ec7b06c3a121aec4e1d"}, - {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:25a5caf742c6195e08002d3b6c2dd6947e50efc5fc2c2205f61ecb47592d2d83"}, - {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:05ac5f60faa0c704c0f7e6a5cbfd6f02101ed05e0aee4d2822637a9e672c998d"}, - {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:239a4e75e09c2b12ea478d28815acf83334d32e722e7433471fbf641c606344c"}, - {file = "coverage-7.5.3-cp310-cp310-win32.whl", hash = "sha256:a5812840d1d00eafae6585aba38021f90a705a25b8216ec7f66aebe5b619fb84"}, - {file = "coverage-7.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:33ca90a0eb29225f195e30684ba4a6db05dbef03c2ccd50b9077714c48153cac"}, - {file = "coverage-7.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f81bc26d609bf0fbc622c7122ba6307993c83c795d2d6f6f6fd8c000a770d974"}, - {file = 
"coverage-7.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7cec2af81f9e7569280822be68bd57e51b86d42e59ea30d10ebdbb22d2cb7232"}, - {file = "coverage-7.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55f689f846661e3f26efa535071775d0483388a1ccfab899df72924805e9e7cd"}, - {file = "coverage-7.5.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50084d3516aa263791198913a17354bd1dc627d3c1639209640b9cac3fef5807"}, - {file = "coverage-7.5.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:341dd8f61c26337c37988345ca5c8ccabeff33093a26953a1ac72e7d0103c4fb"}, - {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ab0b028165eea880af12f66086694768f2c3139b2c31ad5e032c8edbafca6ffc"}, - {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5bc5a8c87714b0c67cfeb4c7caa82b2d71e8864d1a46aa990b5588fa953673b8"}, - {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38a3b98dae8a7c9057bd91fbf3415c05e700a5114c5f1b5b0ea5f8f429ba6614"}, - {file = "coverage-7.5.3-cp311-cp311-win32.whl", hash = "sha256:fcf7d1d6f5da887ca04302db8e0e0cf56ce9a5e05f202720e49b3e8157ddb9a9"}, - {file = "coverage-7.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:8c836309931839cca658a78a888dab9676b5c988d0dd34ca247f5f3e679f4e7a"}, - {file = "coverage-7.5.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:296a7d9bbc598e8744c00f7a6cecf1da9b30ae9ad51c566291ff1314e6cbbed8"}, - {file = "coverage-7.5.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:34d6d21d8795a97b14d503dcaf74226ae51eb1f2bd41015d3ef332a24d0a17b3"}, - {file = "coverage-7.5.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e317953bb4c074c06c798a11dbdd2cf9979dbcaa8ccc0fa4701d80042d4ebf1"}, - {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705f3d7c2b098c40f5b81790a5fedb274113373d4d1a69e65f8b68b0cc26f6db"}, - {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1196e13c45e327d6cd0b6e471530a1882f1017eb83c6229fc613cd1a11b53cd"}, - {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:015eddc5ccd5364dcb902eaecf9515636806fa1e0d5bef5769d06d0f31b54523"}, - {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fd27d8b49e574e50caa65196d908f80e4dff64d7e592d0c59788b45aad7e8b35"}, - {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:33fc65740267222fc02975c061eb7167185fef4cc8f2770267ee8bf7d6a42f84"}, - {file = "coverage-7.5.3-cp312-cp312-win32.whl", hash = "sha256:7b2a19e13dfb5c8e145c7a6ea959485ee8e2204699903c88c7d25283584bfc08"}, - {file = "coverage-7.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:0bbddc54bbacfc09b3edaec644d4ac90c08ee8ed4844b0f86227dcda2d428fcb"}, - {file = "coverage-7.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f78300789a708ac1f17e134593f577407d52d0417305435b134805c4fb135adb"}, - {file = "coverage-7.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b368e1aee1b9b75757942d44d7598dcd22a9dbb126affcbba82d15917f0cc155"}, - {file = "coverage-7.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f836c174c3a7f639bded48ec913f348c4761cbf49de4a20a956d3431a7c9cb24"}, - {file = 
"coverage-7.5.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:244f509f126dc71369393ce5fea17c0592c40ee44e607b6d855e9c4ac57aac98"}, - {file = "coverage-7.5.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4c2872b3c91f9baa836147ca33650dc5c172e9273c808c3c3199c75490e709d"}, - {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dd4b3355b01273a56b20c219e74e7549e14370b31a4ffe42706a8cda91f19f6d"}, - {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f542287b1489c7a860d43a7d8883e27ca62ab84ca53c965d11dac1d3a1fab7ce"}, - {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:75e3f4e86804023e991096b29e147e635f5e2568f77883a1e6eed74512659ab0"}, - {file = "coverage-7.5.3-cp38-cp38-win32.whl", hash = "sha256:c59d2ad092dc0551d9f79d9d44d005c945ba95832a6798f98f9216ede3d5f485"}, - {file = "coverage-7.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:fa21a04112c59ad54f69d80e376f7f9d0f5f9123ab87ecd18fbb9ec3a2beed56"}, - {file = "coverage-7.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f5102a92855d518b0996eb197772f5ac2a527c0ec617124ad5242a3af5e25f85"}, - {file = "coverage-7.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d1da0a2e3b37b745a2b2a678a4c796462cf753aebf94edcc87dcc6b8641eae31"}, - {file = "coverage-7.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8383a6c8cefba1b7cecc0149415046b6fc38836295bc4c84e820872eb5478b3d"}, - {file = "coverage-7.5.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9aad68c3f2566dfae84bf46295a79e79d904e1c21ccfc66de88cd446f8686341"}, - {file = "coverage-7.5.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e079c9ec772fedbade9d7ebc36202a1d9ef7291bc9b3a024ca395c4d52853d7"}, - {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bde997cac85fcac227b27d4fb2c7608a2c5f6558469b0eb704c5726ae49e1c52"}, - {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:990fb20b32990b2ce2c5f974c3e738c9358b2735bc05075d50a6f36721b8f303"}, - {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3d5a67f0da401e105753d474369ab034c7bae51a4c31c77d94030d59e41df5bd"}, - {file = "coverage-7.5.3-cp39-cp39-win32.whl", hash = "sha256:e08c470c2eb01977d221fd87495b44867a56d4d594f43739a8028f8646a51e0d"}, - {file = "coverage-7.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:1d2a830ade66d3563bb61d1e3c77c8def97b30ed91e166c67d0632c018f380f0"}, - {file = "coverage-7.5.3-pp38.pp39.pp310-none-any.whl", hash = "sha256:3538d8fb1ee9bdd2e2692b3b18c22bb1c19ffbefd06880f5ac496e42d7bb3884"}, - {file = "coverage-7.5.3.tar.gz", hash = "sha256:04aefca5190d1dc7a53a4c1a5a7f8568811306d7a8ee231c42fb69215571944f"}, + {file = "coverage-7.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6cfb5a4f556bb51aba274588200a46e4dd6b505fb1a5f8c5ae408222eb416f99"}, + {file = "coverage-7.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2174e7c23e0a454ffe12267a10732c273243b4f2d50d07544a91198f05c48f47"}, + {file = "coverage-7.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2214ee920787d85db1b6a0bd9da5f8503ccc8fcd5814d90796c2f2493a2f4d2e"}, + {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1137f46adb28e3813dec8c01fefadcb8c614f33576f672962e323b5128d9a68d"}, + {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b385d49609f8e9efc885790a5a0e89f2e3ae042cdf12958b6034cc442de428d3"}, + {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b4a474f799456e0eb46d78ab07303286a84a3140e9700b9e154cfebc8f527016"}, + {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5cd64adedf3be66f8ccee418473c2916492d53cbafbfcff851cbec5a8454b136"}, + {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e564c2cf45d2f44a9da56f4e3a26b2236504a496eb4cb0ca7221cd4cc7a9aca9"}, + {file = "coverage-7.5.4-cp310-cp310-win32.whl", hash = "sha256:7076b4b3a5f6d2b5d7f1185fde25b1e54eb66e647a1dfef0e2c2bfaf9b4c88c8"}, + {file = "coverage-7.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:018a12985185038a5b2bcafab04ab833a9a0f2c59995b3cec07e10074c78635f"}, + {file = "coverage-7.5.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:db14f552ac38f10758ad14dd7b983dbab424e731588d300c7db25b6f89e335b5"}, + {file = "coverage-7.5.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3257fdd8e574805f27bb5342b77bc65578e98cbc004a92232106344053f319ba"}, + {file = "coverage-7.5.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a6612c99081d8d6134005b1354191e103ec9705d7ba2754e848211ac8cacc6b"}, + {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d45d3cbd94159c468b9b8c5a556e3f6b81a8d1af2a92b77320e887c3e7a5d080"}, + {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed550e7442f278af76d9d65af48069f1fb84c9f745ae249c1a183c1e9d1b025c"}, + {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7a892be37ca35eb5019ec85402c3371b0f7cda5ab5056023a7f13da0961e60da"}, + {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8192794d120167e2a64721d88dbd688584675e86e15d0569599257566dec9bf0"}, + {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:820bc841faa502e727a48311948e0461132a9c8baa42f6b2b84a29ced24cc078"}, + {file = "coverage-7.5.4-cp311-cp311-win32.whl", hash = "sha256:6aae5cce399a0f065da65c7bb1e8abd5c7a3043da9dceb429ebe1b289bc07806"}, + {file = "coverage-7.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:d2e344d6adc8ef81c5a233d3a57b3c7d5181f40e79e05e1c143da143ccb6377d"}, + {file = "coverage-7.5.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:54317c2b806354cbb2dc7ac27e2b93f97096912cc16b18289c5d4e44fc663233"}, + {file = "coverage-7.5.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:042183de01f8b6d531e10c197f7f0315a61e8d805ab29c5f7b51a01d62782747"}, + {file = "coverage-7.5.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6bb74ed465d5fb204b2ec41d79bcd28afccf817de721e8a807d5141c3426638"}, + {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3d45ff86efb129c599a3b287ae2e44c1e281ae0f9a9bad0edc202179bcc3a2e"}, + {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5013ed890dc917cef2c9f765c4c6a8ae9df983cd60dbb635df8ed9f4ebc9f555"}, + {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:1014fbf665fef86cdfd6cb5b7371496ce35e4d2a00cda501cf9f5b9e6fced69f"}, + {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3684bc2ff328f935981847082ba4fdc950d58906a40eafa93510d1b54c08a66c"}, + {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:581ea96f92bf71a5ec0974001f900db495488434a6928a2ca7f01eee20c23805"}, + {file = "coverage-7.5.4-cp312-cp312-win32.whl", hash = "sha256:73ca8fbc5bc622e54627314c1a6f1dfdd8db69788f3443e752c215f29fa87a0b"}, + {file = "coverage-7.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:cef4649ec906ea7ea5e9e796e68b987f83fa9a718514fe147f538cfeda76d7a7"}, + {file = "coverage-7.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdd31315fc20868c194130de9ee6bfd99755cc9565edff98ecc12585b90be882"}, + {file = "coverage-7.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:02ff6e898197cc1e9fa375581382b72498eb2e6d5fc0b53f03e496cfee3fac6d"}, + {file = "coverage-7.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d05c16cf4b4c2fc880cb12ba4c9b526e9e5d5bb1d81313d4d732a5b9fe2b9d53"}, + {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5986ee7ea0795a4095ac4d113cbb3448601efca7f158ec7f7087a6c705304e4"}, + {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df54843b88901fdc2f598ac06737f03d71168fd1175728054c8f5a2739ac3e4"}, + {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ab73b35e8d109bffbda9a3e91c64e29fe26e03e49addf5b43d85fc426dde11f9"}, + {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:aea072a941b033813f5e4814541fc265a5c12ed9720daef11ca516aeacd3bd7f"}, + {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:16852febd96acd953b0d55fc842ce2dac1710f26729b31c80b940b9afcd9896f"}, + {file = "coverage-7.5.4-cp38-cp38-win32.whl", hash = "sha256:8f894208794b164e6bd4bba61fc98bf6b06be4d390cf2daacfa6eca0a6d2bb4f"}, + {file = "coverage-7.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:e2afe743289273209c992075a5a4913e8d007d569a406ffed0bd080ea02b0633"}, + {file = "coverage-7.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b95c3a8cb0463ba9f77383d0fa8c9194cf91f64445a63fc26fb2327e1e1eb088"}, + {file = "coverage-7.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7564cc09dd91b5a6001754a5b3c6ecc4aba6323baf33a12bd751036c998be4"}, + {file = "coverage-7.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44da56a2589b684813f86d07597fdf8a9c6ce77f58976727329272f5a01f99f7"}, + {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e16f3d6b491c48c5ae726308e6ab1e18ee830b4cdd6913f2d7f77354b33f91c8"}, + {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbc5958cb471e5a5af41b0ddaea96a37e74ed289535e8deca404811f6cb0bc3d"}, + {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a04e990a2a41740b02d6182b498ee9796cf60eefe40cf859b016650147908029"}, + {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ddbd2f9713a79e8e7242d7c51f1929611e991d855f414ca9996c20e44a895f7c"}, + {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b1ccf5e728ccf83acd313c89f07c22d70d6c375a9c6f339233dcf792094bcbf7"}, + {file = "coverage-7.5.4-cp39-cp39-win32.whl", 
hash = "sha256:56b4eafa21c6c175b3ede004ca12c653a88b6f922494b023aeb1e836df953ace"}, + {file = "coverage-7.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:65e528e2e921ba8fd67d9055e6b9f9e34b21ebd6768ae1c1723f4ea6ace1234d"}, + {file = "coverage-7.5.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:79b356f3dd5b26f3ad23b35c75dbdaf1f9e2450b6bcefc6d0825ea0aa3f86ca5"}, + {file = "coverage-7.5.4.tar.gz", hash = "sha256:a44963520b069e12789d0faea4e9fdb1e410cdc4aab89d94f7f55cbb7fef0353"}, ] [package.dependencies] @@ -777,33 +777,33 @@ files = [ [[package]] name = "debugpy" -version = "1.8.1" +version = "1.8.2" description = "An implementation of the Debug Adapter Protocol for Python" optional = false python-versions = ">=3.8" files = [ - {file = "debugpy-1.8.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:3bda0f1e943d386cc7a0e71bfa59f4137909e2ed947fb3946c506e113000f741"}, - {file = "debugpy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dda73bf69ea479c8577a0448f8c707691152e6c4de7f0c4dec5a4bc11dee516e"}, - {file = "debugpy-1.8.1-cp310-cp310-win32.whl", hash = "sha256:3a79c6f62adef994b2dbe9fc2cc9cc3864a23575b6e387339ab739873bea53d0"}, - {file = "debugpy-1.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:7eb7bd2b56ea3bedb009616d9e2f64aab8fc7000d481faec3cd26c98a964bcdd"}, - {file = "debugpy-1.8.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:016a9fcfc2c6b57f939673c874310d8581d51a0fe0858e7fac4e240c5eb743cb"}, - {file = "debugpy-1.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd97ed11a4c7f6d042d320ce03d83b20c3fb40da892f994bc041bbc415d7a099"}, - {file = "debugpy-1.8.1-cp311-cp311-win32.whl", hash = "sha256:0de56aba8249c28a300bdb0672a9b94785074eb82eb672db66c8144fff673146"}, - {file = "debugpy-1.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:1a9fe0829c2b854757b4fd0a338d93bc17249a3bf69ecf765c61d4c522bb92a8"}, - {file = "debugpy-1.8.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3ebb70ba1a6524d19fa7bb122f44b74170c447d5746a503e36adc244a20ac539"}, - {file = "debugpy-1.8.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2e658a9630f27534e63922ebf655a6ab60c370f4d2fc5c02a5b19baf4410ace"}, - {file = "debugpy-1.8.1-cp312-cp312-win32.whl", hash = "sha256:caad2846e21188797a1f17fc09c31b84c7c3c23baf2516fed5b40b378515bbf0"}, - {file = "debugpy-1.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:edcc9f58ec0fd121a25bc950d4578df47428d72e1a0d66c07403b04eb93bcf98"}, - {file = "debugpy-1.8.1-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:7a3afa222f6fd3d9dfecd52729bc2e12c93e22a7491405a0ecbf9e1d32d45b39"}, - {file = "debugpy-1.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d915a18f0597ef685e88bb35e5d7ab968964b7befefe1aaea1eb5b2640b586c7"}, - {file = "debugpy-1.8.1-cp38-cp38-win32.whl", hash = "sha256:92116039b5500633cc8d44ecc187abe2dfa9b90f7a82bbf81d079fcdd506bae9"}, - {file = "debugpy-1.8.1-cp38-cp38-win_amd64.whl", hash = "sha256:e38beb7992b5afd9d5244e96ad5fa9135e94993b0c551ceebf3fe1a5d9beb234"}, - {file = "debugpy-1.8.1-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:bfb20cb57486c8e4793d41996652e5a6a885b4d9175dd369045dad59eaacea42"}, - {file = "debugpy-1.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efd3fdd3f67a7e576dd869c184c5dd71d9aaa36ded271939da352880c012e703"}, - {file = "debugpy-1.8.1-cp39-cp39-win32.whl", hash = "sha256:58911e8521ca0c785ac7a0539f1e77e0ce2df753f786188f382229278b4cdf23"}, - {file = 
"debugpy-1.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:6df9aa9599eb05ca179fb0b810282255202a66835c6efb1d112d21ecb830ddd3"}, - {file = "debugpy-1.8.1-py2.py3-none-any.whl", hash = "sha256:28acbe2241222b87e255260c76741e1fbf04fdc3b6d094fcf57b6c6f75ce1242"}, - {file = "debugpy-1.8.1.zip", hash = "sha256:f696d6be15be87aef621917585f9bb94b1dc9e8aced570db1b8a6fc14e8f9b42"}, + {file = "debugpy-1.8.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:7ee2e1afbf44b138c005e4380097d92532e1001580853a7cb40ed84e0ef1c3d2"}, + {file = "debugpy-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f8c3f7c53130a070f0fc845a0f2cee8ed88d220d6b04595897b66605df1edd6"}, + {file = "debugpy-1.8.2-cp310-cp310-win32.whl", hash = "sha256:f179af1e1bd4c88b0b9f0fa153569b24f6b6f3de33f94703336363ae62f4bf47"}, + {file = "debugpy-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:0600faef1d0b8d0e85c816b8bb0cb90ed94fc611f308d5fde28cb8b3d2ff0fe3"}, + {file = "debugpy-1.8.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:8a13417ccd5978a642e91fb79b871baded925d4fadd4dfafec1928196292aa0a"}, + {file = "debugpy-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acdf39855f65c48ac9667b2801234fc64d46778021efac2de7e50907ab90c634"}, + {file = "debugpy-1.8.2-cp311-cp311-win32.whl", hash = "sha256:2cbd4d9a2fc5e7f583ff9bf11f3b7d78dfda8401e8bb6856ad1ed190be4281ad"}, + {file = "debugpy-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:d3408fddd76414034c02880e891ea434e9a9cf3a69842098ef92f6e809d09afa"}, + {file = "debugpy-1.8.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:5d3ccd39e4021f2eb86b8d748a96c766058b39443c1f18b2dc52c10ac2757835"}, + {file = "debugpy-1.8.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62658aefe289598680193ff655ff3940e2a601765259b123dc7f89c0239b8cd3"}, + {file = "debugpy-1.8.2-cp312-cp312-win32.whl", hash = "sha256:bd11fe35d6fd3431f1546d94121322c0ac572e1bfb1f6be0e9b8655fb4ea941e"}, + {file = "debugpy-1.8.2-cp312-cp312-win_amd64.whl", hash = "sha256:15bc2f4b0f5e99bf86c162c91a74c0631dbd9cef3c6a1d1329c946586255e859"}, + {file = "debugpy-1.8.2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:5a019d4574afedc6ead1daa22736c530712465c0c4cd44f820d803d937531b2d"}, + {file = "debugpy-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40f062d6877d2e45b112c0bbade9a17aac507445fd638922b1a5434df34aed02"}, + {file = "debugpy-1.8.2-cp38-cp38-win32.whl", hash = "sha256:c78ba1680f1015c0ca7115671fe347b28b446081dada3fedf54138f44e4ba031"}, + {file = "debugpy-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:cf327316ae0c0e7dd81eb92d24ba8b5e88bb4d1b585b5c0d32929274a66a5210"}, + {file = "debugpy-1.8.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:1523bc551e28e15147815d1397afc150ac99dbd3a8e64641d53425dba57b0ff9"}, + {file = "debugpy-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e24ccb0cd6f8bfaec68d577cb49e9c680621c336f347479b3fce060ba7c09ec1"}, + {file = "debugpy-1.8.2-cp39-cp39-win32.whl", hash = "sha256:7f8d57a98c5a486c5c7824bc0b9f2f11189d08d73635c326abef268f83950326"}, + {file = "debugpy-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:16c8dcab02617b75697a0a925a62943e26a0330da076e2a10437edd9f0bf3755"}, + {file = "debugpy-1.8.2-py2.py3-none-any.whl", hash = "sha256:16e16df3a98a35c63c3ab1e4d19be4cbc7fdda92d9ddc059294f18910928e0ca"}, + {file = "debugpy-1.8.2.zip", hash = "sha256:95378ed08ed2089221896b9b3a8d021e642c24edc8fef20e5d4342ca8be65c00"}, 
] [[package]] @@ -897,13 +897,13 @@ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benc [[package]] name = "filelock" -version = "3.15.3" +version = "3.15.4" description = "A platform independent file lock." optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"}, - {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"}, + {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"}, + {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"}, ] [package.extras] @@ -1256,13 +1256,13 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.2.0" +version = "8.0.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.2.0-py3-none-any.whl", hash = "sha256:04e4aad329b8b948a5711d394fa8759cb80f009225441b4f2a02bd4d8e5f426c"}, - {file = "importlib_metadata-7.2.0.tar.gz", hash = "sha256:3ff4519071ed42740522d494d04819b666541b9752c43012f85afb2cc220fcc6"}, + {file = "importlib_metadata-8.0.0-py3-none-any.whl", hash = "sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f"}, + {file = "importlib_metadata-8.0.0.tar.gz", hash = "sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812"}, ] [package.dependencies] @@ -1664,6 +1664,34 @@ files = [ {file = "kiwisolver-1.4.5.tar.gz", hash = "sha256:e57e563a57fb22a142da34f38acc2fc1a5c864bc29ca1517a88abc963e60d6ec"}, ] +[[package]] +name = "lightning-fabric" +version = "2.3.0" +description = "" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lightning-fabric-2.3.0.tar.gz", hash = "sha256:b75438e96caba280141ece3512fd613ba680c102fda90657af1bbd2ea5e95bc1"}, + {file = "lightning_fabric-2.3.0-py3-none-any.whl", hash = "sha256:fff33b1e48a283e486b4a51bc5100b8d6a14dd50278a613c6d964b058584672c"}, +] + +[package.dependencies] +fsspec = {version = ">=2022.5.0", extras = ["http"]} +lightning-utilities = ">=0.8.0" +numpy = ">=1.17.2" +packaging = ">=20.0" +torch = ">=2.0.0" +typing-extensions = ">=4.4.0" + +[package.extras] +all = ["bitsandbytes (>=0.42.0)", "deepspeed (>=0.8.2,<=0.9.3)", "lightning-utilities (>=0.8.0)", "torchmetrics (>=0.10.0)", "torchvision (>=0.15.0)"] +bitsandbytes = ["bitsandbytes (>=0.42.0)"] +deepspeed = ["deepspeed (>=0.8.2,<=0.9.3)"] +dev = ["bitsandbytes (>=0.42.0)", "click (==8.1.7)", "coverage (==7.3.1)", "deepspeed (>=0.8.2,<=0.9.3)", "lightning-utilities (>=0.8.0)", "pytest (==7.4.0)", "pytest-cov (==4.1.0)", "pytest-random-order (==1.1.0)", "pytest-rerunfailures (==12.0)", "pytest-timeout (==2.1.0)", "tensorboardX (>=2.2)", "torchmetrics (>=0.10.0)", "torchmetrics (>=0.7.0)", "torchvision (>=0.15.0)"] +examples = ["lightning-utilities (>=0.8.0)", "torchmetrics (>=0.10.0)", "torchvision (>=0.15.0)"] +strategies = ["bitsandbytes (>=0.42.0)", "deepspeed (>=0.8.2,<=0.9.3)"] +test = ["click (==8.1.7)", "coverage (==7.3.1)", "pytest (==7.4.0)", "pytest-cov (==4.1.0)", "pytest-random-order (==1.1.0)", "pytest-rerunfailures (==12.0)", "pytest-timeout (==2.1.0)", "tensorboardX (>=2.2)", "torchmetrics (>=0.7.0)"] + [[package]] name = "lightning-utilities" version = "0.11.2" @@ -2417,6 +2445,7 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" 
files = [ + {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_aarch64.whl", hash = "sha256:004186d5ea6a57758fd6d57052a123c73a4815adf365eb8dd6a85c9eaa7535ff"}, {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, ] @@ -3110,6 +3139,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -3404,13 +3434,13 @@ files = [ [[package]] name = "setuptools" -version = "70.1.0" +version = "70.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-70.1.0-py3-none-any.whl", hash = "sha256:d9b8b771455a97c8a9f3ab3448ebe0b29b5e105f1228bba41028be116985a267"}, - {file = "setuptools-70.1.0.tar.gz", hash = "sha256:01a1e793faa5bd89abc851fa15d0a0db26f160890c7102cd8dce643e886b47f5"}, + {file = "setuptools-70.1.1-py3-none-any.whl", hash = "sha256:a58a8fde0541dab0419750bcc521fbdf8585f6e5cb41909df3a472ef7b81ca95"}, + {file = "setuptools-70.1.1.tar.gz", hash = "sha256:937a48c7cdb7a21eb53cd7f9b59e525503aa8abaf3584c730dc5f7a5bec3a650"}, ] [package.extras] @@ -3672,13 +3702,13 @@ files = [ [[package]] name = "tenacity" -version = "8.4.1" +version = "8.4.2" description = "Retry code until it succeeds" optional = false python-versions = ">=3.8" files = [ - {file = "tenacity-8.4.1-py3-none-any.whl", hash = "sha256:28522e692eda3e1b8f5e99c51464efcc0b9fc86933da92415168bc1c4e2308fa"}, - {file = "tenacity-8.4.1.tar.gz", hash = "sha256:54b1412b878ddf7e1f1577cd49527bad8cdef32421bd599beac0c6c3f10582fd"}, + {file = "tenacity-8.4.2-py3-none-any.whl", hash = "sha256:9e6f7cf7da729125c7437222f8a522279751cdfbe6b67bfe64f75d3a348661b2"}, + {file = "tenacity-8.4.2.tar.gz", hash = "sha256:cd80a53a79336edba8489e767f729e4f391c896956b57140b5d7511a64bbd3ef"}, ] [package.extras] @@ -4229,4 +4259,4 @@ plotly-resampler = ["plotly-resampler"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<=3.13" -content-hash = "2918a6a6306adfdc98192da9235ddc0863ed75d38aee3c7fdf045dccd505e9ef" +content-hash = "548ba24b8460a79ec563ee453e04ee4625aed2986de4668e80ccd659142e3b56" diff --git a/pyproject.toml b/pyproject.toml index 4e69ae072..217597907 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ plotly = ">=5.13.1" kaleido = "0.2.1" # required for plotly static image export plotly-resampler = { 
version = ">=0.9.2", optional = true } livelossplot = { version = ">=0.5.5", optional = true } +lightning-fabric = "^2.3.0" [tool.poetry.extras] plotly-resampler = ["plotly-resampler"] diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index ac0af79e0..50c037250 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -134,7 +134,7 @@ def create_metrics_plot(metrics): def test_PeytonManning(): df = pd.read_csv(PEYTON_FILE) - m = NeuralProphet() + m = NeuralProphet(deterministic=True) df_train, df_test = m.split_df(df=df, freq="D", valid_p=0.1) system_speed, std = get_system_speed() @@ -160,6 +160,7 @@ def test_YosemiteTemps(): changepoints_range=0.9, n_changepoints=30, weekly_seasonality=False, + deterministic=True, ) df_train, df_test = m.split_df(df=df, freq="5min", valid_p=0.1) @@ -180,7 +181,7 @@ def test_YosemiteTemps(): def test_AirPassengers(): df = pd.read_csv(AIR_FILE) - m = NeuralProphet(seasonality_mode="multiplicative") + m = NeuralProphet(seasonality_mode="multiplicative", deterministic=True) df_train, df_test = m.split_df(df=df, freq="MS", valid_p=0.1) system_speed, std = get_system_speed() @@ -209,6 +210,7 @@ def test_EnergyPriceDaily(): weekly_seasonality=True, daily_seasonality=False, n_lags=14, + deterministic=True, ) m.add_lagged_regressor("temp", n_lags=3) m.add_future_regressor("temperature") From aac70dec5a60e4a54d2f70e49fd43522b4e82095 Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Tue, 25 Jun 2024 19:05:10 -0700 Subject: [PATCH 123/128] fixed ruff linting issues --- tests/test_glocal.py | 183 +++++-------------------------------------- 1 file changed, 19 insertions(+), 164 deletions(-) diff --git a/tests/test_glocal.py b/tests/test_glocal.py index e631b616d..0767fb242 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -205,36 +205,12 @@ def test_wrong_option_global_local_modeling(): forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) - -def test_different_seasonality_modeling(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - m = NeuralProphet( - n_forecasts=2, - n_lags=10, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - season_global_local="local", - yearly_seasonality_glocal_mode="global", + log.info( + f"forecast = {forecast}, metrics = {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets = {forecast_seasonal_componets}" ) - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) -def test_adding_new_global_seasonality(): +def test_different_seasonality_modeling(): # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES log.info("Global Modeling + Global Normalization") df = pd.read_csv(PEYTON_FILE, nrows=512) @@ -253,7 +229,6 @@ def test_adding_new_global_seasonality(): season_global_local="local", yearly_seasonality_glocal_mode="global", ) - m.add_seasonality(period=30, fourier_order=8, name="monthly", 
global_local="global") train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) m.fit(train_df) future = m.make_future_dataframe(test_df) @@ -262,142 +237,9 @@ def test_adding_new_global_seasonality(): forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) - -def test_adding_new_local_seasonality(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - m = NeuralProphet(epochs=EPOCHS, batch_size=BATCH_SIZE, season_global_local="global", trend_global_local="local") - m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="local") - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_trend_local_reg(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: - m = NeuralProphet( - n_forecasts=1, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - trend_global_local="local", - trend_local_reg=coef_i, - ) - - m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_glocal_seasonality_reg(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: - m = NeuralProphet( - n_forecasts=1, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - season_global_local="local", - yearly_seasonality_glocal_mode="global", - glocal_seasonality_reg=coef_i, - ) - - m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - metrics = m.test(test_df) - - -def test_trend_local_reg_if_global(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, 
nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: - m = NeuralProphet( - n_forecasts=1, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - trend_global_local="global", - trend_local_reg=3, - ) - - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df, n_historic_predictions=True) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) - - -def test_different_seasonality_modeling(): - # SEASONALITY GLOBAL LOCAL MODELLING - NO EXOGENOUS VARIABLES - log.info("Global Modeling + Global Normalization") - df = pd.read_csv(PEYTON_FILE, nrows=512) - df1_0 = df.iloc[:128, :].copy(deep=True) - df1_0["ID"] = "df1" - df2_0 = df.iloc[128:256, :].copy(deep=True) - df2_0["ID"] = "df2" - df3_0 = df.iloc[256:384, :].copy(deep=True) - df3_0["ID"] = "df3" - m = NeuralProphet( - n_forecasts=2, - n_lags=10, - epochs=EPOCHS, - batch_size=BATCH_SIZE, - learning_rate=LR, - season_global_local="local", - yearly_seasonality_glocal_mode="global", + log.info( + f"forecast = {forecast}, metrics = {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets = {forecast_seasonal_componets}" ) - train_df, test_df = m.split_df(pd.concat((df1_0, df2_0, df3_0)), valid_p=0.33, local_split=True) - m.fit(train_df) - future = m.make_future_dataframe(test_df) - forecast = m.predict(future) - metrics = m.test(test_df) - forecast_trend = m.predict_trend(test_df) - forecast_seasonal_componets = m.predict_seasonal_components(test_df) def test_adding_new_global_seasonality(): @@ -427,6 +269,9 @@ def test_adding_new_global_seasonality(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.info( + f"forecast = {forecast}, metrics = {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets = {forecast_seasonal_componets}" + ) def test_adding_new_local_seasonality(): @@ -448,6 +293,9 @@ def test_adding_new_local_seasonality(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.info( + f"forecast = {forecast}, metrics = {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets = {forecast_seasonal_componets}" + ) def test_trend_local_reg(): @@ -478,6 +326,9 @@ def test_trend_local_reg(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.info( + f"forecast = {forecast}, metrics = {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets = {forecast_seasonal_componets}" + ) def test_glocal_seasonality_reg(): @@ -498,7 +349,7 @@ def test_glocal_seasonality_reg(): learning_rate=LR, season_global_local="local", yearly_seasonality_glocal_mode="global", - seasonality_local_reg=coef_i, + glocal_seasonality_reg=coef_i, ) m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") @@ -507,6 +358,7 @@ def test_glocal_seasonality_reg(): future = m.make_future_dataframe(test_df, n_historic_predictions=True) forecast = m.predict(future) metrics = m.test(test_df) + 
log.info(f"forecast = {forecast}, metrics = {metrics}") def test_trend_local_reg_if_global(): @@ -536,3 +388,6 @@ def test_trend_local_reg_if_global(): metrics = m.test(test_df) forecast_trend = m.predict_trend(test_df) forecast_seasonal_componets = m.predict_seasonal_components(test_df) + log.info( + f"forecast = {forecast}, metrics = {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets = {forecast_seasonal_componets}" + ) From ec76aae37d0a63c453adfe3f248c05c425d12e96 Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Tue, 25 Jun 2024 19:29:09 -0700 Subject: [PATCH 124/128] fixed glocal test --- tests/test_glocal.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_glocal.py b/tests/test_glocal.py index 0767fb242..5c171d597 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -341,7 +341,7 @@ def test_glocal_seasonality_reg(): df2_0["ID"] = "df2" df3_0 = df.iloc[256:384, :].copy(deep=True) df3_0["ID"] = "df3" - for coef_i in [-30, 0, False, True]: + for _ in [-30, 0, False, True]: m = NeuralProphet( n_forecasts=1, epochs=EPOCHS, @@ -349,7 +349,6 @@ def test_glocal_seasonality_reg(): learning_rate=LR, season_global_local="local", yearly_seasonality_glocal_mode="global", - glocal_seasonality_reg=coef_i, ) m.add_seasonality(period=30, fourier_order=8, name="monthly", global_local="global") From 19d8e7a24337311fff14ffdcc17f5df972a23815 Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Tue, 25 Jun 2024 19:45:01 -0700 Subject: [PATCH 125/128] fix lock file --- poetry.lock | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index e33ec5716..ac42351f9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -896,7 +896,7 @@ files = [ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] [[package]] -name = "filelock" +name = "file" version = "3.15.4" description = "A platform independent file lock." optional = false @@ -4258,10 +4258,5 @@ plotly-resampler = ["plotly-resampler"] [metadata] lock-version = "2.0" -<<<<<<< HEAD python-versions = ">=3.9,<3.13" content-hash = "d08c423b7a0c27143741287c01f7b597d7af8f45c4c4108194af7be93f442e54" -======= -python-versions = ">=3.9,<=3.13" -content-hash = "548ba24b8460a79ec563ee453e04ee4625aed2986de4668e80ccd659142e3b56" ->>>>>>> bug/make_tests_deterministic From c533f01490a115969f28c11e4308b3d7492af94b Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Tue, 25 Jun 2024 19:47:58 -0700 Subject: [PATCH 126/128] update poetry --- poetry.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index ac42351f9..4df2de293 100644 --- a/poetry.lock +++ b/poetry.lock @@ -896,7 +896,7 @@ files = [ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] [[package]] -name = "file" +name = "filelock" version = "3.15.4" description = "A platform independent file lock." 
optional = false @@ -4259,4 +4259,4 @@ plotly-resampler = ["plotly-resampler"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "d08c423b7a0c27143741287c01f7b597d7af8f45c4c4108194af7be93f442e54" +content-hash = "7c8e2b1178f0498721e849f427703bfcda1ecba529d25bcd93f5a00a5daedbe2" From ad449c2db5ddf64878b93e5e26fe7b7e767b9351 Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Wed, 26 Jun 2024 10:35:24 -0700 Subject: [PATCH 127/128] moved the deterministic flag to the train method --- neuralprophet/forecaster.py | 8 +++++--- tests/test_model_performance.py | 24 ++++++++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index d258a256e..9993fd832 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -435,11 +435,9 @@ def __init__( accelerator: Optional[str] = None, trainer_config: dict = {}, prediction_frequency: Optional[dict] = None, - deterministic=False, ): self.config = locals() self.config.pop("self") - self.deterministic = deterministic # General self.name = "NeuralProphet" @@ -907,6 +905,7 @@ def fit( checkpointing: bool = False, continue_training: bool = False, num_workers: int = 0, + deterministic: bool = False, ): """Train, and potentially evaluate model. @@ -1071,6 +1070,7 @@ def fit( checkpointing_enabled=checkpointing, continue_training=continue_training, num_workers=num_workers, + deterministic=deterministic, ) else: df_val, _, _, _ = df_utils.prep_or_copy_df(validation_df) @@ -1095,6 +1095,7 @@ def fit( checkpointing_enabled=checkpointing, continue_training=continue_training, num_workers=num_workers, + deterministic=deterministic, ) # Show training plot @@ -2716,6 +2717,7 @@ def _train( checkpointing_enabled: bool = False, continue_training=False, num_workers=0, + deterministic: bool = False, ): """ Execute model training procedure for a configured number of epochs. 
@@ -2773,7 +2775,7 @@ def _train( metrics_enabled=metrics_enabled, checkpointing_enabled=checkpointing_enabled, num_batches_per_epoch=len(train_loader), - deterministic=self.deterministic, + deterministic=deterministic, ) # Tune hyperparams and train diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py index 50c037250..af512d535 100644 --- a/tests/test_model_performance.py +++ b/tests/test_model_performance.py @@ -134,12 +134,12 @@ def create_metrics_plot(metrics): def test_PeytonManning(): df = pd.read_csv(PEYTON_FILE) - m = NeuralProphet(deterministic=True) + m = NeuralProphet() df_train, df_test = m.split_df(df=df, freq="D", valid_p=0.1) system_speed, std = get_system_speed() start = time.time() - metrics = m.fit(df_train, validation_df=df_test, freq="D") # , early_stopping=True) + metrics = m.fit(df_train, validation_df=df_test, freq="D", deterministic=True) # , early_stopping=True) end = time.time() accuracy_metrics = metrics.to_dict("records")[-1] @@ -160,13 +160,17 @@ def test_YosemiteTemps(): changepoints_range=0.9, n_changepoints=30, weekly_seasonality=False, - deterministic=True, ) df_train, df_test = m.split_df(df=df, freq="5min", valid_p=0.1) system_speed, std = get_system_speed() start = time.time() - metrics = m.fit(df_train, validation_df=df_test, freq="5min") # , early_stopping=True) + metrics = m.fit( + df_train, + validation_df=df_test, + freq="5min", + deterministic=True, + ) # , early_stopping=True) end = time.time() accuracy_metrics = metrics.to_dict("records")[-1] @@ -181,12 +185,12 @@ def test_YosemiteTemps(): def test_AirPassengers(): df = pd.read_csv(AIR_FILE) - m = NeuralProphet(seasonality_mode="multiplicative", deterministic=True) + m = NeuralProphet(seasonality_mode="multiplicative") df_train, df_test = m.split_df(df=df, freq="MS", valid_p=0.1) system_speed, std = get_system_speed() start = time.time() - metrics = m.fit(df_train, validation_df=df_test, freq="MS") # , early_stopping=True) + metrics = m.fit(df_train, validation_df=df_test, freq="MS", deterministic=True) # , early_stopping=True) end = time.time() accuracy_metrics = metrics.to_dict("records")[-1] @@ -210,7 +214,6 @@ def test_EnergyPriceDaily(): weekly_seasonality=True, daily_seasonality=False, n_lags=14, - deterministic=True, ) m.add_lagged_regressor("temp", n_lags=3) m.add_future_regressor("temperature") @@ -219,7 +222,12 @@ def test_EnergyPriceDaily(): system_speed, std = get_system_speed() start = time.time() - metrics = m.fit(df_train, validation_df=df_test, freq="D") # , early_stopping=True) + metrics = m.fit( + df_train, + validation_df=df_test, + freq="D", + deterministic=True, + ) # , early_stopping=True) end = time.time() accuracy_metrics = metrics.to_dict("records")[-1] From 39b69131f927f96f39f77b93c3a42256adf77ee7 Mon Sep 17 00:00:00 2001 From: MaiBe-ctrl Date: Wed, 26 Jun 2024 15:29:21 -0700 Subject: [PATCH 128/128] update lock file --- poetry.lock | 2 +- tests/test_event_utils.py | 1 - tests/test_glocal.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 72dbb454d..43abed65d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4259,4 +4259,4 @@ plotly-resampler = ["plotly-resampler"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "7c8e2b1178f0498721e849f427703bfcda1ecba529d25bcd93f5a00a5daedbe2" +content-hash = "a3b767eec027be911e9499276840e4740231a7fff6e5658c2f38a36b00e72451" diff --git a/tests/test_event_utils.py b/tests/test_event_utils.py index 0d0c75b96..49e24b4c3 100644 --- 
a/tests/test_event_utils.py +++ b/tests/test_event_utils.py @@ -6,7 +6,6 @@ import holidays import matplotlib.pyplot as plt -import numpy as np import pandas as pd import pytest from holidays import country_holidays diff --git a/tests/test_glocal.py b/tests/test_glocal.py index bc7fabe8d..9bda1882c 100644 --- a/tests/test_glocal.py +++ b/tests/test_glocal.py @@ -273,7 +273,7 @@ def test_adding_new_global_seasonality(): forecast_seasonal_componets = m.predict_seasonal_components(test_df) log.debug( f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" - + ) def test_adding_new_local_seasonality(): @@ -297,6 +297,7 @@ def test_adding_new_local_seasonality(): forecast_seasonal_componets = m.predict_seasonal_components(test_df) log.debug( f"forecast = {forecast}, metrics= {metrics}, forecast_trend = {forecast_trend}, forecast_seasonal_componets= {forecast_seasonal_componets}" + ) def test_trend_local_reg(): @@ -343,7 +344,6 @@ def test_glocal_seasonality_reg(): df3_0 = df.iloc[256:384, :].copy(deep=True) df3_0["ID"] = "df3" for coef_i in [0, 1.5, False, True]: - m = NeuralProphet( n_forecasts=1, epochs=EPOCHS,
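
Taken together, the last two patches change where reproducibility is configured: the deterministic flag is removed from the NeuralProphet constructor and becomes a keyword argument of fit(), which forwards it to the internal _train() call. A minimal usage sketch of the resulting API, assuming only what the diffs above show — the toy dataframe, the epoch count, and the variable names are illustrative, not taken from the patches:

    import pandas as pd
    from neuralprophet import NeuralProphet

    # Illustrative daily series in the (ds, y) format NeuralProphet expects.
    df = pd.DataFrame({
        "ds": pd.date_range("2020-01-01", periods=100, freq="D"),
        "y": [float(i % 7) for i in range(100)],  # toy weekly pattern
    })

    m = NeuralProphet(epochs=5)  # per patch 127, no deterministic kwarg here anymore
    metrics = m.fit(df, freq="D", deterministic=True)  # the flag now lives on fit()
    forecast = m.predict(df)

Passing the flag per call matches how the performance tests were updated: each m.fit(...) in tests/test_model_performance.py now requests determinism for that particular training run instead of baking it into the model object at construction time.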