diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py index cc1e80b52..308d97370 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +++ b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py @@ -4,9 +4,11 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import importlib +import logging import numpy as np import pandas as pd +import report_creator as rc from merlion.post_process.threshold import AggregateAlarms from merlion.utils import TimeSeries @@ -21,6 +23,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("root").setLevel(logging.WARNING) + class AnomalyMerlionOperatorModel(AnomalyOperatorBaseModel): """Class representing Merlion Anomaly Detection operator model.""" @@ -84,7 +88,7 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): data = df.set_index(date_column) data = TimeSeries.from_pd(data) - for model_name, (model_config, model) in model_config_map.items(): + for _, (model_config, model) in model_config_map.items(): if self.spec.model == SupportedModels.BOCPD: model_config = model_config(**self.spec.model_kwargs) else: @@ -115,7 +119,7 @@ def _build_model(self) -> AnomalyOutput: y_pred = (y_pred.to_pd().reset_index()["anom_score"] > 0).astype( int ) - except Exception as e: + except Exception: y_pred = ( scores["anom_score"] > np.percentile( @@ -135,15 +139,12 @@ def _build_model(self) -> AnomalyOutput: OutputColumns.SCORE_COL: scores["anom_score"], } ).reset_index(drop=True) - # model_objects[model_name].append(model) anomaly_output.add_output(target, anomaly, score) return anomaly_output def _generate_report(self): """Genreates a report for the model.""" - import report_creator as rc - other_sections = [ rc.Heading("Selected Models Overview", level=2), rc.Text( diff --git a/ads/opctl/operator/lowcode/anomaly/model/automlx.py b/ads/opctl/operator/lowcode/anomaly/model/automlx.py index a6deef1fa..6e665c125 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/automlx.py +++ b/ads/opctl/operator/lowcode/anomaly/model/automlx.py @@ -1,16 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency -from .anomaly_dataset import AnomalyOutput +from ads.opctl import logger +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns + +logging.getLogger("root").setLevel(logging.WARNING) class AutoMLXOperatorModel(AnomalyOperatorBaseModel): @@ -25,16 +30,17 @@ class AutoMLXOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> pd.DataFrame: - from automlx import init import logging + import automlx + try: - init( + automlx.init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, ) - except Exception as e: + except Exception: logger.info("Ray already initialized") date_column = self.spec.datetime_column.name anomaly_output = AnomalyOutput(date_column=date_column) @@ -73,8 +79,6 @@ def _build_model(self) -> pd.DataFrame: return anomaly_output def _generate_report(self): - import report_creator as rc - """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/autots.py b/ads/opctl/operator/lowcode/anomaly/model/autots.py index c795440de..32702596c 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/autots.py +++ b/ads/opctl/operator/lowcode/anomaly/model/autots.py @@ -1,9 +1,12 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger from ads.opctl.operator.lowcode.anomaly.const import OutputColumns @@ -12,6 +15,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("root").setLevel(logging.WARNING) + class AutoTSOperatorModel(AnomalyOperatorBaseModel): """Class representing AutoTS Anomaly Detection operator model.""" @@ -91,8 +96,6 @@ def _build_model(self) -> AnomalyOutput: return anomaly_output def _generate_report(self): - import report_creator as rc - """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index e8de5213e..c24068ccb 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -3,6 +3,7 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import tempfile import time @@ -12,6 +13,7 @@ import fsspec import numpy as np import pandas as pd +import report_creator as rc from sklearn import linear_model from ads.common.object_storage_details import ObjectStorageDetails @@ -33,6 +35,8 @@ from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData +logging.getLogger("root").setLevel(logging.WARNING) + class AnomalyOperatorBaseModel(ABC): """The base class for the anomaly detection operator models.""" @@ -59,8 +63,8 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets): def generate_report(self): """Generates the report.""" import matplotlib.pyplot as plt - plt.rcParams.update({'figure.max_open_warning': 0}) - import report_creator as rc + + plt.rcParams.update({"figure.max_open_warning": 0}) start_time = time.time() # fallback using sklearn oneclasssvm when the sub model _build_model fails @@ -84,7 +88,13 @@ def generate_report(self): anomaly_output, test_data, elapsed_time ) table_blocks = [ - rc.DataTable(df.head(SUBSAMPLE_THRESHOLD) if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD else df, label=col, index=True) + rc.DataTable( + df.head(SUBSAMPLE_THRESHOLD) + if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD + else df, + label=col, + index=True, + ) for col, df in self.datasets.full_data_dict.items() ] data_table = rc.Select(blocks=table_blocks) @@ -144,7 +154,9 @@ def generate_report(self): else: figure_blocks = None - blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None + blocks.append( + rc.Group(*figure_blocks, label=target) + ) if figure_blocks else None plots = rc.Select(blocks) report_sections = [] @@ -154,7 +166,9 @@ def generate_report(self): yaml_appendix = rc.Yaml(self.config.to_dict()) summary = rc.Block( rc.Group( - rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"), + rc.Text( + f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n" + ), rc.Text( "Based on your dataset, you could have also selected " f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." @@ -285,8 +299,6 @@ def _save_report( test_metrics: pd.DataFrame, ): """Saves resulting reports to the given folder.""" - import report_creator as rc - unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): diff --git a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py index 0083ad0fd..b5adfd6cc 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py @@ -1,17 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .base_model import AnomalyOperatorBaseModel from .anomaly_dataset import AnomalyOutput -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .base_model import AnomalyOperatorBaseModel + +logging.getLogger("root").setLevel(logging.WARNING) class IsolationForestOperatorModel(AnomalyOperatorBaseModel): @@ -36,13 +40,9 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = IsolationForest(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)( - model.predict(df) - ) + y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) - scores = model.score_samples( - df - ) + scores = model.score_samples(df) index_col = df.columns[0] @@ -59,7 +59,6 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py index 157f7eb60..c6d3269ad 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +++ b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py @@ -1,17 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .base_model import AnomalyOperatorBaseModel from .anomaly_dataset import AnomalyOutput -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .base_model import AnomalyOperatorBaseModel + +logging.getLogger("root").setLevel(logging.WARNING) class OneClassSVMOperatorModel(AnomalyOperatorBaseModel): @@ -36,13 +40,9 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = OneClassSVM(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)( - model.predict(df) - ) + y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) - scores = model.score_samples( - df - ) + scores = model.score_samples(df) index_col = df.columns[0] @@ -54,12 +54,11 @@ def _build_model(self) -> AnomalyOutput: ).reset_index(drop=True) anomaly_output.add_output(target, anomaly, score) - + return anomaly_output def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py index 17f19351d..0ea344228 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -3,8 +3,11 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger @@ -13,6 +16,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("root").setLevel(logging.WARNING) + class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): """ @@ -27,7 +32,7 @@ class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> AnomalyOutput: - from rrcf import RCTree + import rrcf model_kwargs = self.spec.model_kwargs @@ -96,7 +101,6 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/common/data.py b/ads/opctl/operator/lowcode/common/data.py index 530a1d392..9426bd284 100644 --- a/ads/opctl/operator/lowcode/common/data.py +++ b/ads/opctl/operator/lowcode/common/data.py @@ -1,29 +1,28 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import time -from .transformations import Transformations +from abc import ABC, abstractmethod + +import pandas as pd + from ads.opctl import logger from ads.opctl.operator.lowcode.common.const import DataColumns -from ads.opctl.operator.lowcode.common.utils import load_data from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, InvalidParameterError, - PermissionsError, - DataMismatchError, ) -from abc import ABC -import pandas as pd +from ads.opctl.operator.lowcode.common.utils import load_data + +from .transformations import Transformations class AbstractData(ABC): def __init__(self, spec: dict, name="input_data"): self.Transformations = Transformations self.data = None - self._data_dict = dict() + self._data_dict = {} self.name = name self.spec = spec self.load_transform_ingest_data(spec) @@ -35,12 +34,15 @@ def get_raw_data_by_cat(self, category): condition = pd.Series(True, index=self.raw_data.index) if category in mapping: for col, val in mapping[category].items(): - condition &= (self.raw_data[col] == val) + condition &= self.raw_data[col] == val data_by_cat = self.raw_data[condition].reset_index(drop=True) - data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat + data_by_cat = ( + self._data_transformer._format_datetime_col(data_by_cat) + if self.spec.datetime_column + else data_by_cat + ) return data_by_cat - def get_dict_by_series(self): if not self._data_dict: for s_id in self.list_series_ids(): @@ -59,12 +61,12 @@ def get_data_for_series(self, series_id): data_dict = self.get_dict_by_series() try: return data_dict[series_id] - except: + except Exception as e: raise InvalidParameterError( f"Unable to retrieve series {series_id} from {self.name}. 
Available series ids are: {self.list_series_ids()}" - ) + ) from e - def _load_data(self, data_spec, **kwargs): + def _load_data(self, data_spec): loading_start_time = time.time() try: raw_data = load_data(data_spec) @@ -77,7 +79,7 @@ def _load_data(self, data_spec, **kwargs): ) return raw_data - def _transform_data(self, spec, raw_data, **kwargs): + def _transform_data(self, spec, raw_data): transformation_start_time = time.time() self._data_transformer = self.Transformations(spec, name=self.name) data = self._data_transformer.run(raw_data) @@ -92,6 +94,7 @@ def load_transform_ingest_data(self, spec): self.data = self._transform_data(spec, self.raw_data) self._ingest_data(spec) + @abstractmethod def _ingest_data(self, spec): pass diff --git a/ads/opctl/operator/lowcode/forecast/model/arima.py b/ads/opctl/operator/lowcode/forecast/model/arima.py index 6bbd58d34..87edccdfa 100644 --- a/ads/opctl/operator/lowcode/forecast/model/arima.py +++ b/ads/opctl/operator/lowcode/forecast/model/arima.py @@ -1,23 +1,26 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + import pandas as pd -import numpy as np import pmdarima as pm +import report_creator as rc from joblib import Parallel, delayed from ads.opctl import logger - -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe + +from ..const import ForecastOutputColumns, SupportedModels from ..operator_config import ForecastOperatorConfig -import traceback +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ..const import ForecastOutputColumns, SupportedModels + +logging.getLogger("root").setLevel(logging.WARNING) class ArimaOperatorModel(ForecastOperatorBaseModel): @@ -39,7 +42,7 @@ def set_kwargs(self): ) model_kwargs = self.spec.model_kwargs model_kwargs["alpha"] = 1 - self.spec.confidence_interval_width - if "error_action" not in model_kwargs.keys(): + if "error_action" not in model_kwargs: model_kwargs["error_action"] = "ignore" return model_kwargs @@ -129,13 +132,14 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc()} + "error_trace": traceback.format_exc(), + } logger.warn(f"Encountered Error: {e}. 
Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() + self.models = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -154,8 +158,6 @@ def _build_model(self) -> pd.DataFrame: def _generate_report(self): """The method that needs to be implemented on the particular model level.""" - import report_creator as rc - all_sections = [] if len(self.models) > 0: sec5_text = rc.Heading("ARIMA Model Parameters", level=2) diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py index eda6112b4..41846a5d3 100644 --- a/ads/opctl/operator/lowcode/forecast/model/automlx.py +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -1,29 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -import traceback - # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback -import pandas as pd import numpy as np +import pandas as pd +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl import logger +from ads.opctl.operator.lowcode.common.utils import ( + seconds_to_datetime, +) from ads.opctl.operator.lowcode.forecast.const import ( AUTOMLX_METRIC_MAP, ForecastOutputColumns, SupportedModels, ) -from ads.opctl import logger +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe -from .base_model import ForecastOperatorBaseModel from ..operator_config import ForecastOperatorConfig +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ads.opctl.operator.lowcode.common.utils import ( - seconds_to_datetime, - datetime_to_seconds, -) -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe +logging.getLogger("root").setLevel(logging.WARNING) AUTOMLX_N_ALGOS_TUNED = 4 AUTOMLX_DEFAULT_SCORE_METRIC = "neg_sym_mean_abs_percent_error" @@ -47,12 +48,13 @@ def set_kwargs(self): ) model_kwargs_cleaned.pop("task", None) time_budget = model_kwargs_cleaned.pop("time_budget", -1) - model_kwargs_cleaned[ - "preprocessing" - ] = self.spec.preprocessing.enabled or model_kwargs_cleaned.get("preprocessing", True) + model_kwargs_cleaned["preprocessing"] = ( + self.spec.preprocessing.enabled + or model_kwargs_cleaned.get("preprocessing", True) + ) return model_kwargs_cleaned, time_budget - def preprocess(self, data, series_id=None): # TODO: re-use self.le for explanations + def preprocess(self, data): # TODO: re-use self.le for explanations _, df_encoded = _label_encode_dataframe( data, no_encode={self.spec.datetime_column.name, self.original_target_column}, @@ -74,11 +76,12 @@ def preprocess(self, data, series_id=None): # TODO: re-use self.le for explanat ), ) def _build_model(self) -> pd.DataFrame: - from automlx import init import logging + import automlx + try: - init( + automlx.init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, @@ -88,7 +91,7 @@ def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() + self.models = {} horizon = self.spec.horizon self.spec.confidence_interval_width = self.spec.confidence_interval_width or 0.8 
self.forecast_output = ForecastOutput( @@ -101,7 +104,7 @@ def _build_model(self) -> pd.DataFrame: # Clean up kwargs for pass through model_kwargs_cleaned, time_budget = self.set_kwargs() - for i, (s_id, df) in enumerate(full_data_dict.items()): + for s_id, df in full_data_dict.items(): try: logger.debug(f"Running automlx on series {s_id}") model_kwargs = model_kwargs_cleaned.copy() @@ -170,7 +173,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -197,15 +200,12 @@ def _generate_report(self): - ds_forecast_col (pd.Series): The pd.Series object representing the forecasted column. - ci_col_names (List[str]): A list of column names for the confidence interval in the report. """ - import report_creator as rc - - """The method that needs to be implemented on the particular model level.""" - selected_models = dict() + selected_models = {} models = self.models other_sections = [] if len(self.models) > 0: - for i, (s_id, m) in enumerate(models.items()): + for s_id, m in models.items(): selected_models[s_id] = { "series_id": s_id, "selected_model": m.selected_model_, @@ -352,7 +352,7 @@ def _custom_predict_automlx(self, data): """ data_temp = pd.DataFrame( data, - columns=[col for col in self.dataset_cols], + columns=list(self.dataset_cols), ) return self.models.get(self.series_id).forecast( diff --git a/ads/opctl/operator/lowcode/forecast/model/autots.py b/ads/opctl/operator/lowcode/forecast/model/autots.py index 37b57ca75..fac04a898 100644 --- a/ads/opctl/operator/lowcode/forecast/model/autots.py +++ b/ads/opctl/operator/lowcode/forecast/model/autots.py @@ -1,24 +1,26 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import copy +import logging import traceback + import pandas as pd -import numpy as np +import report_creator as rc import yaml +from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list + +from ..const import ForecastOutputColumns, SupportedModels from ..operator_config import ForecastOperatorConfig -from ads.common.decorator.runtime_dependency import runtime_dependency +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ..const import ForecastOutputColumns, SupportedModels -from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list +logging.getLogger("root").setLevel(logging.WARNING) AUTOTS_MAX_GENERATION = 10 AUTOTS_MODELS_TO_VALIDATE = 0.15 @@ -43,10 +45,9 @@ def _build_model(self) -> pd.DataFrame: """ # Import necessary libraries - from autots import AutoTS, create_regressor + from autots import AutoTS self.outputs = None - models = dict() # Get the name of the datetime column self.forecast_output = ForecastOutput( confidence_interval_width=self.spec.confidence_interval_width, @@ -208,7 +209,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -231,7 +232,6 @@ def _generate_report(self) -> tuple: - ds_forecast_col (pd.Index): A pandas Index containing the forecast column values. - ci_col_names (list): A list of column names for confidence intervals. """ - import report_creator as rc all_sections = [] if self.models: @@ -258,18 +258,16 @@ def _generate_report(self) -> tuple: yaml.dump(list(self.models.best_model.T.to_dict().values())[0]), ) - except KeyError as ke: - logger.warn( - f"Issue generating Model Parameters Table Section. Skipping" - ) + except KeyError: + logger.warn("Issue generating Model Parameters Table Section. Skipping") sec2 = rc.Text("Error generating model parameters.") section_2 = rc.Block(sec2_text, sec2) - all_sections = [sec_1_plots, section_2] + all_sections = [section_1, section_2] if self.spec.generate_explanations: - logger.warn(f"Explanations not yet supported for the AutoTS Module") + logger.warn("Explanations not yet supported for the AutoTS Module") # Model Description model_description = rc.Text( @@ -305,7 +303,7 @@ def generate_train_metrics(self) -> pd.DataFrame: ).T df = pd.concat([mapes, scores]) except Exception as e: - logger.debug(f"Failed to generate training metrics") + logger.debug("Failed to generate training metrics") logger.debug(f"Received Error Statement: {e}") return df diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index 6045826f1..84aa53208 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -1,52 +1,57 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import fsspec -import numpy as np +import logging import os -import pandas as pd import tempfile import time import traceback from abc import ABC, abstractmethod from typing import Tuple +import fsspec +import numpy as np +import pandas as pd +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger from ads.opctl.operator.lowcode.common.utils import ( - human_time_friendly, - enable_print, + datetime_to_seconds, disable_print, - write_data, + enable_print, + human_time_friendly, merged_category_column_name, - datetime_to_seconds, seconds_to_datetime, + write_data, ) from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData from ads.opctl.operator.lowcode.forecast.utils import ( + _build_metrics_df, + _build_metrics_per_horizon, + _label_encode_dataframe, default_signer, evaluate_train_metrics, - get_forecast_plots, get_auto_select_plot, - _build_metrics_df, - _build_metrics_per_horizon, + get_forecast_plots, load_pkl, write_pkl, - _label_encode_dataframe, ) -from .forecast_datasets import ForecastDatasets + from ..const import ( + AUTO_SELECT, SUMMARY_METRICS_HORIZON_LIMIT, + SpeedAccuracyMode, SupportedMetrics, SupportedModels, - SpeedAccuracyMode, - AUTO_SELECT ) from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec +from .forecast_datasets import ForecastDatasets + +logging.getLogger("root").setLevel(logging.WARNING) class ForecastOperatorBaseModel(ABC): @@ -70,7 +75,7 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): self.original_target_column = self.spec.target_column self.dt_column_name = self.spec.datetime_column.name - self.model_parameters = dict() + self.model_parameters = {} self.loaded_models = None # these fields are populated in the _build_model() method @@ -79,20 +84,21 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): # "outputs" is a list of outputs generated by the models. 
These should only be generated when the framework requires the original output for plotting self.outputs = None self.forecast_output = None - self.errors_dict = dict() - self.le = dict() + self.errors_dict = {} + self.le = {} self.formatted_global_explanation = None self.formatted_local_explanation = None self.forecast_col_name = "yhat" - self.perform_tuning = (self.spec.tuning != None) and ( - self.spec.tuning.n_trials != None + self.perform_tuning = (self.spec.tuning is not None) and ( + self.spec.tuning.n_trials is not None ) def generate_report(self): """Generates the forecasting report.""" import warnings + from sklearn.exceptions import ConvergenceWarning with warnings.catch_warnings(): @@ -100,7 +106,6 @@ def generate_report(self): warnings.simplefilter(action="ignore", category=UserWarning) warnings.simplefilter(action="ignore", category=RuntimeWarning) warnings.simplefilter(action="ignore", category=ConvergenceWarning) - import report_creator as rc # load models if given if self.spec.previous_output_dir is not None: @@ -128,7 +133,7 @@ def generate_report(self): ) = self._test_evaluate_metrics( elapsed_time=elapsed_time, ) - except Exception as e: + except Exception: logger.warn("Unable to generate Test Metrics.") logger.debug(f"Full Traceback: {traceback.format_exc()}") report_sections = [] @@ -253,25 +258,30 @@ def generate_report(self): backtest_report_name = "backtest_stats.csv" file_path = f"{output_dir}/{backtest_report_name}" if self.spec.model == AUTO_SELECT: - backtest_sections.append(rc.Heading("Auto-select statistics", level=2)) + backtest_sections.append( + rc.Heading("Auto-select statistics", level=2) + ) if not os.path.exists(file_path): - failure_msg = rc.Text("auto-select could not be executed. Please check the " - "logs for more details.") + failure_msg = rc.Text( + "auto-select could not be executed. Please check the " + "logs for more details." + ) backtest_sections.append(failure_msg) else: backtest_stats = pd.read_csv(file_path) average_dict = backtest_stats.mean().to_dict() - del average_dict['backtest'] + del average_dict["backtest"] best_model = min(average_dict, key=average_dict.get) backtest_text = rc.Heading("Back Testing Metrics", level=3) summary_text = rc.Text( f"Overall, the average scores for the models are {average_dict}, with {best_model}" - f" being identified as the top-performing model during backtesting.") + f" being identified as the top-performing model during backtesting." 
+ ) backtest_table = rc.DataTable(backtest_stats, index=True) liner_plot = get_auto_select_plot(backtest_stats) - backtest_sections.extend([backtest_text, backtest_table, summary_text, - liner_plot]) - + backtest_sections.extend( + [backtest_text, backtest_table, summary_text, liner_plot] + ) forecast_plots = [] if len(self.forecast_output.list_series_ids()) > 0: @@ -431,14 +441,13 @@ def _save_report( test_metrics_df: pd.DataFrame, ): """Saves resulting reports to the given folder.""" - import report_creator as rc unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): storage_options = default_signer() else: - storage_options = dict() + storage_options = {} # report-creator html report if self.spec.generate_report: @@ -580,7 +589,7 @@ def _save_report( indent=4, ) else: - logger.info(f"All modeling completed successfully.") + logger.info("All modeling completed successfully.") def preprocess(self, df, series_id): """The method that needs to be implemented on the particular model level.""" @@ -622,8 +631,8 @@ def generate_train_metrics(self) -> pd.DataFrame: def _load_model(self): try: self.loaded_models = load_pkl(self.spec.previous_output_dir + "/model.pkl") - except: - logger.info("model.pkl is not present") + except Exception as e: + logger.info(f"model.pkl is not present. Error: {e}") def _save_model(self, output_dir, storage_options): write_pkl( @@ -693,7 +702,7 @@ def explain_model(self): if not len(kernel_explnr_vals): logger.warn( - f"No explanations generated. Ensure that additional data has been provided." + "No explanations generated. Ensure that additional data has been provided." ) else: self.global_explanation[s_id] = dict( diff --git a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py index c3804f88d..73a81ac0b 100644 --- a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +++ b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py @@ -1,33 +1,23 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2023 Oracle and/or its affiliates. +# Copyright (c) 2023, 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import time import pandas as pd -from pandas.api.types import is_datetime64_any_dtype, is_string_dtype, is_numeric_dtype -from ..operator_config import ForecastOperatorConfig from ads.opctl import logger -from ..const import ForecastOutputColumns, PROPHET_INTERNAL_DATE_COL -from ads.common.object_storage_details import ObjectStorageDetails -from ads.opctl.operator.lowcode.common.utils import ( - get_frequency_in_seconds, - get_frequency_of_datetime, -) from ads.opctl.operator.lowcode.common.data import AbstractData -from ads.opctl.operator.lowcode.forecast.utils import ( - default_signer, -) from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, - InvalidParameterError, - PermissionsError, DataMismatchError, + InvalidParameterError, +) +from ads.opctl.operator.lowcode.common.utils import ( + get_frequency_in_seconds, + get_frequency_of_datetime, ) -from ..const import SupportedModels -from abc import ABC, abstractmethod + +from ..const import ForecastOutputColumns, SupportedModels +from ..operator_config import ForecastOperatorConfig class HistoricalData(AbstractData): @@ -51,13 +41,12 @@ def _verify_dt_col(self, spec): self.freq_in_secs = get_frequency_in_seconds( self.data.index.get_level_values(0) ) - if spec.model == SupportedModels.AutoMLX: - if abs(self.freq_in_secs) < 3600: - message = ( - "{} requires data with a frequency of at least one hour. Please try using a different model," - " or select the 'auto' option.".format(SupportedModels.AutoMLX) - ) - raise InvalidParameterError(message) + if spec.model == SupportedModels.AutoMLX and abs(self.freq_in_secs) < 3600: + message = ( + f"{SupportedModels.AutoMLX} requires data with a frequency of at least one hour. Please try using a different model," + " or select the 'auto' option." + ) + raise InvalidParameterError(message) class AdditionalData(AbstractData): @@ -77,11 +66,11 @@ def __init__(self, spec, historical_data): else: self.name = "additional_data" self.data = None - self._data_dict = dict() + self._data_dict = {} self.create_horizon(spec, historical_data) def create_horizon(self, spec, historical_data): - logger.debug(f"No additional data provided. Constructing horizon.") + logger.debug("No additional data provided. Constructing horizon.") future_dates = pd.Series( pd.date_range( start=historical_data.get_max_time(), @@ -109,6 +98,7 @@ def create_horizon(self, spec, historical_data): self.additional_regressors = [] def _ingest_data(self, spec): + _spec = spec self.additional_regressors = list(self.data.columns) if not self.additional_regressors: logger.warn( @@ -146,12 +136,11 @@ def _load_data(self, spec): self.historical_data = HistoricalData(spec) self.additional_data = AdditionalData(spec, self.historical_data) - if spec.generate_explanations: - if spec.additional_data is None: - logger.warn( - f"Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." - ) - spec.generate_explanations = False + if spec.generate_explanations and spec.additional_data is None: + logger.warn( + "Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." 
+ ) + spec.generate_explanations = False def get_all_data_long(self, include_horizon=True): how = "outer" if include_horizon else "left" @@ -182,7 +171,7 @@ def get_data_multi_indexed(self): ) def get_data_by_series(self, include_horizon=True): - total_dict = dict() + total_dict = {} hist_data = self.historical_data.get_dict_by_series() add_data = self.additional_data.get_dict_by_series() how = "outer" if include_horizon else "left" @@ -200,10 +189,10 @@ def get_data_at_series(self, s_id, include_horizon=True): all_data = self.get_data_by_series(include_horizon=include_horizon) try: return all_data[s_id] - except: + except Exception as e: raise InvalidParameterError( f"Unable to retrieve series id: {s_id} from data. Available series ids are: {self.list_series_ids()}" - ) + ) from e def get_horizon_at_series(self, s_id): return self.get_data_at_series(s_id)[-self._horizon :] @@ -234,7 +223,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except: + except Exception: pass return series_ids @@ -269,7 +258,7 @@ def __init__( target_column: str the name of the original target column dt_column: the name of the original datetime column """ - self.series_id_map = dict() + self.series_id_map = {} self._set_ci_column_names(confidence_interval_width) self.horizon = horizon self.target_column_name = target_column @@ -281,7 +270,7 @@ def add_series_id( forecast: pd.DataFrame, overwrite: bool = False, ): - if not overwrite and series_id in self.series_id_map.keys(): + if not overwrite and series_id in self.series_id_map: raise ValueError( f"Attempting to update ForecastOutput for series_id {series_id} when this already exists. Set overwrite to True." ) @@ -321,15 +310,15 @@ def populate_series_output( """ try: output_i = self.series_id_map[series_id] - except KeyError: + except KeyError as e: raise ValueError( f"Attempting to update output for series: {series_id}, however no series output has been initialized." - ) + ) from e if (output_i.shape[0] - self.horizon) == len(fit_val): - output_i["fitted_value"].iloc[ - : -self.horizon - ] = fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon + output_i["fitted_value"].iloc[: -self.horizon] = ( + fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon + ) elif (output_i.shape[0] - self.horizon) > len(fit_val): logger.debug( f"Fitted Values were only generated on a subset ({len(fit_val)}/{(output_i.shape[0] - self.horizon)}) of the data for Series: {series_id}." @@ -378,7 +367,7 @@ def get_horizon_long(self): def get_forecast(self, series_id): try: return self.series_id_map[series_id] - except KeyError as ke: + except KeyError: logger.debug( f"No Forecast found for series_id: {series_id}. Returning empty DataFrame." ) @@ -389,7 +378,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except: + except Exception: pass return series_ids diff --git a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py index 5af3e304b..9907a26e7 100644 --- a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +++ b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py @@ -2,6 +2,7 @@ # Copyright (c) 2024 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import traceback import pandas as pd @@ -192,6 +193,8 @@ def _generate_report(self): import report_creator as rc from utilsforecast.plotting import plot_series + logging.getLogger("root").setLevel(logging.WARNING) + # Section 1: Forecast Overview sec1_text = rc.Block( rc.Heading("Forecast Overview", level=2), diff --git a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py index 769b3948a..08afa092a 100644 --- a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py @@ -1,45 +1,35 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + import numpy as np import optuna import pandas as pd -from joblib import Parallel, delayed from torch import Tensor -from torchmetrics.regression import ( - MeanAbsoluteError, - MeanAbsolutePercentageError, - MeanSquaredError, - R2Score, - SymmetricMeanAbsolutePercentageError, -) from ads.common.decorator.runtime_dependency import ( OptionalDependency, runtime_dependency, ) from ads.opctl import logger - -from ..const import DEFAULT_TRIALS, ForecastOutputColumns, SupportedModels -from ads.opctl.operator.lowcode.forecast.utils import ( - load_pkl, - write_pkl, - _select_plot_list, - _label_encode_dataframe, -) from ads.opctl.operator.lowcode.common.utils import ( disable_print, enable_print, - seconds_to_datetime, ) -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import ( + _select_plot_list, + load_pkl, + write_pkl, +) + +from ..const import DEFAULT_TRIALS, SupportedModels from ..operator_config import ForecastOperatorConfig +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -import traceback - # def _get_np_metrics_dict(selected_metric): # metric_translation = { @@ -62,7 +52,7 @@ object="NeuralProphet", install_from=OptionalDependency.FORECAST, ) -def _fit_model(data, params, additional_regressors, select_metric): +def _fit_model(data, params, additional_regressors): from neuralprophet import NeuralProphet, set_log_level if logger.level > 10: @@ -70,13 +60,12 @@ def _fit_model(data, params, additional_regressors, select_metric): disable_print() m = NeuralProphet(**params) - # m.metrics = _get_np_metrics_dict(select_metric) for add_reg in additional_regressors: m = m.add_future_regressor(name=add_reg) m.fit(df=data) - accepted_regressors_config = m.config_regressors or dict() + accepted_regressors_config = m.config_regressors or {} if hasattr(accepted_regressors_config, "regressors"): - accepted_regressors_config = accepted_regressors_config.regressors or dict() + accepted_regressors_config = accepted_regressors_config.regressors or {} enable_print() return m, list(accepted_regressors_config.keys()) @@ -97,11 +86,12 @@ def _load_model(self): self.loaded_trainers = load_pkl( self.spec.previous_output_dir + "/trainer.pkl" ) - except: - logger.debug("model.pkl/trainer.pkl is not present") + except Exception as e: + logger.debug(f"model.pkl/trainer.pkl is not present. 
Error message: {e}") def set_kwargs(self): # Extract the Confidence Interval Width and convert to prophet's equivalent - interval_width + model_kwargs = self.spec.model_kwargs if self.spec.confidence_interval_width is None: quantiles = model_kwargs.get("quantiles", [0.05, 0.95]) self.spec.confidence_interval_width = float(quantiles[1]) - float( @@ -110,8 +100,6 @@ def set_kwargs(self): else: boundaries = round((1 - self.spec.confidence_interval_width) / 2, 2) quantiles = [boundaries, self.spec.confidence_interval_width + boundaries] - - model_kwargs = self.spec.model_kwargs model_kwargs["quantiles"] = quantiles return model_kwargs @@ -124,12 +112,10 @@ def _train_model(self, i, s_id, df, model_kwargs): if self.loaded_models is not None and s_id in self.loaded_models: model = self.loaded_models[s_id] - accepted_regressors_config = ( - model.config_regressors.regressors or dict() - ) + accepted_regressors_config = model.config_regressors.regressors or {} if hasattr(accepted_regressors_config, "regressors"): accepted_regressors_config = ( - accepted_regressors_config.regressors or dict() + accepted_regressors_config.regressors or {} ) self.accepted_regressors[s_id] = list(accepted_regressors_config.keys()) if self.loaded_trainers is not None and s_id in self.loaded_trainers: @@ -143,8 +129,6 @@ def _train_model(self, i, s_id, df, model_kwargs): data=data_i, params=model_kwargs, additional_regressors=self.additional_regressors, - select_metric=None, - # select_metric=self.spec.metric, ) logger.debug( @@ -205,7 +189,6 @@ def _train_model(self, i, s_id, df, model_kwargs): "config_normalization": model.config_normalization, "config_missing": model.config_missing, "config_model": model.config_model, - "config_normalization": model.config_normalization, "data_freq": model.data_freq, "fitted": model.fitted, "data_params": model.data_params, @@ -220,19 +203,19 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(traceback.format_exc()) raise e def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() - self.trainers = dict() - self.outputs = dict() - self.errors_dict = dict() - self.explanations_info = dict() - self.accepted_regressors = dict() + self.models = {} + self.trainers = {} + self.outputs = {} + self.errors_dict = {} + self.explanations_info = {} + self.accepted_regressors = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -282,7 +265,6 @@ def objective(trial): data=df_train, params=params, additional_regressors=self.additional_regressors, - select_metric=self.spec.metric, ) df_test = df_test[["y", "ds"] + accepted_regressors] @@ -326,6 +308,8 @@ def objective(trial): def _generate_report(self): import report_creator as rc + logging.getLogger("root").setLevel(logging.WARNING) + series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -371,7 +355,7 @@ def _generate_report(self): sec5_text = rc.Heading("Neural Prophet Model Parameters", level=2) model_states = [] - for i, (s_id, m) in enumerate(self.models.items()): + for s_id, m in self.models.items(): model_states.append( pd.Series( m.state_dict(), @@ -449,7 +433,7 @@ def _save_model(self, output_dir, storage_options): ) def explain_model(self): - self.local_explanation = dict() + 
self.local_explanation = {} global_expl = [] rename_cols = { f"future_regressor_{col}": col diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index 40c842911..fc70b6c11 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -1,17 +1,23 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + +import matplotlib as mpl import numpy as np import optuna import pandas as pd -import logging from joblib import Parallel, delayed -from ads.common.decorator.runtime_dependency import runtime_dependency + from ads.opctl import logger +from ads.opctl.operator.lowcode.common.utils import set_log_level from ads.opctl.operator.lowcode.forecast.operator_config import ForecastOperatorConfig +from ads.opctl.operator.lowcode.forecast.utils import ( + _select_plot_list, +) from ..const import ( DEFAULT_TRIALS, @@ -19,23 +25,14 @@ ForecastOutputColumns, SupportedModels, ) -from ads.opctl.operator.lowcode.forecast.utils import ( - _select_plot_list, - _label_encode_dataframe, -) -from ads.opctl.operator.lowcode.common.utils import set_log_level from .base_model import ForecastOperatorBaseModel -from ..operator_config import ForecastOperatorConfig from .forecast_datasets import ForecastDatasets, ForecastOutput -import traceback -import matplotlib as mpl - try: set_log_level("prophet", logger.level) set_log_level("cmdstanpy", logger.level) mpl.rcParams["figure.max_open_warning"] = 100 -except: +except Exception: pass @@ -73,9 +70,6 @@ def set_kwargs(self): def _train_model(self, i, series_id, df, model_kwargs): try: - from prophet import Prophet - from prophet.diagnostics import cross_validation, performance_metrics - self.forecast_output.init_series_output( series_id=series_id, data_at_series=df ) @@ -130,15 +124,15 @@ def _train_model(self, i, series_id, df, model_kwargs): self.errors_dict[series_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. 
Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() - self.outputs = dict() + self.models = {} + self.outputs = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -249,6 +243,8 @@ def _generate_report(self): import report_creator as rc from prophet.plot import add_changepoints_to_plot + logging.getLogger("root").setLevel(logging.WARNING) + series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -351,7 +347,6 @@ def _generate_report(self): # Append the global explanation text and section to the "all_sections" list all_sections = all_sections + [ global_explanation_section, - local_explanation_text, local_explanation_section, ] except Exception as e: diff --git a/ads/opctl/operator/lowcode/forecast/utils.py b/ads/opctl/operator/lowcode/forecast/utils.py index 76f554ff8..e3a88d7b7 100644 --- a/ads/opctl/operator/lowcode/forecast/utils.py +++ b/ads/opctl/operator/lowcode/forecast/utils.py @@ -1,41 +1,41 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os -import sys -from typing import List +from typing import Set +import cloudpickle import fsspec import numpy as np import pandas as pd -import cloudpickle -import plotly.express as px +import report_creator as rc from plotly import graph_objects as go +from scipy.stats import linregress from sklearn.metrics import ( explained_variance_score, mean_absolute_percentage_error, mean_squared_error, + r2_score, ) -from scipy.stats import linregress -from sklearn.metrics import r2_score - from ads.common.object_storage_details import ObjectStorageDetails from ads.dataset.label_encoder import DataFrameLabelEncoder from ads.opctl import logger - -from .const import SupportedMetrics, SupportedModels, RENDER_LIMIT -from .errors import ForecastInputDataError, ForecastSchemaYamlError -from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig -from ads.opctl.operator.lowcode.common.utils import merge_category_columns from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns -import report_creator as rc +from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ( + ForecastOutput, + TestData, +) + +from .const import RENDER_LIMIT, SupportedMetrics +logging.getLogger("root").setLevel(logging.WARNING) -def _label_encode_dataframe(df, no_encode=set()): + +def _label_encode_dataframe(df, no_encode: Set = None): df_to_encode = df[list(set(df.columns) - no_encode)] le = DataFrameLabelEncoder().fit(df_to_encode) return le, le.transform(df) @@ -54,15 +54,14 @@ def smape(actual, predicted) -> float: denominator[zero_mask] = 1 numerator = np.abs(actual - predicted) - default_output = np.ones_like(numerator) * np.inf abs_error = np.divide(numerator, denominator) return round(np.mean(abs_error) * 100, 2) def _build_metrics_per_horizon( - test_data: "TestData", - output: "ForecastOutput", + test_data: TestData, + output: ForecastOutput, ) -> pd.DataFrame: """ Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE for each horizon @@ -172,7 +171,7 @@ def _build_metrics_per_horizon( def load_pkl(filepath): - storage_options = dict() + storage_options = {} if 
ObjectStorageDetails.is_oci_path(filepath): storage_options = default_signer() @@ -194,13 +193,13 @@ def write_pkl(obj, filename, output_dir, storage_options): def _build_metrics_df(y_true, y_pred, series_id): if len(y_true) == 0 or len(y_pred) == 0: return pd.DataFrame() - metrics = dict() + metrics = {} metrics["sMAPE"] = smape(actual=y_true, predicted=y_pred) metrics["MAPE"] = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred) metrics["RMSE"] = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred)) try: metrics["r2"] = linregress(y_true, y_pred).rvalue ** 2 - except: + except Exception: metrics["r2"] = r2_score(y_true=y_true, y_pred=y_pred) metrics["Explained Variance"] = explained_variance_score( y_true=y_true, y_pred=y_pred @@ -208,16 +207,13 @@ def _build_metrics_df(y_true, y_pred, series_id): return pd.DataFrame.from_dict(metrics, orient="index", columns=[series_id]) -def evaluate_train_metrics(output, metrics_col_name=None): +def evaluate_train_metrics(output): """ Training metrics Parameters: output: ForecastOutputs - metrics_col_name: str - Only passed in if the series column was created artifically. - When passed in, replaces s_id as the column name in the metrics table """ total_metrics = pd.DataFrame() for s_id in output.list_series_ids(): @@ -262,20 +258,21 @@ def _select_plot_list(fn, series_ids): def _add_unit(num, unit): return f"{num} {unit}" + def get_auto_select_plot(backtest_results): fig = go.Figure() columns = backtest_results.columns.tolist() back_test_column = "backtest" columns.remove(back_test_column) - for i, column in enumerate(columns): - color = 0 #int(i * 255 / len(columns)) + for column in columns: fig.add_trace( go.Scatter( - x=backtest_results[back_test_column], - y=backtest_results[column], - mode="lines", - name=column, - )) + x=backtest_results[back_test_column], + y=backtest_results[column], + mode="lines", + name=column, + ) + ) return rc.Widget(fig) @@ -383,6 +380,7 @@ def plot_forecast_plotly(s_id): return _select_plot_list(plot_forecast_plotly, forecast_output.list_series_ids()) + def convert_target(target: str, target_col: str): """ Removes the target_column that got appended to target. diff --git a/ads/opctl/operator/lowcode/pii/model/report.py b/ads/opctl/operator/lowcode/pii/model/report.py index 50e0fe579..70ef098d8 100644 --- a/ads/opctl/operator/lowcode/pii/model/report.py +++ b/ads/opctl/operator/lowcode/pii/model/report.py @@ -1,10 +1,10 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import random import tempfile @@ -40,11 +40,13 @@ try: import report_creator as rc -except ImportError: +except ImportError as e: raise ModuleNotFoundError( f"`report-creator` module was not found. Please run " f"`pip install {OptionalDependency.PII}`." - ) + ) from e + +logging.getLogger("root").setLevel(logging.WARNING) @dataclass(repr=True) @@ -139,13 +141,13 @@ def make_model_card(model_name="", readme_path=""): fig = go.Figure( data=[ go.Table( - header=dict(values=list(df.columns)), - cells=dict(values=[df.Metrics, df.Values]), + header={"Columns": df.columns}, + cells={"Metrics": df.Metrics, "Values": df.Values}, ) ] ) eval_res_tb = rc.Widget(data=fig, caption="Evaluation Results") - except: + except Exception: eval_res_tb = rc.Text("-") logger.warning( "The given readme.md doesn't have correct template for Evaluation Results." 
@@ -321,7 +323,9 @@ def make_view(self):
         self.report_sections = [title_text, report_description, time_proceed, structure]
         return self
 
-    def save_report(self, report_sections=None, report_uri=None, storage_options={}):
+    def save_report(
+        self, report_sections=None, report_uri=None, storage_options: Dict = None
+    ):
         with tempfile.TemporaryDirectory() as temp_dir:
             report_local_path = os.path.join(temp_dir, "___report.html")
             disable_print()
diff --git a/ads/opctl/operator/lowcode/recommender/model/base_model.py b/ads/opctl/operator/lowcode/recommender/model/base_model.py
index f317b19b0..c345f84a7 100644
--- a/ads/opctl/operator/lowcode/recommender/model/base_model.py
+++ b/ads/opctl/operator/lowcode/recommender/model/base_model.py
@@ -1,39 +1,43 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*--
 
 # Copyright (c) 2023, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
+import logging
 import os
 import tempfile
 import time
 from abc import ABC, abstractmethod
-from typing import Tuple, Dict
+from typing import Dict, Tuple
 
 import fsspec
 import pandas as pd
 import report_creator as rc
+from plotly import graph_objects as go
 
 from ads.common.object_storage_details import ObjectStorageDetails
 from ads.opctl import logger
-from ads.opctl.operator.lowcode.common.utils import default_signer
 from ads.opctl.operator.lowcode.common.utils import (
-    human_time_friendly,
-    enable_print,
+    default_signer,
     disable_print,
+    enable_print,
+    human_time_friendly,
     write_data,
 )
+
+from ..operator_config import RecommenderOperatorConfig
 from .factory import SupportedModels
 from .recommender_dataset import RecommenderDatasets
-from ..operator_config import RecommenderOperatorConfig
-from plotly import graph_objects as go
-import matplotlib.pyplot as plt
+
+logging.getLogger("root").setLevel(logging.WARNING)
 
 
 class RecommenderOperatorBaseModel(ABC):
     """The base class for the recommender detection operator models."""
 
-    def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets):
+    def __init__(
+        self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets
+    ):
         self.config = config
         self.spec = self.config.spec
         self.datasets = datasets
@@ -71,7 +75,7 @@ def generate_report(self):
                 rc.Metric(
                     heading="Num items",
                     value=len(self.datasets.items),
-                )
+                ),
             ),
         )
 
@@ -83,62 +87,67 @@ def generate_report(self):
         user_rating_counts = self.datasets.interactions[user_col].value_counts()
         fig_user = go.Figure(data=[go.Histogram(x=user_rating_counts, nbinsx=100)])
         fig_user.update_layout(
-            title=f'Distribution of the number of interactions by {user_col}',
-            xaxis_title=f'Number of {interaction_col}',
-            yaxis_title=f'Number of {user_col}',
-            bargap=0.2
+            title=f"Distribution of the number of interactions by {user_col}",
+            xaxis_title=f"Number of {interaction_col}",
+            yaxis_title=f"Number of {user_col}",
+            bargap=0.2,
         )
 
         item_title = rc.Heading("Item Statistics", level=2)
         item_rating_counts = self.datasets.interactions[item_col].value_counts()
         fig_item = go.Figure(data=[go.Histogram(x=item_rating_counts, nbinsx=100)])
         fig_item.update_layout(
-            title=f'Distribution of the number of interactions by {item_col}',
-            xaxis_title=f'Number of {interaction_col}',
-            yaxis_title=f'Number of {item_col}',
-            bargap=0.2
+            title=f"Distribution of the number of interactions by {item_col}",
+            xaxis_title=f"Number of {interaction_col}",
+            yaxis_title=f"Number of {item_col}",
+            bargap=0.2,
         )
 
         result_heatmap_title = rc.Heading("Sample Recommendations", level=2)
         sample_items = result_df[item_col].head(100).index
         filtered_df = result_df[result_df[item_col].isin(sample_items)]
-        data = filtered_df.pivot(index=user_col, columns=item_col, values=interaction_col)
-        fig = go.Figure(data=go.Heatmap(
-            z=data.values,
-            x=data.columns,
-            y=data.index,
-            colorscale='Viridis'
-        ))
+        data = filtered_df.pivot(
+            index=user_col, columns=item_col, values=interaction_col
+        )
+        fig = go.Figure(
+            data=go.Heatmap(
+                z=data.values, x=data.columns, y=data.index, colorscale="Viridis"
+            )
+        )
         fig.update_layout(
-            title='Recommendation heatmap of User-Item Interactions (sample)',
+            title="Recommendation heatmap of User-Item Interactions (sample)",
             width=1500,
             height=800,
             xaxis_title=item_col,
             yaxis_title=user_col,
-            coloraxis_colorbar=dict(title=interaction_col)
+            coloraxis_colorbar={"title": interaction_col},
         )
 
-        plots = [user_title, rc.Widget(fig_user),
-                 item_title, rc.Widget(fig_item),
-                 result_heatmap_title, rc.Widget(fig)]
+        plots = [
+            user_title,
+            rc.Widget(fig_user),
+            item_title,
+            rc.Widget(fig_item),
+            result_heatmap_title,
+            rc.Widget(fig),
+        ]
 
         test_metrics_sections = [rc.DataTable(pd.DataFrame(metrics, index=[0]))]
         yaml_appendix_title = rc.Heading("Reference: YAML File", level=2)
         yaml_appendix = rc.Yaml(self.config.to_dict())
         report_sections = (
-                [summary]
-                + plots
-                + test_metrics_sections
-                + other_sections
-                + [yaml_appendix_title, yaml_appendix]
+            [summary]
+            + plots
+            + test_metrics_sections
+            + other_sections
+            + [yaml_appendix_title, yaml_appendix]
         )
 
         # save the report and result CSV
-        self._save_report(
-            report_sections=report_sections,
-            result_df=result_df
-        )
+        self._save_report(report_sections=report_sections, result_df=result_df)
 
+    @abstractmethod
     def _evaluation_metrics(self):
         pass
 
+    @abstractmethod
     def _test_data_evaluate_metrics(self):
         pass
 
@@ -150,7 +159,7 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame):
         if ObjectStorageDetails.is_oci_path(unique_output_dir):
             storage_options = default_signer()
         else:
-            storage_options = dict()
+            storage_options = {}
 
         # report-creator html report
         if self.spec.generate_report:
@@ -161,19 +170,23 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame):
             report.save(rc.Block(*report_sections), report_local_path)
             enable_print()
 
-            report_path = os.path.join(unique_output_dir, self.spec.report_filename)
+            report_path = os.path.join(
+                unique_output_dir, self.spec.report_filename
+            )
             with open(report_local_path) as f1:
                 with fsspec.open(
-                        report_path,
-                        "w",
-                        **storage_options,
+                    report_path,
+                    "w",
+                    **storage_options,
                 ) as f2:
                     f2.write(f1.read())
 
         # recommender csv report
         write_data(
             data=result_df,
-            filename=os.path.join(unique_output_dir, self.spec.recommendations_filename),
+            filename=os.path.join(
+                unique_output_dir, self.spec.recommendations_filename
+            ),
             format="csv",
             storage_options=storage_options,
         )
diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py
index 968170986..a92a51fda 100644
--- a/ads/opctl/operator/lowcode/recommender/model/svd.py
+++ b/ads/opctl/operator/lowcode/recommender/model/svd.py
@@ -1,28 +1,30 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*--
-from typing import Tuple, Dict, Any
-
 # Copyright (c) 2023, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
+import logging
+from typing import Dict, Tuple
 
 import pandas as pd
+import report_creator as rc
 from pandas import DataFrame
+from surprise import SVD, Dataset, Reader
+from surprise.accuracy import mae, rmse
+from surprise.model_selection import train_test_split
 
-from .recommender_dataset import RecommenderDatasets
+from ..constant import SupportedMetrics
 from ..operator_config import RecommenderOperatorConfig
 from .factory import RecommenderOperatorBaseModel
-from surprise import Dataset, Reader
-from surprise.model_selection import train_test_split
-from surprise import SVD
-from surprise.accuracy import rmse, mae
-import report_creator as rc
-from ..constant import SupportedMetrics
+from .recommender_dataset import RecommenderDatasets
+
+logging.getLogger("root").setLevel(logging.WARNING)
 
 
 class SVDOperatorModel(RecommenderOperatorBaseModel):
     """Class representing scikit surprise SVD operator model."""
 
-    def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets):
+    def __init__(
+        self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets
+    ):
         super().__init__(config, datasets)
         self.interactions = datasets.interactions
         self.users = datasets.users
@@ -35,8 +37,12 @@ def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatas
 
     def _get_recommendations(self, user_id, n):
         all_item_ids = self.items[self.item_id].unique()
-        rated_items = self.interactions[self.interactions[self.user_id] == user_id][self.item_id]
-        unrated_items = [item_id for item_id in all_item_ids if item_id not in rated_items.values]
+        rated_items = self.interactions[self.interactions[self.user_id] == user_id][
+            self.item_id
+        ]
+        unrated_items = [
+            item_id for item_id in all_item_ids if item_id not in rated_items.values
+        ]
         predictions = [self.algo.predict(user_id, item_id) for item_id in unrated_items]
         predictions.sort(key=lambda x: x.est, reverse=True)
         top_n_recommendations = predictions[:n]
@@ -46,7 +52,10 @@ def _build_model(self) -> Tuple[DataFrame, Dict]:
         min_rating = self.interactions[self.interaction_column].min()
         max_rating = self.interactions[self.interaction_column].max()
         reader = Reader(rating_scale=(min_rating, max_rating))
-        data = Dataset.load_from_df(self.interactions[[self.user_id, self.item_id, self.interaction_column]], reader)
+        data = Dataset.load_from_df(
+            self.interactions[[self.user_id, self.item_id, self.interaction_column]],
+            reader,
+        )
         trainset, testset = train_test_split(data, test_size=self.test_size)
         self.algo.fit(trainset)
         predictions = self.algo.test(testset)
@@ -58,11 +67,13 @@ def _build_model(self) -> Tuple[DataFrame, Dict]:
         for user_id in self.users[self.user_id]:
             recommendations = self._get_recommendations(user_id, n=self.spec.top_k)
             for item_id, est_rating in recommendations:
-                all_recommendations.append({
-                    self.user_id: user_id,
-                    self.item_id: item_id,
-                    self.interaction_column: est_rating
-                })
+                all_recommendations.append(
+                    {
+                        self.user_id: user_id,
+                        self.item_id: item_id,
+                        self.interaction_column: est_rating,
+                    }
+                )
         recommendations_df = pd.DataFrame(all_recommendations)
         return recommendations_df, metric
 
@@ -72,17 +83,18 @@ def _generate_report(self):
         decompose a user-item interaction matrix into three constituent matrices.
         These matrices capture the latent factors that explain the observed interactions.
         """
-        new_user_recommendations = self._get_recommendations("__new_user__", self.spec.top_k)
+        new_user_recommendations = self._get_recommendations(
+            "__new_user__", self.spec.top_k
+        )
         new_recommendations = []
         for item_id, est_rating in new_user_recommendations:
-            new_recommendations.append({
-                self.user_id: "__new_user__",
-                self.item_id: item_id,
-                self.interaction_column: est_rating
-            })
+            new_recommendations.append(
+                {
+                    self.user_id: "__new_user__",
+                    self.item_id: item_id,
+                    self.interaction_column: est_rating,
+                }
+            )
         title = rc.Heading("Recommendations for new users", level=2)
         other_sections = [title, rc.DataTable(new_recommendations)]
-        return (
-            model_description,
-            other_sections
-        )
+        return (model_description, other_sections)