diff --git a/.gitignore b/.gitignore index 1d857879d..ac478a955 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,6 @@ logs/ # vim *.swp + +# Python Wheel +*.whl diff --git a/MANIFEST.in b/MANIFEST.in index 00307141d..b7e2d86ea 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -24,3 +24,7 @@ exclude build/lib/notebooks/** exclude benchmark/** include ads/ads include ads/model/common/*.* +include ads/operator/**/*.md +include ads/operator/**/*.yaml +include ads/operator/**/*.whl +include ads/operator/**/MLoperator diff --git a/README.md b/README.md index e56452084..fdc9fe655 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,14 @@ You have various options when installing ADS. python3 -m pip install oracle-ads ``` +### Installing OCI AI Operators + +To use the AI Forecast Operator, install the "forecast" dependencies using the following command: + +```bash + python3 -m pip install 'oracle_ads[forecast]==2.9.0' +``` + ### Installing extras libraries To work with gradient boosting models, install the `boosted` module. This module includes XGBoost and LightGBM model classes. diff --git a/ads/cli.py b/ads/cli.py index 2025790c4..7b003e52d 100644 --- a/ads/cli.py +++ b/ads/cli.py @@ -14,12 +14,12 @@ import ads.opctl.cli import ads.jobs.cli import ads.pipeline.cli - import os - import json + import ads.opctl.operator.cli except Exception as ex: print( "Please run `pip install oracle-ads[opctl]` to install " - "the required dependencies for ADS CLI." + "the required dependencies for ADS CLI. \n" + f"{str(ex)}" ) logger.debug(ex) logger.debug(traceback.format_exc()) @@ -44,6 +44,7 @@ def cli(): cli.add_command(ads.opctl.cli.commands) cli.add_command(ads.jobs.cli.commands) cli.add_command(ads.pipeline.cli.commands) +cli.add_command(ads.opctl.operator.cli.commands) if __name__ == "__main__": diff --git a/ads/common/auth.py b/ads/common/auth.py index 5135459a4..453967a20 100644 --- a/ads/common/auth.py +++ b/ads/common/auth.py @@ -629,8 +629,8 @@ def create_signer(self) -> Dict: user=configuration["user"], fingerprint=configuration["fingerprint"], private_key_file_location=configuration.get("key_file"), - pass_phrase= configuration.get("pass_phrase"), - private_key_content=configuration.get("key_content") + pass_phrase=configuration.get("pass_phrase"), + private_key_content=configuration.get("key_content"), ), "client_kwargs": self.client_kwargs, } @@ -750,21 +750,10 @@ class SecurityToken(AuthSignerGenerator): a given user - it requires that user's private key and security token. It prepares extra arguments necessary for creating clients for variety of OCI services. 
""" - SECURITY_TOKEN_GENERIC_HEADERS = [ - "date", - "(request-target)", - "host" - ] - SECURITY_TOKEN_BODY_HEADERS = [ - "content-length", - "content-type", - "x-content-sha256" - ] - SECURITY_TOKEN_REQUIRED = [ - "security_token_file", - "key_file", - "region" - ] + + SECURITY_TOKEN_GENERIC_HEADERS = ["date", "(request-target)", "host"] + SECURITY_TOKEN_BODY_HEADERS = ["content-length", "content-type", "x-content-sha256"] + SECURITY_TOKEN_REQUIRED = ["security_token_file", "key_file", "region"] def __init__(self, args: Optional[Dict] = None): """ @@ -831,12 +820,18 @@ def create_signer(self) -> Dict: return { "config": configuration, "signer": oci.auth.signers.SecurityTokenSigner( - token=self._read_security_token_file(configuration.get("security_token_file")), + token=self._read_security_token_file( + configuration.get("security_token_file") + ), private_key=oci.signer.load_private_key_from_file( configuration.get("key_file"), configuration.get("pass_phrase") ), - generic_headers=configuration.get("generic_headers", self.SECURITY_TOKEN_GENERIC_HEADERS), - body_headers=configuration.get("body_headers", self.SECURITY_TOKEN_BODY_HEADERS) + generic_headers=configuration.get( + "generic_headers", self.SECURITY_TOKEN_GENERIC_HEADERS + ), + body_headers=configuration.get( + "body_headers", self.SECURITY_TOKEN_BODY_HEADERS + ), ), "client_kwargs": self.client_kwargs, } @@ -849,30 +844,37 @@ def _validate_and_refresh_token(self, configuration: Dict[str, Any]): configuration: Dict Security token configuration. """ - security_token = self._read_security_token_file(configuration.get("security_token_file")) - security_token_container = oci.auth.security_token_container.SecurityTokenContainer( - session_key_supplier=None, - security_token=security_token + security_token = self._read_security_token_file( + configuration.get("security_token_file") + ) + security_token_container = ( + oci.auth.security_token_container.SecurityTokenContainer( + session_key_supplier=None, security_token=security_token + ) ) if not security_token_container.valid(): raise SecurityTokenError( "Security token has expired. Call `oci session authenticate` to generate new session." ) - + time_now = int(time.time()) time_expired = security_token_container.get_jwt()["exp"] if time_expired - time_now < SECURITY_TOKEN_LEFT_TIME: if not self.oci_config_location: - logger.warning("Can not auto-refresh token. Specify parameter `oci_config_location` through ads.set_auth() or ads.auth.create_signer().") + logger.warning( + "Can not auto-refresh token. Specify parameter `oci_config_location` through ads.set_auth() or ads.auth.create_signer()." + ) else: - result = os.system(f"oci session refresh --config-file {self.oci_config_location} --profile {self.oci_key_profile}") + result = os.system( + f"oci session refresh --config-file {self.oci_config_location} --profile {self.oci_key_profile}" + ) if result == 1: logger.warning( "Some error happened during auto-refreshing the token. Continue using the current one that's expiring in less than {SECURITY_TOKEN_LEFT_TIME} seconds." "Please follow steps in https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/clitoken.htm to renew token." ) - + date_time = datetime.fromtimestamp(time_expired).strftime("%Y-%m-%d %H:%M:%S") logger.info(f"Session is valid until {date_time}.") @@ -894,7 +896,7 @@ def _read_security_token_file(self, security_token_file: str) -> str: raise ValueError("Invalid `security_token_file`. 
Specify a valid path.") try: token = None - with open(expanded_path, 'r') as f: + with open(expanded_path, "r") as f: token = f.read() return token except: @@ -903,7 +905,7 @@ def _read_security_token_file(self, security_token_file: str) -> str: class AuthFactory: """ - AuthFactory class which contains list of registered signers and alllows to register new signers. + AuthFactory class which contains list of registered signers and allows to register new signers. Check documentation for more signers: https://docs.oracle.com/en-us/iaas/tools/python/latest/api/signing.html. Current signers: diff --git a/ads/common/decorator/runtime_dependency.py b/ads/common/decorator/runtime_dependency.py index aaf00dfe7..9010b6c65 100644 --- a/ads/common/decorator/runtime_dependency.py +++ b/ads/common/decorator/runtime_dependency.py @@ -64,6 +64,8 @@ class OptionalDependency: OPTUNA = "oracle-ads[optuna]" SPARK = "oracle-ads[spark]" HUGGINGFACE = "oracle-ads[huggingface]" + FORECAST = "oracle-ads[forecast]" + PII = "oracle-ads[pii]" FEATURE_STORE = "oracle-ads[feature-store]" GRAPHVIZ = "oracle-ads[graphviz]" MLM_INSIGHTS = "oracle-ads[mlm_insights]" diff --git a/ads/common/object_storage_details.py b/ads/common/object_storage_details.py index 9becf72cb..63642b0f2 100644 --- a/ads/common/object_storage_details.py +++ b/ads/common/object_storage_details.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import json @@ -15,7 +15,7 @@ from ads.common import oci_client -class InvalidObjectStoragePath(Exception): # pragma: no cover +class InvalidObjectStoragePath(Exception): # pragma: no cover """Invalid Object Storage Path.""" pass @@ -137,4 +137,4 @@ def is_oci_path(uri: str = None) -> bool: """ if not uri: return False - return uri.startswith("oci://") + return uri.lower().startswith("oci://") diff --git a/ads/common/serializer.py b/ads/common/serializer.py index 08a45b450..6f041e74c 100644 --- a/ads/common/serializer.py +++ b/ads/common/serializer.py @@ -4,6 +4,12 @@ # Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +""" +This module provides a base class for serializable items, as well as methods for serializing and +deserializing objects to and from JSON and YAML formats. It also includes methods for reading and +writing serialized objects to and from files. +""" + import dataclasses import json from abc import ABC, abstractmethod @@ -271,11 +277,16 @@ def from_yaml( Parameters ---------- - yaml_string (string, optional): YAML string. Defaults to None. - uri (string, optional): URI location of file containing YAML string. Defaults to None. - loader (callable, optional): Custom YAML loader. Defaults to CLoader/SafeLoader. - kwargs (dict): keyword arguments to be passed into fsspec.open(). For OCI object storage, this should be config="path/to/.oci/config". - For other storage connections consider e.g. host, port, username, password, etc. + yaml_string (string, optional) + YAML string. Defaults to None. + uri (string, optional) + URI location of file containing YAML string. Defaults to None. + loader (callable, optional) + Custom YAML loader. Defaults to CLoader/SafeLoader. + kwargs (dict) + keyword arguments to be passed into fsspec.open(). 
+ For OCI object storage, this should be config="path/to/.oci/config". + For other storage connections consider e.g. host, port, username, password, etc. Raises ------ @@ -288,10 +299,10 @@ def from_yaml( Returns instance of the class """ if yaml_string: - return cls.from_dict(yaml.load(yaml_string, Loader=loader)) + return cls.from_dict(yaml.load(yaml_string, Loader=loader), **kwargs) if uri: yaml_dict = yaml.load(cls._read_from_file(uri=uri, **kwargs), Loader=loader) - return cls.from_dict(yaml_dict) + return cls.from_dict(yaml_dict, **kwargs) raise ValueError("Must provide either YAML string or URI location") @classmethod @@ -345,8 +356,8 @@ class DataClassSerializable(Serializable): Returns an instance of the class instantiated from the dictionary provided. """ - @staticmethod - def _validate_dict(obj_dict: Dict) -> bool: + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: """validate the dictionary. Parameters @@ -379,7 +390,7 @@ def to_dict(self, **kwargs) -> Dict: obj_dict = dataclasses.asdict(self) if "side_effect" in kwargs and kwargs["side_effect"]: obj_dict = DataClassSerializable._normalize_dict( - obj_dict=obj_dict, case=kwargs["side_effect"] + obj_dict=obj_dict, case=kwargs["side_effect"], recursively=True ) return obj_dict @@ -388,6 +399,8 @@ def from_dict( cls, obj_dict: dict, side_effect: Optional[SideEffect] = SideEffect.CONVERT_KEYS_TO_LOWER.value, + ignore_unknown: Optional[bool] = False, + **kwargs, ) -> "DataClassSerializable": """Returns an instance of the class instantiated by the dictionary provided. @@ -399,6 +412,8 @@ def from_dict( side effect to take on the dictionary. The side effect can be either convert the dictionary keys to "lower" (SideEffect.CONVERT_KEYS_TO_LOWER.value) or "upper"(SideEffect.CONVERT_KEYS_TO_UPPER.value) cases. + ignore_unknown: (bool, optional). Defaults to `False`. + Whether to ignore unknown fields or not. Returns ------- @@ -415,25 +430,36 @@ def from_dict( allowed_fields = set([f.name for f in dataclasses.fields(cls)]) wrong_fields = set(obj_dict.keys()) - allowed_fields - if wrong_fields: + if wrong_fields and not ignore_unknown: logger.warning( f"The class {cls.__name__} doesn't contain attributes: `{list(wrong_fields)}`. " "These fields will be ignored." ) - obj = cls(**{key: obj_dict[key] for key in allowed_fields}) + obj = cls(**{key: obj_dict.get(key) for key in allowed_fields}) for key, value in obj_dict.items(): - if isinstance(value, dict) and hasattr( - getattr(cls(), key).__class__, "from_dict" + if ( + key in allowed_fields + and isinstance(value, dict) + and hasattr(getattr(cls(), key).__class__, "from_dict") ): - attribute = getattr(cls(), key).__class__.from_dict(value) + attribute = getattr(cls(), key).__class__.from_dict( + value, + ignore_unknown=ignore_unknown, + side_effect=side_effect, + **kwargs, + ) setattr(obj, key, attribute) + return obj @staticmethod def _normalize_dict( - obj_dict: Dict, case: str = SideEffect.CONVERT_KEYS_TO_LOWER.value + obj_dict: Dict, + recursively: bool = False, + case: str = SideEffect.CONVERT_KEYS_TO_LOWER.value, + **kwargs, ) -> Dict: """lower all the keys. @@ -444,6 +470,8 @@ def _normalize_dict( case: (optional, str). Defaults to "lower". the case to normalized to. can be either "lower" (SideEffect.CONVERT_KEYS_TO_LOWER.value) or "upper"(SideEffect.CONVERT_KEYS_TO_UPPER.value). + recursively: (bool, optional). Defaults to `False`. + Whether to recursively normalize the dictionary or not. 
Returns ------- @@ -452,12 +480,16 @@ def _normalize_dict( """ normalized_obj_dict = {} for key, value in obj_dict.items(): - if isinstance(value, dict): + if recursively and isinstance(value, dict): value = DataClassSerializable._normalize_dict( - value, case=SideEffect.CONVERT_KEYS_TO_UPPER.value + value, case=case, recursively=recursively, **kwargs ) normalized_obj_dict = DataClassSerializable._normalize_key( - normalized_obj_dict=normalized_obj_dict, key=key, value=value, case=case + normalized_obj_dict=normalized_obj_dict, + key=key, + value=value, + case=case, + **kwargs, ) return normalized_obj_dict @@ -467,7 +499,7 @@ def _normalize_key( ) -> Dict: """helper function to normalize the key in the case specified and add it back to the dictionary. - Paramaters + Parameters ---------- normalized_obj_dict: (Dict) the dictionary to append the key and value to. @@ -476,17 +508,18 @@ def _normalize_key( value: (Union[str, Dict]) value to be added. case: (str) - the case to normalized to. can be either "lower" (SideEffect.CONVERT_KEYS_TO_LOWER.value) + The case to normalized to. can be either "lower" (SideEffect.CONVERT_KEYS_TO_LOWER.value) or "upper"(SideEffect.CONVERT_KEYS_TO_UPPER.value). Raises ------ - NotImplementedError: if case provided is not either "lower" or "upper". + NotImplementedError + Raised when `case` is not supported. Returns ------- Dict - normalized dictionary with the key and value added in the case specified. + Normalized dictionary with the key and value added in the case specified. """ if case.lower() == SideEffect.CONVERT_KEYS_TO_LOWER.value: normalized_obj_dict[key.lower()] = value diff --git a/ads/common/utils.py b/ads/common/utils.py index 7ec5000e4..0b35ce73f 100644 --- a/ads/common/utils.py +++ b/ads/common/utils.py @@ -1288,6 +1288,7 @@ def copy_file( auth: Optional[Dict] = None, chunk_size: Optional[int] = DEFAULT_BUFFER_SIZE, progressbar_description: Optional[str] = "Copying `{uri_src}` to `{uri_dst}`", + ignore_if_src_not_exists: Optional[bool] = False, ) -> str: """ Copies file from `uri_src` to `uri_dst`. @@ -1307,9 +1308,9 @@ def copy_file( The default authentication is set using `ads.set_auth` API. If you need to override the default, use the `ads.common.auth.api_keys` or `ads.common.auth.resource_principal` to create appropriate authentication signer and kwargs required to instantiate IdentityClient object. - chunk_size: (int, optinal). Defaults to `DEFAULT_BUFFER_SIZE` + chunk_size: (int, optional). Defaults to `DEFAULT_BUFFER_SIZE` How much data can be copied in one iteration. - progressbar_description: (str, optinal). Defaults to `"Copying `{uri_src}` to `{uri_dst}`"`. + progressbar_description: (str, optional). Defaults to `"Copying `{uri_src}` to `{uri_dst}`"`. Prefix for the progressbar. Returns @@ -1323,14 +1324,23 @@ def copy_file( If a destination file exists and `force_overwrite` set to `False`. 
""" chunk_size = chunk_size or DEFAULT_BUFFER_SIZE - auth = auth or authutil.default_signer() if not os.path.basename(uri_dst): uri_dst = os.path.join(uri_dst, os.path.basename(uri_src)) src_path_scheme = urlparse(uri_src).scheme or "file" + + auth = auth or {} + if src_path_scheme.lower() == "oci" and not auth: + auth = authutil.default_signer() + src_file_system = fsspec.filesystem(src_path_scheme, **auth) - file_size = src_file_system.info(uri_src)["size"] + if not fsspec.filesystem(src_path_scheme, **auth).exists(uri_src): + if ignore_if_src_not_exists: + return uri_dst + raise FileNotFoundError(f"The `{uri_src}` not exists.") + + file_size = src_file_system.info(uri_src)["size"] if not force_overwrite: dst_path_scheme = urlparse(uri_dst).scheme or "file" if fsspec.filesystem(dst_path_scheme, **auth).exists(uri_dst): @@ -1601,7 +1611,9 @@ def is_path_exists(uri: str, auth: Optional[Dict] = None) -> bool: bool: return True if the path exists. """ path_scheme = urlparse(uri).scheme or "file" - storage_options = auth or authutil.default_signer() + storage_options = {} + if path_scheme != "file": + storage_options = auth or authutil.default_signer() if fsspec.filesystem(path_scheme, **storage_options).exists(uri): return True return False diff --git a/ads/data_labeling/mixin/data_labeling.py b/ads/data_labeling/mixin/data_labeling.py index 56f85f3a9..e2c65eb20 100644 --- a/ads/data_labeling/mixin/data_labeling.py +++ b/ads/data_labeling/mixin/data_labeling.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ from typing import Dict, List @@ -188,6 +188,7 @@ def render_ner( content_column: str = "Content", annotations_column: str = "Annotations", limit: int = ROWS_TO_RENDER_LIMIT, + return_html: bool = False, ) -> None: """Renders NER dataset. Displays only first 50 rows. @@ -223,6 +224,8 @@ def render_ner( annotations_column=annotations_column, ) result_html = text_visualizer.render(items=items, options=options) + if return_html: + return result_html from IPython.core.display import HTML, Markdown, display diff --git a/ads/jobs/builders/infrastructure/dataflow.py b/ads/jobs/builders/infrastructure/dataflow.py index 3caae3372..894c40e7a 100644 --- a/ads/jobs/builders/infrastructure/dataflow.py +++ b/ads/jobs/builders/infrastructure/dataflow.py @@ -47,7 +47,7 @@ "Standard.E4", "Standard3", "Standard.A1", - "Standard2" + "Standard2", ] @@ -935,7 +935,7 @@ def create(self, runtime: DataFlowRuntime, **kwargs) -> "DataFlow": self.df_app = DataFlowApp(**payload).create() self.with_id(self.df_app.id) return self - + @staticmethod def _validate_shapes(payload: Dict): if "executor_shape" not in payload: @@ -955,9 +955,8 @@ def _validate_shapes(payload: Dict): raise ValueError( "`executor_shape` and `driver_shape` must be from the same shape family." ) - if ( - (not executor_shape.endswith("Flex") and executor_shape_config) - or (not driver_shape.endswith("Flex") and driver_shape_config) + if (not executor_shape.endswith("Flex") and executor_shape_config) or ( + not driver_shape.endswith("Flex") and driver_shape_config ): raise ValueError( "Shape config is not required for non flex shape from user end." 
@@ -1234,7 +1233,7 @@ def to_yaml(self, **kwargs) -> str: """ return yaml.safe_dump(self.to_dict(**kwargs)) - def init(self) -> "DataFlow": + def init(self, **kwargs) -> "DataFlow": """Initializes a starter specification for the DataFlow. Returns diff --git a/ads/jobs/builders/infrastructure/dsc_job.py b/ads/jobs/builders/infrastructure/dsc_job.py index d67876012..f190436da 100644 --- a/ads/jobs/builders/infrastructure/dsc_job.py +++ b/ads/jobs/builders/infrastructure/dsc_job.py @@ -266,10 +266,9 @@ def load_properties_from_env(self) -> None: # This will skip loading the default configure. nb_session = None if nb_session: - nb_config = ( - getattr(nb_session, "notebook_session_config_details", None) - or getattr(nb_session, "notebook_session_configuration_details", None) - ) + nb_config = getattr( + nb_session, "notebook_session_config_details", None + ) or getattr(nb_session, "notebook_session_configuration_details", None) if nb_config: self._load_infra_from_notebook(nb_config) @@ -742,8 +741,8 @@ def cancel(self, wait_for_completion: bool = True) -> DataScienceJobRun: self.client.cancel_job_run(self.id) if wait_for_completion: while ( - self.lifecycle_state != - oci.data_science.models.JobRun.LIFECYCLE_STATE_CANCELED + self.lifecycle_state + != oci.data_science.models.JobRun.LIFECYCLE_STATE_CANCELED ): self.sync() time.sleep(SLEEP_INTERVAL) @@ -1481,9 +1480,7 @@ def _update_job_infra(self, dsc_job: DSCJob) -> DataScienceJob: ] = JobInfrastructureConfigurationDetails.JOB_INFRASTRUCTURE_TYPE_STANDALONE if self.storage_mount: - if not hasattr( - oci.data_science.models, "StorageMountConfigurationDetails" - ): + if not hasattr(oci.data_science.models, "StorageMountConfigurationDetails"): raise EnvironmentError( "Storage mount hasn't been supported in the current OCI SDK installed." ) @@ -1495,10 +1492,16 @@ def _update_job_infra(self, dsc_job: DSCJob) -> DataScienceJob: def build(self) -> DataScienceJob: self.dsc_job.load_defaults() + + try: + self.dsc_job.load_defaults() + except Exception: + logger.exception("Failed to load default properties.") + self._update_from_dsc_model(self.dsc_job, overwrite=False) return self - def init(self) -> DataScienceJob: + def init(self, **kwargs) -> DataScienceJob: """Initializes a starter specification for the DataScienceJob. Returns diff --git a/ads/jobs/builders/runtimes/base.py b/ads/jobs/builders/runtimes/base.py index 404468c48..674412e10 100644 --- a/ads/jobs/builders/runtimes/base.py +++ b/ads/jobs/builders/runtimes/base.py @@ -11,10 +11,6 @@ Self = TypeVar("Self", bound="Runtime") -"""Special type to represent the current enclosed class. - -This type is used by factory class method or when a method returns ``self``. -""" class Runtime(Builder): @@ -31,6 +27,7 @@ class Runtime(Builder): CONST_FREEFORM_TAGS: "freeform_tags", CONST_DEFINED_TAGS: "defined_tags", CONST_ENV_VAR: CONST_ENV_VAR, + CONST_ARGS: CONST_ARGS, } def __init__(self, spec: Dict = None, **kwargs) -> None: @@ -239,7 +236,7 @@ def maximum_runtime_in_minutes(self) -> int: """Maximum runtime in minutes""" return self.get_spec(self.CONST_MAXIMUM_RUNTIME_IN_MINUTES) - def init(self) -> Self: + def init(self, **kwargs) -> Self: """Initializes a starter specification for the runtime. Returns @@ -248,7 +245,11 @@ def init(self) -> Self: This method returns self to support chaining methods. 
""" return ( - self.with_environment_variable(env_name="env_value") - .with_freeform_tag(tag_name="tag_value") - .with_argument(key1="val1") + self.with_environment_variable( + **kwargs.get(self.attribute_map[self.CONST_ENV_VAR], {}) + ) + .with_freeform_tag( + **kwargs.get(self.attribute_map[self.CONST_FREEFORM_TAGS], {}) + ) + .with_argument(**kwargs.get(self.attribute_map[self.CONST_ARGS], {})) ) diff --git a/ads/jobs/builders/runtimes/container_runtime.py b/ads/jobs/builders/runtimes/container_runtime.py index d7fcff2d1..b67e90650 100644 --- a/ads/jobs/builders/runtimes/container_runtime.py +++ b/ads/jobs/builders/runtimes/container_runtime.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2021 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ from typing import Union from ads.jobs.builders.runtimes.base import Runtime @@ -123,7 +123,7 @@ def with_cmd(self, cmd: str) -> "ContainerRuntime": self._spec[self.CONST_CMD] = cmd return self - def init(self) -> "ContainerRuntime": + def init(self, **kwargs) -> "ContainerRuntime": """Initializes a starter specification for the runtime. Returns @@ -131,10 +131,10 @@ def init(self) -> "ContainerRuntime": ContainerRuntime The runtime instance. """ - super().init() + super().init(**kwargs) return self.with_image( - image="iad.ocir.io/namespace/image:tag", + image=kwargs.get("image", "iad.ocir.io/namespace/image:tag"), entrypoint=["bash", "--login", "-c"], - cmd="{Container CMD. For MLFlow, it will be replaced with the Project CMD}", + cmd="{Container CMD. For MLflow and Operator will be auto generated}", ) diff --git a/ads/jobs/builders/runtimes/python_runtime.py b/ads/jobs/builders/runtimes/python_runtime.py index 653b3b69d..57315e001 100644 --- a/ads/jobs/builders/runtimes/python_runtime.py +++ b/ads/jobs/builders/runtimes/python_runtime.py @@ -116,18 +116,31 @@ def with_custom_conda(self, uri: str, region: str = None): conda_spec[self.CONST_CONDA_REGION] = region return self.set_spec(self.CONST_CONDA, conda_spec) - def init(self) -> "CondaRuntime": + def init(self, **kwargs) -> "CondaRuntime": """Initializes a starter specification for the runtime. + Parameters + ---------- + **kwargs: Dict + - conda_slug: str + The conda environment slug. + If it contains '/', then the assumption that this is a custom conda environment. + Returns ------- CondaRuntime The runtime instance. """ - super().init() + super().init(**kwargs) + + conda_slug = kwargs.get("conda_slug", "") + + if "/" not in conda_slug: + return self.with_service_conda(conda_slug) + return self.with_custom_conda( - "{Path to the custom conda environment. " - "Example: oci://your_bucket@namespace/object_name" + conda_slug + or "{Path to the custom conda environment. Example: oci://bucket@namespace/prefix}" ) @@ -246,7 +259,7 @@ def with_entrypoint(self, entrypoint: str): """ return self.set_spec(self.CONST_ENTRYPOINT, entrypoint) - def init(self) -> "ScriptRuntime": + def init(self, **kwargs) -> "ScriptRuntime": """Initializes a starter specification for the runtime. Returns @@ -254,15 +267,13 @@ def init(self) -> "ScriptRuntime": ScriptRuntime The runtime instance. """ - super().init() + super().init(**kwargs) return ( - self.with_entrypoint( - "{Entrypoint script. 
For MLflow, it will be replaced with the CMD}" - ) + self.with_entrypoint("{For MLflow and Operator will be auto generated}") .with_script( - "{Path to the script. For MLflow, it will be replaced with the path to the project}" + "{Path to the script. For MLflow and Operator will be auto generated}" ) - .with_argument(key1="val1") + .with_argument(**kwargs.get("args", {})) ) @@ -435,7 +446,7 @@ class PythonRuntime(ScriptRuntime, _PythonRuntimeMixin): attribute_map.update(ScriptRuntime.attribute_map) attribute_map.update(_PythonRuntimeMixin.attribute_map) - def init(self) -> "PythonRuntime": + def init(self, **kwargs) -> "PythonRuntime": """Initializes a starter specification for the runtime. Returns @@ -443,14 +454,12 @@ def init(self) -> "PythonRuntime": PythonRuntime The runtime instance. """ - super().init() + super().init(**kwargs) return ( - self.with_working_dir("{For MLflow the project folder will be used.}") - .with_entrypoint( - "{Entrypoint script. For MLflow, it will be replaced with the CMD}" - ) + self.with_working_dir("{For MLflow and Operator will be auto generated}") + .with_entrypoint("{For MLflow and Operator will be auto generated}") .with_script( - "{Path to the script. For MLflow, it will be replaced with the path to the project}" + "{Path to the script. For MLflow and Operator will be auto generated}" ) ) @@ -623,7 +632,7 @@ def notebook(self) -> str: """The path of the notebook relative to the source.""" return self.get_spec(self.CONST_ENTRYPOINT) - def init(self) -> "NotebookRuntime": + def init(self, **kwargs) -> "NotebookRuntime": """Initializes a starter specification for the runtime. Returns @@ -631,7 +640,7 @@ def init(self) -> "NotebookRuntime": NotebookRuntime The runtime instance. """ - super().init() + super().init(**kwargs) return self.with_source( uri="{Path to the source code directory. For MLflow, it will be replaced with the path to the project}", notebook="{Entrypoint notebook. For MLflow, it will be replaced with the CMD}", @@ -744,7 +753,7 @@ def ssh_secret_ocid(self) -> str: """The OCID of the OCI Vault secret storing the Git SSH key.""" return self.get_spec(self.CONST_GIT_SSH_SECRET_ID) - def init(self) -> "GitPythonRuntime": + def init(self, **kwargs) -> "GitPythonRuntime": """Initializes a starter specification for the runtime. Returns @@ -752,12 +761,10 @@ def init(self) -> "GitPythonRuntime": GitPythonRuntime The runtime instance. """ - super().init() + super().init(**kwargs) return self.with_source( - "{Git URI. For MLflow, it will be replaced with the Project URI}" - ).with_entrypoint( - "{Entrypoint script. For MLflow, it will be replaced with the CMD}" - ) + "{Git URI. For MLflow and Operator will be auto generated}" + ).with_entrypoint("{For MLflow and Operator will be auto generated}") class DataFlowRuntime(CondaRuntime): @@ -967,7 +974,7 @@ def overwrite(self) -> str: def convert(self, **kwargs): pass - def init(self) -> "DataFlowRuntime": + def init(self, **kwargs) -> "DataFlowRuntime": """Initializes a starter specification for the runtime. Returns @@ -975,15 +982,18 @@ def init(self) -> "DataFlowRuntime": DataFlowRuntime The runtime instance. """ - super().init() + super().init(**kwargs) self._spec.pop(self.CONST_ENV_VAR, None) return ( self.with_script_uri( - "{Path to the executable script. For MLflow, it will be replaced with the CMD}" + "{Path to the executable script. For MLflow and Operator will auto generated}" ) .with_script_bucket( - "{The object storage bucket to save a script. 
" - "Example: oci://@/}" + kwargs.get( + "script_bucket", + "{The object storage bucket to save a script. " + "Example: oci://@/}", + ) ) .with_overwrite(True) .with_configuration({"spark.driverEnv.env_key": "env_value"}) diff --git a/ads/model/deployment/model_deployment_infrastructure.py b/ads/model/deployment/model_deployment_infrastructure.py index 0bce46cf8..f4e377f59 100644 --- a/ads/model/deployment/model_deployment_infrastructure.py +++ b/ads/model/deployment/model_deployment_infrastructure.py @@ -233,10 +233,9 @@ def _load_default_properties(self) -> Dict: ) logger.debug(traceback.format_exc()) - nb_config = ( - getattr(nb_session, "notebook_session_config_details", None) - or getattr(nb_session, "notebook_session_configuration_details", None) - ) + nb_config = getattr( + nb_session, "notebook_session_config_details", None + ) or getattr(nb_session, "notebook_session_configuration_details", None) if nb_config: defaults[self.CONST_SHAPE_NAME] = nb_config.shape @@ -616,7 +615,7 @@ def subnet_id(self) -> str: """ return self.get_spec(self.CONST_SUBNET_ID, None) - def init(self) -> "ModelDeploymentInfrastructure": + def init(self, **kwargs) -> "ModelDeploymentInfrastructure": """Initializes a starter specification for the ModelDeploymentInfrastructure. Returns @@ -634,6 +633,8 @@ def init(self) -> "ModelDeploymentInfrastructure": .with_shape_name(self.shape_name or DEFAULT_SHAPE_NAME) .with_shape_config_details( ocpus=self.shape_config_details.get(self.CONST_OCPUS, DEFAULT_OCPUS), - memory_in_gbs=self.shape_config_details.get(self.CONST_MEMORY_IN_GBS, DEFAULT_MEMORY_IN_GBS) + memory_in_gbs=self.shape_config_details.get( + self.CONST_MEMORY_IN_GBS, DEFAULT_MEMORY_IN_GBS + ), ) ) diff --git a/ads/model/deployment/model_deployment_runtime.py b/ads/model/deployment/model_deployment_runtime.py index 199c69ac2..26e31f9cd 100644 --- a/ads/model/deployment/model_deployment_runtime.py +++ b/ads/model/deployment/model_deployment_runtime.py @@ -108,7 +108,7 @@ class ModelDeploymentRuntime(Builder): CONST_REGION: "region", CONST_OVERWRITE_EXISTING_ARTIFACT: "overwrite_existing_artifact", CONST_REMOVE_EXISTING_ARTIFACT: "remove_existing_artifact", - CONST_TIMEOUT: "timeout" + CONST_TIMEOUT: "timeout", } ENVIRONMENT_CONFIG_DETAILS_PATH = ( @@ -277,7 +277,7 @@ def with_model_uri(self, model_uri: str) -> "ModelDeploymentRuntime": The ModelDeploymentRuntime instance (self). """ return self.set_spec(self.CONST_MODEL_URI, model_uri) - + @property def bucket_uri(self) -> str: """The bucket uri of model. @@ -303,7 +303,7 @@ def with_bucket_uri(self, bucket_uri: str) -> "ModelDeploymentRuntime": The ModelDeploymentRuntime instance (self). """ return self.set_spec(self.CONST_BUCKET_URI, bucket_uri) - + @property def auth(self) -> Dict: """The auth when uploading large-size model. @@ -314,7 +314,7 @@ def auth(self) -> Dict: The auth when uploading large-size model. """ return self.get_spec(self.CONST_AUTH, {}) - + def with_auth(self, auth: Dict) -> "ModelDeploymentRuntime": """Sets the auth when uploading large-size model. @@ -329,7 +329,7 @@ def with_auth(self, auth: Dict) -> "ModelDeploymentRuntime": The ModelDeploymentRuntime instance (self). """ return self.set_spec(self.CONST_AUTH, auth) - + @property def region(self) -> str: """The region when uploading large-size model. @@ -340,7 +340,7 @@ def region(self) -> str: The region when uploading large-size model. 
""" return self.get_spec(self.CONST_REGION, None) - + def with_region(self, region: str) -> "ModelDeploymentRuntime": """Sets the region when uploading large-size model. @@ -355,7 +355,7 @@ def with_region(self, region: str) -> "ModelDeploymentRuntime": The ModelDeploymentRuntime instance (self). """ return self.set_spec(self.CONST_REGION, region) - + @property def overwrite_existing_artifact(self) -> bool: """Overwrite existing artifact when uploading large size model. @@ -366,10 +366,9 @@ def overwrite_existing_artifact(self) -> bool: Overwrite existing artifact when uploading large size model. """ return self.get_spec(self.CONST_OVERWRITE_EXISTING_ARTIFACT, True) - + def with_overwrite_existing_artifact( - self, - overwrite_existing_artifact: bool + self, overwrite_existing_artifact: bool ) -> "ModelDeploymentRuntime": """Sets whether to overwrite existing artifact when uploading large size model. @@ -384,10 +383,9 @@ def with_overwrite_existing_artifact( The ModelDeploymentRuntime instance (self). """ return self.set_spec( - self.CONST_OVERWRITE_EXISTING_ARTIFACT, - overwrite_existing_artifact + self.CONST_OVERWRITE_EXISTING_ARTIFACT, overwrite_existing_artifact ) - + @property def remove_existing_artifact(self) -> bool: """Remove existing artifact when uploading large size model. @@ -398,10 +396,9 @@ def remove_existing_artifact(self) -> bool: Remove existing artifact when uploading large size model. """ return self.get_spec(self.CONST_REMOVE_EXISTING_ARTIFACT, True) - + def with_remove_existing_artifact( - self, - remove_existing_artifact: bool + self, remove_existing_artifact: bool ) -> "ModelDeploymentRuntime": """Sets whether to remove existing artifact when uploading large size model. @@ -415,8 +412,10 @@ def with_remove_existing_artifact( ModelDeploymentRuntime The ModelDeploymentRuntime instance (self). """ - return self.set_spec(self.CONST_REMOVE_EXISTING_ARTIFACT, remove_existing_artifact) - + return self.set_spec( + self.CONST_REMOVE_EXISTING_ARTIFACT, remove_existing_artifact + ) + @property def timeout(self) -> int: """The timeout when uploading large-size model. @@ -427,7 +426,7 @@ def timeout(self) -> int: The timeout when uploading large-size model. """ return self.get_spec(self.CONST_TIMEOUT, None) - + def with_timeout(self, timeout: int) -> "ModelDeploymentRuntime": """Sets the timeout when uploading large-size model. @@ -442,8 +441,8 @@ def with_timeout(self, timeout: int) -> "ModelDeploymentRuntime": The ModelDeploymentRuntime instance (self). """ return self.set_spec(self.CONST_TIMEOUT, timeout) - - def init(self) -> "ModelDeploymentRuntime": + + def init(self, **kwargs) -> "ModelDeploymentRuntime": """Initializes a starter specification for the runtime. Returns @@ -492,7 +491,7 @@ def environment_config_type(self) -> str: """ return OCIModelDeploymentRuntimeType.CONDA - def init(self) -> "ModelDeploymentCondaRuntime": + def init(self, **kwargs) -> "ModelDeploymentCondaRuntime": """Initializes a starter specification for the runtime. Returns @@ -500,7 +499,7 @@ def init(self) -> "ModelDeploymentCondaRuntime": CondaRuntime The runtime instance. """ - return super().init() + return super().init(**kwargs) class ModelDeploymentContainerRuntime(ModelDeploymentRuntime): @@ -821,7 +820,7 @@ def with_inference_server( """ return self.set_spec(self.CONST_INFERENCE_SERVER, inference_server.lower()) - def init(self) -> "ModelDeploymentContainerRuntime": + def init(self, **kwargs) -> "ModelDeploymentContainerRuntime": """Initializes a starter specification for the runtime. 
Returns @@ -829,7 +828,7 @@ def init(self) -> "ModelDeploymentContainerRuntime": CondaRuntime The runtime instance. """ - super().init() + super().init(**kwargs) return ( self.with_image("iad.ocir.io//:") .with_image_digest("") diff --git a/ads/model/runtime/env_info.py b/ads/model/runtime/env_info.py index 22fb80209..711899ecf 100644 --- a/ads/model/runtime/env_info.py +++ b/ads/model/runtime/env_info.py @@ -226,9 +226,9 @@ def _populate_env_info( training_python_version=python_version, ) - @staticmethod - def _validate_dict(obj_dict: Dict) -> bool: - """Validate the content in the ditionary format from the yaml file. + @classmethod + def _validate_dict(cls,obj_dict: Dict) -> bool: + """Validate the content in the dictionary format from the yaml file. Parameters ---------- @@ -283,9 +283,9 @@ def _populate_env_info( inference_python_version=python_version, ) - @staticmethod - def _validate_dict(obj_dict: Dict) -> bool: - """Validate the content in the ditionary format from the yaml file. + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: + """Validate the content in the dictionary format from the yaml file. Parameters ---------- diff --git a/ads/model/runtime/model_deployment_details.py b/ads/model/runtime/model_deployment_details.py index e2bfc9c14..13644ad96 100644 --- a/ads/model/runtime/model_deployment_details.py +++ b/ads/model/runtime/model_deployment_details.py @@ -17,9 +17,9 @@ class ModelDeploymentDetails(DataClassSerializable): inference_conda_env: InferenceEnvInfo = field(default_factory=InferenceEnvInfo) - @staticmethod - def _validate_dict(obj_dict: Dict) -> bool: - """Validate the content in the ditionary format from the yaml file. + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: + """Validate the content in the dictionary format from the yaml file. Parameters ---------- diff --git a/ads/model/runtime/model_provenance_details.py b/ads/model/runtime/model_provenance_details.py index db9a9efe6..a3769795a 100644 --- a/ads/model/runtime/model_provenance_details.py +++ b/ads/model/runtime/model_provenance_details.py @@ -18,8 +18,8 @@ class TrainingCode(DataClassSerializable): artifact_directory: str = "" - @staticmethod - def _validate_dict(obj_dict: Dict) -> bool: + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: assert obj_dict and ( "ARTIFACT_DIRECTORY" in obj_dict ), "`training_code` must have `ARTIFACT_DIRECTORY` field." @@ -40,8 +40,8 @@ class ModelProvenanceDetails(DataClassSerializable): user_ocid: str = "" vm_image_internal_id: str = "" - @staticmethod - def _validate_dict(obj_dict: Dict) -> bool: + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: """validate the yaml file. Parameters diff --git a/ads/model/runtime/runtime_info.py b/ads/model/runtime/runtime_info.py index 2e5e06be0..e88033b81 100644 --- a/ads/model/runtime/runtime_info.py +++ b/ads/model/runtime/runtime_info.py @@ -25,8 +25,8 @@ class RuntimeInfo(DataClassSerializable): default_factory=ModelProvenanceDetails ) - @staticmethod - def _validate_dict(obj_dict: Dict) -> bool: + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: """Validate the runtime info. Parameters diff --git a/ads/opctl/__init__.py b/ads/opctl/__init__.py index d5c24366e..f8ffdca2b 100644 --- a/ads/opctl/__init__.py +++ b/ads/opctl/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2022 Oracle and/or its affiliates. +# Copyright (c) 2022, 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import logging @@ -9,7 +9,9 @@ logger = logging.getLogger(__name__) handler = logging.StreamHandler(sys.stdout) -logger.addHandler(handler) +# logger.addHandler(handler) + +logger.setLevel(logging.INFO) def set_log_level(level): diff --git a/ads/opctl/backend/ads_dataflow.py b/ads/opctl/backend/ads_dataflow.py index 61cb52e9e..903609575 100644 --- a/ads/opctl/backend/ads_dataflow.py +++ b/ads/opctl/backend/ads_dataflow.py @@ -5,28 +5,28 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import os import json +import os import shlex +import tempfile +import time from typing import Dict, Union -from ads.opctl.backend.base import Backend -from ads.opctl.decorator.common import print_watch_command -from ads.common.auth import create_signer, AuthContext +from ads.common.auth import AuthContext, create_signer, AuthType from ads.common.oci_client import OCIClientFactory - -from ads.opctl.backend.base import ( - Backend, - RuntimeFactory, -) - from ads.jobs import ( - Job, DataFlow, - DataFlowRuntime, DataFlowNotebookRuntime, DataFlowRun, + DataFlowRuntime, + Job, ) +from ads.opctl import logger +from ads.opctl.backend.base import Backend, RuntimeFactory +from ads.opctl.constants import OPERATOR_MODULE_PATH +from ads.opctl.decorator.common import print_watch_command +from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS +from ads.opctl.operator.common.operator_loader import OperatorInfo, OperatorLoader REQUIRED_FIELDS = [ "compartment_id", @@ -84,20 +84,39 @@ def init( `None` otherwise. """ + conda_slug = kwargs.get( + "conda_slug", self.config["execution"].get("conda_slug", "conda_slug") + ).lower() + + # if conda slug contains '/' then the assumption is that it is a custom conda pack + # the conda prefix needs to be added + if "/" in conda_slug: + conda_slug = os.path.join( + self.config["execution"].get( + "conda_pack_os_prefix", "oci://bucket@namespace/conda_environments" + ), + conda_slug, + ) + + RUNTIME_KWARGS_MAP = { + DataFlowRuntime().type: { + "conda_slug": conda_slug, + "script_bucket": f"{self.config['infrastructure'].get('script_bucket','').rstrip('/')}", + }, + } + with AuthContext(auth=self.auth_type, profile=self.profile): # define an job job = ( Job() - .with_name( - "{Job name. For MLflow, it will be replaced with the Project name}" - ) + .with_name("{Job name. For MLflow and Operator will be auto generated}") .with_infrastructure( DataFlow(**(self.config.get("infrastructure", {}) or {})).init() ) .with_runtime( DataFlowRuntimeFactory.get_runtime( key=runtime_type or DataFlowRuntime().type - ).init() + ).init(**{**kwargs, **RUNTIME_KWARGS_MAP[runtime_type]}) ) ) @@ -111,7 +130,7 @@ def init( uri=uri, overwrite=overwrite, note=note, - filter_by_attribute_map=True, + filter_by_attribute_map=False, **kwargs, ) @@ -195,6 +214,136 @@ def watch(self): run.watch(interval=interval) +class DataFlowOperatorBackend(DataFlowBackend): + """ + Backend class to run operator on Data Flow Application. + + Attributes + ---------- + runtime_config: (Dict) + The runtime config for the operator. + operator_config: (Dict) + The operator specification config. + operator_type: str + The type of the operator. + operator_version: str + The version of the operator. + job: Job + The Data Science Job. + """ + + def __init__(self, config: Dict, operator_info: OperatorInfo = None) -> None: + """ + Instantiates the operator backend. 
+ + Parameters + ---------- + config: (Dict) + The configuration file containing operator's specification details and execution section. + operator_info: (OperatorInfo, optional) + The operator's detailed information extracted from the operator.__init__ file. + Will be extracted from the operator type in case if not provided. + """ + super().__init__(config=config or {}) + + self.job = None + + self.runtime_config = self.config.get("runtime", {}) + self.operator_config = { + **{ + key: value + for key, value in self.config.items() + if key not in ("runtime", "infrastructure", "execution") + } + } + self.operator_type = self.operator_config.get("type", "unknown") + self.operator_version = self.operator_config.get("version", "unknown") + self.operator_info = operator_info + + def _adjust_common_information(self): + """Adjusts common information of the application.""" + + if self.job.name.lower().startswith("{job"): + self.job.with_name( + f"job_{self.operator_info.type.lower()}" + f"_{self.operator_version.lower()}" + ) + self.job.runtime.with_maximum_runtime_in_minutes( + self.config["execution"].get("max_wait_time", 1200) + ) + + temp_dir = tempfile.mkdtemp() + + # prepare run.py file to run the operator + script_file = os.path.join( + temp_dir, f"{self.operator_info.type}_{int(time.time())}_run.py" + ) + + operator_module = f"{OPERATOR_MODULE_PATH}.{self.operator_type}" + with open(script_file, "w") as fp: + fp.writelines( + "\n".join( + [ + "import runpy", + f"runpy.run_module('{operator_module}', run_name='__main__')", + ] + ) + ) + self.job.runtime.with_script_uri(script_file) + + # propagate environment variables to the runtime config + env_vars = { + "OCI_IAM_TYPE": AuthType.RESOURCE_PRINCIPAL, + "OCIFS_IAM_TYPE": AuthType.RESOURCE_PRINCIPAL, + ENV_OPERATOR_ARGS: json.dumps(self.operator_config), + **(self.job.runtime.envs or {}), + } + + runtime_config = self.job.runtime.configuration or dict() + + existing_env_keys = { + key.upper() + .replace("SPARK.EXECUTORENV.", "") + .replace("SPARK.DRIVERENV.", "") + for key in runtime_config + if "SPARK.EXECUTORENV" in key.upper() or "SPARK.DRIVERENV" in key.upper() + } + + for env_key, env_value in (env_vars or {}).items(): + if env_key.upper() not in existing_env_keys: + runtime_config[f"spark.driverEnv.{env_key}"] = env_value + + self.job.runtime.with_configuration(runtime_config) + + @print_watch_command + def run(self, **kwargs: Dict) -> Union[Dict, None]: + """ + Runs the operator on the Data Flow service. 
+ """ + if not self.operator_info: + self.operator_info = OperatorLoader.from_uri(self.operator_type).load() + + self.job = Job.from_dict(self.runtime_config).build() + + # adjust job's common information + self._adjust_common_information() + + # run the job if only it is not a dry run mode + if not self.config["execution"].get("dry_run"): + job = self.job.create() + logger.info(f"{'*' * 50} Data Flow Application {'*' * 50}") + logger.info(job) + + job_run = job.run() + logger.info(f"{'*' * 50} DataFlow Application Run {'*' * 50}") + logger.info(job_run) + + return {"job_id": job.id, "run_id": job_run.id} + else: + logger.info(f"{'*' * 50} DataFlow Application (Dry Run Mode) {'*' * 50}") + logger.info(self.job) + + class DataFlowRuntimeFactory(RuntimeFactory): """Data Flow runtime factory.""" diff --git a/ads/opctl/backend/ads_ml_job.py b/ads/opctl/backend/ads_ml_job.py index bd1362044..18b458a79 100644 --- a/ads/opctl/backend/ads_ml_job.py +++ b/ads/opctl/backend/ads_ml_job.py @@ -5,16 +5,16 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import copy +import json import os import shlex import shutil import tempfile +import time from distutils import dir_util from typing import Dict, Tuple, Union -from jinja2 import Environment, PackageLoader - -from ads.common.auth import AuthContext, create_signer +from ads.common.auth import AuthContext, AuthType, create_signer from ads.common.oci_client import OCIClientFactory from ads.jobs import ( ContainerRuntime, @@ -34,6 +34,8 @@ from ads.opctl.distributed.common.cluster_config_helper import ( ClusterConfigToJobSpecConverter, ) +from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS +from ads.opctl.operator.common.operator_loader import OperatorInfo, OperatorLoader REQUIRED_FIELDS = [ "project_id", @@ -91,22 +93,51 @@ def init( `None` otherwise. """ + conda_slug = ( + kwargs.get( + "conda_slug", self.config["execution"].get("conda_slug", "conda_slug") + ) + or "" + ).lower() + + # if conda slug contains '/' then the assumption is that it is a custom conda pack + # the conda prefix needs to be added + if "/" in conda_slug: + conda_slug = os.path.join( + self.config["execution"].get( + "conda_pack_os_prefix", "oci://bucket@namespace/conda_environments" + ), + conda_slug, + ) + + RUNTIME_KWARGS_MAP = { + ContainerRuntime().type: { + "image": ( + f"{self.config['infrastructure'].get('docker_registry','').rstrip('/')}" + f"/{kwargs.get('image_name', self.config['execution'].get('image','image:latest'))}" + ) + }, + ScriptRuntime().type: {"conda_slug": conda_slug}, + PythonRuntime().type: {"conda_slug": conda_slug}, + NotebookRuntime().type: {}, + GitPythonRuntime().type: {}, + } + + runtime_type = runtime_type or PythonRuntime().type with AuthContext(auth=self.auth_type, profile=self.profile): # define a job job = ( Job() - .with_name( - "{Job name. For MLflow, it will be replaced with the Project name}" - ) + .with_name("{Job name. 
For MLflow and Operator will be auto generated}") .with_infrastructure( DataScienceJob( **(self.config.get("infrastructure", {}) or {}) ).init() ) .with_runtime( - JobRuntimeFactory.get_runtime( - key=runtime_type or PythonRuntime().type - ).init() + JobRuntimeFactory.get_runtime(key=runtime_type).init( + **{**kwargs, **RUNTIME_KWARGS_MAP[runtime_type]} + ) ) ) @@ -168,36 +199,6 @@ def run(self) -> Dict: print("JOB RUN OCID:", run_id) return {"job_id": job_id, "run_id": run_id} - def init_operator(self): - # TODO: check if folder is empty, check for force overwrite - # TODO: check that command is being run from advanced-ds repo (important until ads released) - - operator_folder = self.config["execution"].get("operator_folder_path") - os.makedirs(operator_folder, exist_ok=True) - - operator_folder_name = os.path.basename(os.path.normpath(operator_folder)) - docker_tag = f"{os.path.join(self.config['infrastructure'].get('docker_registry'), operator_folder_name)}:latest" - - self.config["execution"]["operator_folder_name"] = operator_folder_name - self.config["execution"]["docker_tag"] = docker_tag - - operator_slug = self.config["execution"].get("operator_slug") - self._jinja_write(operator_slug, operator_folder) - - # DONE - print( - "\nInitialization Successful.\n" - f"All code should be written in main.py located at: {os.path.join(operator_folder, 'main.py')}\n" - f"Additional libraries should be added to environment.yaml located at: {os.path.join(operator_folder, 'environment.yaml')}\n" - "Any changes to main.py will require re-building the docker image, whereas changes to args in the" - " runtime section of the yaml file do not. Write accordingly.\n" - "Run this cluster with:\n" - f"\tdocker build -t {docker_tag} -f {os.path.join(operator_folder, 'Dockerfile')} .\n" - f"\tads opctl publish-image {docker_tag} \n" - f"\tads opctl run -f {os.path.join(operator_folder, operator_slug + '.yaml')} \n" - ) - return operator_folder - def delete(self): """ Delete Job or Job Run from OCID. @@ -245,25 +246,6 @@ def watch(self): run = DataScienceJobRun.from_ocid(run_id) run.watch(interval=interval, wait=wait) - def _jinja_write(self, operator_slug, operator_folder): - # TODO AH: fill in templates with relevant details - env = Environment( - loader=PackageLoader("ads", f"opctl/operators/{operator_slug}") - ) - - for setup_file in [ - "Dockerfile", - "environment.yaml", - "main.py", - "run.py", - "start_scheduler.sh", - "start_worker.sh", - "dask_cluster.yaml", - ]: - template = env.get_template(setup_file + ".jinja2") - with open(os.path.join(operator_folder, setup_file), "w") as ff: - ff.write(template.render(config=self.config)) - def _create_payload(self, infra=None, name=None) -> Job: if not infra: infra = self.config.get("infrastructure", {}) @@ -559,6 +541,163 @@ def run(self, cluster_info, dry_run=False) -> None: return job, main_jobrun, worker_jobruns +class MLJobOperatorBackend(MLJobBackend): + """ + Backend class to run operator on Data Science Jobs. + Currently supported two scenarios: + * Running operator within container runtime. + * Running operator within python runtime. + + Attributes + ---------- + runtime_config: (Dict) + The runtime config for the operator. + operator_config: (Dict) + The operator specification config. + operator_type: str + The type of the operator. + operator_version: str + The version of the operator. + operator_info: OperatorInfo + The detailed information about the operator. + job: Job + The Data Science Job. 
+ """ + + def __init__(self, config: Dict, operator_info: OperatorInfo = None) -> None: + """ + Instantiates the operator backend. + + Parameters + ---------- + config: (Dict) + The configuration file containing operator's specification details and execution section. + operator_info: (OperatorInfo, optional) + The operator's detailed information extracted from the operator.__init__ file. + Will be extracted from the operator type in case if not provided. + """ + super().__init__(config=config or {}) + + self.job = None + + self.runtime_config = self.config.get("runtime", {}) + self.operator_config = { + **{ + key: value + for key, value in self.config.items() + if key not in ("runtime", "infrastructure", "execution") + } + } + self.operator_type = self.operator_config.get("type", "unknown") + self.operator_version = self.operator_config.get("version", "unknown") + + # registering supported runtime adjusters + self._RUNTIME_MAP = { + ContainerRuntime().type: self._adjust_container_runtime, + PythonRuntime().type: self._adjust_python_runtime, + } + + self.operator_info = operator_info + + def _adjust_common_information(self): + """Adjusts common information of the job.""" + + if self.job.name.lower().startswith("{job"): + self.job.with_name( + f"job_{self.operator_info.type.lower()}" + f"_{self.operator_version.lower()}" + ) + self.job.runtime.with_maximum_runtime_in_minutes( + self.config["execution"].get("max_wait_time", 1200) + ) + + def _adjust_container_runtime(self): + """Adjusts container runtime.""" + entrypoint = self.job.runtime.entrypoint + image = self.job.runtime.image.lower() + cmd = " ".join( + [ + "python3", + "-m", + f"{self.operator_info.type}", + ] + ) + self.job.runtime.with_environment_variable( + **{ + "OCI_IAM_TYPE": AuthType.RESOURCE_PRINCIPAL, + "OCIFS_IAM_TYPE": AuthType.RESOURCE_PRINCIPAL, + ENV_OPERATOR_ARGS: json.dumps(self.operator_config), + **(self.job.runtime.envs or {}), + } + ) + self.job.runtime.with_image(image=image, entrypoint=entrypoint, cmd=cmd) + + def _adjust_python_runtime(self): + """Adjusts python runtime.""" + temp_dir = tempfile.mkdtemp() + logger.debug(f"Copying operator's code to the temporary folder: {temp_dir}") + + # prepare run.sh file to run the operator's code + script_file = os.path.join( + temp_dir, f"{self.operator_info.type}_{int(time.time())}_run.sh" + ) + with open(script_file, "w") as fp: + fp.write(f"python3 -m {self.operator_info.type}") + + # copy the operator's source code to the temporary folder + shutil.copytree( + self.operator_info.path.rstrip("/"), + os.path.join(temp_dir, self.operator_info.type), + dirs_exist_ok=True, + ) + + # prepare jobs runtime + self.job.runtime.with_source( + temp_dir, + entrypoint=os.path.basename(script_file), + ).with_working_dir( + os.path.basename(temp_dir.rstrip("/")) + ).with_environment_variable( + **{ + "OCI_IAM_TYPE": AuthType.RESOURCE_PRINCIPAL, + "OCIFS_IAM_TYPE": AuthType.RESOURCE_PRINCIPAL, + ENV_OPERATOR_ARGS: json.dumps(self.operator_config), + **(self.job.runtime.envs or {}), + } + ) + + @print_watch_command + def run(self, **kwargs: Dict) -> Union[Dict, None]: + """ + Runs the operator on the Data Science Jobs. 
+ """ + if not self.operator_info: + self.operator_info = OperatorLoader.from_uri(self.operator_type).load() + + self.job = Job.from_dict(self.runtime_config).build() + + # adjust job's common information + self._adjust_common_information() + + # adjust runtime information + self._RUNTIME_MAP.get(self.job.runtime.type, lambda: None)() + + # run the job if only it is not a dry run mode + if not self.config["execution"].get("dry_run"): + job = self.job.create() + logger.info(f"{'*' * 50}Job{'*' * 50}") + logger.info(job) + + job_run = job.run() + logger.info(f"{'*' * 50}JobRun{'*' * 50}") + logger.info(job_run) + + return {"job_id": job.id, "run_id": job_run.id} + else: + logger.info(f"{'*' * 50} Job (Dry Run Mode) {'*' * 50}") + logger.info(self.job) + + class JobRuntimeFactory(RuntimeFactory): """Job runtime factory.""" diff --git a/ads/opctl/backend/local.py b/ads/opctl/backend/local.py index b30ea6db7..3d9bf7151 100644 --- a/ads/opctl/backend/local.py +++ b/ads/opctl/backend/local.py @@ -1,25 +1,28 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2022 Oracle and/or its affiliates. +# Copyright (c) 2022, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import copy import json import os +import runpy +import sys import tempfile from concurrent.futures import Future, ThreadPoolExecutor from time import sleep -from typing import Dict, List +from typing import Dict, List, Optional, Union from oci.data_science.models import PipelineStepRun -from ads.common.auth import create_signer +from ads.common.auth import AuthContext, create_signer from ads.common.decorator.runtime_dependency import ( OptionalDependency, runtime_dependency, ) - +from ads.common.oci_client import OCIClientFactory +from ads.config import NO_CONTAINER from ads.model.model_metadata import ModelCustomMetadata from ads.model.runtime.runtime_info import RuntimeInfo from ads.opctl import logger @@ -30,15 +33,20 @@ DEFAULT_IMAGE_CONDA_DIR, DEFAULT_IMAGE_HOME_DIR, DEFAULT_IMAGE_SCRIPT_DIR, + DEFAULT_MODEL_DEPLOYMENT_FOLDER, DEFAULT_MODEL_FOLDER, DEFAULT_NOTEBOOK_SESSION_CONDA_DIR, DEFAULT_NOTEBOOK_SESSION_SPARK_CONF_DIR, ML_JOB_GPU_IMAGE, ML_JOB_IMAGE, - DEFAULT_MODEL_DEPLOYMENT_FOLDER, ) from ads.opctl.distributed.cmds import load_ini, local_run from ads.opctl.model.cmds import _download_model +from ads.opctl.operator import __operators__ +from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS +from ads.opctl.operator.common.operator_loader import OperatorInfo, OperatorLoader +from ads.opctl.operator.runtime import const as operator_runtime_const +from ads.opctl.operator.runtime import runtime as operator_runtime from ads.opctl.spark.cmds import ( generate_core_site_properties, generate_core_site_properties_str, @@ -51,8 +59,6 @@ run_container, ) from ads.pipeline.ads_pipeline import Pipeline, PipelineStep -from ads.common.oci_client import OCIClientFactory -from ads.config import NO_CONTAINER class CondaPackNotFound(Exception): # pragma: no cover @@ -413,6 +419,8 @@ def _mount_source_folder_if_exists(self, bind_volumes: Dict) -> Dict: def _activate_conda_env_and_run( image: str, slug: str, command: List[str], bind_volumes: Dict, env_vars: Dict ) -> int: + import docker + try: client = get_docker_client() client.api.inspect_image(image) @@ -818,3 +826,228 @@ def _get_conda_info_from_runtime(artifact_dir): runtime_info.model_deployment.inference_conda_env.inference_env_path ) return conda_slug, conda_path + + +class 
LocalOperatorBackend(Backend): + """ + The local operator backend to execute operator in the local environment. + Currently supported two scenarios: + * Running operator within local conda environment. + * Running operator within local container. + + Attributes + ---------- + runtime_config: (Dict) + The runtime config for the operator. + operator_config: (Dict) + The operator specification config. + operator_type: str + The type of the operator. + operator_info: OperatorInfo + The detailed information about the operator. + """ + + def __init__( + self, config: Optional[Dict], operator_info: OperatorInfo = None + ) -> None: + """ + Instantiates the operator backend. + + Parameters + ---------- + config: (Dict) + The configuration file containing operator's specification details and execution section. + operator_info: (OperatorInfo, optional) + The operator's detailed information extracted from the operator.__init__ file. + Will be extracted from the operator type in case if not provided. + """ + super().__init__(config=config or {}) + + self.runtime_config = self.config.get("runtime", {}) + self.operator_config = { + **{ + key: value + for key, value in self.config.items() + if key not in ("runtime", "infrastructure", "execution") + } + } + self.operator_type = self.operator_config.get("type") + + self._RUNTIME_MAP = { + operator_runtime.ContainerRuntime.type: self._run_with_container, + operator_runtime.PythonRuntime.type: self._run_with_python, + } + + self.operator_info = operator_info + + def _run_with_python(self, **kwargs: Dict) -> int: + """Runs the operator within a local python environment. + + Returns + ------- + int + The operator's run exit code. + """ + + # build runtime object + runtime = operator_runtime.PythonRuntime.from_dict( + self.runtime_config, ignore_unknown=True + ) + + # run operator + operator_spec = json.dumps(self.operator_config) + sys.argv = [self.operator_info.type, "--spec", operator_spec] + + logger.info(f"{'*' * 50} Runtime Config {'*' * 50}") + logger.info(runtime.to_yaml()) + + try: + runpy.run_module(self.operator_info.type, run_name="__main__") + except SystemExit as exception: + return exception.code + else: + return 0 + + def _run_with_container(self, **kwargs: Dict) -> int: + """Runs the operator within a container. + + Returns + ------- + int + The operator's run exit code. + """ + + # build runtime object + runtime: operator_runtime.ContainerRuntime = ( + operator_runtime.ContainerRuntime.from_dict( + self.runtime_config, ignore_unknown=True + ) + ) + # prepare environment variables + env_vars = { + **{env["name"]: env["value"] for env in runtime.spec.env}, + ENV_OPERATOR_ARGS: json.dumps(self.operator_config), + } + + # prepare container volumes + bind_volumes = {} + for volume in runtime.spec.volume: + host_path, container_path = volume.split(":") + bind_volumes[host_path.lstrip().rstrip()] = { + "bind": container_path.lstrip().rstrip() + } + + logger.info(f"{'*' * 50} Runtime Config {'*' * 50}") + logger.info(runtime.to_yaml()) + + return run_container( + image=runtime.spec.image, + bind_volumes=bind_volumes, + env_vars=env_vars, + command=f"'python3 -m {self.operator_info.type}'", + ) + + def run(self, **kwargs: Dict) -> Dict: + """Runs the operator.""" + + # extract runtime + runtime_type = self.runtime_config.get( + "type", operator_runtime.OPERATOR_LOCAL_RUNTIME_TYPE.PYTHON + ) + + if runtime_type not in self._RUNTIME_MAP: + raise RuntimeError( + f"Not supported runtime - {runtime_type} for local backend. 
" + f"Supported values: {self._RUNTIME_MAP.keys()}" + ) + + if not self.operator_info: + self.operator_info = OperatorLoader.from_uri(self.operator_type).load() + + if self.config.get("dry_run"): + logger.info( + "The dry run option is not supported for " + "the local backends and will be ignored." + ) + + # run operator with provided runtime + exit_code = self._RUNTIME_MAP.get(runtime_type, lambda: None)(**kwargs) + + if exit_code != 0: + raise RuntimeError( + f"Operation did not complete successfully. Exit code: {exit_code}. " + f"Run with the --debug argument to view logs." + ) + + def init( + self, + uri: Union[str, None] = None, + overwrite: bool = False, + runtime_type: Union[str, None] = None, + **kwargs: Dict, + ) -> Union[str, None]: + """Generates a starter YAML specification for the operator local runtime. + + Parameters + ---------- + overwrite: (bool, optional). Defaults to False. + Overwrites the result specification YAML if exists. + uri: (str, optional), Defaults to None. + The filename to save the resulting specification template YAML. + runtime_type: (str, optional). Defaults to None. + The resource runtime type. + **kwargs: Dict + The optional arguments. + + Returns + ------- + Union[str, None] + The YAML specification for the given resource if `uri` was not provided. + `None` otherwise. + """ + runtime_type = runtime_type or operator_runtime.ContainerRuntime.type + if runtime_type not in operator_runtime_const.RUNTIME_TYPE_MAP: + raise ValueError( + f"Not supported runtime type {runtime_type}. " + f"Supported values: {operator_runtime_const.RUNTIME_TYPE_MAP.keys()}" + ) + + RUNTIME_KWARGS_MAP = { + operator_runtime.ContainerRuntime.type: { + "image": f"{self.operator_config['type']}:{self.operator_config['version']}", + "volume": [ + os.path.expanduser( + os.path.dirname(self.config["execution"]["oci_config"]) + ) + + ":" + + "/root/.oci" + ], + "env": [ + { + "name": "operator", + "value": f"{self.operator_config['type']}:{self.operator_config['version']}", + } + ], + }, + operator_runtime.PythonRuntime.type: {}, + } + + with AuthContext(auth=self.auth_type, profile=self.profile): + note = ( + "# This YAML specification was auto generated by the " + "`ads operator init` command.\n" + "# The more details about the operator's runtime YAML " + "specification can be found in the ADS documentation:\n" + "# https://accelerated-data-science.readthedocs.io/en/latest \n\n" + ) + + return ( + operator_runtime_const.RUNTIME_TYPE_MAP[runtime_type] + .init(**RUNTIME_KWARGS_MAP[runtime_type]) + .to_yaml( + uri=uri, + overwrite=overwrite, + note=note, + **kwargs, + ) + ) diff --git a/ads/opctl/cli.py b/ads/opctl/cli.py index 9b43ba0c3..f97fcd2df 100644 --- a/ads/opctl/cli.py +++ b/ads/opctl/cli.py @@ -14,6 +14,7 @@ import ads.opctl.conda.cli import ads.opctl.distributed.cli import ads.opctl.model.cli +import ads.opctl.operator.cli import ads.opctl.spark.cli from ads.common import auth as authutil from ads.common.auth import AuthType @@ -23,7 +24,6 @@ from ads.opctl.cmds import deactivate as deactivate_cmd from ads.opctl.cmds import delete as delete_cmd from ads.opctl.cmds import init as init_cmd -from ads.opctl.cmds import init_operator as init_operator_cmd from ads.opctl.cmds import init_vscode as init_vscode_cmd from ads.opctl.cmds import predict as predict_cmd from ads.opctl.cmds import run as run_cmd @@ -36,6 +36,7 @@ RESOURCE_TYPE, RUNTIME_TYPE, ) +from ads.opctl.decorator.common import with_auth from ads.opctl.utils import build_image as build_image_cmd from ads.opctl.utils 
import publish_image as publish_image_cmd from ads.opctl.utils import suppress_traceback @@ -51,11 +52,12 @@ def commands(): @click.help_option("--help", "-h") @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) def configure(debug): + """Sets up the initial configurations for the ADS OPCTL.""" suppress_traceback(debug)(configure_cmd)() @commands.command() -@click.argument("image-type", type=click.Choice(["job-local", "ads-ops-base"])) +@click.argument("image-type", type=click.Choice(["job-local"])) @click.help_option("--help", "-h") @click.option( "--gpu", @@ -65,23 +67,10 @@ def configure(debug): default=False, required=False, ) -@click.option( - "--source-folder", - "-s", - help="when building custom operator image, source folder of the custom operator", - default=None, - required=False, -) -@click.option( - "--image", - "-i", - help="image name, used when building custom image", - default=None, - required=False, -) @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) -def build_image(image_type, gpu, source_folder, image, debug): - suppress_traceback(debug)(build_image_cmd)(image_type, gpu, source_folder, image) +def build_image(image_type, gpu, debug): + """Builds the local Data Science Jobs image.""" + suppress_traceback(debug)(build_image_cmd)(image_type, gpu) @commands.command() @@ -98,6 +87,7 @@ def build_image(image_type, gpu, source_folder, image, debug): @click.help_option("--help", "-h") @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) def publish_image(**kwargs): + """Publishes image to the OCI Container Registry.""" debug = kwargs.pop("debug") if kwargs.get("registry", None): registry = kwargs["registry"] @@ -195,7 +185,7 @@ def init_vscode(**kwargs): default=authutil.DEFAULT_LOCATION, ), click.option( - "--oci-profile", help="oci config profile", required=False, default=authutil.DEFAULT_PROFILE + "--oci-profile", help="oci config profile", required=False, default=None ), click.option( "--conf-file", @@ -382,29 +372,16 @@ def _add_options(func): required=False, default=None, ) -def run(file, **kwargs): +@with_auth +def run(file, debug, **kwargs): """ - Runs the workload on the targeted backend. When run `distributed` yaml spec, the backend is always OCI Data Science - Jobs + Runs the operator with the given specification on the targeted backend. + For the distributed backend, the operator is always run as a OCI Data Science job. 
""" - debug = kwargs["debug"] config = {} if file: - if os.path.exists(file): - auth = {} - if kwargs["auth"]: - auth = authutil.create_signer( - auth_type=kwargs["auth"], - oci_config_location=kwargs["oci_config"], - profile=kwargs["oci_profile"] - ) - else: - auth = authutil.default_signer() - - with fsspec.open(file, "r", **auth) as f: - config = suppress_traceback(debug)(yaml.safe_load)(f.read()) - else: - raise FileNotFoundError(f"{file} is not found") + with fsspec.open(file, "r", **authutil.default_signer()) as f: + config = suppress_traceback(debug)(yaml.safe_load)(f.read()) suppress_traceback(debug)(run_cmd)(config, **kwargs) @@ -437,30 +414,8 @@ def check(file, **kwargs): suppress_traceback(debug)(run_diagnostics_cmd)(config, **kwargs) -@commands.command() -@click.argument("operator_slug", nargs=1) -@click.option( - "--folder_path", - "-fp", - help="the name of the folder wherein to put the operator code", - multiple=True, - required=False, - default=None, -) -@add_options(_options) -def init_operator(**kwargs): - suppress_traceback(kwargs["debug"])(init_operator_cmd)(**kwargs) - - @commands.command() @click.argument("ocid", nargs=1) -@add_options(_model_deployment_options) -@click.option( - "--conda-pack-folder", - required=False, - default=None, - help="folder where conda packs are saved", -) @click.option( "--auth", "-a", @@ -475,6 +430,7 @@ def init_operator(**kwargs): ) @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) def delete(**kwargs): + """Deletes a data science service resource.""" suppress_traceback(kwargs["debug"])(delete_cmd)(**kwargs) @@ -501,16 +457,18 @@ def delete(**kwargs): ) @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) def cancel(**kwargs): + """Aborts the execution of the OCI resource run.""" suppress_traceback(kwargs["debug"])(cancel_cmd)(**kwargs) @commands.command() @click.argument("ocid", nargs=1) +@click.option("--debug", "-d", help="Set debug mode", is_flag=True, default=False) @click.option( - "--log-type", - help="the type of logging. Allowed value: `custom_log` and `service_log` for pipeline, `access` and `predict` for model deployment.", - required=False, - default=None + "--log-type", + help="the type of logging. Allowed value: `custom_log` and `service_log` for pipeline, `access` and `predict` for model deployment.", + required=False, + default=None, ) @click.option( "--log-filter", @@ -530,7 +488,7 @@ def cancel(**kwargs): help="time in seconds to keep updating the logs after the job run finished for job.", type=int, required=False, - default=90 + default=90, ) @click.option( "--conda-pack-folder", @@ -553,7 +511,7 @@ def cancel(**kwargs): @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) def watch(**kwargs): """ - ``tail`` logs form a job run, dataflow run or pipeline run. + Tails the logs form a job run, data flow run or pipeline run. Connects to the logging service that was configured with the JobRun, Application Run or Pipeline Run and streams the logs. 
""" suppress_traceback(kwargs["debug"])(watch_cmd)(**kwargs) @@ -561,13 +519,7 @@ def watch(**kwargs): @commands.command() @click.argument("ocid", nargs=1) -@add_options(_model_deployment_options) -@click.option( - "--conda-pack-folder", - required=False, - default=None, - help="folder where conda packs are saved", -) +@click.option("--debug", "-d", help="Set debug mode", is_flag=True, default=False) @click.option( "--auth", "-a", @@ -583,13 +535,14 @@ def watch(**kwargs): @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) def activate(**kwargs): """ - Activates a data science service. + Activates a data science service resource. """ suppress_traceback(kwargs["debug"])(activate_cmd)(**kwargs) @commands.command() @click.argument("ocid", nargs=1) +@click.option("--debug", "-d", help="Set debug mode", is_flag=True, default=False) @add_options(_model_deployment_options) @click.option( "--conda-pack-folder", @@ -612,7 +565,7 @@ def activate(**kwargs): @click.option("--debug", "-d", help="set debug mode", is_flag=True, default=False) def deactivate(**kwargs): """ - Deactivates a data science service. + Deactivates a data science service resource. """ suppress_traceback(kwargs["debug"])(deactivate_cmd)(**kwargs) diff --git a/ads/opctl/cmds.py b/ads/opctl/cmds.py index fdd392987..26c71e47b 100644 --- a/ads/opctl/cmds.py +++ b/ads/opctl/cmds.py @@ -16,7 +16,6 @@ from ads.common.auth import AuthContext, AuthType from ads.common.extended_enum import ExtendedEnumMeta from ads.common.oci_datascience import DSCNotebookSession -from ads.opctl import logger from ads.opctl.backend.ads_dataflow import DataFlowBackend from ads.opctl.backend.ads_ml_job import MLJobBackend, MLJobDistributedBackend from ads.opctl.backend.ads_ml_pipeline import PipelineBackend @@ -25,6 +24,7 @@ LocalBackend, LocalBackendDistributed, LocalModelDeploymentBackend, + LocalOperatorBackend, LocalPipelineBackend, ) from ads.opctl.config.base import ConfigProcessor @@ -55,6 +55,9 @@ update_ini, verify_and_publish_image, ) +from ads.opctl.operator.common.backend_factory import ( + BackendFactory as OperatorBackendFactory, +) from ads.opctl.utils import get_service_pack_prefix, is_in_notebook_session @@ -98,6 +101,7 @@ class _BackendFactory: BACKEND_NAME.DATAFLOW.value: DataFlowBackend, BACKEND_NAME.PIPELINE.value: PipelineBackend, BACKEND_NAME.MODEL_DEPLOYMENT.value: ModelDeploymentBackend, + BACKEND_NAME.OPERATOR_LOCAL.value: LocalOperatorBackend, } LOCAL_BACKENDS_MAP = { @@ -154,6 +158,7 @@ def _save_yaml(yaml_content, **kwargs): f.write(yaml_content) print(f"Job run info saved to {yaml_path}") + def run(config: Dict, **kwargs) -> Dict: """ Run a job given configuration and command line args passed in (kwargs). 
@@ -172,7 +177,23 @@ def run(config: Dict, **kwargs) -> Dict: """ if config: p = ConfigProcessor(config).step(ConfigMerger, **kwargs) - if p.config["kind"] != BACKEND_NAME.LOCAL.value and p.config["kind"] != "distributed": + try: + return OperatorBackendFactory.backend( + config=p, + backend=p.config["execution"].get("backend"), + **{ + key: value + for key, value in kwargs.items() + if key not in ("backend", "config") + }, + ).run(**kwargs) + except RuntimeError: + pass + + if ( + p.config["kind"] != BACKEND_NAME.LOCAL.value + and p.config["kind"] != "distributed" + ): p.config["execution"]["backend"] = p.config["kind"] return _BackendFactory(p.config).backend.apply() else: @@ -321,31 +342,6 @@ def _update_env_vars(config, env_vars: List): return config -def init_operator(**kwargs) -> str: - """ - Initialize the resources for an operator - - Parameters - ---------- - kwargs: dict - keyword argument, stores command line args - Returns - ------- - folder_path: str - a path to the folder with all of the resources - """ - # TODO: confirm that operator slug is in the set of valid operator slugs - assert kwargs["operator_slug"] == "dask_cluster" - - if kwargs.get("folder_path"): - kwargs["operator_folder_path"] = kwargs.pop("folder_path")[0] - else: - kwargs["operator_folder_path"] = kwargs["operator_slug"] - p = ConfigProcessor().step(ConfigMerger, **kwargs) - print(f"config check: {p.config}") - return _BackendFactory(p.config).backend.init_operator() - - def delete(**kwargs) -> None: """ Delete a MLJob/DataFlow run. @@ -388,7 +384,7 @@ def cancel(**kwargs) -> None: ---------- kwargs: dict keyword argument, stores command line args - + Returns ------- None @@ -400,9 +396,7 @@ def cancel(**kwargs) -> None: or DataScienceResourceRun.PIPELINE_RUN in kwargs["ocid"] ): kwargs["run_id"] = kwargs.pop("ocid") - elif ( - DataScienceResource.JOB in kwargs["ocid"] - ): + elif DataScienceResource.JOB in kwargs["ocid"]: kwargs["id"] = kwargs.pop("ocid") else: raise ValueError(f"{kwargs['ocid']} is invalid or not supported.") @@ -543,6 +537,12 @@ def configure() -> None: if "CONDA" not in config_parser: config_parser["CONDA"] = {} + oci_auth = click.prompt( + text="Default OCI authentication type:", + type=click.Choice(AuthType.values()), + default=None, + ) + oci_config_path = click.prompt( "OCI config path:", default=config_parser["OCI"].get("oci_config", DEFAULT_OCI_CONFIG_FILE), @@ -558,6 +558,7 @@ def configure() -> None: config_parser["OCI"] = { "oci_config": oci_config_path, "oci_profile": oci_profile, + "auth": oci_auth, } conda_pack_path = click.prompt( "Conda pack install folder:", @@ -604,7 +605,7 @@ def configure() -> None: ("docker_registry", ""), ("conda_pack_os_prefix", "in the format oci://@/"), ("memory_in_gbs", ""), - ("ocpus", "") + ("ocpus", ""), ] _set_service_configurations( ADS_JOBS_CONFIG_FILE_NAME, @@ -634,7 +635,7 @@ def configure() -> None: ("driver_shape_memory_in_gbs", ""), ("driver_shape_ocpus", ""), ("executor_shape_memory_in_gbs", ""), - ("executor_shape_ocpus", "") + ("executor_shape_ocpus", ""), ] _set_service_configurations( ADS_DATAFLOW_CONFIG_FILE_NAME, @@ -685,7 +686,7 @@ def configure() -> None: ("replica", ""), ("web_concurrency", ""), ("memory_in_gbs", ""), - ("ocpus", "") + ("ocpus", ""), ] _set_service_configurations( diff --git a/ads/opctl/conda/cli.py b/ads/opctl/conda/cli.py index b08d9cb08..aac63eb51 100644 --- a/ads/opctl/conda/cli.py +++ b/ads/opctl/conda/cli.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2022 Oracle 
and/or its affiliates. +# Copyright (c) 2022, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import click @@ -17,6 +17,7 @@ @click.group("conda") @click.help_option("--help", "-h") def commands(): + "The CLI to assist in the management of conda environments." pass diff --git a/ads/opctl/config/merger.py b/ads/opctl/config/merger.py index e7063665a..96cf076e7 100644 --- a/ads/opctl/config/merger.py +++ b/ads/opctl/config/merger.py @@ -43,6 +43,10 @@ class ConfigMerger(ConfigProcessor): def process(self, **kwargs) -> None: config_string = Template(json.dumps(self.config)).safe_substitute(os.environ) self.config = json.loads(config_string) + + if "runtime" not in self.config: + self.config["runtime"] = {} + # 1. merge and overwrite values from command line args self._merge_config_with_cmd_args(kwargs) # 1.5 merge environment variables @@ -61,7 +65,6 @@ def process(self, **kwargs) -> None: self._config_flex_shape_details() - logger.debug(f"Config: {self.config}") return self def _merge_config_with_cmd_args(self, cmd_args: Dict) -> None: @@ -113,9 +116,13 @@ def _fill_config_with_defaults(self, ads_config_path: str) -> None: # set default auth if not self.config["execution"].get("auth", None): if is_in_notebook_session(): - self.config["execution"]["auth"] = AuthType.RESOURCE_PRINCIPAL + self.config["execution"]["auth"] = ( + exec_config.get("auth") or AuthType.RESOURCE_PRINCIPAL + ) else: - self.config["execution"]["auth"] = AuthType.API_KEY + self.config["execution"]["auth"] = ( + exec_config.get("auth") or AuthType.API_KEY + ) # determine profile if self.config["execution"]["auth"] != AuthType.API_KEY: profile = self.config["execution"]["auth"].upper() @@ -127,7 +134,6 @@ def _fill_config_with_defaults(self, ads_config_path: str) -> None: ) self.config["execution"]["oci_profile"] = profile # loading config for corresponding profile - logger.info(f"Loading service config for profile {profile}.") infra_config = self._get_service_config(profile, ads_config_path) if infra_config.get( "conda_pack_os_prefix" @@ -169,11 +175,12 @@ def _get_config_from_config_ini(ads_config_folder: str) -> Dict: return { "oci_config": parser["OCI"].get("oci_config"), "oci_profile": parser["OCI"].get("oci_profile"), + "auth": parser["OCI"].get("auth"), "conda_pack_folder": parser["CONDA"].get("conda_pack_folder"), "conda_pack_os_prefix": parser["CONDA"].get("conda_pack_os_prefix"), } else: - logger.info( + logger.debug( f"{os.path.join(ads_config_folder, 'config.ini')} does not exist. No config loaded." ) return {} @@ -193,8 +200,10 @@ def _get_service_config(self, oci_profile: str, ads_config_folder: str) -> Dict: parser = read_from_ini(os.path.join(ads_config_folder, config_file)) if oci_profile in parser: return parser[oci_profile] + if DEFAULT_PROFILE in parser: + return parser[DEFAULT_PROFILE] else: - logger.info( + logger.debug( f"{os.path.join(ads_config_folder, config_file)} does not exist. No config loaded." ) return {} @@ -214,7 +223,7 @@ def _config_flex_shape_details(self): ): raise ValueError( "Parameters `ocpus` and `memory_in_gbs` must be provided for using flex shape. " - "Call `ads opctl config` to specify." + "Call `ads opctl configure` to specify." ) infrastructure["shape_config_details"] = { "ocpus": infrastructure.pop("ocpus"), @@ -235,7 +244,7 @@ def _config_flex_shape_details(self): if parameter not in infrastructure: raise ValueError( f"Parameters {parameter} must be provided for using flex shape. 
" - "Call `ads opctl config` to specify." + "Call `ads opctl configure` to specify." ) infrastructure["driver_shape_config"] = { "ocpus": infrastructure.pop("driver_shape_ocpus"), diff --git a/ads/opctl/config/resolver.py b/ads/opctl/config/resolver.py index 931dd1c5c..1f1a9c14c 100644 --- a/ads/opctl/config/resolver.py +++ b/ads/opctl/config/resolver.py @@ -111,8 +111,6 @@ def process(self): self._resolve_mounted_volumes() self._resolve_job_name() - logger.debug(f"Config: {self.config}") - return self def _is_ads_operator(self) -> bool: diff --git a/ads/opctl/config/versioner.py b/ads/opctl/config/versioner.py index e73517fc3..51ee22d2b 100644 --- a/ads/opctl/config/versioner.py +++ b/ads/opctl/config/versioner.py @@ -64,6 +64,5 @@ def process(self): # - user runs ADS operator -> name/YAML # TODO: build this out - logger.debug(f"Config: {self.config}") return self diff --git a/ads/opctl/config/yaml_parsers/base.py b/ads/opctl/config/yaml_parsers/base.py index 7f3a8d035..7f25bb544 100644 --- a/ads/opctl/config/yaml_parsers/base.py +++ b/ads/opctl/config/yaml_parsers/base.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2022 Oracle and/or its affiliates. +# Copyright (c) 2022, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ from logging import getLogger @@ -42,7 +42,6 @@ def parse(self): @classmethod def parse_content(cls, file): - logger.debug(f"Config filename is {file}") yaml_spec = {} if isinstance(file, dict): yaml_spec = file @@ -53,10 +52,7 @@ def parse_content(cls, file): parsed_output = None if kind: className = f"{kind[0].upper()}{kind[1:]}SpecParser" - logger.debug(f"kind: {kind}") - logger.debug(f"Parser Class name: {className}") m = importlib.import_module("ads.opctl.config.yaml_parsers") parser = getattr(m, className) parsed_output = parser(yaml_spec).parse() - logger.debug(f"Parsed Output: {parsed_output}") return parsed_output diff --git a/ads/opctl/constants.py b/ads/opctl/constants.py index 75b5cbfad..0e62a3e0a 100644 --- a/ads/opctl/constants.py +++ b/ads/opctl/constants.py @@ -12,10 +12,8 @@ DEFAULT_MODEL_FOLDER = "~/.ads_ops/models" CONDA_PACK_OS_PREFIX_FORMAT = "oci://@/" DEFAULT_ADS_CONFIG_FOLDER = "~/.ads_ops" -OPS_IMAGE_BASE = "ads-operators-base" ML_JOB_IMAGE = "ml-job" ML_JOB_GPU_IMAGE = "ml-job-gpu" -OPS_IMAGE_GPU_BASE = "ads-operators-gpu-base" DEFAULT_MANIFEST_VERSION = "1.0" ADS_CONFIG_FILE_NAME = "config.ini" ADS_JOBS_CONFIG_FILE_NAME = "ml_job_config.ini" @@ -31,6 +29,10 @@ DEFAULT_SPECIFICATION_FILE_NAME = "oci-datascience-template.yaml" DEFAULT_MODEL_DEPLOYMENT_FOLDER = "/opt/ds/model/deployed_model/" +# OPERATOR +OPERATOR_MODULE_PATH = "ads.opctl.operator.lowcode" +OPERATOR_IMAGE_WORK_DIR = "/etc/operator" + class RUNTIME_TYPE(ExtendedEnum): PYTHON = "python" @@ -49,6 +51,7 @@ class RESOURCE_TYPE(ExtendedEnum): DATAFLOW = "dataflow" PIPELINE = "pipeline" MODEL_DEPLOYMENT = "deployment" + OPERATOR = "operator" class BACKEND_NAME(ExtendedEnum): @@ -57,3 +60,4 @@ class BACKEND_NAME(ExtendedEnum): PIPELINE = "pipeline" MODEL_DEPLOYMENT = "deployment" LOCAL = "local" + OPERATOR_LOCAL = "operator.local" diff --git a/ads/opctl/decorator/common.py b/ads/opctl/decorator/common.py index 98100977c..dd5a3bb02 100644 --- a/ads/opctl/decorator/common.py +++ b/ads/opctl/decorator/common.py @@ -4,22 +4,100 @@ # Copyright (c) 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -from typing import Dict, Callable from functools import wraps +from typing import Callable, Dict, List + +from ads.common.auth import AuthContext +from ads.opctl import logger +from ads.opctl.config.base import ConfigProcessor +from ads.opctl.config.merger import ConfigMerger RUN_ID_FIELD = "run_id" -def print_watch_command(func: callable)->Callable: + +class OpctlEnvironmentError(Exception): + """The custom error to validate OPCTL environment.""" + + NOT_SUPPORTED_ENVIRONMENTS = ( + "Notebook Sessions", + "Data Science Jobs", + "ML Pipelines", + "Data Flow Applications", + ) + + def __init__(self): + super().__init__( + "This operation cannot be executed in the current environment. " + f"It is not supported in: {', '.join(self.NOT_SUPPORTED_ENVIRONMENTS)}." + ) + + +def print_watch_command(func: callable) -> Callable: """The decorator to help build the `opctl watch` command.""" + @wraps(func) - def wrapper(*args, **kwargs)->Dict: + def wrapper(*args: List, **kwargs: Dict) -> Dict: result = func(*args, **kwargs) if result and isinstance(result, Dict) and RUN_ID_FIELD in result: msg_header = ( - f"{'*' * 40} To monitor the progress of a task, execute the following command {'*' * 40}" + f"{'*' * 40} To monitor the progress of the task, " + f"execute the following command {'*' * 40}" ) print(msg_header) print(f"ads opctl watch {result[RUN_ID_FIELD]}") print("*" * len(msg_header)) return result - return wrapper \ No newline at end of file + + return wrapper + + +def validate_environment(func: callable) -> Callable: + """Validates whether an opctl command can be executed in the current environment.""" + + @wraps(func) + def wrapper(*args: List, **kwargs: Dict) -> Dict: + try: + import docker + + docker.from_env().version() + except Exception as ex: + logger.debug(ex) + raise OpctlEnvironmentError() + + return func(*args, **kwargs) + + return wrapper + + +def click_options(options): + """The decorator to help group the click options.""" + + def _add_options(func): + for option in reversed(options): + func = option(func) + return func + + return _add_options + + +def with_auth(func: Callable) -> Callable: + """The decorator to add AuthContext to the method.""" + + @wraps(func) + def wrapper(*args, **kwargs) -> Dict: + p = ConfigProcessor().step(ConfigMerger, **kwargs) + + with AuthContext( + **{ + key: value + for key, value in { + "auth": p.config["execution"]["auth"], + "oci_config_location": p.config["execution"]["oci_config"], + "profile": p.config["execution"]["oci_profile"], + }.items() + if value + } + ): + return func(*args, **kwargs) + + return wrapper diff --git a/ads/opctl/distributed/cli.py b/ads/opctl/distributed/cli.py index 58507985a..838f58f8c 100644 --- a/ads/opctl/distributed/cli.py +++ b/ads/opctl/distributed/cli.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2022 Oracle and/or its affiliates. +# Copyright (c) 2022, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import os @@ -21,6 +21,7 @@ @click.group("distributed-training") @click.help_option("--help", "-h") def commands(): + "The CLI to assist in the management of the distributed training." 
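# A small usage sketch (an assumption, not taken from the diff) for the new
# `with_auth` decorator defined in ads/opctl/decorator/common.py above: applied as
# the innermost decorator of a click command, it merges the opctl config with the
# CLI kwargs and runs the command body inside the resulting AuthContext.
import click

from ads.opctl.decorator.common import with_auth


@click.command()
@click.option("--auth", "-a", default=None)
@click.option("--oci-config", default=None)
@click.option("--oci-profile", default=None)
@with_auth
def ping(**kwargs):
    # By the time the body executes, the ADS authentication defaults reflect the
    # resolved auth type, config location and profile.
    click.echo("authenticated")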
pass diff --git a/ads/opctl/docker/Dockerfile b/ads/opctl/docker/Dockerfile deleted file mode 100644 index c91343106..000000000 --- a/ads/opctl/docker/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2021 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -FROM ghcr.io/oracle/oraclelinux7-instantclient:19 AS base - -RUN rm -rf /var/cache/yum/* && yum clean all && yum install -y gcc mesa-libGL vim && rm -rf /var/cache/yum/* -RUN curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh >> miniconda.sh -RUN bash ./miniconda.sh -b -p /miniconda; rm ./miniconda.sh; -ENV PATH="/miniconda/bin:$PATH" - -ENV HOME /home/datascience -RUN mkdir -p /etc/datascience -WORKDIR /etc/datascience - -COPY operators/environment.yaml operators/environment.yaml -RUN conda env create -f operators/environment.yaml --name op_env && conda clean -afy -ENV PATH="/miniconda/envs/op_env/bin:$PATH" - -RUN /bin/bash -c "source activate op_env" -COPY operators/run.py operators/run.py -CMD bash - -FROM base -COPY docker/merge_dependencies.py merge_dependencies.py -COPY operators/ operators/ - -RUN pip install pyyaml click && python merge_dependencies.py environment.yaml -RUN conda env update -f environment.yaml --name op_env && conda clean -afy -RUN source activate op_env diff --git a/ads/opctl/docker/merge_dependencies.py b/ads/opctl/docker/merge_dependencies.py deleted file mode 100644 index e275e63ed..000000000 --- a/ads/opctl/docker/merge_dependencies.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8; -*- - -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -import yaml -import os -import click - - -@click.command() -@click.argument("output_path") -@click.option( - "--source-folder", - "-s", - help="source folder to search for environment yaml", - required=False, - default=None, -) -def merge(output_path, source_folder): - merged = _merge(source_folder) - with open(output_path, "w") as f: - yaml.safe_dump(merged, f) - - -def _merge(source_folder=None): - conda_dependencies = set([]) - pip_dependencies = set([]) - if not source_folder: - source_folder = os.path.join("operators") - for dirpath, dirnames, filenames in os.walk(source_folder): - for fname in filenames: - if fname == "environment.yaml": - env_yaml = os.path.join(dirpath, fname) - print(env_yaml) - with open(env_yaml, "r") as f: - dependencies = yaml.safe_load(f.read())["dependencies"] - for dep in dependencies: - if isinstance(dep, dict) and "pip" in dep: - pip_dependencies.update(dep["pip"]) - else: - conda_dependencies.add(dep) - conda_dependencies.add("pip") - merged_dependencies = { - "dependencies": list(conda_dependencies) + [{"pip": list(pip_dependencies)}] - } - return merged_dependencies - - -if __name__ == "__main__": - merge() diff --git a/ads/opctl/docker/operator/.dockerignore b/ads/opctl/docker/operator/.dockerignore new file mode 100644 index 000000000..e69de29bb diff --git a/ads/opctl/docker/operator/Dockerfile b/ads/opctl/docker/operator/Dockerfile new file mode 100644 index 000000000..905b101a0 --- /dev/null +++ b/ads/opctl/docker/operator/Dockerfile @@ -0,0 +1,41 @@ +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +FROM ghcr.io/oracle/oraclelinux8-instantclient:21 as base + +RUN \ + rm -rf /var/cache/yum/* && \ + yum install -y gcc make patch vim iproute net-tools git && \ + yum clean all && \ + rm -rf /var/cache/yum/* + +########################### CONDA INSTALLATION ######################################## +RUN curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh >> miniconda.sh +RUN bash ./miniconda.sh -b -p /miniconda; rm ./miniconda.sh; +ENV PATH="/miniconda/bin:$PATH" + +USER root + +ARG CONDA_ENV_FILE=environment.yaml +ARG CONDA_ENV_NAME=operator +ARG OPERATOR_DIR=/etc/operator +ARG RND + +COPY ./${CONDA_ENV_FILE} /opt/env.yaml +RUN conda install conda-forge::mamba && mamba env create -f /opt/env.yaml --name ${CONDA_ENV_NAME} && conda clean -afy +ENV PATH="/miniconda/envs/${CONDA_ENV_NAME}/bin:$PATH" + +RUN conda init bash && source ~/.bashrc && conda activate ${CONDA_ENV_NAME} + +########################### SETUP WORKDIR ######################################## +RUN mkdir ${OPERATOR_DIR} + +ENV OPERATOR_DIR=${OPERATOR_DIR} +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} + +WORKDIR ${OPERATOR_DIR} + +RUN echo "conda activate $CONDA_ENV_NAME">>/root/.bashrc +SHELL ["/bin/bash", "--login", "-c"] + +ENTRYPOINT [ "bash", "--login" , "-c"] diff --git a/ads/opctl/docker/Dockerfile.gpu b/ads/opctl/docker/operator/Dockerfile.gpu similarity index 56% rename from ads/opctl/docker/Dockerfile.gpu rename to ads/opctl/docker/operator/Dockerfile.gpu index 86b3560dd..f74f924dd 100644 --- a/ads/opctl/docker/Dockerfile.gpu +++ b/ads/opctl/docker/operator/Dockerfile.gpu @@ -1,21 +1,24 @@ -# Copyright (c) 2021 Oracle and/or its affiliates. +# Copyright (c) 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -FROM ghcr.io/oracle/oraclelinux7-instantclient:19 AS base +FROM ghcr.io/oracle/oraclelinux8-instantclient:21 as base -RUN yum install -y tar gzip +RUN \ + rm -rf /var/cache/yum/* && \ + yum install -y gcc make patch vim iproute net-tools git && \ + yum clean all && \ + rm -rf /var/cache/yum/* ########################### CUDA INSTALLATION ######################################## +#Reference: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/centos7/runtime/cudnn7/Dockerfile +#Reference: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/centos7/runtime/Dockerfile +#Reference: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/centos7/base/Dockerfile -#Reference: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/centos7/10.1/runtime/cudnn7/Dockerfile -#Reference: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/centos7/10.1/runtime/Dockerfile -#Reference: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/centos7/10.1/base/Dockerfile - -RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ -curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/7fa2af80.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ +RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \ +curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict - -COPY docker/cuda.repo /etc/yum.repos.d/cuda.repo +COPY cuda.repo /etc/yum.repos.d/cuda.repo ENV CUDA_VERSION 10.1.243 @@ -33,7 +36,7 @@ RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 +ENV LD_LIBRARY_PATH /lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 # nvidia-container-runtime ENV NVIDIA_VISIBLE_DEVICES all @@ -45,34 +48,38 @@ LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" RUN CUDNN_DOWNLOAD_SUM=7eaec8039a2c30ab0bc758d303588767693def6bf49b22485a2c00bf2e136cb3 && \ curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.5/cudnn-10.1-linux-x64-v7.6.5.32.tgz -O && \ - echo "$CUDNN_DOWNLOAD_SUM cudnn-10.1-linux-x64-v7.6.5.32.tgz" | sha256sum -c - -RUN ls -ltr -RUN tar --no-same-owner -xzf cudnn-10.1-linux-x64-v7.6.5.32.tgz -C /usr/local --wildcards 'cuda/lib64/libcudnn.so.*' && \ + echo "$CUDNN_DOWNLOAD_SUM cudnn-10.1-linux-x64-v7.6.5.32.tgz" | sha256sum -c - && \ + tar --no-same-owner -xzf cudnn-10.1-linux-x64-v7.6.5.32.tgz -C /usr/local --wildcards 'cuda/lib64/libcudnn.so.*' && \ rm cudnn-10.1-linux-x64-v7.6.5.32.tgz && \ ldconfig -##############################################CUDA END######################## -RUN rm -rf /var/cache/yum/* && yum clean all && yum install -y gcc mesa-libGL vim && rm -rf /var/cache/yum/* +########################### CONDA INSTALLATION ######################################## RUN curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh >> miniconda.sh RUN bash ./miniconda.sh -b -p /miniconda; rm ./miniconda.sh; ENV PATH="/miniconda/bin:$PATH" -ENV HOME /home/datascience -RUN mkdir 
-p /etc/datascience -WORKDIR /etc/datascience +USER root + +ARG CONDA_ENV_FILE=environment.yaml +ARG CONDA_ENV_NAME=operator +ARG OPERATOR_DIR=/etc/operator +ARG RND + +COPY ./${CONDA_ENV_FILE} /opt/env.yaml +RUN conda install conda-forge::mamba && mamba env create -f /opt/env.yaml --name ${CONDA_ENV_NAME} && conda clean -afy +ENV PATH="/miniconda/envs/${CONDA_ENV_NAME}}/bin:$PATH" + +RUN conda init bash && source ~/.bashrc && conda activate ${CONDA_ENV_NAME} + +########################### SETUP WORKDIR ######################################## +RUN mkdir ${OPERATOR_DIR} -COPY operators/environment.yaml operators/environment.yaml -RUN conda env create -f operators/environment.yaml --name op_env && conda clean -afy -ENV PATH="/miniconda/envs/op_env/bin:$PATH" +ENV OPERATOR_DIR=${OPERATOR_DIR} +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} -RUN /bin/bash -c "source activate op_env" -COPY operators/run.py operators/run.py -CMD bash +WORKDIR ${OPERATOR_DIR} -FROM base -COPY docker/merge_dependencies.py merge_dependencies.py -COPY operators/ operators/ +RUN echo "conda activate $CONDA_ENV_NAME">>/root/.bashrc +SHELL ["/bin/bash", "--login", "-c"] -RUN pip install pyyaml click && python merge_dependencies.py environment.yaml -RUN conda env update -f environment.yaml --name op_env && conda clean -afy -RUN source activate op_env +ENTRYPOINT [ "bash", "--login" , "-c"] diff --git a/ads/opctl/docker/operator/cuda.repo b/ads/opctl/docker/operator/cuda.repo new file mode 100644 index 000000000..358420e3a --- /dev/null +++ b/ads/opctl/docker/operator/cuda.repo @@ -0,0 +1,6 @@ +[cuda] +name=cuda +baseurl=http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64 +enabled=1 +gpgcheck=1 +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA diff --git a/ads/opctl/docker/operator/environment.yaml b/ads/opctl/docker/operator/environment.yaml new file mode 100644 index 000000000..6bd6cac5e --- /dev/null +++ b/ads/opctl/docker/operator/environment.yaml @@ -0,0 +1,8 @@ +name: operator +channels: + - conda-forge +dependencies: + - python=3.8 + - pip + - pip: + - "git+https://github.com/oracle/accelerated-data-science.git@feature/forecasting#egg=oracle-ads" diff --git a/ads/opctl/model/cli.py b/ads/opctl/model/cli.py index 4588d4ba9..8d4875b4c 100644 --- a/ads/opctl/model/cli.py +++ b/ads/opctl/model/cli.py @@ -14,6 +14,7 @@ @click.group("model") @click.help_option("--help", "-h") def commands(): + "The CLI to assist in the management of the Data Science Model Deployment." pass diff --git a/ads/opctl/operator/README.md b/ads/opctl/operator/README.md new file mode 100644 index 000000000..ed5b91e11 --- /dev/null +++ b/ads/opctl/operator/README.md @@ -0,0 +1,4 @@ +## ML Operator +-------------- + +Welcome to ML operators. This readme will contain the instructions helping to configure and dispatch operators. diff --git a/ads/opctl/operator/__init__.py b/ads/opctl/operator/__init__.py new file mode 100644 index 000000000..dd01bb362 --- /dev/null +++ b/ads/opctl/operator/__init__.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os + + +def __registered_operators(): + """Gets the list of registered operators.""" + + target_dir = os.path.join(os.path.dirname(__file__), "lowcode") + return [ + f + for f in os.listdir(target_dir) + if os.path.isdir(os.path.join(target_dir, f)) and not f.startswith("__") + ] + + +__operators__ = __registered_operators() + + +class OperatorNotFoundError(Exception): + def __init__(self, operator: str): + super().__init__( + f"Operator: `{operator}` " + f"is not registered. Available operators: {__operators__}" + ) diff --git a/ads/opctl/operator/cli.py b/ads/opctl/operator/cli.py new file mode 100644 index 000000000..62baddb08 --- /dev/null +++ b/ads/opctl/operator/cli.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from typing import Any, Dict + +import click +import fsspec +import yaml +from ads.opctl.operator.common.utils import default_signer +from ads.common.auth import AuthType +from ads.common.object_storage_details import ObjectStorageDetails +from ads.opctl.constants import BACKEND_NAME, RUNTIME_TYPE +from ads.opctl.decorator.common import click_options, with_auth +from ads.opctl.utils import suppress_traceback + +from .__init__ import __operators__ +from .cmd import run as cmd_run +from .cmd import build_conda as cmd_build_conda +from .cmd import build_image as cmd_build_image +from .cmd import create as cmd_create +from .cmd import info as cmd_info +from .cmd import init as cmd_init +from .cmd import list as cmd_list +from .cmd import publish_conda as cmd_publish_conda +from .cmd import publish_image as cmd_publish_image +from .cmd import verify as cmd_verify + +DEBUG_OPTION = ( + click.option("--debug", "-d", help="Set debug mode.", is_flag=True, default=False), +) + +ADS_CONFIG_OPTION = ( + click.option( + "--ads-config", + help=( + "The folder where the ADS `config.ini` located. " + "The default location is: `~/.ads_ops` folder. " + "Check the `ads opctl configure --help` command to get details about the `config.ini`." + ), + required=False, + default=None, + ), +) + +OPERATOR_TYPE_OPTION = ( + click.option( + "--type", + "-t", + help=( + "The type of the operator. " + f"Available service operators: `{'`, `'.join(__operators__)}`." + ), + required=True, + ), +) + +AUTH_TYPE_OPTION = ( + click.option( + "--auth", + "-a", + help=( + "The authentication method to leverage OCI resources. " + "The default value will be taken from the ADS `config.ini` file. " + "Check the `ads opctl configure --help` command to get details about the `config.ini`." + ), + type=click.Choice(AuthType.values()), + default=None, + ), + click.option( + "--oci-profile", + help=( + "OCI profile name to use for authentication. " + "By default will be used the value specified in the ADS config file. " + "Check the `ads opctl configure --help` command to get details about the `config.ini`. " + ), + required=False, + default=None, + ), +) + + +@click.group("operator") +def commands(): + "The CLI to assist in the management of the ADS operators." 
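# An illustrative helper (not part of the diff) showing how the registry built in
# ads/opctl/operator/__init__.py above can be consumed: `__operators__` lists the
# folder names discovered under ads/opctl/operator/lowcode, and
# OperatorNotFoundError reports anything outside that list.
from ads.opctl.operator import OperatorNotFoundError, __operators__


def ensure_registered(operator_type: str) -> None:
    """Raises OperatorNotFoundError if the operator folder was not discovered."""
    if operator_type not in __operators__:
        raise OperatorNotFoundError(operator_type)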
+ pass + + +@commands.command() +@click_options(DEBUG_OPTION) +def list(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Prints the list of the registered operators.""" + suppress_traceback(debug)(cmd_list)(**kwargs) + + +@commands.command() +@click_options( + DEBUG_OPTION + OPERATOR_TYPE_OPTION + ADS_CONFIG_OPTION + AUTH_TYPE_OPTION +) +@with_auth +def info(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Prints the detailed information about the particular operator.""" + suppress_traceback(debug)(cmd_info)(**kwargs) + + +@commands.command() +@click_options( + DEBUG_OPTION + OPERATOR_TYPE_OPTION + ADS_CONFIG_OPTION + AUTH_TYPE_OPTION +) +@click.option( + "--output", + help=f"The folder name to save the resulting specification templates.", + required=False, + default=None, +) +@click.option( + "--overwrite", + "-o", + help="Overwrite result file if it already exists.", + is_flag=True, + default=False, +) +@click.option( + "--merge-config", + "-m", + help=( + "Merge the operator's configuration with various backend configurations, " + "resulting in multiple operator configurations, each paired with a distinct backend. " + "By default, the operator's configuration will remain distinct from the backend configuration." + ), + is_flag=True, + default=False, +) +@with_auth +def init(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Generates starter YAML configs for the operator.""" + suppress_traceback(debug)(cmd_init)(**kwargs) + + +@commands.command() +@click_options(DEBUG_OPTION + OPERATOR_TYPE_OPTION) +@click.option( + "--gpu", + "-g", + help="Build a GPU-enabled Docker image.", + is_flag=True, + default=False, + required=False, +) +@click.option( + "--rebuild-base-image", + "-r", + help="Rebuild operator's base image. This option is useful when developing a new operator.", + is_flag=True, + default=False, +) +@with_auth +def build_image(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Creates a new image for the specified operator.""" + suppress_traceback(debug)(cmd_build_image)(**kwargs) + + +@commands.command() +@click_options(DEBUG_OPTION + OPERATOR_TYPE_OPTION + ADS_CONFIG_OPTION) +@click.option( + "--registry", + "-r", + help="Registry to publish to. 
By default the value will be taken from the ADS opctl config.", + required=False, + default=None, +) +@with_auth +def publish_image(debug, **kwargs): + """Publishes an operator's image to the container registry.""" + suppress_traceback(debug)(cmd_publish_image)(**kwargs) + + +@commands.command(hidden=True) +@click_options(DEBUG_OPTION + OPERATOR_TYPE_OPTION + ADS_CONFIG_OPTION) +@click.option( + "--overwrite", + "-o", + help="Overwrite result file if it already exists.", + is_flag=True, + default=False, +) +@click.option( + "--output", + help="The folder to save the resulting specification template YAML.", + required=False, + default=None, +) +@with_auth +def create(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Creates new operator.""" + suppress_traceback(debug)(cmd_create)(**kwargs) + + +@commands.command() +@click_options(DEBUG_OPTION + ADS_CONFIG_OPTION + AUTH_TYPE_OPTION) +@click.option( + "--file", "-f", help="The path to resource YAML file.", required=True, default=None +) +@with_auth +def verify(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Verifies the operator config.""" + + with fsspec.open( + kwargs["file"], + "r", + **( + default_signer() if ObjectStorageDetails.is_oci_path(kwargs["file"]) else {} + ), + ) as f: + operator_spec = suppress_traceback(debug)(yaml.safe_load)(f.read()) + + suppress_traceback(debug)(cmd_verify)(operator_spec, **kwargs) + + +@commands.command() +@click_options(DEBUG_OPTION + OPERATOR_TYPE_OPTION + ADS_CONFIG_OPTION) +@click.option( + "--conda-pack-folder", + help=( + "The destination folder to save the conda environment. " + "By default will be used the path specified in the ADS config file generated " + "with `ads opctl configure` command." + ), + required=False, + default=None, +) +@click.option( + "--overwrite", + "-o", + help="Overwrite conda environment if it already exists.", + is_flag=True, + default=False, +) +@with_auth +def build_conda(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Creates a new conda environment for the specified operator.""" + suppress_traceback(debug)(cmd_build_conda)(**kwargs) + + +@commands.command() +@click_options( + DEBUG_OPTION + OPERATOR_TYPE_OPTION + ADS_CONFIG_OPTION + AUTH_TYPE_OPTION +) +@click.option( + "--conda-pack-folder", + help=( + "The source folder to search the conda environment. " + "By default will be used the path specified in the ADS config file generated " + "with `ads opctl configure` command." + ), + required=False, + default=None, +) +@click.option( + "--overwrite", + "-o", + help="Overwrite conda environment if it already exists.", + is_flag=True, + default=False, +) +@with_auth +def publish_conda(debug: bool, **kwargs: Dict[str, Any]) -> None: + """Publishes an operator's conda environment to the Object Storage bucket.""" + suppress_traceback(debug)(cmd_publish_conda)(**kwargs) + + +@commands.command() +@click_options(DEBUG_OPTION + ADS_CONFIG_OPTION + AUTH_TYPE_OPTION) +@click.option( + "--file", + "-f", + help="The path to the operator's specification YAML file.", + required=True, + default=None, +) +@click.option( + "--backend", + "-b", + help=( + "Backend name or the path to the operator's backend config YAML file. " + f"\n\nExample 1:\n\n`ads operator run -f operator.yaml -b {BACKEND_NAME.LOCAL.value}`\n\n" + "Supported backend names: " + f"{(BACKEND_NAME.JOB.value,BACKEND_NAME.JOB.value + '.' + RUNTIME_TYPE.CONTAINER.value,BACKEND_NAME.DATAFLOW.value,BACKEND_NAME.LOCAL.value,BACKEND_NAME.LOCAL.value + '.'+ RUNTIME_TYPE.CONTAINER.value,)}. 
" + "However some operators may support only a subset of these backends." + "\n\nExample 2:\n\n`ads operator run -f operator.yaml -b backend.yaml`\n\n" + "Use the `ads operator init --help` command to generate the operator's specification " + "and all required backend configs. Generating configs is optional and fully automated. " + ), + required=False, + default=None, +) +@click.option( + "--dry-run", + "-r", + default=False, + is_flag=True, + help="During dry run, the actual operation is not performed, only the steps are enumerated.", +) +@with_auth +def run(debug: bool, **kwargs: Dict[str, Any]) -> None: + """ + Runs the operator with the given specification on the targeted backend. + """ + operator_spec = {} + backend = kwargs.pop("backend") + + with fsspec.open( + kwargs["file"], + "r", + **( + default_signer() if ObjectStorageDetails.is_oci_path(kwargs["file"]) else {} + ), + ) as f: + operator_spec = suppress_traceback(debug)(yaml.safe_load)(f.read()) + + if backend and backend.lower().endswith((".yaml", ".yml")): + with fsspec.open( + backend, + "r", + **(default_signer() if ObjectStorageDetails.is_oci_path(backend) else {}), + ) as f: + backend = suppress_traceback(debug)(yaml.safe_load)(f.read()) + + suppress_traceback(debug)(cmd_run)(config=operator_spec, backend=backend, **kwargs) diff --git a/ads/opctl/operator/cmd.py b/ads/opctl/operator/cmd.py new file mode 100644 index 000000000..b6cf9b0a6 --- /dev/null +++ b/ads/opctl/operator/cmd.py @@ -0,0 +1,585 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import re +import runpy +import shutil +import tempfile +from typing import Any, Dict, Union + +import fsspec +import yaml +from tabulate import tabulate + +from ads.common import utils as ads_common_utils +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.opctl import logger +from ads.opctl.conda.cmds import create as conda_create +from ads.opctl.conda.cmds import publish as conda_publish +from ads.opctl.config.base import ConfigProcessor +from ads.opctl.config.merger import ConfigMerger +from ads.opctl.constants import DEFAULT_ADS_CONFIG_FOLDER +from ads.opctl.decorator.common import validate_environment +from ads.opctl.operator.common.const import ( + OPERATOR_BASE_DOCKER_FILE, + OPERATOR_BASE_DOCKER_GPU_FILE, + OPERATOR_BASE_GPU_IMAGE, + OPERATOR_BASE_IMAGE, +) +from ads.opctl.operator.common.operator_loader import OperatorInfo, OperatorLoader +from ads.opctl.utils import publish_image as publish_image_cmd + +from .__init__ import __operators__ +from .common import utils as operator_utils +from .common.backend_factory import BackendFactory +from .common.errors import ( + OperatorCondaNotFoundError, + OperatorImageNotFoundError, + OperatorSchemaYamlError, +) +from .common.operator_loader import _operator_info_list + + +def list() -> None: + """Prints the list of the registered service operators. + + Returns + ------- + None + """ + print( + tabulate( + ( + { + "Type": item.type, + "Version": item.version, + "Description": item.description, + } + for item in _operator_info_list() + ), + headers="keys", + ) + ) + + +@runtime_dependency(module="rich", install_from=OptionalDependency.OPCTL) +def info( + type: str, + **kwargs: Dict[str, Any], +) -> None: + """ + Prints the detailed information about the particular operator. 
+ + Parameters + ---------- + type: str + The type of the operator to get detailed. + kwargs: (Dict, optional). + Additional key value arguments. + + Returns + ------- + None + """ + from rich.console import Console + from rich.markdown import Markdown + + console = Console() + operator_info = OperatorLoader.from_uri(uri=type).load() + + operator_readme = None + if operator_info.path: + readme_file_path = os.path.join(operator_info.path, "README.md") + + if os.path.exists(readme_file_path): + with open(readme_file_path, "r") as readme_file: + operator_readme = readme_file.read() + + console.print( + Markdown( + operator_readme + or operator_info.description + or "The description for this operator has not been specified." + ) + ) + + +def init( + type: str, + output: Union[str, None] = None, + overwrite: bool = False, + merge_config: bool = False, + ads_config: Union[str, None] = None, + **kwargs: Dict[str, Any], +) -> None: + """ + Generates a starter YAML configurations for the operator. + + Parameters + ---------- + type: str + The type of the operator to generate the specification YAML. + output: (str, optional). Defaults to None. + The path to the folder to save the resulting specification templates. + The Tmp folder will be created in case when `output` is not provided. + overwrite: (bool, optional). Defaults to False. + Whether to overwrite the result specification YAML if exists. + merge_config: (bool, optional). Defaults to False. + Whether to merge the generated specification YAML with the backend configuration. + ads_config: (str, optional) + The folder where the ads opctl config located. + kwargs: (Dict, optional). + Additional key value arguments. + + Raises + ------ + ValueError + If `type` not specified. + OperatorNotFoundError + If `operator` not found. + """ + # validation + if not type: + raise ValueError(f"The `type` attribute must be specified.") + + # load operator info + operator_info: OperatorInfo = OperatorLoader.from_uri(uri=type).load() + + # create TMP folder if one is not provided by user + if output: + output = os.path.join(output, "") + if ads_common_utils.is_path_exists(uri=output) and not overwrite: + raise ValueError( + f"The `{output}` already exists, use `--overwrite` option if you wish to overwrite." + ) + else: + overwrite = True + output = os.path.join(tempfile.TemporaryDirectory().name, "") + + # generating operator specification + operator_config = {} + try: + operator_cmd_module = runpy.run_module( + f"{operator_info.type}.cmd", run_name="init" + ) + operator_config = operator_cmd_module.get("init", lambda: "")( + **{**kwargs, **{"type": type}} + ) + + if not merge_config: + with fsspec.open( + os.path.join(output, f"{operator_info.type}.yaml"), mode="w" + ) as f: + f.write(yaml.dump(operator_config)) + except Exception as ex: + logger.warning( + "The operator's specification was not generated " + f"because it is not supported by the `{operator_info.type}` operator. " + "Use --debug option to see the error details." 
+ ) + logger.debug(ex) + + # copy README and original schema files into a destination folder + for src_file in ("README.md", "schema.yaml", "environment.yaml"): + ads_common_utils.copy_file( + uri_src=os.path.join(operator_info.path, src_file), + uri_dst=output, + force_overwrite=overwrite, + ) + + # generate supported backend specifications templates YAML + for key, value in BackendFactory._init_backend_config( + operator_info=operator_info, + ads_config=ads_config, + output=output, + overwrite=overwrite, + **kwargs, + ).items(): + tmp_config = value + if merge_config and operator_config: + tmp_config = {**operator_config, "runtime": value} + + with fsspec.open( + os.path.join( + output, f"{operator_info.type}_{'_'.join(key).replace('.','_')}.yaml" + ), + mode="w", + ) as f: + f.write(yaml.dump(tmp_config)) + + logger.info("#" * 100) + logger.info(f"The auto-generated configs have been placed in: {output}") + logger.info("#" * 100) + + +@runtime_dependency(module="docker", install_from=OptionalDependency.OPCTL) +@validate_environment +def build_image( + type: str = None, + rebuild_base_image: bool = None, + **kwargs: Dict[str, Any], +) -> None: + """ + Builds the image for the particular operator. + + Parameters + ---------- + type: (str, optional) + Type of the operator to build the image. + rebuild_base_image: (optional, bool) + If rebuilding both base and operator's images required. + kwargs: (Dict, optional). + Additional key value arguments. + + Raises + ------ + ValueError + If `type` not specified. + """ + import docker + + # validation + if not type: + raise ValueError(f"The `type` attribute must be specified.") + + # load operator info + operator_info: OperatorInfo = OperatorLoader.from_uri(uri=type).load() + logger.info(f"Building Docker image for the `{operator_info.type}` operator.") + + # checks if GPU base image needs to be used. + gpu = operator_info.gpu + + cur_dir = os.path.dirname(os.path.abspath(__file__)) + base_image_name = OPERATOR_BASE_GPU_IMAGE if gpu else OPERATOR_BASE_IMAGE + + try: + client = docker.from_env() + client.api.inspect_image(base_image_name) + if rebuild_base_image: + raise docker.errors.ImageNotFound("The base operator's image not found.") + except docker.errors.ImageNotFound: + logger.info(f"Building the base operator's image `{base_image_name}`.") + + base_docker_file = os.path.join( + cur_dir, + "..", + "docker", + "operator", + OPERATOR_BASE_DOCKER_GPU_FILE if gpu else OPERATOR_BASE_DOCKER_FILE, + ) + + result_image_name = operator_utils._build_image( + dockerfile=base_docker_file, + image_name=base_image_name, + target="base", + ) + + logger.info( + f"The base operator image `{result_image_name}` has been successfully built." 
+ ) + + with tempfile.TemporaryDirectory() as td: + shutil.copytree(operator_info.path, os.path.join(td, "operator")) + + run_command = [ + f"FROM {base_image_name}", + f"COPY ./operator/ $OPERATOR_DIR/{operator_info.type}/", + "RUN yum install -y libX11", + ] + if os.path.exists(os.path.join(td, "operator", "environment.yaml")): + run_command.append( + f"RUN mamba env update -f $OPERATOR_DIR/{operator_info.type}/environment.yaml " + "--name $CONDA_ENV_NAME && conda clean -afy" + ) + + custom_docker_file = os.path.join(td, "Dockerfile") + + with open(custom_docker_file, "w") as f: + f.writelines("\n".join(run_command)) + + result_image_name = operator_utils._build_image( + dockerfile=custom_docker_file, + image_name=operator_info.type, + tag=operator_info.version, + ) + + logger.info( + f"The operator image `{result_image_name}` has been successfully built. " + "To publish the image to OCI Container Registry run the " + f"`ads operator publish-image -t {operator_info.type}` command" + ) + + +@runtime_dependency(module="docker", install_from=OptionalDependency.OPCTL) +@validate_environment +def publish_image( + type: str, + registry: str = None, + ads_config: str = None, + **kwargs: Dict[str, Any], +) -> None: + """ + Publishes operator's image to the container registry. + + Parameters + ---------- + type: (str, optional) + The operator type to publish image to container registry. + registry: str + Container registry. + ads_config: (str, optional) + The folder where the ads opctl config located. + kwargs: (Dict, optional). + Additional key value arguments. + + Raises + ------ + ValueError + If `type` not specified. + OperatorImageNotFoundError + If the operator's image doesn't exist. + """ + + import docker + + # validation + if not type: + raise ValueError(f"The `type` attribute must be specified.") + + client = docker.from_env() + + # Check if image with given name exists + image = type + try: + client.api.inspect_image(image) + except docker.errors.ImageNotFound: + # load operator info + operator_info: OperatorInfo = OperatorLoader.from_uri(uri=type).load() + try: + image = f"{operator_info.type}:{operator_info.version or 'undefined'}" + # check if the operator's image exists + client.api.inspect_image(image) + except docker.errors.ImageNotFound: + raise OperatorImageNotFoundError(operator_info.type) + + # extract registry from the ADS config. + if not registry: + p = ConfigProcessor().step( + ConfigMerger, + ads_config=ads_config or DEFAULT_ADS_CONFIG_FOLDER, + **kwargs, + ) + registry = p.config.get("infrastructure", {}).get("docker_registry", None) + + publish_image_cmd( + image=image, + registry=registry, + ) + + +def verify( + config: Dict, + **kwargs: Dict[str, Any], +) -> None: + """ + Verifies operator config. + + Parameters + ---------- + config: Dict + The operator config. + kwargs: (Dict, optional). + Additional key value arguments. + """ + operator_type = config.get("type") + + # validation + if not operator_type: + raise ValueError(f"The `type` attribute must be specified.") + + # load operator info + operator_info: OperatorInfo = OperatorLoader.from_uri(uri=operator_type).load() + + # validate operator + try: + operator_module = runpy.run_module( + f"{operator_info.type}.__main__", + run_name="verify", + ) + operator_module.get("verify")(config, **kwargs) + except OperatorSchemaYamlError as ex: + logger.debug(ex) + raise ValueError( + f"The operator's specification is not valid for the `{operator_info.type}` operator. 
" + f"{ex}" + ) + except Exception as ex: + logger.debug(ex) + raise ValueError( + f"The validator is not implemented for the `{operator_info.type}` operator." + ) + + +def build_conda( + type: str = None, + conda_pack_folder: str = None, + overwrite: bool = False, + ads_config: Union[str, None] = None, + **kwargs: Dict[str, Any], +) -> None: + """ + Builds the conda environment for the particular operator. + For the service operators, the type needs to be provided. + For the custom operators, the path (source_folder) to the operator needs to be provided. + + Parameters + ---------- + type: str + The type of the operator to build conda environment for. + conda_pack_folder: str + The destination folder to save the conda environment. + By default will be used the path specified in the config file generated + with `ads opctl configure` command + overwrite: (bool, optional). Defaults to False. + Whether to overwrite the result specification YAML if exists. + ads_config: (str, optional) + The folder where the ads opctl config located. + kwargs: (Dict, optional). + Additional key value arguments. + + Returns + ------- + None + + Raises + ------ + ValueError + If `type` not specified. + """ + + # validation + if not type: + raise ValueError(f"The `type` attribute must be specified.") + + # load operator info + operator_info: OperatorInfo = OperatorLoader.from_uri(uri=type).load() + logger.info(f"Building conda environment for the `{operator_info.type}` operator.") + + # invoke the conda create command + conda_create( + name=operator_info.type, + version=re.sub("[^0-9.]", "", operator_info.version), + environment_file=os.path.join(operator_info.path, "environment.yaml"), + conda_pack_folder=conda_pack_folder, + gpu=operator_info.gpu, + overwrite=overwrite, + ads_config=ads_config, + **kwargs, + ) + + +def publish_conda( + type: str = None, + conda_pack_folder: str = None, + overwrite: bool = False, + ads_config: Union[str, None] = None, + **kwargs: Dict[str, Any], +) -> None: + """ + Publishes the conda environment for the particular operator. + + Parameters + ---------- + type: str + The type of the operator to generate the specification YAML. + conda_pack_folder: str + The destination folder to save the conda environment. + By default will be used the path specified in the config file generated + with `ads opctl configure` command + overwrite: (bool, optional). Defaults to False. + Whether to overwrite the result specification YAML if exists. + ads_config: (str, optional) + The folder where the ads opctl config located. + kwargs: (Dict, optional). + Additional key value arguments. + + Raises + ------ + ValueError + If `type` not specified. + OperatorCondaNotFoundError + If the operator's conda environment not exists. + """ + + # validation + if not type: + raise ValueError(f"The `type` attribute must be specified.") + + # load operator info + operator_info: OperatorInfo = OperatorLoader.from_uri(uri=type).load() + + # invoke the conda publish command + try: + conda_publish( + slug=operator_info.conda, + conda_pack_folder=conda_pack_folder, + overwrite=overwrite, + ads_config=ads_config, + **kwargs, + ) + except FileNotFoundError: + raise OperatorCondaNotFoundError(operator_info.type) + + +def create( + type: str, + overwrite: bool = False, + ads_config: Union[str, None] = None, + output: str = None, + **kwargs: Dict[str, Any], +) -> None: + """ + Creates new operator. + + Parameters + ---------- + type: str + The type of the operator to generate the specification YAML. 
+ overwrite: (bool, optional). Defaults to False. + Whether to overwrite the result specification YAML if exists. + ads_config: (str, optional) + The folder where the ads opctl config located. + output: (str, optional). Defaults to None. + The path to the folder to save the resulting specification templates. + The Tmp folder will be created in case when `output` is not provided. + kwargs: (Dict, optional). + Additional key value arguments. + """ + raise NotImplementedError() + + +def run(config: Dict, backend: Union[Dict, str] = None, **kwargs) -> None: + """ + Runs the operator with the given specification on the targeted backend. + + Parameters + ---------- + config: Dict + The operator's config. + backend: (Union[Dict, str], optional) + The backend config or backend name to run the operator. + kwargs: (Dict, optional) + Optional key value arguments to run the operator. + """ + BackendFactory.backend( + config=ConfigProcessor(config).step(ConfigMerger, **kwargs), + backend=backend, + **kwargs, + ).run(**kwargs) diff --git a/ads/opctl/operator/common/__init__.py b/ads/opctl/operator/common/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/common/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/common/backend_factory.py b/ads/opctl/operator/common/backend_factory.py new file mode 100644 index 000000000..812c02af2 --- /dev/null +++ b/ads/opctl/operator/common/backend_factory.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +""" +The module contains the factory method to create the backend object for the operator. +The factory validates the backend type and runtime type before creating the backend object. +""" + +import os +from typing import Dict, List, Tuple, Union + +import yaml + +from ads.opctl import logger +from ads.opctl.backend.ads_dataflow import DataFlowOperatorBackend +from ads.opctl.backend.ads_ml_job import MLJobOperatorBackend +from ads.opctl.backend.base import Backend +from ads.opctl.backend.local import LocalOperatorBackend +from ads.opctl.config.base import ConfigProcessor +from ads.opctl.config.merger import ConfigMerger +from ads.opctl.constants import ( + BACKEND_NAME, + DEFAULT_ADS_CONFIG_FOLDER, + RESOURCE_TYPE, + RUNTIME_TYPE, +) +from ads.opctl.operator.common.const import PACK_TYPE +from ads.opctl.operator.common.operator_loader import OperatorInfo, OperatorLoader + + +class BackendFactory: + """ + Class which contains the factory method to create the backend object. + The operator's backend object is created based on the backend type. 
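+
+    Example
+    -------
+    A minimal illustrative sketch of how the factory is typically used
+    (it mirrors the `run` command in `cmd.py`; `operator_config_dict` and the
+    backend string are placeholders):
+
+    >>> p = ConfigProcessor(operator_config_dict).step(ConfigMerger)
+    >>> BackendFactory.backend(config=p, backend="local.python").run()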
+ """ + + BACKENDS = ( + BACKEND_NAME.JOB.value, + BACKEND_NAME.DATAFLOW.value, + ) + + LOCAL_BACKENDS = ( + BACKEND_NAME.OPERATOR_LOCAL.value, + BACKEND_NAME.LOCAL.value, + ) + + BACKEND_RUNTIME_MAP = { + BACKEND_NAME.JOB.value.lower(): { + RUNTIME_TYPE.PYTHON.value.lower(): ( + BACKEND_NAME.JOB.value.lower(), + RUNTIME_TYPE.PYTHON.value.lower(), + ), + RUNTIME_TYPE.CONTAINER.value.lower(): ( + BACKEND_NAME.JOB.value.lower(), + RUNTIME_TYPE.CONTAINER.value.lower(), + ), + }, + BACKEND_NAME.DATAFLOW.value.lower(): { + RUNTIME_TYPE.DATAFLOW.value.lower(): ( + BACKEND_NAME.DATAFLOW.value.lower(), + RUNTIME_TYPE.DATAFLOW.value.lower(), + ) + }, + BACKEND_NAME.OPERATOR_LOCAL.value.lower(): { + RUNTIME_TYPE.PYTHON.value.lower(): ( + BACKEND_NAME.OPERATOR_LOCAL.value.lower(), + RUNTIME_TYPE.PYTHON.value.lower(), + ), + RUNTIME_TYPE.CONTAINER.value.lower(): ( + BACKEND_NAME.OPERATOR_LOCAL.value.lower(), + RUNTIME_TYPE.CONTAINER.value.lower(), + ), + }, + } + + BACKEND_MAP = { + BACKEND_NAME.JOB.value.lower(): MLJobOperatorBackend, + BACKEND_NAME.DATAFLOW.value.lower(): DataFlowOperatorBackend, + BACKEND_NAME.OPERATOR_LOCAL.value.lower(): LocalOperatorBackend, + BACKEND_NAME.LOCAL.value.lower(): LocalOperatorBackend, + } + + @classmethod + def backend( + cls, config: ConfigProcessor, backend: Union[Dict, str] = None, **kwargs: Dict + ) -> Backend: + """ + The factory method to create the backend object. + + Parameters + ---------- + config: ConfigProcessor + The config processor object. + backend: (Union[Dict, str], optional) + The backend type. Can be a string or a dictionary. + **kwargs: Dict + The keyword arguments. + + Returns + ------- + Returns the backend object. + + Raises + ------ + RuntimeError + If the backend type is not supported. + """ + if not config: + raise RuntimeError("The config is not provided.") + + if config.config.get("kind", "").lower() != "operator": + raise RuntimeError("Not supported kind of workload.") + + operator_type = config.config.get("type", "").lower() + + # validation + if not operator_type: + raise RuntimeError( + f"The `type` attribute must be specified in the operator's config." + ) + + if not backend and not config.config.get("runtime"): + logger.info( + f"Backend config is not provided, the {BACKEND_NAME.LOCAL.value} " + "will be used by default. " + ) + backend = BACKEND_NAME.LOCAL.value + elif not backend: + backend = config.config.get("runtime") + + # extracting details about the operator + operator_info = OperatorLoader.from_uri(uri=operator_type).load() + + supported_backends = tuple( + set(cls.BACKENDS + cls.LOCAL_BACKENDS) + & set( + operator_info.backends + + [ + BACKEND_NAME.OPERATOR_LOCAL.value, + BACKEND_NAME.LOCAL.value, + ] + ) + ) + + runtime_type = None + backend_kind = None + + if isinstance(backend, str): + backend_kind, runtime_type = cls._extract_backend( + backend=backend, supported_backends=supported_backends + ) + backend = {"kind": backend_kind} + + backend_kind = ( + BACKEND_NAME.OPERATOR_LOCAL.value + if backend.get("kind").lower() == BACKEND_NAME.LOCAL.value + else backend.get("kind").lower() + ) + backend["kind"] = backend_kind + + # If the backend kind is Job, then it is necessary to check the infrastructure kind. + # This is necessary, because Jobs and DataFlow have similar kind, + # The only difference would be in the infrastructure kind. + # This is a temporary solution, the logic needs to be placed in the ConfigMerger instead. 
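+        # Illustrative example (placeholder values): a backend config such as
+        #   {"kind": "job", "spec": {"infrastructure": {"type": "dataflow"}}}
+        # will be re-mapped to the Data Flow backend by the check below.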
+ if backend_kind == BACKEND_NAME.JOB.value: + if (backend.get("spec", {}) or {}).get("infrastructure", {}).get( + "type", "" + ).lower() == BACKEND_NAME.DATAFLOW.value: + backend_kind = BACKEND_NAME.DATAFLOW.value + + runtime_type = runtime_type or ( + backend.get("type") + or (backend.get("spec", {}) or {}) + .get("runtime", {}) + .get("type", "undefined") + ) + + # validation + cls._validate_backend_and_runtime( + backend_kind=backend_kind, + runtime_type=runtime_type, + supported_backends=supported_backends, + ) + + # generate backend specification in case if it is not provided + if not backend.get("spec"): + backends = cls._init_backend_config( + operator_info=operator_info, backend_kind=backend_kind, **kwargs + ) + + backend = backends.get(cls.BACKEND_RUNTIME_MAP[backend_kind][runtime_type]) + if not backend: + raise RuntimeError( + "An error occurred while attempting to load the " + f"configuration for the `{backend_kind}.{runtime_type}` backend." + ) + + p_backend = ConfigProcessor( + {**backend, **{"execution": {"backend": backend_kind}}} + ).step(ConfigMerger, **kwargs) + + config.config["runtime"] = backend + config.config["infrastructure"] = p_backend.config["infrastructure"] + config.config["execution"] = p_backend.config["execution"] + + return cls.BACKEND_MAP[p_backend.config["execution"]["backend"].lower()]( + config=config.config, operator_info=operator_info + ) + + @classmethod + def _extract_backend( + cls, backend: str, supported_backends: List[str] = None + ) -> Tuple[str, str]: + """ + Extracts the backend type and the runtime type from the backend string. + + Parameters + ---------- + backend: str + The backend string. + Example: `job`, `job.container`, `dataflow.dataflow`, `local.container`, `local.python`. + supported_backends: List[str] + The list of supported backends. + + Returns + ------- + Returns the tuple of the backend type and the runtime type. + + Raises + ------ + RuntimeError + If the backend type is not supported. + """ + supported_backends = supported_backends or (cls.BACKENDS + cls.LOCAL_BACKENDS) + backend = (backend or BACKEND_NAME.OPERATOR_LOCAL.value).lower() + backend_kind, runtime_type = backend, None + + if backend.lower() != BACKEND_NAME.OPERATOR_LOCAL.value and "." in backend: + backend_kind, runtime_type = backend.split(".") + else: + backend_kind = backend + + backend_kind = ( + BACKEND_NAME.OPERATOR_LOCAL.value + if backend_kind == BACKEND_NAME.LOCAL.value + else backend_kind + ) + + if backend_kind not in supported_backends: + raise RuntimeError( + f"Not supported backend - {backend_kind}. Supported backends: {supported_backends}" + ) + + runtime_type = ( + runtime_type or list(cls.BACKEND_RUNTIME_MAP[backend_kind].keys())[0] + ) + + if runtime_type not in cls.BACKEND_RUNTIME_MAP[backend_kind]: + raise RuntimeError( + f"Not supported runtime type - `{runtime_type}` for the backend - `{backend_kind}`. " + f"Supported runtime types: `{list(cls.BACKEND_RUNTIME_MAP[backend_kind].keys())}`" + ) + + return backend_kind, runtime_type + + @classmethod + def _validate_backend_and_runtime( + cls, backend_kind: str, runtime_type: str, supported_backends: List[str] = None + ) -> bool: + """ + Validates the backend kind and runtime type. + + Parameters + ---------- + backend_kind: str + The backend kind. + runtime_type: str + The runtime type. + supported_backends: List[str] + The list of supported backends. + + Returns + ------- + Returns True if the backend type is valid, otherwise False. 
+ + Raises + ------ + RuntimeError + If the backend type is not supported. + """ + supported_backends = supported_backends or (cls.BACKENDS + cls.LOCAL_BACKENDS) + if backend_kind not in supported_backends: + raise RuntimeError( + f"Not supported backend - {backend_kind}. Supported backends: {supported_backends}" + ) + if runtime_type not in cls.BACKEND_RUNTIME_MAP[backend_kind]: + raise RuntimeError( + f"Not supported runtime type - `{runtime_type}` for the backend - `{backend_kind}`. " + f"Supported runtime types: `{list(cls.BACKEND_RUNTIME_MAP[backend_kind].keys())}`" + ) + return True + + @classmethod + def _init_backend_config( + cls, + operator_info: OperatorInfo, + ads_config: Union[str, None] = None, + backend_kind: Tuple[str] = None, + **kwargs: Dict, + ) -> Dict[Tuple, Dict]: + """ + Generates the operator's backend configs. + + Parameters + ---------- + ads_config: (str, optional) + The folder where the ads opctl config located. + backend_kind: (str, optional) + The required backend. + kwargs: (Dict, optional). + Additional key value arguments. + + Returns + ------- + Dict[Tuple, Dict] + The dictionary where the key will be a tuple containing runtime kind and type. + Example: + >>> {("local","python"): {}, ("job", "container"): {}} + + Raises + ------ + RuntimeError + In case if the provided backend is not supported. + """ + from ads.opctl.cmds import _BackendFactory + + result = {} + + freeform_tags = { + "operator": f"{operator_info.type}:{operator_info.version}", + } + + # generate supported backend specifications templates YAML + RUNTIME_TYPE_MAP = { + RESOURCE_TYPE.JOB.value: [ + { + RUNTIME_TYPE.PYTHON: { + "conda_slug": operator_info.conda + if operator_info.conda_type == PACK_TYPE.SERVICE + else operator_info.conda_prefix, + "freeform_tags": freeform_tags, + } + }, + { + RUNTIME_TYPE.CONTAINER: { + "image_name": f"{operator_info.type}:{operator_info.version}", + "freeform_tags": freeform_tags, + } + }, + ], + RESOURCE_TYPE.DATAFLOW.value: [ + { + RUNTIME_TYPE.DATAFLOW: { + "conda_slug": operator_info.conda_prefix, + "freeform_tags": freeform_tags, + } + } + ], + BACKEND_NAME.OPERATOR_LOCAL.value: [ + { + RUNTIME_TYPE.CONTAINER: { + "kind": "operator", + "type": operator_info.type, + "version": operator_info.version, + } + }, + { + RUNTIME_TYPE.PYTHON: { + "kind": "operator", + "type": operator_info.type, + "version": operator_info.version, + } + }, + ], + } + + supported_backends = tuple( + set(RUNTIME_TYPE_MAP.keys()) + & set( + operator_info.backends + + [ + BACKEND_NAME.OPERATOR_LOCAL.value, + BACKEND_NAME.LOCAL.value, + ] + ) + ) + + if backend_kind: + if backend_kind not in supported_backends: + raise RuntimeError( + f"Not supported backend - {backend_kind}. 
Supported backends: {supported_backends}"
+                )
+            supported_backends = (backend_kind,)
+
+        for resource_type in supported_backends:
+            try:
+                for runtime_type_item in RUNTIME_TYPE_MAP.get(
+                    resource_type.lower(), []
+                ):
+                    runtime_type, runtime_kwargs = next(iter(runtime_type_item.items()))
+
+                    # get config info from ini files
+                    p = ConfigProcessor(
+                        {**runtime_kwargs, **{"execution": {"backend": resource_type}}}
+                    ).step(
+                        ConfigMerger,
+                        ads_config=ads_config or DEFAULT_ADS_CONFIG_FOLDER,
+                        **kwargs,
+                    )
+
+                    # generate YAML specification template
+                    result[
+                        (resource_type.lower(), runtime_type.value.lower())
+                    ] = yaml.load(
+                        _BackendFactory(p.config).backend.init(
+                            runtime_type=runtime_type.value,
+                            **{**kwargs, **runtime_kwargs},
+                        ),
+                        Loader=yaml.FullLoader,
+                    )
+            except Exception as ex:
+                logger.warning(
+                    f"Unable to generate the configuration for the `{resource_type}` backend. "
+                    f"{ex}"
+                )
+
+        return result
diff --git a/ads/opctl/operator/common/const.py b/ads/opctl/operator/common/const.py
new file mode 100644
index 000000000..4ef54a26a
--- /dev/null
+++ b/ads/opctl/operator/common/const.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from ads.common.extended_enum import ExtendedEnumMeta
+
+# Env variable representing the operator input arguments.
+# This variable is used when the operator runs on OCI resources.
+ENV_OPERATOR_ARGS = "ENV_OPERATOR_ARGS"
+
+OPERATOR_BASE_IMAGE = "ads-operator-base"
+OPERATOR_BASE_GPU_IMAGE = "ads-operator-gpu-base"
+OPERATOR_BASE_DOCKER_FILE = "Dockerfile"
+OPERATOR_BASE_DOCKER_GPU_FILE = "Dockerfile.gpu"
+
+
+class PACK_TYPE(str, metaclass=ExtendedEnumMeta):
+    SERVICE = "service"
+    CUSTOM = "published"
+
+
+class ARCH_TYPE(str, metaclass=ExtendedEnumMeta):
+    CPU = "cpu"
+    GPU = "gpu"
diff --git a/ads/opctl/operator/common/errors.py b/ads/opctl/operator/common/errors.py
new file mode 100644
index 000000000..ae15b07f4
--- /dev/null
+++ b/ads/opctl/operator/common/errors.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from ads.opctl.operator import __operators__
+
+
+class OperatorSchemaYamlError(Exception):
+    """Exception raised when there is an issue with the schema."""
+    def __init__(self, error: str):
+        super().__init__(
+            "Invalid operator specification. Check the YAML structure and ensure it "
+            "complies with the required schema for the operator. \n"
+            f"{error}"
+        )
+
+
+class OperatorNotFoundError(Exception):
+    def __init__(self, operator: str):
+        super().__init__(
+            f"The provided operator: `{operator}` is not found. You can pick one from the "
+            f"registered service operators: `{'`, `'.join(__operators__)}`."
+        )
+
+
+class OperatorImageNotFoundError(Exception):
+    def __init__(self, operator: str):
+        super().__init__(
+            f"The Docker image for the operator: `{operator}` has not been built yet. "
+            "Please ensure that you build the image before attempting to publish it. "
+        )
+
+
+class OperatorCondaNotFoundError(Exception):
+    def __init__(self, operator: str):
+        super().__init__(
+            f"The Conda environment for the operator: `{operator}` has not been built yet. "
+            "Please ensure that you build the conda environment before attempting to publish it. 
" + ) diff --git a/ads/opctl/operator/common/operator_config.py b/ads/opctl/operator/common/operator_config.py new file mode 100644 index 000000000..794cec6ca --- /dev/null +++ b/ads/opctl/operator/common/operator_config.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +import json +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Dict + +from ads.common.serializer import DataClassSerializable + +from ads.opctl.operator.common.utils import OperatorValidator +from ads.opctl.operator.common.errors import OperatorSchemaYamlError + + +@dataclass(repr=True) +class OperatorConfig(DataClassSerializable): + """Base class representing operator config. + + Attributes + ---------- + kind: str + The kind of the resource. For operators it is always - `operator`. + type: str + The type of the operator. + version: str + The version of the operator. + spec: object + The operator specification details. + runtime: dict + The runtime details of the operator. + """ + + kind: str = "operator" + type: str = None + version: str = None + spec: Any = None + runtime: Dict = None + + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: + """Validates the operator specification. + + Parameters + ---------- + obj_dict: (dict) + Dictionary representation of the object + + Returns + ------- + bool + True if the validation passed, else False. + + Raises + ------ + ForecastSchemaYamlError + In case of wrong specification format. + """ + schema = cls._load_schema() + validator = OperatorValidator(schema) + validator.allow_unknown = True + result = validator.validate(obj_dict) + + if not result: + raise OperatorSchemaYamlError(json.dumps(validator.errors, indent=2)) + return True + + @classmethod + @abstractmethod + def _load_schema(cls) -> str: + """ + The abstract method to load operator schema. + This method needs to be implemented on the child level. + Every operator will have their own YAML schema. + """ + raise NotImplementedError() diff --git a/ads/opctl/operator/common/operator_loader.py b/ads/opctl/operator/common/operator_loader.py new file mode 100644 index 000000000..02ff0e409 --- /dev/null +++ b/ads/opctl/operator/common/operator_loader.py @@ -0,0 +1,725 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import glob +import importlib +import inspect +import os +import re +import shutil +import sys +import tempfile +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List +from urllib.parse import urlparse + +from yaml import SafeLoader as loader + +from ads.opctl.operator.common.utils import default_signer +from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.common.serializer import DataClassSerializable +from ads.common.utils import copy_from_uri +from ads.opctl import logger +from ads.opctl.constants import OPERATOR_MODULE_PATH +from ads.opctl.operator import __operators__ + +from .const import ARCH_TYPE, PACK_TYPE +from .errors import OperatorNotFoundError + +LOCAL_SCHEME = "local" +MAIN_BRANCH = "main" + + +@dataclass(repr=True) +class OperatorInfo(DataClassSerializable): + """Class representing brief information about the operator. 
+ + Attributes + ---------- + type (str) + The type of the operator. + name (str) + The name of the operator. + gpu (bool) + Whether the operator supports GPU. + short_description (str) + A short description of the operator. + description (str) + A detailed description of the operator. + version (str) + The version of the operator. + conda (str) + The conda environment required to run the operator. + conda_type (str) + The type of conda pack (e.g., PACK_TYPE.CUSTOM). + path (str) + The location of the operator. + keywords (List[str]) + Keywords associated with the operator. + backends (List[str]) + List of supported backends. + + Properties + ---------- + conda_prefix (str) + Generates the conda prefix for the custom conda pack. + """ + + type: str = "" + name: str = "" + gpu: bool = False + description: str = "" + version: str = "" + conda: str = "" + conda_type: str = "" + path: str = "" + keywords: List[str] = None + backends: List[str] = None + + @property + def conda_prefix(self) -> str: + """ + Generates conda prefix for the custom conda pack. + + Example + ------- + conda = "forecast_v1" + conda_prefix == "cpu/forecast/1/forecast_v1" + + Returns + ------- + str + The conda prefix for the custom conda pack. + """ + return os.path.join( + f"{ARCH_TYPE.GPU if self.gpu else ARCH_TYPE.CPU}", + self.name or self.type, + re.sub("[^0-9.]", "", self.version), + self.conda or f"{self.type}_{self.version}", + ) + + def __post_init__(self): + self.gpu = self.gpu == True or self.gpu == "yes" + self.version = self.version or "v1" + self.conda_type = self.conda_type or PACK_TYPE.CUSTOM + self.conda = self.conda or f"{self.type}_{self.version}" + + @classmethod + def from_yaml( + cls, + yaml_string: str = None, + uri: str = None, + loader: callable = loader, + **kwargs, + ) -> "OperatorInfo": + """Creates an object from YAML string provided or from URI location containing YAML string + + Parameters + ---------- + yaml_string (string, optional): YAML string. Defaults to None. + uri (string, optional): URI location of file containing YAML string. Defaults to None. + loader (callable, optional): Custom YAML loader. Defaults to CLoader/SafeLoader. + kwargs (dict): keyword arguments to be passed into fsspec.open(). + For OCI object storage, this should be config="path/to/.oci/config". + For other storage connections consider e.g. host, port, username, password, etc. + + Raises + ------ + ValueError + Raised if neither string nor uri is provided + + Returns + ------- + cls + Returns instance of the class + """ + obj: OperatorInfo = super().from_yaml( + yaml_string=yaml_string, uri=uri, loader=loader, **kwargs + ) + + if uri: + obj.path = os.path.dirname(uri) + return obj + + +class Loader(ABC): + """Operator Loader Interface. + + Attributes + ---------- + uri (str) + The operator's location (e.g., local path, HTTP path, OCI path, GIT path). + uri_dst (str) + The local folder where the operator can be downloaded from the remote location. + A temporary folder will be generated if not provided. + auth (Dict, optional) + Default authentication settings. + + Methods + ------- + load (**kwargs) + Downloads the operator's source code to the local folder. + cleanup (**kwargs) + Cleans up all temporary files and folders created during operator loading. + """ + + def __init__(self, uri: str, uri_dst: str = None, auth: Dict = None) -> None: + """ + Instantiates Loader. + + Parameters + ---------- + uri (str) + The operator's location. 
+ uri_dst (str) + The local folder where the operator can be downloaded from the remote location. + A temporary folder will be generated if not provided. + auth (Dict, optional) + Default authentication settings. + """ + self.uri = uri + self.uri_dst = uri_dst + self.auth = auth + + @abstractmethod + def _load(self, **kwargs: Dict) -> OperatorInfo: + """ + Downloads the operator's source code to the local folder. + This method needs to be implemented on the child level. + + Parameters + ------------ + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + OperatorInfo + Information about the operator. + """ + pass + + def load(self, **kwargs: Dict) -> OperatorInfo: + """ + Downloads the operator's source code to the local folder. + + Parameters + ------------ + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + OperatorInfo + Information about the operator. + """ + operator_info = self._load(**kwargs) + # Adds the operators path to the system path. + # This will allow to execute the operator via runpy.run_module() + sys.path.insert(0, "/".join(operator_info.path.split("/")[0:-1])) + return operator_info + + def cleanup(self, **kwargs: Dict) -> None: + """ + Cleans up all temporary files and folders created during the loading of the operator. + + Parameters + ------------ + **kwargs (Dict) + Additional optional attributes. + """ + pass + + @classmethod + @abstractmethod + def compatible(cls, uri: str, **kwargs: Dict) -> bool: + """ + Checks if the loader is compatible with the given URI. + + Parameters + ------------ + uri (str) + The operator's location. + **kwargs (Dict) + Additional optional attributes. + Returns + ------- + bool + Whether the loader is compatible with the given URI. + """ + pass + + +class OperatorLoader: + """ + The operator loader class. + Helps to download the operator's source code to the local folder. + + Attributes + ---------- + loader (Loader) + The specific operator's loader. + """ + + def __init__(self, loader: Loader): + """ + Initializes OperatorLoader. + + Parameters + ---------- + loader (Loader) + The particular operator loader. + """ + self.loader = loader + + def load(self, **kwargs: Dict) -> OperatorInfo: + """ + Downloads the operator's source code to the local folder. + + Parameters + ------------ + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + OperatorInfo + Detailed information about the operator. + """ + return self.loader.load(**kwargs) + + @classmethod + def from_uri( + cls, uri: str, uri_dst: str = None, auth: Dict = None + ) -> "OperatorLoader": + """ + Constructs the operator's loader instance. + + Parameters + ---------- + uri (str) + The operator's location. + uri_dst (str) + The local folder where the operator can be downloaded from the remote location. + A temporary folder will be generated if not provided. + auth (Dict, optional) + Default authentication settings. + + Returns + ------- + OperatorLoader + An instance of OperatorLoader. + """ + if not uri: + raise ValueError("The `uri` attribute must be provided.") + + uri = os.path.expanduser(uri) + + for loader in ( + ServiceOperatorLoader, + LocalOperatorLoader, + GitOperatorLoader, + RemoteOperatorLoader, + ): + if loader.compatible(uri=uri, auth=auth): + return cls(loader=loader(uri=uri, uri_dst=uri_dst, auth=auth)) + + raise ValueError(f"The operator cannot be loaded from the given source: {uri}.") + + +class ServiceOperatorLoader(Loader): + """ + Class to load a service operator. 
+ + Attributes + ---------- + uri (str) + The operator's location (e.g., local path, HTTP path, OCI path, GIT path). + uri_dst (str) + The local folder where the operator can be downloaded from the remote location. + A temporary folder will be generated if not provided. + auth (Dict, optional) + Default authentication settings. + """ + + def _load(self, **kwargs: Dict) -> OperatorInfo: + """ + Loads the service operator info. + + Parameters + ---------- + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + OperatorInfo + Detailed information about the operator. + """ + return _operator_info(name=self.uri) + + @classmethod + def compatible(cls, uri: str, **kwargs: Dict) -> bool: + """ + Checks if the loader is compatible with the given URI. + + Parameters + ---------- + uri (str) + The operator's location. + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + bool + Whether the loader is compatible with the given URI. + """ + return uri.lower() in __operators__ + + +class LocalOperatorLoader(Loader): + """ + Class to load a local operator. + + Attributes + ---------- + uri (str) + The operator's location (e.g., local path, HTTP path, OCI path, GIT path). + uri_dst (str) + The local folder where the operator can be downloaded from the remote location. + A temporary folder will be generated if not provided. + auth (Dict, optional) + Default authentication settings. + """ + + def _load(self, **kwargs: Dict) -> OperatorInfo: + """ + Loads the local operator info. + + Parameters + ---------- + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + OperatorInfo + Detailed information about the operator. + """ + return _operator_info(path=self.uri) + + @classmethod + def compatible(cls, uri: str, **kwargs: Dict) -> bool: + """Checks if the loader is compatible with the given URI. + + Parameters + ---------- + uri (str) + The operator's location. + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + bool + Whether the loader is compatible with the given URI. + """ + return not urlparse(uri).scheme + + +class RemoteOperatorLoader(Loader): + """ + Class to load an operator from a remote location (OCI Object Storage). + + Attributes + ---------- + uri (str) + The operator's location (e.g., local path, HTTP path, OCI path, GIT path). + uri_dst (str) + The local folder where the operator can be downloaded from the remote location. + A temporary folder will be generated if not provided. + auth (Dict, optional) + Default authentication settings. + """ + + def __init__(self, uri: str, uri_dst: str = None, auth: Dict = None) -> None: + """ + Instantiates Loader. + + Parameters + ---------- + uri (str) + The operator's location. + uri_dst (str) + The local folder where the operator can be downloaded from the remote location. + A temporary folder will be generated if not provided. + auth (Dict, optional) + Default authentication settings. + """ + super().__init__(uri=uri, uri_dst=uri_dst, auth=auth or default_signer()) + + def _load(self, **kwargs: Dict) -> OperatorInfo: + """Downloads the operator's source code to the local folder. + + Parameters + ---------- + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + OperatorInfo + Detailed information about the operator. 
+        """
+        self.tmp_dir = tempfile.mkdtemp() if not self.uri_dst else None
+        uri_dst = os.path.join(
+            (self.uri_dst or self.tmp_dir).rstrip(),
+            os.path.splitext(os.path.basename(self.uri.rstrip()))[0],
+        )
+
+        logger.info(f"Downloading operator from `{self.uri}` to `{uri_dst}`.")
+        copy_from_uri(
+            self.uri, uri_dst, force_overwrite=True, auth=self.auth, unpack=True
+        )
+
+        return _operator_info(path=uri_dst)
+
+    def cleanup(self, **kwargs: Dict) -> None:
+        """Cleans up all temporary files and folders created during operator loading.
+
+        Parameters
+        ----------
+        **kwargs (Dict)
+            Additional optional attributes.
+        """
+        super().cleanup(**kwargs)
+        try:
+            shutil.rmtree(self.tmp_dir)
+        except Exception as ex:
+            logger.debug(ex)
+
+    @classmethod
+    def compatible(cls, uri: str, **kwargs: Dict) -> bool:
+        """Checks if the loader is compatible with the given URI.
+
+        Parameters
+        ----------
+        uri (str)
+            The operator's location.
+        **kwargs (Dict)
+            Additional optional attributes.
+        Returns
+        -------
+        bool
+            Whether the loader is compatible with the given URI.
+        """
+        return urlparse(uri).scheme.lower() == "oci"
+
+
+class GitOperatorLoader(Loader):
+    """
+    Class to load an operator from a GIT repository.
+    Supported URI format: https://github.com/<repository>@<branch>#<operator-directory>
+    Examples:
+    - https://github.com/my-operator-repository.git@feature-branch#forecasting
+    - https://github.com/my-operator-repository#forecasting
+    - https://github.com/my-operator-repository
+
+    Attributes
+    ----------
+    uri (str)
+        The operator's location (e.g., local path, HTTP path, OCI path, GIT path).
+    uri_dst (str)
+        The local folder where the operator can be downloaded from the remote location.
+        A temporary folder will be generated if not provided.
+    auth (Dict, optional)
+        Default authentication settings.
+    """
+
+    @runtime_dependency(
+        module="git",
+        err_msg=(
+            "The `git` library is required. "
+            "Use `pip install gitpython` to install the `git` library."
+        ),
+    )
+    def _load(self, **kwargs: Dict) -> OperatorInfo:
+        """
+        Downloads the operator's source code to the local folder.
+
+        Parameters
+        ----------
+        **kwargs (Dict)
+            Additional optional attributes.
+
+        Returns
+        -------
+        OperatorInfo
+            Detailed information about the operator.
+        """
+        import git
+
+        self.tmp_dir = tempfile.mkdtemp() if not self.uri_dst else None
+        uri_dst = self.uri_dst or self.tmp_dir
+
+        uri_dst = os.path.join(
+            (self.uri_dst or self.tmp_dir).rstrip(),
+            os.path.splitext(os.path.basename(self.uri.rstrip()))[0],
+        )
+
+        logger.info(f"Fetching operator from `{self.uri}` to `{uri_dst}`.")
+
+        # Parse the GitHub URL
+        parsed_url = urlparse(self.uri)
+        logger.debug(parsed_url)
+
+        branch = "main"  # Default branch
+        repo_name = parsed_url.path
+
+        if "@" in parsed_url.path:
+            # Extract the branch if provided in the URL
+            branch = parsed_url.path.split("@")[1]
+            repo_name = parsed_url.path.split("@")[0]
+
+        # Construct the repository URL
+        repo_url = f"https://{parsed_url.netloc}{repo_name}"
+        logger.debug(repo_url)
+
+        # Clone the GitHub repository to a temporary directory
+        with tempfile.TemporaryDirectory() as tmp_git_dir:
+            repo = git.Repo.clone_from(repo_url, tmp_git_dir, branch=branch)
+
+            # Find the folder to download
+            if parsed_url.fragment:
+                folder_to_download = parsed_url.fragment
+                folder_path = os.path.join(tmp_git_dir, folder_to_download)
+
+                if not os.path.exists(folder_path):
+                    raise ValueError(
+                        f"Folder '{folder_to_download}' not found in the repository."
+ ) + + # Move the folder to the desired local path + for item in glob.glob(os.path.join(folder_path, "**"), recursive=True): + destination_item = os.path.join( + uri_dst, os.path.relpath(item, folder_path) + ) + if os.path.isdir(item): + # If it's a directory, create it in the destination directory + if not os.path.exists(destination_item): + os.makedirs(destination_item) + else: + # If it's a file, move it to the destination directory + shutil.move(item, destination_item) + + # Clean up the temporary directory + repo.close() + return _operator_info(path=uri_dst) + + def cleanup(self, **kwargs: Dict) -> None: + """Cleans up all temporary files and folders created during operator loading. + + Parameters + ---------- + **kwargs (Dict) + Additional optional attributes. + """ + super().cleanup(**kwargs) + try: + shutil.rmtree(self.tmp_dir) + except Exception as ex: + logger.debug(ex) + + @classmethod + def compatible(cls, uri: str, **kwargs: Dict) -> bool: + """Checks if the loader is compatible with the given URI. + + Parameters + ---------- + uri (str) + The operator's location. + **kwargs (Dict) + Additional optional attributes. + + Returns + ------- + bool + Whether the loader is compatible with the given URI. + """ + return any(element in uri.lower() for element in ("github", ".git")) + + +def _module_from_file(module_name: str, module_path: str) -> Any: + """ + Loads module by it's location. + + Parameters + ---------- + module_name (str) + The name of the module to be imported. + module_path (str) + The physical path of the module. + + Returns + ------- + Loaded module. + """ + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _operator_info(path: str = None, name: str = None) -> OperatorInfo: + """ + Extracts operator's details by given path. + The expectation is that the operator has an init file where all details are placed. + + Parameters + ------------ + path (str, optional) + The path to the operator. + name (str, optional) + The name of the service operator. + + Returns + ------- + OperatorInfo + The operator details. + """ + try: + if name: + path = os.path.dirname( + inspect.getfile( + importlib.import_module(f"{OPERATOR_MODULE_PATH}.{name}") + ) + ) + return OperatorInfo.from_yaml(uri=os.path.join(path, "MLoperator")) + except Exception as ex: + logger.debug(ex) + raise OperatorNotFoundError(name or path) + + +def _operator_info_list() -> List[OperatorInfo]: + """Returns the list of registered operators. + + Returns + ------- + List[OperatorInfo] + The list of registered operators. + """ + result = [] + + for operator_name in __operators__: + try: + result.append(_operator_info(name=operator_name)) + except OperatorNotFoundError: + logger.debug(f"Operator `{operator_name}` is not registered.") + continue + + return result diff --git a/ads/opctl/operator/common/operator_schema.yaml b/ads/opctl/operator/common/operator_schema.yaml new file mode 100644 index 000000000..58292e4ac --- /dev/null +++ b/ads/opctl/operator/common/operator_schema.yaml @@ -0,0 +1,58 @@ +type: + required: false + type: string + meta: + description: "The type of the operator." +name: + required: true + type: string + meta: + description: "The name of the operator." +version: + required: true + type: string + default: v1 + meta: + description: "The version of the operator." 
+description: + required: false + type: string + meta: + description: "The short description of the operator." +gpu: + required: false + type: string + default: no + allowed: + - yes + - no + meta: + description: "If the operator requires GPU cluster." +keywords: + required: false + type: list + schema: + type: string + meta: + description: "The operator's keywords." +backends: + required: false + type: list + schema: + type: string + meta: + description: "The operator's supported backends. Can be [job, dataflow]" +conda_type: + required: false + type: string + default: custom + allowed: + - service + - published + meta: + description: "The operator's conda environment type. Can be either service or custom type." +conda: + required: false + type: string + meta: + description: "The operator's conda environment name. Will be auto-generated if not provided." diff --git a/ads/opctl/operator/common/operator_yaml_generator.py b/ads/opctl/operator/common/operator_yaml_generator.py new file mode 100644 index 000000000..1bbc1ae03 --- /dev/null +++ b/ads/opctl/operator/common/operator_yaml_generator.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import yaml + + +@dataclass +class YamlGenerator: + """ + Class for generating the YAML config based on the given YAML schema. + + Attributes + ---------- + schema: Dict + The schema of the template. + """ + + schema: Dict[str, Any] = None + + def generate_example_dict(self, values: Optional[Dict[str, Any]] = None) -> Dict: + """ + Generate the YAML config based on the YAML schema. + + Properties + ---------- + values: Optional dictionary containing specific values for the attributes. + + Returns + ------- + Dict + The generated dictionary config. + """ + return self._generate_example(self.schema, values) + + def generate_example(self, values: Optional[Dict[str, Any]] = None) -> str: + """ + Generate the YAML config based on the YAML schema. + + Properties + ---------- + values: Optional dictionary containing specific values for the attributes. + + Returns + ------- + str + The generated YAML config. + """ + return yaml.dump(self._generate_example(self.schema, values)) + + def _check_condition( + self, condition: Dict[str, Any], example: Dict[str, Any] + ) -> bool: + """ + Checks if the YAML schema condition fulfils. + This method is used to include conditional fields into the final config. + + Properties + ---------- + condition: Dict[str, Any] + The schema condition. + Example: + In the example below the `owner_name` field has dependency on the `model` field. + The `owner_name` will be included to the final config if only `model` is `prophet`. + owner_name: + type: string + dependencies: {"model":"prophet"} + example: Dict[str, Any] + The config to check if the dependable value presented there. + Returns + ------- + bool + True if the condition fulfills, false otherwise. + """ + for key, value in condition.items(): + if key not in example or example[key] != value: + return False + return True + + def _generate_example( + self, schema: Dict[str, Any], values: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Generates the final YAML config. + This is a recursive method, which iterates through the entire schema. 
+ + Properties + ---------- + schema: Dict[str, Any] + The schema to generate the config. + values: Optional[Dict[str, Any]] + The optional values that would be used instead of default values provided in the schema. + + Returns + ------- + Dict + The result config. + """ + example = {} + + for key, value in schema.items(): + # only generate values for required fields + if ( + value.get("required", False) + or value.get("dependencies", False) + or key in values + ): + if not "dependencies" in value or self._check_condition( + value["dependencies"], example + ): + data_type = value.get("type") + + if key in values: + example[key] = values[key] + elif "default" in value: + example[key] = value["default"] + elif data_type == "string": + example[key] = "value" + elif data_type == "number": + example[key] = 1 + elif data_type == "boolean": + example[key] = True + elif data_type == "list": + # TODO: Handle list of dict + example[key] = ["item1", "item2"] + elif data_type == "dict": + example[key] = self._generate_example( + schema=value.get("schema", {}), values=values + ) + return example diff --git a/ads/opctl/operator/common/utils.py b/ads/opctl/operator/common/utils.py new file mode 100644 index 000000000..54bfbe97c --- /dev/null +++ b/ads/opctl/operator/common/utils.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import argparse +import os +import time +from string import Template +from typing import Any, Dict, List, Tuple + +import fsspec +import yaml +from cerberus import Validator + +from ads.opctl import logger, utils +from ads.opctl.operator import __operators__ + +CONTAINER_NETWORK = "CONTAINER_NETWORK" + + +class OperatorValidator(Validator): + """The custom validator class.""" + + pass + + +def _build_image( + dockerfile: str, + image_name: str, + tag: str = None, + target: str = None, + **kwargs: Dict[str, Any], +) -> str: + """ + Builds the operator image. + + Parameters + ---------- + dockerfile: str + Path to the docker file. + image_name: str + The name of the image. + tag: (str, optional) + The tag of the image. + target: (str, optional) + The image target. + kwargs: (Dict, optional). + Additional key value arguments. + + Returns + ------- + str + The final image name. + + Raises + ------ + ValueError + When dockerfile or image name not provided. + FileNotFoundError + When dockerfile doesn't exist. + RuntimeError + When docker build operation fails. 
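+
+    Example
+    -------
+    An illustrative call (the Dockerfile path and image name are placeholders,
+    and a running Docker daemon is assumed):
+
+    >>> _build_image(dockerfile="./Dockerfile", image_name="forecast", tag="v1")
+    'forecast:v1'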
+ """ + if not (dockerfile and image_name): + raise ValueError("The `dockerfile` and `image_name` needs to be provided.") + + if not os.path.isfile(dockerfile): + raise FileNotFoundError(f"The file `{dockerfile}` does not exist") + + image_name = f"{image_name}:{tag or 'latest'}" + + command = [ + "docker", + "build", + "-t", + image_name, + "-f", + dockerfile, + ] + + if target: + command += ["--target", target] + if os.environ.get("no_proxy"): + command += ["--build-arg", f"no_proxy={os.environ['no_proxy']}"] + if os.environ.get("http_proxy"): + command += ["--build-arg", f"http_proxy={os.environ['http_proxy']}"] + if os.environ.get("https_proxy"): + command += ["--build-arg", f"https_proxy={os.environ['https_proxy']}"] + command += ["--build-arg", f"RND={time.time()}"] + if os.environ.get(CONTAINER_NETWORK): + command += ["--network", os.environ[CONTAINER_NETWORK]] + command += [os.path.dirname(dockerfile)] + + logger.info(f"Build image: {command}") + + proc = utils.run_command(command) + if proc.returncode != 0: + raise RuntimeError("Docker build failed.") + + return image_name + + +def _extant_file(x: str): + """Checks the extension of the file to yaml.""" + if not (x.lower().endswith(".yml") or x.lower().endswith(".yaml")): + raise argparse.ArgumentTypeError( + f"The {x} exists, but must be a yaml file (.yaml/.yml)" + ) + return x + + +def _parse_input_args(raw_args: List) -> Tuple: + """Parses operator input arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "-f", + "--file", + type=_extant_file, + required=False, + help="Path to the operator specification YAML file", + ) + parser.add_argument( + "-s", "--spec", type=str, required=False, help="Operator Yaml specification" + ) + parser.add_argument( + "-v", + "--verify", + type=bool, + default=False, + required=False, + help="Verify operator schema", + ) + return parser.parse_known_args(raw_args) + + +def _load_yaml_from_string(doc: str, **kwargs) -> Dict: + """Loads YAML from string and merge it with env variables and kwargs.""" + template_dict = {**os.environ, **kwargs} + return yaml.safe_load( + Template(doc).safe_substitute( + **template_dict, + ) + ) + + +def _load_yaml_from_uri(uri: str, **kwargs) -> str: + """Loads YAML from the URI path. Can be Object Storage path.""" + with fsspec.open(uri) as f: + return _load_yaml_from_string(str(f.read(), "UTF-8"), **kwargs) + + +def default_signer(**kwargs): + os.environ["EXTRA_USER_AGENT_INFO"] = "Operator" + from ads.common.auth import default_signer + + return default_signer(**kwargs) diff --git a/ads/opctl/operator/lowcode/__init__.py b/ads/opctl/operator/lowcode/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/forecast/MLoperator b/ads/opctl/operator/lowcode/forecast/MLoperator new file mode 100644 index 000000000..68f2dda95 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/MLoperator @@ -0,0 +1,18 @@ +type: forecast +version: v1 +name: Forecasting Operator +conda_type: published +conda: forecast_v1 +gpu: no +keywords: + - Prophet + - AutoML + - ARIMA + - RNN + - LSTM +backends: + - job +description: | + Forecasting operator, that leverages historical time series data to generate accurate + forecasts for future trends. 
Use `ads operator info -t forecast` to get more details about + the forecasting operator." diff --git a/ads/opctl/operator/lowcode/forecast/README.md b/ads/opctl/operator/lowcode/forecast/README.md new file mode 100644 index 000000000..ac4c3d6d3 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/README.md @@ -0,0 +1,214 @@ +# Forecasting Operator + +The Forecasting Operator leverages historical time series data to generate accurate forecasts for future trends. This operator aims to simplify and expedite the data science process by automating the selection of appropriate models and hyperparameters, as well as identifying relevant features for a given prediction task. + +Below are the steps to configure and run the Forecasting Operator on different resources. + +## 1. Prerequisites + +Follow the [CLI Configuration](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) steps from the ADS documentation. This step is mandatory as it sets up default values for different options while running the Forecasting Operator on OCI Data Science jobs or OCI Data Flow applications. If you have previously done this and used a flexible shape, make sure to adjust `ml_job_config.ini` with shape config details and `docker_registry` information. + +- ocpus = 1 +- memory_in_gbs = 16 +- docker_registry = `` + +## 2. Generating configs + +To generate starter configs, run the command below. This will create a list of YAML configs and place them in the `output` folder. + +```bash +ads operator init -t forecast --overwrite --output ~/forecast/ +``` + +The most important files expected to be generated are: + +- `forecast.yaml`: Contains forecast-related configuration. +- `backend_operator_local_python_config.yaml`: This includes a local backend configuration for running forecasting in a local environment. The environment should be set up manually before running the operator. +- `backend_operator_local_container_config.yaml`: This includes a local backend configuration for running forecasting within a local container. The container should be built before running the operator. Please refer to the instructions below for details on how to accomplish this. +- `backend_job_container_config.yaml`: Contains Data Science job-related config to run forecasting in a Data Science job within a container (BYOC) runtime. The container should be built and published before running the operator. Please refer to the instructions below for details on how to accomplish this. +- `backend_job_python_config.yaml`: Contains Data Science job-related config to run forecasting in a Data Science job within a conda runtime. The conda should be built and published before running the operator. + +All generated configurations should be ready to use without the need for any additional adjustments. However, they are provided as starter kit configurations that can be customized as needed. + +## 3. Running forecasting on the local conda environment + +To run forecasting locally, create and activate a new conda environment (`ads-forecasting`). Install all the required libraries listed in the `environment.yaml` file. + +```yaml +- prophet +- neuralprophet +- pmdarima +- statsmodels +- datapane +- cerberus +- sktime +- optuna==2.9.0 +- oracle-automlx==23.2.3 +- oracle-ads>=2.9.0 +``` + +Please review the previously generated `forecast.yaml` file using the `init` command, and make any necessary adjustments to the input and output file locations. 
By default, it assumes that the files should be located in the same folder from which the `init` command was executed. + +Use the command below to verify the forecasting config. + +```bash +ads operator verify -f ~/forecast/forecast.yaml +``` + +Use the following command to run the forecasting within the `ads-forecasting` conda environment. + +```bash +ads operator run -f ~/forecast/forecast.yaml -b local +``` + +The operator will run in your local environment without requiring any additional modifications. + +## 4. Running forecasting on the local container + +To run the forecasting operator within a local container, follow these steps: + +Use the command below to build the forecast container. + +```bash +ads operator build-image -t forecast +``` + +This will create a new `forecast:v1` image, with `/etc/operator` as the designated working directory within the container. + + +Check the `backend_operator_local_container_config.yaml` config file. By default, it should have a `volume` section with the `.oci` configs folder mounted. + +```yaml +volume: + - "/Users//.oci:/root/.oci" +``` + +Mounting the OCI configs folder is only required if an OCI Object Storage bucket will be used to store the input forecasting data or output forecasting result. The input/output folders can also be mounted to the container. + +```yaml +volume: + - /Users//.oci:/root/.oci + - /Users//forecast/data:/etc/operator/data + - /Users//forecast/result:/etc/operator/result +``` + +The full config can look like: +```yaml +kind: operator.local +spec: + image: forecast:v1 + volume: + - /Users//.oci:/root/.oci + - /Users//forecast/data:/etc/operator/data + - /Users//forecast/result:/etc/operator/result +type: container +version: v1 +``` + +Run the forecasting within a container using the command below: + +```bash +ads operator run -f ~/forecast/forecast.yaml --backend-config ~/forecast/backend_operator_local_container_config.yaml +``` + +## 5. Running forecasting in the Data Science job within container runtime + +To execute the forecasting operator within a Data Science job using container runtime, please follow the steps outlined below: + +You can use the following command to build the forecast container. This step can be skipped if you have already done this for running the operator within a local container. + +```bash +ads operator build-image -t forecast +``` + +This will create a new `forecast:v1` image, with `/etc/operator` as the designated working directory within the container. + +Publish the `forecast:v1` container to the [Oracle Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/home.htm). To become familiar with OCI, read the documentation links posted below. + +- [Access Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Concepts/registryoverview.htm#access) +- [Create repositories](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrycreatingarepository.htm#top) +- [Push images](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrypushingimagesusingthedockercli.htm#Pushing_Images_Using_the_Docker_CLI) + +To publish `forecast:v1` to OCR, use the command posted below: + +```bash +ads operator publish-image forecast:v1 --registry +``` + +After the container is published to OCR, it can be used within Data Science jobs service. Check the `backend_job_container_config.yaml` config file. 
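In rough outline it resembles the sketch below; the key names and registry path shown here are illustrative only, and the file generated by `ads operator init` remains the source of truth.

```yaml
# Illustrative sketch only -- keep the values produced by `ads operator init`.
spec:
  infrastructure:
    type: dataScienceJob                          # compartment, project, shape, logging, ...
  runtime:
    type: container
    image: iad.ocir.io/<namespace>/forecast:v1    # the image published in the previous step
```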
It should contain pre-populated infrastructure and runtime sections. The runtime section should contain an image property, something like `image: iad.ocir.io//forecast:v1`. More details about supported options can be found in the ADS Jobs documentation - [Run a Container](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_container.html). + +Adjust the `forecast.yaml` config with proper input/output folders. When the forecasting is run in the Data Science job, it will not have access to local folders. Therefore, input data and output folders should be placed in the Object Storage bucket. Open the `forecast.yaml` and adjust the following fields: + +```yaml +historical_data: + url: oci://bucket@namespace/forecast/input_data/data.csv +output_directory: + url: oci://bucket@namespace/forecast/result/ +test_data: + url: oci://bucket@namespace/forecast/input_data/test.csv +``` + +Run the forecasting on the Data Science jobs using the command posted below: + +```bash +ads operator run -f ~/forecast/forecast.yaml --backend-config ~/forecast/backend_job_container_config.yaml +``` + +The logs can be monitored using the `ads opctl watch` command. + +```bash +ads opctl watch +``` + +## 6. Running forecasting in the Data Science job within conda runtime + +To execute the forecasting operator within a Data Science job using conda runtime, please follow the steps outlined below: + +You can use the following command to build the forecast conda environment. + +```bash +ads operator build-conda -t forecast +``` + +This will create a new `forecast_v1` conda environment and place it in the folder specified within `ads opctl configure` command. + +Use the command below to Publish the `forecast_v1` conda environment to the Object Storage bucket. + +```bash +ads opctl conda publish forecast_v1 +``` +More details about configuring CLI can be found here - [Configuring CLI](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) + + +After the conda environment is published to Object Storage, it can be used within Data Science jobs service. Check the `backend_job_python_config.yaml` config file. It should contain pre-populated infrastructure and runtime sections. The runtime section should contain a `conda` section. + +```yaml +conda: + type: published + uri: oci://bucket@namespace/conda_environments/cpu/forecast/1/forecast_v1 +``` + +More details about supported options can be found in the ADS Jobs documentation - [Run a Python Workload](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_python.html). + +Adjust the `forecast.yaml` config with proper input/output folders. When the forecasting is run in the Data Science job, it will not have access to local folders. Therefore, input data and output folders should be placed in the Object Storage bucket. Open the `forecast.yaml` and adjust the following fields: + +```yaml +historical_data: + url: oci://bucket@namespace/forecast/input_data/data.csv +output_directory: + url: oci://bucket@namespace/forecast/result/ +test_data: + url: oci://bucket@namespace/forecast/input_data/test.csv +``` + +Run the forecasting on the Data Science jobs using the command posted below: + +```bash +ads operator run -f ~/forecast/forecast.yaml --backend-config ~/forecast/backend_job_python_config.yaml +``` + +The logs can be monitored using the `ads opctl watch` command. 
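The command typically takes the OCID of the job run to follow; the OCID below is a hypothetical placeholder.

```bash
# Stream logs for a specific Data Science job run (placeholder OCID)
ads opctl watch ocid1.datasciencejobrun.oc1.iad.<unique_id>
```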
+ +```bash +ads opctl watch +``` diff --git a/ads/opctl/operator/lowcode/forecast/__init__.py b/ads/opctl/operator/lowcode/forecast/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/forecast/__main__.py b/ads/opctl/operator/lowcode/forecast/__main__.py new file mode 100644 index 000000000..1ca4e96bc --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/__main__.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import json +import os +import sys +from typing import Dict, List + +import yaml + +from ads.opctl import logger +from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS +from ads.opctl.operator.common.utils import _parse_input_args + +from .operator_config import ForecastOperatorConfig +from .model.forecast_datasets import ForecastDatasets + + +def operate(operator_config: ForecastOperatorConfig) -> None: + """Runs the forecasting operator.""" + from .model.factory import ForecastOperatorModelFactory + + datasets = ForecastDatasets(operator_config) + ForecastOperatorModelFactory.get_model(operator_config, datasets).generate_report() + + +def verify(spec: Dict, **kwargs: Dict) -> bool: + """Verifies the forecasting operator config.""" + operator = ForecastOperatorConfig.from_dict(spec) + msg_header = ( + f"{'*' * 30} The operator config has been successfully verified {'*' * 30}" + ) + print(msg_header) + print(operator.to_yaml()) + print("*" * len(msg_header)) + + +def main(raw_args: List[str]): + """The entry point of the forecasting the operator.""" + args, _ = _parse_input_args(raw_args) + if not args.file and not args.spec and not os.environ.get(ENV_OPERATOR_ARGS): + logger.info( + "Please specify -f[--file] or -s[--spec] or " + f"pass operator's arguments via {ENV_OPERATOR_ARGS} environment variable." + ) + return + + logger.info("-" * 100) + logger.info(f"{'Running' if not args.verify else 'Verifying'} the operator...") + + # if spec provided as input string, then convert the string into YAML + yaml_string = "" + if args.spec or os.environ.get(ENV_OPERATOR_ARGS): + operator_spec_str = args.spec or os.environ.get(ENV_OPERATOR_ARGS) + try: + yaml_string = yaml.safe_dump(json.loads(operator_spec_str)) + except json.JSONDecodeError: + yaml_string = yaml.safe_dump(yaml.safe_load(operator_spec_str)) + except: + yaml_string = operator_spec_str + + operator_config = ForecastOperatorConfig.from_yaml( + uri=args.file, + yaml_string=yaml_string, + ) + + # run operator + if args.verify: + verify(operator_config) + else: + operate(operator_config) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/ads/opctl/operator/lowcode/forecast/cmd.py b/ads/opctl/operator/lowcode/forecast/cmd.py new file mode 100644 index 000000000..3c4ed24bf --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/cmd.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from typing import Dict + +import click + +from ads.opctl import logger +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.common.operator_yaml_generator import YamlGenerator + +from .const import SupportedModels + + +def init(**kwargs: Dict) -> str: + """ + Generates operator config by the schema. + + Properties + ---------- + kwargs: (Dict, optional). + Additional key value arguments. + + - type: str + The type of the operator. + + Returns + ------- + str + The YAML specification generated based on the schema. + """ + logger.info("==== Forecasting related options ====") + + model_type = click.prompt( + "Provide a model type:", + type=click.Choice(SupportedModels.values()), + default=SupportedModels.Auto, + ) + + return YamlGenerator( + schema=_load_yaml_from_uri(__file__.replace("cmd.py", "schema.yaml")) + ).generate_example_dict(values={"model": model_type, "type": kwargs.get("type")}) diff --git a/ads/opctl/operator/lowcode/forecast/const.py b/ads/opctl/operator/lowcode/forecast/const.py new file mode 100644 index 000000000..71f350caa --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/const.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from ads.common.extended_enum import ExtendedEnumMeta + + +class SupportedModels(str, metaclass=ExtendedEnumMeta): + """Supported forecast models.""" + + Prophet = "prophet" + Arima = "arima" + NeuralProphet = "neuralprophet" + AutoMLX = "automlx" + AutoTS = "autots" + Auto = "auto" + + +class SupportedMetrics(str, metaclass=ExtendedEnumMeta): + """Supported forecast metrics.""" + + MAPE = "MAPE" + RMSE = "RMSE" + MSE = "MSE" + SMAPE = "sMAPE" + WMAPE = "wMAPE" + R2 = "r2" + EXPLAINED_VARIANCE = "Explained Variance" + MEAN_MAPE = "Mean MAPE" + MEAN_RMSE = "Mean RMSE" + MEAN_MSE = "Mean MSE" + MEAN_SMAPE = "Mean sMAPE" + MEAN_WMAPE = "Mean wMAPE" + MEAN_R2 = "Mean r2" + MEAN_EXPLAINED_VARIANCE = "Mean Explained Variance" + MEDIAN_MAPE = "Median MAPE" + MEDIAN_RMSE = "Median RMSE" + MEDIAN_MSE = "Median MSE" + MEDIAN_SMAPE = "Median sMAPE" + MEDIAN_WMAPE = "Median wMAPE" + MEDIAN_R2 = "Median r2" + MEDIAN_EXPLAINED_VARIANCE = "Median Explained Variance" + ELAPSED_TIME = "Elapsed Time" + + +class ForecastOutputColumns(str, metaclass=ExtendedEnumMeta): + """The column names for the forecast.csv output file""" + + DATE = "Date" + SERIES = "Series" + INPUT_VALUE = "input_value" + FITTED_VALUE = "fitted_value" + FORECAST_VALUE = "forecast_value" + UPPER_BOUND = "upper_bound" + LOWER_BOUND = "lower_bound" + + +AUTOMLX_METRIC_MAP = { + "smape": "neg_sym_mean_abs_percent_error", + "mape": "neg_sym_mean_abs_percent_error", + "mase": "neg_mean_abs_scaled_error", + "mae": "neg_mean_absolute_error", + "mse": "neg_mean_squared_error", + "rmse": "neg_root_mean_squared_error", +} + +MAX_COLUMNS_AUTOMLX = 15 +DEFAULT_TRIALS = 10 +SUMMARY_METRICS_HORIZON_LIMIT = 10 +PROPHET_INTERNAL_DATE_COL = "ds" diff --git a/ads/opctl/operator/lowcode/forecast/environment.yaml b/ads/opctl/operator/lowcode/forecast/environment.yaml new file mode 100644 index 000000000..af2d9ba4f --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/environment.yaml @@ -0,0 +1,19 @@ +name: Forecast +channels: + - conda-forge +dependencies: + - python=3.8 + - pip + - pip: 
+ - oracle-ads>=2.9.0 + - prophet + - neuralprophet + - pmdarima + - statsmodels + - datapane + - cerberus + - sktime + - shap + - autots[additional] + - optuna==2.9.0 + - oracle-automlx==23.2.3 diff --git a/ads/opctl/operator/lowcode/forecast/errors.py b/ads/opctl/operator/lowcode/forecast/errors.py new file mode 100644 index 000000000..c71580864 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/errors.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +class ForecastSchemaYamlError(Exception): + """Exception raised when there is an issue with the schema.""" + + def __init__(self, error: str): + super().__init__( + "Invalid forecast operator specification. Check the YAML structure and ensure it " + "complies with the required schema for forecast operator. \n" + f"{error}" + ) + + +class ForecastInputDataError(Exception): + """Exception raised when there is an issue with input data.""" + + def __init__(self, error: str): + super().__init__( + "Invalid input data. Check the input data and ensure it " + "complies with the validation criteria. \n" + f"{error}" + ) diff --git a/ads/opctl/operator/lowcode/forecast/model/__init__.py b/ads/opctl/operator/lowcode/forecast/model/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/forecast/model/arima.py b/ads/opctl/operator/lowcode/forecast/model/arima.py new file mode 100644 index 000000000..6a11efaec --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/arima.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import pandas as pd +import numpy as np +import pmdarima as pm + +from ads.opctl import logger + +from .. 
import utils +from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig +import traceback +from .forecast_datasets import ForecastDatasets, ForecastOutput +from ..const import ForecastOutputColumns + + +class ArimaOperatorModel(ForecastOperatorBaseModel): + """Class representing ARIMA operator model.""" + + def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): + super().__init__(config, datasets=datasets) + self.global_explanation = {} + self.local_explanation = {} + self.train_metrics = True + self.formatted_global_explanation = None + self.formatted_local_explanation = None + + def _build_model(self) -> pd.DataFrame: + full_data_dict = self.datasets.full_data_dict + + # Extract the Confidence Interval Width and convert to arima's equivalent - alpha + if self.spec.confidence_interval_width is None: + self.spec.confidence_interval_width = 1 - self.spec.model_kwargs.get( + "alpha", 0.05 + ) + model_kwargs = self.spec.model_kwargs + model_kwargs["alpha"] = 1 - self.spec.confidence_interval_width + if "error_action" not in model_kwargs.keys(): + model_kwargs["error_action"] = "ignore" + + models = [] + self.datasets.datetime_col = self.spec.datetime_column.name + self.forecast_output = ForecastOutput( + confidence_interval_width=self.spec.confidence_interval_width + ) + + outputs = dict() + outputs_legacy = [] + fitted_values = dict() + actual_values = dict() + dt_columns = dict() + + for i, (target, df) in enumerate(full_data_dict.items()): + # format the dataframe for this target. Dropping NA on target[df] will remove all future data + le, df_encoded = utils._label_encode_dataframe( + df, no_encode={self.spec.datetime_column.name, target} + ) + + df_encoded[self.spec.datetime_column.name] = pd.to_datetime( + df_encoded[self.spec.datetime_column.name], + format=self.spec.datetime_column.format, + ) + df_clean = df_encoded.set_index(self.spec.datetime_column.name) + data_i = df_clean[df_clean[target].notna()] + + # Assume that all columns passed in should be used as additional data + additional_regressors = set(data_i.columns) - { + target, + self.spec.datetime_column.name, + } + logger.debug( + f"Additional Regressors Detected {list(additional_regressors)}" + ) + + # Split data into X and y for arima tune method + y = data_i[target] + X_in = None + if len(additional_regressors): + X_in = data_i.drop(target, axis=1) + + # Build and fit model + model = pm.auto_arima(y=y, X=X_in, **self.spec.model_kwargs) + + fitted_values[target] = model.predict_in_sample(X=X_in) + actual_values[target] = y + + # Build future dataframe + start_date = y.index.values[-1] + n_periods = self.spec.horizon + if len(additional_regressors): + X = df_clean[df_clean[target].isnull()].drop(target, axis=1) + else: + X = pd.date_range( + start=start_date, periods=n_periods, freq=self.spec.freq + ) + + # Predict and format forecast + yhat, conf_int = model.predict( + n_periods=n_periods, + X=X, + return_conf_int=True, + alpha=model_kwargs["alpha"], + ) + yhat_clean = pd.DataFrame(yhat, index=yhat.index, columns=["yhat"]) + + dt_columns[target] = pd.concat( + [ + df_encoded[self.spec.datetime_column.name], + pd.Series(yhat_clean.index), + ] + ) + conf_int_clean = pd.DataFrame( + conf_int, index=yhat.index, columns=["yhat_lower", "yhat_upper"] + ) + forecast = pd.concat([yhat_clean, conf_int_clean], axis=1) + logger.debug(f"-----------------Model {i}----------------------") + logger.debug(forecast[["yhat", "yhat_lower", "yhat_upper"]].tail()) + + # 
Collect all outputs + models.append(model) + outputs_legacy.append( + forecast.reset_index().rename(columns={"index": "ds"}) + ) + outputs[target] = forecast + + self.models = models + + logger.debug("===========Done===========") + + # Merge the outputs from each model into 1 df with all outputs by target and category + col = self.original_target_column + output_col = pd.DataFrame() + yhat_upper_name = ForecastOutputColumns.UPPER_BOUND + yhat_lower_name = ForecastOutputColumns.LOWER_BOUND + for cat in self.categories: + output_i = pd.DataFrame() + output_i["Date"] = dt_columns[f"{col}_{cat}"] + output_i = output_i.set_index("Date") + output_i["Series"] = cat + output_i["input_value"] = actual_values[f"{col}_{cat}"] + + output_i["fitted_value"] = fitted_values[f"{col}_{cat}"] + output_i["forecast_value"] = outputs[f"{col}_{cat}"]["yhat"] + output_i[yhat_upper_name] = outputs[f"{col}_{cat}"]["yhat_upper"] + output_i[yhat_lower_name] = outputs[f"{col}_{cat}"]["yhat_lower"] + + output_i = output_i.reset_index(drop=False) + output_col = pd.concat([output_col, output_i]) + self.forecast_output.add_category( + category=cat, target_category_column=f"{col}_{cat}", forecast=output_i + ) + + output_col = output_col.reset_index(drop=True) + + return output_col + + def _generate_report(self): + """The method that needs to be implemented on the particular model level.""" + import datapane as dp + + sec5_text = dp.Text(f"## ARIMA Model Parameters") + blocks = [ + dp.HTML(m.summary().as_html(), label=self.target_columns[i]) + for i, m in enumerate(self.models) + ] + sec5 = dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0] + all_sections = [sec5_text, sec5] + + if self.spec.generate_explanations: + try: + # If the key is present, call the "explain_model" method + self.explain_model( + datetime_col_name=self.spec.datetime_column.name, + explain_predict_fn=self._custom_predict_arima, + ) + + # Create a markdown text block for the global explanation section + global_explanation_text = dp.Text( + f"## Global Explanation of Models \n " + "The following tables provide the feature attribution for the global explainability." 
+ ) + + # Convert the global explanation data to a DataFrame + global_explanation_df = pd.DataFrame(self.global_explanation) + + self.formatted_global_explanation = ( + global_explanation_df / global_explanation_df.sum(axis=0) * 100 + ) + + # Create a markdown section for the global explainability + global_explanation_section = dp.Blocks( + "### Global Explainability ", + dp.DataTable(self.formatted_global_explanation), + ) + + aggregate_local_explanations = pd.DataFrame() + for s_id, local_ex_df in self.local_explanation.items(): + local_ex_df_copy = local_ex_df.copy() + local_ex_df_copy["Series"] = s_id + aggregate_local_explanations = pd.concat( + [aggregate_local_explanations, local_ex_df_copy], axis=0 + ) + self.formatted_local_explanation = aggregate_local_explanations + + local_explanation_text = dp.Text(f"## Local Explanation of Models \n ") + blocks = [ + dp.DataTable( + local_ex_df.div(local_ex_df.abs().sum(axis=1), axis=0) * 100, + label=s_id, + ) + for s_id, local_ex_df in self.local_explanation.items() + ] + local_explanation_section = ( + dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0] + ) + + # Append the global explanation text and section to the "all_sections" list + all_sections = all_sections + [ + global_explanation_text, + global_explanation_section, + local_explanation_text, + local_explanation_section, + ] + except Exception as e: + logger.warn(f"Failed to generate Explanations with error: {e}.") + logger.debug(f"Full Traceback: {traceback.format_exc()}") + + model_description = dp.Text( + "An autoregressive integrated moving average, or ARIMA, is a statistical " + "analysis model that uses time series data to either better understand the " + "data set or to predict future trends. A statistical model is autoregressive if " + "it predicts future values based on past values." + ) + other_sections = all_sections + + return ( + model_description, + other_sections, + ) + + def _custom_predict_arima(self, data): + """ + Custom prediction function for ARIMA models. + + Parameters + ---------- + data (array-like): The input data to be predicted. + + Returns + ------- + array-like: The predicted values. + + """ + # Get the index of the current series id + series_index = self.target_columns.index(self.series_id) + + # Use the ARIMA model to predict the values + predictions = self.models[series_index].predict(X=data, n_periods=len(data)) + + return predictions diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py new file mode 100644 index 000000000..605a62184 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- +import traceback + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import pandas as pd +import numpy as np +from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl.operator.lowcode.forecast.const import ( + AUTOMLX_METRIC_MAP, + ForecastOutputColumns, +) +from ads.opctl import logger + +from .. 
import utils +from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig +from .forecast_datasets import ForecastDatasets, ForecastOutput + +AUTOMLX_N_ALGOS_TUNED = 4 +AUTOMLX_DEFAULT_SCORE_METRIC = "neg_sym_mean_abs_percent_error" + + +class AutoMLXOperatorModel(ForecastOperatorBaseModel): + """Class representing AutoMLX operator model.""" + + def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): + super().__init__(config, datasets) + self.global_explanation = {} + self.local_explanation = {} + self.train_metrics = True + + @runtime_dependency( + module="automl", + err_msg=( + "Please run `pip3 install oracle-automlx==23.2.3` to install the required dependencies for automlx." + ), + ) + @runtime_dependency( + module="sktime", + err_msg=( + "Please run `pip3 install sktime` to install the required dependencies for automlx." + ), + ) + def _build_model(self) -> pd.DataFrame: + from automl import init + from sktime.forecasting.model_selection import temporal_train_test_split + + init(engine="local", check_deprecation_warnings=False) + + full_data_dict = self.datasets.full_data_dict + + models = dict() + outputs = dict() + outputs_legacy = dict() + selected_models = dict() + date_column = self.spec.datetime_column.name + horizon = self.spec.horizon + self.datasets.datetime_col = date_column + self.spec.confidence_interval_width = self.spec.confidence_interval_width or 0.8 + self.forecast_output = ForecastOutput( + confidence_interval_width=self.spec.confidence_interval_width + ) + + # Clean up kwargs for pass through + model_kwargs_cleaned = self.spec.model_kwargs.copy() + model_kwargs_cleaned["n_algos_tuned"] = model_kwargs_cleaned.get( + "n_algos_tuned", AUTOMLX_N_ALGOS_TUNED + ) + model_kwargs_cleaned["score_metric"] = AUTOMLX_METRIC_MAP.get( + self.spec.metric, + model_kwargs_cleaned.get("score_metric", AUTOMLX_DEFAULT_SCORE_METRIC), + ) + model_kwargs_cleaned.pop("task", None) + time_budget = model_kwargs_cleaned.pop("time_budget", 0) + model_kwargs_cleaned[ + "preprocessing" + ] = self.spec.preprocessing or model_kwargs_cleaned.get("preprocessing", True) + + for i, (target, df) in enumerate(full_data_dict.items()): + logger.debug("Running automl for {} at position {}".format(target, i)) + series_values = df[df[target].notna()] + # drop NaNs for the time period where data wasn't recorded + series_values.dropna(inplace=True) + df[date_column] = pd.to_datetime( + df[date_column], format=self.spec.datetime_column.format + ) + df = df.set_index(date_column) + # if len(df.columns) > 1: + # when additional columns are present + y_train, y_test = temporal_train_test_split(df, test_size=horizon) + forecast_x = y_test.drop(target, axis=1) + # else: + # y_train = df + # forecast_x = None + logger.debug( + "Time Index is" + "" + if y_train.index.is_monotonic + else "NOT" + "monotonic." 
+ ) + model = automl.Pipeline( + task="forecasting", + **model_kwargs_cleaned, + ) + model.fit( + X=y_train.drop(target, axis=1), + y=pd.DataFrame(y_train[target]), + time_budget=time_budget, + ) + logger.debug("Selected model: {}".format(model.selected_model_)) + logger.debug( + "Selected model params: {}".format(model.selected_model_params_) + ) + summary_frame = model.forecast( + X=forecast_x, + periods=horizon, + alpha=1 - (self.spec.confidence_interval_width / 100), + ) + input_values = pd.Series( + y_train[target].values, + name="input_value", + index=y_train.index, + ) + fitted_values_raw = model.predict(y_train.drop(target, axis=1)) + fitted_values = pd.Series( + fitted_values_raw[target].values, + name="fitted_value", + index=y_train.index, + ) + + summary_frame = pd.concat( + [input_values, fitted_values, summary_frame], axis=1 + ) + + # Collect Outputs + selected_models[target] = { + "series_id": target, + "selected_model": model.selected_model_, + "model_params": model.selected_model_params_, + } + models[target] = model + summary_frame = summary_frame.rename_axis("ds").reset_index() + summary_frame = summary_frame.rename( + columns={ + f"{target}_ci_upper": "yhat_upper", + f"{target}_ci_lower": "yhat_lower", + f"{target}": "yhat", + } + ) + # In case of Naive model, model.forecast function call does not return confidence intervals. + if "yhat_upper" not in summary_frame: + summary_frame["yhat_upper"] = np.NAN + summary_frame["yhat_lower"] = np.NAN + outputs[target] = summary_frame + # outputs_legacy[target] = summary_frame + + logger.debug("===========Forecast Generated===========") + outputs_merged = pd.DataFrame() + + # Merge the outputs from each model into 1 df with all outputs by target and category + col = self.original_target_column + yhat_upper_name = ForecastOutputColumns.UPPER_BOUND + yhat_lower_name = ForecastOutputColumns.LOWER_BOUND + for cat in self.categories: # Note: add [:2] to restrict + output_i = pd.DataFrame() + output_i["Date"] = outputs[f"{col}_{cat}"]["ds"] + output_i["Series"] = cat + output_i["input_value"] = outputs[f"{col}_{cat}"]["input_value"] + output_i[f"fitted_value"] = outputs[f"{col}_{cat}"]["fitted_value"] + output_i[f"forecast_value"] = outputs[f"{col}_{cat}"]["yhat"] + output_i[yhat_upper_name] = outputs[f"{col}_{cat}"]["yhat_upper"] + output_i[yhat_lower_name] = outputs[f"{col}_{cat}"]["yhat_lower"] + outputs_merged = pd.concat([outputs_merged, output_i]) + outputs_legacy[f"{col}_{cat}"] = output_i + self.forecast_output.add_category( + category=cat, target_category_column=f"{col}_{cat}", forecast=output_i + ) + + # output_col = output_col.sort_values(self.spec.datetime_column.name).reset_index(drop=True) + # output_col = output_col.reset_index(drop=True) + # outputs_merged = pd.concat([outputs_merged, output_col], axis=1) + + self.models = models + return outputs_merged + + @runtime_dependency( + module="datapane", + err_msg=( + "Please run `pip3 install datapane` to install the required dependencies for report generation." + ), + ) + def _generate_report(self): + """ + Generate the report for the automlx model. + + Parameters + ---------- + None + + Returns + ------- + - model_description (datapane.Text): A Text component containing the description of the automlx model. + - other_sections (List[Union[datapane.Text, datapane.Blocks]]): A list of Text and Blocks components representing various sections of the report. + - forecast_col_name (str): The name of the forecasted column. 
+ - train_metrics (bool): A boolean value indicating whether to include train metrics in the report. + - ds_column_series (pd.Series): The pd.Series object representing the datetime column of the dataset. + - ds_forecast_col (pd.Series): The pd.Series object representing the forecasted column. + - ci_col_names (List[str]): A list of column names for the confidence interval in the report. + """ + import datapane as dp + + """The method that needs to be implemented on the particular model level.""" + selected_models_text = dp.Text( + f"## Selected Models Overview \n " + "The following tables provide information regarding the " + "chosen model for each series and the corresponding parameters of the models." + ) + selected_models = dict() + models = self.models + for i, (target, df) in enumerate(self.full_data_dict.items()): + selected_models[target] = { + "series_id": target, + "selected_model": models[target].selected_model_, + "model_params": models[target].selected_model_params_, + } + selected_models_df = pd.DataFrame( + selected_models.items(), columns=["series_id", "best_selected_model"] + ) + selected_df = selected_models_df["best_selected_model"].apply(pd.Series) + selected_models_section = dp.Blocks( + "### Best Selected Model", dp.DataTable(selected_df) + ) + + all_sections = [selected_models_text, selected_models_section] + + if self.spec.generate_explanations: + try: + # If the key is present, call the "explain_model" method + self.explain_model( + datetime_col_name=self.spec.datetime_column.name, + explain_predict_fn=self._custom_predict_automlx, + ) + + # Create a markdown text block for the global explanation section + global_explanation_text = dp.Text( + f"## Global Explanation of Models \n " + "The following tables provide the feature attribution for the global explainability." + ) + + # Convert the global explanation data to a DataFrame + global_explanation_df = pd.DataFrame(self.global_explanation) + + self.formatted_global_explanation = ( + global_explanation_df / global_explanation_df.sum(axis=0) * 100 + ) + + # Create a markdown section for the global explainability + global_explanation_section = dp.Blocks( + "### Global Explainability ", + dp.DataTable(self.formatted_global_explanation), + ) + + aggregate_local_explanations = pd.DataFrame() + for s_id, local_ex_df in self.local_explanation.items(): + local_ex_df_copy = local_ex_df.copy() + local_ex_df_copy["Series"] = s_id + aggregate_local_explanations = pd.concat( + [aggregate_local_explanations, local_ex_df_copy], axis=0 + ) + self.formatted_local_explanation = aggregate_local_explanations + + local_explanation_text = dp.Text(f"## Local Explanation of Models \n ") + blocks = [ + dp.DataTable( + local_ex_df.div(local_ex_df.abs().sum(axis=1), axis=0) * 100, + label=s_id, + ) + for s_id, local_ex_df in self.local_explanation.items() + ] + local_explanation_section = ( + dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0] + ) + + # Append the global explanation text and section to the "all_sections" list + all_sections = all_sections + [ + global_explanation_text, + global_explanation_section, + local_explanation_text, + local_explanation_section, + ] + except Exception as e: + logger.warn(f"Failed to generate Explanations with error: {e}.") + logger.debug(f"Full Traceback: {traceback.format_exc()}") + + model_description = dp.Text( + "The AutoMLx model automatically preprocesses, selects and engineers " + "high-quality features in your dataset, which are then provided for further processing." 
+ ) + other_sections = all_sections + + return ( + model_description, + other_sections, + ) + + def _custom_predict_automlx(self, data): + """ + Predicts the future values of a time series using the AutoMLX model. + Parameters + ---------- + data (numpy.ndarray): The input data to be used for prediction. + + Returns + ------- + numpy.ndarray: The predicted future values of the time series. + """ + temp = 0 + data_temp = pd.DataFrame( + data, + columns=[col for col in self.dataset_cols], + ) + + return self.models.get(self.series_id).forecast( + X=data_temp, periods=data_temp.shape[0] + )[self.series_id] diff --git a/ads/opctl/operator/lowcode/forecast/model/autots.py b/ads/opctl/operator/lowcode/forecast/model/autots.py new file mode 100644 index 000000000..d711f7e1e --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/autots.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import copy +import traceback +import pandas as pd +import numpy as np +import yaml + +from ads.opctl import logger +from ads.opctl.operator.lowcode.forecast import utils +from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig +from ads.common.decorator.runtime_dependency import runtime_dependency +from .forecast_datasets import ForecastDatasets, ForecastOutput +from ..const import ForecastOutputColumns + + +AUTOTS_MAX_GENERATION = 10 +AUTOTS_MODELS_TO_VALIDATE = 0.15 + + +class AutoTSOperatorModel(ForecastOperatorBaseModel): + """Class representing AutoTS operator model.""" + + def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): + super().__init__(config, datasets) + self.global_explanation = {} + self.local_explanation = {} + + @runtime_dependency( + module="autots", + err_msg="Please run `pip3 install autots` to install the required dependencies for autots.", + ) + def _build_model(self) -> pd.DataFrame: + """Builds the AutoTS model and generates forecasts. 
+ + Returns: + pd.DataFrame: AutoTS model forecast dataframe + """ + + # Import necessary libraries + from autots import AutoTS, create_regressor + + models = dict() + outputs = dict() + outputs_legacy = [] + # Get the name of the datetime column + date_column = self.spec.datetime_column.name + self.datasets.datetime_col = date_column + self.forecast_output = ForecastOutput( + confidence_interval_width=self.spec.confidence_interval_width + ) + + # Initialize the AutoTS model with specified parameters + model = AutoTS( + forecast_length=self.spec.horizon, + frequency=self.spec.model_kwargs.get("frequency", "infer"), + prediction_interval=self.spec.confidence_interval_width, + max_generations=self.spec.model_kwargs.get( + "max_generations", AUTOTS_MAX_GENERATION + ), + no_negatives=self.spec.model_kwargs.get("no_negatives", False), + constraint=self.spec.model_kwargs.get("constraint", None), + ensemble=self.spec.model_kwargs.get("ensemble", "auto"), + initial_template=self.spec.model_kwargs.get( + "initial_template", "General+Random" + ), + random_seed=self.spec.model_kwargs.get("random_seed", 2022), + holiday_country=self.spec.model_kwargs.get("holiday_country", "US"), + subset=self.spec.model_kwargs.get("subset", None), + aggfunc=self.spec.model_kwargs.get("aggfunc", "first"), + na_tolerance=self.spec.model_kwargs.get("na_tolerance", 1), + drop_most_recent=self.spec.model_kwargs.get("drop_most_recent", 0), + drop_data_older_than_periods=self.spec.model_kwargs.get( + "drop_data_older_than_periods", None + ), + model_list=self.spec.model_kwargs.get("model_list", "fast_parallel"), + transformer_list=self.spec.model_kwargs.get("transformer_list", "auto"), + transformer_max_depth=self.spec.model_kwargs.get( + "transformer_max_depth", 6 + ), + models_mode=self.spec.model_kwargs.get("models_mode", "random"), + num_validations=self.spec.model_kwargs.get("num_validations", "auto"), + models_to_validate=self.spec.model_kwargs.get( + "models_to_validate", AUTOTS_MODELS_TO_VALIDATE + ), + max_per_model_class=self.spec.model_kwargs.get("max_per_model_class", None), + validation_method=self.spec.model_kwargs.get( + "validation_method", "backwards" + ), + min_allowed_train_percent=self.spec.model_kwargs.get( + "min_allowed_train_percent", 0.5 + ), + remove_leading_zeroes=self.spec.model_kwargs.get( + "remove_leading_zeroes", False + ), + prefill_na=self.spec.model_kwargs.get("prefill_na", None), + introduce_na=self.spec.model_kwargs.get("introduce_na", None), + preclean=self.spec.model_kwargs.get("preclean", None), + model_interrupt=self.spec.model_kwargs.get("model_interrupt", True), + generation_timeout=self.spec.model_kwargs.get("generation_timeout", None), + current_model_file=self.spec.model_kwargs.get("current_model_file", None), + verbose=self.spec.model_kwargs.get("verbose", 1), + n_jobs=self.spec.model_kwargs.get("n_jobs", -1), + ) + + # Prepare the data for model training + full_data_dict = self.datasets.full_data_dict + temp_list = [full_data_dict[i] for i in full_data_dict.keys()] + melt_temp = [ + temp_list[i].melt( + temp_list[i].columns.difference(self.target_columns), + var_name="series_id", + value_name=self.original_target_column, + ) + for i in range(len(self.target_columns)) + ] + + self.full_data_long = pd.concat(melt_temp) + + if self.spec.additional_data: + df_temp = ( + self.full_data_long.set_index([self.spec.target_column]) + .reset_index(drop=True) + .copy() + ) + df_temp[self.spec.datetime_column.name] = pd.to_datetime( + df_temp[self.spec.datetime_column.name] + ) + r_tr, _ = 
create_regressor( + df_temp.pivot( + [self.spec.datetime_column.name], + columns="series_id", + values=list( + self.original_additional_data.set_index( + [ + self.spec.target_category_columns[0], + self.spec.datetime_column.name, + ] + ).columns + ), + ), + forecast_length=self.spec.horizon, + ) + + self.future_regressor_train = r_tr.copy() + + # Fit the model to the training data + model = model.fit( + self.full_data_long.groupby("series_id") + .head(-self.spec.horizon) + .reset_index(drop=True), + date_col=self.spec.datetime_column.name, + value_col=self.original_target_column, + future_regressor=r_tr.head(-self.spec.horizon) + if self.spec.additional_data + else None, + id_col="series_id", + ) + + # Store the trained model and generate forecasts + self.models = copy.deepcopy(model) + logger.debug("===========Forecast Generated===========") + self.prediction = model.predict( + future_regressor=r_tr.tail(self.spec.horizon) + if self.spec.additional_data + else None + ) + + outputs = dict() + + output_col = pd.DataFrame() + yhat_upper_name = ForecastOutputColumns.UPPER_BOUND + yhat_lower_name = ForecastOutputColumns.LOWER_BOUND + + for cat in self.categories: + output_i = pd.DataFrame() + cat_target = f"{self.original_target_column}_{cat}" + input_data_i = full_data_dict[cat_target] + + output_i["Date"] = pd.to_datetime( + input_data_i[self.spec.datetime_column.name], + format=self.spec.datetime_column.format, + ) + output_i["Series"] = cat + output_i["input_value"] = input_data_i[cat_target] + output_i["fitted_value"] = float("nan") + output_i = output_i.set_index("Date") + + output_i["forecast_value"] = self.prediction.forecast[[cat_target]] + output_i[yhat_upper_name] = self.prediction.upper_forecast[[cat_target]] + output_i[yhat_lower_name] = self.prediction.lower_forecast[[cat_target]] + + output_i = output_i.reset_index() + output_col = pd.concat([output_col, output_i]) + self.forecast_output.add_category( + category=cat, target_category_column=cat_target, forecast=output_i + ) + + output_col = output_col.reset_index(drop=True) + + logger.debug("===========Done===========") + + return output_col + + def _generate_report(self) -> tuple: + """ + Generates the report for the given function. + + Returns: + tuple: A tuple containing the following elements: + - model_description (dp.Text): A text object containing the description of the AutoTS model. + - other_sections (list): A list of sections to be included in the report. + - forecast_col_name (str): The name of the forecast column. + - train_metrics (bool): A boolean indicating whether to include train metrics. + - ds_column_series (pd.Series): A pandas Series containing the datetime column values. + - ds_forecast_col (pd.Index): A pandas Index containing the forecast column values. + - ci_col_names (list): A list of column names for confidence intervals. + """ + import datapane as dp + + # Section 1: Forecast Overview + sec1_text = dp.Text( + "## Forecast Overview \n" + "These plots show your forecast in the context of historical data." 
+ ) + sec_1 = utils._select_plot_list( + lambda idx, *args: self.prediction.plot( + self.models.df_wide_numeric, + series=self.models.df_wide_numeric.columns[idx], + start_date=self.models.df_wide_numeric.reset_index()[ + self.spec.datetime_column.name + ].min(), + ), + target_columns=self.target_columns, + ) + + # Section 2: AutoTS Model Parameters + sec2_text = dp.Text(f"## AutoTS Model Parameters") + try: + sec2 = dp.Code( + code=yaml.dump(list(self.models.best_model.T.to_dict().values())[0]), + language="yaml", + ) + + except KeyError as ke: + logger.warn(f"Issue generating Model Parameters Table Section. Skipping") + sec2 = dp.Text(f"Error generating model parameters.") + all_sections = [sec1_text, sec_1, sec2_text, sec2] + + if self.spec.generate_explanations: + # If the key is present, call the "explain_model" method + try: + self.explain_model( + datetime_col_name=self.spec.datetime_column.name, + explain_predict_fn=self._custom_predict_autots, + ) + + # Create a markdown text block for the global explanation section + global_explanation_text = dp.Text( + f"## Global Explanation of Models \n " + "The following tables provide the feature attribution for the global explainability." + ) + + # Convert the global explanation data to a DataFrame + global_explanation_df = pd.DataFrame(self.global_explanation).drop( + index=["series_id", self.spec.target_column] + ) + + self.formatted_global_explanation = ( + global_explanation_df / global_explanation_df.sum(axis=0) * 100 + ) + + # Create a markdown section for the global explainability + global_explanation_section = dp.Blocks( + "### Global Explainability ", + dp.DataTable(self.formatted_global_explanation), + ) + + aggregate_local_explanations = pd.DataFrame() + for s_id, local_ex_df in self.local_explanation.items(): + local_ex_df_copy = local_ex_df.copy() + local_ex_df_copy["Series"] = s_id + aggregate_local_explanations = pd.concat( + [aggregate_local_explanations, local_ex_df_copy], axis=0 + ) + self.formatted_local_explanation = aggregate_local_explanations + + local_explanation_text = dp.Text(f"## Local Explanation of Models \n ") + blocks = [ + dp.DataTable( + local_ex_df.div(local_ex_df.abs().sum(axis=1), axis=0) * 100, + label=s_id, + ) + for s_id, local_ex_df in self.local_explanation.items() + ] + local_explanation_section = ( + dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0] + ) + + # Append the global explanation text and section to the "all_sections" list + all_sections = all_sections + [ + global_explanation_text, + global_explanation_section, + local_explanation_text, + local_explanation_section, + ] + except Exception as e: + logger.warn(f"Failed to generate Explanations with error: {e}.") + logger.debug(f"Full Traceback: {traceback.format_exc()}") + + # Model Description + model_description = dp.Text( + "AutoTS is a time series package for Python designed for rapidly deploying high-accuracy forecasts at scale. " + "In 2023, AutoTS has won in the M6 forecasting competition, " + "delivering the highest performance investment decisions across 12 months of stock market forecasting." + ) + + other_sections = all_sections + + return ( + model_description, + other_sections, + ) + + def _custom_predict_autots(self, data): + """ + Predicts the future values of a time series using the AutoTS model. + + Parameters + ---------- + data (numpy.ndarray): The input data to be used for prediction. + + Returns + ------- + numpy.ndarray: The predicted future values of the time series. 
+ """ + + data.index = pd.to_datetime(data.index) + temp_model = copy.deepcopy(self.models) + + if data.shape[0] > 1: + temp_model.fit_data( + data[~data.index.duplicated()], + future_regressor=self.future_regressor_train.head(-self.spec.horizon), + ) + dedup_shape = data.shape[0] - data[~data.index.duplicated()].shape[0] + 1 + return pd.Series(0, index=np.arange(dedup_shape)).append( + temp_model.back_forecast( + tail=data[~data.index.duplicated()].shape[0] - 1 + ) + .forecast[self.spec.target_column] + .fillna(0) + ) + + return temp_model.predict( + future_regressor=self.future_regressor_train.loc[ + self.future_regressor_train.index.isin(data.index) + ], + forecast_length=1, + ).forecast[self.series_id] + + def _generate_train_metrics(self) -> pd.DataFrame: + """ + Generate Training Metrics when fitted data is not available. + The method that needs to be implemented on the particular model level. + + metrics Sales_Store 1 + sMAPE 26.19 + MAPE 2.96E+18 + RMSE 2014.192531 + r2 -4.60E-06 + Explained Variance 0.002177087 + """ + mapes = pd.DataFrame(self.models.best_model_per_series_mape()).T + scores = pd.DataFrame( + self.models.best_model_per_series_score(), columns=["AutoTS Score"] + ).T + return pd.concat([mapes, scores]) diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py new file mode 100644 index 000000000..61b8484cf --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -0,0 +1,665 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import tempfile +import time +from abc import ABC, abstractmethod +from typing import Tuple +import traceback + +import fsspec +import numpy as np +import pandas as pd + +from ads.opctl.operator.lowcode.forecast.utils import default_signer +from ads.common.object_storage_details import ObjectStorageDetails +from ads.opctl import logger + +from .. import utils +from ..const import SUMMARY_METRICS_HORIZON_LIMIT, SupportedMetrics, SupportedModels +from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec +from ads.common.decorator.runtime_dependency import runtime_dependency +from .forecast_datasets import ForecastDatasets, ForecastOutput + + +class ForecastOperatorBaseModel(ABC): + """The base class for the forecast operator models.""" + + def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): + """Instantiates the ForecastOperatorBaseModel instance. + + Properties + ---------- + config: ForecastOperatorConfig + The forecast operator configuration. + """ + self.config: ForecastOperatorConfig = config + self.spec: ForecastOperatorSpec = config.spec + self.datasets: ForecastDatasets = datasets + + self.original_user_data = datasets.original_user_data + self.original_total_data = datasets.original_total_data + self.original_additional_data = datasets.original_additional_data + self.full_data_dict = datasets.full_data_dict + self.target_columns = datasets.target_columns + self.categories = datasets.categories + + self.test_eval_metrics = None + self.original_target_column = self.spec.target_column + + # these fields are populated in the _build_model() method + self.models = None + # "outputs" is a list of outputs generated by the models. 
These should only be generated when the framework requires the original output for plotting + self.outputs = None + self.forecast_output = None + + self.train_metrics = False + self.forecast_col_name = "yhat" + self.perform_tuning = self.spec.tuning != None + + def generate_report(self): + """Generates the forecasting report.""" + import warnings + from sklearn.exceptions import ConvergenceWarning + + with warnings.catch_warnings(): + warnings.simplefilter(action="ignore", category=FutureWarning) + warnings.simplefilter(action="ignore", category=UserWarning) + warnings.simplefilter(action="ignore", category=RuntimeWarning) + warnings.simplefilter(action="ignore", category=ConvergenceWarning) + import datapane as dp + + # load data and build models + start_time = time.time() + result_df = self._build_model() + elapsed_time = time.time() - start_time + + # Generate metrics + summary_metrics = None + test_data = None + self.eval_metrics = None + + if self.spec.generate_report or self.spec.generate_metrics: + if self.train_metrics: + self.eval_metrics = utils.evaluate_train_metrics( + self.target_columns, + self.datasets, + self.forecast_output, + self.spec.datetime_column.name, + target_col=self.forecast_col_name, + ) + else: + try: + self.eval_metrics = self._generate_train_metrics() + except NotImplementedError: + logger.warn( + f"Training Metrics are not available for model type {self.spec.model}" + ) + + if self.spec.test_data: + try: + ( + self.test_eval_metrics, + summary_metrics, + test_data, + ) = self._test_evaluate_metrics( + target_columns=self.target_columns, + test_filename=self.spec.test_data.url, + output=self.forecast_output, + target_col=self.forecast_col_name, + elapsed_time=elapsed_time, + ) + except Exception as e: + logger.warn("Unable to generate Test Metrics.") + logger.debug(f"Full Traceback: {traceback.format_exc()}") + report_sections = [] + + if self.spec.generate_report: + # build the report + ( + model_description, + other_sections, + ) = self._generate_report() + + ds_column_series = self.datasets.get_longest_datetime_column() + + title_text = dp.Text("# Forecast Report") + + md_columns = " * ".join([f"{x} \n" for x in self.target_columns]) + first_10_rows_blocks = [ + dp.DataTable( + df.head(10).rename({col: self.spec.target_column}, axis=1), + caption="Start", + label=col, + ) + for col, df in self.full_data_dict.items() + ] + + last_10_rows_blocks = [ + dp.DataTable( + df.tail(10).rename({col: self.spec.target_column}, axis=1), + caption="End", + label=col, + ) + for col, df in self.full_data_dict.items() + ] + + data_summary_blocks = [ + dp.DataTable( + df.rename({col: self.spec.target_column}, axis=1).describe(), + caption="Summary Statistics", + label=col, + ) + for col, df in self.full_data_dict.items() + ] + summary = dp.Blocks( + dp.Select( + blocks=[ + dp.Group( + dp.Text( + f"You selected the **`{self.spec.model}`** model." + ), + model_description, + dp.Text( + "Based on your dataset, you could have also selected " + f"any of the models: `{'`, `'.join(SupportedModels.keys())}`." 
+ ), + dp.Group( + dp.BigNumber( + heading="Analysis was completed in ", + value=utils.human_time_friendly(elapsed_time), + ), + dp.BigNumber( + heading="Starting time index", + value=ds_column_series.min().strftime( + "%B %d, %Y" + ), + ), + dp.BigNumber( + heading="Ending time index", + value=ds_column_series.max().strftime( + "%B %d, %Y" + ), + ), + dp.BigNumber( + heading="Num series", + value=len(self.target_columns), + ), + columns=4, + ), + dp.Text("### First 10 Rows of Data"), + dp.Select(blocks=first_10_rows_blocks) + if len(first_10_rows_blocks) > 1 + else first_10_rows_blocks[0], + dp.Text("----"), + dp.Text("### Last 10 Rows of Data"), + dp.Select(blocks=last_10_rows_blocks) + if len(last_10_rows_blocks) > 1 + else last_10_rows_blocks[0], + dp.Text("### Data Summary Statistics"), + dp.Select(blocks=data_summary_blocks) + if len(data_summary_blocks) > 1 + else data_summary_blocks[0], + label="Summary", + ), + dp.Text( + "The following report compares a variety of metrics and plots " + f"for your target columns: \n {md_columns}.\n", + label="Target Columns", + ), + ] + ), + ) + + test_metrics_sections = [] + if ( + self.test_eval_metrics is not None + and not self.test_eval_metrics.empty + ): + sec7_text = dp.Text(f"## Test Data Evaluation Metrics") + sec7 = dp.DataTable(self.test_eval_metrics) + test_metrics_sections = test_metrics_sections + [sec7_text, sec7] + + if summary_metrics is not None and not summary_metrics.empty: + sec8_text = dp.Text(f"## Test Data Summary Metrics") + sec8 = dp.DataTable(summary_metrics) + test_metrics_sections = test_metrics_sections + [sec8_text, sec8] + + train_metrics_sections = [] + if self.eval_metrics is not None and not self.eval_metrics.empty: + sec9_text = dp.Text(f"## Training Data Metrics") + sec9 = dp.DataTable(self.eval_metrics) + train_metrics_sections = [sec9_text, sec9] + + forecast_text = dp.Text(f"## Forecasted Data Overlaying Historical") + forecast_sec = utils.get_forecast_plots( + self.forecast_output, + self.target_columns, + horizon=self.spec.horizon, + test_data=test_data, + ci_interval_width=self.spec.confidence_interval_width, + ) + forecast_plots = [forecast_text, forecast_sec] + + yaml_appendix_title = dp.Text(f"## Reference: YAML File") + yaml_appendix = dp.Code(code=self.config.to_yaml(), language="yaml") + report_sections = ( + [title_text, summary] + + forecast_plots + + other_sections + + test_metrics_sections + + train_metrics_sections + + [yaml_appendix_title, yaml_appendix] + ) + + # save the report and result CSV + self._save_report( + report_sections=report_sections, + result_df=result_df, + metrics_df=self.eval_metrics, + test_metrics_df=self.test_eval_metrics, + ) + + def _test_evaluate_metrics( + self, target_columns, test_filename, output, target_col="yhat", elapsed_time=0 + ): + total_metrics = pd.DataFrame() + summary_metrics = pd.DataFrame() + data = None + try: + storage_options = ( + default_signer() + if ObjectStorageDetails.is_oci_path(test_filename) + else {} + ) + data = utils._load_data( + filename=test_filename, + format=self.spec.test_data.format, + storage_options=storage_options, + columns=self.spec.test_data.columns, + ) + except pd.errors.EmptyDataError: + logger.warn("Empty testdata file") + return total_metrics, summary_metrics, None + + if data.empty: + return total_metrics, summary_metrics, None + + data = self._preprocess( + data, self.spec.datetime_column.name, self.spec.datetime_column.format + ) + data, confirm_targ_columns = utils._clean_data( + data=data, + 
target_column=self.original_target_column, + target_category_columns=self.spec.target_category_columns, + datetime_column="ds", + ) + + # Calculating Test Metrics + for cat in self.forecast_output.list_categories(): + target_column_i = self.forecast_output.category_to_target[cat] + output_forecast_i = self.forecast_output.get_category(cat) + # Only columns present in test file will be used to generate test error + if target_column_i in data: + # Assuming that predictions have all forecast values + dates = output_forecast_i["Date"] + # Filling zeros for any date missing in test data to maintain consistency in metric calculation as in all other missing values cases it comes as 0 + y_true = [ + data.loc[data["ds"] == date, target_column_i].values[0] + if date in data["ds"].values + else 0 + for date in dates + ] + y_pred_i = output_forecast_i["forecast_value"].values + y_pred = np.asarray(y_pred_i[-len(y_true) :]) + + metrics_df = utils._build_metrics_df( + y_true=y_true[-self.spec.horizon :], + y_pred=y_pred[-self.spec.horizon :], + column_name=target_column_i, + ) + total_metrics = pd.concat([total_metrics, metrics_df], axis=1) + else: + logger.warn( + f"Error Generating Metrics: Unable to find {target_column_i} in the test data." + ) + + if total_metrics.empty: + return total_metrics, summary_metrics, data + + summary_metrics = pd.DataFrame( + { + SupportedMetrics.MEAN_SMAPE: np.mean( + total_metrics.loc[SupportedMetrics.SMAPE] + ), + SupportedMetrics.MEDIAN_SMAPE: np.median( + total_metrics.loc[SupportedMetrics.SMAPE] + ), + SupportedMetrics.MEAN_MAPE: np.mean( + total_metrics.loc[SupportedMetrics.MAPE] + ), + SupportedMetrics.MEDIAN_MAPE: np.median( + total_metrics.loc[SupportedMetrics.MAPE] + ), + SupportedMetrics.MEAN_RMSE: np.mean( + total_metrics.loc[SupportedMetrics.RMSE] + ), + SupportedMetrics.MEDIAN_RMSE: np.median( + total_metrics.loc[SupportedMetrics.RMSE] + ), + SupportedMetrics.MEAN_R2: np.mean( + total_metrics.loc[SupportedMetrics.R2] + ), + SupportedMetrics.MEDIAN_R2: np.median( + total_metrics.loc[SupportedMetrics.R2] + ), + SupportedMetrics.MEAN_EXPLAINED_VARIANCE: np.mean( + total_metrics.loc[SupportedMetrics.EXPLAINED_VARIANCE] + ), + SupportedMetrics.MEDIAN_EXPLAINED_VARIANCE: np.median( + total_metrics.loc[SupportedMetrics.EXPLAINED_VARIANCE] + ), + SupportedMetrics.ELAPSED_TIME: elapsed_time, + }, + index=["All Targets"], + ) + + """Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE values for each horizon + if horizon <= 10.""" + target_columns_in_output = set(target_columns).intersection(data.columns) + if self.spec.horizon <= SUMMARY_METRICS_HORIZON_LIMIT: + if set(self.forecast_output.list_target_category_columns()) != set( + target_columns_in_output + ): + logger.warn( + f"Column Mismatch between Forecast Output and Target Columns" + ) + metrics_per_horizon = utils._build_metrics_per_horizon( + data=data, + output=self.forecast_output, + target_columns=target_columns, + target_col=target_col, + horizon_periods=self.spec.horizon, + ) + if not metrics_per_horizon.empty: + summary_metrics = pd.concat([summary_metrics, metrics_per_horizon]) + + new_column_order = [ + SupportedMetrics.MEAN_SMAPE, + SupportedMetrics.MEDIAN_SMAPE, + SupportedMetrics.MEAN_MAPE, + SupportedMetrics.MEDIAN_MAPE, + SupportedMetrics.MEAN_WMAPE, + SupportedMetrics.MEDIAN_WMAPE, + SupportedMetrics.MEAN_RMSE, + SupportedMetrics.MEDIAN_RMSE, + SupportedMetrics.MEAN_R2, + SupportedMetrics.MEDIAN_R2, + SupportedMetrics.MEAN_EXPLAINED_VARIANCE, + 
SupportedMetrics.MEDIAN_EXPLAINED_VARIANCE, + SupportedMetrics.ELAPSED_TIME, + ] + summary_metrics = summary_metrics[new_column_order] + + return total_metrics, summary_metrics, data + + def _save_report( + self, + report_sections: Tuple, + result_df: pd.DataFrame, + metrics_df: pd.DataFrame, + test_metrics_df: pd.DataFrame, + ): + """Saves resulting reports to the given folder.""" + import datapane as dp + + if self.spec.output_directory: + output_dir = self.spec.output_directory.url + else: + output_dir = "tmp_fc_operator_result" + logger.warn( + "Since the output directory was not specified, the output will be saved to {} directory.".format( + output_dir + ) + ) + + if ObjectStorageDetails.is_oci_path(output_dir): + storage_options = default_signer() + else: + storage_options = dict() + + # datapane html report + if self.spec.generate_report: + # datapane html report + with tempfile.TemporaryDirectory() as temp_dir: + report_local_path = os.path.join(temp_dir, "___report.html") + utils.block_print() + dp.save_report(report_sections, report_local_path) + utils.enable_print() + + report_path = os.path.join(output_dir, self.spec.report_filename) + with open(report_local_path) as f1: + with fsspec.open( + report_path, + "w", + **storage_options, + ) as f2: + f2.write(f1.read()) + + # forecast csv report + utils._write_data( + data=result_df, + filename=os.path.join(output_dir, self.spec.forecast_filename), + format="csv", + storage_options=storage_options, + ) + + # metrics csv report + if self.spec.generate_metrics: + if metrics_df is not None: + utils._write_data( + data=metrics_df.rename_axis("metrics").reset_index(), + filename=os.path.join(output_dir, self.spec.metrics_filename), + format="csv", + storage_options=storage_options, + index=False, + ) + else: + logger.warn( + f"Attempted to generate the {self.spec.metrics_filename} file with the training metrics, however the training metrics could not be properly generated." + ) + + # test_metrics csv report + if self.spec.test_data is not None: + if test_metrics_df is not None: + utils._write_data( + data=test_metrics_df.rename_axis("metrics").reset_index(), + filename=os.path.join( + output_dir, self.spec.test_metrics_filename + ), + format="csv", + storage_options=storage_options, + index=False, + ) + else: + logger.warn( + f"Attempted to generate the {self.spec.test_metrics_filename} file with the test metrics, however the test metrics could not be properly generated." + ) + # explanations csv reports + if self.spec.generate_explanations: + try: + if self.formatted_global_explanation is not None: + utils._write_data( + data=self.formatted_global_explanation, + filename=os.path.join( + output_dir, self.spec.global_explanation_filename + ), + format="csv", + storage_options=storage_options, + index=True, + ) + else: + logger.warn( + f"Attempted to generate global explanations for the {self.spec.global_explanation_filename} file, but an issue occured in formatting the explanations." + ) + + if self.formatted_local_explanation is not None: + utils._write_data( + data=self.formatted_local_explanation, + filename=os.path.join( + output_dir, self.spec.local_explanation_filename + ), + format="csv", + storage_options=storage_options, + index=True, + ) + else: + logger.warn( + f"Attempted to generate local explanations for the {self.spec.local_explanation_filename} file, but an issue occured in formatting the explanations." 
+ ) + except AttributeError as e: + logger.warn( + "Unable to generate explanations for this model type or for this dataset." + ) + logger.info( + f"The outputs have been successfully " + f"generated and placed into the directory: {output_dir}." + ) + + def _preprocess(self, data, ds_column, datetime_format): + """The method that needs to be implemented on the particular model level.""" + data["ds"] = pd.to_datetime(data[ds_column], format=datetime_format) + if ds_column != "ds": + data.drop([ds_column], axis=1, inplace=True) + return data + + @abstractmethod + def _generate_report(self): + """ + Generates the report for the particular model. + The method that needs to be implemented on the particular model level. + """ + + @abstractmethod + def _build_model(self) -> pd.DataFrame: + """ + Build the model. + The method that needs to be implemented on the particular model level. + """ + + def _generate_train_metrics(self) -> pd.DataFrame: + """ + Generate Training Metrics when fitted data is not available. + The method that needs to be implemented on the particular model level. + """ + raise NotImplementedError + + @runtime_dependency( + module="shap", + err_msg=( + "Please run `pip3 install shap` to install the required dependencies for model explanation." + ), + ) + def explain_model(self, datetime_col_name, explain_predict_fn) -> dict: + """ + Generates an explanation for the model by using the SHAP (Shapley Additive exPlanations) library. + This function calculates the SHAP values for each feature in the dataset and stores the results in the `global_explanation` dictionary. + + Returns + ------- + dict: A dictionary containing the global explanation for each feature in the dataset. + The keys are the feature names and the values are the average absolute SHAP values. + """ + from shap import KernelExplainer + + for series_id in self.target_columns: + self.series_id = series_id + if self.spec.model == SupportedModels.AutoTS: + self.dataset_cols = ( + self.full_data_long.loc[ + self.full_data_long.series_id == self.series_id + ] + .set_index(datetime_col_name) + .columns + ) + + self.bg_data = self.full_data_long.loc[ + self.full_data_long.series_id == self.series_id + ].set_index(datetime_col_name) + + else: + self.dataset_cols = ( + self.full_data_dict.get(series_id) + .set_index(datetime_col_name) + .drop(series_id, axis=1) + .columns + ) + + self.bg_data = self.full_data_dict.get(series_id).set_index( + datetime_col_name + ) + + kernel_explnr = KernelExplainer( + model=explain_predict_fn, + data=self.bg_data[list(self.dataset_cols)][: -self.spec.horizon][ + list(self.dataset_cols) + ], + keep_index=False + if self.spec.model == SupportedModels.AutoMLX + else True, + ) + + kernel_explnr_vals = kernel_explnr.shap_values( + self.bg_data[: -self.spec.horizon][list(self.dataset_cols)], + nsamples=50, + ) + + if not len(kernel_explnr_vals): + logger.warn( + f"No explanations generated. Ensure that additional data has been provided." + ) + else: + self.global_explanation[series_id] = dict( + zip( + self.dataset_cols, + np.average(np.absolute(kernel_explnr_vals), axis=0), + ) + ) + + self.local_explainer( + kernel_explnr, series_id=series_id, datetime_col_name=datetime_col_name + ) + + def local_explainer(self, kernel_explainer, series_id, datetime_col_name) -> None: + """ + Generate local explanations using a kernel explainer. + + Parameters + ---------- + kernel_explainer: The kernel explainer object to use for generating explanations. 
+ """ + # Get the data for the series ID and select the relevant columns + # data = self.full_data_dict.get(series_id).set_index(datetime_col_name) + data = self.bg_data[-self.spec.horizon :][list(self.dataset_cols)] + + # Generate local SHAP values using the kernel explainer + local_kernel_explnr_vals = kernel_explainer.shap_values(data, nsamples=50) + + # Convert the SHAP values into a DataFrame + local_kernel_explnr_df = pd.DataFrame( + local_kernel_explnr_vals, columns=self.dataset_cols + ) + + # set the index of the DataFrame to the datetime column + local_kernel_explnr_df.index = data.index + + if self.spec.model == SupportedModels.AutoTS: + local_kernel_explnr_df.drop( + ["series_id", self.spec.target_column], axis=1, inplace=True + ) + + self.local_explanation[series_id] = local_kernel_explnr_df diff --git a/ads/opctl/operator/lowcode/forecast/model/factory.py b/ads/opctl/operator/lowcode/forecast/model/factory.py new file mode 100644 index 000000000..184ae15d3 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/factory.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from ..const import SupportedModels +from ..operator_config import ForecastOperatorConfig +from .arima import ArimaOperatorModel +from .automlx import AutoMLXOperatorModel +from .autots import AutoTSOperatorModel +from .base_model import ForecastOperatorBaseModel +from .neuralprophet import NeuralProphetOperatorModel +from .prophet import ProphetOperatorModel +from ..utils import select_auto_model +from .forecast_datasets import ForecastDatasets + +class UnSupportedModelError(Exception): + def __init__(self, model_type: str): + super().__init__( + f"Model: `{model_type}` " + f"is not supported. Supported models: {SupportedModels.values}" + ) + + +class ForecastOperatorModelFactory: + """ + The factory class helps to instantiate proper model operator based on the model type. + """ + + _MAP = { + SupportedModels.Prophet: ProphetOperatorModel, + SupportedModels.Arima: ArimaOperatorModel, + SupportedModels.NeuralProphet: NeuralProphetOperatorModel, + SupportedModels.AutoMLX: AutoMLXOperatorModel, + SupportedModels.AutoTS: AutoTSOperatorModel + } + + @classmethod + def get_model( + cls, operator_config: ForecastOperatorConfig, datasets: ForecastDatasets + ) -> ForecastOperatorBaseModel: + """ + Gets the forecasting operator model based on the model type. + + Parameters + ---------- + operator_config: ForecastOperatorConfig + The forecasting operator config. + datasets: ForecastDatasets + Datasets for predictions + + Returns + ------- + ForecastOperatorBaseModel + The forecast operator model. + + Raises + ------ + UnSupportedModelError + In case of not supported model. + """ + model_type = operator_config.spec.model + if model_type == "auto": + model_type = select_auto_model(datasets, operator_config) + if model_type not in cls._MAP: + raise UnSupportedModelError(model_type) + return cls._MAP[model_type](config=operator_config, datasets=datasets) diff --git a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py new file mode 100644 index 000000000..b68be4327 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
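For context, a minimal sketch of how this factory might be wired together end to end (not part of the patch). The `forecast.yaml` path is hypothetical, and `from_yaml` is assumed to be inherited from `DataClassSerializable`; `ForecastDatasets`, `get_model`, and `generate_report` are the pieces defined in this change.

```python
from ads.opctl.operator.lowcode.forecast.operator_config import ForecastOperatorConfig
from ads.opctl.operator.lowcode.forecast.model.factory import ForecastOperatorModelFactory
from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ForecastDatasets

# Load the operator spec from a hypothetical local YAML file; a model type of
# "auto" is resolved to a concrete model inside get_model() via select_auto_model().
operator_config = ForecastOperatorConfig.from_yaml(uri="forecast.yaml")  # assumed constructor
datasets = ForecastDatasets(operator_config)  # loads historical and additional data
model = ForecastOperatorModelFactory.get_model(operator_config, datasets)
model.generate_report()  # builds the model and writes the report, forecast, and metrics files
```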
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import pandas as pd +from ..operator_config import ForecastOperatorConfig +from .. import utils +from ads.opctl.operator.lowcode.forecast.utils import default_signer +from .transformations import Transformations +from ads.opctl import logger +import pandas as pd +from ..const import ForecastOutputColumns, PROPHET_INTERNAL_DATE_COL +from pandas.api.types import is_datetime64_any_dtype, is_string_dtype, is_numeric_dtype + + +class ForecastDatasets: + def __init__(self, config: ForecastOperatorConfig): + """Instantiates the DataIO instance. + + Properties + ---------- + config: ForecastOperatorConfig + The forecast operator configuration. + """ + self.original_user_data = None + self.original_total_data = None + self.original_additional_data = None + self.full_data_dict = None + self.target_columns = None + self.categories = None + self.datetime_col = PROPHET_INTERNAL_DATE_COL + self.datetime_format = config.spec.datetime_column.format + self._load_data(config.spec) + + def _load_data(self, spec): + """Loads forecasting input data.""" + + raw_data = utils._load_data( + filename=spec.historical_data.url, + format=spec.historical_data.format, + storage_options=default_signer(), + columns=spec.historical_data.columns, + ) + self.original_user_data = raw_data.copy() + data_transformer = Transformations(raw_data, spec) + data = data_transformer.run() + try: + spec.freq = utils.get_frequency_of_datetime(data, spec) + except TypeError as e: + logger.warn( + f"Error determining frequency: {e.args}. Setting Frequency to None" + ) + logger.debug(f"Full traceback: {e}") + spec.freq = None + + self.original_total_data = data + additional_data = None + + try: + data[spec.datetime_column.name] = pd.to_datetime( + data[spec.datetime_column.name], format=self.datetime_format + ) + except: + raise ValueError( + f"Unable to determine the datetime type for column: {spec.datetime_column.name}. Please specify the format explicitly." + ) + + if spec.additional_data is not None: + additional_data = utils._load_data( + filename=spec.additional_data.url, + format=spec.additional_data.format, + storage_options=default_signer(), + columns=spec.additional_data.columns, + ) + additional_data = data_transformer._sort_by_datetime_col(additional_data) + try: + additional_data[spec.datetime_column.name] = pd.to_datetime( + additional_data[spec.datetime_column.name], + format=self.datetime_format, + ) + except: + raise ValueError( + f"Unable to determine the datetime type for column: {spec.datetime_column.name}. Please specify the format explicitly." + ) + + self.original_additional_data = additional_data.copy() + self.original_total_data = pd.concat([data, additional_data], axis=1) + else: + # Need to add the horizon to the data for compatibility + additional_data_small = data[ + [spec.datetime_column.name] + spec.target_category_columns + ].set_index(spec.datetime_column.name) + if is_datetime64_any_dtype(additional_data_small.index): + horizon_index = pd.date_range( + start=additional_data_small.index.values[-1], + freq=spec.freq, + periods=spec.horizon + 1, + )[1:] + elif is_numeric_dtype(additional_data_small.index): + # If datetime column is just ints + assert ( + len(additional_data_small.index.values) > 1 + ), "Dataset is too small to infer frequency. Please pass in the horizon explicitly through the additional data." 
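To make the horizon padding above concrete: in the datetime branch, the additional data is extended by `horizon` future periods starting from the last observed date, and the first generated entry (the last historical date itself) is dropped. A toy example with made-up daily data:

```python
import pandas as pd

last_date = pd.Timestamp("2023-06-30")  # hypothetical last historical date
horizon = 3
future_index = pd.date_range(start=last_date, freq="D", periods=horizon + 1)[1:]
# DatetimeIndex(['2023-07-01', '2023-07-02', '2023-07-03'], dtype='datetime64[ns]', freq='D')
```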
+ start = additional_data_small.index.values[-1] + step = ( + additional_data_small.index.values[-1] + - additional_data_small.index.values[-2] + ) + horizon_index = pd.RangeIndex( + start, start + step * (spec.horizon + 1), step=step + )[1:] + else: + raise ValueError( + f"Unable to determine the datetime type for column: {spec.datetime_column.name}. Please specify the format explicitly." + ) + + additional_data = pd.DataFrame() + + for cat_col in spec.target_category_columns: + for cat in additional_data_small[cat_col].unique(): + add_data_i = additional_data_small[ + additional_data_small[cat_col] == cat + ] + horizon_df_i = pd.DataFrame([], index=horizon_index) + horizon_df_i[cat_col] = cat + additional_data = pd.concat( + [additional_data, add_data_i, horizon_df_i] + ) + additional_data = additional_data.reset_index().rename( + {"index": spec.datetime_column.name}, axis=1 + ) + + self.original_total_data = pd.concat([data, additional_data], axis=1) + + ( + self.full_data_dict, + self.target_columns, + self.categories, + ) = utils._build_indexed_datasets( + data=data, + target_column=spec.target_column, + datetime_column=spec.datetime_column.name, + horizon=spec.horizon, + target_category_columns=spec.target_category_columns, + additional_data=additional_data, + ) + if spec.generate_explanations: + if spec.additional_data is None: + logger.warn( + f"Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." + ) + spec.generate_explanations = False + + def format_wide(self): + data_merged = pd.concat( + [ + v[v[k].notna()].set_index(self.datetime_col) + for k, v in self.full_data_dict.items() + ], + axis=1, + ).reset_index() + return data_merged + + def get_longest_datetime_column(self): + return pd.to_datetime( + self.format_wide()[self.datetime_col], format=self.datetime_format + ) + + +class ForecastOutput: + def __init__(self, confidence_interval_width: float): + """Forecast Output contains all of the details required to generate the forecast.csv output file. + + Methods + ---------- + + """ + self.category_map = dict() + self.category_to_target = dict() + self.confidence_interval_width = confidence_interval_width + self.upper_bound_name = None + self.lower_bound_name = None + + def add_category( + self, + category: str, + target_category_column: str, + forecast: pd.DataFrame, + overwrite: bool = False, + ): + if not overwrite and category in self.category_map.keys(): + raise ValueError( + f"Attempting to update ForecastOutput for category {category} when this already exists. Set overwrite to True." + ) + forecast = self._check_forecast_format(forecast) + forecast = self._set_ci_column_names(forecast) + self.category_map[category] = forecast + self.category_to_target[category] = target_category_column + + def get_category(self, category): # change to by_category ? 
+ return self.category_map[category] + + def get_target_category(self, target_category_column): + target_category_columns = self.list_target_category_columns() + category = self.list_categories()[ + list(self.category_to_target.values()).index(target_category_column) + ] + return self.category_map[category] + + def list_categories(self): + return list(self.category_map.keys()) + + def list_target_category_columns(self): + return list(self.category_to_target.values()) + + def format_long(self): + return pd.concat(list(self.category_map.values())) + + def _set_ci_column_names(self, forecast_i): + yhat_lower_percentage = (100 - self.confidence_interval_width * 100) // 2 + self.upper_bound_name = "p" + str(int(100 - yhat_lower_percentage)) + self.lower_bound_name = "p" + str(int(yhat_lower_percentage)) + return forecast_i.rename( + { + ForecastOutputColumns.UPPER_BOUND: self.upper_bound_name, + ForecastOutputColumns.LOWER_BOUND: self.lower_bound_name, + }, + axis=1, + ) + + def format_wide(self): + dataset_time_indexed = { + k: v.set_index(ForecastOutputColumns.DATE) + for k, v in self.category_map.items() + } + datasets_category_appended = [ + v.rename(lambda x: str(x) + f"_{k}", axis=1) + for k, v in dataset_time_indexed.items() + ] + return pd.concat(datasets_category_appended, axis=1) + + def get_longest_datetime_column(self): + return self.format_wide().index + + def _check_forecast_format(self, forecast): + assert isinstance(forecast, pd.DataFrame) + assert ( + len(forecast.columns) == 7 + ), f"Expected just 7 columns, but got: {forecast.columns}" + assert ForecastOutputColumns.DATE in forecast.columns + assert ForecastOutputColumns.SERIES in forecast.columns + assert ForecastOutputColumns.INPUT_VALUE in forecast.columns + assert ForecastOutputColumns.FITTED_VALUE in forecast.columns + assert ForecastOutputColumns.FORECAST_VALUE in forecast.columns + assert ForecastOutputColumns.UPPER_BOUND in forecast.columns + assert ForecastOutputColumns.LOWER_BOUND in forecast.columns + assert not forecast.empty + # forecast.columns = pd.Index([ + # ForecastOutputColumns.DATE, + # ForecastOutputColumns.SERIES, + # ForecastOutputColumns.INPUT_VALUE, + # ForecastOutputColumns.FITTED_VALUE, + # ForecastOutputColumns.FORECAST_VALUE, + # ForecastOutputColumns.UPPER_BOUND, + # ForecastOutputColumns.LOWER_BOUND, + # ]) + return forecast diff --git a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py new file mode 100644 index 000000000..e2303e113 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import numpy as np +import optuna +import pandas as pd +from torch import Tensor +from torchmetrics.regression import ( + MeanAbsoluteError, + MeanAbsolutePercentageError, + MeanSquaredError, + R2Score, + SymmetricMeanAbsolutePercentageError, +) + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.opctl import logger + +from ..const import DEFAULT_TRIALS, ForecastOutputColumns +from .. 
import utils +from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig +from .forecast_datasets import ForecastDatasets, ForecastOutput +import traceback + + +def _get_np_metrics_dict(selected_metric): + metric_translation = { + "mape": MeanAbsolutePercentageError, + "smape": SymmetricMeanAbsolutePercentageError, + "mae": MeanAbsoluteError, + "r2": R2Score, + "rmse": MeanSquaredError, + } + if selected_metric not in metric_translation.keys(): + logger.warn( + f"Could not find the metric: {selected_metric} in torchmetrics. Defaulting to MAE and RMSE" + ) + return {"MAE": MeanAbsoluteError(), "RMSE": MeanSquaredError()} + return {selected_metric: metric_translation[selected_metric]()} + + +@runtime_dependency( + module="neuralprophet", + object="NeuralProphet", + install_from=OptionalDependency.FORECAST, +) +def _fit_model(data, params, additional_regressors, select_metric): + from neuralprophet import NeuralProphet + + m = NeuralProphet(**params) + m.metrics = _get_np_metrics_dict(select_metric) + for add_reg in additional_regressors: + m = m.add_future_regressor(name=add_reg) + m.fit(df=data) + accepted_regressors_config = m.config_regressors or dict() + return m, list(accepted_regressors_config.keys()) + + +class NeuralProphetOperatorModel(ForecastOperatorBaseModel): + """Class representing NeuralProphet operator model.""" + + def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): + super().__init__(config=config, datasets=datasets) + self.train_metrics = True + self.forecast_col_name = "yhat1" + + def _build_model(self) -> pd.DataFrame: + from neuralprophet import NeuralProphet + + full_data_dict = self.datasets.full_data_dict + models = [] + outputs = dict() + outputs_legacy = [] + + # Extract the Confidence Interval Width and + # convert to neural prophets equivalent - quantiles + model_kwargs = self.spec.model_kwargs + + if self.spec.confidence_interval_width is None: + quantiles = model_kwargs.get("quantiles", [0.05, 0.95]) + self.spec.confidence_interval_width = float(quantiles[1]) - float( + quantiles[0] + ) + else: + boundaries = round((1 - self.spec.confidence_interval_width) / 2, 2) + quantiles = [boundaries, self.spec.confidence_interval_width + boundaries] + + model_kwargs["quantiles"] = quantiles + self.forecast_output = ForecastOutput( + confidence_interval_width=self.spec.confidence_interval_width + ) + + for i, (target, df) in enumerate(full_data_dict.items()): + le, df_encoded = utils._label_encode_dataframe( + df, no_encode={self.spec.datetime_column.name, target} + ) + model_kwargs_i = model_kwargs.copy() + + # format the dataframe for this target. 
Dropping NA on target[df] will remove all future data + df_clean = self._preprocess( + df_encoded, + self.spec.datetime_column.name, + self.spec.datetime_column.format, + ) + data_i = df_clean[df_clean[target].notna()] + data_i.rename({target: "y"}, axis=1, inplace=True) + + # Assume that all columns passed in should be used as additional data + additional_regressors = set(data_i.columns) - {"y", "ds"} + training_data = data_i[["y", "ds"] + list(additional_regressors)] + + if self.perform_tuning: + + def objective(trial): + params = { + # 'seasonality_mode': trial.suggest_categorical('seasonality_mode', ['additive', 'multiplicative']), + # 'seasonality_reg': trial.suggest_float('seasonality_reg', 0.1, 500, log=True), + # 'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True), + "newer_samples_start": trial.suggest_float( + "newer_samples_start", 0.001, 0.999 + ), + "newer_samples_weight": trial.suggest_float( + "newer_samples_weight", 0, 100 + ), + "changepoints_range": trial.suggest_float( + "changepoints_range", 0.8, 0.95 + ), + } + # trend_reg, trend_reg_threshold, ar_reg, impute_rolling/impute_linear, + params.update(model_kwargs_i) + + folds = NeuralProphet(**params).crossvalidation_split_df( + data_i, k=3 + ) + test_metrics_total_i = [] + for df_train, df_test in folds: + m, accepted_regressors = _fit_model( + data=df_train, + params=params, + additional_regressors=additional_regressors, + select_metric=self.spec.metric, + ) + df_test = df_test[["y", "ds"] + accepted_regressors] + + test_forecast_i = m.predict(df=df_test) + fold_metric_i = ( + m.metrics[self.spec.metric] + .forward( + Tensor(test_forecast_i["yhat1"]), + Tensor(test_forecast_i["y"]), + ) + .item() + ) + test_metrics_total_i.append(fold_metric_i) + logger.debug( + f"----------------------{np.asarray(test_metrics_total_i).mean()}----------------------" + ) + return np.asarray(test_metrics_total_i).mean() + + study = optuna.create_study(direction="minimize") + m_params = NeuralProphet().parameters() + study.enqueue_trial( + { + # 'seasonality_mode': m_params['seasonality_mode'], + # 'seasonality_reg': m_params['seasonality_reg'], + # 'learning_rate': m_params['learning_rate'], + "newer_samples_start": m_params["newer_samples_start"], + "newer_samples_weight": m_params["newer_samples_weight"], + "changepoints_range": m_params["changepoints_range"], + } + ) + study.optimize( + objective, + n_trials=self.spec.tuning.n_trials + if self.spec.tuning + else DEFAULT_TRIALS, + n_jobs=-1, + ) + + selected_params = study.best_params + selected_params.update(model_kwargs_i) + model_kwargs_i = selected_params + + # Build and fit model + model, accepted_regressors = _fit_model( + data=training_data, + params=model_kwargs_i, + additional_regressors=additional_regressors, + select_metric=self.spec.metric, + ) + logger.debug( + f"Found the following additional data columns: {additional_regressors}" + ) + logger.debug( + f"While fitting the model, some additional data may have been " + f"discarded. 
Only using the columns: {accepted_regressors}" + ) + + # Build future dataframe + future = df_clean.reset_index(drop=True) + future["y"] = None + future = future[["y", "ds"] + list(accepted_regressors)] + + # Forecast model and collect outputs + forecast = model.predict(future) + logger.debug(f"-----------------Model {i}----------------------") + logger.debug(forecast.tail()) + models.append(model) + outputs[target] = forecast + outputs_legacy.append(forecast) + + self.models = models + self.outputs = outputs_legacy + + logger.debug("===========Done===========") + + # Merge the outputs from each model into 1 df with all outputs by target and category + col = self.original_target_column + output_col = pd.DataFrame() + yhat_upper_name = ForecastOutputColumns.UPPER_BOUND + yhat_lower_name = ForecastOutputColumns.LOWER_BOUND + for cat in self.categories: + output_i = pd.DataFrame() + + output_i["Date"] = outputs[f"{col}_{cat}"]["ds"] + output_i["Series"] = cat + output_i[f"input_value"] = full_data_dict[f"{col}_{cat}"][f"{col}_{cat}"] + + output_i[f"fitted_value"] = float("nan") + output_i[f"forecast_value"] = float("nan") + output_i[yhat_lower_name] = float("nan") + output_i[yhat_upper_name] = float("nan") + + output_i.iloc[ + : -self.spec.horizon, output_i.columns.get_loc(f"fitted_value") + ] = (outputs[f"{col}_{cat}"]["yhat1"].iloc[: -self.spec.horizon].values) + output_i.iloc[ + -self.spec.horizon :, + output_i.columns.get_loc(f"forecast_value"), + ] = ( + outputs[f"{col}_{cat}"]["yhat1"].iloc[-self.spec.horizon :].values + ) + output_i.iloc[ + -self.spec.horizon :, + output_i.columns.get_loc(yhat_upper_name), + ] = ( + outputs[f"{col}_{cat}"][f"yhat1 {quantiles[1]*100}%"] + .iloc[-self.spec.horizon :] + .values + ) + output_i.iloc[ + -self.spec.horizon :, + output_i.columns.get_loc(yhat_lower_name), + ] = ( + outputs[f"{col}_{cat}"][f"yhat1 {quantiles[0]*100}%"] + .iloc[-self.spec.horizon :] + .values + ) + output_col = pd.concat([output_col, output_i]) + + self.forecast_output.add_category( + category=cat, target_category_column=f"{col}_{cat}", forecast=output_i + ) + + output_col = output_col.reset_index(drop=True) + + return output_col + + def _generate_report(self): + import datapane as dp + + sec1_text = dp.Text( + "## Forecast Overview \nThese plots show your " + "forecast in the context of historical data." 
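As a quick worked example of the interval bookkeeping used in `_build_model` above and in `ForecastOutput._set_ci_column_names`: a confidence interval width of 0.90 becomes the NeuralProphet quantile pair of roughly [0.05, 0.95], which later surfaces in the output as `p5`/`p95` columns. The numbers below are illustrative only.

```python
confidence_interval_width = 0.90

# NeuralProphet is parameterized by quantiles rather than an interval width:
boundaries = round((1 - confidence_interval_width) / 2, 2)        # 0.05
quantiles = [boundaries, confidence_interval_width + boundaries]  # ~[0.05, 0.95]

# ForecastOutput derives the bound column names from the same width:
yhat_lower_percentage = (100 - confidence_interval_width * 100) // 2  # 5.0
upper_bound_name = "p" + str(int(100 - yhat_lower_percentage))        # "p95"
lower_bound_name = "p" + str(int(yhat_lower_percentage))              # "p5"
```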
+ ) + sec1 = utils._select_plot_list( + lambda idx, *args: self.models[idx].plot(self.outputs[idx]), + target_columns=self.target_columns, + ) + + sec2_text = dp.Text(f"## Forecast Broken Down by Trend Component") + sec2 = utils._select_plot_list( + lambda idx, *args: self.models[idx].plot_components(self.outputs[idx]), + target_columns=self.target_columns, + ) + + sec3_text = dp.Text(f"## Forecast Parameter Plots") + sec3 = utils._select_plot_list( + lambda idx, *args: self.models[idx].plot_parameters(), + target_columns=self.target_columns, + ) + + sec5_text = dp.Text(f"## Neural Prophet Model Parameters") + model_states = [] + for i, m in enumerate(self.models): + model_states.append( + pd.Series( + m.state_dict(), + index=m.state_dict().keys(), + name=self.target_columns[i], + ) + ) + all_model_states = pd.concat(model_states, axis=1) + sec5 = dp.DataTable(all_model_states) + + # return [sec4_text, sec4] + all_sections = [ + sec1_text, + sec1, + sec2_text, + sec2, + sec3_text, + sec3, + sec5_text, + sec5, + ] + + if self.spec.generate_explanations: + try: + # If the key is present, call the "explain_model" method + self.explain_model( + datetime_col_name="ds", + explain_predict_fn=self._custom_predict_neuralprophet, + ) + + # Create a markdown text block for the global explanation section + global_explanation_text = dp.Text( + f"## Global Explanation of Models \n " + "The following tables provide the feature attribution for the global explainability." + ) + + # Convert the global explanation data to a DataFrame + global_explanation_df = pd.DataFrame(self.global_explanation) + + self.formatted_global_explanation = ( + global_explanation_df / global_explanation_df.sum(axis=0) * 100 + ) + + # Create a markdown section for the global explainability + global_explanation_section = dp.Blocks( + "### Global Explainability ", + dp.DataTable(self.formatted_global_explanation), + ) + + aggregate_local_explanations = pd.DataFrame() + for s_id, local_ex_df in self.local_explanation.items(): + local_ex_df_copy = local_ex_df.copy() + local_ex_df_copy["Series"] = s_id + aggregate_local_explanations = pd.concat( + [aggregate_local_explanations, local_ex_df_copy], axis=0 + ) + self.formatted_local_explanation = aggregate_local_explanations + + local_explanation_text = dp.Text(f"## Local Explanation of Models \n ") + blocks = [ + dp.DataTable( + local_ex_df.div(local_ex_df.abs().sum(axis=1), axis=0) * 100, + label=s_id, + ) + for s_id, local_ex_df in self.local_explanation.items() + ] + local_explanation_section = ( + dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0] + ) + + # Append the global explanation text and section to the "all_sections" list + all_sections = all_sections + [ + global_explanation_text, + global_explanation_section, + local_explanation_text, + local_explanation_section, + ] + except Exception as e: + # Do not fail the whole run due to explanations failure + logger.warn(f"Failed to generate Explanations with error: {e}.") + logger.debug(f"Full Traceback: {traceback.format_exc()}") + + model_description = dp.Text( + "NeuralProphet is an easy to learn framework for interpretable time " + "series forecasting. NeuralProphet is built on PyTorch and combines " + "Neural Network and traditional time-series algorithms, inspired by " + "Facebook Prophet and AR-Net." 
+ ) + other_sections = all_sections + + return ( + model_description, + other_sections, + ) + + def _custom_predict_neuralprophet(self, data): + raise NotImplementedError("NeuralProphet does not yet support explanations.") + # data_prepped = data.reset_index() + # data_prepped['y'] = None + # data_prepped['ds'] = pd.to_datetime(data_prepped['ds']) + # return self.models[self.target_columns.index(self.series_id)].predict(data_prepped)["yhat1"] diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py new file mode 100644 index 000000000..0a3b38c40 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import numpy as np +import optuna +import pandas as pd +from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl import logger +from ads.opctl.operator.lowcode.forecast.operator_config import ForecastOperatorConfig + +from ..const import DEFAULT_TRIALS, PROPHET_INTERNAL_DATE_COL, ForecastOutputColumns +from .. import utils +from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig +from .forecast_datasets import ForecastDatasets, ForecastOutput +import traceback +import matplotlib as mpl + +mpl.rcParams["figure.max_open_warning"] = 100 + + +def _add_unit(num, unit): + return f"{num} {unit}" + + +def _fit_model(data, params, additional_regressors): + from prophet import Prophet + + model = Prophet(**params) + for add_reg in additional_regressors: + model.add_regressor(add_reg) + model.fit(data) + return model + + +class ProphetOperatorModel(ForecastOperatorBaseModel): + """Class representing Prophet operator model.""" + + def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): + super().__init__(config=config, datasets=datasets) + self.train_metrics = True + self.global_explanation = {} + self.local_explanation = {} + + def _build_model(self) -> pd.DataFrame: + from prophet import Prophet + from prophet.diagnostics import cross_validation, performance_metrics + + full_data_dict = self.datasets.full_data_dict + models = [] + outputs = dict() + outputs_legacy = [] + + # Extract the Confidence Interval Width and convert to prophet's equivalent - interval_width + if self.spec.confidence_interval_width is None: + self.spec.confidence_interval_width = 1 - self.spec.model_kwargs.get( + "alpha", 0.90 + ) + + model_kwargs = self.spec.model_kwargs + model_kwargs["interval_width"] = self.spec.confidence_interval_width + + self.forecast_output = ForecastOutput( + confidence_interval_width=self.spec.confidence_interval_width + ) + + for i, (target, df) in enumerate(full_data_dict.items()): + le, df_encoded = utils._label_encode_dataframe( + df, no_encode={self.spec.datetime_column.name, target} + ) + + model_kwargs_i = model_kwargs.copy() + # format the dataframe for this target. 
Dropping NA on target[df] will remove all future data + df_clean = self._preprocess( + df_encoded, + self.spec.datetime_column.name, + self.spec.datetime_column.format, + ) + data_i = df_clean[df_clean[target].notna()] + data_i.rename({target: "y"}, axis=1, inplace=True) + + # Assume that all columns passed in should be used as additional data + additional_regressors = set(data_i.columns) - { + "y", + PROPHET_INTERNAL_DATE_COL, + } + + if self.perform_tuning: + + def objective(trial): + params = { + "seasonality_mode": trial.suggest_categorical( + "seasonality_mode", ["additive", "multiplicative"] + ), + "changepoint_prior_scale": trial.suggest_float( + "changepoint_prior_scale", 0.001, 0.5, log=True + ), + "seasonality_prior_scale": trial.suggest_float( + "seasonality_prior_scale", 0.01, 10, log=True + ), + "holidays_prior_scale": trial.suggest_float( + "holidays_prior_scale", 0.01, 10, log=True + ), + "changepoint_range": trial.suggest_float( + "changepoint_range", 0.8, 0.95 + ), + } + params.update(model_kwargs_i) + + model = _fit_model( + data=data_i, + params=params, + additional_regressors=additional_regressors, + ) + + # Manual workaround because pandas 1.x dropped support for M and Y + interval = self.spec.horizon.interval + unit = self.spec.horizon.interval_unit + if unit == "M": + unit = "D" + interval = interval * 30.5 + elif unit == "Y": + unit = "D" + interval = interval * 365.25 + horizon = _add_unit(int(self.spec.horizon * interval), unit=unit) + initial = _add_unit((data_i.shape[0] * interval) // 2, unit=unit) + period = _add_unit((data_i.shape[0] * interval) // 4, unit=unit) + + logger.debug( + f"using: horizon: {horizon}. initial:{initial}, period: {period}" + ) + + df_cv = cross_validation( + model, + horizon=horizon, + initial=initial, + period=period, + parallel="threads", + ) + df_p = performance_metrics(df_cv) + try: + return np.mean(df_p[self.spec.metric]) + except KeyError: + logger.warn( + f"Could not find the metric {self.spec.metric} within " + f"the performance metrics: {df_p.columns}. 
Defaulting to `rmse`" + ) + return np.mean(df_p["rmse"]) + + study = optuna.create_study(direction="minimize") + m_temp = Prophet() + study.enqueue_trial( + { + "seasonality_mode": m_temp.seasonality_mode, + "changepoint_prior_scale": m_temp.changepoint_prior_scale, + "seasonality_prior_scale": m_temp.seasonality_prior_scale, + "holidays_prior_scale": m_temp.holidays_prior_scale, + "changepoint_range": m_temp.changepoint_range, + } + ) + study.optimize( + objective, + n_trials=self.spec.tuning.n_trials + if self.spec.tuning + else DEFAULT_TRIALS, + n_jobs=-1, + ) + + study.best_params.update(model_kwargs_i) + model_kwargs_i = study.best_params + model = _fit_model( + data=data_i, + params=model_kwargs_i, + additional_regressors=additional_regressors, + ) + + # Make future df for prediction + if len(additional_regressors): + future = df_clean.drop(target, axis=1) + else: + future = model.make_future_dataframe( + periods=self.spec.horizon, + freq=self.spec.freq, + ) + # Make Prediction + forecast = model.predict(future) + logger.debug(f"-----------------Model {i}----------------------") + logger.debug( + forecast[ + [PROPHET_INTERNAL_DATE_COL, "yhat", "yhat_lower", "yhat_upper"] + ].tail() + ) + + # Collect Outputs + models.append(model) + outputs[target] = forecast + outputs_legacy.append(forecast) + + self.models = models + self.outputs = outputs_legacy + + logger.debug("===========Done===========") + + # Merge the outputs from each model into 1 df with all outputs by target and category + col = self.original_target_column + output_col = pd.DataFrame() + yhat_upper_name = ForecastOutputColumns.UPPER_BOUND + yhat_lower_name = ForecastOutputColumns.LOWER_BOUND + for cat in self.categories: + output_i = pd.DataFrame() + + output_i["Date"] = outputs[f"{col}_{cat}"][PROPHET_INTERNAL_DATE_COL] + output_i["Series"] = cat + output_i["input_value"] = full_data_dict[f"{col}_{cat}"][f"{col}_{cat}"] + + output_i[f"fitted_value"] = float("nan") + output_i[f"forecast_value"] = float("nan") + output_i[yhat_upper_name] = float("nan") + output_i[yhat_lower_name] = float("nan") + + output_i.iloc[ + : -self.spec.horizon, output_i.columns.get_loc(f"fitted_value") + ] = (outputs[f"{col}_{cat}"]["yhat"].iloc[: -self.spec.horizon].values) + output_i.iloc[ + -self.spec.horizon :, + output_i.columns.get_loc(f"forecast_value"), + ] = ( + outputs[f"{col}_{cat}"]["yhat"].iloc[-self.spec.horizon :].values + ) + output_i.iloc[ + -self.spec.horizon :, output_i.columns.get_loc(yhat_upper_name) + ] = ( + outputs[f"{col}_{cat}"]["yhat_upper"].iloc[-self.spec.horizon :].values + ) + output_i.iloc[ + -self.spec.horizon :, output_i.columns.get_loc(yhat_lower_name) + ] = ( + outputs[f"{col}_{cat}"]["yhat_lower"].iloc[-self.spec.horizon :].values + ) + output_col = pd.concat([output_col, output_i]) + self.forecast_output.add_category( + category=cat, target_category_column=f"{col}_{cat}", forecast=output_i + ) + + output_col = output_col.reset_index(drop=True) + + return output_col + + def _generate_report(self): + import datapane as dp + from prophet.plot import add_changepoints_to_plot + + sec1_text = dp.Text( + "## Forecast Overview \n" + "These plots show your forecast in the context of historical data." 
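For readers unfamiliar with the tuning objective above: each Optuna trial is scored with Prophet's built-in rolling-origin cross validation. A self-contained sketch on synthetic data (illustrative only; the operator derives `initial`, `period`, and `horizon` from the spec):

```python
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

# Synthetic daily history with Prophet's required `ds`/`y` columns.
history = pd.DataFrame({"ds": pd.date_range("2022-01-01", periods=365, freq="D")})
history["y"] = 10 + history.index % 7  # simple weekly pattern

model = Prophet().fit(history)
df_cv = cross_validation(
    model, initial="180 days", period="90 days", horizon="30 days", parallel="threads"
)
df_p = performance_metrics(df_cv)  # columns include rmse, mape, smape, ...
score = df_p["rmse"].mean()        # the objective above minimizes a score like this
```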
+ ) + sec1 = utils._select_plot_list( + lambda idx, *args: self.models[idx].plot( + self.outputs[idx], include_legend=True + ), + target_columns=self.target_columns, + ) + + sec2_text = dp.Text(f"## Forecast Broken Down by Trend Component") + sec2 = utils._select_plot_list( + lambda idx, *args: self.models[idx].plot_components(self.outputs[idx]), + target_columns=self.target_columns, + ) + + sec3_text = dp.Text(f"## Forecast Changepoints") + sec3_figs = [ + self.models[idx].plot(self.outputs[idx]) + for idx in range(len(self.target_columns)) + ] + [ + add_changepoints_to_plot( + sec3_figs[idx].gca(), self.models[idx], self.outputs[idx] + ) + for idx in range(len(self.target_columns)) + ] + sec3 = utils._select_plot_list( + lambda idx, *args: sec3_figs[idx], target_columns=self.target_columns + ) + + all_sections = [sec1_text, sec1, sec2_text, sec2, sec3_text, sec3] + + sec5_text = dp.Text(f"## Prophet Model Seasonality Components") + model_states = [] + for i, m in enumerate(self.models): + model_states.append( + pd.Series( + m.seasonalities, + index=pd.Index(m.seasonalities.keys(), dtype="object"), + name=self.target_columns[i], + dtype="object", + ) + ) + all_model_states = pd.concat(model_states, axis=1) + if not all_model_states.empty: + sec5 = dp.DataTable(all_model_states) + all_sections = all_sections + [sec5_text, sec5] + + if self.spec.generate_explanations: + try: + # If the key is present, call the "explain_model" method + self.explain_model( + datetime_col_name=PROPHET_INTERNAL_DATE_COL, + explain_predict_fn=self._custom_predict_prophet, + ) + + # Create a markdown text block for the global explanation section + global_explanation_text = dp.Text( + f"## Global Explanation of Models \n " + "The following tables provide the feature attribution for the global explainability." 
+ ) + + # Convert the global explanation data to a DataFrame + global_explanation_df = pd.DataFrame(self.global_explanation) + + self.formatted_global_explanation = ( + global_explanation_df / global_explanation_df.sum(axis=0) * 100 + ) + + # Create a markdown section for the global explainability + global_explanation_section = dp.Blocks( + "### Global Explainability ", + dp.DataTable(self.formatted_global_explanation), + ) + + aggregate_local_explanations = pd.DataFrame() + for s_id, local_ex_df in self.local_explanation.items(): + local_ex_df_copy = local_ex_df.copy() + local_ex_df_copy["Series"] = s_id + aggregate_local_explanations = pd.concat( + [aggregate_local_explanations, local_ex_df_copy], axis=0 + ) + self.formatted_local_explanation = aggregate_local_explanations + + local_explanation_text = dp.Text(f"## Local Explanation of Models \n ") + blocks = [ + dp.DataTable( + local_ex_df.div(local_ex_df.abs().sum(axis=1), axis=0) * 100, + label=s_id, + ) + for s_id, local_ex_df in self.local_explanation.items() + ] + local_explanation_section = ( + dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0] + ) + + # Append the global explanation text and section to the "all_sections" list + all_sections = all_sections + [ + global_explanation_text, + global_explanation_section, + local_explanation_text, + local_explanation_section, + ] + except Exception as e: + # Do not fail the whole run due to explanations failure + logger.warn(f"Failed to generate Explanations with error: {e}.") + logger.debug(f"Full Traceback: {traceback.format_exc()}") + + model_description = dp.Text( + "Prophet is a procedure for forecasting time series data based on an additive " + "model where non-linear trends are fit with yearly, weekly, and daily seasonality, " + "plus holiday effects. It works best with time series that have strong seasonal " + "effects and several seasons of historical data. Prophet is robust to missing " + "data and shifts in the trend, and typically handles outliers well." + ) + other_sections = all_sections + + return ( + model_description, + other_sections, + ) + + def _custom_predict_prophet(self, data): + return self.models[self.target_columns.index(self.series_id)].predict( + data.reset_index() + )["yhat"] diff --git a/ads/opctl/operator/lowcode/forecast/model/transformations.py b/ads/opctl/operator/lowcode/forecast/model/transformations.py new file mode 100644 index 000000000..cf9c92e7c --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/model/transformations.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from ads.opctl import logger + + +class Transformations: + """A class which implements transformation for forecast operator""" + + def __init__(self, data, dataset_info): + """ + Initializes the transformation. + + Parameters + ---------- + data: The Pandas DataFrame. + dataset_info : ForecastOperatorConfig + """ + self.data = data + self.dataset_info = dataset_info + self._set_series_id_column() + self.series_id_column = self.dataset_info.target_category_columns + self.target_variables = dataset_info.target_column + self.date_column = dataset_info.datetime_column.name + self.date_format = dataset_info.datetime_column.format + self.preprocessing = dataset_info.preprocessing + + def run(self): + """ + The function runs all the transformation in a particular order. 
+
+        Returns
+        -------
+            A new Pandas DataFrame with treated / transformed target values.
+        """
+        imputed_df = self._missing_value_imputation(self.data)
+        sorted_df = self._sort_by_datetime_col(imputed_df)
+        clean_strs_df = self._remove_trailing_whitespace(sorted_df)
+        if self.preprocessing:
+            treated_df = self._outlier_treatment(clean_strs_df)
+        else:
+            logger.debug("Skipping outlier treatment as preprocessing is disabled")
+            treated_df = imputed_df
+        return treated_df
+
+    def _set_series_id_column(self):
+        if (
+            self.dataset_info.target_category_columns is None
+            or len(self.dataset_info.target_category_columns) == 0
+        ):
+            self.data["__Series"] = ""
+            self.dataset_info.target_category_columns = ["__Series"]
+
+    def _remove_trailing_whitespace(self, df):
+        return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
+
+    def _missing_value_imputation(self, df):
+        """
+        Function fills missing values in the pandas dataframe using linear interpolation
+
+        Parameters
+        ----------
+            df : The Pandas DataFrame.
+
+        Returns
+        -------
+            A new Pandas DataFrame without missing values.
+        """
+        # missing value imputation using linear interpolation
+        df[self.target_variables] = df.groupby(self.series_id_column)[
+            self.target_variables
+        ].transform(lambda x: x.interpolate(limit_direction="both"))
+        return df
+
+    def _outlier_treatment(self, df):
+        """
+        Function finds outliers using the z-score and replaces them with the mean value.
+
+        Parameters
+        ----------
+            df : The Pandas DataFrame.
+
+        Returns
+        -------
+            A new Pandas DataFrame with treated outliers.
+        """
+        df["z_score"] = df.groupby(self.series_id_column)[
+            self.target_variables
+        ].transform(lambda x: (x - x.mean()) / x.std())
+        outliers_mask = df["z_score"].abs() > 3
+        df.loc[outliers_mask, self.target_variables] = df.groupby(
+            self.series_id_column
+        )[self.target_variables].transform(lambda x: x.mean())
+        df.drop("z_score", axis=1, inplace=True)
+        return df
+
+    def _sort_by_datetime_col(self, df):
+        """
+        Function sorts the data by date
+
+        Parameters
+        ----------
+            df : The Pandas DataFrame.
+
+        Returns
+        -------
+            A new Pandas DataFrame with sorted dates for each category
+        """
+        import pandas as pd
+
+        # Temporary column for sorting
+        df["tmp_col_for_sorting"] = pd.to_datetime(
+            df[self.date_column], format=self.date_format
+        )
+        df = (
+            df.groupby(self.series_id_column, group_keys=True)
+            .apply(lambda x: x.sort_values(by="tmp_col_for_sorting", ascending=True))
+            .reset_index(drop=True)
+        )
+        # Drop the temporary column
+        df.drop(columns=["tmp_col_for_sorting"], inplace=True)
+        return df
diff --git a/ads/opctl/operator/lowcode/forecast/operator_config.py b/ads/opctl/operator/lowcode/forecast/operator_config.py
new file mode 100644
index 000000000..911dccc03
--- /dev/null
+++ b/ads/opctl/operator/lowcode/forecast/operator_config.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
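A toy illustration of the preprocessing implemented by `Transformations` above: gaps are filled per series by linear interpolation, then values whose per-series z-score exceeds 3 in absolute value are replaced with the series mean. The column names are made up for the example.

```python
import pandas as pd

df = pd.DataFrame({"Store": ["A"] * 20, "Sales": [10.0] * 19 + [500.0]})

# Missing-value imputation (a no-op here), as in _missing_value_imputation:
df["Sales"] = df.groupby("Store")["Sales"].transform(
    lambda x: x.interpolate(limit_direction="both")
)

# Outlier treatment, as in _outlier_treatment: |z| > 3 -> replace with the mean.
z = df.groupby("Store")["Sales"].transform(lambda x: (x - x.mean()) / x.std())
df.loc[z.abs() > 3, "Sales"] = df.groupby("Store")["Sales"].transform("mean")
# The 500.0 spike (z ≈ 4.2) is replaced by the series mean, 34.5, which is
# computed while the spike is still present, matching the code above.
```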
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +from dataclasses import dataclass, field +from typing import Dict, List + +from ads.common.serializer import DataClassSerializable +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.common.operator_config import OperatorConfig + +from .const import SupportedMetrics +from .const import SupportedModels + +@dataclass(repr=True) +class InputData(DataClassSerializable): + """Class representing operator specification input data details.""" + + format: str = None + columns: List[str] = None + url: str = None + options: Dict = None + limit: int = None + + +@dataclass(repr=True) +class TestData(DataClassSerializable): + """Class representing operator specification test data details.""" + + connect_args: Dict = None + format: str = None + columns: List[str] = None + url: str = None + name: str = None + options: Dict = None + + +@dataclass(repr=True) +class OutputDirectory(DataClassSerializable): + """Class representing operator specification output directory details.""" + + connect_args: Dict = None + format: str = None + url: str = None + name: str = None + options: Dict = None + + +@dataclass(repr=True) +class DateTimeColumn(DataClassSerializable): + """Class representing operator specification date time column details.""" + + name: str = None + format: str = None + + +@dataclass(repr=True) +class Tuning(DataClassSerializable): + """Class representing operator specification tuning details.""" + + n_trials: int = None + + +@dataclass(repr=True) +class ForecastOperatorSpec(DataClassSerializable): + """Class representing forecast operator specification.""" + + name: str = None + historical_data: InputData = field(default_factory=InputData) + additional_data: InputData = field(default_factory=InputData) + test_data: TestData = field(default_factory=TestData) + output_directory: OutputDirectory = field(default_factory=OutputDirectory) + report_filename: str = None + report_title: str = None + report_theme: str = None + metrics_filename: str = None + test_metrics_filename: str = None + forecast_filename: str = None + global_explanation_filename: str = None + local_explanation_filename: str = None + target_column: str = None + preprocessing: bool = None + datetime_column: DateTimeColumn = field(default_factory=DateTimeColumn) + target_category_columns: List[str] = field(default_factory=list) + generate_report: bool = None + generate_metrics: bool = None + generate_explanations: bool = None + horizon: int = None + freq: str = None + model: str = None + model_kwargs: Dict = field(default_factory=dict) + confidence_interval_width: float = None + metric: str = None + tuning: Tuning = field(default_factory=Tuning) + + def __post_init__(self): + """Adjusts the specification details.""" + self.metric = (self.metric or "").lower() or SupportedMetrics.SMAPE.lower() + self.model = (self.model or SupportedModels.Auto) + self.confidence_interval_width = self.confidence_interval_width or 0.80 + self.report_filename = self.report_filename or "report.html" + self.preprocessing = ( + self.preprocessing if self.preprocessing is not None else True + ) + # For Report Generation. When user doesn't specify defaults to True + self.generate_report = ( + self.generate_report if self.generate_report is not None else True + ) + # For Metrics files Generation. 
When user doesn't specify defaults to True + self.generate_metrics = ( + self.generate_metrics if self.generate_metrics is not None else True + ) + # For Explanations Generation. When user doesn't specify defaults to False + self.generate_explanations = ( + self.generate_explanations + if self.generate_explanations is not None + else False + ) + self.report_theme = self.report_theme or "light" + self.metrics_filename = self.metrics_filename or "metrics.csv" + self.test_metrics_filename = self.test_metrics_filename or "test_metrics.csv" + self.forecast_filename = self.forecast_filename or "forecast.csv" + self.global_explanation_filename = ( + self.global_explanation_filename or "global_explanation.csv" + ) + self.local_explanation_filename = ( + self.local_explanation_filename or "local_explanation.csv" + ) + self.target_column = self.target_column or "Sales" + self.model_kwargs = self.model_kwargs or dict() + + +@dataclass(repr=True) +class ForecastOperatorConfig(OperatorConfig): + """Class representing forecast operator config. + + Attributes + ---------- + kind: str + The kind of the resource. For operators it is always - `operator`. + type: str + The type of the operator. For forecast operator it is always - `forecast` + version: str + The version of the operator. + spec: ForecastOperatorSpec + The forecast operator specification. + """ + + kind: str = "operator" + type: str = "forecast" + version: str = "v1" + spec: ForecastOperatorSpec = field(default_factory=ForecastOperatorSpec) + + @classmethod + def _load_schema(cls) -> str: + """Loads operator schema.""" + return _load_yaml_from_uri( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "schema.yaml") + ) diff --git a/ads/opctl/operator/lowcode/forecast/schema.yaml b/ads/opctl/operator/lowcode/forecast/schema.yaml new file mode 100644 index 000000000..38f109bab --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/schema.yaml @@ -0,0 +1,313 @@ +kind: + allowed: + - operator + required: true + type: string + default: operator + meta: + description: "Which service are you trying to use? Common kinds: `operator`, `job`" + +version: + allowed: + - "v1" + required: true + type: string + default: v1 + meta: + description: "Operators may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility." + +type: + required: true + type: string + default: forecast + meta: + description: "Type should always be `forecast` when using a forecast operator" + +spec: + required: true + schema: + historical_data: + required: true + type: dict + meta: + description: "This should be indexed by date and target category (optionally). It should include all targets and endogeneous data." + schema: + format: + allowed: + - csv + - json + - clipboard + - excel + - hdf + - feather + - load_files + required: false + type: string + columns: + required: false + type: list + schema: + type: string + options: + nullable: true + required: false + type: dict + url: + required: true + type: string + default: data.csv + meta: + description: "The url can be local, or remote. For example: `oci://@/data.csv`" + limit: + required: false + type: integer + + additional_data: + required: false + type: dict + meta: + description: "Additional datasets must be indexed by the same targets and target categories as the historical data. Also is must have datapoints for each date/category for your horizon. This must be exogeneous data." 
+ schema: + format: + allowed: + - csv + - json + - clipboard + - excel + - hdf + - feather + - load_files + required: false + type: string + columns: + required: false + type: list + schema: + type: string + options: + nullable: true + required: false + type: dict + url: + required: false + type: string + meta: + description: "The url can be local, or remote. For example: `oci://@/data.csv`" + limit: + required: false + type: integer + + test_data: + required: false + meta: + description: "Optional, only if evaluation is needed." + schema: + connect_args: + nullable: true + required: false + type: dict + format: + required: false + type: string + allowed: + - csv + - json + - clipboard + - excel + - hdf + - sql + columns: + required: false + type: list + schema: + type: string + url: + required: true + type: string + default: test.csv + meta: + description: "The url can be local, or remote. For example: `oci://@/data.csv`" + name: + required: false + type: string + options: + nullable: true + required: false + type: dict + type: dict + + output_directory: + required: false + schema: + connect_args: + nullable: true + required: false + type: dict + format: + required: false + type: string + allowed: + - csv + - json + - clipboard + - excel + - hdf + - sql + url: + required: true + type: string + default: result/ + meta: + description: "The url can be local, or remote. For example: `oci://@/`" + name: + required: false + type: string + options: + nullable: true + required: false + type: dict + type: dict + + report_filename: + required: false + type: string + default: report.html + meta: + description: "Placed into output_directory location. Defaults to report.html" + report_title: + required: false + type: string + report_theme: + required: false + type: string + default: light + allowed: + - light + - dark + metrics_filename: + required: false + type: string + default: metrics.csv + meta: + description: "Placed into output_directory location. Defaults to metrics.csv" + test_metrics_filename: + required: false + type: string + default: test_metrics.csv + meta: + description: "Placed into output_directory location. Defaults to test_metrics.csv" + forecast_filename: + required: false + type: string + default: forecast.csv + meta: + description: "Placed into output_directory location. Defaults to forecast.csv" + global_explanation_filename: + required: false + type: string + default: global_explanations.csv + meta: + description: "Placed into output_directory location. Defaults to global_explanations.csv" + local_explanation_filename: + required: false + type: string + default: local_explanations.csv + meta: + description: "Placed into output_directory location. Defaults to local_explanations.csv" + + target_column: + type: string + required: true + default: target + + preprocessing: + type: boolean + required: false + default: true + meta: + description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true" + + generate_explanations: + type: boolean + required: false + default: false + meta: + description: "Explainability, both local and global, can be disabled using this flag. Defaults to false." + + generate_report: + type: boolean + required: false + default: true + meta: + description: "Report file generation can be enabled using this flag. Defaults to true." + + generate_metrics: + type: boolean + required: false + default: true + meta: + description: "Metrics files generation can be enabled using this flag. Defaults to true." 
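+    # Illustrative (hypothetical) `spec` snippet for a user's forecast.yaml exercising the
+    # fields defined in this schema; the bucket, file, and column names are placeholders:
+    #
+    #   spec:
+    #     historical_data:
+    #       url: oci://bucket@namespace/forecast/data.csv
+    #     datetime_column:
+    #       name: Date
+    #     target_column: Sales
+    #     horizon: 14
+    #     model: auto
+    #     generate_report: true
+    #     generate_metrics: true
+    #     generate_explanations: false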
+ + datetime_column: + type: dict + required: true + schema: + name: + type: string + required: true + default: Date + format: + type: string + required: false + + target_category_columns: + type: list + required: true + schema: + type: string + default: ["Column1"] + + horizon: + required: true + type: integer + + model: + type: string + required: false + default: auto + allowed: + - prophet + - arima + - neuralprophet + - automlx + - autots + - auto + + model_kwargs: + type: dict + required: false + + confidence_interval_width: + type: float + required: false + default: 0.80 + + tuning: + type: dict + required: false + schema: + n_trials: + type: integer + required: false + default: 10 + + metric: + type: string + required: false + default: MAPE + allowed: + - MAPE + - RMSE + - MSE + - SMAPE + type: dict diff --git a/ads/opctl/operator/lowcode/forecast/utils.py b/ads/opctl/operator/lowcode/forecast/utils.py new file mode 100644 index 000000000..dd3672546 --- /dev/null +++ b/ads/opctl/operator/lowcode/forecast/utils.py @@ -0,0 +1,599 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import sys +from typing import List + +import fsspec +import numpy as np +import pandas as pd +import plotly.express as px +from plotly import graph_objects as go +from sklearn.metrics import ( + explained_variance_score, + mean_absolute_percentage_error, + mean_squared_error, + r2_score, +) + +from ads.common.object_storage_details import ObjectStorageDetails +from ads.dataset.label_encoder import DataFrameLabelEncoder +from ads.opctl import logger + +from .const import SupportedMetrics, SupportedModels +from .errors import ForecastInputDataError, ForecastSchemaYamlError +from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig + + +def _label_encode_dataframe(df, no_encode=set()): + df_to_encode = df[list(set(df.columns) - no_encode)] + le = DataFrameLabelEncoder().fit(df_to_encode) + return le, le.transform(df) + + +def _inverse_transform_dataframe(le, df): + return le.inverse_transform(df) + + +def smape(actual, predicted) -> float: + if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]): + actual, predicted = (np.array(actual), np.array(predicted)) + denominator = np.abs(actual) + np.abs(predicted) + numerator = np.abs(actual - predicted) + default_output = np.ones_like(numerator) * np.inf + + abs_error = np.divide(numerator, denominator) + return round(np.mean(abs_error) * 100, 2) + + +def _build_metrics_per_horizon( + data: pd.DataFrame, + output: pd.DataFrame, + target_columns: List[str], + target_col: str, + horizon_periods: int, +) -> pd.DataFrame: + """ + Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE for each horizon + + Parameters + ------------ + data: Pandas Dataframe + Dataframe that has the actual data + output: Pandas Dataframe + Dataframe that has the forecasted data + target_columns: List + List of target category columns + target_col: str + Target column name (yhat) + horizon_periods: int + Horizon Periods + + Returns + -------- + Pandas Dataframe + Dataframe with Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE values for each horizon + """ + """ + Assumptions: + data and output have all the target columns. + yhats in output are in the same order as in target_columns. 
+ Test data might not have sorted dates and the order of series also might differ. + """ + + # Select the data with correct order of target_columns. + target_columns = list(set.intersection(set(target_columns), set(data.columns))) + + actuals_df = data[["ds"] + target_columns] + + # Concat the yhats in output and include only dates that are in test data + forecasts_df = pd.DataFrame() + for cat in output.list_categories(): + forecast_i = output.get_category(cat)[["Date", "forecast_value"]] + forecast_i = forecast_i[forecast_i["Date"].isin(actuals_df["ds"])] + forecasts_df = pd.concat([forecasts_df, forecast_i.set_index("Date")], axis=1) + + # Remove dates that are not there in output + actuals_df = actuals_df[actuals_df["ds"].isin(forecasts_df.index.values)] + + if actuals_df.empty or forecasts_df.empty: + return pd.DataFrame() + + totals = actuals_df.sum(numeric_only=True) + wmape_weights = np.array((totals / totals.sum()).values) + + actuals_df = actuals_df.set_index("ds") + + metrics_df = pd.DataFrame( + columns=[ + SupportedMetrics.MEAN_SMAPE, + SupportedMetrics.MEDIAN_SMAPE, + SupportedMetrics.MEAN_MAPE, + SupportedMetrics.MEDIAN_MAPE, + SupportedMetrics.MEAN_WMAPE, + SupportedMetrics.MEDIAN_WMAPE, + ] + ) + + for i, (y_true, y_pred) in enumerate( + zip(actuals_df.itertuples(index=False), forecasts_df.itertuples(index=False)) + ): + y_true, y_pred = np.array(y_true), np.array(y_pred) + + smapes = np.array( + [smape(actual=y_t, predicted=y_p) for y_t, y_p in zip(y_true, y_pred)] + ) + mapes = np.array( + [ + mean_absolute_percentage_error(y_true=[y_t], y_pred=[y_p]) + for y_t, y_p in zip(y_true, y_pred) + ] + ) + wmapes = np.array([mape * weight for mape, weight in zip(mapes, wmape_weights)]) + + metrics_row = { + SupportedMetrics.MEAN_SMAPE: np.mean(smapes), + SupportedMetrics.MEDIAN_SMAPE: np.median(smapes), + SupportedMetrics.MEAN_MAPE: np.mean(mapes), + SupportedMetrics.MEDIAN_MAPE: np.median(mapes), + SupportedMetrics.MEAN_WMAPE: np.mean(wmapes), + SupportedMetrics.MEDIAN_WMAPE: np.median(wmapes), + } + + metrics_df = pd.concat( + [metrics_df, pd.DataFrame(metrics_row, index=[actuals_df.index[i]])], + ) + + return metrics_df + + +def _call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs): + if fsspec.utils.get_protocol(filename) == "file": + return pd_fn(filename, **kwargs) + elif fsspec.utils.get_protocol(filename) in ["http", "https"]: + return pd_fn(filename, **kwargs) + + storage_options = storage_options or ( + default_signer() if ObjectStorageDetails.is_oci_path(filename) else {} + ) + + return pd_fn(filename, storage_options=storage_options, **kwargs) + + +def _load_data(filename, format, storage_options=None, columns=None, **kwargs): + if not format: + _, format = os.path.splitext(filename) + format = format[1:] + if format in ["json", "clipboard", "excel", "csv", "feather", "hdf"]: + read_fn = getattr(pd, f"read_{format}") + data = _call_pandas_fsspec(read_fn, filename, storage_options=storage_options) + elif format in ["tsv"]: + data = _call_pandas_fsspec( + pd.read_csv, filename, storage_options=storage_options, sep="\t" + ) + else: + raise ForecastInputDataError(f"Unrecognized format: {format}") + if columns: + # keep only these columns, done after load because only CSV supports stream filtering + data = data[columns] + return data + + +def _write_data(data, filename, format, storage_options, index=False, **kwargs): + if not format: + _, format = os.path.splitext(filename) + format = format[1:] + if format in ["json", "clipboard", "excel", "csv", "feather", 
"hdf"]: + write_fn = getattr(data, f"to_{format}") + return _call_pandas_fsspec( + write_fn, filename, index=index, storage_options=storage_options + ) + raise ForecastInputDataError(f"Unrecognized format: {format}") + + +def _merge_category_columns(data, target_category_columns): + result = data.apply( + lambda x: "__".join([str(x[col]) for col in target_category_columns]), axis=1 + ) + return result if not result.empty else pd.Series([], dtype=str) + + +def _clean_data(data, target_column, datetime_column, target_category_columns=None): + if target_category_columns is not None: + data["__Series__"] = _merge_category_columns(data, target_category_columns) + unique_categories = data["__Series__"].unique() + + df = pd.DataFrame() + new_target_columns = [] + + for cat in unique_categories: + data_cat = data[data["__Series__"] == cat].rename( + {target_column: f"{target_column}_{cat}"}, axis=1 + ) + data_cat_clean = data_cat.drop("__Series__", axis=1).set_index( + datetime_column + ) + df = pd.concat([df, data_cat_clean], axis=1) + new_target_columns.append(f"{target_column}_{cat}") + df = df.reset_index() + + return df.fillna(0), new_target_columns + + raise ForecastSchemaYamlError( + f"Either target_columns, target_category_columns, or datetime_column not specified." + ) + + +def _validate_and_clean_data( + cat: str, horizon: int, primary: pd.DataFrame, additional: pd.DataFrame +): + """ + Checks compatibility between primary and additional dataframe for a category. + + Parameters + ---------- + cat: (str) + Category for which data is being validated. + horizon: (int) + horizon value for the forecast. + primary: (pd.DataFrame) + primary dataframe. + additional: (pd.DataFrame) + additional dataframe. + + Returns + ------- + (pd.DataFrame, pd.DataFrame) or (None, None) + Updated primary and additional dataframe or None values if the validation criteria does not satisfy. 
+ """ + # Additional data should have future values for horizon + data_row_count = primary.shape[0] + data_add_row_count = additional.shape[0] + additional_surplus = data_add_row_count - horizon - data_row_count + if additional_surplus < 0: + logger.warn( + "Forecast for {} will not be generated since additional data has fewer values({}) than" + " horizon({}) + primary data({})".format( + cat, data_add_row_count, horizon, data_row_count + ) + ) + return None, None + elif additional_surplus > 0: + # Removing surplus future data in additional + additional.drop(additional.tail(additional_surplus).index, inplace=True) + + # Dates in primary data should be subset of additional data + dates_in_data = primary.index.tolist() + dates_in_additional = additional.index.tolist() + if not set(dates_in_data).issubset(set(dates_in_additional)): + logger.warn( + "Forecast for {} will not be generated since the dates in primary and additional do not" + " match".format(cat) + ) + return None, None + return primary, additional + + +def _build_indexed_datasets( + data, + target_column, + datetime_column, + horizon, + target_category_columns=None, + additional_data=None, + metadata_data=None, +): + df_by_target = dict() + categories = [] + + if target_category_columns is None: + if additional_data is None: + df_by_target[target_column] = data.fillna(0) + else: + df_by_target[target_column] = pd.concat( + [ + data.set_index(datetime_column).fillna(0), + additional_data.set_index(datetime_column).fillna(0), + ], + axis=1, + ).reset_index() + return df_by_target, target_column, categories + + data["__Series__"] = _merge_category_columns(data, target_category_columns) + unique_categories = data["__Series__"].unique() + invalid_categories = [] + + if additional_data is not None and target_column in additional_data.columns: + logger.warn(f"Dropping column '{target_column}' from additional_data") + additional_data.drop(target_column, axis=1, inplace=True) + for cat in unique_categories: + data_by_cat = data[data["__Series__"] == cat].rename( + {target_column: f"{target_column}_{cat}"}, axis=1 + ) + data_by_cat_clean = ( + data_by_cat.drop(target_category_columns + ["__Series__"], axis=1) + .set_index(datetime_column) + .fillna(0) + ) + if additional_data is not None: + additional_data["__Series__"] = _merge_category_columns( + additional_data, target_category_columns + ) + data_add_by_cat = additional_data[ + additional_data["__Series__"] == cat + ].rename({target_column: f"{target_column}_{cat}"}, axis=1) + data_add_by_cat_clean = ( + data_add_by_cat.drop(target_category_columns + ["__Series__"], axis=1) + .set_index(datetime_column) + .fillna(0) + ) + valid_primary, valid_add = _validate_and_clean_data( + cat, horizon, data_by_cat_clean, data_add_by_cat_clean + ) + + if valid_primary is None: + invalid_categories.append(cat) + data_by_cat_clean = None + else: + data_by_cat_clean = pd.concat([valid_add, valid_primary], axis=1) + if data_by_cat_clean is not None: + df_by_target[f"{target_column}_{cat}"] = data_by_cat_clean.reset_index() + + new_target_columns = list(df_by_target.keys()) + remaining_categories = set(unique_categories) - set(invalid_categories) + + if not len(remaining_categories): + raise ForecastInputDataError( + "Stopping forecast operator as there is no data that meets the validation criteria." 
+ ) + return df_by_target, new_target_columns, remaining_categories + + +def _build_metrics_df(y_true, y_pred, column_name): + metrics = dict() + metrics["sMAPE"] = smape(actual=y_true, predicted=y_pred) + metrics["MAPE"] = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred) + metrics["RMSE"] = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred)) + metrics["r2"] = r2_score(y_true=y_true, y_pred=y_pred) + metrics["Explained Variance"] = explained_variance_score( + y_true=y_true, y_pred=y_pred + ) + return pd.DataFrame.from_dict(metrics, orient="index", columns=[column_name]) + + +def evaluate_train_metrics( + target_columns, datasets, output, datetime_col, target_col="yhat" +): + """ + Training metrics + """ + total_metrics = pd.DataFrame() + for idx, col in enumerate(target_columns): + try: + forecast_by_col = output.get_target_category(col)[ + ["input_value", "Date", "fitted_value"] + ].dropna() + y_true = forecast_by_col["input_value"].values + y_pred = forecast_by_col["fitted_value"].values + metrics_df = _build_metrics_df( + y_true=y_true, y_pred=y_pred, column_name=col + ) + total_metrics = pd.concat([total_metrics, metrics_df], axis=1) + except Exception as e: + logger.warn(f"Failed to generate training metrics for target_series: {col}") + logger.debug(f"Recieved Error Statement: {e}") + return total_metrics + + +def _select_plot_list(fn, target_columns): + import datapane as dp + + blocks = [dp.Plot(fn(i, col), label=col) for i, col in enumerate(target_columns)] + return dp.Select(blocks=blocks) if len(target_columns) > 1 else blocks[0] + + +def _add_unit(num, unit): + return f"{num} {unit}" + + +def get_forecast_plots( + forecast_output, + target_columns, + horizon, + test_data=None, + ci_interval_width=0.95, +): + def plot_forecast_plotly(idx, col): + fig = go.Figure() + forecast_i = forecast_output.get_target_category(col) + upper_bound = forecast_output.upper_bound_name + lower_bound = forecast_output.lower_bound_name + if upper_bound is not None and lower_bound is not None: + fig.add_traces( + [ + go.Scatter( + x=forecast_i["Date"], + y=forecast_i[lower_bound], + mode="lines", + line_color="rgba(0,0,0,0)", + showlegend=False, + ), + go.Scatter( + x=forecast_i["Date"], + y=forecast_i[upper_bound], + mode="lines", + line_color="rgba(0,0,0,0)", + name=f"{ci_interval_width * 100}% confidence interval", + fill="tonexty", + fillcolor="rgba(211, 211, 211, 0.5)", + ), + ] + ) + if test_data is not None and col in test_data: + fig.add_trace( + go.Scatter( + x=test_data["ds"], + y=test_data[col], + mode="markers", + marker_color="green", + name="Actual", + ) + ) + + fig.add_trace( + go.Scatter( + x=forecast_i["Date"], + y=forecast_i["input_value"], + mode="markers", + marker_color="black", + name="Historical", + ) + ) + fig.add_trace( + go.Scatter( + x=forecast_i["Date"], + y=forecast_i["fitted_value"], + mode="lines+markers", + line_color="blue", + name="Fitted Values", + ) + ) + fig.add_trace( + go.Scatter( + x=forecast_i["Date"], + y=forecast_i["forecast_value"], + mode="lines+markers", + line_color="blue", + name="Forecast", + ) + ) + fig.add_vline( + x=forecast_i["Date"][-(horizon + 1) :].values[0], + line_width=1, + line_dash="dash", + line_color="gray", + ) + return fig + + return _select_plot_list(plot_forecast_plotly, target_columns) + + +def human_time_friendly(seconds): + TIME_DURATION_UNITS = ( + ("week", 60 * 60 * 24 * 7), + ("day", 60 * 60 * 24), + ("hour", 60 * 60), + ("min", 60), + ) + if seconds == 0: + return "inf" + accumulator = [] + for unit, div in 
TIME_DURATION_UNITS:
+        amount, seconds = divmod(float(seconds), div)
+        if amount > 0:
+            accumulator.append(
+                "{} {}{}".format(int(amount), unit, "" if amount == 1 else "s")
+            )
+    accumulator.append("{} secs".format(round(seconds, 2)))
+    return ", ".join(accumulator)
+
+
+def select_auto_model(
+    datasets: "ForecastDatasets", operator_config: ForecastOperatorConfig
+) -> str:
+    """
+    Selects a model automatically based on the shape and frequency of the data.
+
+    Returns AutoMLX for small datasets (few rows, series, and additional columns) sampled
+    at an hourly or coarser frequency, AutoTS for datasets with more than ten series and
+    fewer than 20,000 rows, and NeuralProphet otherwise.
+
+    Parameters
+    ------------
+    datasets: ForecastDatasets
+        Datasets for predictions
+    operator_config: ForecastOperatorConfig
+        The forecast operator configuration.
+
+    Returns
+    --------
+    str
+        The type of the model.
+    """
+    date_column = operator_config.spec.datetime_column.name
+    datetimes = pd.to_datetime(
+        datasets.original_user_data[date_column].drop_duplicates()
+    )
+    freq_in_secs = datetimes.tail().diff().min().total_seconds()
+    if datasets.original_additional_data is not None:
+        num_of_additional_cols = len(datasets.original_additional_data.columns) - 2
+    else:
+        num_of_additional_cols = 0
+    row_count = len(datasets.original_user_data.index)
+    number_of_series = len(datasets.categories)
+    if (
+        num_of_additional_cols < 15
+        and row_count < 10000
+        and number_of_series < 10
+        and freq_in_secs > 3600
+    ):
+        return SupportedModels.AutoMLX
+    elif row_count < 10000 and number_of_series > 10:
+        operator_config.spec.model_kwargs["model_list"] = "fast_parallel"
+        return SupportedModels.AutoTS
+    elif row_count < 20000 and number_of_series > 10:
+        operator_config.spec.model_kwargs["model_list"] = "superfast"
+        return SupportedModels.AutoTS
+    elif row_count > 20000:
+        return SupportedModels.NeuralProphet
+    else:
+        return SupportedModels.NeuralProphet
+
+
+def get_frequency_of_datetime(data: pd.DataFrame, dataset_info: ForecastOperatorSpec):
+    """
+    Infers the frequency of the datetime column and checks that the data is compatible
+    with the selected model.
+
+    Parameters
+    ------------
+    data: pd.DataFrame
+        primary dataset
+    dataset_info: ForecastOperatorSpec
+        The forecast operator specification.
+
+    Returns
+    --------
+    str
+        The inferred frequency of the datetime column.
+    """
+    date_column = dataset_info.datetime_column.name
+    datetimes = pd.to_datetime(
+        data[date_column].drop_duplicates(), format=dataset_info.datetime_column.format
+    )
+    freq = pd.DatetimeIndex(datetimes).inferred_freq
+    if dataset_info.model == SupportedModels.AutoMLX:
+        freq_in_secs = datetimes.tail().diff().min().total_seconds()
+        if abs(freq_in_secs) < 3600:
+            message = (
+                "{} requires data with a frequency of at least one hour. Please try using a different model,"
+                " or select the 'auto' option.".format(SupportedModels.AutoMLX, freq)
+            )
+            raise Exception(message)
+    return freq
+
+
+def default_signer(**kwargs):
+    os.environ["EXTRA_USER_AGENT_INFO"] = "Forecast-Operator"
+    from ads.common.auth import default_signer
+
+    return default_signer(**kwargs)
+
+
+# Disable stdout printing
+def block_print():
+    sys.stdout = open(os.devnull, "w")
+
+
+# Restore stdout printing
+def enable_print():
+    sys.stdout = sys.__stdout__
diff --git a/ads/opctl/operator/lowcode/pii/MLoperator b/ads/opctl/operator/lowcode/pii/MLoperator
new file mode 100644
index 000000000..49dafdb5a
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/MLoperator
@@ -0,0 +1,15 @@
+type: pii
+version: v1
+name: PII Operator
+conda_type: published
+conda: pii_v1
+gpu: no
+keywords:
+  - PII
+  - Spacy
+backends:
+  - job
+description: |
+  PII operator detects and redacts Personally Identifiable Information (PII)
+  in datasets by combining pattern matching and machine learning solutions.
+  Use `ads operator info -t pii` to get more details about the pii operator.
diff --git a/ads/opctl/operator/lowcode/pii/README.md b/ads/opctl/operator/lowcode/pii/README.md
new file mode 100644
index 000000000..59b2c43f8
--- /dev/null
+++ b/ads/opctl/operator/lowcode/pii/README.md
@@ -0,0 +1,208 @@
+# PII Operator
+
+
+The PII Operator aims to detect and redact Personally Identifiable Information (PII) in datasets. PII data includes information such as names, addresses, and social security numbers, which can be used to identify individuals. This operator combines pattern matching and machine learning solutions to identify PII, and then redacts or anonymizes it to protect the privacy of individuals.
+
+Below are the steps to configure and run the PII Operator on different resources.
+
+## 1. Prerequisites
+
+Follow the [CLI Configuration](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) steps from the ADS documentation. This step is mandatory as it sets up default values for different options while running the PII Operator on OCI Data Science jobs.
+
+## 2. Generating configs
+
+To generate starter configs, run the command below. This will create a list of YAML configs and place them in the `output` folder.
+
+```bash
+ads operator init -t pii --overwrite --output ~/pii/
+```
+
+The most important files expected to be generated are:
+
+- `pii.yaml`: Contains pii-related configuration.
+- `pii_operator_local_python.yaml`: This includes a local backend configuration for running the pii operator in a local environment. The environment should be set up manually before running the operator.
+- `pii_operator_local_container.yaml`: This includes a local backend configuration for running the pii operator within a local container. The container should be built before running the operator. Please refer to the instructions below for details on how to accomplish this.
+- `pii_job_container.yaml`: Contains Data Science job-related config to run the pii operator in a Data Science job within a container (BYOC) runtime. The container should be built and published before running the operator. Please refer to the instructions below for details on how to accomplish this.
+- `pii_job_python.yaml`: Contains Data Science job-related config to run the pii operator in a Data Science job within a conda runtime. The conda environment should be built and published before running the operator.
+
+All generated configurations should be ready to use without the need for any additional adjustments. However, they are provided as starter kit configurations that can be customized as needed.
+
+## 3. Running Pii on the local conda environment
+
+To run the pii operator locally, create and activate a new conda environment (`ads-pii`). Install all the required libraries listed in the `environment.yaml` file.
+
+```yaml
+- aiohttp
+- datapane
+- gender_guesser
+- nameparser
+- oracle_ads[opctl]
+- plotly
+- scrubadub
+- scrubadub_spacy
+- spacy-transformers==1.2.5
+- spacy==3.6.1
+```
+
+Please review the `pii.yaml` file previously generated by the `init` command, and make any necessary adjustments to the input and output file locations. By default, it assumes that the files should be located in the same folder from which the `init` command was executed.
+
+Use the command below to verify the pii config.
+
+```bash
+ads operator verify -f ~/pii/pii.yaml
+```
+
+Use the following command to run the pii operator within the `ads-pii` conda environment.
+
+```bash
+ads operator run -f ~/pii/pii.yaml -b local
+```
+
+The operator will run in your local environment without requiring any additional modifications.
+
+## 4. Running pii on the local container
+
+To run the pii operator within a local container, follow these steps:
+
+Use the command below to build the pii container.
+
+```bash
+ads operator build-image -t pii
+```
+
+This will create a new `pii:v1` image, with `/etc/operator` as the designated working directory within the container.
+
+
+Check the `pii_operator_local_container.yaml` config file. By default, it should have a `volume` section with the `.oci` configs folder mounted.
+
+```yaml
+volume:
+  - "/Users//.oci:/root/.oci"
+```
+
+Mounting the OCI configs folder is only required if an OCI Object Storage bucket will be used to store the input data or output result. The input/output folders can also be mounted to the container.
+
+```yaml
+volume:
+  - /Users//.oci:/root/.oci
+  - /Users//pii/data:/etc/operator/data
+  - /Users//pii/result:/etc/operator/result
+```
+
+The full config can look like:
+```yaml
+kind: operator.local
+spec:
+  image: pii:v1
+  volume:
+    - /Users//.oci:/root/.oci
+    - /Users//pii/data:/etc/operator/data
+    - /Users//pii/result:/etc/operator/result
+type: container
+version: v1
+```
+
+Run the pii operator within a container using the command below:
+
+```bash
+ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_operator_local_container.yaml
+```
+
+## 5. Running pii in the Data Science job within container runtime
+
+To execute the pii operator within a Data Science job using container runtime, please follow the steps outlined below:
+
+You can use the following command to build the pii container. This step can be skipped if you have already done this for running the operator within a local container.
+
+```bash
+ads operator build-image -t pii
+```
+
+This will create a new `pii:v1` image, with `/etc/operator` as the designated working directory within the container.
+
+Publish the `pii:v1` container to the [Oracle Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/home.htm). To become familiar with OCI, read the documentation links posted below.
+
+- [Access Container Registry](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Concepts/registryoverview.htm#access)
+- [Create repositories](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrycreatingarepository.htm#top)
+- [Push images](https://docs.public.oneportal.content.oci.oraclecloud.com/en-us/iaas/Content/Registry/Tasks/registrypushingimagesusingthedockercli.htm#Pushing_Images_Using_the_Docker_CLI)
+
+To publish `pii:v1` to OCR, use the command posted below:
+
+```bash
+ads operator publish-image pii:v1 --registry 
+```
+
+After the container is published to OCR, it can be used within the Data Science jobs service. Check the `pii_job_container.yaml` config file. It should contain pre-populated infrastructure and runtime sections. The runtime section should contain an image property, something like `image: iad.ocir.io//pii:v1`. More details about supported options can be found in the ADS Jobs documentation - [Run a Container](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_container.html).
+
+Adjust the `pii.yaml` config with proper input/output folders. When the operator is run in the Data Science job, it will not have access to local folders.
Therefore, input data and output folders should be placed in the Object Storage bucket. Open the `pii.yaml` and adjust the following fields: + +```yaml +input_data: + url: oci://bucket@namespace/pii/input_data/data.csv +output_directory: + url: oci://bucket@namespace/pii/result/ +``` + +Run the pii operator on the Data Science jobs using the command posted below: + +```bash +ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_job_container.yaml +``` + +The logs can be monitored using the `ads opctl watch` command. + +```bash +ads opctl watch +``` + + +## 6. Running pii in the Data Science job within conda runtime + +To execute the pii operator within a Data Science job using conda runtime, please follow the steps outlined below: + +You can use the following command to build the pii conda environment. + +```bash +ads operator build-conda -t pii +``` + +This will create a new `pii_v1` conda environment and place it in the folder specified within `ads opctl configure` command. + +Use the command below to Publish the `pii_v1` conda environment to the Object Storage bucket. + +```bash +ads opctl conda publish pii_v1 +``` +More details about configuring CLI can be found here - [Configuring CLI](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/opctl/configure.html) + + +After the conda environment is published to Object Storage, it can be used within Data Science jobs service. Check the `pii_job_python.yaml` config file. It should contain pre-populated infrastructure and runtime sections. The runtime section should contain a `conda` section. + +```yaml +conda: + type: published + uri: oci://bucket@namespace/conda_environments/cpu/pii/1/pii_v1 +``` + +More details about supported options can be found in the ADS Jobs documentation - [Run a Python Workload](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/run_python.html). + +Adjust the `pii.yaml` config with proper input/output folders. When the pii is run in the Data Science job, it will not have access to local folders. Therefore, input data and output folders should be placed in the Object Storage bucket. Open the `pii.yaml` and adjust the following fields: + +```yaml +input_data: + url: oci://bucket@namespace/pii/input_data/data.csv +output_directory: + url: oci://bucket@namespace/pii/result/ +``` + +Run the pii on the Data Science jobs using the command posted below: + +```bash +ads operator run -f ~/pii/pii.yaml --backend-config ~/pii/pii_job_python.yaml +``` + +The logs can be monitored using the `ads opctl watch` command. + +```bash +ads opctl watch +``` diff --git a/ads/opctl/operator/lowcode/pii/__init__.py b/ads/opctl/operator/lowcode/pii/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/pii/__main__.py b/ads/opctl/operator/lowcode/pii/__main__.py new file mode 100644 index 000000000..111b7ed3f --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/__main__.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import json +import os +import sys +from typing import Dict, List + +import yaml + +from ads.opctl import logger +from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS +from ads.opctl.operator.common.utils import _parse_input_args + +from .model.guardrails import PIIGuardrail +from .operator_config import PiiOperatorConfig + + +def operate(operator_config: PiiOperatorConfig) -> None: + """Runs the PII operator.""" + guard = PIIGuardrail(config=operator_config) + guard.process() + + +def verify(spec: Dict, **kwargs: Dict) -> bool: + """Verifies the PII operator config.""" + operator = PiiOperatorConfig.from_dict(spec) + msg_header = ( + f"{'*' * 30} The operator config has been successfully verified {'*' * 30}" + ) + print(msg_header) + print(operator.to_yaml()) + print("*" * len(msg_header)) + + +def main(raw_args: List[str]): + """The entry point of the PII the operator.""" + args, _ = _parse_input_args(raw_args) + if not args.file and not args.spec and not os.environ.get(ENV_OPERATOR_ARGS): + logger.info( + "Please specify -f[--file] or -s[--spec] or " + f"pass operator's arguments via {ENV_OPERATOR_ARGS} environment variable." + ) + return + + logger.info("-" * 100) + logger.info(f"{'Running' if not args.verify else 'Verifying'} the operator...") + + # if spec provided as input string, then convert the string into YAML + yaml_string = "" + if args.spec or os.environ.get(ENV_OPERATOR_ARGS): + operator_spec_str = args.spec or os.environ.get(ENV_OPERATOR_ARGS) + try: + yaml_string = yaml.safe_dump(json.loads(operator_spec_str)) + except json.JSONDecodeError: + yaml_string = yaml.safe_dump(yaml.safe_load(operator_spec_str)) + except: + yaml_string = operator_spec_str + + operator_config = PiiOperatorConfig.from_yaml( + uri=args.file, + yaml_string=yaml_string, + ) + + logger.info(operator_config.to_yaml()) + + # run operator + if args.verify: + verify(operator_config) + else: + operate(operator_config) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/ads/opctl/operator/lowcode/pii/cmd.py b/ads/opctl/operator/lowcode/pii/cmd.py new file mode 100644 index 000000000..67bf14d27 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/cmd.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from typing import Dict + +from ads.opctl import logger +from ads.opctl.operator.common.operator_yaml_generator import YamlGenerator +from ads.opctl.operator.common.utils import _load_yaml_from_uri + + +def init(**kwargs: Dict) -> str: + """ + Generates operator config by the schema. + + Properties + ---------- + kwargs: (Dict, optional). + Additional key value arguments. + + - type: str + The type of the operator. + + Returns + ------- + str + The YAML specification generated based on the schema. 
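+
+    Example
+    -------
+    Calling init(type="pii") returns a starter specification pre-populated from
+    schema.yaml, including a default detector entry (see `default_detector` below).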
+ """ + logger.info("==== PII related options ====") + + default_detector = [{"name": ".", "action": "mask"}] + + return YamlGenerator( + schema=_load_yaml_from_uri(__file__.replace("cmd.py", "schema.yaml")) + ).generate_example_dict( + values={"type": kwargs.get("type"), "detectors": default_detector} + ) diff --git a/ads/opctl/operator/lowcode/pii/constant.py b/ads/opctl/operator/lowcode/pii/constant.py new file mode 100644 index 000000000..5c75ae74c --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/constant.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from ads.common.extended_enum import ExtendedEnumMeta + +DEFAULT_SHOW_ROWS = 25 +DEFAULT_TIME_OUT = 5 +DEFAULT_COLOR = "#D6D3D1" +DEFAULT_REPORT_FILENAME = "report.html" +DEFAULT_TARGET_COLUMN = "target" + + +class SupportedAction(str, metaclass=ExtendedEnumMeta): + """Supported action to process detected entities.""" + + MASK = "mask" + REMOVE = "remove" + ANONYMIZE = "anonymize" + + +class SupportedDetector(str, metaclass=ExtendedEnumMeta): + """Supported pii detectors.""" + + DEFAULT = "default" + SPACY = "spacy" + + +class DataFrameColumn(str, metaclass=ExtendedEnumMeta): + REDACTED_TEXT: str = "redacted_text" + ENTITIES: str = "entities_cols" + + +class YamlKey(str, metaclass=ExtendedEnumMeta): + """Yaml key used in pii.yaml.""" + + pass + + +YAML_KEYS = [ + "detectors", + "custom_detectors", + "spacy_detectors", + "anonymization", + "name", + "label", + "patterns", + "model", + "named_entities", + "entities", +] + +################ +# Report Const # +################ +PII_REPORT_DESCRIPTION = ( + "This report will offer a comprehensive overview of the redaction of personal identifiable information (PII) from the provided data." + "The `Summary` section will provide an executive summary of this process, including key statistics, configuration, and model usage." + "The `Details` section will offer a more granular analysis of each row of data, including relevant statistics." +) +DETAILS_REPORT_DESCRIPTION = "The following report will show the details on each row. You can view the highlighted named entities and their labels in the text under `TEXT` tab." + +FLAT_UI_COLORS = [ + "#1ABC9C", + "#2ECC71", + "#3498DB", + "#9B59B6", + "#34495E", + "#16A085", + "#27AE60", + "#2980B9", + "#8E44AD", + "#2C3E50", + "#F1C40F", + "#E67E22", + "#E74C3C", + "#ECF0F1", + "#95A5A6", + "#F39C12", + "#D35400", + "#C0392B", + "#BDC3C7", + "#7F8C8D", +] diff --git a/ads/opctl/operator/lowcode/pii/environment.yaml b/ads/opctl/operator/lowcode/pii/environment.yaml new file mode 100644 index 000000000..ffd60045e --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/environment.yaml @@ -0,0 +1,17 @@ +name: pii +channels: + - conda-forge +dependencies: + - python=3.9 + - pip + - pip: + - aiohttp + - datapane + - gender_guesser + - nameparser + - oracle_ads[opctl] + - plotly + - scrubadub + - scrubadub_spacy + - spacy-transformers==1.2.5 + - spacy==3.6.1 diff --git a/ads/opctl/operator/lowcode/pii/errors.py b/ads/opctl/operator/lowcode/pii/errors.py new file mode 100644 index 000000000..73aadaf46 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/errors.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class PIISchemaYamlError(Exception): + """Exception raised when there is an issue with the schema.""" + + def __init__(self, error: str): + super().__init__( + "Invalid PII operator specification. Check the YAML structure and ensure it " + "complies with the required schema for PII operator. \n" + f"{error}" + ) + + +class PIIInputDataError(Exception): + """Exception raised when there is an issue with input data.""" + + def __init__(self, error: str): + super().__init__( + "Invalid input data. Check the input data and ensure it " + "complies with the validation criteria. \n" + f"{error}" + ) diff --git a/ads/opctl/operator/lowcode/pii/model/__init__.py b/ads/opctl/operator/lowcode/pii/model/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/lowcode/pii/model/factory.py b/ads/opctl/operator/lowcode/pii/model/factory.py new file mode 100644 index 000000000..102204ea3 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/factory.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import uuid + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.opctl.operator.lowcode.pii.constant import SupportedDetector +from ads.opctl.operator.lowcode.pii.utils import construct_filth_cls_name + + +class UnSupportedDetectorError(Exception): + def __init__(self, dtype: str): + super().__init__( + f"Detector: `{dtype}` " + f"is not supported. Supported models: {SupportedDetector.values}" + ) + + +class PiiBaseDetector: + @classmethod + def construct(cls, **kwargs): + raise NotImplementedError + + +class BuiltInDetector(PiiBaseDetector): + @classmethod + def construct(cls, entity, **kwargs): + return entity + + +class SpacyDetector(PiiBaseDetector): + DEFAULT_SPACY_NAMED_ENTITIES = ["DATE", "FAC", "GPE", "LOC", "ORG", "PER", "PERSON"] + DEFAULT_SPACY_MODEL = "en_core_web_trf" + + @classmethod + @runtime_dependency(module="scrubadub", install_from=OptionalDependency.PII) + @runtime_dependency(module="scrubadub_spacy", install_from=OptionalDependency.PII) + def construct(cls, entity, model, **kwargs): + spacy_entity_detector = scrubadub_spacy.detectors.spacy.SpacyEntityDetector( + named_entities=[entity], + name=f"spacy_{uuid.uuid4()}", + model=model, + ) + if entity.upper() not in cls.DEFAULT_SPACY_NAMED_ENTITIES: + filth_cls = type( + construct_filth_cls_name(entity), + (scrubadub.filth.Filth,), + {"type": entity.upper()}, + ) + spacy_entity_detector.filth_cls_map[entity.upper()] = filth_cls + return spacy_entity_detector + + +class PiiDetectorFactory: + """ + The factory class helps to instantiate proper detector object based on the detector config. 
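+
+    Example
+    -------
+    PiiDetectorFactory.get_detector(detector_type="spacy", entity="person", model="en_core_web_trf")
+    returns a scrubadub_spacy SpacyEntityDetector configured for the `person` entity.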
+ """ + + _MAP = { + SupportedDetector.DEFAULT: BuiltInDetector, + SupportedDetector.SPACY: SpacyDetector, + } + + @classmethod + def get_detector( + cls, + detector_type, + entity, + model=None, + ): + if detector_type not in cls._MAP: + raise UnSupportedDetectorError(detector_type) + + return cls._MAP[detector_type].construct(entity=entity, model=model) diff --git a/ads/opctl/operator/lowcode/pii/model/guardrails.py b/ads/opctl/operator/lowcode/pii/model/guardrails.py new file mode 100644 index 000000000..41dc3514b --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/guardrails.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import time +from datetime import datetime + +from ads.common.object_storage_details import ObjectStorageDetails +from ads.opctl import logger +from ads.opctl.operator.lowcode.pii.constant import DataFrameColumn +from ads.opctl.operator.lowcode.pii.model.pii import PiiScrubber, detect, scrub +from ads.opctl.operator.lowcode.pii.model.report import ( + PIIOperatorReport, + PiiReportPageSpec, + PiiReportSpec, +) +from ads.opctl.operator.lowcode.pii.operator_config import PiiOperatorConfig +from ads.opctl.operator.lowcode.pii.utils import ( + _load_data, + _write_data, + default_signer, + get_output_name, +) + + +class PIIGuardrail: + def __init__(self, config: PiiOperatorConfig): + self.config = config + self.spec = config.spec + self.pii_scrubber = PiiScrubber(config=config) + self.scrubber = self.pii_scrubber.config_scrubber() + + output_filename = get_output_name( + target_name=self.spec.output_directory.name, + given_name=self.spec.input_data.url, + ) + self.dst_uri = os.path.join(self.spec.output_directory.url, output_filename) + self.config.spec.output_directory.name = output_filename + + self.report_uri = os.path.join( + self.spec.output_directory.url, + self.spec.report.report_filename, + ) + + self.report_context: PiiReportSpec = PiiReportSpec.from_dict( + { + "run_summary": { + "config": self.config, + "selected_detectors": self.pii_scrubber.detectors, + "selected_entities": self.pii_scrubber.entities, + "selected_spacy_model": self.pii_scrubber.spacy_model_detectors, + "show_rows": self.spec.report.show_rows, + "show_sensitive_info": self.spec.report.show_sensitive_content, + "src_uri": self.spec.input_data.url, + "total_tokens": 0, + }, + "run_details": {"rows": []}, + } + ) + + self.storage_options = ( + default_signer() + if ObjectStorageDetails.is_oci_path(self.spec.output_directory.url) + else {} + ) + self.datasets = None + + def load_data(self, uri=None, storage_options=None): + """Loads input data.""" + input_data_uri = uri or self.spec.input_data.url + logger.info(f"Loading input data from `{input_data_uri}` ...") + + self.datasets = _load_data( + filename=input_data_uri, + storage_options=storage_options or self.storage_options, + ) + return self + + def process(self, **kwargs): + """Process input data.""" + self.report_context.run_summary.timestamp = datetime.now().strftime( + "%d/%m/%Y %H:%M:%S" + ) + start_time = time.time() + + data = kwargs.pop("input_data", None) or self.datasets + report_uri = kwargs.pop("report_uri", None) or self.report_uri + dst_uri = kwargs.pop("dst_uri", None) or self.dst_uri + + if not data: + try: + self.load_data() + data = self.datasets + except Exception as e: + logger.warning( + f"Failed to load data from 
`{self.spec.input_data.url}`." + ) + raise e + + # process user data + data[DataFrameColumn.REDACTED_TEXT] = data[self.spec.target_column].apply( + lambda x: scrub(x, scrubber=self.scrubber) + ) + self.report_context.run_summary.elapsed_time = time.time() - start_time + self.report_context.run_summary.total_rows = len(data.index) + + # save output data + if dst_uri: + logger.info(f"Saving data into `{dst_uri}` ...") + + _write_data( + data=data.loc[:, data.columns != self.spec.target_column], + filename=dst_uri, + storage_options=kwargs.pop("storage_options", None) + or self.storage_options, + ) + + # prepare pii report + if report_uri: + logger.info(f"Generating report to `{report_uri}` ...") + + data[DataFrameColumn.ENTITIES] = data[self.spec.target_column].apply( + lambda x: detect(text=x, scrubber=self.scrubber) + ) + + for i in data.index: + text = data[self.spec.target_column][i] + ent_col = data[DataFrameColumn.ENTITIES][i] + page = PiiReportPageSpec.from_dict( + { + "id": i, + "total_tokens": len(ent_col), + "entities": ent_col, + "raw_text": text, + } + ) + self.report_context.run_details.rows.append(page) + self.report_context.run_summary.total_tokens += len(ent_col) + + self._process_context() + PIIOperatorReport( + report_spec=self.report_context, report_uri=report_uri + ).make_view().save_report( + storage_options=kwargs.pop("storage_options", None) + or self.storage_options + ) + + def _process_context(self): + """Count different type of filth.""" + statics = {} # statics : count Filth type in total + rows = self.report_context.run_details.rows + for row in rows: + entities = row.entities + row_statics = {} # count row + for ent in entities: + row_statics[ent.type] = row_statics.get(ent.type, 0) + 1 + statics[ent.type] = statics.get(ent.type, 0) + 1 + + row.statics = row_statics.copy() + + self.report_context.run_summary.statics = statics diff --git a/ads/opctl/operator/lowcode/pii/model/pii.py b/ads/opctl/operator/lowcode/pii/model/pii.py new file mode 100644 index 000000000..ba036d05e --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/pii.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.opctl import logger +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.lowcode.pii.model.factory import PiiDetectorFactory +from ads.opctl.operator.lowcode.pii.constant import ( + SupportedAction, + SupportedDetector, +) +from ads.opctl.operator.lowcode.pii.model.processor import ( + POSTPROCESSOR_MAP, + SUPPORTED_REPLACER, + Remover, +) + + +class PiiScrubber: + """Class used for config scrubber and count the detectors in use.""" + + @runtime_dependency(module="scrubadub", install_from=OptionalDependency.PII) + def __init__(self, config): + logger.info(f"Loading config from {config}") + if isinstance(config, str): + config = _load_yaml_from_uri(config) + + self.config = config + self.spec = ( + self.config["spec"] if isinstance(self.config, dict) else self.config.spec + ) + self.detector_spec = ( + self.spec["detectors"] + if isinstance(self.spec, dict) + else self.spec.detectors + ) + + self.scrubber = scrubadub.Scrubber() + + self.detectors = [] + self.entities = [] + self.spacy_model_detectors = [] + self.post_processors = {} + + self._reset_scrubber() + + def _reset_scrubber(self): + # Clean up default detectors + defautls_enable = self.scrubber._detectors.copy() + for d in defautls_enable: + self.scrubber.remove_detector(d) + + def _register(self, name, dtype, model, action, mask_with: str = None): + if action not in SupportedAction.values(): + raise ValueError( + f"Not supported `action`: {action}. Please select from {SupportedAction.values()}." + ) + + detector = PiiDetectorFactory.get_detector( + detector_type=dtype, entity=name, model=model + ) + self.scrubber.add_detector(detector) + self.entities.append(name) + + if action == SupportedAction.ANONYMIZE: + entity = ( + detector + if isinstance(detector, str) + else detector.filth_cls_map[name.upper()].type + ) + if entity in SUPPORTED_REPLACER.keys(): + replacer_name = SUPPORTED_REPLACER.get(entity).name + replacer = self.post_processors.get( + replacer_name, POSTPROCESSOR_MAP.get(replacer_name)() + ) + if hasattr(replacer, "_ENTITIES"): + replacer._ENTITIES.append(name) + self.post_processors[replacer_name] = replacer + else: + raise ValueError( + f"Not supported `action` {action} for this entity `{name}`. Please try with other action." 
+ ) + + if action == SupportedAction.REMOVE: + remover = self.post_processors.get("remover", Remover()) + remover._ENTITIES.append(name) + self.post_processors["remover"] = remover + + def config_scrubber(self): + """Returns an instance of srubadub.Scrubber.""" + + self.scrubber.redact_spec_file = self.spec + + for detector in self.detector_spec: + # example format for detector["name"]: default.phone or spacy.en_core_web_trf.person + d = detector["name"].split(".") + dtype = d[0] + dname = d[1] if len(d) == 2 else d[2] + model = None if len(d) == 2 else d[1] + + action = detector.get("action", SupportedAction.MASK) + self._register( + name=dname, + dtype=dtype, + model=model, + action=action, + ) + if dtype == SupportedDetector.SPACY: + exist = False + for spacy_detectors in self.spacy_model_detectors: + if spacy_detectors["model"] == model: + spacy_detectors["spacy_entites"].append(dname) + exist = True + break + if not exist: + self.spacy_model_detectors.append( + {"model": model, "spacy_entites": [dname]} + ) + + self._register_post_processor() + + self.detectors = list(self.scrubber._detectors.values()) + return self.scrubber + + def _register_post_processor(self): + for _, v in self.post_processors.items(): + self.scrubber.add_post_processor(v) + + +def scrub(text, config=None, scrubber=None): + if not scrubber: + scrubber = PiiScrubber(config=config).config_scrubber() + return scrubber.clean(text) + + +def detect(text, config=None, scrubber=None): + if not scrubber: + scrubber = PiiScrubber(config=config).config_scrubber() + return list(scrubber.iter_filth(text, document_name=None)) diff --git a/ads/opctl/operator/lowcode/pii/model/processor/__init__.py b/ads/opctl/operator/lowcode/pii/model/processor/__init__.py new file mode 100644 index 000000000..062a61aa7 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/processor/__init__.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from .email_replacer import EmailReplacer +from .mbi_replacer import MBIReplacer +from .name_replacer import NameReplacer +from .number_replacer import NumberReplacer +from .remover import Remover + +POSTPROCESSOR_MAP = { + item.name.lower(): item + for item in [ + NameReplacer, + NumberReplacer, + EmailReplacer, + MBIReplacer, + Remover, + ] +} + +# Currently only support anonymization for the following entity. +SUPPORTED_REPLACER = { + "name": NameReplacer, + "number": NumberReplacer, + "phone": NumberReplacer, + "social_security_number": NumberReplacer, + "fin": NumberReplacer, + "mrn": NumberReplacer, + "email": EmailReplacer, + "mbi": MBIReplacer, +} diff --git a/ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py new file mode 100644 index 000000000..69a9d92ef --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/processor/email_replacer.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) + +try: + import scrubadub +except ImportError: + raise ModuleNotFoundError( + f"`scrubadub` module was not found. Please run " + f"`pip install {OptionalDependency.PII}`." 
+ ) + + +class EmailReplacer(scrubadub.post_processors.PostProcessor): + name = "email_replacer" + + @runtime_dependency(module="faker", install_from=OptionalDependency.PII) + def process_filth(self, filth_list): + from faker import Faker + + for filth in filth_list: + if filth.replacement_string: + continue + if filth.type.lower() != "email": + continue + filth.replacement_string = Faker().email() + return filth_list diff --git a/ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py new file mode 100644 index 000000000..013526cad --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/processor/mbi_replacer.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import random +import string + +from ads.common.decorator.runtime_dependency import OptionalDependency + +try: + import scrubadub +except ImportError: + raise ModuleNotFoundError( + f"`scrubadub` module was not found. Please run " + f"`pip install {OptionalDependency.PII}`." + ) + + +class MBIReplacer(scrubadub.post_processors.PostProcessor): + name = "mbi_replacer" + CHAR_POOL = "ACDEFGHJKMNPQRTUVWXY" + + def generate_mbi(self): + return "".join(random.choices(self.CHAR_POOL + string.digits, k=11)) + + def process_filth(self, filth_list): + for filth in filth_list: + if filth.replacement_string: + continue + if filth.type.lower() != "mbi": + continue + filth.replacement_string = self.generate_mbi() + return filth_list diff --git a/ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py new file mode 100644 index 000000000..2c7dde747 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/processor/name_replacer.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) + +try: + import scrubadub +except ImportError: + raise ModuleNotFoundError( + f"`scrubadub` module was not found. Please run " + f"`pip install {OptionalDependency.PII}`." 
+ ) + + +class NameReplacer(scrubadub.post_processors.PostProcessor): + name = "name_replacer" + + @runtime_dependency(module="faker", install_from=OptionalDependency.PII) + @runtime_dependency(module="gender_guesser", install_from=OptionalDependency.PII) + def __init__(self, name: str = None, mapping: dict = None): + import gender_guesser.detector as gender_detector + from faker import Faker + + if mapping: + self.mapping = mapping + else: + self.mapping = {} + + self.gender_detector = gender_detector.Detector() + self.fake = Faker() + self.groups = { + "first": self.first_name_generator, + "middle": self.first_name_generator, + "last": self.last_name_generator, + "suffix": lambda x: "", + } + super().__init__(name) + + def first_name_generator(self, name): + detected_gender = self.gender_detector.get_gender(name) + if "female" in detected_gender: + return self.fake.first_name_female() + elif "male" in detected_gender: + return self.fake.first_name_male() + return self.fake.first_name_nonbinary() + + def last_name_generator(self, *args): + return self.fake.last_name() + + def unwrap_filth(self, filth_list): + """Un-merge the filths if they have different types.""" + processed = [] + for filth in filth_list: + # MergedFilths has the property "filths" + # Do nothing if filth has a type already + if filth.type in ["unknown", "", None] and hasattr(filth, "filths"): + filth_types = set([f.type.lower() for f in filth.filths]) + # Do nothing if the filth does not contain a name + if "name" not in filth_types: + processed.append(filth) + continue + if len(filth_types) > 1: + processed.extend(filth.filths) + continue + filth.type = filth.filths[0].type + filth.detector_name = filth.filths[0].detector_name + processed.append(filth) + return processed + + @staticmethod + def has_initial(name: "nameparser.HumanName") -> bool: + for attr in ["first", "middle", "last"]: + if len(str(getattr(name, attr)).strip(".")) == 1: + return True + return False + + @staticmethod + def has_non_initial(name: "nameparser.HumanName") -> bool: + for attr in ["first", "middle", "last"]: + if len(str(getattr(name, attr)).strip(".")) > 1: + return True + return False + + @staticmethod + def generate_component(name_component: str, generator): + fake_component = generator(name_component) + if len(name_component.rstrip(".")) == 1: + fake_component = fake_component[0] + if name_component.endswith("."): + fake_component += "." + return fake_component + + def save_name_mapping( + self, name: "nameparser.HumanName", fake_name: "nameparser.HumanName" + ): + """Saves the names with initials to the mapping so that a new name will not be generated. + For example, if name is "John Richard Doe", this method will save the following keys to the mapping: + - J Doe + - John D + - J R Doe + - John R D + - John R Doe + """ + # Both first name and last name must be presented + if not name.first or not name.last: + return + # Remove any dot at the end of the name component. 
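+        # e.g. "J." becomes "J", so keys such as "J Doe" match whether or not the initial was written with a period.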
+ for attr in ["first", "middle", "last"]: + setattr(name, attr, getattr(name, attr).rstrip(".")) + + self.mapping[ + f"{name.first[0]} {name.last}" + ] = f"{fake_name.first[0]} {fake_name.last}" + + self.mapping[ + f"{name.first} {name.last[0]}" + ] = f"{fake_name.first} {fake_name.last[0]}" + + if name.middle: + self.mapping[ + f"{name.first[0]} {name.middle[0]} {name.last}" + ] = f"{fake_name.first[0]} {fake_name.middle[0]} {fake_name.last}" + + self.mapping[ + f"{name.first} {name.middle[0]} {name.last[0]}" + ] = f"{fake_name.first} {fake_name.middle[0]} {fake_name.last[0]}" + + self.mapping[ + f"{name.first} {name.middle[0]} {name.last}" + ] = f"{fake_name.first} {fake_name.middle[0]} {fake_name.last}" + + @runtime_dependency(module="nameparser", install_from=OptionalDependency.PII) + def replace(self, text): + """Replaces a name with fake name. + + Parameters + ---------- + text : str or HumanName + The name to be replaced. + If text is a HumanName object, the object will be modified to have the new fake names. + + Returns + ------- + str + The replaced name as text. + """ + from nameparser import HumanName + + if isinstance(text, HumanName): + name = text + else: + name = HumanName(text) + skip = [] + # Check if the name is given with initial for one of the first name/last name + key = None + if self.has_initial(name) and self.has_non_initial(name): + if name.middle: + key = f'{name.first.rstrip(".")} {name.middle.rstrip(".")} {name.last.rstrip(".")}' + else: + key = f'{name.first.rstrip(".")} {name.last.rstrip(".")}' + fake_name = self.mapping.get(key) + # If a fake name is found matching the first initial + last name or first name + last initial + # Replace the the initial with the corresponding initial + # and skip processing the first and last name in the replacement. + if fake_name: + fake_name = HumanName(fake_name) + name.first = fake_name.first + name.last = fake_name.last + skip = ["first", "last"] + if name.middle: + name.middle = fake_name.middle + skip.append("middle") + # Replace each component in the name + for attr, generator in self.groups.items(): + if attr in skip: + continue + name_component = getattr(name, attr, None) + if not name_component: + continue + # Check if a fake name has been generated for this name + fake_component = self.mapping.get(name_component) + if not fake_component: + fake_component = self.generate_component(name_component, generator) + # Generate a unique fake name that is not already in the mapping + while fake_component and ( + fake_component in self.mapping.keys() + or fake_component in self.mapping.values() + ): + fake_component = self.generate_component(name_component, generator) + self.mapping[name_component] = fake_component + setattr(name, attr, fake_component) + + # Save name with initials to mapping + original_name = text if isinstance(text, HumanName) else HumanName(text) + self.save_name_mapping(original_name, name) + return str(name) + + @runtime_dependency(module="nameparser", install_from=OptionalDependency.PII) + def process_filth(self, filth_list): + from nameparser import HumanName + + filth_list = self.unwrap_filth(filth_list) + + name_filths = [] + # Filter to keep only the names + for filth in filth_list: + if filth.replacement_string: + continue + if filth.type.lower() != "name": + continue + name_filths.append(filth) + + # Sort reverse by last name so that names having a last name will be processed first. + # When a name is referred by last name (e.g. Mr. White), HumanName will parse it as first name. 
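+        # Sorting in descending order of the parsed last name means full names (which carry a last name)
+        # are replaced first; e.g. with filth texts ["White", "Walter White"], "Walter White" is handled first,
+        # so the bare "White" can reuse the fake surname already stored in the mapping.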
+ name_filths.sort(key=lambda x: HumanName(x.text).last, reverse=True) + for filth in name_filths: + filth.replacement_string = self.replace(filth.text) + return filth_list diff --git a/ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py b/ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py new file mode 100644 index 000000000..5bf678991 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/processor/number_replacer.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import datetime +import random +import re + +from ads.common.decorator.runtime_dependency import OptionalDependency + +try: + import scrubadub +except ImportError: + raise ModuleNotFoundError( + f"`scrubadub` module was not found. Please run " + f"`pip install {OptionalDependency.PII}`." + ) + + +class NumberReplacer(scrubadub.post_processors.PostProcessor): + name = "number_replacer" + _ENTITIES = [ + "number", + "mrn", + "fin", + "phone", + "social_security_number", + ] + + @staticmethod + def replace_digit(obj): + return random.choice("0123456789") + + def match_entity_type(self, filth_types): + if list(set(self._ENTITIES) & set(filth_types)): + return True + return False + + def replace_date(self, text): + date_formats = ["%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y"] + for date_format in date_formats: + try: + date = datetime.datetime.strptime(text, date_format) + except ValueError: + continue + if date.year < 1900 or date.year > datetime.datetime.now().year: + continue + # Now the date is a valid data between 1900 and now + return text + return None + + def replace(self, text): + # Check dates + date = self.replace_date(text) + if date: + return date + return re.sub(r"\d", self.replace_digit, text) + + def process_filth(self, filth_list): + for filth in filth_list: + # Do not process it if it already has a replacement. + if filth.replacement_string: + continue + if filth.type.lower() in self._ENTITIES: + filth.replacement_string = self.replace(filth.text) + # Replace the numbers for merged filth + if filth.type.lower() == "unknown" and hasattr(filth, "filths"): + filth_types = set([f.type for f in filth.filths]) + if self.match_entity_type(filth_types): + filth.replacement_string = self.replace(filth.text) + return filth_list diff --git a/ads/opctl/operator/lowcode/pii/model/processor/remover.py b/ads/opctl/operator/lowcode/pii/model/processor/remover.py new file mode 100644 index 000000000..0e014fe80 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/processor/remover.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from ads.common.decorator.runtime_dependency import OptionalDependency + +try: + import scrubadub +except ImportError: + raise ModuleNotFoundError( + f"`scrubadub` module was not found. Please run " + f"`pip install {OptionalDependency.PII}`." 
+ ) + + +class Remover(scrubadub.post_processors.PostProcessor): + name = "remover" + _ENTITIES = [] + + def process_filth(self, filth_list): + for filth in filth_list: + if filth.type.lower() in self._ENTITIES: + filth.replacement_string = "" + + return filth_list diff --git a/ads/opctl/operator/lowcode/pii/model/report.py b/ads/opctl/operator/lowcode/pii/model/report.py new file mode 100644 index 000000000..42167ba87 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/model/report.py @@ -0,0 +1,489 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +import os +import random +import tempfile +from dataclasses import dataclass, field +from typing import Dict, List + +import fsspec +import pandas as pd +import requests +import yaml + +from ads.common.decorator.runtime_dependency import ( + OptionalDependency, + runtime_dependency, +) +from ads.common.serializer import DataClassSerializable +from ads.opctl import logger +from ads.opctl.operator.lowcode.pii.constant import ( + DEFAULT_COLOR, + DEFAULT_SHOW_ROWS, + DEFAULT_TIME_OUT, + DETAILS_REPORT_DESCRIPTION, + FLAT_UI_COLORS, + PII_REPORT_DESCRIPTION, +) +from ads.opctl.operator.lowcode.pii.operator_config import PiiOperatorConfig +from ads.opctl.operator.lowcode.pii.utils import ( + block_print, + compute_rate, + enable_print, + human_time_friendly, +) + +try: + import datapane as dp +except ImportError: + raise ModuleNotFoundError( + f"`datapane` module was not found. Please run " + f"`pip install {OptionalDependency.PII}`." + ) + + +@dataclass(repr=True) +class PiiReportPageSpec(DataClassSerializable): + """Class representing each page under Run Details in pii operator report.""" + + entities: list = field(default_factory=list) + id: int = None + raw_text: str = None + statics: dict = field(default_factory=dict) + total_tokens: int = None + + +@dataclass(repr=True) +class RunDetails(DataClassSerializable): + """Class representing Run Details Page in pii operator report.""" + + rows: list = field(default_factory=list) + + +@dataclass(repr=True) +class RunSummary(DataClassSerializable): + """Class representing Run Summary Page in pii operator report.""" + + config: PiiOperatorConfig = None + elapsed_time: str = None + selected_detectors: list = field(default_factory=list) + selected_entities: List[str] = field(default_factory=list) + selected_spacy_model: List[Dict] = field(default_factory=list) + show_rows: int = None + show_sensitive_info: bool = False + src_uri: str = None + statics: dict = None + timestamp: str = None + total_rows: int = None + total_tokens: int = None + + +@dataclass(repr=True) +class PiiReportSpec(DataClassSerializable): + """Class representing pii operator report.""" + + run_details: RunDetails = field(default_factory=RunDetails) + run_summary: RunSummary = field(default_factory=RunSummary) + + +LABEL_TO_COLOR_MAP = {} + + +@runtime_dependency(module="plotly", install_from=OptionalDependency.PII) +def make_model_card(model_name="", readme_path=""): + """Make render model_readme.md as model_card tab. + All spacy model: https://huggingface.co/spacy + For example: "en_core_web_trf": "https://huggingface.co/spacy/en_core_web_trf/raw/main/README.md". 
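+    If the README cannot be downloaded, a placeholder group is returned; if its evaluation-results
+    front matter cannot be parsed, only the metrics table falls back to a placeholder.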
+ """ + + readme_path = ( + f"https://huggingface.co/spacy/{model_name}/raw/main/README.md" + if model_name + else readme_path + ) + if not readme_path: + raise NotImplementedError("Does not support other spacy model so far.") + + try: + requests.get(readme_path, timeout=DEFAULT_TIME_OUT) + with fsspec.open(readme_path, "r") as file: + content = file.read() + _, front_matter, text = content.split("---", 2) + data = yaml.safe_load(front_matter) + except requests.ConnectionError: + logger.warning( + "You don't have internet connection. Therefore, we are not able to generate model card." + ) + return dp.Group( + dp.Text("-"), + columns=1, + ) + + try: + import plotly.graph_objects as go + + eval_res = data["model-index"][0]["results"] + metrics = [] + values = [] + for eval in eval_res: + metric = [x["name"] for x in eval["metrics"]] + value = [x["value"] for x in eval["metrics"]] + metrics = metrics + metric + values = values + value + df = pd.DataFrame({"Metrics": metrics, "Values": values}) + fig = go.Figure( + data=[ + go.Table( + header=dict(values=list(df.columns)), + cells=dict(values=[df.Metrics, df.Values]), + ) + ] + ) + eval_res_tb = dp.Plot(data=fig, caption="Evaluation Results") + except: + eval_res_tb = dp.Text("-") + logger.warning( + "The given readme.md doesn't have correct template for Evaluation Results." + ) + + return dp.Group( + dp.Text(text), + eval_res_tb, + columns=2, + ) + + +def map_label_to_color(labels): + """Pair label with corresponding color.""" + label_to_colors = {} + for label in labels: + label = label.lower() + label_to_colors[label] = LABEL_TO_COLOR_MAP.get( + label, random.choice(FLAT_UI_COLORS) + ) + LABEL_TO_COLOR_MAP[label] = label_to_colors[label] + + return label_to_colors + + +@runtime_dependency(module="plotly", install_from=OptionalDependency.PII) +def plot_pie(count_map) -> dp.Plot: + import plotly.express as px + + cols = count_map.keys() + cnts = count_map.values() + ent_col_name = "EntityName" + cnt_col_name = "count" + df = pd.DataFrame({ent_col_name: cols, cnt_col_name: cnts}) + + fig = px.pie( + df, + values=cnt_col_name, + names=ent_col_name, + title="The Distribution Of Entities Redacted", + color=ent_col_name, + color_discrete_map=map_label_to_color(cols), + ) + fig.update_traces(textposition="inside", textinfo="percent+label") + return dp.Plot(fig) + + +def build_entity_df(entites, id) -> pd.DataFrame: + text = [ent.text for ent in entites] + types = [ent.type for ent in entites] + replaced_values = [ + ent.replacement_string or "{{" + ent.placeholder + "}}" for ent in entites + ] + d = { + "Row ID": id, + "Entity (Original Text)": text, + "Type": types, + "Redacted To": replaced_values, + } + df = pd.DataFrame(data=d) + if df.size == 0: + # Datapane does not support empty dataframe, append a dummy row + df2 = { + "Row ID": id, + "Entity (Original Text)": "-", + "Type": "-", + "Redacted To": "-", + } + df = df.append(df2, ignore_index=True) + return df + + +class RowReportFields: + def __init__(self, row_spec: PiiReportPageSpec, show_sensitive_info: bool = True): + self.spec = row_spec + self.show_sensitive_info = show_sensitive_info + + def build_report(self) -> dp.Group: + return dp.Group( + dp.Select( + blocks=[ + self._make_stats_card(), + self._make_text_card(), + ], + type=dp.SelectType.TABS, + ), + label="Row Id: " + str(self.spec.id), + ) + + def _make_stats_card(self): + stats = [ + dp.Text("## Row Summary Statistics"), + dp.BigNumber( + heading="Total No. 
Of Entites Proceed", + value=self.spec.total_tokens or 0, + ), + dp.Text(f"### Entities Distribution"), + plot_pie(self.spec.statics), + ] + if self.show_sensitive_info: + stats.append(dp.Text(f"### Resolved Entities")) + stats.append( + dp.DataTable( + build_entity_df(self.spec.entities, id=self.spec.id), + label="Resolved Entities", + ) + ) + return dp.Group(blocks=stats, label="STATS") + + def _make_text_card(self): + annotations = [] + labels = set() + for ent in self.spec.entities: + annotations.append((ent.beg, ent.end, ent.type)) + labels.add(ent.type) + + if len(annotations) == 0: + annotations.append((0, 0, "No entity detected")) + + d = {"Content": [self.spec.raw_text], "Annotations": [annotations]} + df = pd.DataFrame(data=d) + render_html = df.ads.render_ner( + options={ + "default_color": DEFAULT_COLOR, + "colors": map_label_to_color(labels), + }, + return_html=True, + ) + return dp.Group(dp.HTML(render_html), label="TEXT") + + +class PIIOperatorReport: + def __init__(self, report_spec: PiiReportSpec, report_uri: str): + # set useful field for generating report from context + self.report_spec = report_spec + self.show_rows = report_spec.run_summary.show_rows or DEFAULT_SHOW_ROWS + + rows = report_spec.run_details.rows + rows = rows[0 : self.show_rows] + self.rows_details = [ + RowReportFields(r, report_spec.run_summary.show_sensitive_info) + for r in rows + ] + + self.report_uri = report_uri + + def make_view(self): + title_text = dp.Text("# Personally Identifiable Information Operator Report") + time_proceed = dp.BigNumber( + heading="Ran at", + value=self.report_spec.run_summary.timestamp or "today", + ) + report_description = dp.Text(PII_REPORT_DESCRIPTION) + + structure = dp.Blocks( + dp.Select( + blocks=[ + dp.Group( + self._build_summary_page(), + label="Summary", + ), + dp.Group( + self._build_details_page(), + label="Details", + ), + ], + type=dp.SelectType.TABS, + ) + ) + self.report_sections = [title_text, report_description, time_proceed, structure] + return self + + def save_report(self, report_sections=None, report_uri=None, storage_options={}): + with tempfile.TemporaryDirectory() as temp_dir: + report_local_path = os.path.join(temp_dir, "___report.html") + block_print() + dp.save_report( + report_sections or self.report_sections, + path=report_local_path, + open=False, + ) + enable_print() + + report_uri = report_uri or self.report_uri + with open(report_local_path) as f1: + with fsspec.open( + report_uri, + "w", + **storage_options, + ) as f2: + f2.write(f1.read()) + + def _build_summary_page(self): + summary = dp.Blocks( + dp.Text("# PII Summary"), + dp.Text(self._get_summary_desc()), + dp.Select( + blocks=[ + self._make_summary_stats_card(), + self._make_yaml_card(), + self._make_model_card(), + ], + type=dp.SelectType.TABS, + ), + ) + + return summary + + def _build_details_page(self): + details = dp.Blocks( + dp.Text(DETAILS_REPORT_DESCRIPTION), + dp.Select( + blocks=[ + row.build_report() for row in self.rows_details + ], # RowReportFields + type=dp.SelectType.DROPDOWN, + label="Details", + ), + ) + + return details + + def _make_summary_stats_card(self) -> dp.Group: + """ + Shows summary statics + 1. total rows + 2. total entites + 3. time_spent/row + 4. entities distribution + 5. 
resolved Entities in sample data - optional + """ + try: + process_rate = compute_rate( + self.report_spec.run_summary.elapsed_time, + self.report_spec.run_summary.total_rows, + ) + except Exception as e: + logger.warning("Failed to compute processing rate.") + logger.debug(f"Full traceback: {e}") + process_rate = "-" + + summary_stats = [ + dp.Text("## Summary Statistics"), + dp.Group( + dp.BigNumber( + heading="Total No. Of Rows", + value=self.report_spec.run_summary.total_rows or "unknown", + ), + dp.BigNumber( + heading="Total No. Of Entites Proceed", + value=self.report_spec.run_summary.total_tokens, + ), + dp.BigNumber( + heading="Rows per second processed", + value=process_rate, + ), + dp.BigNumber( + heading="Total Time Spent", + value=human_time_friendly( + self.report_spec.run_summary.elapsed_time + ), + ), + columns=2, + ), + dp.Text(f"### Entities Distribution"), + plot_pie(self.report_spec.run_summary.statics), + ] + if self.report_spec.run_summary.show_sensitive_info: + entites_df = self._build_total_entity_df() + summary_stats.append(dp.Text(f"### Resolved Entities")) + summary_stats.append(dp.DataTable(entites_df)) + return dp.Group(blocks=summary_stats, label="STATS") + + def _make_yaml_card(self) -> dp.Group: + """Shows the full pii config yaml.""" + yaml_string = self.report_spec.run_summary.config.to_yaml() + yaml_appendix_title = dp.Text(f"## Reference: YAML File") + yaml_appendix = dp.Code(code=yaml_string, language="yaml") + return dp.Group(blocks=[yaml_appendix_title, yaml_appendix], label="YAML") + + def _make_model_card(self) -> dp.Group: + """Generates model card.""" + if len(self.report_spec.run_summary.selected_spacy_model) == 0: + return dp.Group( + dp.Text("No model used."), + label="MODEL CARD", + ) + + model_cards = [ + dp.Group( + make_model_card(model_name=x.get("model")), + label=x.get("model"), + ) + for x in self.report_spec.run_summary.selected_spacy_model + ] + + if len(model_cards) <= 1: + return dp.Group( + blocks=model_cards, + label="MODEL CARD", + ) + return dp.Group( + dp.Select( + blocks=model_cards, + type=dp.SelectType.TABS, + ), + label="MODEL CARD", + ) + + def _build_total_entity_df(self) -> pd.DataFrame: + frames = [] + for row in self.rows_details: # RowReportFields + frames.append(build_entity_df(entites=row.spec.entities, id=row.spec.id)) + + result = pd.concat(frames) + return result + + def _get_summary_desc(self) -> str: + entities_mark_down = [ + "**" + ent + "**" for ent in self.report_spec.run_summary.selected_entities + ] + + model_description = "" + for spacy_model in self.report_spec.run_summary.selected_spacy_model: + model_description = ( + model_description + + f"You chose the **{spacy_model.get('model', 'unknown model')}** model for **{spacy_model.get('spacy_entites', 'unknown entities')}** detection." + ) + if model_description: + model_description = ( + model_description + + "You can view the model details under the ``MODEL CARD`` tab." + ) + + SUMMARY_REPORT_DESCRIPTION_TEMPLATE = f""" + This report will detail the statistics and configuration of the redaction process.The report will contain information such as the number of rows processed, the number of entities redacted, and so on. The report will provide valuable insight into the performance of the PII tool and facilitate any necessary adjustments to improve its performance. + + Based on the configuration file (you can view the YAML details under the ``YAML`` tab), you selected the following entities: {entities_mark_down}. 
+ {model_description} + """ + return SUMMARY_REPORT_DESCRIPTION_TEMPLATE diff --git a/ads/opctl/operator/lowcode/pii/operator_config.py b/ads/opctl/operator/lowcode/pii/operator_config.py new file mode 100644 index 000000000..d70e8770b --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/operator_config.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +from dataclasses import dataclass, field +from typing import Dict, List + +from ads.common.serializer import DataClassSerializable +from ads.opctl.operator.common.operator_config import OperatorConfig +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.lowcode.pii.constant import ( + DEFAULT_SHOW_ROWS, + DEFAULT_REPORT_FILENAME, + DEFAULT_TARGET_COLUMN, +) + + +@dataclass(repr=True) +class InputData(DataClassSerializable): + """Class representing operator specification input data details.""" + + url: str = None + + +@dataclass(repr=True) +class OutputDirectory(DataClassSerializable): + """Class representing operator specification output directory details.""" + + url: str = None + name: str = None + + +@dataclass(repr=True) +class Report(DataClassSerializable): + """Class representing operator specification report details.""" + + report_filename: str = None + show_rows: int = None + show_sensitive_content: bool = False + + +@dataclass(repr=True) +class Detector(DataClassSerializable): + """Class representing operator specification redactor directory details.""" + + name: str = None + action: str = None + + +@dataclass(repr=True) +class PiiOperatorSpec(DataClassSerializable): + """Class representing pii operator specification.""" + + input_data: InputData = field(default_factory=InputData) + output_directory: OutputDirectory = field(default_factory=OutputDirectory) + report: Report = field(default_factory=Report) + target_column: str = None + detectors: List[Dict] = field(default_factory=list) + + def __post_init__(self): + """Adjusts the specification details.""" + + self.target_column = self.target_column or DEFAULT_TARGET_COLUMN + self.report = self.report or Report.from_dict( + { + "report_filename": DEFAULT_REPORT_FILENAME, + "show_rows": DEFAULT_SHOW_ROWS, + "show_sensitive_content": False, + } + ) + + +@dataclass(repr=True) +class PiiOperatorConfig(OperatorConfig): + """Class representing pii operator config. + + Attributes + ---------- + kind: str + The kind of the resource. For operators it is always - `operator`. + type: str + The type of the operator. For pii operator it is always - `pii` + version: str + The version of the operator. + spec: PiiOperatorSpec + The pii operator specification. + """ + + kind: str = "operator" + type: str = "pii" + version: str = "v1" + spec: PiiOperatorSpec = field(default_factory=PiiOperatorSpec) + + @classmethod + def _load_schema(cls) -> str: + """Loads operator schema.""" + return _load_yaml_from_uri( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "schema.yaml") + ) diff --git a/ads/opctl/operator/lowcode/pii/schema.yaml b/ads/opctl/operator/lowcode/pii/schema.yaml new file mode 100644 index 000000000..ff295c7fa --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/schema.yaml @@ -0,0 +1,108 @@ +kind: + allowed: + - operator + required: true + type: string + default: operator + meta: + description: "Which service are you trying to use? 
Common kinds: `operator`, `job`" + +version: + allowed: + - "v1" + required: true + type: string + default: v1 + meta: + description: "Operators may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility." + +type: + required: true + type: string + default: pii + meta: + description: "Type should always be `pii` when using a pii operator" + + +spec: + required: true + schema: + input_data: + required: true + type: dict + meta: + description: "This should be indexed by target column." + schema: + url: + required: true + type: string + default: data.csv + meta: + description: "The url can be local, or remote. For example: `oci://@/data.csv`" + + output_directory: + required: true + schema: + url: + required: true + type: string + default: result/ + meta: + description: "The url can be local, or remote. For example: `oci://@/`" + name: + required: false + type: string + default: data-out.csv + type: dict + + report: + required: false + schema: + report_filename: + required: true + type: string + default: report.html + meta: + description: "Placed into `output_directory` location. Defaults to `report.html`" + show_rows: + required: false + type: number + meta: + description: "The number of rows that shows in the report. Defaults to `10`" + show_sensitive_content: + required: true + default: false + type: boolean + meta: + description: "Whether to show sensitive content in the report. Defaults to `False`" + type: dict + + target_column: + type: string + required: true + default: target + meta: + description: "Column with user data." + + detectors: + type: list + required: true + schema: + type: dict + schema: + name: + required: true + type: string + meta: + description: "The name of the detector. THe format is `.`." + action: + required: true + type: string + default: mask + allowed: + - anonymize + - mask + - remove + meta: + description: "The way to process the detected entity. Default to `mask`." + type: dict diff --git a/ads/opctl/operator/lowcode/pii/utils.py b/ads/opctl/operator/lowcode/pii/utils.py new file mode 100644 index 000000000..50f28eed9 --- /dev/null +++ b/ads/opctl/operator/lowcode/pii/utils.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import sys + +import fsspec +import pandas as pd + +from ads.common.object_storage_details import ObjectStorageDetails + +from .errors import PIIInputDataError + + +def default_signer(**kwargs): + os.environ["EXTRA_USER_AGENT_INFO"] = "Pii-Operator" + from ads.common.auth import default_signer + + return default_signer(**kwargs) + + +def _call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs): + if fsspec.utils.get_protocol(filename) == "file": + return pd_fn(filename, **kwargs) + + storage_options = storage_options or ( + default_signer() if ObjectStorageDetails.is_oci_path(filename) else {} + ) + + return pd_fn(filename, storage_options=storage_options, **kwargs) + + +def _load_data(filename, format=None, storage_options=None, columns=None, **kwargs): + if not format: + _, format = os.path.splitext(filename) + format = format[1:] + if format in ["json", "csv"]: + read_fn = getattr(pd, f"read_{format}") + data = _call_pandas_fsspec(read_fn, filename, storage_options=storage_options) + elif format in ["tsv"]: + data = _call_pandas_fsspec( + pd.read_csv, filename, storage_options=storage_options, sep="\t" + ) + else: + raise PIIInputDataError(f"Unrecognized format: {format}") + if columns: + # keep only these columns, done after load because only CSV supports stream filtering + data = data[columns] + return data + + +def _write_data( + data, filename, format=None, storage_options=None, index=False, **kwargs +): + if not format: + _, format = os.path.splitext(filename) + format = format[1:] + if format in ["json", "csv"]: + write_fn = getattr(data, f"to_{format}") + return _call_pandas_fsspec( + write_fn, filename, index=index, storage_options=storage_options + ) + raise PIIInputDataError(f"Unrecognized format: {format}") + + +def get_output_name(given_name, target_name=None): + """Add ``-out`` suffix to the src filename.""" + if not target_name: + basename = os.path.basename(given_name) + fn, ext = os.path.splitext(basename) + target_name = fn + "_out" + ext + return target_name + + +def construct_filth_cls_name(name: str) -> str: + """Constructs the filth class name from the given name. + For example, "name" -> "NameFilth". + + Args: + name (str): filth class name. + + Returns: + str: The filth class name. + """ + return "".join([s.capitalize() for s in name.split("_")]) + "Filth" + + +################ +# Report utils # +################ +def compute_rate(elapsed_time, num_unit): + return elapsed_time / num_unit + + +def human_time_friendly(seconds): + TIME_DURATION_UNITS = ( + ("week", 60 * 60 * 24 * 7), + ("day", 60 * 60 * 24), + ("hour", 60 * 60), + ("min", 60), + ) + if seconds == 0: + return "inf" + accumulator = [] + for unit, div in TIME_DURATION_UNITS: + amount, seconds = divmod(float(seconds), div) + if amount > 0: + accumulator.append( + "{} {}{}".format(int(amount), unit, "" if amount == 1 else "s") + ) + accumulator.append("{} secs".format(round(seconds, 2))) + return ", ".join(accumulator) + + +# Disable +def block_print(): + sys.stdout = open(os.devnull, "w") + + +# Restore +def enable_print(): + sys.stdout = sys.__stdout__ diff --git a/ads/opctl/operator/runtime/__init__.py b/ads/opctl/operator/runtime/__init__.py new file mode 100644 index 000000000..b8d0460f5 --- /dev/null +++ b/ads/opctl/operator/runtime/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/opctl/operator/runtime/const.py b/ads/opctl/operator/runtime/const.py new file mode 100644 index 000000000..356af3d00 --- /dev/null +++ b/ads/opctl/operator/runtime/const.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +from .runtime import ContainerRuntime, PythonRuntime + +RUNTIME_TYPE_MAP = { + ContainerRuntime.type: ContainerRuntime, + PythonRuntime.type: PythonRuntime, +} diff --git a/ads/opctl/operator/runtime/container_runtime_schema.yaml b/ads/opctl/operator/runtime/container_runtime_schema.yaml new file mode 100644 index 000000000..7ec4ee483 --- /dev/null +++ b/ads/opctl/operator/runtime/container_runtime_schema.yaml @@ -0,0 +1,50 @@ +kind: + allowed: + - operator.local + required: true + type: string + meta: + description: "The operator local runtime. Kind should always be `operator.local` when using an operator with local container runtime." +version: + allowed: + - "v1" + required: true + type: string + meta: + description: "Operator local runtime may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility." +type: + allowed: + - container + required: true + type: string + meta: + description: "Type should always be `container` when using an operator with local container runtime." +spec: + required: true + type: dict + schema: + image: + nullable: true + required: false + type: string + default: image:tag + meta: + description: "The image to run the operator. By default will be used the operator name with latest tag." + env: + nullable: true + required: false + type: list + schema: + type: dict + schema: + name: + type: string + value: + type: + - number + - string + volume: + required: false + type: + - string + - list diff --git a/ads/opctl/operator/runtime/python_runtime_schema.yaml b/ads/opctl/operator/runtime/python_runtime_schema.yaml new file mode 100644 index 000000000..f3523d3aa --- /dev/null +++ b/ads/opctl/operator/runtime/python_runtime_schema.yaml @@ -0,0 +1,21 @@ +kind: + allowed: + - operator.local + required: true + type: string + meta: + description: "The operator local runtime. Kind should always be `operator.local` when using an operator with local python runtime." +version: + allowed: + - "v1" + required: true + type: string + meta: + description: "Operator local runtime may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility." +type: + allowed: + - python + required: true + type: string + meta: + description: "Type should always be `python` when using an operator with local python runtime." diff --git a/ads/opctl/operator/runtime/runtime.py b/ads/opctl/operator/runtime/runtime.py new file mode 100644 index 000000000..d53fb1450 --- /dev/null +++ b/ads/opctl/operator/runtime/runtime.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +import json +import os +from dataclasses import dataclass, field +from typing import Any, ClassVar, Dict, List + +from cerberus import Validator + +from ads.common.extended_enum import ExtendedEnum +from ads.common.serializer import DataClassSerializable +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.common.errors import OperatorSchemaYamlError + +class OPERATOR_LOCAL_RUNTIME_TYPE(ExtendedEnum): + PYTHON = "python" + CONTAINER = "container" + + +OPERATOR_LOCAL_RUNTIME_KIND = "operator.local" + + +@dataclass(repr=True) +class Runtime(DataClassSerializable): + """Base class for the operator's runtimes.""" + + _schema: ClassVar[str] = "" + kind: str = OPERATOR_LOCAL_RUNTIME_KIND + type: str = None + version: str = None + spec: Any = None + + @classmethod + def _validate_dict(cls, obj_dict: Dict) -> bool: + """Validates the operator specification. + + Parameters + ---------- + obj_dict: (dict) + Dictionary representation of the object + + Returns + ------- + bool + True if the validation passed, else False. + """ + schema = _load_yaml_from_uri( + os.path.join(os.path.dirname(os.path.abspath(__file__)), cls._schema) + ) + validator = Validator(schema, purge_unknown=True) + result = validator.validate(obj_dict) + + if not result: + raise OperatorSchemaYamlError(json.dumps(validator.errors, indent=2)) + return True + + +@dataclass(repr=True) +class ContainerRuntimeSpec(DataClassSerializable): + """Represents a container operator runtime specification.""" + + image: str = None + env: List[Dict] = field(default_factory=list) + volume: List[str] = field(default_factory=list) + + +@dataclass(repr=True) +class ContainerRuntime(Runtime): + """Represents a container operator runtime.""" + + _schema: ClassVar[str] = "container_runtime_schema.yaml" + type: str = OPERATOR_LOCAL_RUNTIME_TYPE.CONTAINER.value + version: str = "v1" + spec: ContainerRuntimeSpec = field(default_factory=ContainerRuntimeSpec) + + @classmethod + def init(cls, **kwargs: Dict) -> "ContainerRuntime": + """Initializes a starter specification for the runtime. + + Returns + ------- + ContainerRuntime + The runtime instance. + """ + return cls(spec=ContainerRuntimeSpec.from_dict(kwargs)) + + +@dataclass(repr=True) +class PythonRuntime(Runtime): + """Represents a python operator runtime.""" + + _schema: ClassVar[str] = "python_runtime_schema.yaml" + type: str = OPERATOR_LOCAL_RUNTIME_TYPE.PYTHON.value + version: str = "v1" + + @classmethod + def init(cls, **kwargs: Dict) -> "PythonRuntime": + """Initializes a starter specification for the runtime. + + Returns + ------- + PythonRuntime + The runtime instance. + """ + return cls() diff --git a/ads/opctl/spark/cli.py b/ads/opctl/spark/cli.py index f63b0be6b..9d571e997 100644 --- a/ads/opctl/spark/cli.py +++ b/ads/opctl/spark/cli.py @@ -15,6 +15,7 @@ @click.group("spark") @click.help_option("--help", "-h") def commands(): + "The CLI to assist in the management of the Spark workloads." 
pass diff --git a/ads/opctl/utils.py b/ads/opctl/utils.py index 54ffd9d93..a0dcaa69b 100644 --- a/ads/opctl/utils.py +++ b/ads/opctl/utils.py @@ -11,9 +11,7 @@ import subprocess import sys import shlex -import tempfile import urllib.parse -from distutils import dir_util from subprocess import Popen, PIPE, STDOUT from typing import Union, List, Tuple, Dict import yaml @@ -24,9 +22,7 @@ from ads.opctl import logger from ads.opctl.constants import ( ML_JOB_IMAGE, - OPS_IMAGE_BASE, ML_JOB_GPU_IMAGE, - OPS_IMAGE_GPU_BASE, ) from ads.common.decorator.runtime_dependency import ( runtime_dependency, @@ -96,12 +92,6 @@ def get_region_key(auth: dict) -> str: return client.get_tenancy(tenancy).data.home_region_key -# Not needed at the moment -# def _get_compartment_name(compartment_id: str, auth: dict) -> str: -# client = OCIClientFactory(**auth).identity -# return client.get_compartment(compartment_id=compartment_id).data.name - - def publish_image(image: str, registry: str = None) -> None: # pragma: no cover """ Publish an image. @@ -122,6 +112,7 @@ def publish_image(image: str, registry: str = None) -> None: # pragma: no cover print(f"pushed {image}") return image else: + registry = registry.rstrip("/") run_command( ["docker", "tag", f"{image}", f"{registry}/{os.path.basename(image)}"] ) @@ -130,22 +121,18 @@ def publish_image(image: str, registry: str = None) -> None: # pragma: no cover return f"{registry}/{os.path.basename(image)}" -def build_image( - image_type: str, gpu: bool = False, source_folder: str = None, dst_image: str = None -) -> None: +def build_image(image_type: str, gpu: bool = False) -> None: """ Build an image for opctl. Parameters ---------- image_type: str - specify the image to build, can take 'job-local' or 'ads-ops-base', + specify the image to build, can take 'job-local', former for running job with conda pack locally, latter for running operators gpu: bool whether to use gpu version of image - source_folder: str - source folder when building custom operator, to be included in custom image dst_image: str image to save as when building custom operator @@ -205,40 +192,6 @@ def _get_image_name_dockerfile_target(type: str, gpu: bool, arch: str) -> str: return look_up[(type, gpu, arch)] -@runtime_dependency(module="docker", install_from=OptionalDependency.OPCTL) -def _build_custom_operator_image( - gpu: bool, source_folder: str, dst_image: str -) -> None: # pragma: no cover - operator = os.path.basename(source_folder) - base_image_name = OPS_IMAGE_BASE if not gpu else OPS_IMAGE_GPU_BASE - try: - client = docker.from_env() - client.api.inspect_image(base_image_name) - except docker.errors.ImageNotFound: - build_image("ads-ops-base", gpu) - with tempfile.TemporaryDirectory() as td: - dir_util.copy_tree(source_folder, os.path.join(td, operator)) - if os.path.exists(os.path.join(td, operator, "environment.yaml")): - with open(os.path.join(td, "Dockerfile"), "w") as f: - f.write( - f""" -FROM {base_image_name} -COPY ./{operator}/environment.yaml operators/{operator}/environment.yaml -RUN conda env update -f operators/{operator}/environment.yaml --name op_env && conda clean -afy -COPY ./{operator} operators/{operator} - """ - ) - else: - with open(os.path.join(td, "Dockerfile"), "w") as f: - f.write( - f""" -FROM {base_image_name} -COPY ./{operator} operators/{operator} - """ - ) - return run_command(["docker", "build", "-t", f"{dst_image}", "."], td) - - def run_command( cmd: Union[str, List[str]], cwd: str = None, shell: bool = False ) -> Popen: @@ -298,6 +251,8 @@ def 
wrapper(*args, **kwargs): @runtime_dependency(module="docker", install_from=OptionalDependency.OPCTL) def get_docker_client() -> "docker.client.DockerClient": + import docker + process = subprocess.Popen( ["docker", "info"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT ) @@ -315,7 +270,9 @@ def run_container( command: str = None, entrypoint: str = None, verbose: bool = False, -): +) -> int: + import docker + if env_vars is None: env_vars = {} # If Proxy variables are setup, pass it on to the docker run diff --git a/ads/pipeline/ads_pipeline.py b/ads/pipeline/ads_pipeline.py index fc67bd104..73b247876 100644 --- a/ads/pipeline/ads_pipeline.py +++ b/ads/pipeline/ads_pipeline.py @@ -1981,7 +1981,7 @@ def status(self) -> Optional[str]: return self.data_science_pipeline.lifecycle_state return None - def init(self) -> "Pipeline": + def init(self, **kwargs) -> "Pipeline": """Initializes a starter specification for the Pipeline. Returns diff --git a/dev-requirements.txt b/dev-requirements.txt index 2662845e7..038d2bfe2 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ -r test-requirements.txt --e ".[bds,data,geo,huggingface,notebook,onnx,opctl,optuna,spark,tensorflow,text,torch,viz]" +-e ".[bds,data,geo,huggingface,notebook,onnx,opctl,optuna,spark,tensorflow,text,torch,viz,forecast,pii]" arff category_encoders dask diff --git a/docs/source/index.rst b/docs/source/index.rst index cc3d35e59..ca4e6b4d2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -35,6 +35,16 @@ Oracle Accelerated Data Science (ADS) user_guide/cli/opctl/configure user_guide/cli/opctl/local-development-setup +.. toctree:: + :hidden: + :maxdepth: 5 + :caption: Operators: + + user_guide/operators/index + user_guide/operators/common/index + user_guide/operators/forecasting_operator/index + user_guide/operators/pii_operator/index + .. toctree:: :hidden: :maxdepth: 5 diff --git a/docs/source/user_guide/cli/opctl/configure.rst b/docs/source/user_guide/cli/opctl/configure.rst index 4990951d5..88313d2e1 100644 --- a/docs/source/user_guide/cli/opctl/configure.rst +++ b/docs/source/user_guide/cli/opctl/configure.rst @@ -28,6 +28,7 @@ This will prompt you to setup default ADS CLI configurations for each OCI profil [OCI] oci_config = ~/.oci/config oci_profile = ANOTHERPROF + auth = api_key # security_token, instance_principal, resource_principal [CONDA] conda_pack_folder = @@ -137,7 +138,7 @@ To generate starter specification run - ads opctl init --help -The resource type is a mandatory attribute that needs to be provided. Currently supported resource types - `dataflow`, `deployment`, `job` and `pipeline`. +The resource type is a mandatory attribute that needs to be provided. Currently supported resource types - ``dataflow``, ``deployment``, ``job`` and ``pipeline``. For instance to generate starter specification for the Data Science job, run - .. code-block:: @@ -149,10 +150,10 @@ The resulting YAML will be printed in the console. By default the ``python`` run **Supported runtimes** - - For a ``job`` - `container`, `gitPython`, `notebook`, `python` and `script`. - - For a ``pipeline`` - `container`, `gitPython`, `notebook`, `python` and `script`. - - For a ``dataflow`` - `dataFlow` and `dataFlowNotebook`. - - For a ``deployment`` - `conda` and `container`. + - For a ``job`` - ``container``, ``gitPython``, ``notebook``, ``python`` and ``script``. + - For a ``pipeline`` - ``container``, ``gitPython``, ``notebook``, ``python`` and ``script``. 
+ - For a ``dataflow`` - ``dataFlow`` and ``dataFlowNotebook``. + - For a ``deployment`` - ``conda`` and ``container``. If you want to specify a particular runtime use - @@ -166,4 +167,3 @@ Use the ``--output`` attribute to save the result in a YAML file. .. code-block:: ads opctl init job --runtime-type container --output job_with_container_runtime.yaml - diff --git a/docs/source/user_guide/operators/common/explore.rst b/docs/source/user_guide/operators/common/explore.rst new file mode 100644 index 000000000..8e67294c8 --- /dev/null +++ b/docs/source/user_guide/operators/common/explore.rst @@ -0,0 +1,273 @@ +============================= +Explore & Configure Operators +============================= + +After ensuring that you have all the necessary prerequisites in order, the next step is to explore and configure the operators. This guide will take you through the process, utilizing the Command Line Interface (CLI) tool to assist you in this endeavor. This step is a preliminary one that precedes the execution of operators. + +.. admonition:: Prerequisites + :class: note + + Before we start, let's ensure you have everything you need for easy starting. If you haven't already, install the accompanying CLI tool, detailed installation instructions can be found in the links below. + + - :doc:`Install ADS CLI<../../cli/quickstart>` + - :doc:`Configure Defaults<../../cli/opctl/configure>` + + +CLI Overview +============ + +The ``ads operator`` CLI tool is your trusty companion when working with operators. It simplifies the process, making it easy to explore the catalog of registered operators, gain insights into their specific use cases, and configure them to meet your needs. Additionally, this tool provides assistance in constructing Docker containers or setting up Conda environments to execute the operator, all while guiding you through the essential steps for running them. + + +Let's start from the very beginning by calling the following command in your terminal to see a list of supported CLI commands: + +.. code-block:: bash + + ads operator --help + + +This command provides a concise overview of all available commands. + +- ``ads operator list``: Retrieve a list of registered operators with this command. + +- ``ads operator info``: Obtain detailed information about a specific operator using this command. It offers comprehensive instructions on how to configure and run the operator across different environments and runtimes. + +- ``ads operator init``: Generate starter YAML configurations for an operator with this command. + +- ``ads operator verify``: Ensure the correctness of an operator's YAML specification using this command. + +- ``ads operator build-conda``: Build a new Conda environment tailored to a particular operator using this command. + +- ``ads operator publish-conda``: Publish the operator's Conda environment to the Object Storage bucket with this command. + +- ``ads operator build-image``: Create a new image customized for the operator using this command. + +- ``ads operator publish-image``: Publish the operator's image to the container registry with this command. + + +Listing Operators +----------------- + +Begin by browsing our operator catalog to discover the pre-packaged solutions available for various data science tasks. The catalog provides short descriptions of each operator's capabilities and use cases. + +.. code-block:: bash + + ads operator list + + +.. 
figure:: figures/operator_list.png + :align: center + + +Getting an Operator's Details +----------------------------- + +Each operator is accompanied by highly detailed instructions explaining how it can be configured and executed in various environments. + +.. code-block:: bash + + ads operator info --help + + +.. figure:: figures/operator_info.png + :align: center + + +.. code-block:: bash + + ads operator info --type + + +.. figure:: figures/operator_info1.png + :align: center + + +Initializing an Operator's Configs +---------------------------------- + +The ``init`` command is indeed crucial as it generates the foundational configurations tailored for a specific operator. + +Before you start, make sure to complete the :doc:`Configure Defaults<../../cli/opctl/configure>` step. This step is essential as it establishes default values for different options when running the operator on OCI Data Science jobs or OCI Data Flow applications. If you have already completed this setup and are using a flexible shape, remember to adjust the ``ml_job_config.ini`` and ``dataflow_config.ini`` files with the shape configuration details and ``docker_registry`` information. + +**ml_job_config.ini** + +- ``ocpus = 1`` +- ``memory_in_gbs = 16`` +- ``docker_registry = `` + +**dataflow_config.ini** + +- ``driver_shape = VM.Standard.E4.Flex`` +- ``executor_shape = VM.Standard.E4.Flex`` +- ``driver_shape_ocpus = 1`` +- ``executor_shape_ocpus = 1`` +- ``driver_shape_memory_in_gbs = 16`` +- ``executor_shape_memory_in_gbs = 16`` + +Now let's explore the ``init`` command. + +.. code-block:: bash + + ads operator init --help + +.. figure:: figures/operator_init.png + :align: center + +To set up your initial configuration files, execute the following command in your terminal: + +.. code-block:: bash + + ads operator init --type --overwrite --output ~/ + +By default, this command will generate separate configuration files for the operator and supported backends. This separation allows you to maintain a single configuration for the operator while having multiple configurations for different backend environments. This design enables running the same operator in various environments without altering the operator's configuration. + +The following flags are available for customization: + +- ``--overwrite``: This flag is used to overwrite existing files in the ``~//config`` folder. + +- ``--output``: Use this flag to specify the output folder where the configuration files will be generated. + +- ``--type``: This flag is mandatory and is used to specify the operator type. + +Upon executing the command, a list of configuration files will be created in the ``~/`` folder. + +If you need to merge the operator's configuration file with the backend configuration files into a single configuration file, use the following command: + +.. code-block:: bash + + ads operator init --type --overwrite --output ~/ --merge-config + +This command will combine the operator and backend configurations into one cohesive file, simplifying running the operator with one configuration file. + + +**The essential files generated include:** + +- **.yaml**: Contains configuration related to particular operator. Will only be generated if ``--merge-config`` flag is not used. +- **_operator_local_python.yaml**: This file includes local backend configuration for running operator in a local environment. You must manually set up the environment before executing the operator. 
+- **_local_container.yaml**: This file contains local backend configuration for running operator within a local container. You should build the container before running the operator, following the instructions below. +- **_job_container.yaml**: Contains Data Science job-related configuration for running operator in a container (BYOC) runtime. The container must be built and published before executing the operator, as detailed below. For comprehensive details about the supported configuration options, including the schema and available settings, please refer to the :doc:`OCI Data Science Jobs<../../jobs/yaml_schema>` documentation. +- **_job_python.yaml**: Contains Data Science job-related configuration to run operator in a Data Science job within a conda runtime. The conda environment should be built and published before running the operator. For comprehensive details about the supported configuration options, including the schema and available settings, please refer to the :doc:`OCI Data Science Jobs YAML Schema<../../jobs/yaml_schema>` documentation. +- **b_dataflow_dataflow.yaml**: Contains Data Flow application-related configuration to run operator in a Data Flow application. The conda environment should be built and published before running the operator. For comprehensive details about the supported configuration options, including the schema and available settings, please refer to the :doc:`Data Flow Application YAML Schema<../../apachespark/datafloe>` documentation. + +These generated configurations are designed to be ready for use without additional adjustments. However, they are provided as starter kit configurations that can be customized as needed. + +The operator's configuration file, named as ``.yaml``, is generated based on the operator's schema and contains the essential input attributes required to run the operator. These attributes serve as the bare minimum configuration for the operator to function. + +However, in cases where the operator requires specific input or output sources of data, you may need to adjust the configuration manually to accommodate these requirements. + +Beyond the basic input attributes, additional configurations within the YAML file are generated based on the information provided during the pre-step of configuring defaults, as detailed in the :doc:`Configure Defaults<../../cli/opctl/configure>` documentation. These configurations are populated using environment variables as well, ensuring that the operator runs with the necessary settings and parameters. + +In summary, while the core configuration is automatically generated from the operator's schema, you have the flexibility to fine-tune and customize the configuration to match your specific data science needs. + + +Verifying an Operator's Config +------------------------------ + +Before proceeding to run an operator, it's essential to verify the operator's configuration. Running an operator can be resource-intensive, so it's a valuable step to ensure that the configuration is correct before initiating the operation. Once you have obtained the operator's configuration (specification) in YAML format, you can use the following command to verify it: + +.. code-block:: bash + + ads operator verify --help + +.. figure:: figures/operator_config_verify.png + :align: center + +The configuration file can be located in an Object Storage bucket or in a local repository. To specify the authentication type, use the ``--auth`` attribute. 
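+For example, a hypothetical configuration stored in Object Storage could be verified with resource principal authentication (the bucket path below is only illustrative):
+
+.. code-block:: bash
+
+    ads operator verify -f oci://<bucket>@<namespace>/pii.yaml --auth resource_principal
+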
+ +Verification helps you catch any errors or inconsistencies in the operator's configuration, ensuring that it will run smoothly and produce the desired results when executed. + +.. code-block:: bash + + ads operator verify -f ~//.yaml + +.. figure:: figures/operator_config_verify_result.png + :align: center + +Building an Operator's Image +---------------------------- + +In order to run an operator within a local container or utilize it with the OCI Data Science Jobs service's BYOC (Bring Your Own Container) runtime, you must first create a dedicated container image for the specific operator. This process is straightforward and only requires that you have the Docker CLI installed on your local machine. Below, we outline the steps to build your custom container. + +.. code-block:: bash + + ads operator build-image --help + +.. figure:: figures/build_operator_image.png + :align: center + +The fundamental attribute you need to provide is ``--type``, which represents the name of the operator. The operator's name and version will be automatically used as the image name and tag. + +.. code-block:: bash + + ads operator build-image --type + +An interesting point to note is that the operator's container can be built to accommodate both CPU and GPU architectures, although this capability depends on the specific operator's requirements. + +Once the operator's image is successfully built, you have the option to publish it to the `Oracle Container Registry `_. This publishing step becomes necessary when you intend to run the operator on the OCI Data Science Jobs service within the BYOC runtime. + + +Publishing an Operator's Image +------------------------------- + +After successfully building the operator's image, the next step is to publish it to the `Oracle Container Registry `_. As mentioned previously, publishing the image is specifically required when you intend to run the operator on the OCI Data Science Jobs service within the BYOC runtime. Fortunately, this step is quite straightforward. Below, you'll find instructions on how to publish the image to the Container Registry. + +.. code-block:: bash + + ads operator publish-image --help + +.. figure:: figures/publish_operator_image.png + :align: center + +The only mandatory parameter for this command is the operator type that you wish to publish. + +.. code-block:: bash + + ads operator publish-image --type + +While the image name is the only required parameter, you also have the option to provide the ``registry`` parameter if needed. By default, the information about the registry where the container should be published is retrieved from the ADS config generated during the :doc:`Configure Defaults<../../cli/opctl/configure>` step. + +For more detailed information on publishing containers, you can refer to the `Oracle Container Registry `_ documentation. + + +Building an Operator's Conda Environment +---------------------------------------- + +Another option for running an operator on OCI resources is to utilize a Conda environment. Building an operator's Conda environment is necessary if you intend to run the operator on the OCI Data Science Jobs service within the Conda runtime or on the Data Flow service. Additionally, the Conda environment can be employed within a Data Science Notebook Session to execute a specific operator. + +To build the operator's Conda environment, follow these steps: + +.. code-block:: bash + + ads operator build-conda --help + +.. 
figure:: figures/build_operator_conda.png + :align: center + +The only mandatory parameter for this command is the ``--type`` of the operator. However, you also have the option to specify the destination folder for the Conda environment. By default, the information about the destination folder where the Conda environment should be created is retrieved from the ADS config generated during the :doc:`Configure Defaults<../../cli/opctl/configure>` step. + +.. code-block:: bash + + ads operator build-conda --type + +Once you have successfully built the Conda environment, you will need to publish it to OCI Object Storage. This step allows the OCI Data Science Jobs and Data Flow services to utilize the Conda environment seamlessly. + + +Publishing an Operator's Conda Environment +------------------------------------------ + +To make a locally built Conda environment available in the OCI Object Storage bucket, follow these simple steps: + +.. code-block:: bash + + ads operator publish-conda --help + +.. figure:: figures/publish_operator_conda.png + :align: center + +For instance, if you have constructed a Conda environment for the specific operator, the command would appear as follows: + +.. code-block:: bash + + ads operator publish-conda -t + +Publishing the Conda environment to OCI Object Storage enables the OCI Data Science Jobs and Data Flow services to access and utilize this environment efficiently. This step is essential to ensure that your operators run seamlessly within the OCI ecosystem. diff --git a/docs/source/user_guide/operators/common/figures/build_operator_conda.png b/docs/source/user_guide/operators/common/figures/build_operator_conda.png new file mode 100644 index 000000000..814f9ebb0 Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/build_operator_conda.png differ diff --git a/docs/source/user_guide/operators/common/figures/build_operator_image.png b/docs/source/user_guide/operators/common/figures/build_operator_image.png new file mode 100644 index 000000000..5da43a327 Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/build_operator_image.png differ diff --git a/docs/source/user_guide/operators/common/figures/operator_config_verify.png b/docs/source/user_guide/operators/common/figures/operator_config_verify.png new file mode 100644 index 000000000..9dd736b54 Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/operator_config_verify.png differ diff --git a/docs/source/user_guide/operators/common/figures/operator_config_verify_result.png b/docs/source/user_guide/operators/common/figures/operator_config_verify_result.png new file mode 100644 index 000000000..8134cd0df Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/operator_config_verify_result.png differ diff --git a/docs/source/user_guide/operators/common/figures/operator_info.png b/docs/source/user_guide/operators/common/figures/operator_info.png new file mode 100644 index 000000000..8a8ae1346 Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/operator_info.png differ diff --git a/docs/source/user_guide/operators/common/figures/operator_info1.png b/docs/source/user_guide/operators/common/figures/operator_info1.png new file mode 100644 index 000000000..73ee66f6c Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/operator_info1.png differ diff --git a/docs/source/user_guide/operators/common/figures/operator_init.png 
b/docs/source/user_guide/operators/common/figures/operator_init.png new file mode 100644 index 000000000..7f9d94d9a Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/operator_init.png differ diff --git a/docs/source/user_guide/operators/common/figures/operator_list.png b/docs/source/user_guide/operators/common/figures/operator_list.png new file mode 100644 index 000000000..a078dbc32 Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/operator_list.png differ diff --git a/docs/source/user_guide/operators/common/figures/publish_operator_conda.png b/docs/source/user_guide/operators/common/figures/publish_operator_conda.png new file mode 100644 index 000000000..f058345c1 Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/publish_operator_conda.png differ diff --git a/docs/source/user_guide/operators/common/figures/publish_operator_image.png b/docs/source/user_guide/operators/common/figures/publish_operator_image.png new file mode 100644 index 000000000..13e035489 Binary files /dev/null and b/docs/source/user_guide/operators/common/figures/publish_operator_image.png differ diff --git a/docs/source/user_guide/operators/common/index.rst b/docs/source/user_guide/operators/common/index.rst new file mode 100644 index 000000000..d8601d55e --- /dev/null +++ b/docs/source/user_guide/operators/common/index.rst @@ -0,0 +1,31 @@ +=============== +Getting Started +=============== + +Welcome to the world of operators! Getting started with operators is a breeze, and this section will guide you through the process step by step. Whether you're a seasoned data scientist or a newcomer, you'll find that harnessing the power of operators is both accessible and rewarding. + + +.. admonition:: Prerequisites + :class: note + + Before diving into operators, let's ensure you have everything you need for easy starting. If you haven't already, install the operators and the accompanying CLI tool. Detailed installation instructions can be found in the links below. + + - :doc:`Install ADS CLI<../../cli/quickstart>` + - :doc:`Configure Defaults<../../cli/opctl/configure>` + +After completing the necessary prerequisites, in order to fully leverage the capabilities of operators, it may become necessary to configure :doc:`IAM Policies<./policies>` tailored to the specific environment you are working in. Nevertheless, you retain the flexibility to postpone this task until you advance to the subsequent step of executing the operators. + +Utilizing our CLI tool makes it a straightforward process to explore the catalog of registered operators and gain insights into the specific challenges they address. Simply follow the :doc:`Explore Operators<./explore>` step to access further information. + +After completing the exploration and configuration of operators, the next step is to execute them on your chosen backend. For more detailed instructions, please refer to the :doc:`How To Run<./run>` section. + +With these comprehensive guides, you'll quickly become proficient in using operators across various environments. Don't hesitate to reach out to our support team if you encounter any issues or have questions along the way. Happy data science with operators! + + +.. 
toctree:: + :hidden: + :maxdepth: 1 + + ./policies + ./explore + ./run diff --git a/docs/source/user_guide/operators/common/policies.rst b/docs/source/user_guide/operators/common/policies.rst new file mode 100644 index 000000000..746f6046e --- /dev/null +++ b/docs/source/user_guide/operators/common/policies.rst @@ -0,0 +1,33 @@ +============ +IAM Policies +============ + +To unleash the full potential of operators, you might need to configure corresponding IAM policies. + + +Object Storage +~~~~~~~~~~~~~~ + +In order to store the results in an Oracle Cloud Infrastructure Object Storage bucket and retrieve source data from there, it may be necessary to set up specific policies for these actions. Find more details for writing policies to control access to Archive Storage, Object Storage, and Data Transfer on this `page `_. However every service like `Data Science Jobs `_ and `Data Flow Applications `_ have their own policies to access Object Storage. It would be preferred to start from the `About Data Science Policies `_ document, to understand the common conception of the Data Science policies. + + +Oracle Container Registry +~~~~~~~~~~~~~~~~~~~~~~~~~ + +`Oracle Cloud Infrastructure Registry `_ (also known as Container Registry) is an Oracle-managed registry that enables you to simplify your development to production workflow. To facilitate the publication of an operator's containers to the Oracle Container Registry, you may be required to configure the authentication `token `_ for this purpose. + + +Data Science Job +~~~~~~~~~~~~~~~~ + +If you're running operators within Oracle Cloud Infrastructure `Data Science Jobs `_, ensure you have the appropriate :doc:`policies <../../jobs/policies>` in place to grant access and permissions. It is advisable to begin with the `About Data Science Policies `_ document to comprehend the fundamental concepts of Data Science policies. + + +Data Flow Application +~~~~~~~~~~~~~~~~~~~~~ + +Oracle Cloud Infrastructure `Data Flow `_ is a fully managed service for running Apache Spark â„¢ applications, offering a simplified runtime environment for execution. Data Flow can serve as one of the backends for operators. However, `Data Flow `_ requires IAM policies to access resources for managing and running sessions. Refer to the `Data Flow Studio Policies `_ documentation for guidance on policy setup. + +After configuring the core Data Flow policies, consult the `Policies Required to Integrate Data Flow and Data Science `_ documentation to enable Data Flow to write data to the Object Storage bucket and manage logs effectively. + +To set up Object Storage for Data Flow, follow the `Set Up Object Store `_ documentation. diff --git a/docs/source/user_guide/operators/common/run.rst b/docs/source/user_guide/operators/common/run.rst new file mode 100644 index 000000000..b8535a39f --- /dev/null +++ b/docs/source/user_guide/operators/common/run.rst @@ -0,0 +1,350 @@ +========== +How To Run +========== + +It's time to run operators in your chosen backend. + +.. admonition:: Prerequisites + :class: note + + Before we start, let's ensure you have everything you need for easy starting. If you haven't already, install the accompanying CLI tool, detailed installation instructions can be found in the links below. 
+
+   - :doc:`Install ADS CLI<../../cli/quickstart>`
+   - :doc:`Configure Defaults<../../cli/opctl/configure>`
+   - :doc:`Explore & Configure Operators<./explore>`
+   - :doc:`IAM Policies<./policies>`
+
+
+The first step is to generate starter kit configurations that simplify the execution of the operator across different backends. This can be done easily using the following command:
+
+.. code-block:: bash
+
+   ads operator init --help
+
+.. figure:: figures/operator_init.png
+   :align: center
+
+.. admonition:: Important
+   :class: warning
+
+   If the ``--merge-config`` flag is set to ``true``, the ``.yaml`` file will be merged with the backend configuration, which contains pre-populated infrastructure and runtime sections. You don't need to provide backend information separately in this case.
+
+   .. code-block:: bash
+
+      ads operator run -f .yaml
+
+   Alternatively, the ``ads opctl run`` command can be used:
+
+   .. code-block:: bash
+
+      ads opctl run -f .yaml
+
+   The operator will run in the chosen environment without requiring additional modifications.
+
+
+Different Ways To Run Operator
+------------------------------
+
+The operator can be run in two different ways:
+
+.. code-block:: bash
+
+   ads operator run -f .yaml
+
+Or alternatively:
+
+.. code-block:: bash
+
+   ads opctl run -f .yaml
+
+Although the two commands above look equivalent, the ``ads operator run`` command is more flexible.
+Here are a few restrictions when running the operator with the ``ads opctl run`` command:
+
+ - The ``.yaml`` file must contain all the necessary information for running the operator. This means that the ``.yaml`` file must contain the ``runtime`` section describing the backend configuration for the operator.
+ - If the ``.yaml`` file does not contain the ``runtime`` section, then the ``ads opctl run`` command can be used in restricted mode with the ``-b`` option. This option allows you to specify the backend to run the operator on. The ``-b`` option can be used with the following backends: ``local``, ``dataflow``, ``job``. However, you will not be able to use the ``-b`` option with the local ``container`` backend or the Data Science Jobs ``container`` backend.
+
+
+Run Operator Locally
+--------------------
+
+There are several ways to run the operator in your local environment. The first option is to run it in the environment you've prepared on your own, assuming you've already installed all the necessary operator packages. The second option is to run the operator within a Docker container, which requires building a Docker image for the operator.
+
+Within Local Environment
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To run the operator locally, follow these steps:
+
+1. Create and activate a new conda environment named ````.
+2. Install all the required libraries listed in the ``environment.yaml`` file generated by the ``ads operator init --type `` command.
+3. Review the ``.yaml`` file generated by the ``ads operator init`` command and make necessary adjustments to input and output file locations. Note that the ``.yaml`` file will not be generated if the ``--merge-config`` flag is set to ``true``.
+4. Verify the operator's configuration using the following command:
+
+.. code-block:: bash
+
+   ads operator verify -f .yaml
+
+5. To run the operator within the ```` conda environment, use this command:
+
+.. code-block:: bash
+
+   ads operator run -f .yaml -b local
+
+The alternative way to run the operator would be to use the ``ads opctl run`` command:
+
+..
code-block:: bash + + ads opctl run -f .yaml -b local + +See the `Different Ways To Run Operator <#different-ways-to-run-operator>`_ section for more details. + +Within Container +~~~~~~~~~~~~~~~~ + +To run the operator within a local container, follow these steps: + +1. Build the operator's container using the following command: + +.. code-block:: bash + + ads operator build-image --type + +This command creates a new ``:`` image with ``/etc/operator`` as the working directory within the container. + +2. Check the ``backend_operator_local_container_config.yaml`` configuration file. It should have a ``volume`` section with the ``.oci`` configs folder mounted, as shown below: + +.. code-block:: yaml + + volume: + - "/Users//.oci:/root/.oci" + +Mounting the OCI configs folder is necessary if you intend to use an OCI Object Storage bucket to store input and output data. You can also mount ``input/output`` folders to the container as needed. + +Following is the YAML schema for validating the runtime YAML using `Cerberus `_: + +.. literalinclude:: ../../../../../ads/opctl/operator/runtime/container_runtime_schema.yaml + :language: yaml + :linenos: + +3. Run the operator within the container using this command:: + +.. code-block:: bash + + ads operator run -f .yaml -b backend_operator_local_container_config.yaml + +Or within a short command: + +.. code-block:: bash + + ads operator run -f .yaml -b local.container + + +The alternative way to run the operator would be to use the ``ads opctl run`` command. However in this case the runtime information needs to be merged within operator's config. See the `Different Ways To Run Operator <#different-ways-to-run-operator>`_ section for more details. + +.. code-block:: bash + + ads opctl run -f .yaml + +If the backend runtime information is not merged within operator's config, then there is no way to run the operator within the ``ads opctl run`` command using container runtime. The ``ads operator run`` command should be used instead. + + +Run Operator In Data Science Job +-------------------------------- + +.. admonition:: Prerequisites + :class: note + + To become proficient with Data Science Jobs, it is recommended to explore their functionality thoroughly. Checking the :doc:`YAML Schema <../../jobs/yaml_schema>` link will assist you in configuring job YAML specifications more easily in the future. + + - :doc:`Data Science Jobs <../../jobs/index>` + - :doc:`Run a Script <../../jobs/run_script>` + - :doc:`Run a Container <../../jobs/run_container>` + - :doc:`YAML Schema <../../jobs/yaml_schema>` + +There are several options for running the operator on the OCI `Data Science Jobs `_ service, such as using the :doc:`python runtime <../../jobs/run_python>` or the :doc:`Bring Your Own Container (BYOC) <../../jobs/run_container>` approach. + +Run With BYOC (Bring Your Own Container) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To execute the operator within a Data Science job using :doc:`container <../../jobs/run_container>` runtime, follow these steps: + +1. Build the container using the following command (you can skip this if you've already done it for running the operator within a local container): + +.. code-block:: bash + + ads operator build-image --type + +This creates a new ``:`` image with ``/etc/operator`` as the working directory within the container. + +2. Publish the ``:`` container to the `Oracle Container Registry (OCR) `_. + +To publish ``:`` to OCR, use this command: + +.. 
code-block:: bash + + ads operator publish-image --type forecast --registry + +After publishing the container to OCR, you can use it within Data Science jobs service. Check the ``backend_job_container_config.yaml`` configuration file built during initializing the starter configs for the operator. It should contain pre-populated infrastructure and runtime sections. The runtime section should have an image property, like ``image: iad.ocir.io//:``. + +3. Adjust the ``.yaml`` configuration with the proper input/output folders. When running operator in a Data Science job, it won't have access to local folders, so input data and output folders should be placed in the Object Storage bucket. Open the ``.yaml`` and adjust the data path fields. + +4. Run the operator on the Data Science jobs using this command: + +.. code-block:: bash + + ads operator run -f .yaml -b backend_job_container_config.yaml + +Or within a short command: + +.. code-block:: bash + + ads operator run -f .yaml -b job.container + +In this case the backend config will be built on the fly. +However the recommended way would be to use explicit configurations for both operator and backend. + +The alternative way to run the operator would be to use the ``ads opctl run`` command. However in this case the runtime information needs to be merged within operator's config. See the `Different Ways To Run Operator <#different-ways-to-run-operator>`_ section for more details. + +.. code-block:: bash + + ads opctl run -f .yaml + +If the backend runtime information is not merged within operator's config, then there is no way to run the operator within the ``ads opctl run`` command using container runtime. The ``ads operator run`` command should be used instead. + +You can run the operator within the ``--dry-run`` attribute to check the final configs that will be used to run the operator on the service. This command will not run the operator, but will print the final configs that will be used to run the operator on the service. + +Running the operator will return a command to help you monitor the job's logs: + +.. code-block:: bash + + ads opctl watch + + +Run With Conda Environment +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To execute the operator within a Data Science job using the conda runtime, follow these steps: + +1. Build the operator's conda environment using this command:: + +.. code-block:: bash + + ads operator build-conda --type + +This creates a new ``_`` conda environment and places it in the folder specified within the ``ads opctl configure`` command. + +2. Publish the ``_`` conda environment to the Object Storage bucket using this command:: + +.. code-block:: bash + + ads operator publish --type + +For more details on configuring the CLI, refer to the :doc:`Explore & Configure Operators<./explore>` documentation. + +3. After publishing the conda environment to Object Storage, you can use it within the Data Science Jobs service. Check the ``backend_job_python_config.yaml`` configuration file, which should contain pre-populated infrastructure and runtime sections. The runtime section should include a ``conda`` section like this:: + +.. code-block:: yaml + + conda: + type: published + uri: oci://bucket@namespace/conda_environments/cpu///_ + +4. Adjust the ``.yaml`` configuration with the proper input/output folders. When running the operator in a Data Science job, it won't have access to local folders, so input data and output folders should be placed in the Object Storage bucket. + +5. 
Run the operator on the Data Science Jobs service using this command:: + +.. code-block:: bash + + ads operator run -f .yaml -b backend_job_python_config.yaml + +Or within a short command: + +.. code-block:: bash + + ads operator run -f .yaml -b job + +In this case the backend config will be built on the fly. +However the recommended way would be to use explicit configurations for both operator and backend. + +The alternative way to run the operator would be to use the ``ads opctl run`` command. However in this case the runtime information needs to be merged within operator's config. See the `Different Ways To Run Operator <#different-ways-to-run-operator>`_ section for more details. + +.. code-block:: bash + + ads opctl run -f .yaml + +Or if the backend runtime information is not merged within operator's config: + +.. code-block:: bash + + ads opctl run -f .yaml -b job + +6. Monitor the logs using the ``ads opctl watch`` command:: + +.. code-block:: bash + + ads opctl watch + +Data Flow Application +--------------------- + +To execute the operator within a Data Flow application follow these steps: + +1. Build the operator's conda environment using this command:: + +.. code-block:: bash + + ads operator build-conda --type + +This creates a new ``_`` conda environment and places it in the folder specified within the ``ads opctl configure`` command. + +2. Publish the ``_`` conda environment to the Object Storage bucket using this command:: + +.. code-block:: bash + + ads operator publish --type + +For more details on configuring the CLI, refer to the :doc:`Explore & Configure Operators<./explore>` documentation. + +After publishing the conda environment to Object Storage, you can use it within the Data Flow service. Check the ``backend_dataflow_dataflow_config.yaml`` configuration file, which should contain pre-populated infrastructure and runtime sections. The runtime section should include a ``conda`` section like this: + +.. code-block:: yaml + + conda: + type: published + uri: oci://bucket@namespace/conda_environments/cpu///_ + + +3. Adjust the ``.yaml`` configuration with the proper input/output folders. When running the operator in a Data Flow application, it won't have access to local folders, so input data and output folders should be placed in the Object Storage bucket. + +4. Run the operator on the Data Flow service using this command:: + +.. code-block:: bash + + ads operator run -f .yaml -b backend_dataflow_dataflow_config.yaml + +Or within a short command: + +.. code-block:: bash + + ads operator run -f .yaml -b dataflow + +In this case the backend config will be built on the fly. +However the recommended way would be to use explicit configurations for both operator and backend. + +The alternative way to run the operator would be to use the ``ads opctl run`` command. However in this case the runtime information needs to be merged within operator's config. See the `Different Ways To Run Operator <#different-ways-to-run-operator>`_ section for more details. + +.. code-block:: bash + + ads opctl run -f .yaml + +Or if the backend runtime information is not merged within operator's config: + +.. code-block:: bash + + ads opctl run -f .yaml -b dataflow + + +5. Monitor the logs using the ``ads opctl watch`` command:: + +.. 
code-block:: bash + + ads opctl watch diff --git a/docs/source/user_guide/operators/forecasting_operator/advanced_use_cases.rst b/docs/source/user_guide/operators/forecasting_operator/advanced_use_cases.rst new file mode 100644 index 000000000..1b770be58 --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/advanced_use_cases.rst @@ -0,0 +1,102 @@ +================== +Advanced Use Cases +================== + +**Documentation: Forecasting Science and Model Parameterization** + +**The Science of Forecasting** + +Forecasting is a complex yet essential discipline that involves predicting future values or events based on historical data and various mathematical and statistical techniques. To achieve accurate forecasts, it is crucial to understand some fundamental concepts: + +**Seasonality** + +Seasonality refers to patterns in data that repeat at regular intervals, typically within a year. For example, retail sales often exhibit seasonality with spikes during holidays or specific seasons. Seasonal components can be daily, weekly, monthly, or yearly, and understanding them is vital for capturing and predicting such patterns accurately. + +**Stationarity** + +Stationarity is a critical property of time series data. A time series is considered stationary when its statistical properties, such as mean, variance, and autocorrelation, remain constant over time. Stationary data simplifies forecasting since it allows models to assume that future patterns will resemble past patterns. + +**Cold Start** + +The "cold start" problem arises when you have limited historical data for a new product, service, or entity. Traditional forecasting models may struggle to make accurate predictions in these cases due to insufficient historical context. + +**Passing Parameters to Models** + +To enhance the accuracy and adaptability of forecasting models, our system allows you to pass parameters directly. Here's how to do it: + + +**Specify Model Type** + +Sometimes users will know which models they want to use. When users know this in advance, they can specify using the ``model_kwargs`` dictionary. In the following example, we will instruct the model to *only* use the ``DecisionTreeRegressor`` model. + +.. code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + model: automlx + model_kwargs: + model_list: + - NaiveForecaster + search_space: + NaiveForecaster: + sp: [1,100] + + +When using autots, there are model_list *families*. These families are named after the shared characteristics of the models included. For example, we can use the autots "superfast" model_list and set it in the following way: + +.. code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + model: autots + model_kwargs: + model_list: superfast + + +Note: this is only supported for the ``autots`` model. + + +**Specify Other Model Details** + +In addition to ``model_list``, there are many other parameters that can be specified. Users may specify, for example, the search space they want to search for their given model type. In automlx, specifying a hyperparameter range is as simple as: + +.. 
code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + model: automlx + model_kwargs: + search_space: + LogisticRegression: + C: + range: [0.03125, 512] + type': continuous + solver: + range: ['newton-cg', 'lbfgs', 'liblinear', 'sag'] + type': categorical + class_weight: + range: [None, 'balanced'] + type: categorical + + +**When Models Perform Poorly and the "Auto" Method** + +Forecasting models are not one-size-fits-all, and some models may perform poorly under certain conditions. Common scenarios where models might struggle include: + +- **Sparse Data:** When there's limited historical data available, traditional models may have difficulty making accurate predictions, especially for cold start problems. + +- **High Seasonality:** Extremely seasonal data with complex patterns can challenge traditional models, as they might not capture all nuances. + +- **Non-Linear Relationships:** In cases where the relationships between input variables and forecasts are nonlinear, linear models may underperform. + +- **Changing Dynamics:** If the underlying data-generating process changes over time, static models may fail to adapt. + +Our system offers an "auto" method that strives to anticipate and address these challenges. It dynamically selects the most suitable forecasting model and parameterizes it based on the characteristics of your data. It can automatically detect seasonality, stationarity, and cold start issues, then choose the best-fitting model and adjust its parameters accordingly. + +By using the "auto" method, you can rely on the system's intelligence to adapt to your data's unique characteristics and make more accurate forecasts, even in challenging scenarios. This approach simplifies the forecasting process and often leads to better results than manual model selection and parameter tuning. diff --git a/docs/source/user_guide/operators/forecasting_operator/examples.rst b/docs/source/user_guide/operators/forecasting_operator/examples.rst new file mode 100644 index 000000000..97891eba5 --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/examples.rst @@ -0,0 +1,95 @@ +======== +Examples +======== + +**Simple Example** + +The simplest yaml file is generated by the ``ads operator init --type forecast`` and looks like the following: + +.. code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + datetime_column: + name: Date + historical_data: + url: data.csv + horizon: 3 + model: auto + target_column: target + + +**Typical Example** + +A typical forecast yaml will usually have the following fields: + +.. code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + additional_data: + url: additional_data.csv + datetime_column: + name: time + generate_explanations: true + historical_data: + url: primary_data.csv + horizon: 5 + metric: smape + model: "auto" + output_directory: + url: results + target_category_columns: + - Series + target_column: Total + test_data: + url: test_data.csv + + +**Complex Example** + +The yaml can also be maximally stated as follows: + +.. 
code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + historical_data: + url: primary_data.csv + additional_data: + url: additional_data.csv + output_directory: + url: results + test_data: + url: test_data.csv + target_category_columns: + - Store_ID + target_column: Sales + horizon: 5 + datetime_column: + format: "%d/%m/%y" + name: Date + model: automlx + model_kwargs: + time_budget: 1 + tuning: + n_trials: 5 + preprocessing: true + metric: smape + confidence_interval_width: 0.8 + generate_explanations: true + generate_metrics: true + generate_report: true + local_explanation_filename: local_explanation.csv + metrics_filename: metrics.csv + report_filename: report.html + report_theme: light + forecast_filename: forecast.csv + global_explanation_filename: global_explanation.csv + test_metrics_filename: test_metrics.csv diff --git a/docs/source/user_guide/operators/forecasting_operator/faq.rst b/docs/source/user_guide/operators/forecasting_operator/faq.rst new file mode 100644 index 000000000..f84c02ae5 --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/faq.rst @@ -0,0 +1,11 @@ +==== +FAQs +==== + +**How do I learn more about AutoMLX?** + +More details in the documentation here: https://docs.oracle.com/en-us/iaas/tools/automlx/latest/html/multiversion/latest/automl.html + +**How do I learn More about AutoTS?** + +More details in the documentation here: https://winedarksea.github.io/AutoTS/build/html/source/tutorial.html diff --git a/docs/source/user_guide/operators/forecasting_operator/forecast.rst b/docs/source/user_guide/operators/forecasting_operator/forecast.rst new file mode 100644 index 000000000..0188138fa --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/forecast.rst @@ -0,0 +1,64 @@ +=================== +Configure Forecast +=================== + +Let's explore each line of the forecast.yaml so we can better understand options for extending and customizing the operator to our use case. + +Here is an example forecast.yaml with every parameter specified: + +.. code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + datetime_column: + name: Date + historical_data: + url: data.csv + horizon: 3 + model: auto + target_column: target + + +* **Kind**: The yaml file always starts with ``kind: operator``. There are many other kinds of yaml files that can be run by ``ads opctl``, so we need to specify this is an operator. +* **Type**: The type of operator is ``forecast``. +* **Version**: The only available version is ``v1``. +* **Spec**: Spec contains the bulk of the information for the specific problem. + * **historical_data**: This dictionary contains the details for how to read the historical data. Historical data must contain the target column, the datetime column, and optionally the target category column. + * **url**: Insert the uri for the dataset if it's on object storage or Data Lake using the URI pattern ``oci://@/path/to/data.csv``. + * **kwargs**: Insert any other args for pandas to load the data (``format``, ``options``, etc.) See full list in ``YAML Schema`` section. + * **target_column**: This string specifies the name of the column where the target data is within the historical data. + * **datetime_column**: The dictionary outlining details around the datetime column. + * **name**: the name of the datetime column. Must be the same in both historical and additional data. + * **format**: the format of the datetime string in python notation `detailed here `_. 
+    * **horizon**: the integer number of periods to forecast.
+
+    * **target_category_columns**: (optional) The category ID of the target.
+    * **additional_data**: (optional) This dictionary contains the details for how to read the additional data. Additional data must contain the datetime column, the target category column (if present in historical), and any other columns with values over the horizon.
+        * **url**: Insert the uri for the dataset if it's on object storage or Data Lake using the URI pattern ``oci://@/path/to/data.csv``.
+        * **kwargs**: Insert any other args for pandas to load the data (``format``, ``options``, etc.) See full list in ``YAML Schema`` section.
+    * **output_directory**: (optional) This dictionary contains the details for where to put the output artifacts. The directory need not exist, but must be accessible by the Operator during runtime.
+        * **url**: Insert the uri for the dataset if it's on object storage or Data Lake using the URI pattern ``oci://@/subfolder/``.
+        * **kwargs**: Insert any other args for pandas to load the data (``format``, ``options``, etc.) See full list in ``YAML Schema`` section.
+    * **model**: (optional) The name of the model framework you want to use. Defaults to "auto". Other options are: ``arima``, ``automlx``, ``prophet``, ``neuralprophet``, ``autots``, and ``auto``.
+    * **model_kwargs**: (optional) This kwargs dict passes straight through to the model framework. If you want to take direct control of the modeling, this is the best way.
+    * **test_data**: (optional) This dictionary contains the details for how to read the test data. Test data must be formatted identically to historical data and contain values for every period in the forecast horizon.
+        * **url**: Insert the uri for the dataset if it's on object storage or Data Lake using the URI pattern ``oci://@/path/to/data.csv``.
+        * **kwargs**: Insert any other args for pandas to load the data (``format``, ``options``, etc.) See full list in ``YAML Schema`` section.
+
+    * **tuning**: (optional) This dictionary specifies details around tuning the NeuralProphet and Prophet models.
+        * **n_trials**: The number of separate tuning jobs to run. Increasing this integer increases the time to completion, but may improve the quality.
+    * **preprocessing**: (optional) Preprocessing and feature engineering can be disabled using this flag. Defaults to true.
+    * **metric**: (optional) The metric to select across. Users can select among: MAPE, RMSE, MSE, and SMAPE.
+    * **confidence_interval_width**: (optional) The width of the confidence interval to calculate in the forecast and report.html. Defaults to 0.80, meaning an 80% confidence interval.
+
+    * **report_filename**: (optional) Placed into output_directory location. Defaults to report.html
+    * **report_title**: (optional) The title of the report.
+    * **report_theme**: (optional) Can be "dark" or "light". Defaults to "light".
+    * **metrics_filename**: (optional) Placed into output_directory location. Defaults to metrics.csv
+    * **test_metrics_filename**: (optional) Placed into output_directory location. Defaults to test_metrics.csv
+    * **forecast_filename**: (optional) Placed into output_directory location. Defaults to forecast.csv
+    * **generate_explanations**: (optional) Explainability, both local and global, can be disabled using this flag. Defaults to false.
+    * **generate_report**: (optional) Report file generation can be enabled using this flag. Defaults to true.
+ * **generate_metrics**: (optional) Metrics files generation can be enabled using this flag. Defaults to true. diff --git a/docs/source/user_guide/operators/forecasting_operator/getting_started.rst b/docs/source/user_guide/operators/forecasting_operator/getting_started.rst new file mode 100644 index 000000000..2a05dde75 --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/getting_started.rst @@ -0,0 +1,110 @@ +=============== +Getting Started +=============== + +Configure +--------- + +After having set up ``ads opctl`` on your desired machine using ``ads opctl configure``, you are ready to begin forecasting. At a bare minimum, you will need to provide the following details about your forecasting problem: + +- Path to the historical data (historical_data) +- Name of the Datetime column (datetime_column) +- Forecast horizon (horizon) +- Name of the Target column (target_column) + + +These details exactly match the initial forecast.yaml file generated by running ``ads operator init --type forecast``: + +.. code-block:: yaml + + kind: operator + type: forecast + version: v1 + spec: + datetime_column: + name: Date + historical_data: + url: data.csv + horizon: 3 + model: auto + target_column: target + + +Optionally, you are able to specify much more. The most common additions are: + +- Path to the additional data, which has values for each period of the forecast horizon (additional_data) +- Path to test data, in the event you want to evaluate the forecast on a test set (test_data) +- List of column names that index different timeseries within the data, such as a product_ID or some other such series (target_category_columns) +- Path to the output directory, where the operator will place the forecast.csv, metrics.csv, and other artifacts produced from the run (output_directory) + +An extensive list of parameters can be found in the ``YAML Schema`` section. + + +Run +--- + +After you have your forecast.yaml written, you simply run the forecast using: + +.. code-block:: bash + + ads operator run -f forecast.yaml + + +Interpret Results +----------------- + +The forecasting operator produces many output files: ``forecast.csv``, ``metrics.csv``, ``local_explanations.csv``, ``global_explanations.csv``, ``report.html``. + +We will go through each of these output files in turn. + +**Forecast.csv** + +This file contains the entire historical dataset with the following columns: + +- Series: Categorical or numerical index +- Date: Time series data +- Real values: Target values from historical data +- Fitted values: Model's predictions on historical data +- Forecasted values: Only available over the forecast horizon, representing the true forecasts +- Upper and lower bounds: Confidence intervals for the predictions (based on the specified confidence interval width in the YAML file) + +**report.html** + +The report.html file is designed differently for each model type. Generally, it contains a summary of the historical and additional data, a plot of the target from historical data overlaid with fitted and forecasted values, analysis of the models used, and details about the model components. It also includes a receipt YAML file, providing a fully detailed version of the original forecast.yaml file. + +**Metrics.csv** + +The metrics file includes relevant metrics calculated on the training set. 
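+
+Both the choice of metric and the width of the bounds reported in ``forecast.csv`` can be controlled from the spec; a minimal sketch using options described elsewhere in this guide (the values are illustrative):
+
+.. code-block:: yaml
+
+   spec:
+     metric: smape
+     confidence_interval_width: 0.90
+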
+ + +**Global and Local Explanations in Forecasting Models** + +In the realm of forecasting models, understanding not only the predictions themselves but also the factors and features driving those predictions is of paramount importance. Global and local explanations are two distinct approaches to achieving this understanding, providing insights into the inner workings of forecasting models at different levels of granularity. + +**Global Explanations:** + +Global explanations aim to provide a high-level overview of how a forecasting model works across the entire dataset or a specific feature space. They offer insights into the model's general behavior, helping users grasp the overarching patterns and relationships it has learned. Here are key aspects of global explanations: + +1. **Feature Importance:** Global explanations often involve the identification of feature importance, which ranks variables based on their contribution to the model's predictions. This helps users understand which features have the most significant influence on the forecasts. + +2. **Model Structure:** Global explanations can also reveal the architecture and structure of the forecasting model, shedding light on the algorithms, parameters, and hyperparameters used. This information aids in understanding the model's overall approach to forecasting. + +3. **Trends and Patterns:** By analyzing global explanations, users can identify broad trends and patterns in the data that the model has captured. This can include seasonality, long-term trends, and cyclical behavior. + +4. **Assumptions and Constraints:** Global explanations may uncover any underlying assumptions or constraints the model operates under, highlighting potential limitations or biases. + +While global explanations provide valuable insights into the model's behavior at a holistic level, they may not capture the nuances and variations that exist within the dataset. + +**Local Explanations:** + +Local explanations, on the other hand, delve deeper into the model's predictions for specific data points or subsets of the dataset. They offer insights into why the model made a particular prediction for a given instance. Key aspects of local explanations include: + +1. **Instance-specific Insights:** Local explanations provide information about the individual features and their contribution to a specific prediction. This helps users understand why the model arrived at a particular forecast for a particular data point. + +2. **Contextual Understanding:** They consider the context of the prediction, taking into account the unique characteristics of the data point in question. This is particularly valuable when dealing with outliers or anomalous data. + +3. **Model Variability:** Local explanations may reveal the model's sensitivity to changes in input variables. Users can assess how small modifications to the data impact the predictions. + +4. **Decision Boundaries:** In classification problems, local explanations can elucidate the decision boundaries and the factors that led to a specific classification outcome. + +While local explanations offer granular insights, they may not provide a comprehensive understanding of the model's behavior across the entire dataset. 
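+
+To produce the local and global explanation artifacts described above, explanations must be enabled in the operator spec; a minimal sketch, reusing fields shown in the examples of this guide (the file names are illustrative):
+
+.. code-block:: yaml
+
+   spec:
+     generate_explanations: true
+     local_explanation_filename: local_explanation.csv
+     global_explanation_filename: global_explanation.csv
+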
diff --git a/docs/source/user_guide/operators/forecasting_operator/index.rst b/docs/source/user_guide/operators/forecasting_operator/index.rst new file mode 100644 index 000000000..41d523326 --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/index.rst @@ -0,0 +1,52 @@ +==================== +Forecasting Operator +==================== + +The Forecasting Operator leverages historical time series data to generate accurate forecasts for future trends. This operator aims to simplify and expedite the data science process by automating the selection of appropriate models and hyperparameters, as well as identifying relevant features for a given prediction task. + + +Overview +-------- + +**Introduction to Forecasting with the Python Library Module** + +Forecasting is a crucial component of decision-making in various fields, from finance and supply chain management to weather prediction and demand forecasting. Accurate forecasts enable organizations to allocate resources efficiently, plan for the future, and respond proactively to changing circumstances. The Operators framework is OCI's most extensible, low-code, managed ecosystem for building and deploying forecasting models. + +This technical documentation introduces using ``ads opctl`` for forecasting tasks. This module is engineered with the principles of low-code development in mind, making it accessible to users with varying degrees of technical expertise. It operates on managed infrastructure, ensuring reliability and scalability, while its configurability through YAML allows users to tailor forecasts to their specific needs. + +**Multivariate vs. Univariate Forecasting** + +One of the fundamental decisions in forecasting is whether to employ multivariate or univariate models. Univariate forecasting involves predicting a single variable, typically based on its historical values, making it suitable for straightforward time series analysis. In contrast, multivariate forecasting takes into account multiple interrelated variables, allowing for a more comprehensive understanding of complex systems. + +**Global vs. Local Models for Multivariate Forecasts** + +When dealing with multivariate forecasts, the choice between global and local models is pivotal. Global models assume that the relationships between variables are uniform across all data points, providing a consolidated forecast for the entire dataset. In contrast, local models consider localized relationships, allowing forecasts to adapt to variations within the dataset. + +**Strengths and Weaknesses of Global and Local Models** + +Global models are advantageous when relationships between variables remain relatively stable over time. They offer simplicity and ease of interpretation, making them suitable for a wide range of applications. However, they may struggle to capture nuances in the data when relationships are not consistent throughout the dataset. + +Local models, on the other hand, excel in capturing localized patterns and relationships, making them well-suited for datasets with varying dynamics. They can provide more accurate forecasts in cases where global models fall short. + +**Auto Model Selection** + +Some users know which modeling frameworks (this can be a specific model, such as ARIMA and Prophet or it can be an automl library like Oracle's AutoMLX) they want to use right already, the forecasting operator allows these more advanced users to configure this through the ``model`` parameter. 
For those newer users who don't know, or want to explore multiple, the forecasting operator sets the ``model`` parameter to "auto" by default. "auto" will select the framework that looks most appropriate given the dataset. + +**Forecasting Documentation** + +This documentation will explore these concepts in greater depth, demonstrating how to leverage the flexibility and configurability of the Python library module to implement both multivariate and univariate forecasting models, as well as global and local approaches. By the end of this guide, users will have the knowledge and tools needed to make informed decisions when designing forecasting solutions tailored to their specific requirements. + +.. versionadded:: 2.9.0 + +.. toctree:: + :maxdepth: 1 + + ./use_cases + ./install + ./getting_started + ./forecast + ./interpret_results + ./examples + ./advanced_use_cases + ./yaml_schema + ./faq diff --git a/docs/source/user_guide/operators/forecasting_operator/install.rst b/docs/source/user_guide/operators/forecasting_operator/install.rst new file mode 100644 index 000000000..f318e45ec --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/install.rst @@ -0,0 +1,50 @@ +==================================== +Installing the AI Forecast Operator +==================================== + +The Forecast Operator can be installed in 2 primary ways: PyPi and Conda Packs. + + +**Installing Through PyPi** + +If you are running the operator from outside of a Notebook Session, you may download ``oracle_ads[forecast]`` from pypi. + +.. code-block:: bash + + python3 -m pip install oracle_ads[forecast]==2.9.0 + + +After that, the Operator is ready to go! + +In order to run on a job, you will need to create and publish a conda pack with ``oracle_ads[forecast]`` installed. The simplest way to do this is from a Notebook Session, running the following commands: + +.. code-block:: bash + + odsc conda create -n forecast -e + conda activate /home/datascience/conda/forecast_v1_0 + python3 -m pip install oracle-ads[forecast]==2.9.0rc1 + odsc conda publish -s /home/datascience/conda/forecast_v1_0 + +Ensure that you have properly configured your conda pack namespace and bucket in the Launcher -> Settings -> Object Storage Settings. For more details, see :doc:`ADS Conda Set Up <../../cli/opctl/configure>` + + +**Installing Through Conda Packs** + +*Coming Soon!* The Forecast Conda Pack will be released on December 1, 2023. + +The service recommended environment for using Operators is through Conda Packs within a Job or Notebook Session on OCI. + +To install: + +1. Open a Notebook Session +2. Go to Environment Explorer (from the Launcher tab) +3. Search for ``forecast`` +4. Download the latest version by clicking the download button. +5. Activate the conda environment using the path, for example: + +.. code-block:: bash + + conda activate /home/datascience/conda/forecast_py38_v1 + + +That's it. Your Operator is ready to go! diff --git a/docs/source/user_guide/operators/forecasting_operator/interpret_results.rst b/docs/source/user_guide/operators/forecasting_operator/interpret_results.rst new file mode 100644 index 000000000..5b6a37178 --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/interpret_results.rst @@ -0,0 +1,59 @@ +====================== +Interpretting Results +====================== + +The forecasting operator produces many output files: ``forecast.csv``, ``metrics.csv``, ``local_explanations.csv``, ``global_explanations.csv``, ``report.html``. 
+ +We will go through each of these output files in turn. + +**Forecast.csv** + +This file contains the entire historical dataset with the following columns: + +- Series: Categorical or numerical index +- Date: Time series data +- Real values: Target values from historical data +- Fitted values: Model's predictions on historical data +- Forecasted values: Only available over the forecast horizon, representing the true forecasts +- Upper and lower bounds: Confidence intervals for the predictions (based on the specified confidence interval width in the YAML file) + +**report.html** + +The report.html file is designed differently for each model type. Generally, it contains a summary of the historical and additional data, a plot of the target from historical data overlaid with fitted and forecasted values, analysis of the models used, and details about the model components. It also includes a receipt YAML file, providing a fully detailed version of the original forecast.yaml file. + +**Metrics.csv** + +The metrics file includes relevant metrics calculated on the training set. + + +**Global and Local Explanations in Forecasting Models** + +In the realm of forecasting models, understanding not only the predictions themselves but also the factors and features driving those predictions is of paramount importance. Global and local explanations are two distinct approaches to achieving this understanding, providing insights into the inner workings of forecasting models at different levels of granularity. + +**Global Explanations:** + +Global explanations aim to provide a high-level overview of how a forecasting model works across the entire dataset or a specific feature space. They offer insights into the model's general behavior, helping users grasp the overarching patterns and relationships it has learned. Here are key aspects of global explanations: + +1. **Feature Importance:** Global explanations often involve the identification of feature importance, which ranks variables based on their contribution to the model's predictions. This helps users understand which features have the most significant influence on the forecasts. + +2. **Model Structure:** Global explanations can also reveal the architecture and structure of the forecasting model, shedding light on the algorithms, parameters, and hyperparameters used. This information aids in understanding the model's overall approach to forecasting. + +3. **Trends and Patterns:** By analyzing global explanations, users can identify broad trends and patterns in the data that the model has captured. This can include seasonality, long-term trends, and cyclical behavior. + +4. **Assumptions and Constraints:** Global explanations may uncover any underlying assumptions or constraints the model operates under, highlighting potential limitations or biases. + +While global explanations provide valuable insights into the model's behavior at a holistic level, they may not capture the nuances and variations that exist within the dataset. + +**Local Explanations:** + +Local explanations, on the other hand, delve deeper into the model's predictions for specific data points or subsets of the dataset. They offer insights into why the model made a particular prediction for a given instance. Key aspects of local explanations include: + +1. **Instance-specific Insights:** Local explanations provide information about the individual features and their contribution to a specific prediction. 
This helps users understand why the model arrived at a particular forecast for a particular data point.
+
+2. **Contextual Understanding:** They consider the context of the prediction, taking into account the unique characteristics of the data point in question. This is particularly valuable when dealing with outliers or anomalous data.
+
+3. **Model Variability:** Local explanations may reveal the model's sensitivity to changes in input variables. Users can assess how small modifications to the data impact the predictions.
+
+4. **Decision Boundaries:** In classification problems, local explanations can elucidate the decision boundaries and the factors that led to a specific classification outcome.
+
+While local explanations offer granular insights, they may not provide a comprehensive understanding of the model's behavior across the entire dataset.
diff --git a/docs/source/user_guide/operators/forecasting_operator/use_cases.rst b/docs/source/user_guide/operators/forecasting_operator/use_cases.rst
new file mode 100644
index 000000000..59bb04b94
--- /dev/null
+++ b/docs/source/user_guide/operators/forecasting_operator/use_cases.rst
@@ -0,0 +1,73 @@
+=================================================
+Will the Forecast Operator Work for My Use Case?
+=================================================
+
+As a low-code extensible framework, operators enable a wide range of use cases. This section will highlight some of the use cases that the AI Forecast Operator aims to serve.
+
+
+**Dataset Size**
+
+* First off, if you're unsure what model to use, we recommend using the "auto" setting, which is the default. "auto" will look at the parameters of your data and pick an algorithm that is likely to converge in a reasonable amount of time. Note, this may not always be the most performant algorithm! If you need accuracy and do not care about cost or time, we recommend using all 5 frameworks and comparing across test datasets.
+* When under 5,000 rows and 5 columns, all operators should be quick, finishing in a couple of minutes. If you include explainability, it may take longer.
+* Over 5,000 rows, different algorithms perform to different degrees. This depends on more than the size of the dataset, but the service provides some recommendations in the next section, *Which Model is Right for You?*.
+* For best results, the service recommends a minimum of 100 rows per category; however, this is not a requirement (see "Cold Start Problems" below).
+* For best results, the service recommends fewer than 100 total categories. Increasing the category count is expected to linearly increase the time to completion.
+
+
+**Which Model is Right for You?**
+
+* The ARIMA and AutoMLX models slow down substantially as you increase columns. Aim to use these when you have fewer than 10 additional data columns.
+* AutoTS is a global model. It works well for wide datasets but can take a long time to train, especially on long datasets. One technique here is to pass ``model_list: superfast`` into the model kwargs to speed up an initial run. To fully utilize autots, consider setting ``model_list: all`` in the ``model_kwargs``; however, this may lead to the model taking a long time or even hanging.
+* Prophet and NeuralProphet are much more consistent in their time to completion, and perform very well on most datasets.
+* AutoMLX is not recommended when the data interval is less than 1 hour.
+* Note: Explainability usually takes several minutes to a couple of hours.
Explanations can be enabled using the flag ``generate_explanations: True``; it is ``False`` by default. Because explanations are highly parallelized computations, they can be sped up by scaling up your compute shape. + + +**Target Column** + +* The target column should be present in the dataset passed into the ``historical_data`` field. +* The ``historical_data`` dataset must have (1) a target column, (2) a datetime column, and optionally (3) a target_category_column or series. +* The ``historical_data`` cannot contain any other columns. +* If passing ``additional_data``, it must have the datetime column, the target_category_column if it's present in the historical data, and then as many additional features as needed. +* The ``additional_data`` cannot contain the target column. + + +**Additional Features** + +* It is recommended to include additional "future regressors" when available. These features can greatly improve the ability to forecast. +* A "future regressor" is one that is known for all future timestamps in your forecast horizon during training time. (Typically these are variables within your control, such as whether or not to discount a product or the staffing of a particular outlet.) +* All additional data provided must be put in a separate location and passed into "additional_data" in the ``forecast.yaml`` file. +* All additional data must be given for each period of the forecast horizon. Missing values may result in sub-optimal forecasts. + + +**Long Horizon Problems** + +* A Long Horizon Problem is defined as one where the forecast horizon is more than 50% of the length of the input data period (e.g. forecasting the next 2 years on 4 years of data). These problems are particularly difficult for AutoMLX, AutoTS, and ARIMA. Customers are encouraged to use NeuralProphet and/or Prophet for these types of problems. + + +**Cold Start Problems** + +* A cold start problem can occur when there's data available for some categories of the target variable, but not all. Using the categories that do have historical data as proxies, the model can make a forecast for the categories it hasn't seen yet, based on the trends and the additional data characteristics. +* For cold start problems, customers are strongly encouraged to use AutoTS, as AutoTS is a "global model" implementation. AutoTS can ensemble many models into a single aggregate model, allowing it to rely on all features of the dataset in making any one prediction. + + +**Datetime Input** + +* The datetime input column must have a consistent interval throughout the historical and additional datasets. Inconsistent intervals will cause failures in AutoMLX and may affect performance on other frameworks. +* Note: missing data is okay; however, it will likely cause sub-optimal forecasting. +* It is strongly recommended that the datetime column is passed in sorted from earliest to latest; however, this is not a requirement, and the operator will attempt to sort on your behalf. +* It is recommended that you pass the format of your datetime string into the ``format`` option of the ``datetime_column`` parameter. The operator uses the Python datetime string format outlined here: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + + +**Output Files** + +* Apart from ``report.html``, all output files should match formatting regardless of the model framework used (e.g. AutoMLX vs. Prophet). +* The ``report.html`` is custom built for each model framework, and so it will differ.
+* All output files can be disabled, with the exception of ``forecast.csv``. For more details on disabling them, look for the ``generate_X`` boolean parameters in the ``forecast.yaml`` file. + + +**Feature Engineering** + +* Except when using ARIMA, it is not recommended to create features around "day of the week" or "holiday", as NeuralProphet, Prophet, AutoTS, and AutoMLX can generate this information internally. +* AutoMLX performs further feature engineering on your behalf. It will expand your features into lag, min, max, average, and more. When using AutoMLX, it is recommended that you only pass in features that contain new information. +* AutoTS performs some feature engineering, but it is not as extensive as AutoMLX's. diff --git a/docs/source/user_guide/operators/forecasting_operator/yaml_schema.rst b/docs/source/user_guide/operators/forecasting_operator/yaml_schema.rst new file mode 100644 index 000000000..4c7264889 --- /dev/null +++ b/docs/source/user_guide/operators/forecasting_operator/yaml_schema.rst @@ -0,0 +1,8 @@ +=========== +YAML Schema +=========== +.. raw:: html + +
kind:
  required: true
  type: string
  default: operator
  allowed:
    - operator
  meta:
    description: "Which service are you trying to use? Common kinds: `operator`, `job`"
version:
  required: true
  type: string
  default: v1
  allowed:
    - v1
  meta:
    description: "Operators may change yaml file schemas from version to version, as well as implementation details. Double check the version to ensure compatibility."
type:
  required: true
  type: string
  default: forecast
  meta:
    description: "Type should always be `forecast` when using a forecast operator."
spec:
  required: true
  type: dict
  schema:
    historical_data:
      required: true
      type: dict
      meta:
        description: "This should be indexed by date and (optionally) target category. It should include all targets and endogenous data."
      schema:
        format:
          required: false
          type: string
          allowed: [csv, json, clipboard, excel, hdf, feather, load_files]
        columns:
          required: false
          type: list
          schema:
            type: string
        options:
          nullable: true
          required: false
          type: dict
        url:
          required: true
          type: string
          default: data.csv
          meta:
            description: "The url can be local or remote. For example: `oci://<bucket>@<namespace>/data.csv`"
        limit:
          required: false
          type: integer
    additional_data:
      required: false
      type: dict
      meta:
        description: "Additional datasets must be indexed by the same targets and target categories as the historical data. They must also have data points for each date/category over your horizon. This must be exogenous data."
      schema:
        format:
          required: false
          type: string
          allowed: [csv, json, clipboard, excel, hdf, feather, load_files]
        columns:
          required: false
          type: list
          schema:
            type: string
        options:
          nullable: true
          required: false
          type: dict
        url:
          required: false
          type: string
          meta:
            description: "The url can be local or remote. For example: `oci://<bucket>@<namespace>/data.csv`"
        limit:
          required: false
          type: integer
    test_data:
      required: false
      type: dict
      meta:
        description: "Optional, only if evaluation is needed."
      schema:
        connect_args:
          nullable: true
          required: false
          type: dict
        format:
          required: false
          type: string
          allowed: [csv, json, clipboard, excel, hdf, sql]
        columns:
          required: false
          type: list
          schema:
            type: string
        url:
          required: true
          type: string
          default: test.csv
          meta:
            description: "The url can be local or remote. For example: `oci://<bucket>@<namespace>/data.csv`"
        name:
          required: false
          type: string
        options:
          nullable: true
          required: false
          type: dict
    output_directory:
      required: false
      type: dict
      schema:
        connect_args:
          nullable: true
          required: false
          type: dict
        format:
          required: false
          type: string
          allowed: [csv, json, clipboard, excel, hdf, sql]
        url:
          required: true
          type: string
          default: result/
          meta:
            description: "The url can be local or remote. For example: `oci://<bucket>@<namespace>/`"
        name:
          required: false
          type: string
        options:
          nullable: true
          required: false
          type: dict
    report_file_name:
      required: false
      type: string
      default: report.html
      meta:
        description: "Placed into the output_directory location. Defaults to report.html."
    report_title:
      required: false
      type: string
    report_theme:
      required: false
      type: string
      default: light
      allowed: [light, dark]
    report_metrics_name:
      required: false
      type: string
      default: report.csv
      meta:
        description: "Placed into the output_directory location. Defaults to report.csv."
    target_column:
      required: true
      type: string
      default: target
    preprocessing:
      required: false
      type: boolean
      default: true
      meta:
        description: "Preprocessing and feature engineering can be disabled using this flag. Defaults to true."
    datetime_column:
      required: true
      type: dict
      schema:
        name:
          required: true
          type: string
          default: Date
        format:
          required: false
          type: string
    target_category_columns:
      required: false
      type: list
      schema:
        type: string
    horizon:
      required: true
      type: dict
      schema:
        periods:
          required: true
          type: integer
          default: 3
        interval:
          required: false
          type: integer
        interval_unit:
          required: true
          type: string
          default: M
          allowed: [S, M, H, D, W, Mo, Q, Y]
    model:
      required: true
      type: string
      default: automlx
      allowed: [prophet, arima, neuralprophet, automlx, autots, auto]
    model_kwargs:
      required: false
      type: dict
    confidence_interval_width:
      required: false
      type: float
      default: 0.8
    tuning:
      required: false
      type: dict
      schema:
        n_trials:
          required: false
          type: integer
          default: 10
        metric:
          required: false
          type: string
          default: MAPE
          allowed: [MAPE, RMSE, MSE, SMAPE]
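For illustration, a minimal ``forecast.yaml`` that follows the schema above might look like the sketch below. The structure and field names come from the schema; the bucket, namespace, datetime format, and category column shown here are placeholder values rather than anything mandated by the operator.

.. code-block:: yaml

   kind: operator
   type: forecast
   version: v1
   spec:
     historical_data:
       url: oci://<bucket>@<namespace>/data.csv
     datetime_column:
       name: Date
       format: "%Y-%m-%d"
     target_column: target
     target_category_columns:
       - Series
     horizon:
       periods: 3
       interval_unit: M
     model: auto
     output_directory:
       url: result/

Running ``ads operator run -f forecast.yaml`` against a file like this produces ``forecast.csv``, a metrics CSV, and ``report.html`` in the output directory, as described in the earlier sections.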
+ diff --git a/docs/source/user_guide/operators/index.rst b/docs/source/user_guide/operators/index.rst new file mode 100644 index 000000000..6c68f6889 --- /dev/null +++ b/docs/source/user_guide/operators/index.rst @@ -0,0 +1,20 @@ +================== +What Are Operators +================== + +Operators are a suite of pre-packaged solutions designed to address a wide range of data science needs. They offer a user-friendly, low-code interface, making data science tasks accessible to users with varying levels of expertise. Even if you're not a domain expert, you can leverage operators as they encapsulate all the necessary domain-specific knowledge. However, these operators also cater to experts who value the convenience of low-code tools and wish to fine-tune their behavior. This customization is achieved through configuration parameters exposed by different operators, providing the best of both worlds. + +Key Characteristics of Operators +--------------------------------- + +- **No Domain Expertise Required**: You don't need to be a domain expert to use an operator. Each operator is equipped with all the necessary domain-specific knowledge, eliminating the need for specialized expertise. + +- **Configuration Options**: For domain experts who want to fine-tune the operator's behavior, key configuration settings are available. These settings allow you to customize the operator to suit your specific needs. + +- **Default Settings**: Operators come with default configurations that encapsulate domain-specific information. This ensures that users can get started quickly, even without making customizations. + +- **Versatile Deployment**: Operators can be executed in a variety of environments, including OCI Data Science Jobs, OCI Data Flow Applications, Kubernetes (K8s), and locally. This flexibility allows you to choose the environment that best suits your requirements. + +- **CLI Tool for Easy Deployment**: To simplify the deployment process across different environments, we provide a user-friendly CLI (Command Line Interface) tool. This tool streamlines the setup and execution of operators, making it easier for you to harness their power regardless of where you choose to run them. + +Whether you're working in the cloud or locally, our operators, along with the accompanying CLI tool, offer a seamless and customizable solution for your data science needs. diff --git a/docs/source/user_guide/operators/pii_operator/examples.rst b/docs/source/user_guide/operators/pii_operator/examples.rst new file mode 100644 index 000000000..037bee176 --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/examples.rst @@ -0,0 +1,53 @@ +======== +Examples +======== + +**Simple Example** + +The simplest yaml file is generated by the ``ads operator init --type pii`` and looks like the following: + +.. code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + input_data: + url: mydata.csv + target_column: target + output_directory: + url: result/ + detectors: + - name: default.phone + action: mask + + + +**Complex Example** + +The yaml can also be maximally stated as follows: + + +.. 
code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + output_directory: + url: oci://my-bucket@my-tenancy/results + name: myProcessedData.csv + report: + report_filename: report.html + show_rows: 10 + show_sensitive_content: true + input_data: + url: oci://my-bucket@my-tenancy/mydata.csv + target_column: target + detectors: + - name: default.phone + action: mask + - name: default.social_security_number + action: remove + - name: spacy.en_core_web_trf.person + action: anonymize diff --git a/docs/source/user_guide/operators/pii_operator/getting_started.rst b/docs/source/user_guide/operators/pii_operator/getting_started.rst new file mode 100644 index 000000000..a5ce67d6a --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/getting_started.rst @@ -0,0 +1,64 @@ +=============== +Getting Started +=============== + +Configure +--------- + +After having set up ``ads opctl`` on your desired machine using ``ads opctl configure``, you are ready to begin using the PII operator. At a bare minimum, you will need to provide the following details about your task: + +- Path to the input data (input_data) +- Path to the output directory, where the operator will place the processed data and the report.html produced from the run (output_directory) +- Name of the column with user data (target_column) +- The detectors to be used by the operator (detectors) + +You can check :ref:`Configure Detector ` for more details on how to configure the ``detectors`` parameter. These details exactly match the initial pii.yaml file generated by running ``ads operator init --type pii``: + +.. code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + input_data: + url: mydata.csv + target_column: target + output_directory: + url: result/ + detectors: + - name: default.phone + action: mask + + +Optionally, you are able to specify much more. The most common additions are: + +- Whether to show sensitive content in the report (show_sensitive_content) +- The way to process each detected entity (action) + +An extensive list of parameters can be found in the :ref:`YAML Schema `. + + +Run +--- + +After you have written your pii.yaml, you simply run the operator using: + +.. code-block:: bash + + ads operator run -f pii.yaml + + +Interpret Results +----------------- + +The PII operator produces the following output files: ``mydata-out.csv`` and ``report.html``. + +We will go through each of these output files in turn. + +**mydata-out.csv** + +The name of this file can be customized through the ``output_directory`` parameters in the configuration yaml. This file contains the processed dataset. + +**report.html** + +The report.html file is customized based on the report parameters in the configuration yaml. It contains a summary of statistics, a plot of entity distributions, details of the resolved entities, and details about any model used. By default, sensitive information is not shown in the report; for debugging purposes, you can show it by setting ``show_sensitive_content``. It also includes a copy of the YAML file, providing a fully detailed version of the original specification.
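To spot-check the processed output programmatically, a short pandas snippet like the following can be used. This is an illustrative sketch, not part of the operator: the path ``result/mydata-out.csv`` and the ``target`` column come from the simple example above, and the assumption that masked entities appear as ``{{...}}`` placeholders is based on the ``mask`` action examples elsewhere in this guide.

.. code-block:: python

   # Illustrative sketch: count rows where the mask action produced placeholders.
   # Assumes the simple example configuration above: output_directory url "result/",
   # processed file "mydata-out.csv", and user text in the "target" column.
   import pandas as pd

   processed = pd.read_csv("result/mydata-out.csv")

   # Masked entities are assumed to be rendered as "{{ENTITY}}"-style placeholders.
   has_placeholder = processed["target"].astype(str).str.contains(r"\{\{.*?\}\}", regex=True)
   print(f"{has_placeholder.sum()} of {len(processed)} rows contain masked entities")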
diff --git a/docs/source/user_guide/operators/pii_operator/index.rst b/docs/source/user_guide/operators/pii_operator/index.rst new file mode 100644 index 000000000..cdf5d962b --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/index.rst @@ -0,0 +1,37 @@ +============ +PII Operator +============ + +The PII operator aims to detect and redact Personally Identifiable Information (PII) in datasets by combining pattern matching and machine learning solutions. + +Overview +-------- + +**Introduction to PII** + +Personally Identifiable Information (PII) refers to any information that can identify an individual, encompassing financial, medical, educational, and employment records. Failure to protect PII can lead to identity theft, financial loss, and reputational damage for individuals and businesses alike, highlighting the importance of taking appropriate measures to safeguard sensitive information. The Operators framework is OCI's most extensible, low-code, managed ecosystem for detecting and redacting PII in datasets. + +This technical documentation introduces using ``ads opctl`` for PII detection and redaction tasks. This module is engineered with the principles of low-code development in mind, making it accessible to users with varying degrees of technical expertise. It operates on managed infrastructure, ensuring reliability and scalability, while its configurability through YAML allows users to customize redaction to their specific needs. + +**Automated Detection and Classification** + +By leveraging pattern matching and AI-powered solutions, the ADS PII Operator efficiently identifies sensitive data in free-form text. + +**Intelligent Co-reference Resolution** + +A standout feature of the ADS PII Operator is its ability to maintain co-reference entity relationships even after anonymization; this not only anonymizes the data but also preserves its statistical properties. + +**PII Operator Documentation** + +This documentation will explore the key concepts and capabilities of the PII operator, providing examples and practical guidance on how to use its various functions and modules. By the end of this guide, users will have a solid understanding of the PII operator and its capabilities, as well as the knowledge and tools needed to make informed decisions when designing solutions tailored to their specific requirements. + +.. versionadded:: 2.9.0 + +.. toctree:: + :maxdepth: 1 + + ./install + ./getting_started + ./pii + ./examples + ./yaml_schema diff --git a/docs/source/user_guide/operators/pii_operator/install.rst b/docs/source/user_guide/operators/pii_operator/install.rst new file mode 100644 index 000000000..7386f69cf --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/install.rst @@ -0,0 +1,24 @@ +=========================== +Installing the PII Operator +=========================== + +The PII Operator can be installed from PyPI. + + +.. code-block:: bash + + python3 -m pip install oracle_ads[pii]==2.9 + + +After that, the Operator is ready to go! + +In order to run on a job, you will need to create and publish a conda pack with ``oracle_ads[pii]`` installed. The simplest way to do this is from a Notebook Session, running the following commands: + +..
code-block:: bash + + odsc conda create -n ads_pii -e + conda activate /home/datascience/conda/ads_pii_v1_0 + python3 -m pip install oracle-ads[pii]==2.9 + odsc conda publish -s /home/datascience/conda/ads_pii_v1_0 + +Ensure that you have properly configured your conda pack namespace and bucket in the Launcher -> Settings -> Object Storage Settings. For more details, see :doc:`ADS Conda Set Up <../../cli/opctl/configure>` diff --git a/docs/source/user_guide/operators/pii_operator/pii.rst b/docs/source/user_guide/operators/pii_operator/pii.rst new file mode 100644 index 000000000..92cc47254 --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/pii.rst @@ -0,0 +1,132 @@ +============= +Configure PII +============= + +Let's explore each line of the pii.yaml so we can better understand options for extending and customizing the operator to our use case. + +Here is an example pii.yaml with every parameter specified: + +.. code-block:: yaml + + kind: operator + type: pii + version: v1 + spec: + output_directory: + url: oci://my-bucket@my-tenancy/results + name: mydata-out.csv + report: + report_filename: report.html + show_rows: 10 + show_sensitive_content: true + input_data: + url: oci://my-bucket@my-tenancy/mydata.csv + target_column: target + detectors: + - name: default.phone + action: anonymize + + +* **Kind**: The yaml file always starts with ``kind: operator``. There are many other kinds of yaml files that can be run by ``ads opctl``, so we need to specify this is an operator. +* **Type**: The type of operator is ``pii``. +* **Version**: The only available version is ``v1``. +* **Spec**: Spec contains the bulk of the information for the specific problem. + * **input_data**: This dictionary contains the details for how to read the input data. + * **url**: Insert the URI for the dataset if it's on object storage, using the URI pattern ``oci://<bucket>@<namespace>/path/to/data.csv``. + * **target_column**: This string specifies the name of the column where the user data is within the input data. + * **detectors**: This list contains the details for each detector and the action that will be taken. + * **name**: The string specifies the name of the detector. The format should be ``<detector>.<entity>``. Check :ref:`Configure Detector ` for more details. + * **action**: The string specifies the way to process the detected entity. Defaults to ``mask``. + * **output_directory**: This dictionary contains the details for where to put the output artifacts. The directory need not exist, but it must be accessible by the Operator during runtime. + * **url**: Insert the URI for the dataset if it's on object storage, using the URI pattern ``oci://<bucket>@<namespace>/subfolder/``. + * **name**: The string specifies the name of the processed data file. + + * **report**: (optional) This dictionary specifies details for the generated report. + * **report_filename**: Placed into the output_directory location. Defaults to ``report.html``. + * **show_sensitive_content**: Whether to show sensitive content in the report. Defaults to ``false``. + * **show_rows**: The number of rows to show in the report. + + +.. _config_detector: + +Configure Detector +------------------ + +A detector consists of a ``name`` and an ``action``. The **name** parameter defines the detector that will be used, and the **action** parameter defines the way to process the entity. + +Configure Name +~~~~~~~~~~~~~~ + +We currently support the following types of detectors: + +* default +* spacy + +Default +^^^^^^^ + +Here scrubadub's pre-defined detector is used.
You can designate the name in the format of ``default.<entity>`` (e.g., ``default.phone``). Check the supported detectors from `scrubadub `_. + +.. note:: + + If you want to de-identify `address` entities with this tool, `scrubadub_address` is required. + You will need to follow the `instructions`_ to install the required dependencies. + + .. _instructions: https://scrubadub.readthedocs.io/en/stable/addresses.html/ + + +spaCy +^^^^^ + +To use spaCy's NER to identify entities, you can designate the name in the format of ``spacy.<model>.<entity>`` (e.g., ``spacy.en_core_web_sm.person``). +The "entity" value can correspond to any entity that spaCy recognizes. For a list of available models and entities, please refer to the `spaCy documentation `_. + + + +Configure Action +~~~~~~~~~~~~~~~~ + +We currently support the following types of actions: + +* mask +* remove +* anonymize + +Mask +^^^^ + +The ``mask`` action is used to mask the detected entity with the name of the entity type. It replaces the entity with a placeholder. For example, with the following configured detector: + +.. code-block:: yaml + + name: spacy.en_core_web_sm.person + action: mask + +After processing, the input text "Hi, my name is John Doe." will become "Hi, my name is {{NAME}}." + +Remove +^^^^^^ + +The ``remove`` action is used to delete the detected entity from the text. It completely removes the entity without replacement. For example, with the following configured detector: + +.. code-block:: yaml + + name: spacy.en_core_web_sm.person + action: remove + +After processing, the input text "Hi, my name is John Doe." will become "Hi, my name is ." + + +Anonymize +^^^^^^^^^ + +The ``anonymize`` action can be used to obfuscate the detected sensitive information. +Currently, we provide context-aware anonymization for name, email, and number-like entities. +For example, with the following configured detector: + +.. code-block:: yaml + + name: spacy.en_core_web_sm.person + action: anonymize + +After processing, the input text "Hi, my name is John Doe." will become "Hi, my name is Joe Blow." diff --git a/docs/source/user_guide/operators/pii_operator/yaml_schema.rst b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst new file mode 100644 index 000000000..6a887b5e1 --- /dev/null +++ b/docs/source/user_guide/operators/pii_operator/yaml_schema.rst @@ -0,0 +1,11 @@ +.. _pii-yaml-schema: + +=========== +YAML Schema +=========== + +The following is the YAML schema for validating the YAML using `Cerberus `_: + +..
literalinclude:: ../../../../../ads/opctl/operator/lowcode/pii/schema.yaml + :language: yaml + :linenos: diff --git a/pyproject.toml b/pyproject.toml index beabf51dc..c8caf66bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ classifiers = [ # In dependencies se "; platform_machine == 'aarch64'" to specify ARM underlying platform # Copied from install_requires list in setup.py, setup.py got removed in favor of this config file dependencies = [ + "PyYAML>=6", # pyyaml 5.4 is broken with cython 3 "asteval>=0.9.25", "cerberus>=1.3.4", "cloudpickle>=1.6.0", @@ -67,7 +68,6 @@ dependencies = [ "pandas>1.2.1,<2.1", "psutil>=5.7.2", "python_jsonschema_objects>=0.3.13", - "PyYAML>=6", # pyyaml 5.4 is broken with cython 3 "requests", "scikit-learn>=1.0", "tabulate>=0.8.9", @@ -106,7 +106,7 @@ notebook = [ "ipywidgets~=7.6.3", ] onnx = [ - "lightgbm==3.3.1", + "lightgbm", "onnx>=1.12.0", "onnxmltools>=1.10.0", "onnxruntime>=1.10.0,<1.16", # v1.16 introduced issues https://github.com/microsoft/onnxruntime/issues/17631, revealedd by unit tests @@ -124,6 +124,7 @@ opctl = [ "nbformat", "oci-cli", "py-cpuinfo", + "rich", ] optuna = [ "optuna==2.9.0", @@ -152,6 +153,34 @@ viz = [ "scipy>=1.5.4", "seaborn>=0.11.0", ] +forecast = [ + "autots[additional]", + "datapane", + "holidays==0.21.13", + "neuralprophet", + "numpy", + "oci-cli", + "optuna==2.9.0", + "oracle-ads[opctl]", + "oracle-automlx==23.2.3", + "pmdarima", + "prophet", + "shap", + "sktime", + "statsmodels", +] +pii = [ + "aiohttp", + "datapane", + "gender_guesser", + "nameparser", + "oracle_ads[opctl]", + "plotly", + "scrubadub==2.0.1", + "scrubadub_spacy", + "spacy-transformers==1.2.5", + "spacy==3.6.1", +] [project.urls] "Github" = "https://github.com/oracle/accelerated-data-science" diff --git a/test-requirements.txt b/test-requirements.txt index 53fbf3852..085909478 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -7,3 +7,5 @@ pip pytest pytest-codecov pytest-xdist +# darts +# docker diff --git a/tests/integration/opctl/operator/__init__.py b/tests/integration/opctl/operator/__init__.py new file mode 100644 index 000000000..fe904ad27 --- /dev/null +++ b/tests/integration/opctl/operator/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/integration/opctl/operator/forecast/__init__.py b/tests/integration/opctl/operator/forecast/__init__.py new file mode 100644 index 000000000..fe904ad27 --- /dev/null +++ b/tests/integration/opctl/operator/forecast/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/unitary/default_setup/common/test_common_utils.py b/tests/unitary/default_setup/common/test_common_utils.py index 634f844b6..9f6d341a2 100644 --- a/tests/unitary/default_setup/common/test_common_utils.py +++ b/tests/unitary/default_setup/common/test_common_utils.py @@ -386,8 +386,6 @@ def test_copy_file(self, mock_default_signer, input_params, expected_result): force_overwrite=input_params["force_overwrite"], chunk_size=input_params.get("chunk_size"), ) - if not input_params["auth"]: - mock_default_signer.assert_called() assert result_file_name.endswith(expected_result) assert os.path.exists(result_file_name) diff --git a/tests/unitary/default_setup/opctl/test_datasets.py b/tests/unitary/default_setup/opctl/test_datasets.py new file mode 100644 index 000000000..e4e974ac4 --- /dev/null +++ b/tests/unitary/default_setup/opctl/test_datasets.py @@ -0,0 +1,201 @@ +# #!/usr/bin/env python + +# # Copyright (c) 2023 Oracle and/or its affiliates. +# # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +# # from darts import datasets as d_datasets +# # import yaml +# # import tempfile +# # import subprocess +# # import pandas as pd +# # import pytest +# # from time import sleep, time +# # from copy import deepcopy +# # from pathlib import Path +# # import random + + +# DATASETS_LIST = [ +# "AirPassengersDataset", +# "AusBeerDataset", +# "AustralianTourismDataset", +# "ETTh1Dataset", +# # 'ETTh2Dataset', +# # 'ETTm1Dataset', +# # 'ETTm2Dataset', +# # 'ElectricityDataset', +# "EnergyDataset", +# "ExchangeRateDataset", +# "GasRateCO2Dataset", +# "HeartRateDataset", +# "ILINetDataset", +# "IceCreamHeaterDataset", +# "MonthlyMilkDataset", +# "MonthlyMilkIncompleteDataset", +# "SunspotsDataset", +# "TaylorDataset", +# "TemperatureDataset", +# "TrafficDataset", +# "USGasolineDataset", +# "UberTLCDataset", +# "WeatherDataset", +# "WineDataset", +# "WoolyDataset", +# ] + +# TEMPLATE_YAML = { +# "kind": "operator", +# "type": "forecast", +# "version": "v1", +# "spec": { +# "historical_data": { +# "url": None, +# }, +# "output_directory": { +# "url": "results", +# }, +# "model": None, +# "target_column": None, +# "datetime_column": { +# "name": None, +# }, +# "target_category_columns": [], +# "horizon": None, +# "generate_explanations": False, +# }, +# } + + +# PERIODS = 5 +# MAX_ADDITIONAL_COLS = 3 +# SAMPLE_FRACTION = 1 + +# parameters_short = [] + +# for dataset_i in DATASETS_LIST[2:3] + [DATASETS_LIST[-2]]: +# for model in [ +# "arima", +# "automlx", +# "prophet", +# "neuralprophet", +# "autots", +# "auto", +# ]: # ["arima", "automlx", "prophet", "neuralprophet", "autots", "auto"] +# parameters_short.append((model, dataset_i)) + + +# @pytest.mark.parametrize("model, dataset_name", parameters_short) +# def test_load_datasets(model, dataset_name): +# dataset_i = getattr(d_datasets, dataset_name)().load() +# datetime_col = dataset_i.time_index.name + +# columns = dataset_i.components +# target = dataset_i[columns[0]][:-PERIODS] +# test = dataset_i[columns[0]][-PERIODS:] + +# print(dataset_name, len(columns), len(target)) +# with tempfile.TemporaryDirectory() as tmpdirname: +# historical_data_path = f"{tmpdirname}/primary_data.csv" +# additional_data_path = f"{tmpdirname}/add_data.csv" +# test_data_path = f"{tmpdirname}/test_data.csv" +# output_data_path = f"{tmpdirname}/results" +# yaml_i = deepcopy(TEMPLATE_YAML) +# 
generate_train_metrics = True # bool(random.getrandbits(1)) + +# # TODO: Open bug ticket so that series is not required +# df_i = target.pd_dataframe().reset_index() +# df_i["Series"] = "A" +# if model == "automlx" and dataset_name == "AustralianTourismDataset": +# df_i[datetime_col] = pd.to_datetime( +# [f"{x+1:03d}" for x in df_i[datetime_col]], format="%j" +# ) + +# df_i.to_csv(historical_data_path, index=False) +# # .sample(frac=SAMPLE_FRACTION).sort_values(by=datetime_col) + +# test_df = test.pd_dataframe().reset_index() +# test_df["Series"] = "A" +# if model == "automlx" and dataset_name == "AustralianTourismDataset": +# test_df[datetime_col] = pd.to_datetime( +# [f"{x+1:03d}" for x in test_df[datetime_col]], format="%j" +# ) +# test_df.to_csv(test_data_path, index=False) + +# if len(columns) > 1: +# additional_cols = columns[1 : min(len(columns), MAX_ADDITIONAL_COLS)] +# additional_data = dataset_i[list(additional_cols)] +# df_additional = additional_data.pd_dataframe().reset_index() +# df_additional["Series"] = "A" +# if model == "automlx" and dataset_name == "AustralianTourismDataset": +# df_additional[datetime_col] = pd.to_datetime( +# [f"{x+1:03d}" for x in df_additional[datetime_col]], format="%j" +# ) +# df_additional.to_csv(additional_data_path, index=False) +# yaml_i["spec"]["additional_data"] = {"url": additional_data_path} + +# yaml_i["spec"]["historical_data"]["url"] = historical_data_path +# yaml_i["spec"]["test_data"] = {"url": test_data_path} +# yaml_i["spec"]["output_directory"]["url"] = output_data_path +# yaml_i["spec"]["model"] = model +# yaml_i["spec"]["target_column"] = columns[0] +# yaml_i["spec"]["datetime_column"]["name"] = datetime_col +# yaml_i["spec"]["target_category_columns"] = ["Series"] +# yaml_i["spec"]["horizon"] = PERIODS +# if ( +# yaml_i["spec"].get("additional_data") is not None +# and model != "neuralprophet" +# ): +# yaml_i["spec"]["generate_explanations"] = True +# if generate_train_metrics: +# yaml_i["spec"]["generate_metrics"] = generate_train_metrics +# if model == "autots": +# yaml_i["spec"]["model_kwargs"] = {"model_list": "superfast"} +# if model == "automlx": +# yaml_i["spec"]["model_kwargs"] = {"time_budget": 1} + +# forecast_yaml_filename = f"{tmpdirname}/forecast.yaml" +# with open(f"{tmpdirname}/forecast.yaml", "w") as f: +# f.write(yaml.dump(yaml_i)) +# sleep(0.5) +# subprocess.run( +# f"ads operator run -f {forecast_yaml_filename} --debug", shell=True +# ) +# sleep(0.1) +# subprocess.run(f"ls -a {output_data_path}", shell=True) +# # if yaml_i["spec"]["generate_explanations"]: +# # glb_expl = pd.read_csv(f"{tmpdirname}/results/global_explanation.csv") +# # print(glb_expl) +# # loc_expl = pd.read_csv(f"{tmpdirname}/results/local_explanation.csv") +# # print(loc_expl) + +# test_metrics = pd.read_csv(f"{tmpdirname}/results/test_metrics.csv") +# print(test_metrics) +# train_metrics = pd.read_csv(f"{tmpdirname}/results/metrics.csv") +# print(train_metrics) +# return test_metrics.iloc[0][f"{columns[0]}_A"] + + +# if __name__ == "__main__": +# failed_runs = [] +# results = dict() +# timings = dict() +# for dataset_name in DATASETS_LIST[2:3]: # random.sample(DATASETS_LIST, 2): +# results[dataset_name] = dict() +# timings[dataset_name] = dict() +# for m in [ +# "automlx" +# ]: # ["arima", "automlx", "prophet", "neuralprophet", "autots", "auto"]: +# start_time = time() +# try: +# results[dataset_name][m] = test_load_datasets( +# model=m, dataset_name=dataset_name +# ) +# except Exception as e: +# print(f"Failed with the following error! 
{e}") +# failed_runs.append((dataset_name, m)) +# elapsed = time() - start_time +# timings[dataset_name][m] = elapsed +# print(f"Failed Runs: {failed_runs}") +# print(f"results: {pd.DataFrame(results)}") +# print(f"timings: {timings}") +# pd.DataFrame(results).to_csv("~/Desktop/AUTO_benchmark_darts.csv") diff --git a/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlow.yaml b/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlow.yaml index 87aa8a750..05ee1f89a 100644 --- a/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlow.yaml +++ b/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlow.yaml @@ -17,17 +17,20 @@ spec: scriptBucket: oci://bucket@namespace/prefix sparkVersion: 3.2.1 type: dataFlow - name: '{Job name. For MLflow, it will be replaced with the Project name}' + name: '{Job name. For MLflow and Operator will be auto generated}' runtime: kind: runtime spec: + args: [] + conda: + type: published + uri: oci://bucket@namespace/conda_environments/test/conda/slug condaAuthType: resource_principal configuration: spark.driverEnv.env_key: env_value - freeformTags: - tag_name: tag_value + freeformTags: {} overwrite: true - scriptBucket: '{The object storage bucket to save a script. Example: oci://@/}' - scriptPathURI: '{Path to the executable script. For MLflow, it will be replaced - with the CMD}' + scriptBucket: oci://bucket@namespace/prefix + scriptPathURI: '{Path to the executable script. For MLflow and Operator will + auto generated}' type: dataFlow diff --git a/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlowNotebook.yaml b/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlowNotebook.yaml index 579f20cd2..7aeb2fa6a 100644 --- a/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlowNotebook.yaml +++ b/tests/unitary/with_extras/opctl/test_files/dataflow_dataFlowNotebook.yaml @@ -17,10 +17,14 @@ spec: scriptBucket: oci://bucket@namespace/prefix sparkVersion: 3.2.1 type: dataFlow - name: '{Job name. For MLflow, it will be replaced with the Project name}' + name: '{Job name. For MLflow and Operator will be auto generated}' runtime: kind: runtime spec: + args: [] + conda: + type: published + uri: oci://bucket@namespace/conda_environments/test/conda/slug condaAuthType: resource_principal configuration: spark.driverEnv.env_key: env_value @@ -28,6 +32,5 @@ spec: tag_name: tag_value overwrite: true scriptBucket: '{The object storage bucket to save a script. Example: oci://@/}' - scriptPathURI: '{Path to the executable script. For MLflow, it will be replaced - with the CMD}' + scriptPathURI: '{Path to the executable script. For MLflow and Operator will be auto generated}' type: dataFlowNotebook diff --git a/tests/unitary/with_extras/opctl/test_files/job_container.yaml b/tests/unitary/with_extras/opctl/test_files/job_container.yaml index 05454f62b..f987f7da5 100644 --- a/tests/unitary/with_extras/opctl/test_files/job_container.yaml +++ b/tests/unitary/with_extras/opctl/test_files/job_container.yaml @@ -18,19 +18,16 @@ spec: shapeName: VM.Standard2.1 subnetId: ocid1.subnet.oc1.iad. type: dataScienceJob - name: '{Job name. For MLflow, it will be replaced with the Project name}' + name: '{Job name. For MLflow and Operator will be auto generated}' runtime: kind: runtime spec: - cmd: '{Container CMD. For MLFlow, it will be replaced with the Project CMD}' + args: [] + cmd: '{Container CMD. 
For MLflow and Operator will be auto generated}' entrypoint: - bash - --login - -c - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value - image: iad.ocir.io/namespace/image:tag + freeformTags: {} + image: /image:latest type: container diff --git a/tests/unitary/with_extras/opctl/test_files/job_gitPython.yaml b/tests/unitary/with_extras/opctl/test_files/job_gitPython.yaml index a663c301e..a1ea95a35 100644 --- a/tests/unitary/with_extras/opctl/test_files/job_gitPython.yaml +++ b/tests/unitary/with_extras/opctl/test_files/job_gitPython.yaml @@ -1,6 +1,6 @@ # This YAML specification was auto generated by the `ads opctl init` command. # The more details about the jobs YAML specification can be found in the ADS documentation: -# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html kind: job @@ -18,18 +18,15 @@ spec: shapeName: VM.Standard2.1 subnetId: ocid1.subnet.oc1.iad. type: dataScienceJob - name: '{Job name. For MLflow, it will be replaced with the Project name}' + name: '{Job name. For MLflow and Operator will be auto generated}' runtime: kind: runtime spec: + args: [] conda: - type: published - uri: '{Path to the custom conda environment. Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint script. For MLflow, it will be replaced with the CMD}' - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value - url: '{Git URI. For MLflow, it will be replaced with the Project URI}' + slug: '' + type: service + entrypoint: '{For MLflow and Operator will be auto generated}' + freeformTags: {} + url: '{Git URI. For MLflow and Operator will be auto generated}' type: gitPython diff --git a/tests/unitary/with_extras/opctl/test_files/job_notebook.yaml b/tests/unitary/with_extras/opctl/test_files/job_notebook.yaml index 4487cd050..69775ae36 100644 --- a/tests/unitary/with_extras/opctl/test_files/job_notebook.yaml +++ b/tests/unitary/with_extras/opctl/test_files/job_notebook.yaml @@ -1,6 +1,6 @@ # This YAML specification was auto generated by the `ads opctl init` command. # The more details about the jobs YAML specification can be found in the ADS documentation: -# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html kind: job @@ -18,23 +18,18 @@ spec: shapeName: VM.Standard2.1 subnetId: ocid1.subnet.oc1.iad. type: dataScienceJob - name: '{Job name. For MLflow, it will be replaced with the Project name}' + name: '{Job name. For MLflow and Operator will be auto generated}' runtime: kind: runtime spec: + args: [] conda: - type: published - uri: '{Path to the custom conda environment. Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint notebook. For MLflow, it will be replaced with the - CMD}' - env: - - name: env_name - value: env_value + slug: '' + type: service + entrypoint: '{Entrypoint notebook. For MLflow, it will be replaced with the CMD}' excludeTags: - tag1 - freeformTags: - tag_name: tag_value + freeformTags: {} notebookEncoding: utf-8 - source: '{Path to the source code directory. For MLflow, it will be replaced - with the path to the project}' + source: '{Path to the source code directory. 
For MLflow, it will be replaced with the path to the project}' type: notebook diff --git a/tests/unitary/with_extras/opctl/test_files/job_python.yaml b/tests/unitary/with_extras/opctl/test_files/job_python.yaml index 8bc9ad481..aba6287a3 100644 --- a/tests/unitary/with_extras/opctl/test_files/job_python.yaml +++ b/tests/unitary/with_extras/opctl/test_files/job_python.yaml @@ -1,6 +1,6 @@ # This YAML specification was auto generated by the `ads opctl init` command. # The more details about the jobs YAML specification can be found in the ADS documentation: -# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html kind: job @@ -18,20 +18,17 @@ spec: shapeName: VM.Standard2.1 subnetId: ocid1.subnet.oc1.iad. type: dataScienceJob - name: '{Job name. For MLflow, it will be replaced with the Project name}' + name: '{Job name. For MLflow and Operator will be auto generated}' runtime: kind: runtime spec: + args: [] conda: - type: published - uri: '{Path to the custom conda environment. Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint script. For MLflow, it will be replaced with the CMD}' - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value - scriptPathURI: '{Path to the script. For MLflow, it will be replaced with the - path to the project}' - workingDir: '{For MLflow the project folder will be used.}' + slug: conda_slug + type: service + entrypoint: '{For MLflow and Operator will be auto generated}' + freeformTags: {} + scriptPathURI: '{Path to the script. For MLflow and Operator will be auto + generated}' + workingDir: '{For MLflow and Operator will be auto generated}' type: python diff --git a/tests/unitary/with_extras/opctl/test_files/job_script.yaml b/tests/unitary/with_extras/opctl/test_files/job_script.yaml index a8d60a61c..7b3d6f21d 100644 --- a/tests/unitary/with_extras/opctl/test_files/job_script.yaml +++ b/tests/unitary/with_extras/opctl/test_files/job_script.yaml @@ -1,6 +1,6 @@ # This YAML specification was auto generated by the `ads opctl init` command. # The more details about the jobs YAML specification can be found in the ADS documentation: -# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html kind: job @@ -18,19 +18,15 @@ spec: shapeName: VM.Standard2.1 subnetId: ocid1.subnet.oc1.iad. type: dataScienceJob - name: '{Job name. For MLflow, it will be replaced with the Project name}' + name: '{Job name. For MLflow and Operator will be auto generated}' runtime: kind: runtime spec: + args: [] conda: - type: published - uri: '{Path to the custom conda environment. Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint script. For MLflow, it will be replaced with the CMD}' - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value - scriptPathURI: '{Path to the script. For MLflow, it will be replaced with the - path to the project}' + slug: conda_slug + type: service + entrypoint: '{For MLflow and Operator will be auto generated}' + freeformTags: {} + scriptPathURI: '{Path to the script. 
For MLflow and Operator will be auto generated}' type: script diff --git a/tests/unitary/with_extras/opctl/test_files/pipeline_container.yaml b/tests/unitary/with_extras/opctl/test_files/pipeline_container.yaml index 3f29e9e0e..09c595217 100644 --- a/tests/unitary/with_extras/opctl/test_files/pipeline_container.yaml +++ b/tests/unitary/with_extras/opctl/test_files/pipeline_container.yaml @@ -27,19 +27,13 @@ spec: runtime: kind: runtime spec: - args: - - --key1 - - val1 - cmd: '{Container CMD. For MLFlow, it will be replaced with the Project CMD}' + args: [] + cmd: '{Container CMD. For MLflow and Operator will be auto generated}' entrypoint: - bash - --login - -c - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value + freeformTags: {} image: iad.ocir.io/namespace/image:tag type: container type: pipeline diff --git a/tests/unitary/with_extras/opctl/test_files/pipeline_gitPython.yaml b/tests/unitary/with_extras/opctl/test_files/pipeline_gitPython.yaml index 2b5992800..e7f6d2b7b 100644 --- a/tests/unitary/with_extras/opctl/test_files/pipeline_gitPython.yaml +++ b/tests/unitary/with_extras/opctl/test_files/pipeline_gitPython.yaml @@ -27,19 +27,12 @@ spec: runtime: kind: runtime spec: - args: - - --key1 - - val1 + args: [] conda: - type: published - uri: '{Path to the custom conda environment. Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint script. For MLflow, it will be replaced with the - CMD}' - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value - url: '{Git URI. For MLflow, it will be replaced with the Project URI}' + slug: '' + type: service + entrypoint: '{For MLflow and Operator will be auto generated}' + freeformTags: {} + url: '{Git URI. For MLflow and Operator will be auto generated}' type: gitPython type: pipeline diff --git a/tests/unitary/with_extras/opctl/test_files/pipeline_notebook.yaml b/tests/unitary/with_extras/opctl/test_files/pipeline_notebook.yaml index 953ea4d0c..697cdcc3d 100644 --- a/tests/unitary/with_extras/opctl/test_files/pipeline_notebook.yaml +++ b/tests/unitary/with_extras/opctl/test_files/pipeline_notebook.yaml @@ -27,23 +27,15 @@ spec: runtime: kind: runtime spec: - args: - - --key1 - - val1 + args: [] conda: - type: published - uri: '{Path to the custom conda environment. Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint notebook. For MLflow, it will be replaced with - the CMD}' - env: - - name: env_name - value: env_value + slug: '' + type: service + entrypoint: '{Entrypoint notebook. For MLflow, it will be replaced with the CMD}' excludeTags: - tag1 - freeformTags: - tag_name: tag_value + freeformTags: {} notebookEncoding: utf-8 - source: '{Path to the source code directory. For MLflow, it will be replaced - with the path to the project}' + source: '{Path to the source code directory. For MLflow, it will be replaced with the path to the project}' type: notebook type: pipeline diff --git a/tests/unitary/with_extras/opctl/test_files/pipeline_python.yaml b/tests/unitary/with_extras/opctl/test_files/pipeline_python.yaml index 771fc9afd..319f51938 100644 --- a/tests/unitary/with_extras/opctl/test_files/pipeline_python.yaml +++ b/tests/unitary/with_extras/opctl/test_files/pipeline_python.yaml @@ -27,23 +27,14 @@ spec: runtime: kind: runtime spec: - args: - - --key1 - - val1 - - --key1 - - val1 + args: [] conda: - type: published - uri: '{Path to the custom conda environment. 
Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint script. For MLflow, it will be replaced with the - CMD}' - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value - scriptPathURI: '{Path to the script. For MLflow, it will be replaced with - the path to the project}' - workingDir: '{For MLflow the project folder will be used.}' + slug: '' + type: service + entrypoint: '{For MLflow and Operator will be auto generated}' + freeformTags: {} + scriptPathURI: '{Path to the script. For MLflow and Operator will be auto + generated}' + workingDir: '{For MLflow and Operator will be auto generated}' type: python type: pipeline diff --git a/tests/unitary/with_extras/opctl/test_files/pipeline_script.yaml b/tests/unitary/with_extras/opctl/test_files/pipeline_script.yaml index ef7bd7063..cb8273dba 100644 --- a/tests/unitary/with_extras/opctl/test_files/pipeline_script.yaml +++ b/tests/unitary/with_extras/opctl/test_files/pipeline_script.yaml @@ -27,22 +27,12 @@ spec: runtime: kind: runtime spec: - args: - - --key1 - - val1 - - --key1 - - val1 + args: [] conda: - type: published - uri: '{Path to the custom conda environment. Example: oci://your_bucket@namespace/object_name' - entrypoint: '{Entrypoint script. For MLflow, it will be replaced with the - CMD}' - env: - - name: env_name - value: env_value - freeformTags: - tag_name: tag_value - scriptPathURI: '{Path to the script. For MLflow, it will be replaced with - the path to the project}' + slug: '' + type: service + entrypoint: '{For MLflow and Operator will be auto generated}' + freeformTags: {} + scriptPathURI: '{Path to the script. For MLflow and Operator will be auto generated}' type: script type: pipeline diff --git a/tests/unitary/with_extras/opctl/test_opctl_cmds.py b/tests/unitary/with_extras/opctl/test_opctl_cmds.py index 3c614594a..6c4d6e9e0 100644 --- a/tests/unitary/with_extras/opctl/test_opctl_cmds.py +++ b/tests/unitary/with_extras/opctl/test_opctl_cmds.py @@ -38,6 +38,7 @@ def test_configure(self, confirm, prompt, monkeypatch): prompt.side_effect = ( [ os.path.join(td, "ads_ops"), + "api_key", os.path.join(td, "oci_config"), "DEFAULT", ".", @@ -109,6 +110,7 @@ def test_configure_in_notebook_session(self, confirm, prompt, monkeypatch): prompt.side_effect = ( [ os.path.join(td, "ads_ops"), + "api_key", os.path.join(td, "oci_config"), "DEFAULT", ".", diff --git a/tests/unitary/with_extras/opctl/test_opctl_config.py b/tests/unitary/with_extras/opctl/test_opctl_config.py index 9c23342d8..d5ae4e71d 100644 --- a/tests/unitary/with_extras/opctl/test_opctl_config.py +++ b/tests/unitary/with_extras/opctl/test_opctl_config.py @@ -182,7 +182,7 @@ def test_config_flex_shape_details(self): "infrastructure": { "compartment_id": "oci.compartmentid.abcd", "project_id": "oci.projectid.abcd", - "shape_name": "VM.Standard.E2.4" + "shape_name": "VM.Standard.E2.4", }, } @@ -201,19 +201,19 @@ def test_config_flex_shape_details(self): "infrastructure": { "compartment_id": "oci.compartmentid.abcd", "project_id": "oci.projectid.abcd", - "shape_name": "VM.Standard.E2.4" + "shape_name": "VM.Standard.E2.4", }, } - + config_one["infrastructure"]["shape_name"] = "VM.Standard.E3.Flex" m = ConfigMerger(config_one) with pytest.raises( - ValueError, + ValueError, match="Parameters `ocpus` and `memory_in_gbs` must be provided for using flex shape. " - "Call `ads opctl config` to specify." 
+ "Call `ads opctl configure` to specify.", ): - m._config_flex_shape_details() + m._config_flex_shape_details() config_one["infrastructure"]["ocpus"] = 2 config_one["infrastructure"]["memory_in_gbs"] = 24 @@ -233,10 +233,7 @@ def test_config_flex_shape_details(self): "compartment_id": "oci.compartmentid.abcd", "project_id": "oci.projectid.abcd", "shape_name": "VM.Standard.E3.Flex", - "shape_config_details": { - "ocpus": 2, - "memory_in_gbs": 24 - } + "shape_config_details": {"ocpus": 2, "memory_in_gbs": 24}, }, } @@ -253,20 +250,19 @@ def test_config_flex_shape_details(self): "compartment_id": "oci.compartmentid.abcd", "project_id": "oci.projectid.abcd", "executor_shape": "VM.Standard.E3.Flex", - "driver_shape": "VM.Standard.E3.Flex" + "driver_shape": "VM.Standard.E3.Flex", }, } m = ConfigMerger(config_two) with pytest.raises( - ValueError, + ValueError, match="Parameters driver_shape_memory_in_gbs must be provided for using flex shape. " - "Call `ads opctl config` to specify." + "Call `ads opctl configure` to specify.", ): m._config_flex_shape_details() - config_two["infrastructure"]["driver_shape_memory_in_gbs"] = 36 config_two["infrastructure"]["driver_shape_ocpus"] = 4 config_two["infrastructure"]["executor_shape_memory_in_gbs"] = 48 @@ -287,18 +283,13 @@ def test_config_flex_shape_details(self): "compartment_id": "oci.compartmentid.abcd", "project_id": "oci.projectid.abcd", "executor_shape": "VM.Standard.E3.Flex", - "executor_shape_config": { - "ocpus": 5, - "memory_in_gbs": 48 - }, + "executor_shape_config": {"ocpus": 5, "memory_in_gbs": 48}, "driver_shape": "VM.Standard.E3.Flex", - "driver_shape_config": { - "ocpus": 4, - "memory_in_gbs": 36 - } + "driver_shape_config": {"ocpus": 4, "memory_in_gbs": 36}, }, } + class TestConfigResolver: def test_resolve_operator_name(self): config = {"name": "name1", "execution": {"operator_name": "name2"}} diff --git a/tests/unitary/with_extras/opctl/test_opctl_dataflow_backend.py b/tests/unitary/with_extras/opctl/test_opctl_dataflow_backend.py index da6eddde2..940f28bd8 100644 --- a/tests/unitary/with_extras/opctl/test_opctl_dataflow_backend.py +++ b/tests/unitary/with_extras/opctl/test_opctl_dataflow_backend.py @@ -107,7 +107,7 @@ def test_watch(self, mock_from_ocid, mock_watch): @pytest.mark.parametrize( "runtime_type", - ["dataFlow", "dataFlowNotebook"], + ["dataFlow"], ) def test_init(self, runtime_type, monkeypatch): """Ensures that starter YAML can be generated for every supported runtime of the Data Flow.""" @@ -123,6 +123,7 @@ def test_init(self, runtime_type, monkeypatch): uri=test_yaml_uri, overwrite=False, runtime_type=runtime_type, + conda_slug="test/conda/slug", ) with open(test_yaml_uri, "r") as stream: @@ -130,4 +131,4 @@ def test_init(self, runtime_type, monkeypatch): with open(expected_yaml_uri, "r") as stream: expected_yaml_dict = yaml.safe_load(stream) - assert test_yaml_dict == expected_yaml_dict \ No newline at end of file + assert test_yaml_dict == expected_yaml_dict diff --git a/tests/unitary/with_extras/opctl/test_opctl_decorators.py b/tests/unitary/with_extras/opctl/test_opctl_decorators.py new file mode 100644 index 000000000..fc295354b --- /dev/null +++ b/tests/unitary/with_extras/opctl/test_opctl_decorators.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import pytest +from ads.opctl.decorator.common import validate_environment, OpctlEnvironmentError +from unittest.mock import patch, MagicMock + + +class TestOpctlDecorators: + """Tests the all OPCTL common decorators.""" + + def test_validate_environment_success(self): + """Tests validating environment decorator.""" + + @validate_environment + def mock_function(): + return "SUCCESS" + + assert mock_function() == "SUCCESS" + + def test_validate_environment_fail(self): + """Tests validating environment decorator fails.""" + + @validate_environment + def mock_function(): + return "SUCCESS" + + import docker + + with patch.object( + docker, + "from_env", + return_value=MagicMock( + "version", + return_value=MagicMock(side_effect=ValueError("Something went wrong")), + ), + ): + with pytest.raises(OpctlEnvironmentError): + assert mock_function() diff --git a/tests/unitary/with_extras/operator/__init__.py b/tests/unitary/with_extras/operator/__init__.py new file mode 100644 index 000000000..fe904ad27 --- /dev/null +++ b/tests/unitary/with_extras/operator/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/unitary/with_extras/operator/forecast/__init__.py b/tests/unitary/with_extras/operator/forecast/__init__.py new file mode 100644 index 000000000..fe904ad27 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/unitary/with_extras/operator/forecast/benchmarks/benchmark_datasets.py b/tests/unitary/with_extras/operator/forecast/benchmarks/benchmark_datasets.py new file mode 100755 index 000000000..f96ec61ab --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/benchmarks/benchmark_datasets.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +import copy + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from ads.opctl.operator.lowcode.forecast.operator_config import * +from ads.opctl.operator.lowcode.forecast.model.factory import ForecastOperatorModelFactory +import pandas as pd +from ads.opctl import logger +import os + +if __name__ == '__main__': + """Benchmarks for datasets.""" + + try: + data_dir = os.environ["OCI__FORECASTING_DATA_DIR"] + except: + raise ValueError("Please set the environment variable `OCI__FORECASTING_DATA_DIR` to the location of the " + "forecasting datasets") + + smape = SupportedMetrics.SMAPE + mape = SupportedMetrics.MAPE + rmse = SupportedMetrics.RMSE + + prophet = 'prophet' + arima = 'arima' + automlx = 'automlx' + neuralprophet = 'neuralprophet' + + benchmark_metrics = [smape, mape, rmse] + + # Expected values + cust1_numbers = { + prophet: {smape: 30, mape: 10, rmse: 1780}, + arima: {smape: 20, mape: 2, rmse: 1500}, + automlx: {smape: 30, mape: 7, rmse: 1750}, + # neuralprophet: {smape: 29, mape: 9.5, rmse: 1760}, + } + + cust2_numbers = { + prophet: {smape: 18, mape: 0.5, rmse: 75}, + arima: {smape: 21, mape: 0.45, rmse: 75}, + automlx: {smape: 15, mape: 0.3, rmse: 74}, + # neuralprophet: {smape: 30, mape: 10, rmse: 1780}, + } + + datasets = { + 'cust1': cust1_numbers, + 'cust2': cust2_numbers, + } + metrics = [SupportedMetrics.SMAPE, SupportedMetrics.MAPE, SupportedMetrics.RMSE] + + for dataset in datasets: + for model in datasets[dataset]: + operator_config: ForecastOperatorConfig = ForecastOperatorConfig.from_yaml( + uri=os.path.join(data_dir, dataset, 'forecast.yaml') + ) + operator_config.spec.model = model + operator_config.spec.output_directory = OutputDirectory( + url=os.path.join(operator_config.spec.output_directory.url, model) + ) + + # Training and generating the model outputs + ForecastOperatorModelFactory.get_model(operator_config).generate_report() + + # Reading holdout errors. + metrics_df = pd.read_csv(os.path.join(data_dir, dataset, 'output', model, 'metrics.csv')).set_index( + 'metrics') + metrics_dict = metrics_df.mean(axis=1).to_dict() + logger.info("{} | {} | {}".format(dataset, model, metrics_dict)) + # Actual values should be less than actual values + for metric in benchmark_metrics: + assert metrics_dict[metric] <= datasets[dataset][model][metric] + logger.info("Test completed for {} and {} model".format(dataset, model)) diff --git a/tests/unitary/with_extras/operator/forecast/bug_smash_test_suite.py b/tests/unitary/with_extras/operator/forecast/bug_smash_test_suite.py new file mode 100644 index 000000000..73d5a0d9c --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/bug_smash_test_suite.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class BugSmashTestSuite: + """Tests the bugs defined here.""" + + pass diff --git a/tests/unitary/with_extras/operator/forecast/test_cmd.py b/tests/unitary/with_extras/operator/forecast/test_cmd.py new file mode 100644 index 000000000..a727b4710 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_cmd.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestCMD: + def test_init(self): + """Tests generating a starter specification template YAML for the operator.""" diff --git a/tests/unitary/with_extras/operator/forecast/test_common_utils.py b/tests/unitary/with_extras/operator/forecast/test_common_utils.py new file mode 100644 index 000000000..4ef099829 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_common_utils.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestCommonUtils: + """Tests all common utils methods of the forecast operator.""" + + pass diff --git a/tests/unitary/with_extras/operator/forecast/test_model_arima.py b/tests/unitary/with_extras/operator/forecast/test_model_arima.py new file mode 100644 index 000000000..0f9f4135e --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_model_arima.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestArimaOperatorModel: + """Tests the ARIMA operator model class.""" + + pass diff --git a/tests/unitary/with_extras/operator/forecast/test_model_automlx.py b/tests/unitary/with_extras/operator/forecast/test_model_automlx.py new file mode 100644 index 000000000..037e464e6 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_model_automlx.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import unittest +from unittest.mock import patch, Mock +import pandas as pd +import datapane as dp +from ads.opctl.operator.lowcode.forecast.model.automlx import AutoMLXOperatorModel +from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ForecastDatasets + +from ads.opctl.operator.lowcode.forecast.operator_config import ( + ForecastOperatorConfig, + ForecastOperatorSpec, + DateTimeColumn, + InputData, +) + + +class TestAutoMLXOperatorModel(unittest.TestCase): + """Tests the automlx operator model class.""" + + def setUp(self): + self.target_columns = ["Sales_Product Group 107", "Sales_Product Group 108"] + self.target_category_columns = ["PPG_Code"] + self.test_filename = "test.csv" + self.primary_data = pd.DataFrame( + { + "PPG_Code": [ + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + ], + "last_day_of_week": [ + "12-01-2019", + "19-01-2019", + "05-01-2019", + "26-01-2019", + "02-02-2019", + "09-02-2019", + "16-02-2019", + "23-02-2019", + "02-03-2019", + "09-03-2019", + "16-03-2019", + ], + "Sales": [ + 2187.0, + 1149.0, + 2070.0, + 5958.0, + 9540.0, + 2883.0, + 968.0, + 1245.0, + 1689.0, + 1514.0, + 1083.0, + ], + } + ) + self.additional_data = pd.DataFrame( + { + "PPG_Code": [ + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + ], + "last_day_of_week": [ + "12-01-2019", + "19-01-2019", + "05-01-2019", + "26-01-2019", + "02-02-2019", + "09-02-2019", + "16-02-2019", + "23-02-2019", + "02-03-2019", + "09-03-2019", + "16-03-2019", + "23-03-2019", + ], + "pt1": [0, 0, 0, 3, 7, 4, 0, 0, 0, 0, 0, 0], + } + ) + + self.target_col = "yhat" + self.datetime_column_name = "last_day_of_week" + self.original_target_column = "Sales" + + spec = Mock(spec=ForecastOperatorSpec) + spec.target_column = self.target_col + spec.target_category_columns = self.target_category_columns + spec.target_column = self.original_target_column + spec.datetime_column = Mock(spec=DateTimeColumn) + spec.datetime_column.name = self.datetime_column_name + spec.datetime_column.format = "%d-%m-%Y" + spec.horizon = 1 + spec.tuning = None + spec.confidence_interval_width = None + spec.historical_data = Mock(spec="InputData") + spec.historical_data.url = "primary.csv" + spec.historical_data.format = None + spec.historical_data.columns = None + spec.additional_data = Mock(spec="InputData") + spec.additional_data.url = "additional.csv" + spec.additional_data.format = None + spec.additional_data.columns = None + spec.model_kwargs = {} + config = Mock(spec=ForecastOperatorConfig) + config.spec = spec + + self.config = config + + @patch("ads.opctl.operator.lowcode.forecast.utils._call_pandas_fsspec") + def test_automlx_for_unsorted_data(self, mock__call_pandas_fsspec): + mock__call_pandas_fsspec.side_effect = ( + lambda read_fn, filename, storage_options: self.primary_data + if filename == "primary.csv" + else self.additional_data + ) + datasets = ForecastDatasets(self.config) + automlx = AutoMLXOperatorModel(self.config, datasets) + + outputs = automlx._build_model() + 
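# Assumption: _build_model() returns the per-series forecast frame; an empty frame here would indicate that the deliberately unsorted fixture rows were dropped rather than re-ordered. +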
self.assertFalse(outputs.empty) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unitary/with_extras/operator/forecast/test_model_autots.py b/tests/unitary/with_extras/operator/forecast/test_model_autots.py new file mode 100644 index 000000000..39fc0f397 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_model_autots.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +import unittest +from unittest.mock import patch, Mock +import pandas as pd +import datapane as dp +import autots +from ads.opctl.operator.common.utils import _build_image, _parse_input_args +from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ForecastDatasets +from ads.opctl.operator.lowcode.forecast.model.autots import ( + AutoTSOperatorModel, + AUTOTS_MAX_GENERATION, + AUTOTS_MODELS_TO_VALIDATE, +) +from ads.opctl.operator.lowcode.forecast.operator_config import ( + ForecastOperatorConfig, + ForecastOperatorSpec, + TestData, + DateTimeColumn, + OutputDirectory, +) +from ads.opctl.operator.lowcode.forecast.const import SupportedMetrics + + +class TestAutoTSOperatorModel(unittest.TestCase): + """Tests the AutoTS operator model class.""" + + def setUp(self): + spec = Mock(spec=ForecastOperatorSpec) + spec.datetime_column = Mock(spec=DateTimeColumn) + spec.datetime_column.name = "last_day_of_week" + spec.horizon = 3 + spec.tuning = None + spec.model_kwargs = {} + spec.confidence_interval_width = 0.7 + spec.additional_data = None + self.spec = spec + + config = Mock(spec=ForecastOperatorConfig) + config.spec = self.spec + self.config = config + + datasets = Mock(spec=ForecastDatasets) + datasets.original_user_data = None + datasets.original_total_data = None + datasets.original_additional_data = None + datasets.full_data_dict = {} + datasets.target_columns = [] + datasets.categories = [] + self.datasets = datasets + + @patch("autots.AutoTS") + @patch("pandas.concat") + def test_autots_parameter_passthrough(self, mock_concat, mock_autots): + autots = AutoTSOperatorModel(self.config, self.datasets) + autots._build_model() + + # When model_kwargs does not have anything, defaults should be sent as parameters.
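+ # Assumption: the keyword set asserted below mirrors the defaults the ADS AutoTS wrapper passes to autots.AutoTS(); a change in the upstream signature would surface here as a call-args mismatch.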
+ mock_autots.assert_called_once_with( + forecast_length=self.spec.horizon, + frequency="infer", + prediction_interval=self.spec.confidence_interval_width, + max_generations=AUTOTS_MAX_GENERATION, + no_negatives=False, + constraint=None, + ensemble="auto", + initial_template="General+Random", + random_seed=2022, + holiday_country="US", + subset=None, + aggfunc="first", + na_tolerance=1, + drop_most_recent=0, + drop_data_older_than_periods=None, + model_list="fast_parallel", + transformer_list="auto", + transformer_max_depth=6, + models_mode="random", + num_validations="auto", + models_to_validate=AUTOTS_MODELS_TO_VALIDATE, + max_per_model_class=None, + validation_method="backwards", + min_allowed_train_percent=0.5, + remove_leading_zeroes=False, + prefill_na=None, + introduce_na=None, + preclean=None, + model_interrupt=True, + generation_timeout=None, + current_model_file=None, + verbose=1, + n_jobs=-1, + ) + + mock_autots.reset_mock() + + self.spec.model_kwargs = { + "forecast_length": "forecast_length_from_model_kwargs", + "frequency": "frequency_from_model_kwargs", + "prediction_interval": "prediction_interval_from_model_kwargs", + "max_generations": "max_generations_from_model_kwargs", + "no_negatives": "no_negatives_from_model_kwargs", + "constraint": "constraint_from_model_kwargs", + "ensemble": "ensemble_from_model_kwargs", + "initial_template": "initial_template_from_model_kwargs", + "random_seed": "random_seed_from_model_kwargs", + "holiday_country": "holiday_country_from_model_kwargs", + "subset": "subset_from_model_kwargs", + "aggfunc": "aggfunc_from_model_kwargs", + "na_tolerance": "na_tolerance_from_model_kwargs", + "drop_most_recent": "drop_most_recent_from_model_kwargs", + "drop_data_older_than_periods": "drop_data_older_than_periods_from_model_kwargs", + "model_list": " model_list_from_model_kwargs", + "transformer_list": "transformer_list_from_model_kwargs", + "transformer_max_depth": "transformer_max_depth_from_model_kwargs", + "models_mode": "models_mode_from_model_kwargs", + "num_validations": "num_validations_from_model_kwargs", + "models_to_validate": "models_to_validate_from_model_kwargs", + "max_per_model_class": "max_per_model_class_from_model_kwargs", + "validation_method": "validation_method_from_model_kwargs", + "min_allowed_train_percent": "min_allowed_train_percent_from_model_kwargs", + "remove_leading_zeroes": "remove_leading_zeroes_from_model_kwargs", + "prefill_na": "prefill_na_from_model_kwargs", + "introduce_na": "introduce_na_from_model_kwargs", + "preclean": "preclean_from_model_kwargs", + "model_interrupt": "model_interrupt_from_model_kwargs", + "generation_timeout": "generation_timeout_from_model_kwargs", + "current_model_file": "current_model_file_from_model_kwargs", + "verbose": "verbose_from_model_kwargs", + "n_jobs": "n_jobs_from_model_kwargs", + } + + autots._build_model() + + # All parameters in model_kwargs should be passed to autots + mock_autots.assert_called_once_with( + forecast_length=self.spec.horizon, + frequency=self.spec.model_kwargs.get("frequency"), + prediction_interval=self.spec.confidence_interval_width, + max_generations=self.spec.model_kwargs.get("max_generations"), + no_negatives=self.spec.model_kwargs.get("no_negatives"), + constraint=self.spec.model_kwargs.get("constraint"), + ensemble=self.spec.model_kwargs.get("ensemble"), + initial_template=self.spec.model_kwargs.get("initial_template"), + random_seed=self.spec.model_kwargs.get("random_seed"), + holiday_country=self.spec.model_kwargs.get("holiday_country"), + 
subset=self.spec.model_kwargs.get("subset"), + aggfunc=self.spec.model_kwargs.get("aggfunc"), + na_tolerance=self.spec.model_kwargs.get("na_tolerance"), + drop_most_recent=self.spec.model_kwargs.get("drop_most_recent"), + drop_data_older_than_periods=self.spec.model_kwargs.get( + "drop_data_older_than_periods" + ), + model_list=self.spec.model_kwargs.get("model_list"), + transformer_list=self.spec.model_kwargs.get("transformer_list"), + transformer_max_depth=self.spec.model_kwargs.get("transformer_max_depth"), + models_mode=self.spec.model_kwargs.get("models_mode"), + num_validations=self.spec.model_kwargs.get("num_validations"), + models_to_validate=self.spec.model_kwargs.get("models_to_validate"), + max_per_model_class=self.spec.model_kwargs.get("max_per_model_class"), + validation_method=self.spec.model_kwargs.get("validation_method"), + min_allowed_train_percent=self.spec.model_kwargs.get( + "min_allowed_train_percent" + ), + remove_leading_zeroes=self.spec.model_kwargs.get("remove_leading_zeroes"), + prefill_na=self.spec.model_kwargs.get("prefill_na"), + introduce_na=self.spec.model_kwargs.get("introduce_na"), + preclean=self.spec.model_kwargs.get("preclean"), + model_interrupt=self.spec.model_kwargs.get("model_interrupt"), + generation_timeout=self.spec.model_kwargs.get("generation_timeout"), + current_model_file=self.spec.model_kwargs.get("current_model_file"), + verbose=self.spec.model_kwargs.get("verbose"), + n_jobs=self.spec.model_kwargs.get("n_jobs"), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unitary/with_extras/operator/forecast/test_model_base_model.py b/tests/unitary/with_extras/operator/forecast/test_model_base_model.py new file mode 100644 index 000000000..dfd167bdc --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_model_base_model.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import unittest +from unittest.mock import patch, Mock +import pandas as pd +import numpy as np +from datetime import datetime +from ads.opctl.operator.common.utils import _build_image, _parse_input_args +from ads.opctl.operator.lowcode.forecast.model.prophet import ProphetOperatorModel +from ads.opctl.operator.lowcode.forecast.model.base_model import ( + ForecastOperatorBaseModel, +) +from ads.opctl.operator.lowcode.forecast.operator_config import ( + ForecastOperatorConfig, + ForecastOperatorSpec, + TestData, + DateTimeColumn, + InputData, +) +from ads.opctl.operator.lowcode.forecast.const import SupportedMetrics + +import unittest +from unittest.mock import patch, Mock +import pandas as pd +import datapane as dp +from ads.opctl.operator.common.utils import _build_image, _parse_input_args +from ads.opctl.operator.lowcode.forecast.model.prophet import ProphetOperatorModel +from ads.opctl.operator.lowcode.forecast.model.automlx import AutoMLXOperatorModel +from ads.opctl.operator.lowcode.forecast.model.base_model import ( + ForecastOperatorBaseModel, +) +from ads.opctl.operator.lowcode.forecast.operator_config import ( + ForecastOperatorConfig, + ForecastOperatorSpec, + TestData, + DateTimeColumn, + OutputDirectory, +) +from ads.opctl.operator.lowcode.forecast.const import SupportedMetrics +from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ( + ForecastDatasets, + ForecastOutput, +) + + +class TestForecastOperatorBaseModel(unittest.TestCase): + """Tests the base class for the forecasting models""" + + def setUp(self): + self.target_columns = ["Sales_Product Group 107", "Sales_Product Group 108"] + self.target_category_columns = ["PPG_Code"] + self.test_filename = "test.csv" + self.full_data_dict = { + "Sales_Product Group 107": pd.DataFrame( + { + "ds": ["2020-10-31", "2020-11-07"], + "yhat": [1569.536030, 1568.052261], + } + ), + "Sales_Product Group 108": pd.DataFrame( + { + "ds": ["2020-10-31", "2020-11-07"], + "yhat": [1569.536030, 1568.052261], + } + ), + } + + self.data = pd.DataFrame({"last_day_of_week": ["2020-10-31", "2020-11-07"]}) + self.target_col = "yhat" + self.datetime_column_name = "last_day_of_week" + self.original_target_column = "Sales" + self.eval_metrics = pd.DataFrame( + {"Sales_Product Group 107": [25.07]}, index=["sMAPE"] + ) + self.evaluation_metrics = [ + SupportedMetrics.SMAPE, + SupportedMetrics.MAPE, + SupportedMetrics.RMSE, + SupportedMetrics.R2, + SupportedMetrics.EXPLAINED_VARIANCE, + ] + self.summary_metrics = [ + SupportedMetrics.MEAN_SMAPE, + SupportedMetrics.MEDIAN_SMAPE, + SupportedMetrics.MEAN_MAPE, + SupportedMetrics.MEDIAN_MAPE, + SupportedMetrics.MEAN_WMAPE, + SupportedMetrics.MEDIAN_WMAPE, + SupportedMetrics.MEAN_RMSE, + SupportedMetrics.MEDIAN_RMSE, + SupportedMetrics.MEAN_R2, + SupportedMetrics.MEDIAN_R2, + SupportedMetrics.MEAN_EXPLAINED_VARIANCE, + SupportedMetrics.MEDIAN_EXPLAINED_VARIANCE, + SupportedMetrics.ELAPSED_TIME, + ] + self.summary_metrics_all_targets = [ + SupportedMetrics.MEAN_SMAPE, + SupportedMetrics.MEDIAN_SMAPE, + SupportedMetrics.MEAN_MAPE, + SupportedMetrics.MEDIAN_MAPE, + SupportedMetrics.MEAN_RMSE, + SupportedMetrics.MEDIAN_RMSE, + SupportedMetrics.MEAN_R2, + SupportedMetrics.MEDIAN_R2, + SupportedMetrics.MEAN_EXPLAINED_VARIANCE, + SupportedMetrics.MEDIAN_EXPLAINED_VARIANCE, + SupportedMetrics.ELAPSED_TIME, + ] + spec = Mock(spec=ForecastOperatorSpec) + spec.target_column = self.target_col + 
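# Note: the spec is a Mock(spec=ForecastOperatorSpec) stub; only the attributes the base model is assumed to read during report generation are populated below. +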
spec.target_category_columns = self.target_category_columns + spec.target_column = self.original_target_column + spec.test_data = Mock(spec=TestData) + spec.datetime_column = Mock(spec=DateTimeColumn) + spec.datetime_column.name = self.datetime_column_name + spec.datetime_column.format = None + spec.historical_data = Mock(spec="InputData") + spec.historical_data.url = "primary.csv" + spec.historical_data.format = None + spec.historical_data.columns = None + spec.horizon = 3 + spec.tuning = None + spec.output_directory = Mock(spec=OutputDirectory) + spec.output_directory.url = "URL" + spec.forecast_filename = "forecast" + spec.metrics_filename = "metrics" + spec.test_metrics_filename = "test_metrics" + spec.report_filename = "report" + + config = Mock(spec=ForecastOperatorConfig) + config.spec = spec + + self.config = config + + self.datasets = Mock(spec=ForecastDatasets) + self.datasets.original_user_data = None + self.datasets.original_total_data = None + self.datasets.original_additional_data = None + self.datasets.full_data_dict = None + self.datasets.target_columns = None + self.datasets.categories = None + + def get_longest_datetime_column_mock(): + return pd.Series( + [ + datetime.strptime("2020-10-31", "%Y-%m-%d"), + datetime.strptime("2020-11-07", "%Y-%m-%d"), + datetime.strptime("2020-11-14", "%Y-%m-%d"), + datetime.strptime("2020-11-21", "%Y-%m-%d"), + datetime.strptime("2020-11-28", "%Y-%m-%d"), + ] + ) + + self.datasets.get_longest_datetime_column.side_effect = ( + get_longest_datetime_column_mock + ) + + self.output = ForecastOutput(confidence_interval_width=0.7) + self.output.add_category( + "Product Group 107", + "Sales_Product Group 107", + pd.DataFrame( + { + "Date": [ + datetime.strptime("2020-10-31", "%Y-%m-%d"), + datetime.strptime("2020-11-07", "%Y-%m-%d"), + datetime.strptime("2020-11-14", "%Y-%m-%d"), + datetime.strptime("2020-11-21", "%Y-%m-%d"), + datetime.strptime("2020-11-28", "%Y-%m-%d"), + ], + "Series": [ + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + "Product Group 107", + ], + "input_value": [1569.536030, 1568.052261, np.nan, np.nan, np.nan], + "fitted_value": [1569.536030, 1568.052261, np.nan, np.nan, np.nan], + "forecast_value": [ + np.nan, + np.nan, + 1566.568493, + 1565.084725, + 1563.600957, + ], + "upper_bound": [ + np.nan, + np.nan, + 1566.568493, + 1565.084725, + 1563.600957, + ], + "lower_bound": [ + np.nan, + np.nan, + 1566.568493, + 1565.084725, + 1563.600957, + ], + } + ), + ) + self.output.add_category( + "Product Group 108", + "Sales_Product Group 108", + pd.DataFrame( + { + "Date": [ + datetime.strptime("2020-10-31", "%Y-%m-%d"), + datetime.strptime("2020-11-07", "%Y-%m-%d"), + datetime.strptime("2020-11-14", "%Y-%m-%d"), + datetime.strptime("2020-11-21", "%Y-%m-%d"), + datetime.strptime("2020-11-28", "%Y-%m-%d"), + ], + "Series": [ + "Product Group 108", + "Product Group 107", + "Product Group 108", + "Product Group 108", + "Product Group 108", + ], + "input_value": [1569.536030, 1568.052261, np.nan, np.nan, np.nan], + "fitted_value": [1569.536030, 1568.052261, np.nan, np.nan, np.nan], + "forecast_value": [ + np.nan, + np.nan, + 1254.850813, + 1240.009167, + 1225.167521, + ], + "upper_bound": [ + np.nan, + np.nan, + 1254.850813, + 1240.009167, + 1225.167521, + ], + "lower_bound": [ + np.nan, + np.nan, + 1254.850813, + 1240.009167, + 1225.167521, + ], + } + ), + ) + + @patch("ads.opctl.operator.lowcode.forecast.utils._load_data") + def test_empty_testdata_file(self, mock__load_data): + # When test 
file is empty + + mock__load_data.side_effect = pd.errors.EmptyDataError() + prophet = ProphetOperatorModel(self.config, self.datasets) + total_metrics, summary_metrics, data = prophet._test_evaluate_metrics( + target_columns=self.target_columns, + test_filename=self.test_filename, + output=self.output, + target_col=self.target_col, + elapsed_time=0, + ) + + self.assertTrue(total_metrics.empty) + self.assertTrue(summary_metrics.empty) + self.assertIsNone(data) + + @patch("ads.opctl.operator.lowcode.forecast.utils._load_data") + def test_no_series_testdata_file(self, mock__load_data): + # When test file has no series + + mock__load_data.return_value = pd.DataFrame( + columns=["PPG_Code", "last_day_of_week", "Sales"] + ) + + prophet = ProphetOperatorModel(self.config, self.datasets) + prophet.forecast_output = self.output + total_metrics, summary_metrics, data = prophet._test_evaluate_metrics( + target_columns=self.target_columns, + test_filename=self.test_filename, + output=self.output, + target_col=self.target_col, + elapsed_time=0, + ) + + self.assertTrue(total_metrics.empty) + self.assertTrue(summary_metrics.empty) + self.assertIsNone(data) + + @patch("ads.opctl.operator.lowcode.forecast.utils._load_data") + def test_one_missing_series_testdata_file(self, mock__load_data): + """ + When there are NaN values for an entire series it will be loaded as zeros. And evaluation, summary metrics will be calculated with that zeros. + When one entire series is missing in test file i.e; missing rows + In this case evaluation metrics and summary metrics will not involve this series + """ + mock__load_data.return_value = pd.DataFrame( + { + "PPG_Code": ["Product Group 107", "Product Group 107"], + "last_day_of_week": ["2020-11-14", "2020-11-28"], + "Sales": [1403, 6532], + } + ) + + prophet = ProphetOperatorModel(self.config, self.datasets) + prophet.forecast_output = self.output + total_metrics, summary_metrics, data = prophet._test_evaluate_metrics( + target_columns=self.target_columns, + test_filename=self.test_filename, + output=self.output, + target_col=self.target_col, + elapsed_time=0, + ) + + self.assertFalse(total_metrics.empty) + self.assertFalse(summary_metrics.empty) + + # Missing series should not be there in evaluation metrics + self.assertEquals(total_metrics.columns.to_list(), ["Sales_Product Group 107"]) + + # one entire series is not there, summary metrics per horizon will be calculated and all horizons should be there + self.assertEqual( + [ + timestamp.strftime("%Y-%m-%d") + for timestamp in summary_metrics.index.values[1:] + ], + ["2020-11-14", "2020-11-28"], + ) + + # All metrics should be present + self.assertEquals(total_metrics.index.to_list(), self.evaluation_metrics) + self.assertEquals(summary_metrics.columns.to_list(), self.summary_metrics) + + @patch("ads.opctl.operator.lowcode.forecast.utils._load_data") + def test_missing_rows_testdata_file(self, mock__load_data): + """ + In the case where all series are present but there are missing rows in the test file + Suppose the missing row was for a horizon in a series, if any other series has value for that horizon then this missing value will automatically come as 0 while loading data. + So evaluation and summary metrics will be calculated by taking missing values as zeros in this case. + + When for a horizon, every series has missing row then in loaded data that horizon will not be there. + In this case for total metrics zeros are added for the missing values in the series to calculate evaluation metrics. 
+ Where as summary metrics per horizon is not calculated for that horizon. + """ + + mock__load_data.return_value = pd.DataFrame( + { + "PPG_Code": [ + "Product Group 107", + "Product Group 107", + "Product Group 108", + "Product Group 108", + ], + "last_day_of_week": [ + "2020-11-14", + "2020-11-28", + "2020-11-14", + "2020-11-28", + ], + "Sales": [1403, 6532, 1647, 1414], + } + ) + + prophet = ProphetOperatorModel(self.config, self.datasets) + prophet.forecast_output = self.output + total_metrics, summary_metrics, data = prophet._test_evaluate_metrics( + target_columns=self.target_columns, + test_filename=self.test_filename, + output=self.output, + target_col=self.target_col, + elapsed_time=0, + ) + self.assertFalse(total_metrics.empty) + self.assertFalse(summary_metrics.empty) + + # Missing horizon should not be there in summary_metrics per horizon, metrics should be there for other horizons + self.assertEqual( + [ + timestamp.strftime("%Y-%m-%d") + for timestamp in summary_metrics.index.values[1:] + ], + ["2020-11-14", "2020-11-28"], + ) + + # Total metrics should be there for all series. + self.assertEqual(total_metrics.columns.to_list(), self.target_columns) + + # All metrics should be present + self.assertEquals(total_metrics.index.to_list(), self.evaluation_metrics) + self.assertEquals(summary_metrics.columns.to_list(), self.summary_metrics) + + @patch("datapane.save_report") + @patch("ads.opctl.operator.lowcode.forecast.utils.get_forecast_plots") + @patch("ads.opctl.operator.lowcode.forecast.utils.evaluate_train_metrics") + @patch("ads.opctl.operator.lowcode.forecast.utils._write_data") + @patch( + "ads.opctl.operator.lowcode.forecast.model.base_model.ForecastOperatorBaseModel._test_evaluate_metrics" + ) + @patch( + "ads.opctl.operator.lowcode.forecast.model.prophet.ProphetOperatorModel._build_model" + ) + @patch( + "ads.opctl.operator.lowcode.forecast.model.prophet.ProphetOperatorModel._generate_report" + ) + @patch("ads.opctl.operator.lowcode.forecast.model.base_model.open") + @patch("fsspec.open") + def test_boolean_disable( + self, + mock_fsspec_open, + mock_open, + mock__generate_report, + mock__build_model, + mock__test_evaluate_metrics, + mock__write_data, + mock_evaluate_train_metrics, + mock_get_forecast_plots, + mock_save_report, + ): + mock__test_evaluate_metrics.return_value = (pd.DataFrame(), None, None) + mock__generate_report.return_value = ( + dp.Text("Description"), + [dp.Text("Other Sections")], + ) + mock__build_model.return_value = pd.DataFrame() + mock_evaluate_train_metrics.return_value = self.eval_metrics + mock_get_forecast_plots = dp.Text("Random Text") + + self.config.spec.generate_metrics = True + self.config.spec.generate_report = False + + prophet = ProphetOperatorModel(self.config, self.datasets) + prophet.target_columns = self.target_columns + prophet.full_data_dict = self.full_data_dict + prophet.forecast_output = self.output + + prophet.generate_report() + + # Metrics are generated, Report is not generated + mock__test_evaluate_metrics.assert_called_once() + mock_evaluate_train_metrics.assert_called_once() + self.assertTrue(mock_save_report.call_count == 0) + self.assertTrue(mock__write_data.call_count == 3) + + mock__test_evaluate_metrics.reset_mock() + mock_evaluate_train_metrics.reset_mock() + mock__write_data.reset_mock() + mock_save_report.reset_mock() + + self.config.spec.generate_metrics = False + self.config.spec.generate_report = True + prophet.generate_report() + + # Metrics are generated to be included in report but not saved, Report is 
generated + mock__test_evaluate_metrics.assert_called_once() + mock_evaluate_train_metrics.assert_called_once() + self.assertTrue(mock_save_report.call_count == 1) + self.assertTrue(mock__write_data.call_count == 1) + + @patch( + "ads.opctl.operator.lowcode.forecast.model.automlx.AutoMLXOperatorModel.explain_model" + ) + def test_boolean_disable_explanations(self, mock_explain_model): + self.config.spec.generate_explanations = False + + automlx = AutoMLXOperatorModel(self.config, self.datasets) + automlx.output = self.output + automlx.full_data_dict = {} + automlx.data = self.data + automlx.local_explanation = {"dummy": pd.DataFrame({"pt1": [1, 2, 3]})} + automlx._generate_report() + + # Explanations are not generated + mock_explain_model.assert_not_called() + + self.config.spec.generate_explanations = True + automlx._generate_report() + + # Explanations are generated + mock_explain_model.assert_called_once() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unitary/with_extras/operator/forecast/test_model_factory.py b/tests/unitary/with_extras/operator/forecast/test_model_factory.py new file mode 100644 index 000000000..1be489506 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_model_factory.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestForecastOperatorModelFactory: + """Tests the factory class which contains a list of registered forecasting operator models.""" + + pass diff --git a/tests/unitary/with_extras/operator/forecast/test_model_neural_prophet.py b/tests/unitary/with_extras/operator/forecast/test_model_neural_prophet.py new file mode 100644 index 000000000..ead35cc39 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_model_neural_prophet.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestNeuralProphetOperatorModel: + """Tests the neuralprophet operator model class.""" + + pass diff --git a/tests/unitary/with_extras/operator/forecast/test_model_prophet.py b/tests/unitary/with_extras/operator/forecast/test_model_prophet.py new file mode 100644 index 000000000..84b274821 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_model_prophet.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestProphetOperatorModel: + """Tests the prophet operator model class.""" + + pass diff --git a/tests/unitary/with_extras/operator/forecast/test_operator.py b/tests/unitary/with_extras/operator/forecast/test_operator.py new file mode 100644 index 000000000..f2b79b54e --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_operator.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestOperator: + """Tests common methods of the forecast operator module.""" + + pass diff --git a/tests/unitary/with_extras/operator/forecast/test_operator_config.py b/tests/unitary/with_extras/operator/forecast/test_operator_config.py new file mode 100644 index 000000000..0973c0e54 --- /dev/null +++ b/tests/unitary/with_extras/operator/forecast/test_operator_config.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestForecastOperatorConfig: + """Tests forecast operator config class""" + + pass diff --git a/tests/unitary/with_extras/operator/pii/__init__.py b/tests/unitary/with_extras/operator/pii/__init__.py new file mode 100644 index 000000000..fe904ad27 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/unitary/with_extras/operator/pii/test_factory.py b/tests/unitary/with_extras/operator/pii/test_factory.py new file mode 100644 index 000000000..04e153cd0 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_factory.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import pytest +from scrubadub_spacy.detectors.spacy import SpacyEntityDetector + +from ads.opctl.operator.lowcode.pii.model.factory import ( + PiiDetectorFactory, + UnSupportedDetectorError, +) + + +class TestPiiDetectorFactory: + def test_get_default_detector(self): + detector_type = "default" + entity = "phone" + model = None + expected_result = "phone" + detector = PiiDetectorFactory.get_detector( + detector_type=detector_type, entity=entity, model=model + ) + assert detector == expected_result + + @pytest.mark.parametrize( + "detector_type, entity, model", + [ + ("spacy", "person", "en_core_web_sm"), + ("spacy", "other", "en_core_web_sm"), + # ("spacy", "org", "en_core_web_trf"), + # ("spacy", "loc", "en_core_web_md"), + # ("spacy", "date", "en_core_web_lg"), + ], + ) + def test_get_spacy_detector(self, detector_type, entity, model): + detector = PiiDetectorFactory.get_detector( + detector_type=detector_type, entity=entity, model=model + ) + assert isinstance(detector, SpacyEntityDetector) + assert entity.upper() in detector.filth_cls_map + + def test_get_detector_fail(self): + detector_type = "unknow" + entity = "myentity" + model = None + with pytest.raises(UnSupportedDetectorError): + PiiDetectorFactory.get_detector( + detector_type=detector_type, entity=entity, model=model + ) diff --git a/tests/unitary/with_extras/operator/pii/test_files/__init__.py b/tests/unitary/with_extras/operator/pii/test_files/__init__.py new file mode 100644 index 000000000..fe904ad27 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_files/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/tests/unitary/with_extras/operator/pii/test_files/pii_test.yaml b/tests/unitary/with_extras/operator/pii/test_files/pii_test.yaml new file mode 100644 index 000000000..b9ef962b4 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_files/pii_test.yaml @@ -0,0 +1,14 @@ +kind: operator +spec: + detectors: + - action: anonymize + name: default.phone + - action: mask + name: default.text_blob_name + input_data: + url: ./test_data.csv + output_directory: + url: ./test_result/ + target_column: text +type: pii +version: v1 diff --git a/tests/unitary/with_extras/operator/pii/test_files/test_data.csv b/tests/unitary/with_extras/operator/pii/test_files/test_data.csv new file mode 100644 index 000000000..250e24577 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_files/test_data.csv @@ -0,0 +1,3 @@ +id,text +00001cee341fdb12,"Hi, this is John Doe, my number is (805) 555-1234." +00097b6214686db5,"John has a beautiful puppy." diff --git a/tests/unitary/with_extras/operator/pii/test_guardrail.py b/tests/unitary/with_extras/operator/pii/test_guardrail.py new file mode 100644 index 000000000..ae8c7be60 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_guardrail.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import tempfile +from io import StringIO + +import yaml + +from ads.opctl.operator.lowcode.pii.constant import DEFAULT_REPORT_FILENAME +from ads.opctl.operator.lowcode.pii.model.guardrails import PIIGuardrail +from ads.opctl.operator.lowcode.pii.operator_config import PiiOperatorConfig + + +class TestPiiGuardrail: + test_files_uri = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_files" + ) + + def yaml_content_simple(self): + content = StringIO( + f""" +kind: operator +spec: + detectors: + - action: anonymize + name: default.phone + input_data: + url: {self.test_files_uri}/test_data.csv + output_directory: + url: {self.test_files_uri} + target_column: text +type: pii +version: v1 + +""" + ) + return content + + def yaml_content_complex(self): + content = StringIO( + """ +kind: operator +spec: + detectors: + - action: anonymize + name: default.phone + - action: mask + name: default.social_security_number + input_data: + url: oci://my-bucket@my-tenancy/input_data/mydata.csv + output_directory: + name: myProcesseData.csv + url: oci://my-bucket@my-tenancy/result/ + report: + report_filename: myreport.html + show_sensitive_content: true + show_rows: 10 + target_column: text +type: pii +version: v1 + +""" + ) + return content + + def test_init(self): + conf = yaml.load(self.yaml_content_complex(), yaml.SafeLoader) + operator_config = PiiOperatorConfig.from_yaml( + yaml_string=self.yaml_content_complex() + ) + guardrail = PIIGuardrail(config=operator_config) + + assert guardrail.dst_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + conf["spec"]["output_directory"]["name"], + ) + assert guardrail.report_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + conf["spec"]["report"]["report_filename"], + ) + assert len(guardrail.scrubber._detectors) == 2 + assert not guardrail.storage_options == {} + + def test_load_data(self): + conf = yaml.load(self.yaml_content_simple(), yaml.SafeLoader) + + operator_config = PiiOperatorConfig.from_yaml( + 
yaml_string=self.yaml_content_simple() + ) + guardrail = PIIGuardrail(config=operator_config) + guardrail.load_data() + + assert guardrail.datasets is not None + assert guardrail.storage_options == {} + assert guardrail.dst_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + "test_data_out.csv", + ) + assert guardrail.report_uri == os.path.join( + conf["spec"]["output_directory"]["url"], + DEFAULT_REPORT_FILENAME, + ) + + def test_process(self): + operator_config = PiiOperatorConfig.from_yaml( + yaml_string=self.yaml_content_simple() + ) + guardrail = PIIGuardrail(config=operator_config) + with tempfile.TemporaryDirectory() as temp_dir: + dst_uri = os.path.join(temp_dir, "test_out.csv") + report_uri = os.path.join(temp_dir, DEFAULT_REPORT_FILENAME) + guardrail.process( + dst_uri=dst_uri, + report_uri=report_uri, + ) + assert os.path.exists(dst_uri) + assert os.path.exists(report_uri) diff --git a/tests/unitary/with_extras/operator/pii/test_pii_scrubber.py b/tests/unitary/with_extras/operator/pii/test_pii_scrubber.py new file mode 100644 index 000000000..df2929a06 --- /dev/null +++ b/tests/unitary/with_extras/operator/pii/test_pii_scrubber.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import os + +import pytest + +from ads.opctl.operator.common.utils import _load_yaml_from_uri +from ads.opctl.operator.lowcode.pii.model.pii import PiiScrubber +from ads.opctl.operator.lowcode.pii.operator_config import PiiOperatorConfig + + +class TestPiiScrubber: + test_yaml_uri = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_files", "pii_test.yaml" + ) + operator_config = PiiOperatorConfig.from_yaml(uri=test_yaml_uri) + config_dict = _load_yaml_from_uri(uri=test_yaml_uri) + + name_entity = "John Doe" + phone_entity = "(800) 223-1711" + text = f""" + This is {name_entity}. My number is {phone_entity}. + """ + + @pytest.mark.parametrize( + "config", + [ + test_yaml_uri, + operator_config, + config_dict, + ], + ) + def test_init(self, config): + pii_scrubber = PiiScrubber(config=config) + + assert isinstance(pii_scrubber.detector_spec, list) + assert len(pii_scrubber.detector_spec) == 2 + assert pii_scrubber.detector_spec[0]["name"] == "default.phone" + + assert len(pii_scrubber.scrubber._detectors) == 0 + + def test_config_scrubber(self): + scrubber = PiiScrubber(config=self.test_yaml_uri).config_scrubber() + + assert len(scrubber._detectors) == 2 + assert len(scrubber._post_processors) == 1 + + processed_text = scrubber.clean(self.text) + + assert self.name_entity not in processed_text + assert self.phone_entity not in processed_text diff --git a/tests/unitary/with_extras/operator/test_cmd.py b/tests/unitary/with_extras/operator/test_cmd.py new file mode 100644 index 000000000..5eb6ef838 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_cmd.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +class TestOperatorCMD: + """Tests operator commands.""" + + def test_list(self): + """Ensures that the list of the registered operators can be printed.""" + pass + + def test_info(self): + """Ensures that the detailed information about the particular operator can be printed.""" + pass + + def test_init_success(self): + """Ensures that a starter YAML configurations for the operator can be generated.""" + pass + + def test_init_fail(self): + """Ensures that generating starter specification fails in case of wrong input attributes.""" + pass + + def test_build_image(self): + """Ensures that operator's image can be successfully built.""" + pass + + def test_publish_image(self): + """Ensures that operator's image can be successfully published.""" + pass + + def test_verify(self): + """Ensures that operator's config can be successfully verified.""" + pass + + def test_build_conda(self): + """Ensures that the operator's conda environment can be successfully built.""" + pass + + def test_publish_conda(self): + """Ensures that the operator's conda environment can be successfully published.""" + pass + + def test_run(self): + """Ensures that the operator can be run on the targeted backend.""" + pass diff --git a/tests/unitary/with_extras/operator/test_common_backend_factory.py b/tests/unitary/with_extras/operator/test_common_backend_factory.py new file mode 100644 index 000000000..df23e40e3 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_common_backend_factory.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +import os +from unittest.mock import MagicMock, patch + +import pytest + + +from ads.jobs import Job +from ads.opctl.config.base import ConfigProcessor +from ads.opctl.config.merger import ConfigMerger +from ads.opctl.constants import BACKEND_NAME +from ads.opctl.operator.common.backend_factory import BackendFactory +from ads.opctl.operator.common.const import PACK_TYPE +from ads.opctl.operator.common.operator_loader import OperatorInfo, OperatorLoader +from ads.opctl.operator.runtime import runtime as operator_runtime + + +class TestBackendFactory: + """Test the backend factory.""" + + def setup_class(cls): + # current directory and test files directory + cls.CUR_DIR = os.path.dirname(os.path.abspath(__file__)) + cls.TEST_FILES_DIR = os.path.join(cls.CUR_DIR, "test_files") + + # mock backends + cls.MOCK_BACKEND = { + "job.config": Job.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "job_python.yaml") + ).to_dict(), + "job.python.config": Job.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "job_python.yaml") + ).to_dict(), + "job.container.config": Job.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "job_container.yaml") + ).to_dict(), + "dataflow.config": Job.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "dataflow_dataflow.yaml") + ).to_dict(), + "dataflow.dataflow.config": Job.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "dataflow_dataflow.yaml") + ).to_dict(), + "local.config": operator_runtime.PythonRuntime.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "local_python.yaml") + ).to_dict(), + "local.container.config": operator_runtime.ContainerRuntime.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "local_container.yaml") + ).to_dict(), + "local.python.config": 
operator_runtime.PythonRuntime.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "local_python.yaml") + ).to_dict(), + } + + def setup_method(self): + # mock operator info with the all supported backends + self.mock_operator_info = OperatorInfo( + type="example", + gpu="no", + description="An example operator", + version="v1", + conda="example_v1", + conda_type=PACK_TYPE.CUSTOM, + path="/fake/path/to/operator", + backends=[BACKEND_NAME.JOB.value, BACKEND_NAME.DATAFLOW.value], + ) + + # mock operator config + self.mock_operator_config = { + "kind": "operator", + "type": "example", + "version": "v1", + "spec": {}, + } + + # expected backends + self.mock_supported_backends = tuple( + set(BackendFactory.BACKENDS + BackendFactory.LOCAL_BACKENDS) + & set( + self.mock_operator_info.backends + + [ + BACKEND_NAME.OPERATOR_LOCAL.value, + BACKEND_NAME.LOCAL.value, + ] + ) + ) + + @pytest.mark.parametrize( + "backend, expected_backend_kind, expected_runtime_type", + [ + ("job", "job", "python"), + ("job.container", "job", "container"), + ("dataflow.dataflow", "dataflow", "dataflow"), + ("local.container", "operator.local", "container"), + ("local.python", "operator.local", "python"), + ("invalid", None, None), + ("job.invalid", None, None), + ("local.invalid", None, None), + ], + ) + def test_extract_backend( + self, backend, expected_backend_kind, expected_runtime_type + ): + """Ensure that the backend and runtime type are extracted correctly.""" + if expected_backend_kind is None: + with pytest.raises(RuntimeError): + BackendFactory._extract_backend(backend) + else: + backend_kind, runtime_type = BackendFactory._extract_backend(backend) + assert backend_kind == expected_backend_kind + assert runtime_type == expected_runtime_type + + def test_validate_backend_and_runtime(self): + """Ensure that the backend and runtime type are validated correctly.""" + backend_kind = "job" + runtime_type = "python" + supported_backends = ["job", "dataflow", "operator_local", "local"] + assert ( + BackendFactory._validate_backend_and_runtime( + backend_kind, runtime_type, supported_backends + ) + == True + ) + + backend_kind = "invalid_backend" + runtime_type = "python" + supported_backends = ["job", "dataflow", "operator_local", "local"] + with pytest.raises(RuntimeError): + BackendFactory._validate_backend_and_runtime( + backend_kind, runtime_type, supported_backends + ) + + backend_kind = "job" + runtime_type = "invalid_runtime" + supported_backends = ["job", "dataflow", "operator_local", "local"] + with pytest.raises(RuntimeError): + BackendFactory._validate_backend_and_runtime( + backend_kind, runtime_type, supported_backends + ) + + def test_get_backend_fail(self): + """Ensures that getting backend fails in case of wrong input data.""" + + mock_config = MagicMock() + mock_config.return_value = {"kind": "job", "type": "python"} + + with pytest.raises(RuntimeError): + BackendFactory.backend(config=None) + + mock_config.return_value = {"kind": "value"} + with pytest.raises(RuntimeError): + BackendFactory.backend(config=mock_config) + + mock_config.return_value = {"kind": "operator"} + with pytest.raises(RuntimeError): + BackendFactory.backend(config=mock_config) + + @pytest.mark.parametrize( + "mock_backend, expected_backend_kind, expected_runtime_type", + [ + (None, "operator.local", "python"), + ("job", "job", "python"), + ("job.python", "job", "python"), + ("job.container", "job", "container"), + ("dataflow", "dataflow", "dataflow"), + ("dataflow.dataflow", "dataflow", "dataflow"), + ("local", "operator.local", 
"python"), + ("local.container", "operator.local", "container"), + ("local.python", "operator.local", "python"), + ("job.config", "job", "python"), + ("job.python.config", "job", "python"), + ("job.container.config", "job", "container"), + ("dataflow.config", "dataflow", "dataFlow"), + ("dataflow.dataflow.config", "dataflow", "dataFlow"), + ("local.config", "operator.local", "python"), + ("local.container.config", "operator.local", "container"), + ("local.python.config", "operator.local", "python"), + ], + ) + @patch.object(BackendFactory, "_validate_backend_and_runtime") + @patch.object(BackendFactory, "_init_backend_config") + def test_get_backend( + self, + mock_init_backend_config, + mock_validate_backend_and_runtime, + mock_backend, + expected_backend_kind, + expected_runtime_type, + ): + """Ensure that the backend is returned correctly.""" + + mock_backend_config = self.MOCK_BACKEND[ + f"{expected_backend_kind.replace('operator.','').lower()}.{expected_runtime_type.lower()}.config" + ] + + # check if mock backend is a config dict + if mock_backend in self.MOCK_BACKEND: + mock_backend = self.MOCK_BACKEND[mock_backend] + + # prepares mock config by applying the config merger + # this step can be replaced with magic mock + mock_config = ConfigProcessor(self.mock_operator_config).step( + ConfigMerger, **{} + ) + + with patch.object(OperatorLoader, "from_uri") as mock_operator_loader_from_uri: + # mock objects + mock_operator_loader_from_uri.return_value = MagicMock( + load=MagicMock(return_value=self.mock_operator_info) + ) + mock_init_backend_config.return_value = { + (expected_backend_kind, expected_runtime_type): mock_backend_config + } + + # run test + result_backend = BackendFactory.backend( + config=mock_config, backend=mock_backend + ) + + # validate result + mock_operator_loader_from_uri.assert_called_with( + uri=self.mock_operator_config["type"] + ) + mock_validate_backend_and_runtime.assert_called_with( + backend_kind=expected_backend_kind, + runtime_type=expected_runtime_type, + supported_backends=self.mock_supported_backends, + ) + + if isinstance(mock_backend, str): + mock_init_backend_config.assert_called_with( + operator_info=self.mock_operator_info, + backend_kind=expected_backend_kind, + **{}, + ) + + # validate result_backend + assert result_backend.operator_type == self.mock_operator_config["type"] + assert result_backend.operator_info == self.mock_operator_info diff --git a/tests/unitary/with_extras/operator/test_common_utils.py b/tests/unitary/with_extras/operator/test_common_utils.py new file mode 100644 index 000000000..32dc6b9d1 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_common_utils.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import argparse +import os +import unittest +from unittest.mock import patch, MagicMock + +from ads.opctl.operator.common.utils import _build_image, _parse_input_args + + +class TestBuildImage(unittest.TestCase): + def setUp(self): + self.curr_dir = os.path.dirname(os.path.abspath(__file__)) + self.dockerfile = os.path.join(self.curr_dir, "test_files", "Dockerfile.test") + self.image_name = "test_image" + self.tag = "test_tag" + self.target = "test_target" + self.kwargs = {"arg1": "value1", "arg2": "value2"} + + @patch("ads.opctl.utils.run_command") + @patch("time.time") + def test_build_image(self, mock_time, mock_run_command): + mock_time.return_value = 1 + mock_proc = MagicMock() + mock_proc.returncode = 0 + mock_run_command.return_value = mock_proc + + image_name = _build_image( + self.dockerfile, + self.image_name, + tag=self.tag, + target=self.target, + **self.kwargs, + ) + + expected_image_name = f"{self.image_name}:{self.tag}" + command = [ + "docker", + "build", + "-t", + expected_image_name, + "-f", + self.dockerfile, + "--target", + self.target, + "--build-arg", + "RND=1", + os.path.dirname(self.dockerfile), + ] + + mock_run_command.assert_called_once_with(command) + self.assertEqual(image_name, expected_image_name) + + def test_build_image_missing_dockerfile(self): + with self.assertRaises(FileNotFoundError): + _build_image("non_existing_docker_file", "non_existing_image") + + def test_build_image_missing_image_name(self): + with self.assertRaises(ValueError): + _build_image(self.dockerfile, None) + + @patch("ads.opctl.utils.run_command") + def test_build_image_docker_build_failure(self, mock_run_command): + mock_proc = MagicMock() + mock_proc.returncode = 1 + mock_run_command.return_value = mock_proc + + with self.assertRaises(RuntimeError): + _build_image(self.dockerfile, self.image_name) + + +class TestParseInputArgs(unittest.TestCase): + def test_parse_input_args_with_file(self): + raw_args = ["-f", "path/to/file.yaml"] + expected_output = ( + argparse.Namespace(file="path/to/file.yaml", spec=None, verify=False), + [], + ) + self.assertEqual(_parse_input_args(raw_args), expected_output) + + def test_parse_input_args_with_spec(self): + raw_args = ["-s", "spec"] + expected_output = (argparse.Namespace(file=None, spec="spec", verify=False), []) + self.assertEqual(_parse_input_args(raw_args), expected_output) + + def test_parse_input_args_with_verify(self): + raw_args = ["-v", "True"] + expected_output = (argparse.Namespace(file=None, spec=None, verify=True), []) + self.assertEqual(_parse_input_args(raw_args), expected_output) + + def test_parse_input_args_with_unknown_args(self): + raw_args = ["-f", "path/to/file.yaml", "--unknown-arg", "value"] + expected_output = ( + argparse.Namespace(file="path/to/file.yaml", spec=None, verify=False), + ["--unknown-arg", "value"], + ) + self.assertEqual(_parse_input_args(raw_args), expected_output) + + @patch("argparse.ArgumentParser.parse_known_args") + def test_parse_input_args_with_no_args(self, mock_parse_known_args): + mock_parse_known_args.return_value = ( + argparse.Namespace(file=None, spec=None, verify=False), + [], + ) + expected_output = (argparse.Namespace(file=None, spec=None, verify=False), []) + self.assertEqual(_parse_input_args([]), expected_output) diff --git a/tests/unitary/with_extras/operator/test_files/Dockerfile.test b/tests/unitary/with_extras/operator/test_files/Dockerfile.test new file mode 100644 index 
000000000..8049807ad --- /dev/null +++ b/tests/unitary/with_extras/operator/test_files/Dockerfile.test @@ -0,0 +1,2 @@ +FROM baseImage +RUN echo "This is a message" diff --git a/tests/unitary/with_extras/operator/test_files/dataflow_dataflow.yaml b/tests/unitary/with_extras/operator/test_files/dataflow_dataflow.yaml new file mode 100644 index 000000000..05ee1f89a --- /dev/null +++ b/tests/unitary/with_extras/operator/test_files/dataflow_dataflow.yaml @@ -0,0 +1,36 @@ +# This YAML specification was auto generated by the `ads opctl init` command. +# The more details about the jobs YAML specification can be found in the ADS documentation: +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/apachespark/dataflow.html + + +kind: job +spec: + infrastructure: + kind: infrastructure + spec: + compartmentId: ocid1.compartment.oc1.. + driverShape: VM.Standard.E2.4 + executorShape: VM.Standard.E2.4 + language: PYTHON + logsBucketUri: oci://bucket@namespace + numExecutors: '1' + scriptBucket: oci://bucket@namespace/prefix + sparkVersion: 3.2.1 + type: dataFlow + name: '{Job name. For MLflow and Operator will be auto generated}' + runtime: + kind: runtime + spec: + args: [] + conda: + type: published + uri: oci://bucket@namespace/conda_environments/test/conda/slug + condaAuthType: resource_principal + configuration: + spark.driverEnv.env_key: env_value + freeformTags: {} + overwrite: true + scriptBucket: oci://bucket@namespace/prefix + scriptPathURI: '{Path to the executable script. For MLflow and Operator will + auto generated}' + type: dataFlow diff --git a/tests/unitary/with_extras/operator/test_files/job_container.yaml b/tests/unitary/with_extras/operator/test_files/job_container.yaml new file mode 100644 index 000000000..f987f7da5 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_files/job_container.yaml @@ -0,0 +1,33 @@ +# This YAML specification was auto generated by the `ads opctl init` command. +# The more details about the jobs YAML specification can be found in the ADS documentation: +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html + + +kind: job +spec: + infrastructure: + kind: infrastructure + spec: + blockStorageSize: 50 + compartmentId: ocid1.compartment.oc1.. + jobInfrastructureType: ME_STANDALONE + jobType: DEFAULT + logGroupId: ocid1.loggroup.oc1.iad. + logId: ocid1.log.oc1.iad. + projectId: ocid1.datascienceproject.oc1. + shapeName: VM.Standard2.1 + subnetId: ocid1.subnet.oc1.iad. + type: dataScienceJob + name: '{Job name. For MLflow and Operator will be auto generated}' + runtime: + kind: runtime + spec: + args: [] + cmd: '{Container CMD. For MLflow and Operator will be auto generated}' + entrypoint: + - bash + - --login + - -c + freeformTags: {} + image: /image:latest + type: container diff --git a/tests/unitary/with_extras/operator/test_files/job_python.yaml b/tests/unitary/with_extras/operator/test_files/job_python.yaml new file mode 100644 index 000000000..aba6287a3 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_files/job_python.yaml @@ -0,0 +1,34 @@ +# This YAML specification was auto generated by the `ads opctl init` command. +# The more details about the jobs YAML specification can be found in the ADS documentation: +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html + + +kind: job +spec: + infrastructure: + kind: infrastructure + spec: + blockStorageSize: 50 + compartmentId: ocid1.compartment.oc1.. 
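+ # NOTE: the OCIDs in this fixture are truncated placeholders; these unit tests only parse the YAML and are not expected to provision any OCI resources.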
+ jobInfrastructureType: ME_STANDALONE + jobType: DEFAULT + logGroupId: ocid1.loggroup.oc1.iad. + logId: ocid1.log.oc1.iad. + projectId: ocid1.datascienceproject.oc1. + shapeName: VM.Standard2.1 + subnetId: ocid1.subnet.oc1.iad. + type: dataScienceJob + name: '{Job name. For MLflow and Operator will be auto generated}' + runtime: + kind: runtime + spec: + args: [] + conda: + slug: conda_slug + type: service + entrypoint: '{For MLflow and Operator will be auto generated}' + freeformTags: {} + scriptPathURI: '{Path to the script. For MLflow and Operator will be auto + generated}' + workingDir: '{For MLflow and Operator will be auto generated}' + type: python diff --git a/tests/unitary/with_extras/operator/test_files/local_container.yaml b/tests/unitary/with_extras/operator/test_files/local_container.yaml new file mode 100644 index 000000000..b22fd2d8d --- /dev/null +++ b/tests/unitary/with_extras/operator/test_files/local_container.yaml @@ -0,0 +1,14 @@ +# This YAML specification was auto generated by the `ads opctl init` command. +# The more details about the jobs YAML specification can be found in the ADS documentation: +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html + +kind: operator.local +spec: + env: + - name: test_env_key + value: test_env_val + image: forecast:v1 + volume: + - /test/volume/one:/root/one +type: container +version: v1 diff --git a/tests/unitary/with_extras/operator/test_files/local_python.yaml b/tests/unitary/with_extras/operator/test_files/local_python.yaml new file mode 100644 index 000000000..71b837bf0 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_files/local_python.yaml @@ -0,0 +1,8 @@ +# This YAML specification was auto generated by the `ads opctl init` command. +# The more details about the jobs YAML specification can be found in the ADS documentation: +# https://accelerated-data-science.readthedocs.io/en/latest/user_guide/jobs/index.html + +kind: operator.local +spec: null +type: python +version: v1 diff --git a/tests/unitary/with_extras/operator/test_files/test_operator/MLoperator b/tests/unitary/with_extras/operator/test_files/test_operator/MLoperator new file mode 100644 index 000000000..af54c8018 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_files/test_operator/MLoperator @@ -0,0 +1,13 @@ +type: example +version: v1 +name: Example Operator +conda_type: published +conda: example_v1 +gpu: no +keywords: + - Example Operator +backends: + - job + - dataflow +description: | + Description for the operator diff --git a/tests/unitary/with_extras/operator/test_operator_backend.py b/tests/unitary/with_extras/operator/test_operator_backend.py new file mode 100644 index 000000000..e23d044fc --- /dev/null +++ b/tests/unitary/with_extras/operator/test_operator_backend.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +import json +import os +import tempfile +from unittest.mock import MagicMock, patch + +import pytest +import yaml + +from ads import jobs +from ads.opctl.backend.ads_ml_job import MLJobOperatorBackend +from ads.opctl.backend.local import LocalOperatorBackend, OperatorLoader +from ads.opctl.config.base import ConfigProcessor +from ads.opctl.config.merger import ConfigMerger +from ads.opctl.constants import BACKEND_NAME +from ads.opctl.operator.common.const import ENV_OPERATOR_ARGS, PACK_TYPE +from ads.opctl.operator.common.operator_loader import OperatorInfo +from ads.opctl.operator.runtime import runtime as operator_runtime +from ads.opctl.operator.runtime.runtime import ContainerRuntime, PythonRuntime + + +class TestLocalOperatorBackend: + def setup_class(cls): + # current directory and test files directory + cls.CUR_DIR = os.path.dirname(os.path.abspath(__file__)) + cls.TEST_FILES_DIR = os.path.join(cls.CUR_DIR, "test_files") + + # mock backends + cls.MOCK_BACKEND_CONFIG = { + "local.container.config": operator_runtime.ContainerRuntime.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "local_container.yaml") + ).to_dict(), + "local.python.config": operator_runtime.PythonRuntime.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "local_python.yaml") + ).to_dict(), + } + + def setup_method(self): + # mock operator config + self.mock_config = { + "kind": "operator", + "type": "example", + "version": "v1", + "spec": {}, + "runtime": {}, + "infrastructure": {}, + "execution": {"oci_config": "test_oci_config"}, + } + + # mock operator info + self.mock_operator_info = OperatorInfo( + type="example", + gpu="no", + description="An example operator", + version="v1", + conda="example_v1", + conda_type=PACK_TYPE.CUSTOM, + path=os.path.join(self.TEST_FILES_DIR, "test_operator"), + backends=[BACKEND_NAME.JOB.value, BACKEND_NAME.DATAFLOW.value], + ) + + # mock operator backend + self.mock_backend = LocalOperatorBackend( + config=self.mock_config, operator_info=self.mock_operator_info + ) + + def test__init(self): + """Ensures that the local operator backend can be successfully initialized.""" + assert self.mock_backend.runtime_config == {} + expected_operator_config = { + **{ + key: value + for key, value in self.mock_config.items() + if key not in ("runtime", "infrastructure", "execution") + } + } + assert self.mock_backend.operator_config == expected_operator_config + assert self.mock_backend.operator_type == self.mock_config["type"] + + assert operator_runtime.ContainerRuntime.type in self.mock_backend._RUNTIME_MAP + assert operator_runtime.PythonRuntime.type in self.mock_backend._RUNTIME_MAP + + self.mock_backend.operator_info = self.mock_operator_info + + @patch("runpy.run_module") + def test__run_with_python(self, mock_run_module): + """Tests running the operator within a local python environment.""" + self.mock_backend.runtime_config = PythonRuntime.init().to_dict() + result = self.mock_backend._run_with_python() + mock_run_module.assert_called_with( + self.mock_operator_info.type, run_name="__main__" + ) + assert result == 0 + + @patch("runpy.run_module") + def test__run_with_python_fail(self, mock_run_module): + """Tests running the operator within a local python environment.""" + mock_run_module.side_effect = SystemExit(1) + self.mock_backend.runtime_config = PythonRuntime.init().to_dict() + result = self.mock_backend._run_with_python() + mock_run_module.assert_called_with( + 
self.mock_operator_info.type, run_name="__main__" + ) + assert result == 1 + + @patch("ads.opctl.backend.local.run_container") + def test__run_with_container(self, mock_run_container): + """Tests running the operator within a container.""" + self.mock_backend.runtime_config = ContainerRuntime.init( + **{ + "image": "test_image", + "env": [{"name": "test_env_key", "value": "test_env_value"}], + "volume": ["host_value:container_value"], + } + ).to_dict() + self.mock_backend._run_with_container() + + mock_run_container.assert_called_with( + image="test_image", + bind_volumes={"host_value": {"bind": "container_value"}}, + env_vars={ + "test_env_key": "test_env_value", + ENV_OPERATOR_ARGS: json.dumps(self.mock_backend.operator_config), + }, + command=f"'python3 -m {self.mock_operator_info.type}'", + ) + + @pytest.mark.parametrize( + "mock_runtime_type, mock_runtime_config", + [ + ("python", PythonRuntime().to_dict()), + ("container", ContainerRuntime().to_dict()), + ], + ) + def test_run_success(self, mock_runtime_type, mock_runtime_config): + """Test running the operator with success result""" + self.mock_backend.runtime_config = mock_runtime_config + self.mock_backend.operator_info = None + + mock_run_with = MagicMock(return_value=0) + self.mock_backend._RUNTIME_MAP[mock_runtime_type] = mock_run_with + + with patch.object(OperatorLoader, "from_uri") as mock_operator_loader_from_uri: + # mock objects + mock_operator_loader_from_uri.return_value = MagicMock( + load=MagicMock(return_value=self.mock_operator_info) + ) + + self.mock_backend.run() + mock_run_with.assert_called() + + mock_run_with.return_value = 1 + with pytest.raises(RuntimeError): + self.mock_backend.run() + + def test_run_fail(self): + """Test running the operator with failed result""" + with pytest.raises(RuntimeError): + self.mock_backend.runtime_config = {"type": "undefined"} + self.mock_backend.run() + + def test_init_fail(self): + """Ensures that initiating starter config fails in case of wrong input params.""" + with pytest.raises(ValueError): + self.mock_backend.init(runtime_type="unknown") + + @pytest.mark.parametrize( + "mock_runtime_type, expected_result", + [ + ( + "python", + yaml.load( + "# This YAML specification was auto generated by the `ads operator init` " + "command.\n# The more details about the operator's runtime YAML " + "specification can be found in the ADS documentation:\n# " + "https://accelerated-data-science.readthedocs.io/en/latest " + "\n\n\nkind: operator.local\nspec: null\ntype: python\nversion: v1\n", + Loader=yaml.FullLoader, + ), + ), + ( + "container", + yaml.load( + "# This YAML specification was auto generated by the `ads operator " + "init` command.\n# The more details about the operator's runtime YAML " + "specification can be found in the ADS documentation:\n# " + "https://accelerated-data-science.readthedocs.io/en/latest \n\n\nkind: " + "operator.local\nspec:\n env:\n - name: operator\n " + "value: example:v1\n image: example:v1\n " + "volume:\n - :/root/.oci\ntype: container\nversion: v1\n", + Loader=yaml.FullLoader, + ), + ), + ], + ) + def test_init_success(self, mock_runtime_type, expected_result): + """Tests generating a starter YAML specification for the operator local runtime.""" + assert ( + yaml.load( + self.mock_backend.init(runtime_type=mock_runtime_type), + Loader=yaml.FullLoader, + ) + == expected_result + ) + + +class TestMLJobOperatorBackend: + """Tests backend class to run operator on Data Science Jobs.""" + + def setup_class(cls): + # current directory and test files 
directory + cls.CUR_DIR = os.path.dirname(os.path.abspath(__file__)) + cls.TEST_FILES_DIR = os.path.join(cls.CUR_DIR, "test_files") + + # mock backends + cls.MOCK_BACKEND_CONFIG = { + "job.container.config": jobs.Job.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "job_container.yaml") + ).to_dict(), + "job.python.config": jobs.Job.from_yaml( + uri=os.path.join(cls.TEST_FILES_DIR, "job_python.yaml") + ).to_dict(), + } + + def setup_method(self): + self.mock_config = ( + ConfigProcessor( + { + "kind": "operator", + "type": "example", + "version": "v1", + "spec": {}, + "runtime": {}, + } + ) + .step(ConfigMerger, **{}) + .config + ) + + # mock operator info + self.mock_operator_info = OperatorInfo( + type="example", + gpu="no", + description="An example operator", + version="v1", + conda="example_v1", + conda_type=PACK_TYPE.CUSTOM, + path=os.path.join(self.TEST_FILES_DIR, "test_operator"), + backends=[BACKEND_NAME.JOB.value, BACKEND_NAME.DATAFLOW.value], + ) + + # mock operator backend + self.mock_backend = MLJobOperatorBackend( + config=self.mock_config, operator_info=self.mock_operator_info + ) + + def test__init(self): + assert self.mock_backend.job is None + assert self.mock_backend.runtime_config == {} + + expected_operator_config = { + **{ + key: value + for key, value in self.mock_config.items() + if key not in ("runtime", "infrastructure", "execution") + } + } + assert self.mock_backend.operator_config == expected_operator_config + assert self.mock_backend.operator_type == self.mock_config["type"] + assert self.mock_backend.operator_version == self.mock_config["version"] + + assert jobs.ContainerRuntime().type in self.mock_backend._RUNTIME_MAP + assert jobs.PythonRuntime().type in self.mock_backend._RUNTIME_MAP + + self.mock_backend.operator_info = self.mock_operator_info + + def test__adjust_common_information(self): + self.mock_backend.job = jobs.Job(name="{job", runtime=jobs.PythonRuntime({})) + self.mock_backend._adjust_common_information() + + assert self.mock_backend.job.name == ( + f"job_{self.mock_operator_info.type.lower()}" + f"_{self.mock_operator_info.version.lower()}" + ) + + def test__adjust_container_runtime(self): + self.mock_backend.job = jobs.Job( + name="{job", runtime=jobs.ContainerRuntime().with_image("test-image") + ) + self.mock_backend._adjust_container_runtime() + + assert self.mock_backend.job.runtime.to_dict() == ( + { + "kind": "runtime", + "spec": { + "cmd": "python3 -m example", + "entrypoint": None, + "env": [ + {"name": "OCI_IAM_TYPE", "value": "resource_principal"}, + {"name": "OCIFS_IAM_TYPE", "value": "resource_principal"}, + { + "name": "ENV_OPERATOR_ARGS", + "value": '{"kind": "operator", "type": "example", ' + '"version": "v1", "spec": {}}', + }, + ], + "image": "test-image", + }, + "type": "container", + } + ) + + @patch("time.time", return_value=1) + def test__adjust_python_runtime(self, mock_time): + with tempfile.TemporaryDirectory() as temp_dir: + with patch("tempfile.mkdtemp", return_value=temp_dir): + self.mock_backend.job = jobs.Job( + name="{job", runtime=jobs.PythonRuntime() + ) + self.mock_backend._adjust_python_runtime() + + assert self.mock_backend.job.runtime.to_dict() == ( + { + "kind": "runtime", + "type": "python", + "spec": { + "entrypoint": "example_1_run.sh", + "scriptPathURI": temp_dir, + "workingDir": os.path.basename(temp_dir.rstrip("/")), + "env": [ + {"name": "OCI_IAM_TYPE", "value": "resource_principal"}, + { + "name": "OCIFS_IAM_TYPE", + "value": "resource_principal", + }, + { + "name": "ENV_OPERATOR_ARGS", + "value": 
'{"kind": "operator", "type": "example", "version": "v1", "spec": {}}', + }, + ], + }, + } + ) + + @pytest.mark.parametrize( + "mock_runtime_type, mock_runtime_config", + [ + ( + "python", + jobs.Job( + name="test_name", + runtime=jobs.PythonRuntime(), + infrastructure=jobs.DataScienceJob(), + ).to_dict(), + ), + ( + "container", + jobs.Job( + name="test_name", + runtime=jobs.ContainerRuntime(), + infrastructure=jobs.DataScienceJob(), + ).to_dict(), + ), + ], + ) + @patch.object(jobs.Job, "create") + @patch.object(jobs.Job, "run") + def test_run_success( + self, mock_job_run, mock_job_create, mock_runtime_type, mock_runtime_config + ): + mock_job_create.return_value = MagicMock(run=mock_job_run) + """Test running the operator with success result""" + self.mock_backend.runtime_config = mock_runtime_config + self.mock_backend.operator_info = None + + mock_run_with = MagicMock() + self.mock_backend._RUNTIME_MAP[mock_runtime_type] = mock_run_with + + with patch.object(OperatorLoader, "from_uri") as mock_operator_loader_from_uri: + # mock objects + mock_operator_loader_from_uri.return_value = MagicMock( + load=MagicMock(return_value=self.mock_operator_info) + ) + + self.mock_backend.run() + mock_run_with.assert_called() + mock_job_run.assert_called() + mock_job_create.assert_called() diff --git a/tests/unitary/with_extras/operator/test_operator_config.py b/tests/unitary/with_extras/operator/test_operator_config.py new file mode 100644 index 000000000..18afea1e8 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_operator_config.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +from dataclasses import dataclass + +import pytest +import yaml + +from ads.opctl.operator.common.errors import OperatorSchemaYamlError +from ads.opctl.operator.common.operator_config import OperatorConfig + + +class TestOperatorConfig: + def test_operator_config(self): + # Test valid operator config + + @dataclass(repr=True) + class MyOperatorConfig(OperatorConfig): + @classmethod + def _load_schema(cls) -> str: + return yaml.safe_load( + """ + kind: + required: true + type: string + version: + required: true + type: string + type: + required: true + type: string + spec: + required: true + type: dict + schema: + foo: + required: false + type: string + """ + ) + + config = MyOperatorConfig.from_dict( + { + "kind": "operator", + "type": "my-operator", + "version": "v1", + "spec": {"foo": "bar"}, + } + ) + assert config.kind == "operator" + assert config.type == "my-operator" + assert config.version == "v1" + assert config.spec == {"foo": "bar"} + + # Test invalid operator config + @dataclass(repr=True) + class InvalidOperatorConfig(OperatorConfig): + @classmethod + def _load_schema(cls) -> str: + return yaml.safe_load( + """ + kind: + required: true + type: string + version: + required: true + type: string + allowed: + - v1 + type: + required: true + type: string + spec: + required: true + type: dict + schema: + foo: + required: true + type: string + """ + ) + + with pytest.raises(OperatorSchemaYamlError): + InvalidOperatorConfig.from_dict( + { + "kind": "operator", + "type": "invalid-operator", + "version": "v2", + "spec": {"foo1": 123}, + } + ) diff --git a/tests/unitary/with_extras/operator/test_operator_loader.py b/tests/unitary/with_extras/operator/test_operator_loader.py new file mode 100644 index 000000000..bbe71d87c --- 
/dev/null +++ b/tests/unitary/with_extras/operator/test_operator_loader.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import os +import shutil +import unittest +from unittest.mock import MagicMock, Mock, patch + +import pytest + +from ads.opctl.operator.common.const import ARCH_TYPE, PACK_TYPE +from ads.opctl.operator.common.operator_loader import ( + GitOperatorLoader, + LocalOperatorLoader, + OperatorInfo, + OperatorLoader, + RemoteOperatorLoader, + ServiceOperatorLoader, +) + + +class TestOperatorInfo(unittest.TestCase): + def test_construction(self): + operator_info = OperatorInfo( + type="example", + gpu="yes", + description="An example operator", + version="v1", + conda="example_v1", + conda_type=PACK_TYPE.CUSTOM, + path="/path/to/operator", + backends=["backend1", "backend2"], + ) + assert (operator_info.type, "example") + self.assertTrue(operator_info.gpu) + assert (operator_info.description, "An example operator") + assert (operator_info.version, "v1") + assert (operator_info.conda, "example_v1") + assert (operator_info.conda_type, PACK_TYPE.CUSTOM) + assert (operator_info.path, "/path/to/operator") + assert (operator_info.backends, ["backend1", "backend2"]) + + def test_conda_prefix(self): + operator_info = OperatorInfo( + type="example", + gpu="yes", + description="An example operator", + version="v1", + conda="example_v1", + conda_type=PACK_TYPE.CUSTOM, + path="/path/to/operator", + backends=["backend1", "backend2"], + ) + assert ( + operator_info.conda_prefix, + f"{ARCH_TYPE.GPU}/example/1/example_v1", + ) + + def test_conda_prefix_without_gpu(self): + operator_info = OperatorInfo( + type="example", + gpu="no", + description="An example operator", + version="v1", + conda="example_v1", + conda_type=PACK_TYPE.CUSTOM, + path="/path/to/operator", + backends=["backend1", "backend2"], + ) + assert ( + operator_info.conda_prefix, + f"{ARCH_TYPE.CPU}/example/1/example_v1", + ) + + def test_post_init(self): + operator_info = OperatorInfo( + type="example", + gpu="yes", # Should be converted to boolean + version="", # Should be set to "v1" + conda_type=None, # Should be set to PACK_TYPE.CUSTOM + conda=None, # Should be set to "example_v1" + ) + self.assertTrue(operator_info.gpu) + assert (operator_info.version, "v1") + assert (operator_info.conda_type, PACK_TYPE.CUSTOM) + assert (operator_info.conda, "example_v1") + + def test_from_yaml_with_yaml_string(self): + yaml_string = """ + type: example + gpu: yes + description: An example operator + version: v1 + conda_type: published + path: /path/to/operator + backends: + - backend1 + - backend2 + """ + operator_info = OperatorInfo.from_yaml(yaml_string=yaml_string) + assert (operator_info.type, "example") + self.assertTrue(operator_info.gpu) + assert (operator_info.description, "An example operator") + assert (operator_info.version, "v1") + assert (operator_info.conda, "example_v1") + assert (operator_info.conda_type, PACK_TYPE.CUSTOM) + assert (operator_info.path, "/path/to/operator") + assert (operator_info.backends, ["backend1", "backend2"]) + + @patch("ads.common.serializer.Serializable.from_yaml") + def test_from_yaml_with_uri(self, mock_from_yaml): + uri = "http://example.com/operator.yaml" + loader = MagicMock() + mock_from_yaml.return_value = OperatorInfo( + type="example", + gpu="yes", + description="An example operator", + version="v1", + 
conda="example_v1", + conda_type=PACK_TYPE.CUSTOM, + path="/path/to/operator", + backends=["backend1", "backend2"], + ) + operator_info = OperatorInfo.from_yaml(uri=uri, loader=loader) + mock_from_yaml.assert_called_with(yaml_string=None, uri=uri, loader=loader) + assert (operator_info.type, "example") + self.assertTrue(operator_info.gpu) + assert (operator_info.description, "An example operator") + assert (operator_info.version, "v1") + assert (operator_info.conda, "example_v1") + assert (operator_info.conda_type, PACK_TYPE.CUSTOM) + assert (operator_info.path, "http://example.com") + assert (operator_info.backends, ["backend1", "backend2"]) + + +class TestOperatorLoader: + def setup_method(self): + # Create a mock Loader instance for testing + self.loader = Mock() + self.operator_loader = OperatorLoader(self.loader) + + def test_load_operator(self): + # Define a mock OperatorInfo object to return when load is called on the loader + mock_operator_info = OperatorInfo( + type="mock_operator", + gpu=False, + description="Mock Operator", + version="v1", + conda="mock_operator_v1", + conda_type="custom", + path="/path/to/mock_operator", + backends=["cpu"], + ) + + # Mock the _load method to return the mock_operator_info object + self.loader.load.return_value = mock_operator_info + + # Call the load method of the OperatorLoader + operator_info = self.operator_loader.load() + + # Check if the returned OperatorInfo object matches the expected values + + assert operator_info.type == "mock_operator" + assert operator_info.gpu == False + assert operator_info.description == "Mock Operator" + assert operator_info.version == "v1" + assert operator_info.conda == "mock_operator_v1" + assert operator_info.conda_type == "custom" + assert operator_info.path == "/path/to/mock_operator" + assert operator_info.backends == ["cpu"] + + def test_load_operator_exception(self): + # Mock the _load method to raise an exception + self.loader.load.side_effect = Exception("Error loading operator") + + # Call the load method of the OperatorLoader and expect an exception + with pytest.raises(Exception): + self.operator_loader.load() + + @pytest.mark.parametrize( + "test_name, uri, expected_result", + [ + ("Service Path", "forecast", ServiceOperatorLoader), + ("Local Path", "/path/to/local_operator", LocalOperatorLoader), + ("OCI Path", "oci://bucket/operator.zip", RemoteOperatorLoader), + ( + "Git Path", + "https://github.com/my-operator-repository", + GitOperatorLoader, + ), + ], + ) + def test_from_uri(self, test_name, uri, expected_result): + # Call the from_uri method of the OperatorLoader class + operator_loader = OperatorLoader.from_uri(uri=uri) + assert isinstance(operator_loader.loader, expected_result) + + def test_empty_uri(self): + # Test with an empty URI that should raise a ValueError + with pytest.raises(ValueError): + OperatorLoader.from_uri(uri="", uri_dst=None) + + def test_invalid_uri(self): + # Test with an invalid URI that should raise a ValueError + with pytest.raises(ValueError): + OperatorLoader.from_uri(uri="aws://", uri_dst=None) + + +class TestServiceOperatorLoader(unittest.TestCase): + def setUp(self): + # Create a mock ServiceOperatorLoader instance for testing + self.loader = ServiceOperatorLoader(uri="mock_service_operator") + self.mock_operator_info = OperatorInfo( + type="mock_operator", + gpu="no", + description="Mock Operator", + version="v1", + conda="mock_operator_v1", + conda_type="custom", + path="/path/to/mock_operator", + backends=["cpu"], + ) + + def test_compatible(self): + # Test 
the compatible method with a valid URI + uri = "forecast" + self.assertTrue(ServiceOperatorLoader.compatible(uri=uri)) + + # Test the compatible method with an invalid URI + uri = "invalid_service_operator" + self.assertFalse(ServiceOperatorLoader.compatible(uri=uri)) + + def test_load(self): + # Mock the _load method to return the mock_operator_info object + self.loader._load = Mock(return_value=self.mock_operator_info) + + # Call the load method of the ServiceOperatorLoader + operator_info = self.loader.load() + + # Check if the returned OperatorInfo object matches the expected values + self.assertEqual(operator_info.type, "mock_operator") + self.assertEqual(operator_info.gpu, False) + self.assertEqual(operator_info.description, "Mock Operator") + self.assertEqual(operator_info.version, "v1") + self.assertEqual(operator_info.conda, "mock_operator_v1") + self.assertEqual(operator_info.conda_type, "custom") + self.assertEqual(operator_info.path, "/path/to/mock_operator") + self.assertEqual(operator_info.backends, ["cpu"]) + + def test_load_exception(self): + # Mock the _load method to raise an exception + self.loader._load = Mock( + side_effect=Exception("Error loading service operator") + ) + + # Call the load method of the ServiceOperatorLoader and expect an exception + with self.assertRaises(Exception): + self.loader.load() + + +class TestLocalOperatorLoader(unittest.TestCase): + def setUp(self): + # Create a mock LocalOperatorLoader instance for testing + self.loader = LocalOperatorLoader(uri="path/to/local/operator") + self.mock_operator_info = OperatorInfo( + type="mock_operator", + gpu=False, + description="Mock Operator", + version="v1", + conda="mock_operator_v1", + conda_type="custom", + path="/path/to/mock_operator", + backends=["cpu"], + ) + + def test_compatible(self): + # Test the compatible method with a valid URI + uri = "path/to/local/operator" + self.assertTrue(LocalOperatorLoader.compatible(uri=uri)) + + # Test the compatible method with an invalid URI + uri = "http://example.com/remote/operator" + self.assertFalse(LocalOperatorLoader.compatible(uri=uri)) + + def test_load(self): + # Mock the _load method to return the mock_operator_info object + self.loader._load = Mock(return_value=self.mock_operator_info) + + # Call the load method of the LocalOperatorLoader + operator_info = self.loader.load() + + # Check if the returned OperatorInfo object matches the expected values + self.assertEqual(operator_info.type, "mock_operator") + self.assertEqual(operator_info.gpu, False) + self.assertEqual(operator_info.description, "Mock Operator") + self.assertEqual(operator_info.version, "v1") + self.assertEqual(operator_info.conda, "mock_operator_v1") + self.assertEqual(operator_info.conda_type, "custom") + self.assertEqual(operator_info.path, "/path/to/mock_operator") + self.assertEqual(operator_info.backends, ["cpu"]) + + def test_load_exception(self): + # Mock the _load method to raise an exception + self.loader._load = Mock(side_effect=Exception("Error loading local operator")) + + # Call the load method of the LocalOperatorLoader and expect an exception + with self.assertRaises(Exception): + self.loader.load() + + +class TestRemoteOperatorLoader(unittest.TestCase): + def setUp(self): + # Create a mock RemoteOperatorLoader instance for testing + self.loader = RemoteOperatorLoader(uri="oci://bucket/operator.zip") + self.mock_operator_info = OperatorInfo( + type="mock_operator", + gpu=False, + description="Mock Operator", + version="v1", + conda="mock_operator_v1", + 
conda_type="custom", + path="/path/to/mock_operator", + backends=["cpu"], + ) + + def test_compatible(self): + # Test the compatible method with a valid URI + uri = "oci://bucket/operator.zip" + self.assertTrue(RemoteOperatorLoader.compatible(uri=uri)) + + # Test the compatible method with an invalid URI + uri = "http://example.com/remote/operator" + self.assertFalse(RemoteOperatorLoader.compatible(uri=uri)) + + def test_load(self): + # Mock the _load method to return the mock_operator_info object + self.loader._load = Mock(return_value=self.mock_operator_info) + + # Call the load method of the RemoteOperatorLoader + operator_info = self.loader.load() + + # Check if the returned OperatorInfo object matches the expected values + self.assertEqual(operator_info.type, "mock_operator") + self.assertEqual(operator_info.gpu, False) + self.assertEqual(operator_info.description, "Mock Operator") + self.assertEqual(operator_info.version, "v1") + self.assertEqual(operator_info.conda, "mock_operator_v1") + self.assertEqual(operator_info.conda_type, "custom") + self.assertEqual(operator_info.path, "/path/to/mock_operator") + self.assertEqual(operator_info.backends, ["cpu"]) + + def test_load_exception(self): + # Mock the _load method to raise an exception + self.loader._load = Mock(side_effect=Exception("Error loading remote operator")) + + # Call the load method of the RemoteOperatorLoader and expect an exception + with self.assertRaises(Exception): + self.loader.load() + + +class TestGitOperatorLoader(unittest.TestCase): + def setUp(self): + # Create a temporary directory for testing + self.temp_dir = "temp_git_loader" + os.makedirs(self.temp_dir, exist_ok=True) + + # Create a mock GitOperatorLoader instance for testing + self.loader = GitOperatorLoader( + uri="https://github.com/mock_operator_repository.git@feature-branch#forecasting", + uri_dst=self.temp_dir, + ) + self.mock_operator_info = OperatorInfo( + type="mock_operator", + gpu=False, + description="Mock Operator", + version="v1", + conda="mock_operator_v1", + conda_type="custom", + path=os.path.join(self.temp_dir, "forecasting"), + backends=["cpu"], + ) + + def tearDown(self): + # Clean up the temporary directory + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_compatible(self): + # Test the compatible method with a valid URI + uri = ( + "https://github.com/mock_operator_repository.git@feature-branch#forecasting" + ) + self.assertTrue(GitOperatorLoader.compatible(uri=uri)) + + # Test the compatible method with an invalid URI + uri = "http://example.com/remote/operator" + self.assertFalse(GitOperatorLoader.compatible(uri=uri)) + + def test_load(self): + # Mock the git.Repo.clone_from method to avoid actual Git operations + with patch("git.Repo.clone_from") as mock_clone_from: + mock_clone_from.return_value = Mock() + + # Mock the _load method to return the mock_operator_info object + self.loader._load = Mock(return_value=self.mock_operator_info) + + # Call the load method of the GitOperatorLoader + operator_info = self.loader.load() + + # Check if the returned OperatorInfo object matches the expected values + self.assertEqual(operator_info.type, "mock_operator") + self.assertEqual(operator_info.gpu, False) + self.assertEqual(operator_info.description, "Mock Operator") + self.assertEqual(operator_info.version, "v1") + self.assertEqual(operator_info.conda, "mock_operator_v1") + self.assertEqual(operator_info.conda_type, "custom") + self.assertEqual( + operator_info.path, os.path.join(self.temp_dir, "forecasting") + ) + 
self.assertEqual(operator_info.backends, ["cpu"]) + + def test_load_exception(self): + # Mock the git.Repo.clone_from method to raise an exception + with patch( + "git.Repo.clone_from", side_effect=Exception("Error cloning Git repository") + ): + # Mock the _load method to raise an exception + self.loader._load = Mock( + side_effect=Exception("Error loading Git operator") + ) + + # Call the load method of the GitOperatorLoader and expect an exception + with self.assertRaises(Exception): + self.loader.load() diff --git a/tests/unitary/with_extras/operator/test_operator_yaml_generator.py b/tests/unitary/with_extras/operator/test_operator_yaml_generator.py new file mode 100644 index 000000000..354e6fc88 --- /dev/null +++ b/tests/unitary/with_extras/operator/test_operator_yaml_generator.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import pytest +import yaml + +from ads.opctl.operator.common.operator_yaml_generator import YamlGenerator + + +class TestOperatorYamlGenerator: + """Tests class for generating the YAML config based on the given YAML schema.""" + + @pytest.mark.parametrize( + "schema, values, expected_result", + [ + # Test case: Basic schema with required and default values + ( + { + "key1": { + "type": "string", + "default": "test_value", + "required": True, + }, + "key2": {"type": "number", "default": 42, "required": True}, + }, + {}, + {"key1": "test_value", "key2": 42}, + ), + # Test case: Basic schema with required and default values + ( + { + "key1": {"type": "string", "required": True}, + "key2": {"type": "number", "default": 42, "required": True}, + }, + {"key1": "test_value"}, + {"key1": "test_value", "key2": 42}, + ), + # Test case: Basic schema with required and default values + ( + { + "key1": {"type": "string", "required": True}, + "key2": {"type": "number", "default": 42}, + }, + {"key1": "test_value"}, + {"key1": "test_value"}, + ), + # Test case: Schema with dependencies + ( + { + "model": {"type": "string", "required": True, "default": "prophet"}, + "owner_name": { + "type": "string", + "dependencies": {"model": "prophet"}, + }, + }, + {"owner_name": "value"}, + {"model": "prophet", "owner_name": "value"}, + ), + # Test case: Schema with dependencies + ( + { + "model": {"type": "string", "required": True, "default": "prophet"}, + "owner_name": { + "type": "string", + "dependencies": {"model": "prophet1"}, + }, + }, + {"owner_name": "value"}, + {"model": "prophet"}, + ), + ], + ) + def test_generate_example(self, schema, values, expected_result): + yaml_generator = YamlGenerator(schema=schema) + yaml_config = yaml_generator.generate_example(values) + assert yaml_config == yaml.dump(expected_result) diff --git a/tests/unitary/with_extras/operator/test_runtime.py b/tests/unitary/with_extras/operator/test_runtime.py new file mode 100644 index 000000000..77e324d8f --- /dev/null +++ b/tests/unitary/with_extras/operator/test_runtime.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import unittest +from unittest.mock import MagicMock, patch + +from ads.opctl.operator.common.errors import OperatorSchemaYamlError +from ads.opctl.operator.runtime.runtime import ( + OPERATOR_LOCAL_RUNTIME_TYPE, + ContainerRuntime, + ContainerRuntimeSpec, + PythonRuntime, + Runtime, +) + + +class TestRuntime(unittest.TestCase): + def setUp(self): + self.runtime = Runtime() + + def test_kind(self): + self.assertEqual(self.runtime.kind, "operator.local") + + def test_type(self): + self.assertIsNone(self.runtime.type) + + def test_version(self): + self.assertIsNone(self.runtime.version) + + @patch("ads.opctl.operator.runtime.runtime._load_yaml_from_uri") + @patch("ads.opctl.operator.runtime.runtime.Validator") + def test_validate_dict(self, mock_validator, mock_load_yaml): + mock_validator.return_value.validate.return_value = True + self.assertTrue(Runtime._validate_dict({})) + mock_load_yaml.assert_called_once() + mock_validator.assert_called_once() + + @patch("ads.opctl.operator.runtime.runtime._load_yaml_from_uri") + @patch("ads.opctl.operator.runtime.runtime.Validator") + def test_validate_dict_invalid(self, mock_validator, mock_load_yaml): + mock_validator.return_value = MagicMock( + errors=[{"error": "error"}], validate=MagicMock(return_value=False) + ) + mock_validator.return_value.validate.return_value = False + with self.assertRaises(OperatorSchemaYamlError): + Runtime._validate_dict({}) + mock_load_yaml.assert_called_once() + mock_validator.assert_called_once() + + +class TestContainerRuntime(unittest.TestCase): + def test_init(self): + runtime = ContainerRuntime.init( + image="my-image", + env=[{"name": "VAR1", "value": "value1"}], + volume=["/data"], + ) + self.assertIsInstance(runtime, ContainerRuntime) + self.assertEqual(runtime.type, OPERATOR_LOCAL_RUNTIME_TYPE.CONTAINER.value) + self.assertEqual(runtime.version, "v1") + self.assertIsInstance(runtime.spec, ContainerRuntimeSpec) + self.assertEqual(runtime.spec.image, "my-image") + self.assertEqual(runtime.spec.env, [{"name": "VAR1", "value": "value1"}]) + self.assertEqual(runtime.spec.volume, ["/data"]) + + def test_validate_dict(self): + valid_dict = { + "kind": "operator.local", + "type": "container", + "version": "v1", + "spec": { + "image": "my-image", + "env": [{"name": "VAR1", "value": "value1"}], + "volume": ["/data"], + }, + } + self.assertTrue(ContainerRuntime._validate_dict(valid_dict)) + + invalid_dict = { + "kind": "operator.local", + "type": "unknown", + "version": "v1", + "spec": { + "image": "my-image", + "env": [{"name": "VAR1"}], + "volume": ["/data"], + }, + } + with self.assertRaises(OperatorSchemaYamlError): + ContainerRuntime._validate_dict(invalid_dict) + + +class TestPythonRuntime(unittest.TestCase): + def test_init(self): + runtime = PythonRuntime.init() + self.assertIsInstance(runtime, PythonRuntime) + self.assertEqual(runtime.type, "python") + self.assertEqual(runtime.version, "v1")
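A note on the `TestOperatorInfo` checks added in `test_operator_loader.py`: assertions written as `assert (operator_info.type, "example")` build a two-element tuple, which is always truthy, so they pass regardless of the attribute's actual value. The sketch below shows the equality-based form for `test_construction`, reusing the constructor arguments from that test; it assumes `OperatorInfo.__post_init__` only normalizes `gpu` and leaves the explicitly passed fields unchanged, which is worth confirming before pinning exact values.

```python
# Sketch only: tuple-style asserts replaced with explicit comparisons.
from ads.opctl.operator.common.const import PACK_TYPE
from ads.opctl.operator.common.operator_loader import OperatorInfo


def test_construction_equality():
    operator_info = OperatorInfo(
        type="example",
        gpu="yes",
        description="An example operator",
        version="v1",
        conda="example_v1",
        conda_type=PACK_TYPE.CUSTOM,
        path="/path/to/operator",
        backends=["backend1", "backend2"],
    )
    # `assert (x, y)` always passes because a non-empty tuple is truthy;
    # compare the fields explicitly instead.
    assert operator_info.type == "example"
    assert operator_info.gpu  # "yes" is expected to be normalized to a truthy value
    assert operator_info.description == "An example operator"
    assert operator_info.version == "v1"
    assert operator_info.conda == "example_v1"
    assert operator_info.conda_type == PACK_TYPE.CUSTOM
    assert operator_info.path == "/path/to/operator"
    assert operator_info.backends == ["backend1", "backend2"]
```

The same adjustment applies to the tuple-style checks in `test_conda_prefix`, `test_conda_prefix_without_gpu`, `test_post_init`, and the `from_yaml` tests, where the expected values should be re-derived from the actual `OperatorInfo` behavior rather than carried over verbatim.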