diff --git a/THIRD_PARTY_LICENSES.txt b/THIRD_PARTY_LICENSES.txt index 6c320bca9..418d831bb 100644 --- a/THIRD_PARTY_LICENSES.txt +++ b/THIRD_PARTY_LICENSES.txt @@ -72,6 +72,12 @@ fastavro * Source code: https://github.com/fastavro/fastavro * Project home: https://github.com/fastavro/fastavro +fiona +* Copyright (c) 2007, Sean C. Gillies +* License: BSD 3-Clause "New" or "Revised" License +* Source code: https://github.com/Toblerity/Fiona +* Project home: https://github.com/Toblerity/Fiona + folium * Copyright (C) 2013, Rob Story * License: MIT License @@ -459,7 +465,13 @@ pydantic * Source code: https://github.com/pydantic/pydantic * Project home: https://docs.pydantic.dev/latest/ -======= +rrcf +* Copyright 2018 kLabUM +* License: MIT License +* Source code: https://github.com/kLabUM/rrcf +* Project home: https://github.com/kLabUM/rrcf + + =============================== Licenses =============================== ------------------------------------------------------------------------ diff --git a/ads/opctl/conda/cmds.py b/ads/opctl/conda/cmds.py index d24c4f0d1..0fd34ccff 100644 --- a/ads/opctl/conda/cmds.py +++ b/ads/opctl/conda/cmds.py @@ -181,29 +181,29 @@ def _create( logger.info( f"Preparing manifest. 
Manifest in the environment: {conda_dep.get('manifest')}" ) - manifest = _fetch_manifest_template() + manifest_template = _fetch_manifest_template() - if "name" not in manifest: + if "name" not in manifest_template: - manifest["manifest"]["name"] = name - manifest["manifest"]["slug"] = slug + manifest_template["manifest"]["name"] = name + manifest_template["manifest"]["slug"] = slug - if "type" not in manifest: + if "type" not in manifest_template: logger.info("Setting manifest to published") - manifest["manifest"]["type"] = "published" + manifest_template["manifest"]["type"] = "published" - if "version" not in manifest: + if "version" not in manifest_template: - manifest["manifest"]["version"] = version - manifest["manifest"]["arch_type"] = "GPU" if gpu else "CPU" + manifest_template["manifest"]["version"] = version + manifest_template["manifest"]["arch_type"] = "GPU" if gpu else "CPU" - manifest["manifest"]["create_date"] = datetime.utcnow().strftime( + manifest_template["manifest"]["create_date"] = datetime.utcnow().strftime( "%a, %b %d, %Y, %H:%M:%S %Z UTC" ) - if not "manifest_version" in manifest: + if "manifest_version" not in manifest_template: - manifest["manifest"]["manifest_version"] = "1.0" + manifest_template["manifest"]["manifest_version"] = "1.0" logger.info(f"Creating conda environment {slug}") manifest_dict = { - k: manifest["manifest"][k] - for k in manifest["manifest"] - if manifest["manifest"][k] + k: manifest_template["manifest"][k] + for k in manifest_template["manifest"] + if manifest_template["manifest"][k] } if "manifest" in conda_dep: conda_dep["manifest"].update(manifest_dict) diff --git a/ads/opctl/operator/lowcode/anomaly/const.py b/ads/opctl/operator/lowcode/anomaly/const.py index ff0e0fd22..73bac624b 100644 --- a/ads/opctl/operator/lowcode/anomaly/const.py +++ b/ads/opctl/operator/lowcode/anomaly/const.py @@ -21,6 +21,7 @@ class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta): OneClassSVM = "oneclasssvm" IsolationForest = "isolationforest" + RandomCutForest = "randomcutforest" # TODO : Add DBScan # DBScan = "dbscan" diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py
b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index e909976d8..e8de5213e 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -16,7 +16,11 @@ from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD +from ads.opctl.operator.lowcode.anomaly.const import ( + SUBSAMPLE_THRESHOLD, + OutputColumns, + SupportedMetrics, +) from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer from ads.opctl.operator.lowcode.common.utils import ( disable_print, @@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets): def generate_report(self): """Generates the report.""" import matplotlib.pyplot as plt + plt.rcParams.update({'figure.max_open_warning': 0}) import report_creator as rc start_time = time.time() @@ -87,43 +92,59 @@ def generate_report(self): self.spec.datetime_column.name if self.spec.datetime_column else "index" ) + ( + model_description, + other_sections, + ) = self._generate_report() + blocks = [] for target, df in self.datasets.full_data_dict.items(): - figure_blocks = [] - time_col = df[date_column].reset_index(drop=True) - anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[ - OutputColumns.ANOMALY_COL - ] - anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1] - downsampled_time_col = time_col - selected_indices = list(range(len(time_col))) - if self.spec.subsample_report_data: - non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices] - # Downsample non-anomalous data if it exceeds the threshold (1000) - if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD: - downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD] - selected_indices = anomaly_indices + 
downsampled_non_anomaly_indices - selected_indices.sort() - downsampled_time_col = time_col[selected_indices] - - columns = set(df.columns).difference({date_column}) - for col in columns: - y = df[col].reset_index(drop=True) - - downsampled_y = y[selected_indices] - - fig, ax = plt.subplots(figsize=(8, 3), layout="constrained") - ax.grid() - ax.plot(downsampled_time_col, downsampled_y, color="black") - # Plot anomalies - for i in anomaly_indices: - ax.scatter(time_col[i], y[i], color="red", marker="o") - plt.xlabel(date_column) - plt.ylabel(col) - plt.title(f"`{col}` with reference to anomalies") - figure_blocks.append(rc.Widget(ax)) - - blocks.append(rc.Group(*figure_blocks, label=target)) + if target in anomaly_output.list_categories(): + figure_blocks = [] + time_col = df[date_column].reset_index(drop=True) + anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[ + OutputColumns.ANOMALY_COL + ] + anomaly_indices = [ + i for i, index in enumerate(anomaly_col) if index == 1 + ] + downsampled_time_col = time_col + selected_indices = list(range(len(time_col))) + if self.spec.subsample_report_data: + non_anomaly_indices = [ + i for i in range(len(time_col)) if i not in anomaly_indices + ] + # Downsample non-anomalous data if it exceeds the threshold (1000) + if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD: + downsampled_non_anomaly_indices = non_anomaly_indices[ + :: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD + ] + selected_indices = ( + anomaly_indices + downsampled_non_anomaly_indices + ) + selected_indices.sort() + downsampled_time_col = time_col[selected_indices] + + columns = set(df.columns).difference({date_column}) + for col in columns: + y = df[col].reset_index(drop=True) + + downsampled_y = y[selected_indices] + + fig, ax = plt.subplots(figsize=(8, 3), layout="constrained") + ax.grid() + ax.plot(downsampled_time_col, downsampled_y, color="black") + # Plot anomalies + for i in anomaly_indices: + ax.scatter(time_col[i], y[i], color="red", 
marker="o") + plt.xlabel(date_column) + plt.ylabel(col) + plt.title(f"`{col}` with reference to anomalies") + figure_blocks.append(rc.Widget(ax)) + else: + figure_blocks = None + + blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None plots = rc.Select(blocks) report_sections = [] @@ -133,7 +154,7 @@ def generate_report(self): yaml_appendix = rc.Yaml(self.config.to_dict()) summary = rc.Block( rc.Group( - rc.Text(f"You selected the **`{self.spec.model}`** model."), + rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"), rc.Text( "Based on your dataset, you could have also selected " f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." diff --git a/ads/opctl/operator/lowcode/anomaly/model/factory.py b/ads/opctl/operator/lowcode/anomaly/model/factory.py index 49adfb04f..64028cba2 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/factory.py +++ b/ads/opctl/operator/lowcode/anomaly/model/factory.py @@ -15,6 +15,7 @@ from .base_model import AnomalyOperatorBaseModel from .isolationforest import IsolationForestOperatorModel from .oneclasssvm import OneClassSVMOperatorModel +from .randomcutforest import RandomCutForestOperatorModel class UnSupportedModelError(Exception): @@ -52,6 +53,7 @@ class AnomalyOperatorModelFactory: _NonTime_MAP = { NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel, NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel, + NonTimeADSupportedModels.RandomCutForest: RandomCutForestOperatorModel, # TODO: Add DBScan model for non time based anomaly # NonTimeADSupportedModels.DBScan: DBScanOperatorModel, } diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py new file mode 100644 index 000000000..e2b8b9d5a --- /dev/null +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -0,0 +1,116 @@ 
+#!/usr/bin/env python + +# Copyright (c) 2023, 2024 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import numpy as np +import pandas as pd + +from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl import logger +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns + +from .anomaly_dataset import AnomalyOutput +from .base_model import AnomalyOperatorBaseModel + + +class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): + """ + Class representing Random Cut Forest Anomaly Detection operator model. + """ + + @runtime_dependency( + module="rrcf", + err_msg=( + "Please run `pip install rrcf` to " + "install the required dependencies for RandomCutForest." + ), + ) + def _build_model(self) -> AnomalyOutput: + from rrcf import RCTree + + model_kwargs = self.spec.model_kwargs + + anomaly_output = AnomalyOutput(date_column="index") + + # Set tree parameters + num_trees = model_kwargs.get("num_trees", 200) + shingle_size = model_kwargs.get("shingle_size", None) + anomaly_threshold = model_kwargs.get("anomaly_threshold", 95) + + for target, df in self.datasets.full_data_dict.items(): + try: + if df.shape[0] == 1: + raise ValueError("Dataset size must be greater than 1") + df_values = df[self.spec.target_column].astype(float).values + + cal_shingle_size = ( + shingle_size + if shingle_size + else int(2 ** np.floor(np.log2(df.shape[0])) / 2) + ) + points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size))) + + sample_size_range = (1, points.shape[0]) + n = points.shape[0] + avg_codisp = pd.Series(0.0, index=np.arange(n)) + index = np.zeros(n) + + forest = [] + while len(forest) < num_trees: + ixs = np.random.choice(n, size=sample_size_range, replace=False) + trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs] + forest.extend(trees) + + for tree in forest: + codisp = pd.Series( + {leaf: tree.codisp(leaf) for
leaf in tree.leaves} + ) + avg_codisp[codisp.index] += codisp + np.add.at(index, codisp.index.values, 1) + + avg_codisp /= index + avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index + avg_codisp = (avg_codisp - avg_codisp.min()) / ( + avg_codisp.max() - avg_codisp.min() + ) + + y_pred = ( + avg_codisp > np.percentile(avg_codisp, anomaly_threshold) + ).astype(int) + + index_col = df.columns[0] + + anomaly = pd.DataFrame( + {index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred} + ).reset_index(drop=True) + score = pd.DataFrame( + {"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp} + ).reset_index(drop=True) + + anomaly_output.add_output(target, anomaly, score) + except Exception as e: + logger.warn(f"Encountered Error: {e}. Skipping series {target}.") + + return anomaly_output + + def _generate_report(self): + """Generates the report.""" + import report_creator as rc + + other_sections = [ + rc.Heading("Selected Models Overview", level=2), + rc.Text( + "The following tables provide information regarding the chosen model." + ), + ] + + model_description = rc.Text( + "The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection." + " It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points." 
+ ) + + return ( + model_description, + other_sections, + ) diff --git a/ads/opctl/operator/lowcode/anomaly/schema.yaml b/ads/opctl/operator/lowcode/anomaly/schema.yaml index e6c6cd998..bb5caa6ec 100644 --- a/ads/opctl/operator/lowcode/anomaly/schema.yaml +++ b/ads/opctl/operator/lowcode/anomaly/schema.yaml @@ -363,6 +363,7 @@ spec: - auto - oneclasssvm - isolationforest + - randomcutforest meta: description: "The model to be used for anomaly detection" diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 42af9030f..7322fe732 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,12 @@ Release Notes ============= +2.11.18 +------- +Release date: September 20, 2024 + +* Added ``with_artifact()`` in ``ContainerRuntime`` class to support running container job with additional artifact. + 2.11.17 ------- Release date: August 9, 2024 diff --git a/docs/source/user_guide/jobs/run_container.rst b/docs/source/user_guide/jobs/run_container.rst index 9c469c555..b148831bc 100644 --- a/docs/source/user_guide/jobs/run_container.rst +++ b/docs/source/user_guide/jobs/run_container.rst @@ -22,7 +22,7 @@ Here is an example to create and run a container job: To configure ``ContainerRuntime``, you must specify the container ``image``. Similar to other runtime, you can add environment variables. -You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container. +You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container. You may also add an additional artifact (file or directory) if needed. Please note that if you add a directory, it will be compressed as a zip file under `/home/datascience` and you will need to unzip it in your container.
See also: diff --git a/docs/source/user_guide/jobs/tabs/container_runtime.rst b/docs/source/user_guide/jobs/tabs/container_runtime.rst index 0ef47d152..cccf7c172 100644 --- a/docs/source/user_guide/jobs/tabs/container_runtime.rst +++ b/docs/source/user_guide/jobs/tabs/container_runtime.rst @@ -33,6 +33,7 @@ .with_environment_variable(GREETINGS="Welcome to OCI Data Science") .with_entrypoint(["/bin/sh", "-c"]) .with_cmd("sleep 5 && echo $GREETINGS") + .with_artifact("path/to/artifact") ) ) @@ -69,6 +70,7 @@ - name: GREETINGS value: Welcome to OCI Data Science image: .ocir.io// + scriptPathURI: path/to/artifact .. code-block:: python diff --git a/pyproject.toml b/pyproject.toml index feb8cf355..daf60c1fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ build-backend = "flit_core.buildapi" # Required name = "oracle_ads" # the install (PyPI) name; name for local build in [tool.flit.module] section below -version = "2.11.17" +version = "2.11.18" # Optional description = "Oracle Accelerated Data Science SDK" @@ -95,6 +95,7 @@ data = [ ] geo = [ "geopandas<1.0.0", # in v1.0.0 removed the built-in dataset 'naturalearth_lowres', fix when relax version of geopandas needed + "fiona<=1.9.6", "oracle_ads[viz]" ] huggingface = [ @@ -177,6 +178,8 @@ anomaly = [ "autots", "oracledb", "report-creator==1.0.9", + "rrcf==0.4.4", + "scikit-learn" ] recommender = [ "oracle_ads[opctl]",
parameters_short.append((m, d)) -MODELS = ["autots", "oneclasssvm", "isolationforest"] +MODELS = ["autots", "oneclasssvm", "isolationforest", "randomcutforest"] @pytest.mark.parametrize("model", ["autots"]) def test_artificial_big(model):