Commit c5b5921: sync with main

VipulMascarenhas committed Sep 24, 2024
2 parents 5b4d2d3 + 8cf0d7c
Showing 13 changed files with 217 additions and 52 deletions.
14 changes: 13 additions & 1 deletion THIRD_PARTY_LICENSES.txt
@@ -72,6 +72,12 @@ fastavro
* Source code: https://github.com/fastavro/fastavro
* Project home: https://github.com/fastavro/fastavro

fiona
* Copyright (c) 2007, Sean C. Gillies
* License: BSD 3-Clause "New" or "Revised" License
* Source code: https://github.com/Toblerity/Fiona
* Project home: https://github.com/Toblerity/Fiona

folium
* Copyright (C) 2013, Rob Story
* License: MIT License
@@ -459,7 +465,13 @@ pydantic
* Source code: https://github.com/pydantic/pydantic
* Project home: https://docs.pydantic.dev/latest/

rrcf
* Copyright 2018 kLabUM
* License: MIT License
* Source code: https://github.com/kLabUM/rrcf
* Project home: https://github.com/kLabUM/rrcf


=============================== Licenses ===============================
------------------------------------------------------------------------

22 changes: 11 additions & 11 deletions ads/opctl/conda/cmds.py
@@ -181,29 +181,29 @@ def _create(
logger.info(
f"Preparing manifest. Manifest in the environment: {conda_dep.get('manifest')}"
)
-manifest = _fetch_manifest_template()
+manifest_template = _fetch_manifest_template()
if "name" not in manifest:
manifest["manifest"]["name"] = name
manifest["manifest"]["slug"] = slug
manifest_template["manifest"]["name"] = name
manifest_template["manifest"]["slug"] = slug
if "type" not in manifest:
logger.info("Setting manifest to published")
manifest["manifest"]["type"] = "published"
manifest_template["manifest"]["type"] = "published"
if "version" not in manifest:
manifest["manifest"]["version"] = version
manifest["manifest"]["arch_type"] = "GPU" if gpu else "CPU"
manifest_template["manifest"]["version"] = version
manifest_template["manifest"]["arch_type"] = "GPU" if gpu else "CPU"

manifest["manifest"]["create_date"] = datetime.utcnow().strftime(
manifest_template["manifest"]["create_date"] = datetime.utcnow().strftime(
"%a, %b %d, %Y, %H:%M:%S %Z UTC"
)

if not "manifest_version" in manifest:
manifest["manifest"]["manifest_version"] = "1.0"
manifest_template["manifest"]["manifest_version"] = "1.0"

logger.info(f"Creating conda environment {slug}")
manifest_dict = {
k: manifest["manifest"][k]
for k in manifest["manifest"]
if manifest["manifest"][k]
k: manifest_template["manifest"][k]
for k in manifest_template["manifest"]
if manifest_template["manifest"][k]
}
if "manifest" in conda_dep:
conda_dep["manifest"].update(manifest_dict)
1 change: 1 addition & 0 deletions ads/opctl/operator/lowcode/anomaly/const.py
@@ -21,6 +21,7 @@ class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):

OneClassSVM = "oneclasssvm"
IsolationForest = "isolationforest"
RandomCutForest = "randomcutforest"
# TODO : Add DBScan
# DBScan = "dbscan"

95 changes: 58 additions & 37 deletions ads/opctl/operator/lowcode/anomaly/model/base_model.py
@@ -16,7 +16,11 @@

from ads.common.object_storage_details import ObjectStorageDetails
from ads.opctl import logger
-from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
+from ads.opctl.operator.lowcode.anomaly.const import (
+SUBSAMPLE_THRESHOLD,
+OutputColumns,
+SupportedMetrics,
+)
from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
from ads.opctl.operator.lowcode.common.utils import (
disable_print,
@@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
def generate_report(self):
"""Generates the report."""
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import report_creator as rc

start_time = time.time()
@@ -87,43 +92,59 @@ def generate_report(self):
self.spec.datetime_column.name if self.spec.datetime_column else "index"
)

+(
+model_description,
+other_sections,
+) = self._generate_report()
+
blocks = []
for target, df in self.datasets.full_data_dict.items():
-figure_blocks = []
-time_col = df[date_column].reset_index(drop=True)
-anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
-OutputColumns.ANOMALY_COL
-]
-anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
-downsampled_time_col = time_col
-selected_indices = list(range(len(time_col)))
-if self.spec.subsample_report_data:
-non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
-# Downsample non-anomalous data if it exceeds the threshold (1000)
-if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
-downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
-selected_indices = anomaly_indices + downsampled_non_anomaly_indices
-selected_indices.sort()
-downsampled_time_col = time_col[selected_indices]
-
-columns = set(df.columns).difference({date_column})
-for col in columns:
-y = df[col].reset_index(drop=True)
-
-downsampled_y = y[selected_indices]
-
-fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
-ax.grid()
-ax.plot(downsampled_time_col, downsampled_y, color="black")
-# Plot anomalies
-for i in anomaly_indices:
-ax.scatter(time_col[i], y[i], color="red", marker="o")
-plt.xlabel(date_column)
-plt.ylabel(col)
-plt.title(f"`{col}` with reference to anomalies")
-figure_blocks.append(rc.Widget(ax))
-
-blocks.append(rc.Group(*figure_blocks, label=target))
+if target in anomaly_output.list_categories():
+figure_blocks = []
+time_col = df[date_column].reset_index(drop=True)
+anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
+OutputColumns.ANOMALY_COL
+]
+anomaly_indices = [
+i for i, index in enumerate(anomaly_col) if index == 1
+]
+downsampled_time_col = time_col
+selected_indices = list(range(len(time_col)))
+if self.spec.subsample_report_data:
+non_anomaly_indices = [
+i for i in range(len(time_col)) if i not in anomaly_indices
+]
+# Downsample non-anomalous data if it exceeds the threshold (1000)
+if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
+downsampled_non_anomaly_indices = non_anomaly_indices[
+:: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
+]
+selected_indices = (
+anomaly_indices + downsampled_non_anomaly_indices
+)
+selected_indices.sort()
+downsampled_time_col = time_col[selected_indices]
+
+columns = set(df.columns).difference({date_column})
+for col in columns:
+y = df[col].reset_index(drop=True)
+
+downsampled_y = y[selected_indices]
+
+fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
+ax.grid()
+ax.plot(downsampled_time_col, downsampled_y, color="black")
+# Plot anomalies
+for i in anomaly_indices:
+ax.scatter(time_col[i], y[i], color="red", marker="o")
+plt.xlabel(date_column)
+plt.ylabel(col)
+plt.title(f"`{col}` with reference to anomalies")
+figure_blocks.append(rc.Widget(ax))
+else:
+figure_blocks = None
+
+blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None
plots = rc.Select(blocks)

report_sections = []
@@ -133,7 +154,7 @@ def generate_report(self):
yaml_appendix = rc.Yaml(self.config.to_dict())
summary = rc.Block(
rc.Group(
rc.Text(f"You selected the **`{self.spec.model}`** model."),
rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
rc.Text(
"Based on your dataset, you could have also selected "
f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."
2 changes: 2 additions & 0 deletions ads/opctl/operator/lowcode/anomaly/model/factory.py
@@ -15,6 +15,7 @@
from .base_model import AnomalyOperatorBaseModel
from .isolationforest import IsolationForestOperatorModel
from .oneclasssvm import OneClassSVMOperatorModel
from .randomcutforest import RandomCutForestOperatorModel


class UnSupportedModelError(Exception):
@@ -52,6 +53,7 @@ class AnomalyOperatorModelFactory:
_NonTime_MAP = {
NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
NonTimeADSupportedModels.RandomCutForest: RandomCutForestOperatorModel,
# TODO: Add DBScan model for non time based anomaly
# NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
}
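
For reference, a minimal sketch of how the new registry entry resolves to the operator class. It is a lookup only; building a usable model also requires an operator config and datasets, which is omitted here.

from ads.opctl.operator.lowcode.anomaly.const import NonTimeADSupportedModels
from ads.opctl.operator.lowcode.anomaly.model.factory import AnomalyOperatorModelFactory

# _NonTime_MAP is the non-time-series registry shown above; the new entry maps
# the "randomcutforest" value accepted in schema.yaml to the operator class.
model_cls = AnomalyOperatorModelFactory._NonTime_MAP[
    NonTimeADSupportedModels.RandomCutForest
]
print(model_cls.__name__)  # expected: RandomCutForestOperatorModel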
116 changes: 116 additions & 0 deletions ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py
@@ -0,0 +1,116 @@
#!/usr/bin/env python

# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
import pandas as pd

from ads.common.decorator.runtime_dependency import runtime_dependency
from ads.opctl import logger
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns

from .anomaly_dataset import AnomalyOutput
from .base_model import AnomalyOperatorBaseModel


class RandomCutForestOperatorModel(AnomalyOperatorBaseModel):
"""
Class representing Random Cut Forest Anomaly Detection operator model.
"""

@runtime_dependency(
module="rrcf",
err_msg=(
"Please run `pip install rrcf` to "
"install the required dependencies for RandomCutForest."
),
)
def _build_model(self) -> AnomalyOutput:
import rrcf

model_kwargs = self.spec.model_kwargs

anomaly_output = AnomalyOutput(date_column="index")

# Set tree parameters
num_trees = model_kwargs.get("num_trees", 200)
shingle_size = model_kwargs.get("shingle_size", None)
anomaly_threshold = model_kwargs.get("anomaly_threshold", 95)

for target, df in self.datasets.full_data_dict.items():
try:
if df.shape[0] == 1:
raise ValueError("Dataset size must be greater than 1")
df_values = df[self.spec.target_column].astype(float).values

cal_shingle_size = (
shingle_size
if shingle_size
else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
)
points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))

sample_size_range = (1, points.shape[0])
n = points.shape[0]
avg_codisp = pd.Series(0.0, index=np.arange(n))
index = np.zeros(n)

forest = []
while len(forest) < num_trees:
ixs = np.random.choice(n, size=sample_size_range, replace=False)
trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
forest.extend(trees)

for tree in forest:
codisp = pd.Series(
{leaf: tree.codisp(leaf) for leaf in tree.leaves}
)
avg_codisp[codisp.index] += codisp
np.add.at(index, codisp.index.values, 1)

avg_codisp /= index
avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
avg_codisp = (avg_codisp - avg_codisp.min()) / (
avg_codisp.max() - avg_codisp.min()
)

y_pred = (
avg_codisp > np.percentile(avg_codisp, anomaly_threshold)
).astype(int)

index_col = df.columns[0]

anomaly = pd.DataFrame(
{index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
).reset_index(drop=True)
score = pd.DataFrame(
{"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
).reset_index(drop=True)

anomaly_output.add_output(target, anomaly, score)
except Exception as e:
logger.warn(f"Encountered Error: {e}. Skipping series {target}.")

return anomaly_output

def _generate_report(self):
"""Generates the report."""
import report_creator as rc

other_sections = [
rc.Heading("Selected Models Overview", level=2),
rc.Text(
"The following tables provide information regarding the chosen model."
),
]

model_description = rc.Text(
"The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection."
" It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points."
)

return (
model_description,
other_sections,
)
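
To make the scoring approach described above concrete, here is a small self-contained sketch that uses rrcf directly. It is not the operator's code path: the synthetic stream, tree count, tree size, and the 95th-percentile cutoff are illustrative assumptions that mirror the defaults above.

import numpy as np
import rrcf

rng = np.random.default_rng(42)
stream = rng.normal(size=500)
stream[250] += 8  # inject an obvious spike

num_trees, tree_size = 40, 128
forest = [rrcf.RCTree() for _ in range(num_trees)]
avg_codisp = np.zeros(len(stream))

for index, point in enumerate(stream):
    for tree in forest:
        # keep each tree at a fixed size by forgetting the oldest point
        if len(tree.leaves) > tree_size:
            tree.forget_point(index - tree_size)
        tree.insert_point(point, index=index)
        # collusive displacement (CoDisp) is the per-point anomaly score
        avg_codisp[index] += tree.codisp(index) / num_trees

# flag the highest-scoring points, analogous to the percentile cut above
threshold = np.percentile(avg_codisp, 95)
print(np.where(avg_codisp > threshold)[0])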
1 change: 1 addition & 0 deletions ads/opctl/operator/lowcode/anomaly/schema.yaml
@@ -363,6 +363,7 @@ spec:
- auto
- oneclasssvm
- isolationforest
- randomcutforest
meta:
description: "The model to be used for anomaly detection"

6 changes: 6 additions & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,12 @@
Release Notes
=============

2.11.18
-------
Release date: September 20, 2024

* Added ``with_artifact()`` to the ``ContainerRuntime`` class to support running a container job with an additional artifact.

2.11.17
-------
Release date: August 9, 2024
2 changes: 1 addition & 1 deletion docs/source/user_guide/jobs/run_container.rst
@@ -22,7 +22,7 @@ Here is an example to create and run a container job:

To configure ``ContainerRuntime``, you must specify the container ``image``.
Similar to other runtimes, you can add environment variables.
-You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container.
+You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container. You may also add an additional artifact (a file or directory) if needed. Note that if you add a directory, it will be compressed as a zip file under `/home/datascience`, and you will need to unzip it in your container.
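
As a quick illustration of the configuration described above, the following is a minimal sketch of a ``ContainerRuntime`` with an attached artifact. It assumes the builder method is ``with_artifact()`` as named in this release's notes; the image and artifact path are placeholders.

.. code-block:: python

    from ads.jobs import ContainerRuntime

    runtime = (
        ContainerRuntime()
        .with_image("<region>.ocir.io/<your_tenancy>/<your_image>")
        .with_environment_variable(GREETINGS="Welcome to OCI Data Science")
        .with_entrypoint(["/bin/sh", "-c"])
        .with_cmd("sleep 5 && echo $GREETINGS")
        # assumption: method name taken from the release note; path is a placeholder
        .with_artifact("<path/to/artifact>")
    )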

See also:

2 changes: 2 additions & 0 deletions docs/source/user_guide/jobs/tabs/container_runtime.rst
@@ -33,6 +33,7 @@
.with_environment_variable(GREETINGS="Welcome to OCI Data Science")
.with_entrypoint(["/bin/sh", "-c"])
.with_cmd("sleep 5 && echo $GREETINGS")
.artifact("<path/to/artifact>")
)
)

@@ -69,6 +70,7 @@
- name: GREETINGS
value: Welcome to OCI Data Science
image: <region>.ocir.io/<your_tenancy>/<your_image>
scriptPathURI: path/to/artifact


.. code-block:: python