Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ODSC-61571 : Add RCF Implementation #934

Merged
merged 8 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions THIRD_PARTY_LICENSES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,12 @@ mlforecast
* Source code: https://github.com/Nixtla/mlforecast
* Project home: https://github.com/Nixtla/mlforecast

rrcf
* Copyright 2018 kLabUM
* License: MIT License
* Source code: https://github.com/kLabUM/rrcf
* Project home: https://github.com/kLabUM/rrcf

=======
=============================== Licenses ===============================
------------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions ads/opctl/operator/lowcode/anomaly/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):

OneClassSVM = "oneclasssvm"
IsolationForest = "isolationforest"
RandomCutForest = "randomcutforest"
# TODO : Add DBScan
# DBScan = "dbscan"

Expand Down
95 changes: 58 additions & 37 deletions ads/opctl/operator/lowcode/anomaly/model/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@

from ads.common.object_storage_details import ObjectStorageDetails
from ads.opctl import logger
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
from ads.opctl.operator.lowcode.anomaly.const import (
SUBSAMPLE_THRESHOLD,
OutputColumns,
SupportedMetrics,
)
from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
from ads.opctl.operator.lowcode.common.utils import (
disable_print,
Expand Down Expand Up @@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
def generate_report(self):
"""Generates the report."""
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great idea

import report_creator as rc

start_time = time.time()
Expand Down Expand Up @@ -87,43 +92,59 @@ def generate_report(self):
self.spec.datetime_column.name if self.spec.datetime_column else "index"
)

(
model_description,
other_sections,
) = self._generate_report()

blocks = []
for target, df in self.datasets.full_data_dict.items():
figure_blocks = []
time_col = df[date_column].reset_index(drop=True)
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
OutputColumns.ANOMALY_COL
]
anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
downsampled_time_col = time_col
selected_indices = list(range(len(time_col)))
if self.spec.subsample_report_data:
non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
# Downsample non-anomalous data if it exceeds the threshold (1000)
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
selected_indices = anomaly_indices + downsampled_non_anomaly_indices
selected_indices.sort()
downsampled_time_col = time_col[selected_indices]

columns = set(df.columns).difference({date_column})
for col in columns:
y = df[col].reset_index(drop=True)

downsampled_y = y[selected_indices]

fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
ax.grid()
ax.plot(downsampled_time_col, downsampled_y, color="black")
# Plot anomalies
for i in anomaly_indices:
ax.scatter(time_col[i], y[i], color="red", marker="o")
plt.xlabel(date_column)
plt.ylabel(col)
plt.title(f"`{col}` with reference to anomalies")
figure_blocks.append(rc.Widget(ax))

blocks.append(rc.Group(*figure_blocks, label=target))
if target in anomaly_output.list_categories():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a way to get rid of the "series 0" in the report?
Is there an edge case where the index column value is the same as the name of the target col?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are plotting for each series in the dataset, there may be edge cases where we don't have sufficient data for a particular series hence no anomaly scores, this ensures we try to plot only the series for which the anomaly scores are available.

figure_blocks = []
time_col = df[date_column].reset_index(drop=True)
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
OutputColumns.ANOMALY_COL
]
anomaly_indices = [
i for i, index in enumerate(anomaly_col) if index == 1
]
downsampled_time_col = time_col
selected_indices = list(range(len(time_col)))
if self.spec.subsample_report_data:
non_anomaly_indices = [
i for i in range(len(time_col)) if i not in anomaly_indices
]
# Downsample non-anomalous data if it exceeds the threshold (1000)
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
downsampled_non_anomaly_indices = non_anomaly_indices[
:: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
]
selected_indices = (
anomaly_indices + downsampled_non_anomaly_indices
)
selected_indices.sort()
downsampled_time_col = time_col[selected_indices]

columns = set(df.columns).difference({date_column})
for col in columns:
y = df[col].reset_index(drop=True)

downsampled_y = y[selected_indices]

fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
ax.grid()
ax.plot(downsampled_time_col, downsampled_y, color="black")
# Plot anomalies
for i in anomaly_indices:
ax.scatter(time_col[i], y[i], color="red", marker="o")
plt.xlabel(date_column)
plt.ylabel(col)
plt.title(f"`{col}` with reference to anomalies")
figure_blocks.append(rc.Widget(ax))
else:
figure_blocks = None

blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None
plots = rc.Select(blocks)

report_sections = []
Expand All @@ -133,7 +154,7 @@ def generate_report(self):
yaml_appendix = rc.Yaml(self.config.to_dict())
summary = rc.Block(
rc.Group(
rc.Text(f"You selected the **`{self.spec.model}`** model."),
rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
rc.Text(
"Based on your dataset, you could have also selected "
f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."
Expand Down
2 changes: 2 additions & 0 deletions ads/opctl/operator/lowcode/anomaly/model/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .base_model import AnomalyOperatorBaseModel
from .isolationforest import IsolationForestOperatorModel
from .oneclasssvm import OneClassSVMOperatorModel
from .randomcutforest import RandomCutForestOperatorModel


class UnSupportedModelError(Exception):
Expand Down Expand Up @@ -52,6 +53,7 @@ class AnomalyOperatorModelFactory:
_NonTime_MAP = {
NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
NonTimeADSupportedModels.RandomCutForest: RandomCutForestOperatorModel,
# TODO: Add DBScan model for non time based anomaly
# NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
}
Expand Down
116 changes: 116 additions & 0 deletions ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python

# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
import pandas as pd

from ads.common.decorator.runtime_dependency import runtime_dependency
from ads.opctl import logger
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns

from .anomaly_dataset import AnomalyOutput
from .base_model import AnomalyOperatorBaseModel


class RandomCutForestOperatorModel(AnomalyOperatorBaseModel):
"""
Class representing Random Cut Forest Anomaly Detection operator model.
"""

@runtime_dependency(
module="rrcf",
err_msg=(
"Please run `pip install rrcf` to "
"install the required dependencies for RandomCutForest."
),
)
def _build_model(self) -> AnomalyOutput:
from rrcf import RCTree

model_kwargs = self.spec.model_kwargs

anomaly_output = AnomalyOutput(date_column="index")

# Set tree parameters
num_trees = model_kwargs.get("num_trees", 200)
shingle_size = model_kwargs.get("shingle_size", None)
anomaly_threshold = model_kwargs.get("anamoly_threshold", 95)

for target, df in self.datasets.full_data_dict.items():
try:
if df.shape[0] == 1:
raise ValueError("Dataset size must be greater than 1")
df_values = df[self.spec.target_column].astype(float).values

cal_shingle_size = (
shingle_size
if shingle_size
else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
)
points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))

sample_size_range = (1, points.shape[0])
n = points.shape[0]
avg_codisp = pd.Series(0.0, index=np.arange(n))
index = np.zeros(n)

forest = []
while len(forest) < num_trees:
ixs = np.random.choice(n, size=sample_size_range, replace=False)
trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
forest.extend(trees)

for tree in forest:
codisp = pd.Series(
{leaf: tree.codisp(leaf) for leaf in tree.leaves}
)
avg_codisp[codisp.index] += codisp
np.add.at(index, codisp.index.values, 1)

avg_codisp /= index
avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
avg_codisp = (avg_codisp - avg_codisp.min()) / (
avg_codisp.max() - avg_codisp.min()
)

y_pred = (
avg_codisp > np.percentile(avg_codisp, anomaly_threshold)
).astype(int)

index_col = df.columns[0]

anomaly = pd.DataFrame(
{index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
).reset_index(drop=True)
score = pd.DataFrame(
{"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
).reset_index(drop=True)

anomaly_output.add_output(target, anomaly, score)
except Exception as e:
logger.warn(f"Encountered Error: {e}. Skipping series {target}.")

return anomaly_output

def _generate_report(self):
"""Generates the report."""
import report_creator as rc

other_sections = [
rc.Heading("Selected Models Overview", level=2),
rc.Text(
"The following tables provide information regarding the chosen model."
),
]

model_description = rc.Text(
"The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection."
" It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points."
)

return (
model_description,
other_sections,
)
1 change: 1 addition & 0 deletions ads/opctl/operator/lowcode/anomaly/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ spec:
- auto
- oneclasssvm
- isolationforest
- randomcutforest
meta:
description: "The model to be used for anomaly detection"

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ anomaly = [
"autots",
"oracledb",
"report-creator==1.0.9",
"rrcf==0.4.4",
"scikit-learn"
]
recommender = [
"oracle_ads[opctl]",
Expand Down
1 change: 1 addition & 0 deletions test-requirements-operators.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-r test-requirements.txt
-e ".[forecast]"
-e ".[anomaly]"
-e ".[recommender]"
-e ".[feature-store-marketplace]"
plotly
Expand Down
2 changes: 1 addition & 1 deletion tests/operators/anomaly/test_anomaly_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
for d in DATASETS:
parameters_short.append((m, d))

MODELS = ["autots", "oneclasssvm", "isolationforest"]
MODELS = ["autots", "oneclasssvm", "isolationforest", "randomcutforest"]

@pytest.mark.parametrize("model", ["autots"])
def test_artificial_big(model):
Expand Down
Loading