-
Notifications
You must be signed in to change notification settings - Fork 44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ODSC-61571 : Add RCF Implementation #934
Changes from all commits
077ec39
97c78b0
8dbccae
8b5c699
8f4f362
c0d0565
b7e41d2
c54e341
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,11 @@ | |
|
||
from ads.common.object_storage_details import ObjectStorageDetails | ||
from ads.opctl import logger | ||
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD | ||
from ads.opctl.operator.lowcode.anomaly.const import ( | ||
SUBSAMPLE_THRESHOLD, | ||
OutputColumns, | ||
SupportedMetrics, | ||
) | ||
from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer | ||
from ads.opctl.operator.lowcode.common.utils import ( | ||
disable_print, | ||
|
@@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets): | |
def generate_report(self): | ||
"""Generates the report.""" | ||
import matplotlib.pyplot as plt | ||
plt.rcParams.update({'figure.max_open_warning': 0}) | ||
import report_creator as rc | ||
|
||
start_time = time.time() | ||
|
@@ -87,43 +92,59 @@ def generate_report(self): | |
self.spec.datetime_column.name if self.spec.datetime_column else "index" | ||
) | ||
|
||
( | ||
model_description, | ||
other_sections, | ||
) = self._generate_report() | ||
|
||
blocks = [] | ||
for target, df in self.datasets.full_data_dict.items(): | ||
figure_blocks = [] | ||
time_col = df[date_column].reset_index(drop=True) | ||
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[ | ||
OutputColumns.ANOMALY_COL | ||
] | ||
anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1] | ||
downsampled_time_col = time_col | ||
selected_indices = list(range(len(time_col))) | ||
if self.spec.subsample_report_data: | ||
non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices] | ||
# Downsample non-anomalous data if it exceeds the threshold (1000) | ||
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD: | ||
downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD] | ||
selected_indices = anomaly_indices + downsampled_non_anomaly_indices | ||
selected_indices.sort() | ||
downsampled_time_col = time_col[selected_indices] | ||
|
||
columns = set(df.columns).difference({date_column}) | ||
for col in columns: | ||
y = df[col].reset_index(drop=True) | ||
|
||
downsampled_y = y[selected_indices] | ||
|
||
fig, ax = plt.subplots(figsize=(8, 3), layout="constrained") | ||
ax.grid() | ||
ax.plot(downsampled_time_col, downsampled_y, color="black") | ||
# Plot anomalies | ||
for i in anomaly_indices: | ||
ax.scatter(time_col[i], y[i], color="red", marker="o") | ||
plt.xlabel(date_column) | ||
plt.ylabel(col) | ||
plt.title(f"`{col}` with reference to anomalies") | ||
figure_blocks.append(rc.Widget(ax)) | ||
|
||
blocks.append(rc.Group(*figure_blocks, label=target)) | ||
if target in anomaly_output.list_categories(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a way to get rid of the "series 0" in the report? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are plotting for each series in the dataset, there may be edge cases where we don't have sufficient data for a particular series hence no anomaly scores, this ensures we try to plot only the series for which the anomaly scores are available. |
||
figure_blocks = [] | ||
time_col = df[date_column].reset_index(drop=True) | ||
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[ | ||
OutputColumns.ANOMALY_COL | ||
] | ||
anomaly_indices = [ | ||
i for i, index in enumerate(anomaly_col) if index == 1 | ||
] | ||
downsampled_time_col = time_col | ||
selected_indices = list(range(len(time_col))) | ||
if self.spec.subsample_report_data: | ||
non_anomaly_indices = [ | ||
i for i in range(len(time_col)) if i not in anomaly_indices | ||
] | ||
# Downsample non-anomalous data if it exceeds the threshold (1000) | ||
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD: | ||
downsampled_non_anomaly_indices = non_anomaly_indices[ | ||
:: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD | ||
] | ||
selected_indices = ( | ||
anomaly_indices + downsampled_non_anomaly_indices | ||
) | ||
selected_indices.sort() | ||
downsampled_time_col = time_col[selected_indices] | ||
|
||
columns = set(df.columns).difference({date_column}) | ||
for col in columns: | ||
y = df[col].reset_index(drop=True) | ||
|
||
downsampled_y = y[selected_indices] | ||
|
||
fig, ax = plt.subplots(figsize=(8, 3), layout="constrained") | ||
ax.grid() | ||
ax.plot(downsampled_time_col, downsampled_y, color="black") | ||
# Plot anomalies | ||
for i in anomaly_indices: | ||
ax.scatter(time_col[i], y[i], color="red", marker="o") | ||
plt.xlabel(date_column) | ||
plt.ylabel(col) | ||
plt.title(f"`{col}` with reference to anomalies") | ||
figure_blocks.append(rc.Widget(ax)) | ||
else: | ||
figure_blocks = None | ||
|
||
blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None | ||
plots = rc.Select(blocks) | ||
|
||
report_sections = [] | ||
|
@@ -133,7 +154,7 @@ def generate_report(self): | |
yaml_appendix = rc.Yaml(self.config.to_dict()) | ||
summary = rc.Block( | ||
rc.Group( | ||
rc.Text(f"You selected the **`{self.spec.model}`** model."), | ||
rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"), | ||
rc.Text( | ||
"Based on your dataset, you could have also selected " | ||
f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#!/usr/bin/env python | ||
|
||
# Copyright (c) 2023, 2024 Oracle and/or its affiliates. | ||
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from ads.common.decorator.runtime_dependency import runtime_dependency | ||
from ads.opctl import logger | ||
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns | ||
|
||
from .anomaly_dataset import AnomalyOutput | ||
from .base_model import AnomalyOperatorBaseModel | ||
|
||
|
||
class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): | ||
""" | ||
Class representing Random Cut Forest Anomaly Detection operator model. | ||
""" | ||
|
||
@runtime_dependency( | ||
module="rrcf", | ||
err_msg=( | ||
"Please run `pip install rrcf` to " | ||
"install the required dependencies for RandomCutForest." | ||
), | ||
) | ||
def _build_model(self) -> AnomalyOutput: | ||
from rrcf import RCTree | ||
|
||
model_kwargs = self.spec.model_kwargs | ||
|
||
anomaly_output = AnomalyOutput(date_column="index") | ||
|
||
# Set tree parameters | ||
num_trees = model_kwargs.get("num_trees", 200) | ||
shingle_size = model_kwargs.get("shingle_size", None) | ||
anomaly_threshold = model_kwargs.get("anamoly_threshold", 95) | ||
|
||
for target, df in self.datasets.full_data_dict.items(): | ||
try: | ||
if df.shape[0] == 1: | ||
raise ValueError("Dataset size must be greater than 1") | ||
df_values = df[self.spec.target_column].astype(float).values | ||
|
||
cal_shingle_size = ( | ||
shingle_size | ||
if shingle_size | ||
else int(2 ** np.floor(np.log2(df.shape[0])) / 2) | ||
) | ||
points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size))) | ||
|
||
sample_size_range = (1, points.shape[0]) | ||
n = points.shape[0] | ||
avg_codisp = pd.Series(0.0, index=np.arange(n)) | ||
index = np.zeros(n) | ||
|
||
forest = [] | ||
while len(forest) < num_trees: | ||
ixs = np.random.choice(n, size=sample_size_range, replace=False) | ||
trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs] | ||
forest.extend(trees) | ||
|
||
for tree in forest: | ||
codisp = pd.Series( | ||
{leaf: tree.codisp(leaf) for leaf in tree.leaves} | ||
) | ||
avg_codisp[codisp.index] += codisp | ||
np.add.at(index, codisp.index.values, 1) | ||
|
||
avg_codisp /= index | ||
avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index | ||
avg_codisp = (avg_codisp - avg_codisp.min()) / ( | ||
avg_codisp.max() - avg_codisp.min() | ||
) | ||
|
||
y_pred = ( | ||
avg_codisp > np.percentile(avg_codisp, anomaly_threshold) | ||
).astype(int) | ||
|
||
index_col = df.columns[0] | ||
|
||
anomaly = pd.DataFrame( | ||
{index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred} | ||
).reset_index(drop=True) | ||
score = pd.DataFrame( | ||
{"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp} | ||
).reset_index(drop=True) | ||
|
||
anomaly_output.add_output(target, anomaly, score) | ||
except Exception as e: | ||
logger.warn(f"Encountered Error: {e}. Skipping series {target}.") | ||
|
||
return anomaly_output | ||
|
||
def _generate_report(self): | ||
"""Generates the report.""" | ||
import report_creator as rc | ||
|
||
other_sections = [ | ||
rc.Heading("Selected Models Overview", level=2), | ||
rc.Text( | ||
"The following tables provide information regarding the chosen model." | ||
), | ||
] | ||
|
||
model_description = rc.Text( | ||
"The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection." | ||
" It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points." | ||
) | ||
|
||
return ( | ||
model_description, | ||
other_sections, | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
great idea