Skip to content

Commit

Permalink
update the todos, complete implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
codeloop committed Aug 23, 2024
1 parent 8dbccae commit 8b5c699
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 91 deletions.
87 changes: 51 additions & 36 deletions ads/opctl/operator/lowcode/anomaly/model/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
def generate_report(self):
"""Generates the report."""
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import report_creator as rc

start_time = time.time()
Expand Down Expand Up @@ -87,43 +88,57 @@ def generate_report(self):
self.spec.datetime_column.name if self.spec.datetime_column else "index"
)

(
model_description,
other_sections,
) = self._generate_report()

blocks = []
for target, df in self.datasets.full_data_dict.items():
figure_blocks = []
time_col = df[date_column].reset_index(drop=True)
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
OutputColumns.ANOMALY_COL
]
anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
downsampled_time_col = time_col
selected_indices = list(range(len(time_col)))
if self.spec.subsample_report_data:
non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
# Downsample non-anomalous data if it exceeds the threshold (1000)
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
selected_indices = anomaly_indices + downsampled_non_anomaly_indices
selected_indices.sort()
downsampled_time_col = time_col[selected_indices]

columns = set(df.columns).difference({date_column})
for col in columns:
y = df[col].reset_index(drop=True)

downsampled_y = y[selected_indices]

fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
ax.grid()
ax.plot(downsampled_time_col, downsampled_y, color="black")
# Plot anomalies
for i in anomaly_indices:
ax.scatter(time_col[i], y[i], color="red", marker="o")
plt.xlabel(date_column)
plt.ylabel(col)
plt.title(f"`{col}` with reference to anomalies")
figure_blocks.append(rc.Widget(ax))

blocks.append(rc.Group(*figure_blocks, label=target))
if target in anomaly_output.list_categories():
figure_blocks = []
time_col = df[date_column].reset_index(drop=True)
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
OutputColumns.ANOMALY_COL
]
anomaly_indices = [
i for i, index in enumerate(anomaly_col) if index == 1
]
downsampled_time_col = time_col
selected_indices = list(range(len(time_col)))
if self.spec.subsample_report_data:
non_anomaly_indices = [
i for i in range(len(time_col)) if i not in anomaly_indices
]
# Downsample non-anomalous data if it exceeds the threshold (1000)
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
downsampled_non_anomaly_indices = non_anomaly_indices[
:: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
]
selected_indices = (
anomaly_indices + downsampled_non_anomaly_indices
)
selected_indices.sort()
downsampled_time_col = time_col[selected_indices]

columns = set(df.columns).difference({date_column})
for col in columns:
y = df[col].reset_index(drop=True)

downsampled_y = y[selected_indices]

fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
ax.grid()
ax.plot(downsampled_time_col, downsampled_y, color="black")
# Plot anomalies
for i in anomaly_indices:
ax.scatter(time_col[i], y[i], color="red", marker="o")
plt.xlabel(date_column)
plt.ylabel(col)
plt.title(f"`{col}` with reference to anomalies")
figure_blocks.append(rc.Widget(ax))

blocks.append(rc.Group(*figure_blocks, label=target))
plots = rc.Select(blocks)

report_sections = []
Expand All @@ -133,7 +148,7 @@ def generate_report(self):
yaml_appendix = rc.Yaml(self.config.to_dict())
summary = rc.Block(
rc.Group(
rc.Text(f"You selected the **`{self.spec.model}`** model."),
rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
rc.Text(
"Based on your dataset, you could have also selected "
f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."
Expand Down
110 changes: 55 additions & 55 deletions ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pandas as pd

from ads.common.decorator.runtime_dependency import runtime_dependency
from ads.opctl import logger
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns

from .anomaly_dataset import AnomalyOutput
Expand All @@ -29,68 +30,67 @@ def _build_model(self) -> AnomalyOutput:
from rrcf import RCTree

model_kwargs = self.spec.model_kwargs
# map the output as per anomaly dataset class, 1: outlier, 0: inlier
# self.outlier_map = {1: 0, -1: 1}

anomaly_output = AnomalyOutput(date_column="index")

# Set tree parameters
num_trees = model_kwargs.get("num_trees", 200)
shingle_size = model_kwargs.get("shingle_size", 1)
tree_size = model_kwargs.get("tree_size", 1000)
shingle_size = model_kwargs.get("shingle_size", None)
anamoly_threshold = model_kwargs.get("anamoly_threshold", 95)

for target, df in self.datasets.full_data_dict.items():
df_values = df[self.spec.target_column].astype(float).values

# TODO: Update size to log logic
points = np.vstack(list(rrcf.shingle(df_values, size=4)))

# TODO: remove hardcode
sample_size_range = (1, 6)
n = points.shape[0]
avg_codisp = pd.Series(0.0, index=np.arange(n))
index = np.zeros(n)

forest = []
while len(forest) < num_trees:
ixs = np.random.choice(n, size=sample_size_range, replace=False)
trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
forest.extend(trees)
print(len(forest))

for tree in forest:
codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
avg_codisp[codisp.index] += codisp
np.add.at(index, codisp.index.values, 1)

avg_codisp /= index
# TODO: remove hardcode
avg_codisp.index = df.iloc[(4 - 1) :].index
avg_codisp = (avg_codisp - avg_codisp.min()) / (
avg_codisp.max() - avg_codisp.min()
)

# TODO: use model kwargs for percentile threshold
y_pred = (avg_codisp > np.percentile(avg_codisp, 95)).astype(int)

# TODO: rem pdb
# import pdb

# pdb.set_trace()
print("Done")

# scores = model.score_samples(df)

# index_col = df.columns[0]

# anomaly = pd.DataFrame(
# {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
# ).reset_index(drop=True)
# score = pd.DataFrame(
# {"index": df[index_col], OutputColumns.SCORE_COL: scores}
# ).reset_index(drop=True)

# anomaly_output.add_output(target, anomaly, score)
try:
if df.shape[0] == 1:
raise ValueError("Dataset size must be greater than 1")
df_values = df[self.spec.target_column].astype(float).values

cal_shingle_size = (
shingle_size
if shingle_size
else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
)
points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))

sample_size_range = (1, points.shape[0])
n = points.shape[0]
avg_codisp = pd.Series(0.0, index=np.arange(n))
index = np.zeros(n)

forest = []
while len(forest) < num_trees:
ixs = np.random.choice(n, size=sample_size_range, replace=False)
trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
forest.extend(trees)

for tree in forest:
codisp = pd.Series(
{leaf: tree.codisp(leaf) for leaf in tree.leaves}
)
avg_codisp[codisp.index] += codisp
np.add.at(index, codisp.index.values, 1)

avg_codisp /= index
avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
avg_codisp = (avg_codisp - avg_codisp.min()) / (
avg_codisp.max() - avg_codisp.min()
)

y_pred = (
avg_codisp > np.percentile(avg_codisp, anamoly_threshold)
).astype(int)

index_col = df.columns[0]

anomaly = pd.DataFrame(
{index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
).reset_index(drop=True)
score = pd.DataFrame(
{"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
).reset_index(drop=True)

anomaly_output.add_output(target, anomaly, score)
except Exception as e:
logger.warn(f"Encountered Error: {e}. Skipping series {target}.")

return anomaly_output

Expand Down

0 comments on commit 8b5c699

Please sign in to comment.