diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py
index e909976d8..983228ba5 100644
--- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py
+++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py
@@ -55,6 +55,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
     def generate_report(self):
         """Generates the report."""
         import matplotlib.pyplot as plt
+        plt.rcParams.update({'figure.max_open_warning': 0})
         import report_creator as rc

         start_time = time.time()
@@ -87,43 +88,57 @@ def generate_report(self):
             self.spec.datetime_column.name if self.spec.datetime_column else "index"
         )

+        (
+            model_description,
+            other_sections,
+        ) = self._generate_report()
+
         blocks = []
         for target, df in self.datasets.full_data_dict.items():
-            figure_blocks = []
-            time_col = df[date_column].reset_index(drop=True)
-            anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
-                OutputColumns.ANOMALY_COL
-            ]
-            anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
-            downsampled_time_col = time_col
-            selected_indices = list(range(len(time_col)))
-            if self.spec.subsample_report_data:
-                non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
-                # Downsample non-anomalous data if it exceeds the threshold (1000)
-                if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
-                    downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
-                    selected_indices = anomaly_indices + downsampled_non_anomaly_indices
-                    selected_indices.sort()
-                downsampled_time_col = time_col[selected_indices]
-
-            columns = set(df.columns).difference({date_column})
-            for col in columns:
-                y = df[col].reset_index(drop=True)
-
-                downsampled_y = y[selected_indices]
-
-                fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
-                ax.grid()
-                ax.plot(downsampled_time_col, downsampled_y, color="black")
-                # Plot anomalies
-                for i in anomaly_indices:
-                    ax.scatter(time_col[i], y[i], color="red", marker="o")
-                plt.xlabel(date_column)
-                plt.ylabel(col)
-                plt.title(f"`{col}` with reference to anomalies")
-                figure_blocks.append(rc.Widget(ax))
-
-            blocks.append(rc.Group(*figure_blocks, label=target))
+            if target in anomaly_output.list_categories():
+                figure_blocks = []
+                time_col = df[date_column].reset_index(drop=True)
+                anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
+                    OutputColumns.ANOMALY_COL
+                ]
+                anomaly_indices = [
+                    i for i, index in enumerate(anomaly_col) if index == 1
+                ]
+                downsampled_time_col = time_col
+                selected_indices = list(range(len(time_col)))
+                if self.spec.subsample_report_data:
+                    non_anomaly_indices = [
+                        i for i in range(len(time_col)) if i not in anomaly_indices
+                    ]
+                    # Downsample non-anomalous data if it exceeds the threshold (1000)
+                    if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
+                        downsampled_non_anomaly_indices = non_anomaly_indices[
+                            :: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
+                        ]
+                        selected_indices = (
+                            anomaly_indices + downsampled_non_anomaly_indices
+                        )
+                        selected_indices.sort()
+                    downsampled_time_col = time_col[selected_indices]
+
+                columns = set(df.columns).difference({date_column})
+                for col in columns:
+                    y = df[col].reset_index(drop=True)
+
+                    downsampled_y = y[selected_indices]
+
+                    fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
+                    ax.grid()
+                    ax.plot(downsampled_time_col, downsampled_y, color="black")
+                    # Plot anomalies
+                    for i in anomaly_indices:
+                        ax.scatter(time_col[i], y[i], color="red", marker="o")
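+                    # The anomaly markers above are drawn from the full-resolution
+                    # series (time_col / y), so report subsampling never hides a
+                    # flagged point.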
color="red", marker="o") + plt.xlabel(date_column) + plt.ylabel(col) + plt.title(f"`{col}` with reference to anomalies") + figure_blocks.append(rc.Widget(ax)) + + blocks.append(rc.Group(*figure_blocks, label=target)) plots = rc.Select(blocks) report_sections = [] @@ -133,7 +148,7 @@ def generate_report(self): yaml_appendix = rc.Yaml(self.config.to_dict()) summary = rc.Block( rc.Group( - rc.Text(f"You selected the **`{self.spec.model}`** model."), + rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"), rc.Text( "Based on your dataset, you could have also selected " f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py index 38d1c224e..730938d03 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -7,6 +7,7 @@ import pandas as pd from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl import logger from ads.opctl.operator.lowcode.anomaly.const import OutputColumns from .anomaly_dataset import AnomalyOutput @@ -29,68 +30,67 @@ def _build_model(self) -> AnomalyOutput: from rrcf import RCTree model_kwargs = self.spec.model_kwargs - # map the output as per anomaly dataset class, 1: outlier, 0: inlier - # self.outlier_map = {1: 0, -1: 1} anomaly_output = AnomalyOutput(date_column="index") # Set tree parameters num_trees = model_kwargs.get("num_trees", 200) - shingle_size = model_kwargs.get("shingle_size", 1) - tree_size = model_kwargs.get("tree_size", 1000) + shingle_size = model_kwargs.get("shingle_size", None) + anamoly_threshold = model_kwargs.get("anamoly_threshold", 95) for target, df in self.datasets.full_data_dict.items(): - df_values = df[self.spec.target_column].astype(float).values - - # TODO: Update size to log logic - points = np.vstack(list(rrcf.shingle(df_values, size=4))) - - # TODO: remove hardcode - sample_size_range = (1, 6) - n = points.shape[0] - avg_codisp = pd.Series(0.0, index=np.arange(n)) - index = np.zeros(n) - - forest = [] - while len(forest) < num_trees: - ixs = np.random.choice(n, size=sample_size_range, replace=False) - trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs] - forest.extend(trees) - print(len(forest)) - - for tree in forest: - codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves}) - avg_codisp[codisp.index] += codisp - np.add.at(index, codisp.index.values, 1) - - avg_codisp /= index - # TODO: remove hardcode - avg_codisp.index = df.iloc[(4 - 1) :].index - avg_codisp = (avg_codisp - avg_codisp.min()) / ( - avg_codisp.max() - avg_codisp.min() - ) - - # TODO: use model kwargs for percentile threshold - y_pred = (avg_codisp > np.percentile(avg_codisp, 95)).astype(int) - - # TODO: rem pdb - # import pdb - - # pdb.set_trace() - print("Done") - - # scores = model.score_samples(df) - - # index_col = df.columns[0] - - # anomaly = pd.DataFrame( - # {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred} - # ).reset_index(drop=True) - # score = pd.DataFrame( - # {"index": df[index_col], OutputColumns.SCORE_COL: scores} - # ).reset_index(drop=True) - - # anomaly_output.add_output(target, anomaly, score) + try: + if df.shape[0] == 1: + raise ValueError("Dataset size must be greater than 1") + df_values = df[self.spec.target_column].astype(float).values + + cal_shingle_size = 
+                cal_shingle_size = (
+                    shingle_size
+                    if shingle_size
+                    else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
+                )
+                points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))
+
+                sample_size_range = (1, points.shape[0])
+                n = points.shape[0]
+                avg_codisp = pd.Series(0.0, index=np.arange(n))
+                index = np.zeros(n)
+
+                forest = []
+                while len(forest) < num_trees:
+                    ixs = np.random.choice(n, size=sample_size_range, replace=False)
+                    trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
+                    forest.extend(trees)
+
+                for tree in forest:
+                    codisp = pd.Series(
+                        {leaf: tree.codisp(leaf) for leaf in tree.leaves}
+                    )
+                    avg_codisp[codisp.index] += codisp
+                    np.add.at(index, codisp.index.values, 1)
+
+                avg_codisp /= index
+                avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
+                avg_codisp = (avg_codisp - avg_codisp.min()) / (
+                    avg_codisp.max() - avg_codisp.min()
+                )
+
+                y_pred = (
+                    avg_codisp > np.percentile(avg_codisp, anomaly_threshold)
+                ).astype(int)
+
+                index_col = df.columns[0]
+
+                anomaly = pd.DataFrame(
+                    {index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
+                ).reset_index(drop=True)
+                score = pd.DataFrame(
+                    {"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
+                ).reset_index(drop=True)
+
+                anomaly_output.add_output(target, anomaly, score)
+            except Exception as e:
+                logger.warn(f"Encountered Error: {e}. Skipping series {target}.")

         return anomaly_output
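
For reference, a minimal standalone sketch of the scoring path the new _build_model follows: shingle the series, build random cut trees on random subsets, average the collusive displacement (CoDisp) per point, and flag the top percentile. It mirrors the public rrcf batch example rather than the exact sampling used in the patch; the toy series, tree count, subsample size, and threshold below are illustrative only.

    # Illustrative only -- not part of this patch.
    import numpy as np
    import pandas as pd
    import rrcf

    values = np.sin(np.linspace(0, 20, 400))  # toy series
    values[150] += 5                          # injected spike
    points = np.vstack(list(rrcf.shingle(values, size=4)))  # shingle size chosen arbitrarily
    n = points.shape[0]

    avg_codisp = pd.Series(0.0, index=np.arange(n))
    hits = np.zeros(n)
    for _ in range(50):  # 50 trees, illustrative
        ix = np.random.choice(n, size=min(256, n), replace=False)
        tree = rrcf.RCTree(points[ix], index_labels=ix)
        codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        np.add.at(hits, codisp.index.values, 1)

    avg_codisp /= hits
    labels = (avg_codisp > np.percentile(avg_codisp, 95)).astype(int)
    print(f"{labels.sum()} of {n} shingles flagged as anomalous")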