Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sf merlion for time-based and forecast AD #956

Merged
merged 12 commits into from
Oct 22, 2024
5 changes: 5 additions & 0 deletions THIRD_PARTY_LICENSES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,11 @@ rrcf
* Source code: https://github.com/kLabUM/rrcf
* Project home: https://github.com/kLabUM/rrcf

Merlion
* Copyright 2021 Salesforce.com Inc
* License: BSD-3 Clause License
* Source code: https://github.com/salesforce/Merlion
* Project Home: https://github.com/salesforce/Merlion

=============================== Licenses ===============================
------------------------------------------------------------------------
Expand Down
84 changes: 84 additions & 0 deletions ads/opctl/operator/lowcode/anomaly/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
# Copyright (c) 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import random
from ads.common.extended_enum import ExtendedEnumMeta
from ads.opctl.operator.lowcode.common.const import DataColumns
from merlion.models.anomaly import autoencoder, deep_point_anomaly_detector, isolation_forest, spectral_residual, windstats, windstats_monthly
from merlion.models.anomaly.change_point import bocpd
from merlion.models.forecast import prophet


class SupportedModels(str, metaclass=ExtendedEnumMeta):
Expand All @@ -14,6 +18,7 @@ class SupportedModels(str, metaclass=ExtendedEnumMeta):
AutoMLX = "automlx"
AutoTS = "autots"
Auto = "auto"
MerilonAD = "merlion_ad"
# TODS = "tods"

class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):
Expand Down Expand Up @@ -56,6 +61,84 @@ class TODSSubModels(str, metaclass=ExtendedEnumMeta):
}


class MerlionADSubmodels(str, metaclass=ExtendedEnumMeta):
"""Supported Merlion AD sub models."""

# point anomaly
AUTOENCODER = "autoencoder"
DAGMM = "dagmm"
DBL = "dbl"
DEEP_POINT_ANOMALY_DETECTOR = "deep_point_anomaly_detector"
ISOLATION_FOREST = "isolation_forest"
LOF = "lof"
LSTM_ED = "lstm_ed"
# RANDOM_CUT_FOREST = "random_cut_forest"
SPECTRAL_RESIDUAL = "spectral_residual"
STAT_RESIDUAL = "stat_residual"
VAE = "vae"
WINDSTATS = "windstats"
WINDSTATS_MONTHLY = "windstats_monthly"
ZMS = "zms"

# forecast_based
ARIMA = "arima"
ETS = "ets"
MSES = "mses"
PROPHET = "prophet"
SARIMA = "sarima"

#changepoint
BOCPD = "bocpd"


MERLIONAD_IMPORT_MODEL_MAP = {
MerlionADSubmodels.AUTOENCODER: ".autoendcoder",
MerlionADSubmodels.DAGMM: ".dagmm",
MerlionADSubmodels.DBL: ".dbl",
MerlionADSubmodels.DEEP_POINT_ANOMALY_DETECTOR: ".deep_point_anomaly_detector",
MerlionADSubmodels.ISOLATION_FOREST: ".isolation_forest",
MerlionADSubmodels.LOF: ".lof",
MerlionADSubmodels.LSTM_ED: ".lstm_ed",
# MerlionADSubmodels.RANDOM_CUT_FOREST: ".random_cut_forest",
MerlionADSubmodels.SPECTRAL_RESIDUAL: ".spectral_residual",
MerlionADSubmodels.STAT_RESIDUAL: ".stat_residual",
MerlionADSubmodels.VAE: ".vae",
MerlionADSubmodels.WINDSTATS: ".windstats",
MerlionADSubmodels.WINDSTATS_MONTHLY: ".windstats_monthly",
MerlionADSubmodels.ZMS: ".zms",
MerlionADSubmodels.ARIMA: ".forecast_based.arima",
MerlionADSubmodels.ETS: ".forecast_based.ets",
MerlionADSubmodels.MSES: ".forecast_based.mses",
MerlionADSubmodels.PROPHET: ".forecast_based.prophet",
MerlionADSubmodels.SARIMA: ".forecast_based.sarima",
MerlionADSubmodels.BOCPD: ".change_point.bocpd",
}


MERLIONAD_MODEL_MAP = {
MerlionADSubmodels.AUTOENCODER: "AutoEncoder",
MerlionADSubmodels.DAGMM: "DAGMM",
MerlionADSubmodels.DBL: "DynamicBaseline",
MerlionADSubmodels.DEEP_POINT_ANOMALY_DETECTOR: "DeepPointAnomalyDetector",
MerlionADSubmodels.ISOLATION_FOREST: "IsolationForest",
MerlionADSubmodels.LOF: "LOF",
MerlionADSubmodels.LSTM_ED: "LSTMED",
# MerlionADSubmodels.RANDOM_CUT_FOREST: "RandomCutForest",
MerlionADSubmodels.SPECTRAL_RESIDUAL: "SpectralResidual",
MerlionADSubmodels.STAT_RESIDUAL: "StatThreshold",
MerlionADSubmodels.VAE: "VAE",
MerlionADSubmodels.WINDSTATS: "WindStats",
MerlionADSubmodels.WINDSTATS_MONTHLY: "MonthlyWindStats",
MerlionADSubmodels.ZMS: "ZMS",
MerlionADSubmodels.ARIMA: "ArimaDetector",
MerlionADSubmodels.ETS: "ETSDetector",
MerlionADSubmodels.MSES: "MSESDetector",
MerlionADSubmodels.PROPHET: "ProphetDetector",
MerlionADSubmodels.SARIMA: "SarimaDetector",
MerlionADSubmodels.BOCPD: "BOCPD",
}


class SupportedMetrics(str, metaclass=ExtendedEnumMeta):
UNSUPERVISED_UNIFY95 = "unsupervised_unify95"
UNSUPERVISED_UNIFY95_LOG_LOSS = "unsupervised_unify95_log_loss"
Expand Down Expand Up @@ -94,5 +177,6 @@ class OutputColumns(str, metaclass=ExtendedEnumMeta):
Series = DataColumns.Series


MERLION_DEFAULT_MODEL = "prophet"
TODS_DEFAULT_MODEL = "ocsvm"
SUBSAMPLE_THRESHOLD = 1000
158 changes: 158 additions & 0 deletions ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#!/usr/bin/env python

# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import importlib

import numpy as np
import pandas as pd
from merlion.utils import TimeSeries

from ads.common.decorator.runtime_dependency import runtime_dependency
from ads.opctl.operator.lowcode.anomaly.const import (
MERLION_DEFAULT_MODEL,
MERLIONAD_IMPORT_MODEL_MAP,
MERLIONAD_MODEL_MAP,
OutputColumns,
)

from .anomaly_dataset import AnomalyOutput
from .base_model import AnomalyOperatorBaseModel


class AnomalyMerlionOperatorModel(AnomalyOperatorBaseModel):
"""Class representing Merlion Anomaly Detection operator model."""

@runtime_dependency(
module="merlion",
err_msg=(
"Please run `pip3 install salesforce-merlion[all]` to "
"install the required packages."
),
)
def _get_config_model(self, model_list):
"""
Returns a dictionary with model names as keys and a list of model config and model object as values.

Parameters
----------
model_list : list
A list of model names.

Returns
-------
dict
A dictionary with model names as keys and a list of model config and model object as values.
"""
model_config_map = {}
for model_name in model_list:
model_module = importlib.import_module(
name=MERLIONAD_IMPORT_MODEL_MAP.get(model_name),
package="merlion.models.anomaly",
)
model_config = getattr(
model_module, MERLIONAD_MODEL_MAP.get(model_name) + "Config"
)
model = getattr(model_module, MERLIONAD_MODEL_MAP.get(model_name))
model_config_map[model_name] = [model_config, model]
return model_config_map

def _build_model(self) -> AnomalyOutput:
"""
Builds a Merlion anomaly detection model and trains it using the given data.

Parameters
----------
None

Returns
-------
AnomalyOutput
An AnomalyOutput object containing the anomaly detection results.
"""
model_kwargs = self.spec.model_kwargs
anomaly_output = AnomalyOutput(date_column="index")
anomaly_threshold = model_kwargs.get("anomaly_threshold", 95)
model_config_map = {}
if model_kwargs.get("sub_model", None):
model_config_map = self._get_config_model(model_kwargs.get("sub_model"))
else:
from merlion.models.anomaly.forecast_based.prophet import ( # noqa: I001
ProphetDetector,
ProphetDetectorConfig,
)

model_config_map[MERLION_DEFAULT_MODEL] = [
ProphetDetectorConfig,
ProphetDetector,
]

date_column = self.spec.datetime_column.name

anomaly_output = AnomalyOutput(date_column=date_column)
# model_objects = defaultdict(list)
for target, df in self.datasets.full_data_dict.items():
data = df.set_index(date_column)
data = TimeSeries.from_pd(data)
for model_name, (model_config, model) in model_config_map.items():
model_config = model_config(**self.spec.model_kwargs)
if hasattr(model_config, "target_seq_index"):
model_config.target_seq_index = df.columns.get_loc(
self.spec.target_column
)
model = model(model_config)

scores = model.train(train_data=data, anomaly_labels=None)

try:
y_pred = model.get_anomaly_label(data)
y_pred = (
y_pred.to_pd().reset_index()["anom_score"] > 0
).astype(int)
except Exception as e:
y_pred = (
scores.to_pd().reset_index()["anom_score"]
> np.percentile(
scores.to_pd().reset_index()["anom_score"],
anomaly_threshold,
)
).astype(int)

index_col = df.columns[0]

anomaly = pd.DataFrame(
{index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
).reset_index(drop=True)
score = pd.DataFrame(
{
index_col: df[index_col],
OutputColumns.SCORE_COL: scores.to_pd().reset_index()[
"anom_score"
],
}
).reset_index(drop=True)
# model_objects[model_name].append(model)

anomaly_output.add_output(target, anomaly, score)
return anomaly_output

def _generate_report(self):
"""Genreates a report for the model."""
import report_creator as rc

other_sections = [
rc.Heading("Selected Models Overview", level=2),
rc.Text(
"The following tables provide information regarding the chosen model."
),
]

model_description = rc.Text(
"The Merlion anomaly detection model is a full-stack automated machine learning system for anomaly detection."
)

return (
model_description,
other_sections,
)
2 changes: 2 additions & 0 deletions ads/opctl/operator/lowcode/anomaly/model/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ..const import NonTimeADSupportedModels, SupportedModels
from ..operator_config import AnomalyOperatorConfig
from .anomaly_dataset import AnomalyDatasets
from .anomaly_merlion import AnomalyMerlionOperatorModel
from .automlx import AutoMLXOperatorModel
from .autots import AutoTSOperatorModel

Expand Down Expand Up @@ -48,6 +49,7 @@ class AnomalyOperatorModelFactory:
SupportedModels.AutoMLX: AutoMLXOperatorModel,
# SupportedModels.TODS: TODSOperatorModel,
SupportedModels.AutoTS: AutoTSOperatorModel,
SupportedModels.MerilonAD: AnomalyMerlionOperatorModel
}

_NonTime_MAP = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _build_model(self) -> AnomalyOutput:
# Set tree parameters
num_trees = model_kwargs.get("num_trees", 200)
shingle_size = model_kwargs.get("shingle_size", None)
anomaly_threshold = model_kwargs.get("anamoly_threshold", 95)
anomaly_threshold = model_kwargs.get("anomaly_threshold", 95)
codeloop marked this conversation as resolved.
Show resolved Hide resolved

for target, df in self.datasets.full_data_dict.items():
try:
Expand Down
1 change: 1 addition & 0 deletions ads/opctl/operator/lowcode/anomaly/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ spec:
- oneclasssvm
- isolationforest
- randomcutforest
- merlion_ad
codeloop marked this conversation as resolved.
Show resolved Hide resolved
meta:
description: "The model to be used for anomaly detection"

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ anomaly = [
"oracledb",
"report-creator==1.0.9",
"rrcf==0.4.4",
"scikit-learn"
"scikit-learn",
"salesforce-merlion[all]==2.0.4"
codeloop marked this conversation as resolved.
Show resolved Hide resolved
]
recommender = [
"oracle_ads[opctl]",
Expand Down
3 changes: 2 additions & 1 deletion tests/operators/anomaly/test_anomaly_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@

MODELS = ["autots", "oneclasssvm", "isolationforest", "randomcutforest"]

@pytest.mark.parametrize("model", ["autots"])

@pytest.mark.parametrize("model", ["autots", "merlion_ad"])
def test_artificial_big(model):
all_data = []
TARGET_COLUMN = "sensor"
Expand Down
Loading