From 27709c2c8a19bb07ca208fdb4cf243b6573c588b Mon Sep 17 00:00:00 2001 From: Piotr Czarnas Date: Thu, 10 Oct 2024 23:25:35 +0200 Subject: [PATCH] Stationary anomaly detection functions modified to call a library function, to enable customization. --- distribution/zip.xml | 7 ++ home/lib/anomalies/__init__.py | 28 +++++++ home/lib/anomalies/anomaly_detection.py | 73 +++++++++++++++++++ ...ly_stationary_percentile_moving_average.py | 56 ++++++-------- ...onary_percentile_moving_average_30_days.py | 56 ++++++-------- 5 files changed, 150 insertions(+), 70 deletions(-) create mode 100644 home/lib/anomalies/__init__.py create mode 100644 home/lib/anomalies/anomaly_detection.py diff --git a/distribution/zip.xml b/distribution/zip.xml index db441b52d6..8b012f3db3 100644 --- a/distribution/zip.xml +++ b/distribution/zip.xml @@ -22,6 +22,13 @@ /lib + + ${project.basedir}/../home/lib/anomalies + + *.py + + /lib/anomalies + ${project.basedir}/../home/lib diff --git a/home/lib/anomalies/__init__.py b/home/lib/anomalies/__init__.py new file mode 100644 index 0000000000..2a15ec29b9 --- /dev/null +++ b/home/lib/anomalies/__init__.py @@ -0,0 +1,28 @@ +# Copyright © 2021 DQOps (support@dqops.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/home/lib/anomalies/anomaly_detection.py b/home/lib/anomalies/anomaly_detection.py new file mode 100644 index 0000000000..48dda5a69b --- /dev/null +++ b/home/lib/anomalies/anomaly_detection.py @@ -0,0 +1,73 @@ +# Copyright © 2021 DQOps (support@dqops.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Sequence +import numpy as np +import scipy +import scipy.stats + + +def detect_upper_bound_anomaly(values_above_median: list[float], degrees_of_freedom: int, tail: float): + values_array = np.array(values_above_median, dtype=float) + values_median = np.median(values_array) + values_std = scipy.stats.tstd(values_array) + + if float(values_std) == 0: + return values_median + else: + # Assumption: the historical data follows t-student distribution + upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median, + scale=values_std) + return float(upper_readout_distribution.ppf(1 - tail)) + + +def detect_lower_bound_anomaly(values_below_median: list[float], degrees_of_freedom: int, tail: float): + values_array = np.array(values_below_median, dtype=float) + values_median = np.median(values_array) + values_std = scipy.stats.tstd(values_array) + + if float(values_std) == 0: + return values_median + else: + # Assumption: the historical data follows t-student distribution + lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median, + scale=values_std) + return float(lower_readout_distribution.ppf(tail)) diff --git a/home/rules/percentile/anomaly_stationary_percentile_moving_average.py b/home/rules/percentile/anomaly_stationary_percentile_moving_average.py index 74bbe7ce4b..270e7fe9ed 100644 --- a/home/rules/percentile/anomaly_stationary_percentile_moving_average.py +++ b/home/rules/percentile/anomaly_stationary_percentile_moving_average.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -104,30 +105,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR if all(readout > 0 for readout in extracted): # using a 0-based calculation (scale from 0) upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0: - threshold_upper = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float + else: + threshold_upper = rule_parameters.actual_value lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0] - lower_multiples = np.array(lower_median_multiples_array, dtype=float) - lower_multiples_median = np.median(lower_multiples) - lower_multiples_std = scipy.stats.tstd(lower_multiples) + threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(lower_multiples_std) == 0: - threshold_lower = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std) - threshold_lower_multiple = float(lower_readout_distribution.ppf(tail)) + if threshold_lower_multiple is not None: threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple) + else: + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper @@ -139,28 +132,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR else: # using unrestricted method upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float] - upper_half = np.array(upper_half_filtered, dtype=float) - upper_half_median = np.median(upper_half) - upper_half_std = scipy.stats.tstd(upper_half) + threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_half_std) == 0: - threshold_upper = filtered_median_float + if threshold_upper_result is not None: + threshold_upper = threshold_upper_result else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std) - threshold_upper = float(upper_readout_distribution.ppf(1 - tail)) + threshold_upper = rule_parameters.actual_value lower_half_list = [readout for readout in extracted if readout <= filtered_median_float] - lower_half = np.array(lower_half_list, dtype=float) - lower_half_median = np.median(lower_half) - lower_half_std = scipy.stats.tstd(lower_half) - - if float(lower_half_std) == 0: - threshold_lower = filtered_median_float + threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list, + degrees_of_freedom=degrees_of_freedom, tail=tail) + if threshold_lower_result is not None: + threshold_lower = threshold_lower_result else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std) - threshold_lower = float(lower_readout_distribution.ppf(tail)) + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper diff --git a/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py b/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py index 64932ebfca..26481a4051 100644 --- a/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py +++ b/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -107,30 +108,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR if all(readout > 0 for readout in extracted): # using a 0-based calculation (scale from 0) upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0: - threshold_upper = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float + else: + threshold_upper = rule_parameters.actual_value lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0] - lower_multiples = np.array(lower_median_multiples_array, dtype=float) - lower_multiples_median = np.median(lower_multiples) - lower_multiples_std = scipy.stats.tstd(lower_multiples) + threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(lower_multiples_std) == 0: - threshold_lower = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std) - threshold_lower_multiple = float(lower_readout_distribution.ppf(tail)) + if threshold_lower_multiple is not None: threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple) + else: + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper @@ -142,28 +135,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR else: # using unrestricted method upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float] - upper_half = np.array(upper_half_filtered, dtype=float) - upper_half_median = np.median(upper_half) - upper_half_std = scipy.stats.tstd(upper_half) + threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_half_std) == 0: - threshold_upper = filtered_median_float + if threshold_upper_result is not None: + threshold_upper = threshold_upper_result else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std) - threshold_upper = float(upper_readout_distribution.ppf(1 - tail)) + threshold_upper = rule_parameters.actual_value lower_half_list = [readout for readout in extracted if readout <= filtered_median_float] - lower_half = np.array(lower_half_list, dtype=float) - lower_half_median = np.median(lower_half) - lower_half_std = scipy.stats.tstd(lower_half) - - if float(lower_half_std) == 0: - threshold_lower = filtered_median_float + threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list, + degrees_of_freedom=degrees_of_freedom, tail=tail) + if threshold_lower_result is not None: + threshold_lower = threshold_lower_result else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std) - threshold_lower = float(lower_readout_distribution.ppf(tail)) + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper