From 27709c2c8a19bb07ca208fdb4cf243b6573c588b Mon Sep 17 00:00:00 2001
From: Piotr Czarnas <piotr.czarnas@gmail.com>
Date: Thu, 10 Oct 2024 23:25:35 +0200
Subject: [PATCH] Stationary anomaly detection functions modified to call a
 library function, to enable customization.

---
 distribution/zip.xml                          |  7 ++
 home/lib/anomalies/__init__.py                | 28 +++++++
 home/lib/anomalies/anomaly_detection.py       | 73 +++++++++++++++++++
 ...ly_stationary_percentile_moving_average.py | 56 ++++++--------
 ...onary_percentile_moving_average_30_days.py | 56 ++++++--------
 5 files changed, 150 insertions(+), 70 deletions(-)
 create mode 100644 home/lib/anomalies/__init__.py
 create mode 100644 home/lib/anomalies/anomaly_detection.py
diff --git a/distribution/zip.xml b/distribution/zip.xml
index db441b52d6..8b012f3db3 100644
--- a/distribution/zip.xml
+++ b/distribution/zip.xml
@@ -22,6 +22,13 @@
             </includes>
             <outputDirectory>/lib</outputDirectory>
         </fileSet>
+        <fileSet>
+            <directory>${project.basedir}/../home/lib/anomalies</directory>
+            <includes>
+                <include>*.py</include>
+            </includes>
+            <outputDirectory>/lib/anomalies</outputDirectory>
+        </fileSet>
         <fileSet>
             <directory>${project.basedir}/../home/lib</directory>
             <includes>
diff --git a/home/lib/anomalies/__init__.py b/home/lib/anomalies/__init__.py
new file mode 100644
index 0000000000..2a15ec29b9
--- /dev/null
+++ b/home/lib/anomalies/__init__.py
@@ -0,0 +1,28 @@
+#  Copyright © 2021 DQOps (support@dqops.com)
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/home/lib/anomalies/anomaly_detection.py b/home/lib/anomalies/anomaly_detection.py
new file mode 100644
index 0000000000..48dda5a69b
--- /dev/null
+++ b/home/lib/anomalies/anomaly_detection.py
@@ -0,0 +1,73 @@
+#  Copyright © 2021 DQOps (support@dqops.com)
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Sequence
+import numpy as np
+import scipy
+import scipy.stats
+
+
+def detect_upper_bound_anomaly(values_above_median: list[float], degrees_of_freedom: int, tail: float):
+    """Return the (1 - tail) quantile of a Student's t-distribution fitted to the values, or None for fewer than two values."""
+    values_array = np.asarray(values_above_median, dtype=float)
+    if values_array.size < 2:
+        return None  # the sample standard deviation is undefined, so no upper bound can be derived
+    values_median = float(np.median(values_array))  # plain float, so both return paths yield the same type
+    values_std = float(scipy.stats.tstd(values_array))
+    if values_std == 0:
+        return values_median  # all values are identical: there is no spread to model, the bound is the median
+    # Assumption: the historical data follows a Student's t-distribution located at the median
+    upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median, scale=values_std)
+    return float(upper_readout_distribution.ppf(1 - tail))
+
+
+def detect_lower_bound_anomaly(values_below_median: list[float], degrees_of_freedom: int, tail: float):
+    """Return the tail quantile of a Student's t-distribution fitted to the values, or None for fewer than two values."""
+    values_array = np.asarray(values_below_median, dtype=float)
+    if values_array.size < 2:
+        return None  # the sample standard deviation is undefined, so no lower bound can be derived
+    values_median = float(np.median(values_array))  # plain float, so both return paths yield the same type
+    values_std = float(scipy.stats.tstd(values_array))
+    if values_std == 0:
+        return values_median  # all values are identical: there is no spread to model, the bound is the median
+    # Assumption: the historical data follows a Student's t-distribution located at the median
+    lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median, scale=values_std)
+    return float(lower_readout_distribution.ppf(tail))
diff --git a/home/rules/percentile/anomaly_stationary_percentile_moving_average.py b/home/rules/percentile/anomaly_stationary_percentile_moving_average.py
index 74bbe7ce4b..270e7fe9ed 100644
--- a/home/rules/percentile/anomaly_stationary_percentile_moving_average.py
+++ b/home/rules/percentile/anomaly_stationary_percentile_moving_average.py
@@ -19,6 +19,7 @@
 import numpy as np
 import scipy
 import scipy.stats
+from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly
 
 
 # rule specific parameters object, contains values received from the quality check threshold configuration
@@ -104,30 +105,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
     if all(readout > 0 for readout in extracted):
         # using a 0-based calculation (scale from 0)
         upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float]
-        upper_multiples = np.array(upper_median_multiples_array, dtype=float)
-        upper_multiples_median = np.median(upper_multiples)
-        upper_multiples_std = scipy.stats.tstd(upper_multiples)
+        threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array,
+                                                              degrees_of_freedom=degrees_of_freedom, tail=tail)
 
-        if float(upper_multiples_std) == 0:
-            threshold_upper = filtered_median_float
-        else:
-            # Assumption: the historical data follows t-student distribution
-            upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std)
-            threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail))
+        if threshold_upper_multiple is not None:
             threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float
+        else:
+            threshold_upper = rule_parameters.actual_value
 
         lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0]
-        lower_multiples = np.array(lower_median_multiples_array, dtype=float)
-        lower_multiples_median = np.median(lower_multiples)
-        lower_multiples_std = scipy.stats.tstd(lower_multiples)
+        threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array,
+                                                              degrees_of_freedom=degrees_of_freedom, tail=tail)
 
-        if float(lower_multiples_std) == 0:
-            threshold_lower = filtered_median_float
-        else:
-            # Assumption: the historical data follows t-student distribution
-            lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std)
-            threshold_lower_multiple = float(lower_readout_distribution.ppf(tail))
+        if threshold_lower_multiple is not None:
             threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple)
+        else:
+            threshold_lower = rule_parameters.actual_value
 
         passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper
 
@@ -139,28 +132,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
     else:
         # using unrestricted method
         upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float]
-        upper_half = np.array(upper_half_filtered, dtype=float)
-        upper_half_median = np.median(upper_half)
-        upper_half_std = scipy.stats.tstd(upper_half)
+        threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered,
+                                                            degrees_of_freedom=degrees_of_freedom, tail=tail)
 
-        if float(upper_half_std) == 0:
-            threshold_upper = filtered_median_float
+        if threshold_upper_result is not None:
+            threshold_upper = threshold_upper_result
         else:
-            # Assumption: the historical data follows t-student distribution
-            upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std)
-            threshold_upper = float(upper_readout_distribution.ppf(1 - tail))
+            threshold_upper = rule_parameters.actual_value
 
         lower_half_list = [readout for readout in extracted if readout <= filtered_median_float]
-        lower_half = np.array(lower_half_list, dtype=float)
-        lower_half_median = np.median(lower_half)
-        lower_half_std = scipy.stats.tstd(lower_half)
-
-        if float(lower_half_std) == 0:
-            threshold_lower = filtered_median_float
+        threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list,
+                                                            degrees_of_freedom=degrees_of_freedom, tail=tail)
+        if threshold_lower_result is not None:
+            threshold_lower = threshold_lower_result
         else:
-            # Assumption: the historical data follows t-student distribution
-            lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std)
-            threshold_lower = float(lower_readout_distribution.ppf(tail))
+            threshold_lower = rule_parameters.actual_value
 
         passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper
 
diff --git a/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py b/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py
index 64932ebfca..26481a4051 100644
--- a/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py
+++ b/home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py
@@ -19,6 +19,7 @@
 import numpy as np
 import scipy
 import scipy.stats
+from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly
 
 
 # rule specific parameters object, contains values received from the quality check threshold configuration
@@ -107,30 +108,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
     if all(readout > 0 for readout in extracted):
         # using a 0-based calculation (scale from 0)
         upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float]
-        upper_multiples = np.array(upper_median_multiples_array, dtype=float)
-        upper_multiples_median = np.median(upper_multiples)
-        upper_multiples_std = scipy.stats.tstd(upper_multiples)
+        threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array,
+                                                              degrees_of_freedom=degrees_of_freedom, tail=tail)
 
-        if float(upper_multiples_std) == 0:
-            threshold_upper = filtered_median_float
-        else:
-            # Assumption: the historical data follows t-student distribution
-            upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std)
-            threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail))
+        if threshold_upper_multiple is not None:
             threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float
+        else:
+            threshold_upper = rule_parameters.actual_value
 
         lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0]
-        lower_multiples = np.array(lower_median_multiples_array, dtype=float)
-        lower_multiples_median = np.median(lower_multiples)
-        lower_multiples_std = scipy.stats.tstd(lower_multiples)
+        threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array,
+                                                              degrees_of_freedom=degrees_of_freedom, tail=tail)
 
-        if float(lower_multiples_std) == 0:
-            threshold_lower = filtered_median_float
-        else:
-            # Assumption: the historical data follows t-student distribution
-            lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std)
-            threshold_lower_multiple = float(lower_readout_distribution.ppf(tail))
+        if threshold_lower_multiple is not None:
             threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple)
+        else:
+            threshold_lower = rule_parameters.actual_value
 
         passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper
 
@@ -142,28 +135,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
     else:
         # using unrestricted method
         upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float]
-        upper_half = np.array(upper_half_filtered, dtype=float)
-        upper_half_median = np.median(upper_half)
-        upper_half_std = scipy.stats.tstd(upper_half)
+        threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered,
+                                                            degrees_of_freedom=degrees_of_freedom, tail=tail)
 
-        if float(upper_half_std) == 0:
-            threshold_upper = filtered_median_float
+        if threshold_upper_result is not None:
+            threshold_upper = threshold_upper_result
         else:
-            # Assumption: the historical data follows t-student distribution
-            upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std)
-            threshold_upper = float(upper_readout_distribution.ppf(1 - tail))
+            threshold_upper = rule_parameters.actual_value
 
         lower_half_list = [readout for readout in extracted if readout <= filtered_median_float]
-        lower_half = np.array(lower_half_list, dtype=float)
-        lower_half_median = np.median(lower_half)
-        lower_half_std = scipy.stats.tstd(lower_half)
-
-        if float(lower_half_std) == 0:
-            threshold_lower = filtered_median_float
+        threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list,
+                                                            degrees_of_freedom=degrees_of_freedom, tail=tail)
+        if threshold_lower_result is not None:
+            threshold_lower = threshold_lower_result
         else:
-            # Assumption: the historical data follows t-student distribution
-            lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std)
-            threshold_lower = float(lower_readout_distribution.ppf(tail))
+            threshold_lower = rule_parameters.actual_value
 
         passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper