Skip to content

Commit

Permalink
Stationary anomaly detection functions modified to call a library fun…
Browse files Browse the repository at this point in the history
…ction, to enable customization.
  • Loading branch information
piotrczarnas committed Oct 10, 2024
1 parent a5a4806 commit 27709c2
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 70 deletions.
7 changes: 7 additions & 0 deletions distribution/zip.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
</includes>
<outputDirectory>/lib</outputDirectory>
</fileSet>
<fileSet>
<directory>${project.basedir}/../home/lib/anomalies</directory>
<includes>
<include>*.py</include>
</includes>
<outputDirectory>/lib/anomalies</outputDirectory>
</fileSet>
<fileSet>
<directory>${project.basedir}/../home/lib</directory>
<includes>
Expand Down
28 changes: 28 additions & 0 deletions home/lib/anomalies/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright © 2021 DQOps (support@dqops.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
73 changes: 73 additions & 0 deletions home/lib/anomalies/anomaly_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Copyright © 2021 DQOps (support@dqops.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Sequence
import numpy as np
import scipy
import scipy.stats


def detect_upper_bound_anomaly(values_above_median: list[float], degrees_of_freedom: int, tail: float):
    """Calculate the upper anomaly threshold for a sample of historical values.

    A Student's t-distribution is located at the median of the sample and
    scaled by its sample standard deviation (``scipy.stats.tstd``). The
    threshold is the quantile at probability ``1 - tail``.

    Args:
        values_above_median: Historical values used to fit the distribution.
        degrees_of_freedom: The ``df`` parameter of the t-distribution.
        tail: Probability mass left above the returned threshold.

    Returns:
        The threshold as a ``float``. When all values are identical (standard
        deviation is zero) the median itself is returned. ``None`` is returned
        when the standard deviation cannot be estimated (fewer than two values,
        or non-finite values in the sample), so that the caller can apply its
        own fallback instead of comparing against NaN.
    """
    values_array = np.array(values_above_median, dtype=float)

    # The sample standard deviation (ddof=1) is undefined for fewer than two
    # values; bail out before numpy/scipy emit warnings and return NaN.
    if values_array.size < 2:
        return None

    values_median = float(np.median(values_array))
    values_std = float(scipy.stats.tstd(values_array))

    # NaN/inf in the input propagates to the deviation; a NaN threshold would
    # make every range comparison in the caller evaluate to False.
    if not np.isfinite(values_std):
        return None

    if values_std == 0:
        # No variability in the sample: the threshold collapses to the median.
        return values_median

    # Assumption: the historical data follows Student's t-distribution
    upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median,
                                               scale=values_std)
    return float(upper_readout_distribution.ppf(1 - tail))


def detect_lower_bound_anomaly(values_below_median: list[float], degrees_of_freedom: int, tail: float):
    """Calculate the lower anomaly threshold for a sample of historical values.

    A Student's t-distribution is located at the median of the sample and
    scaled by its sample standard deviation (``scipy.stats.tstd``). The
    threshold is the quantile at probability ``tail``.

    Args:
        values_below_median: Historical values used to fit the distribution.
        degrees_of_freedom: The ``df`` parameter of the t-distribution.
        tail: Probability mass left below the returned threshold.

    Returns:
        The threshold as a ``float``. When all values are identical (standard
        deviation is zero) the median itself is returned. ``None`` is returned
        when the standard deviation cannot be estimated (fewer than two values,
        or non-finite values in the sample), so that the caller can apply its
        own fallback instead of comparing against NaN.
    """
    values_array = np.array(values_below_median, dtype=float)

    # The sample standard deviation (ddof=1) is undefined for fewer than two
    # values; bail out before numpy/scipy emit warnings and return NaN.
    if values_array.size < 2:
        return None

    values_median = float(np.median(values_array))
    values_std = float(scipy.stats.tstd(values_array))

    # NaN/inf in the input propagates to the deviation; a NaN threshold would
    # make every range comparison in the caller evaluate to False.
    if not np.isfinite(values_std):
        return None

    if values_std == 0:
        # No variability in the sample: the threshold collapses to the median.
        return values_median

    # Assumption: the historical data follows Student's t-distribution
    lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median,
                                               scale=values_std)
    return float(lower_readout_distribution.ppf(tail))
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import numpy as np
import scipy
import scipy.stats
from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly


# rule specific parameters object, contains values received from the quality check threshold configuration
Expand Down Expand Up @@ -104,30 +105,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
if all(readout > 0 for readout in extracted):
# using a 0-based calculation (scale from 0)
upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float]
upper_multiples = np.array(upper_median_multiples_array, dtype=float)
upper_multiples_median = np.median(upper_multiples)
upper_multiples_std = scipy.stats.tstd(upper_multiples)
threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array,
degrees_of_freedom=degrees_of_freedom, tail=tail)

if float(upper_multiples_std) == 0:
threshold_upper = filtered_median_float
else:
# Assumption: the historical data follows t-student distribution
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std)
threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail))
if threshold_upper_multiple is not None:
threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float
else:
threshold_upper = rule_parameters.actual_value

lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0]
lower_multiples = np.array(lower_median_multiples_array, dtype=float)
lower_multiples_median = np.median(lower_multiples)
lower_multiples_std = scipy.stats.tstd(lower_multiples)
threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array,
degrees_of_freedom=degrees_of_freedom, tail=tail)

if float(lower_multiples_std) == 0:
threshold_lower = filtered_median_float
else:
# Assumption: the historical data follows t-student distribution
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std)
threshold_lower_multiple = float(lower_readout_distribution.ppf(tail))
if threshold_lower_multiple is not None:
threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple)
else:
threshold_lower = rule_parameters.actual_value

passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper

Expand All @@ -139,28 +132,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
else:
# using unrestricted method
upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float]
upper_half = np.array(upper_half_filtered, dtype=float)
upper_half_median = np.median(upper_half)
upper_half_std = scipy.stats.tstd(upper_half)
threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered,
degrees_of_freedom=degrees_of_freedom, tail=tail)

if float(upper_half_std) == 0:
threshold_upper = filtered_median_float
if threshold_upper_result is not None:
threshold_upper = threshold_upper_result
else:
# Assumption: the historical data follows t-student distribution
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std)
threshold_upper = float(upper_readout_distribution.ppf(1 - tail))
threshold_upper = rule_parameters.actual_value

lower_half_list = [readout for readout in extracted if readout <= filtered_median_float]
lower_half = np.array(lower_half_list, dtype=float)
lower_half_median = np.median(lower_half)
lower_half_std = scipy.stats.tstd(lower_half)

if float(lower_half_std) == 0:
threshold_lower = filtered_median_float
threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list,
degrees_of_freedom=degrees_of_freedom, tail=tail)
if threshold_lower_result is not None:
threshold_lower = threshold_lower_result
else:
# Assumption: the historical data follows t-student distribution
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std)
threshold_lower = float(lower_readout_distribution.ppf(tail))
threshold_lower = rule_parameters.actual_value

passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import numpy as np
import scipy
import scipy.stats
from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly


# rule specific parameters object, contains values received from the quality check threshold configuration
Expand Down Expand Up @@ -107,30 +108,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
if all(readout > 0 for readout in extracted):
# using a 0-based calculation (scale from 0)
upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float]
upper_multiples = np.array(upper_median_multiples_array, dtype=float)
upper_multiples_median = np.median(upper_multiples)
upper_multiples_std = scipy.stats.tstd(upper_multiples)
threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array,
degrees_of_freedom=degrees_of_freedom, tail=tail)

if float(upper_multiples_std) == 0:
threshold_upper = filtered_median_float
else:
# Assumption: the historical data follows t-student distribution
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std)
threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail))
if threshold_upper_multiple is not None:
threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float
else:
threshold_upper = rule_parameters.actual_value

lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0]
lower_multiples = np.array(lower_median_multiples_array, dtype=float)
lower_multiples_median = np.median(lower_multiples)
lower_multiples_std = scipy.stats.tstd(lower_multiples)
threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array,
degrees_of_freedom=degrees_of_freedom, tail=tail)

if float(lower_multiples_std) == 0:
threshold_lower = filtered_median_float
else:
# Assumption: the historical data follows t-student distribution
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std)
threshold_lower_multiple = float(lower_readout_distribution.ppf(tail))
if threshold_lower_multiple is not None:
threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple)
else:
threshold_lower = rule_parameters.actual_value

passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper

Expand All @@ -142,28 +135,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
else:
# using unrestricted method
upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float]
upper_half = np.array(upper_half_filtered, dtype=float)
upper_half_median = np.median(upper_half)
upper_half_std = scipy.stats.tstd(upper_half)
threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered,
degrees_of_freedom=degrees_of_freedom, tail=tail)

if float(upper_half_std) == 0:
threshold_upper = filtered_median_float
if threshold_upper_result is not None:
threshold_upper = threshold_upper_result
else:
# Assumption: the historical data follows t-student distribution
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std)
threshold_upper = float(upper_readout_distribution.ppf(1 - tail))
threshold_upper = rule_parameters.actual_value

lower_half_list = [readout for readout in extracted if readout <= filtered_median_float]
lower_half = np.array(lower_half_list, dtype=float)
lower_half_median = np.median(lower_half)
lower_half_std = scipy.stats.tstd(lower_half)

if float(lower_half_std) == 0:
threshold_lower = filtered_median_float
threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list,
degrees_of_freedom=degrees_of_freedom, tail=tail)
if threshold_lower_result is not None:
threshold_lower = threshold_lower_result
else:
# Assumption: the historical data follows t-student distribution
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std)
threshold_lower = float(lower_readout_distribution.ppf(tail))
threshold_lower = rule_parameters.actual_value

passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper

Expand Down

0 comments on commit 27709c2

Please sign in to comment.