Skip to content

Commit

Permalink
Add kscomplement fix
Browse files Browse the repository at this point in the history
  • Loading branch information
fealho committed Nov 15, 2024
1 parent 5193369 commit 7412dfb
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 8 deletions.
5 changes: 5 additions & 0 deletions sdmetrics/single_column/statistical/kscomplement.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Kolmogorov-Smirnov test based Metric."""

import sys

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
Expand Down Expand Up @@ -58,6 +60,9 @@ def compute(real_data, synthetic_data):
synthetic_data = pd.to_numeric(synthetic_data)

try:
max_decimals = sys.float_info.dig - 1
real_data = real_data.round(max_decimals)
synthetic_data = synthetic_data.round(max_decimals)
statistic, _ = ks_2samp(real_data, synthetic_data)
except ValueError as e:
if str(e) == 'Data passed to ks_2samp must not be empty':
Expand Down
6 changes: 3 additions & 3 deletions tests/integration/reports/multi_table/test_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,11 @@ def test_quality_report_with_errors():
'Error': [
None,
None,
"TypeError: '<' not supported between instances of 'int' and 'str'",
"TypeError: can't multiply sequence by non-int of type 'float'",
np.nan,
np.nan,
"TypeError: '<' not supported between instances of 'Timestamp' and 'str'",
"TypeError: '<' not supported between instances of 'float' and 'str'",
"TypeError: can't multiply sequence by non-int of type 'float'",
"TypeError: can't multiply sequence by non-int of type 'float'",
None,
],
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,8 @@ def test_get_score_errors(self):
# Run
column_shape_property = ColumnShapes()

expected_message_1 = (
"TypeError: '<' not supported between instances of 'Timestamp' and 'int'"
)
expected_message_2 = "TypeError: '<' not supported between instances of 'str' and 'float'"
expected_message_1 = "TypeError: unsupported operand type(s) for *: 'Timestamp' and 'float'"
expected_message_2 = "TypeError: can't multiply sequence by non-int of type 'float'"

score = column_shape_property.get_score(real_data, synthetic_data, metadata)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,10 @@ def test_bad(array_like):

assert 0.0 <= output < 0.5
assert 0.0 <= normalized < 0.5


def test_one_float_value():
    """Test ``KSComplement.compute`` on single-value columns built from float arithmetic.

    Each column holds one value produced by a subtraction (``0.3 - 0.2`` and
    ``0.2 - 0.1``); the metric is expected to return exactly ``1`` for the pair.
    """
    # Setup
    real_column = pd.Series([0.3 - 0.2])
    synthetic_column = pd.Series([0.2 - 0.1])

    # Run
    score = KSComplement.compute(real_column, synthetic_column)

    # Assert
    assert score == 1
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test__generate_details_error(self):
result = column_shape_property._generate_details(real_data, synthetic_data, metadata)

# Assert
expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'"
expected_message = "TypeError: can't multiply sequence by non-int of type 'float'"
result_nan = result.loc[pd.isna(result['Score'])]
column_names_nan = result_nan['Column'].tolist()
error_message = result_nan['Error'].tolist()
Expand Down

0 comments on commit 7412dfb

Please sign in to comment.