From 97538ad70e86ad18f7388cd8ebd52f31c00f4025 Mon Sep 17 00:00:00 2001 From: Felipe Date: Fri, 15 Nov 2024 10:21:01 -0800 Subject: [PATCH 1/5] Add kscomplement fix --- sdmetrics/single_column/statistical/kscomplement.py | 9 +++++++++ .../single_column/statistical/test_kscomplement.py | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/sdmetrics/single_column/statistical/kscomplement.py b/sdmetrics/single_column/statistical/kscomplement.py index 525e85c7..038662a3 100644 --- a/sdmetrics/single_column/statistical/kscomplement.py +++ b/sdmetrics/single_column/statistical/kscomplement.py @@ -1,5 +1,7 @@ """Kolmogorov-Smirnov test based Metric.""" +import sys + import numpy as np import pandas as pd from scipy.stats import ks_2samp @@ -57,6 +59,13 @@ def compute(real_data, synthetic_data): real_data = pd.to_numeric(real_data) synthetic_data = pd.to_numeric(synthetic_data) + try: + max_decimals = sys.float_info.dig - 1 + real_data = real_data.round(max_decimals) + synthetic_data = synthetic_data.round(max_decimals) + except TypeError: + pass + try: statistic, _ = ks_2samp(real_data, synthetic_data) except ValueError as e: diff --git a/tests/integration/single_column/statistical/test_kscomplement.py b/tests/integration/single_column/statistical/test_kscomplement.py index d15e8bca..081e731e 100644 --- a/tests/integration/single_column/statistical/test_kscomplement.py +++ b/tests/integration/single_column/statistical/test_kscomplement.py @@ -46,3 +46,10 @@ def test_bad(array_like): assert 0.0 <= output < 0.5 assert 0.0 <= normalized < 0.5 + + +def test_one_float_value(): + real = pd.Series([0.3 - 0.2]) + synth = pd.Series([0.2 - 0.1]) + output = KSComplement.compute(real, synth) + assert output == 1 From 3a7e0245946a33ac4fcb746f964dac535cfe71a2 Mon Sep 17 00:00:00 2001 From: Felipe Date: Wed, 20 Nov 2024 11:02:17 -0800 Subject: [PATCH 2/5] Update tests --- .../single_column/statistical/kscomplement.py | 9 +++----- .../multi_table/test_quality_report.py | 16 +++++++++++--- .../_properties/test_column_shapes.py | 21 +++++++++++++------ .../_properties/test_column_shapes.py | 9 +++++++- 4 files changed, 39 insertions(+), 16 deletions(-) diff --git a/sdmetrics/single_column/statistical/kscomplement.py b/sdmetrics/single_column/statistical/kscomplement.py index 038662a3..e8f0b182 100644 --- a/sdmetrics/single_column/statistical/kscomplement.py +++ b/sdmetrics/single_column/statistical/kscomplement.py @@ -59,12 +59,9 @@ def compute(real_data, synthetic_data): real_data = pd.to_numeric(real_data) synthetic_data = pd.to_numeric(synthetic_data) - try: - max_decimals = sys.float_info.dig - 1 - real_data = real_data.round(max_decimals) - synthetic_data = synthetic_data.round(max_decimals) - except TypeError: - pass + max_decimals = sys.float_info.dig - 1 + real_data = real_data.round(max_decimals) + synthetic_data = synthetic_data.round(max_decimals) try: statistic, _ = ks_2samp(real_data, synthetic_data) diff --git a/tests/integration/reports/multi_table/test_quality_report.py b/tests/integration/reports/multi_table/test_quality_report.py index fb571e81..d8df85c5 100644 --- a/tests/integration/reports/multi_table/test_quality_report.py +++ b/tests/integration/reports/multi_table/test_quality_report.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from packaging import version from sdmetrics.demos import load_demo from sdmetrics.reports.multi_table.quality_report import QualityReport @@ -299,6 +300,15 @@ def test_quality_report_with_errors(): 'Property': ['Column Shapes', 'Column Pair Trends', 'Cardinality', 'Intertable Trends'], 'Score': [0.8165079365079364, 0.55, 0.95, 0.5833333333333334], }) + + numpy_version = version.parse(np.__version__) + if numpy_version >= version.parse('1.19.0'): + err1 = "TypeError: '<' not supported between instances of 'int' and 'str'" + err2 = "TypeError: '<' not supported between instances of 'Timestamp' and 'str'" + err3 = "TypeError: '<' not supported between instances of 'float' and 'str'" + else: + err1 = err2 = err3 = "TypeError: can't multiply sequence by non-int of type 'float'" + expected_details = pd.DataFrame({ 'Table': [ 'users', @@ -334,11 +344,11 @@ def test_quality_report_with_errors(): 'Error': [ None, None, - "TypeError: '<' not supported between instances of 'int' and 'str'", + err1, np.nan, np.nan, - "TypeError: '<' not supported between instances of 'Timestamp' and 'str'", - "TypeError: '<' not supported between instances of 'float' and 'str'", + err2, + err3, None, ], }) diff --git a/tests/integration/reports/single_table/_properties/test_column_shapes.py b/tests/integration/reports/single_table/_properties/test_column_shapes.py index 5d490027..c3e6bdfc 100644 --- a/tests/integration/reports/single_table/_properties/test_column_shapes.py +++ b/tests/integration/reports/single_table/_properties/test_column_shapes.py @@ -1,4 +1,6 @@ +import numpy as np import pandas as pd +from packaging import version from sdmetrics.demos import load_demo from sdmetrics.reports.single_table._properties import ColumnShapes @@ -84,15 +86,22 @@ def test_get_score_errors(self): # Run column_shape_property = ColumnShapes() - - expected_message_1 = ( - "TypeError: '<' not supported between instances of 'Timestamp' and 'int'" - ) - expected_message_2 = "TypeError: '<' not supported between instances of 'str' and 'float'" - score = column_shape_property.get_score(real_data, synthetic_data, metadata) # Assert + numpy_version = version.parse(np.__version__) + if numpy_version >= version.parse('1.19.0'): + expected_message_1 = ( + "TypeError: '<' not supported between instances of 'Timestamp' and 'int'" + ) + expected_message_2 = ( + "TypeError: '<' not supported between instances of 'str' and 'float'" + ) + else: + expected_message_1 = ( + "TypeError: unsupported operand type(s) for *: 'Timestamp' and 'float'" + ) + expected_message_2 = "TypeError: can't multiply sequence by non-int of type 'float'" details = column_shape_property.details details_nan = details.loc[pd.isna(details['Score'])] diff --git a/tests/unit/reports/single_table/_properties/test_column_shapes.py b/tests/unit/reports/single_table/_properties/test_column_shapes.py index f9c688cf..29237c31 100644 --- a/tests/unit/reports/single_table/_properties/test_column_shapes.py +++ b/tests/unit/reports/single_table/_properties/test_column_shapes.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +from packaging import version from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes @@ -108,7 +109,13 @@ def test__generate_details_error(self): result = column_shape_property._generate_details(real_data, synthetic_data, metadata) # Assert - expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'" + numpy_version = version.parse(np.__version__) + print(numpy_version) + if numpy_version >= version.parse('1.19.0'): + expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'" + else: + expected_message = "TypeError: can't multiply sequence by non-int of type 'float'" + result_nan = result.loc[pd.isna(result['Score'])] column_names_nan = result_nan['Column'].tolist() error_message = result_nan['Error'].tolist() From 276252b03003c0eddec02e68dd480d3545b4ddd7 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 25 Nov 2024 07:19:37 -0800 Subject: [PATCH 3/5] Fix tests --- tests/integration/reports/multi_table/test_quality_report.py | 4 ++-- .../reports/single_table/_properties/test_column_shapes.py | 5 ++--- .../reports/single_table/_properties/test_column_shapes.py | 5 ++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/integration/reports/multi_table/test_quality_report.py b/tests/integration/reports/multi_table/test_quality_report.py index d8df85c5..8e372459 100644 --- a/tests/integration/reports/multi_table/test_quality_report.py +++ b/tests/integration/reports/multi_table/test_quality_report.py @@ -301,8 +301,8 @@ def test_quality_report_with_errors(): 'Score': [0.8165079365079364, 0.55, 0.95, 0.5833333333333334], }) - numpy_version = version.parse(np.__version__) - if numpy_version >= version.parse('1.19.0'): + pandas_version = version.parse(pd.__version__) + if pandas_version >= version.parse('2.2.0'): err1 = "TypeError: '<' not supported between instances of 'int' and 'str'" err2 = "TypeError: '<' not supported between instances of 'Timestamp' and 'str'" err3 = "TypeError: '<' not supported between instances of 'float' and 'str'" diff --git a/tests/integration/reports/single_table/_properties/test_column_shapes.py b/tests/integration/reports/single_table/_properties/test_column_shapes.py index c3e6bdfc..748e9db2 100644 --- a/tests/integration/reports/single_table/_properties/test_column_shapes.py +++ b/tests/integration/reports/single_table/_properties/test_column_shapes.py @@ -1,4 +1,3 @@ -import numpy as np import pandas as pd from packaging import version @@ -89,8 +88,8 @@ def test_get_score_errors(self): score = column_shape_property.get_score(real_data, synthetic_data, metadata) # Assert - numpy_version = version.parse(np.__version__) - if numpy_version >= version.parse('1.19.0'): + pandas_version = version.parse(pd.__version__) + if pandas_version >= version.parse('2.2.0'): expected_message_1 = ( "TypeError: '<' not supported between instances of 'Timestamp' and 'int'" ) diff --git a/tests/unit/reports/single_table/_properties/test_column_shapes.py b/tests/unit/reports/single_table/_properties/test_column_shapes.py index 29237c31..bc84b100 100644 --- a/tests/unit/reports/single_table/_properties/test_column_shapes.py +++ b/tests/unit/reports/single_table/_properties/test_column_shapes.py @@ -109,9 +109,8 @@ def test__generate_details_error(self): result = column_shape_property._generate_details(real_data, synthetic_data, metadata) # Assert - numpy_version = version.parse(np.__version__) - print(numpy_version) - if numpy_version >= version.parse('1.19.0'): + pandas_version = version.parse(pd.__version__) + if pandas_version >= version.parse('2.2.0'): expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'" else: expected_message = "TypeError: can't multiply sequence by non-int of type 'float'" From 031febf1b389e1937068df3514444ed297244742 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 25 Nov 2024 07:38:12 -0800 Subject: [PATCH 4/5] Feedback --- sdmetrics/single_column/statistical/kscomplement.py | 7 ++++--- .../single_column/statistical/test_kscomplement.py | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sdmetrics/single_column/statistical/kscomplement.py b/sdmetrics/single_column/statistical/kscomplement.py index e8f0b182..ef45efb7 100644 --- a/sdmetrics/single_column/statistical/kscomplement.py +++ b/sdmetrics/single_column/statistical/kscomplement.py @@ -10,6 +10,8 @@ from sdmetrics.single_column.base import SingleColumnMetric from sdmetrics.utils import is_datetime +MAX_DECIMALS = sys.float_info.dig - 1 + class KSComplement(SingleColumnMetric): """Kolmogorov-Smirnov statistic based metric. @@ -59,9 +61,8 @@ def compute(real_data, synthetic_data): real_data = pd.to_numeric(real_data) synthetic_data = pd.to_numeric(synthetic_data) - max_decimals = sys.float_info.dig - 1 - real_data = real_data.round(max_decimals) - synthetic_data = synthetic_data.round(max_decimals) + real_data = real_data.round(MAX_DECIMALS) + synthetic_data = synthetic_data.round(MAX_DECIMALS) try: statistic, _ = ks_2samp(real_data, synthetic_data) diff --git a/tests/integration/single_column/statistical/test_kscomplement.py b/tests/integration/single_column/statistical/test_kscomplement.py index 081e731e..e377c46c 100644 --- a/tests/integration/single_column/statistical/test_kscomplement.py +++ b/tests/integration/single_column/statistical/test_kscomplement.py @@ -49,7 +49,12 @@ def test_bad(array_like): def test_one_float_value(): + # Setup real = pd.Series([0.3 - 0.2]) synth = pd.Series([0.2 - 0.1]) + + # Run output = KSComplement.compute(real, synth) + + # Assert assert output == 1 From 37004191991ad45354b88983c23891bf48b7cdd0 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 25 Nov 2024 07:47:43 -0800 Subject: [PATCH 5/5] Docstring --- tests/integration/single_column/statistical/test_kscomplement.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/single_column/statistical/test_kscomplement.py b/tests/integration/single_column/statistical/test_kscomplement.py index e377c46c..d2baf5a0 100644 --- a/tests/integration/single_column/statistical/test_kscomplement.py +++ b/tests/integration/single_column/statistical/test_kscomplement.py @@ -49,6 +49,7 @@ def test_bad(array_like): def test_one_float_value(): + """Test KSComplement.compute when both data have the same float values GH#652.""" # Setup real = pd.Series([0.3 - 0.2]) synth = pd.Series([0.2 - 0.1])