diff --git a/HISTORY.md b/HISTORY.md index 1ad7bd05..89257a1b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,13 @@ # History +## v0.9.1 - 2023-02-17 + +This release fixes bugs in the existing metrics and reports. + +### Bug Fixes +* Fix the missing value percentage shown for synthetic data in discrete and continuous column plots - Issue [#296](https://github.com/sdv-dev/SDMetrics/issues/296) by @R-Palazzo +* Support new metadata for datetime_format - Issue [#303](https://github.com/sdv-dev/SDMetrics/issues/303) by @frances-h + ## v0.9.0 - 2023-01-18 This release supports Python 3.10 and drops support for Python 3.6. We also add a verbosity argument to report generation. diff --git a/conda/meta.yaml b/conda/meta.yaml index 3d3549ee..d6640646 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,4 +1,4 @@ -{% set version = '0.9.0' %} +{% set version = '0.9.1.dev1' %} package: name: "{{ name|lower }}" diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py index cfe69936..89e4a4d5 100644 --- a/sdmetrics/__init__.py +++ b/sdmetrics/__init__.py @@ -4,7 +4,7 @@ __author__ = 'MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' -__version__ = '0.9.0' +__version__ = '0.9.1.dev1' import pandas as pd diff --git a/sdmetrics/reports/utils.py b/sdmetrics/reports/utils.py index 27c0347b..ddaa898f 100644 --- a/sdmetrics/reports/utils.py +++ b/sdmetrics/reports/utils.py @@ -10,7 +10,8 @@ from pandas.core.tools.datetimes import _guess_datetime_format_for_array from sdmetrics.utils import ( - get_alternate_keys, get_columns_from_metadata, get_type_from_column_meta, is_datetime) + get_alternate_keys, get_columns_from_metadata, get_missing_percentage, + get_type_from_column_meta, is_datetime) DATACEBO_DARK = '#000036' DATACEBO_LIGHT = '#01E0C9' @@ -85,8 +86,8 @@ def make_discrete_column_plot(real_column, synthetic_column, sdtype): synthetic_data = pd.DataFrame({'values': synthetic_column.copy()}) synthetic_data['Data'] = 'Synthetic' - missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2) - 
missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2) + missing_data_real = get_missing_percentage(real_column) + missing_data_synthetic = get_missing_percentage(synthetic_column) all_data = pd.concat([real_data, synthetic_data], axis=0, ignore_index=True) @@ -153,8 +154,8 @@ def make_continuous_column_plot(real_column, synthetic_column, sdtype): plotly.graph_objects._figure.Figure """ column_name = real_column.name if hasattr(real_column, 'name') else '' - missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2) - missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2) + missing_data_real = get_missing_percentage(real_column) + missing_data_synthetic = get_missing_percentage(synthetic_column) real_data = real_column.dropna() synthetic_data = synthetic_column.dropna() @@ -473,9 +474,10 @@ def discretize_table_data(real_data, synthetic_data, metadata): real_col = real_data[field_name] synthetic_col = synthetic_data[field_name] if field_type == 'datetime': - if real_col.dtype == 'O' and field_meta.get('format', ''): - real_col = pd.to_datetime(real_col, format=field_meta['format']) - synthetic_col = pd.to_datetime(synthetic_col, format=field_meta['format']) + datetime_format = field_meta.get('format') or field_meta.get('datetime_format') + if real_col.dtype == 'O' and datetime_format: + real_col = pd.to_datetime(real_col, format=datetime_format) + synthetic_col = pd.to_datetime(synthetic_col, format=datetime_format) real_col = pd.to_numeric(real_col) synthetic_col = pd.to_numeric(synthetic_col) diff --git a/sdmetrics/utils.py b/sdmetrics/utils.py index e1c1bfca..532d73ae 100644 --- a/sdmetrics/utils.py +++ b/sdmetrics/utils.py @@ -74,6 +74,20 @@ def get_frequencies(real, synthetic): return f_obs, f_exp +def get_missing_percentage(data_column): + """Compute the missing value percentage of a column. + + Args: + data_column (pandas.Series): + The data of the desired column. 
+ + Returns: + float: + Percentage of missing values inside the column. + """ + return round((data_column.isna().sum() / len(data_column)) * 100, 2) + + def get_cardinality_distribution(parent_column, child_column): """Compute the cardinality distribution of the (parent, child) pairing. diff --git a/setup.cfg b/setup.cfg index 8ccf2a59..87031d5b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.9.0 +current_version = 0.9.1.dev1 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? diff --git a/setup.py b/setup.py index 1e2d5d4a..dee1c5dc 100644 --- a/setup.py +++ b/setup.py @@ -125,6 +125,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/SDMetrics', - version='0.9.0', + version='0.9.1.dev1', zip_safe=False, ) diff --git a/tests/unit/reports/test_utils.py b/tests/unit/reports/test_utils.py index afbffd95..5de4bf7b 100644 --- a/tests/unit/reports/test_utils.py +++ b/tests/unit/reports/test_utils.py @@ -898,6 +898,79 @@ def test_discretize_table_data(): } +def test_discretize_table_data_new_metadata(): + """Test the ``discretize_table_data`` method with new metadata. + + Expect that numerical and datetime fields are discretized. 
+ + Input: + - real data + - synthetic data + - metadata + + Output: + - discretized real data + - discretized synthetic data + - updated metadata + """ + # Setup + real_data = pd.DataFrame({ + 'col1': [1, 2, 3], + 'col2': ['a', 'b', 'c'], + 'col3': [datetime(2020, 1, 2), datetime(2019, 10, 1), datetime(2021, 3, 2)], + 'col4': [True, False, True], + 'col5': [date(2020, 1, 2), date(2010, 10, 12), date(2021, 1, 2)], + }) + synthetic_data = pd.DataFrame({ + 'col1': [3, 1, 4], + 'col2': ['c', 'a', 'c'], + 'col3': [datetime(2021, 3, 2), datetime(2018, 11, 2), datetime(2020, 5, 7)], + 'col4': [False, False, True], + 'col5': [date(2020, 5, 3), date(2015, 11, 15), date(2022, 3, 2)], + }) + metadata = { + 'fields': { + 'col1': {'sdtype': 'numerical'}, + 'col2': {'sdtype': 'categorical'}, + 'col3': {'sdtype': 'datetime'}, + 'col4': {'sdtype': 'boolean'}, + 'col5': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'}, + }, + } + + # Run + discretized_real, discretized_synth, updated_metadata = discretize_table_data( + real_data, synthetic_data, metadata) + + # Assert + expected_real = pd.DataFrame({ + 'col1': [1, 6, 11], + 'col2': ['a', 'b', 'c'], + 'col3': [2, 1, 11], + 'col4': [True, False, True], + 'col5': [10, 1, 11], + }) + expected_synth = pd.DataFrame({ + 'col1': [11, 1, 11], + 'col2': ['c', 'a', 'c'], + 'col3': [11, 0, 5], + 'col4': [False, False, True], + 'col5': [10, 5, 11], + }) + + pd.testing.assert_frame_equal(discretized_real, expected_real) + pd.testing.assert_frame_equal(discretized_synth, expected_synth) + assert updated_metadata == { + 'fields': { + 'col1': {'sdtype': 'categorical'}, + 'col2': {'sdtype': 'categorical'}, + 'col3': {'sdtype': 'categorical'}, + 'col4': {'sdtype': 'boolean'}, + 'col5': {'sdtype': 'categorical'}, + }, + } + + @patch('sdmetrics.reports.utils.discretize_table_data') def test_discretize_and_apply_metric(discretize_table_data_mock): """Test the ``discretize_and_apply_metric`` method. 
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 1c9218d0..07fec083 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -6,7 +6,7 @@ from sdmetrics.utils import ( HyperTransformer, get_alternate_keys, get_cardinality_distribution, get_columns_from_metadata, - get_type_from_column_meta) + get_missing_percentage, get_type_from_column_meta) def test_get_cardinality_distribution(): @@ -30,6 +30,25 @@ def test_get_cardinality_distribution(): assert cardinality_distribution.to_list() == [2.0, 0.0, 1.0, 3.0, 1.0] +def test_get_missing_percentage(): + """Test the ``get_missing_percentage`` utility function. + + Input: + - test column + + Output: + - the expected percentage of NaN inside the column. + """ + # Setup + column = pd.Series([1, 2, 3, np.nan, 5, 6, np.nan]) + + # Run + percentage_nan = get_missing_percentage(column) + + # Assert + assert percentage_nan == 28.57 + + def test_get_columns_from_metadata(): """Test the ``get_columns_from_metadata`` method with current metadata format.