make release-tag: Merge branch 'master' into stable

sdv-dev · Feb 17, 2023 · cffb12e · cffb12e
2 parents bb7a3be + 5390ef8
commit cffb12e
Show file tree

Hide file tree

Showing 9 changed files with 129 additions and 13 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,13 @@
 # History
 
+## v0.9.1 - 2023-02-17
+
+This release fixes bugs in the existing metrics and reports.
+
+### Bug Fixes
+* Fix the missing value percentage reported for synthetic data in discrete and continuous column plots - Issue [#296](https://github.com/sdv-dev/SDMetrics/issues/296) by @R-Palazzo
+* Support new metadata for datetime_format - Issue [#303](https://github.com/sdv-dev/SDMetrics/issues/303) by @frances-h
+
 ## v0.9.0 - 2023-01-18
 
 This release supports Python 3.10 and drops support for Python 3.6. We also add a verbosity argument to report generation.

diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = '0.9.0' %}
+{% set version = '0.9.1.dev1' %}
 
 package:
   name: "{{ name|lower }}"

diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.9.0'
+__version__ = '0.9.1.dev1'
 
 import pandas as pd
 

diff --git a/sdmetrics/reports/utils.py b/sdmetrics/reports/utils.py
@@ -10,7 +10,8 @@
 from pandas.core.tools.datetimes import _guess_datetime_format_for_array
 
 from sdmetrics.utils import (
-    get_alternate_keys, get_columns_from_metadata, get_type_from_column_meta, is_datetime)
+    get_alternate_keys, get_columns_from_metadata, get_missing_percentage,
+    get_type_from_column_meta, is_datetime)
 
 DATACEBO_DARK = '#000036'
 DATACEBO_LIGHT = '#01E0C9'
@@ -85,8 +86,8 @@ def make_discrete_column_plot(real_column, synthetic_column, sdtype):
     synthetic_data = pd.DataFrame({'values': synthetic_column.copy()})
     synthetic_data['Data'] = 'Synthetic'
 
-    missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2)
-    missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2)
+    missing_data_real = get_missing_percentage(real_column)
+    missing_data_synthetic = get_missing_percentage(synthetic_column)
 
     all_data = pd.concat([real_data, synthetic_data], axis=0, ignore_index=True)
 
@@ -153,8 +154,8 @@ def make_continuous_column_plot(real_column, synthetic_column, sdtype):
         plotly.graph_objects._figure.Figure
     """
     column_name = real_column.name if hasattr(real_column, 'name') else ''
-    missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2)
-    missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2)
+    missing_data_real = get_missing_percentage(real_column)
+    missing_data_synthetic = get_missing_percentage(synthetic_column)
 
     real_data = real_column.dropna()
     synthetic_data = synthetic_column.dropna()
@@ -473,9 +474,10 @@ def discretize_table_data(real_data, synthetic_data, metadata):
             real_col = real_data[field_name]
             synthetic_col = synthetic_data[field_name]
             if field_type == 'datetime':
-                if real_col.dtype == 'O' and field_meta.get('format', ''):
-                    real_col = pd.to_datetime(real_col, format=field_meta['format'])
-                    synthetic_col = pd.to_datetime(synthetic_col, format=field_meta['format'])
+                datetime_format = field_meta.get('format') or field_meta.get('datetime_format')
+                if real_col.dtype == 'O' and datetime_format:
+                    real_col = pd.to_datetime(real_col, format=datetime_format)
+                    synthetic_col = pd.to_datetime(synthetic_col, format=datetime_format)
 
                 real_col = pd.to_numeric(real_col)
                 synthetic_col = pd.to_numeric(synthetic_col)

diff --git a/sdmetrics/utils.py b/sdmetrics/utils.py
@@ -74,6 +74,20 @@ def get_frequencies(real, synthetic):
     return f_obs, f_exp
 
 
+def get_missing_percentage(data_column):
+    """Compute the missing value percentage of a column.
+
+    Args:
+        data_column (pandas.Series):
+            The data of the desired column.
+
+    Returns:
+        float:
+            Percentage (0-100) of missing values inside the column, rounded to two decimals.
+    """
+    return round((data_column.isna().sum() / len(data_column)) * 100, 2)
+
+
 def get_cardinality_distribution(parent_column, child_column):
     """Compute the cardinality distribution of the (parent, child) pairing.
 

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.9.0
+current_version = 0.9.1.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?

diff --git a/setup.py b/setup.py
@@ -125,6 +125,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sdv-dev/SDMetrics',
-    version='0.9.0',
+    version='0.9.1.dev1',
     zip_safe=False,
 )
diff --git a/tests/unit/reports/test_utils.py b/tests/unit/reports/test_utils.py
@@ -898,6 +898,79 @@ def test_discretize_table_data():
     }
 
 
+def test_discretize_table_data_new_metadata():
+    """Test the ``discretize_table_data`` method with new metadata.
+
+    Expect that numerical and datetime fields are discretized.
+
+    Input:
+    - real data
+    - synthetic data
+    - metadata
+
+    Output:
+    - discretized real data
+    - discretized synthetic data
+    - updated metadata
+    """
+    # Setup
+    real_data = pd.DataFrame({
+        'col1': [1, 2, 3],
+        'col2': ['a', 'b', 'c'],
+        'col3': [datetime(2020, 1, 2), datetime(2019, 10, 1), datetime(2021, 3, 2)],
+        'col4': [True, False, True],
+        'col5': [date(2020, 1, 2), date(2010, 10, 12), date(2021, 1, 2)],
+    })
+    synthetic_data = pd.DataFrame({
+        'col1': [3, 1, 4],
+        'col2': ['c', 'a', 'c'],
+        'col3': [datetime(2021, 3, 2), datetime(2018, 11, 2), datetime(2020, 5, 7)],
+        'col4': [False, False, True],
+        'col5': [date(2020, 5, 3), date(2015, 11, 15), date(2022, 3, 2)],
+    })
+    metadata = {
+        'fields': {
+            'col1': {'sdtype': 'numerical'},
+            'col2': {'sdtype': 'categorical'},
+            'col3': {'sdtype': 'datetime'},
+            'col4': {'sdtype': 'boolean'},
+            'col5': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
+        },
+    }
+
+    # Run
+    discretized_real, discretized_synth, updated_metadata = discretize_table_data(
+        real_data, synthetic_data, metadata)
+
+    # Assert
+    expected_real = pd.DataFrame({
+        'col1': [1, 6, 11],
+        'col2': ['a', 'b', 'c'],
+        'col3': [2, 1, 11],
+        'col4': [True, False, True],
+        'col5': [10, 1, 11],
+    })
+    expected_synth = pd.DataFrame({
+        'col1': [11, 1, 11],
+        'col2': ['c', 'a', 'c'],
+        'col3': [11, 0, 5],
+        'col4': [False, False, True],
+        'col5': [10, 5, 11],
+    })
+
+    pd.testing.assert_frame_equal(discretized_real, expected_real)
+    pd.testing.assert_frame_equal(discretized_synth, expected_synth)
+    assert updated_metadata == {
+        'fields': {
+            'col1': {'sdtype': 'categorical'},
+            'col2': {'sdtype': 'categorical'},
+            'col3': {'sdtype': 'categorical'},
+            'col4': {'sdtype': 'boolean'},
+            'col5': {'sdtype': 'categorical'},
+        },
+    }
+
+
 @patch('sdmetrics.reports.utils.discretize_table_data')
 def test_discretize_and_apply_metric(discretize_table_data_mock):
     """Test the ``discretize_and_apply_metric`` method.

diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
@@ -6,7 +6,7 @@
 
 from sdmetrics.utils import (
     HyperTransformer, get_alternate_keys, get_cardinality_distribution, get_columns_from_metadata,
-    get_type_from_column_meta)
+    get_missing_percentage, get_type_from_column_meta)
 
 
 def test_get_cardinality_distribution():
@@ -30,6 +30,25 @@ def test_get_cardinality_distribution():
     assert cardinality_distribution.to_list() == [2.0, 0.0, 1.0, 3.0, 1.0]
 
 
+def test_get_missing_percentage():
+    """Test the ``get_missing_percentage`` utility function.
+
+    Input:
+    - test column
+
+    Output:
+    - the expected percentage of NaN inside the column.
+    """
+    # Setup
+    column = pd.Series([1, 2, 3, np.nan, 5, 6, np.nan])
+
+    # Run
+    percentage_nan = get_missing_percentage(column)
+
+    # Assert
+    assert percentage_nan == 28.57
+
+
 def test_get_columns_from_metadata():
     """Test the ``get_columns_from_metadata`` method with current metadata format.