From c9967494126e6273d3d97ebf8c1b045861a3f126 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Wed, 18 Jan 2023 15:58:48 -0500
Subject: [PATCH 1/5] =?UTF-8?q?Bump=20version:=200.9.0=20=E2=86=92=200.9.1?=
 =?UTF-8?q?.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 conda/meta.yaml       | 2 +-
 sdmetrics/__init__.py | 2 +-
 setup.cfg             | 2 +-
 setup.py              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 3d3549ee..cdffb260 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = '0.9.0' %}
+{% set version = '0.9.1.dev0' %}
 
 package:
   name: "{{ name|lower }}"
diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py
index cfe69936..98649d46 100644
--- a/sdmetrics/__init__.py
+++ b/sdmetrics/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.9.0'
+__version__ = '0.9.1.dev0'
 
 import pandas as pd
 
diff --git a/setup.cfg b/setup.cfg
index 8ccf2a59..be9185fe 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.9.0
+current_version = 0.9.1.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 1e2d5d4a..a9174838 100644
--- a/setup.py
+++ b/setup.py
@@ -125,6 +125,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sdv-dev/SDMetrics',
-    version='0.9.0',
+    version='0.9.1.dev0',
     zip_safe=False,
 )

From 35d856ff67a6f67bfc9e0f17597f02b52106ffeb Mon Sep 17 00:00:00 2001
From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com>
Date: Wed, 15 Feb 2023 19:56:06 +0100
Subject: [PATCH 2/5] fix issue-296 for discrete and continuous columns (#304)

* fix issue-296 for discrete and continuous

* update flake8 test

* add method and unit test

* fix style check

* add blank line

* style

* improve style

* rounding part inside get_missing_percentage
---
 sdmetrics/reports/utils.py | 11 ++++++-----
 sdmetrics/utils.py         | 14 ++++++++++++++
 tests/unit/test_utils.py   | 21 ++++++++++++++++++++-
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/sdmetrics/reports/utils.py b/sdmetrics/reports/utils.py
index 27c0347b..ea2ff57f 100644
--- a/sdmetrics/reports/utils.py
+++ b/sdmetrics/reports/utils.py
@@ -10,7 +10,8 @@
 from pandas.core.tools.datetimes import _guess_datetime_format_for_array
 
 from sdmetrics.utils import (
-    get_alternate_keys, get_columns_from_metadata, get_type_from_column_meta, is_datetime)
+    get_alternate_keys, get_columns_from_metadata, get_missing_percentage,
+    get_type_from_column_meta, is_datetime)
 
 DATACEBO_DARK = '#000036'
 DATACEBO_LIGHT = '#01E0C9'
@@ -85,8 +86,8 @@ def make_discrete_column_plot(real_column, synthetic_column, sdtype):
     synthetic_data = pd.DataFrame({'values': synthetic_column.copy()})
     synthetic_data['Data'] = 'Synthetic'
 
-    missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2)
-    missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2)
+    missing_data_real = get_missing_percentage(real_column)
+    missing_data_synthetic = get_missing_percentage(synthetic_column)
 
     all_data = pd.concat([real_data, synthetic_data], axis=0, ignore_index=True)
 
@@ -153,8 +154,8 @@ def make_continuous_column_plot(real_column, synthetic_column, sdtype):
         plotly.graph_objects._figure.Figure
     """
     column_name = real_column.name if hasattr(real_column, 'name') else ''
-    missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2)
-    missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2)
+    missing_data_real = get_missing_percentage(real_column)
+    missing_data_synthetic = get_missing_percentage(synthetic_column)
 
     real_data = real_column.dropna()
     synthetic_data = synthetic_column.dropna()
diff --git a/sdmetrics/utils.py b/sdmetrics/utils.py
index e1c1bfca..532d73ae 100644
--- a/sdmetrics/utils.py
+++ b/sdmetrics/utils.py
@@ -74,6 +74,20 @@ def get_frequencies(real, synthetic):
     return f_obs, f_exp
 
 
+def get_missing_percentage(data_column):
+    """Compute the missing value percentage of a column.
+
+    Args:
+        data_column (pandas.Series):
+            The data of the desired column.
+
+    Returns:
+        float:
+            Percentage (0-100) of missing values in the column, rounded to 2 decimals.
+    """
+    return round((data_column.isna().sum() / len(data_column)) * 100, 2)
+
+
 def get_cardinality_distribution(parent_column, child_column):
     """Compute the cardinality distribution of the (parent, child) pairing.
 
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 1c9218d0..07fec083 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -6,7 +6,7 @@
 
 from sdmetrics.utils import (
     HyperTransformer, get_alternate_keys, get_cardinality_distribution, get_columns_from_metadata,
-    get_type_from_column_meta)
+    get_missing_percentage, get_type_from_column_meta)
 
 
 def test_get_cardinality_distribution():
@@ -30,6 +30,25 @@ def test_get_cardinality_distribution():
     assert cardinality_distribution.to_list() == [2.0, 0.0, 1.0, 3.0, 1.0]
 
 
+def test_get_missing_percentage():
+    """Test the ``get_missing_percentage`` utility function.
+
+    Input:
+    - test column
+
+    Output:
+    - the expected percentage of NaN inside the column.
+    """
+    # Setup
+    column = pd.Series([1, 2, 3, np.nan, 5, 6, np.nan])
+
+    # Run
+    percentage_nan = get_missing_percentage(column)
+
+    # Assert
+    assert percentage_nan == 28.57
+
+
 def test_get_columns_from_metadata():
     """Test the ``get_columns_from_metadata`` method with current metadata format.
 

From ef3c813b432694535373c21f07dc05225e013e58 Mon Sep 17 00:00:00 2001
From: Frances Hartwell <franceshartwell09@gmail.com>
Date: Thu, 16 Feb 2023 13:20:50 -0500
Subject: [PATCH 3/5] Support new metadata for `datetime_format` (#306)

* fix ValueError with datetime and new metadata

* fix lint
---
 sdmetrics/reports/utils.py       |  7 +--
 tests/unit/reports/test_utils.py | 73 ++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/sdmetrics/reports/utils.py b/sdmetrics/reports/utils.py
index ea2ff57f..ddaa898f 100644
--- a/sdmetrics/reports/utils.py
+++ b/sdmetrics/reports/utils.py
@@ -474,9 +474,10 @@ def discretize_table_data(real_data, synthetic_data, metadata):
             real_col = real_data[field_name]
             synthetic_col = synthetic_data[field_name]
             if field_type == 'datetime':
-                if real_col.dtype == 'O' and field_meta.get('format', ''):
-                    real_col = pd.to_datetime(real_col, format=field_meta['format'])
-                    synthetic_col = pd.to_datetime(synthetic_col, format=field_meta['format'])
+                datetime_format = field_meta.get('format') or field_meta.get('datetime_format')
+                if real_col.dtype == 'O' and datetime_format:
+                    real_col = pd.to_datetime(real_col, format=datetime_format)
+                    synthetic_col = pd.to_datetime(synthetic_col, format=datetime_format)
 
                 real_col = pd.to_numeric(real_col)
                 synthetic_col = pd.to_numeric(synthetic_col)
diff --git a/tests/unit/reports/test_utils.py b/tests/unit/reports/test_utils.py
index afbffd95..5de4bf7b 100644
--- a/tests/unit/reports/test_utils.py
+++ b/tests/unit/reports/test_utils.py
@@ -898,6 +898,79 @@ def test_discretize_table_data():
     }
 
 
+def test_discretize_table_data_new_metadata():
+    """Test the ``discretize_table_data`` method with new metadata.
+
+    Expect that numerical and datetime fields are discretized.
+
+    Input:
+    - real data
+    - synthetic data
+    - metadata
+
+    Output:
+    - discretized real data
+    - discretized synthetic data
+    - updated metadata
+    """
+    # Setup
+    real_data = pd.DataFrame({
+        'col1': [1, 2, 3],
+        'col2': ['a', 'b', 'c'],
+        'col3': [datetime(2020, 1, 2), datetime(2019, 10, 1), datetime(2021, 3, 2)],
+        'col4': [True, False, True],
+        'col5': [date(2020, 1, 2), date(2010, 10, 12), date(2021, 1, 2)],
+    })
+    synthetic_data = pd.DataFrame({
+        'col1': [3, 1, 4],
+        'col2': ['c', 'a', 'c'],
+        'col3': [datetime(2021, 3, 2), datetime(2018, 11, 2), datetime(2020, 5, 7)],
+        'col4': [False, False, True],
+        'col5': [date(2020, 5, 3), date(2015, 11, 15), date(2022, 3, 2)],
+    })
+    metadata = {
+        'fields': {
+            'col1': {'sdtype': 'numerical'},
+            'col2': {'sdtype': 'categorical'},
+            'col3': {'sdtype': 'datetime'},
+            'col4': {'sdtype': 'boolean'},
+            'col5': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
+        },
+    }
+
+    # Run
+    discretized_real, discretized_synth, updated_metadata = discretize_table_data(
+        real_data, synthetic_data, metadata)
+
+    # Assert
+    expected_real = pd.DataFrame({
+        'col1': [1, 6, 11],
+        'col2': ['a', 'b', 'c'],
+        'col3': [2, 1, 11],
+        'col4': [True, False, True],
+        'col5': [10, 1, 11],
+    })
+    expected_synth = pd.DataFrame({
+        'col1': [11, 1, 11],
+        'col2': ['c', 'a', 'c'],
+        'col3': [11, 0, 5],
+        'col4': [False, False, True],
+        'col5': [10, 5, 11],
+    })
+
+    pd.testing.assert_frame_equal(discretized_real, expected_real)
+    pd.testing.assert_frame_equal(discretized_synth, expected_synth)
+    assert updated_metadata == {
+        'fields': {
+            'col1': {'sdtype': 'categorical'},
+            'col2': {'sdtype': 'categorical'},
+            'col3': {'sdtype': 'categorical'},
+            'col4': {'sdtype': 'boolean'},
+            'col5': {'sdtype': 'categorical'},
+        },
+    }
+
+
 @patch('sdmetrics.reports.utils.discretize_table_data')
 def test_discretize_and_apply_metric(discretize_table_data_mock):
     """Test the ``discretize_and_apply_metric`` method.

From f05146651fc508213977af94fddf586f6a1b3d11 Mon Sep 17 00:00:00 2001
From: Frances Hartwell <frances.hartwell@alteryx.com>
Date: Thu, 16 Feb 2023 16:37:43 -0500
Subject: [PATCH 4/5] =?UTF-8?q?Bump=20version:=200.9.1.dev0=20=E2=86=92=20?=
 =?UTF-8?q?0.9.1.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 conda/meta.yaml       | 2 +-
 sdmetrics/__init__.py | 2 +-
 setup.cfg             | 2 +-
 setup.py              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index cdffb260..d6640646 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = '0.9.1.dev0' %}
+{% set version = '0.9.1.dev1' %}
 
 package:
   name: "{{ name|lower }}"
diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py
index 98649d46..89e4a4d5 100644
--- a/sdmetrics/__init__.py
+++ b/sdmetrics/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.9.1.dev0'
+__version__ = '0.9.1.dev1'
 
 import pandas as pd
 
diff --git a/setup.cfg b/setup.cfg
index be9185fe..87031d5b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.9.1.dev0
+current_version = 0.9.1.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index a9174838..dee1c5dc 100644
--- a/setup.py
+++ b/setup.py
@@ -125,6 +125,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sdv-dev/SDMetrics',
-    version='0.9.1.dev0',
+    version='0.9.1.dev1',
     zip_safe=False,
 )

From 5390ef898ebc4508eaaa8ffc41551c74a4f03769 Mon Sep 17 00:00:00 2001
From: Frances Hartwell <frances@datacebo.com>
Date: Fri, 17 Feb 2023 13:53:53 -0500
Subject: [PATCH 5/5] update history for v0.9.1 (#308)

---
 HISTORY.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 1ad7bd05..89257a1b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,13 @@
 # History
 
+## v0.9.1 - 2023-02-17
+
+This release fixes bugs in the existing metrics and reports.
+
+### Bug Fixes
+* Fix issue-296 for discrete and continuous columns - Issue [#296](https://github.com/sdv-dev/SDMetrics/issues/296) by @R-Palazzo
+* Support new metadata for datetime_format - Issue [#303](https://github.com/sdv-dev/SDMetrics/issues/303) by @frances-h
+
 ## v0.9.0 - 2023-01-18
 
 This release supports Python 3.10 and drops support for Python 3.6. We also add a verbosity argument to report generation.