Skip to content

Commit

Permalink
make release-tag: Merge branch 'master' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
frances-h committed Feb 17, 2023
2 parents bb7a3be + 5390ef8 commit cffb12e
Show file tree
Hide file tree
Showing 9 changed files with 129 additions and 13 deletions.
8 changes: 8 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# History

## v0.9.1 - 2023-02-17

This release fixes bugs in the existing metrics and reports.

### Bug Fixes
* Fix the missing value percentage reported for synthetic data in the discrete and continuous column plots - Issue [#296](https://github.com/sdv-dev/SDMetrics/issues/296) by @R-Palazzo
* Support new metadata for datetime_format - Issue [#303](https://github.com/sdv-dev/SDMetrics/issues/303) by @frances-h

## v0.9.0 - 2023-01-18

This release supports Python 3.10 and drops support for Python 3.6. We also add a verbosity argument to report generation.
Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% set version = '0.9.0' %}
{% set version = '0.9.1.dev1' %}

package:
name: "{{ name|lower }}"
Expand Down
2 changes: 1 addition & 1 deletion sdmetrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

__author__ = 'MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__version__ = '0.9.0'
__version__ = '0.9.1.dev1'

import pandas as pd

Expand Down
18 changes: 10 additions & 8 deletions sdmetrics/reports/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from pandas.core.tools.datetimes import _guess_datetime_format_for_array

from sdmetrics.utils import (
get_alternate_keys, get_columns_from_metadata, get_type_from_column_meta, is_datetime)
get_alternate_keys, get_columns_from_metadata, get_missing_percentage,
get_type_from_column_meta, is_datetime)

DATACEBO_DARK = '#000036'
DATACEBO_LIGHT = '#01E0C9'
Expand Down Expand Up @@ -85,8 +86,8 @@ def make_discrete_column_plot(real_column, synthetic_column, sdtype):
synthetic_data = pd.DataFrame({'values': synthetic_column.copy()})
synthetic_data['Data'] = 'Synthetic'

missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2)
missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2)
missing_data_real = get_missing_percentage(real_column)
missing_data_synthetic = get_missing_percentage(synthetic_column)

all_data = pd.concat([real_data, synthetic_data], axis=0, ignore_index=True)

Expand Down Expand Up @@ -153,8 +154,8 @@ def make_continuous_column_plot(real_column, synthetic_column, sdtype):
plotly.graph_objects._figure.Figure
"""
column_name = real_column.name if hasattr(real_column, 'name') else ''
missing_data_real = round((real_column.isna().sum() / len(real_column)) * 100, 2)
missing_data_synthetic = round((synthetic_column.isna().sum() / len(synthetic_column)), 2)
missing_data_real = get_missing_percentage(real_column)
missing_data_synthetic = get_missing_percentage(synthetic_column)

real_data = real_column.dropna()
synthetic_data = synthetic_column.dropna()
Expand Down Expand Up @@ -473,9 +474,10 @@ def discretize_table_data(real_data, synthetic_data, metadata):
real_col = real_data[field_name]
synthetic_col = synthetic_data[field_name]
if field_type == 'datetime':
if real_col.dtype == 'O' and field_meta.get('format', ''):
real_col = pd.to_datetime(real_col, format=field_meta['format'])
synthetic_col = pd.to_datetime(synthetic_col, format=field_meta['format'])
datetime_format = field_meta.get('format') or field_meta.get('datetime_format')
if real_col.dtype == 'O' and datetime_format:
real_col = pd.to_datetime(real_col, format=datetime_format)
synthetic_col = pd.to_datetime(synthetic_col, format=datetime_format)

real_col = pd.to_numeric(real_col)
synthetic_col = pd.to_numeric(synthetic_col)
Expand Down
14 changes: 14 additions & 0 deletions sdmetrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,20 @@ def get_frequencies(real, synthetic):
return f_obs, f_exp


def get_missing_percentage(data_column):
    """Compute the missing value percentage of a column.

    Args:
        data_column (pandas.Series):
            The data of the desired column.

    Returns:
        float:
            Percentage (between 0 and 100) of missing values inside the column,
            rounded to two decimal places. ``nan`` if the column is empty.
    """
    total = len(data_column)
    if total == 0:
        # With no rows the ratio is undefined; return NaN explicitly rather than
        # dividing by zero.
        return float('nan')

    return round((data_column.isna().sum() / total) * 100, 2)


def get_cardinality_distribution(parent_column, child_column):
"""Compute the cardinality distribution of the (parent, child) pairing.
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.9.0
current_version = 0.9.1.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/sdv-dev/SDMetrics',
version='0.9.0',
version='0.9.1.dev1',
zip_safe=False,
)
73 changes: 73 additions & 0 deletions tests/unit/reports/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,6 +898,79 @@ def test_discretize_table_data():
}


def test_discretize_table_data_new_metadata():
    """Test the ``discretize_table_data`` method with new metadata.

    The metadata describes each column with an ``sdtype`` entry and one of the
    datetime columns also carries a ``datetime_format`` entry. Expect that
    numerical and datetime fields are discretized.

    Input:
    - real data
    - synthetic data
    - metadata

    Output:
    - discretized real data
    - discretized synthetic data
    - updated metadata
    """
    # Setup
    real = pd.DataFrame({
        'col1': [1, 2, 3],
        'col2': ['a', 'b', 'c'],
        'col3': [datetime(2020, 1, 2), datetime(2019, 10, 1), datetime(2021, 3, 2)],
        'col4': [True, False, True],
        'col5': [date(2020, 1, 2), date(2010, 10, 12), date(2021, 1, 2)],
    })
    synth = pd.DataFrame({
        'col1': [3, 1, 4],
        'col2': ['c', 'a', 'c'],
        'col3': [datetime(2021, 3, 2), datetime(2018, 11, 2), datetime(2020, 5, 7)],
        'col4': [False, False, True],
        'col5': [date(2020, 5, 3), date(2015, 11, 15), date(2022, 3, 2)],
    })
    fields = {
        'col1': {'sdtype': 'numerical'},
        'col2': {'sdtype': 'categorical'},
        'col3': {'sdtype': 'datetime'},
        'col4': {'sdtype': 'boolean'},
        'col5': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
    }
    table_metadata = {'fields': fields}

    # Run
    result = discretize_table_data(real, synth, table_metadata)
    real_out, synth_out, metadata_out = result

    # Assert
    real_expected = pd.DataFrame({
        'col1': [1, 6, 11],
        'col2': ['a', 'b', 'c'],
        'col3': [2, 1, 11],
        'col4': [True, False, True],
        'col5': [10, 1, 11],
    })
    synth_expected = pd.DataFrame({
        'col1': [11, 1, 11],
        'col2': ['c', 'a', 'c'],
        'col3': [11, 0, 5],
        'col4': [False, False, True],
        'col5': [10, 5, 11],
    })
    # Numerical and datetime columns are expected to be relabelled as
    # categorical; categorical and boolean columns keep their sdtype.
    metadata_expected = {
        'fields': {
            'col1': {'sdtype': 'categorical'},
            'col2': {'sdtype': 'categorical'},
            'col3': {'sdtype': 'categorical'},
            'col4': {'sdtype': 'boolean'},
            'col5': {'sdtype': 'categorical'},
        },
    }

    pd.testing.assert_frame_equal(real_out, real_expected)
    pd.testing.assert_frame_equal(synth_out, synth_expected)
    assert metadata_out == metadata_expected


@patch('sdmetrics.reports.utils.discretize_table_data')
def test_discretize_and_apply_metric(discretize_table_data_mock):
"""Test the ``discretize_and_apply_metric`` method.
Expand Down
21 changes: 20 additions & 1 deletion tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from sdmetrics.utils import (
HyperTransformer, get_alternate_keys, get_cardinality_distribution, get_columns_from_metadata,
get_type_from_column_meta)
get_missing_percentage, get_type_from_column_meta)


def test_get_cardinality_distribution():
Expand All @@ -30,6 +30,25 @@ def test_get_cardinality_distribution():
assert cardinality_distribution.to_list() == [2.0, 0.0, 1.0, 3.0, 1.0]


def test_get_missing_percentage():
    """Test the ``get_missing_percentage`` utility function.

    Input:
    - a column of seven values, two of which are NaN

    Output:
    - the percentage of NaN values inside the column, rounded to two decimals.
    """
    # Setup
    values = [1, 2, 3, np.nan, 5, 6, np.nan]
    test_column = pd.Series(values)

    # Run
    result = get_missing_percentage(test_column)

    # Assert
    expected_percentage = 28.57
    assert result == expected_percentage


def test_get_columns_from_metadata():
"""Test the ``get_columns_from_metadata`` method with current metadata format.
Expand Down

0 comments on commit cffb12e

Please sign in to comment.