Skip to content

Commit

Permalink
make release-tag: Merge branch 'master' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Apr 12, 2023
2 parents 30a1729 + dd75dce commit 571a421
Show file tree
Hide file tree
Showing 24 changed files with 500 additions and 87 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Integration Tests

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
unit:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Style Checks

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
lint:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/minimum.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Unit Tests Minimum Versions

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
minimum:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/readme.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Test README

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
readme:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/unit.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Unit Tests

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
unit:
Expand Down
17 changes: 16 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
# History

## v0.9.2 - 2023-03-07
## v0.9.3 - 2023-04-12

This release improves the clarity of warning/error messages. We also add functionality to discover version add-ons, update the workflows to optimize the runtime, and fix a bug in the `NewRowSynthesis` metric when computing the `synthetic_sample_size` for multi-table.

### New Features
* Add functionality to find version add-on - Issue [#321](https://github.com/sdv-dev/SDMetrics/issues/321) by @frances-h
* More detailed warning in QualityReport when there is a constant input - Issue [#316](https://github.com/sdv-dev/SDMetrics/issues/316) by @pvk-developer
* Make error more informative in QualityReport when tables cannot be merged - Issue [#317](https://github.com/sdv-dev/SDMetrics/issues/317) by @frances-h
* More detailed warning in QualityReport for unexpected category values - Issue [#315](https://github.com/sdv-dev/SDMetrics/issues/315) by @frances-h

### Bug Fixes
* Multi table DiagnosticReport sets synthetic_sample_size too low for NewRowSynthesis - Issue [#320](https://github.com/sdv-dev/SDMetrics/issues/320) by @pvk-developer


## v0.9.2 - 2023-03-08

This release fixes bugs in the `NewRowSynthesis` metric when too many columns were present. It also fixes bugs around datetime columns that are formatted as strings in both `get_column_pair_plot` and `get_column_plot`.

### Bug Fixes
Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% set version = '0.9.2' %}
{% set version = '0.9.3.dev1' %}

package:
name: "{{ name|lower }}"
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@

__author__ = 'MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__version__ = '0.9.2'
__version__ = '0.9.3.dev1'

import pandas as pd

from sdmetrics import (
column_pairs, demos, goal, multi_table, single_column, single_table, timeseries)
from sdmetrics._addons import _find_addons
from sdmetrics.demos import load_demo

__all__ = [
Expand All @@ -23,6 +24,8 @@
'timeseries',
]

_find_addons(group='sdmetrics_modules', parent_globals=globals())


def compute_metrics(metrics, real_data, synthetic_data, metadata=None, **kwargs):
"""Compute a collection of metrics on the given data.
Expand Down
26 changes: 26 additions & 0 deletions sdmetrics/_addons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""SDMetrics add-ons functionality."""
import warnings

from pkg_resources import iter_entry_points


def _find_addons(group, parent_globals):
"""Find and load add-ons based on the given group.
Args:
group (str):
The name of the entry points group to load.
parent_globals (dict):
The caller's global scope. Modules will be added
to the parent's global scope through their name.
"""
for entry_point in iter_entry_points(group=group):
try:
module = entry_point.load()
except Exception:
msg = f'Failed to load "{entry_point.name}" from "{entry_point.module}".'
warnings.warn(msg)
continue

if entry_point.name not in parent_globals:
parent_globals[entry_point.name] = module
29 changes: 24 additions & 5 deletions sdmetrics/column_pairs/statistical/correlation_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,21 @@ class CorrelationSimilarity(ColumnPairsMetric):
min_value = 0.0
max_value = 1.0

@staticmethod
def _generate_warning_msg(columns, prefix, warning_messages):
if len(columns) > 1:
cols = ', '.join(columns)
warning_messages.append(
f"The {prefix} in columns '{cols}' contain a constant value. "
'Correlation is undefined for constant data.'
)

elif len(columns):
warning_messages.append(
f"The {prefix} in column '{columns[0]}' contains a constant value. "
'Correlation is undefined for constant data.'
)

@classmethod
def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
"""Compare the breakdown of correlation similarity of two continuous columns.
Expand All @@ -53,11 +68,15 @@ def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
synthetic_data = pd.DataFrame(synthetic_data)

if (real_data.nunique() == 1).any() or (synthetic_data.nunique() == 1).any():
msg = (
'One or both of the input arrays is constant. '
'The CorrelationSimilarity metric is either undefined or infinite.'
)
warnings.warn(ConstantInputWarning(msg))
warning_messages = []
real_columns = list(real_data.loc[:, real_data.nunique() == 1].columns)
synthetic_columns = list(synthetic_data.loc[:, synthetic_data.nunique() == 1].columns)
cls._generate_warning_msg(real_columns, 'real data', warning_messages)
cls._generate_warning_msg(synthetic_columns, 'synthetic data', warning_messages)

for msg in warning_messages:
warnings.warn(ConstantInputWarning(msg))

return {'score': np.nan}

real_data = real_data.dropna()
Expand Down
43 changes: 30 additions & 13 deletions sdmetrics/multi_table/multi_single_table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""MultiTable metrics based on applying SingleTable metrics on all the tables."""

import warnings
from collections import defaultdict

import numpy as np
Expand All @@ -8,6 +9,7 @@
from sdmetrics.errors import IncomputableMetricError
from sdmetrics.multi_table.base import MultiTableMetric
from sdmetrics.utils import nested_attrs_meta
from sdmetrics.warnings import SDMetricsWarning


class MultiSingleTableMetric(MultiTableMetric, metaclass=nested_attrs_meta('single_table_metric')):
Expand Down Expand Up @@ -37,6 +39,17 @@ def __init__(self, single_table_metric):
self.single_table_metric = single_table_metric
self.compute = self._compute

@staticmethod
def _multitable_warning(caught_warnings, table_name):
    """Re-emit caught SDMetrics warnings with the table name injected.

    Non-SDMetrics warnings are dropped. For each ``SDMetricsWarning`` the
    single-table phrasing ("The real/synthetic data in ...") is rewritten to
    mention which table produced it, then the warning is raised again with
    the same category.

    Args:
        caught_warnings (list[warnings.WarningMessage]):
            Warnings recorded while computing a single table's metric.
        table_name (str):
            Name of the table the warnings originated from.
    """
    for caught in caught_warnings:
        if not issubclass(caught.category, SDMetricsWarning):
            continue

        text = str(caught.message)
        for lead in ('The real data in', 'The synthetic data in'):
            # Drop the trailing ' in' from the lead-in and splice the
            # table name: "The real data in table '<name>', ..."
            text = text.replace(lead, f"{lead[:-3]} in table '{table_name}',")

        warnings.warn(caught.category(text))

def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
"""Compute this metric.
Expand All @@ -49,7 +62,7 @@ def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_data (dict[str, pandas.DataFrame]):
The tables from the synthetic dataset.
metadata (dict):
Multi-table metadata dict. If not passed, it is build based on the
Multi-table metadata dict. If not passed, it is built based on the
real_data fields and dtypes.
**kwargs:
Any additional keyword arguments will be passed down
Expand All @@ -73,16 +86,20 @@ def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_table = synthetic_data[table_name]
table_meta = metadata['tables'][table_name]

try:
score_breakdown = self.single_table_metric.compute_breakdown(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score_breakdown
except AttributeError:
score = self.single_table_metric.compute(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score
except Exception as error:
errors[table_name] = error
with warnings.catch_warnings(record=True) as caught_warnings:
try:
score_breakdown = self.single_table_metric.compute_breakdown(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score_breakdown
except AttributeError:
score = self.single_table_metric.compute(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score
except Exception as error:
errors[table_name] = error

if caught_warnings:
self._multitable_warning(caught_warnings, table_name)

if not scores:
raise IncomputableMetricError(f'Encountered the following errors: {errors}')
Expand All @@ -102,7 +119,7 @@ def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_data (dict[str, pandas.DataFrame]):
The tables from the synthetic dataset.
metadata (dict):
Multi-table metadata dict. If not passed, it is build based on the
Multi-table metadata dict. If not passed, it is built based on the
real_data fields and dtypes.
**kwargs:
Any additional keyword arguments will be passed down
Expand Down Expand Up @@ -141,7 +158,7 @@ def compute_breakdown(cls, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_data (dict[str, pandas.DataFrame]):
The tables from the synthetic dataset.
metadata (dict):
Multi-table metadata dict. If not passed, it is build based on the
Multi-table metadata dict. If not passed, it is built based on the
real_data fields and dtypes.
**kwargs:
Any additional keyword arguments will be passed down
Expand Down
8 changes: 4 additions & 4 deletions sdmetrics/reports/multi_table/diagnostic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from sdmetrics.reports.single_table.plot_utils import (
get_column_boundaries_plot, get_column_coverage_plot, get_synthesis_plot)
from sdmetrics.reports.utils import (
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level)
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level,
validate_multi_table_inputs)


class DiagnosticReport():
Expand Down Expand Up @@ -82,16 +83,15 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_multi_table_inputs(real_data, synthetic_data, metadata)

metadata = metadata.copy()
if 'relationships' in metadata:
for rel in metadata['relationships']:
table_meta = metadata['tables'][rel['child_table_name']]
table_meta['columns'][rel['child_foreign_key']] = {'sdtype': 'id'}

metrics = list(itertools.chain.from_iterable(self.METRICS.values()))
self._metric_args['NewRowSynthesis']['synthetic_sample_size'] = min(
len(real_data), self._metric_args['NewRowSynthesis']['synthetic_sample_size'])

for metric in tqdm.tqdm(metrics, desc='Creating report', disable=(not verbose)):
metric_name = metric.__name__
try:
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/reports/multi_table/quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
TVComplement)
from sdmetrics.reports.multi_table.plot_utils import get_table_relationships_plot
from sdmetrics.reports.single_table.plot_utils import get_column_pairs_plot, get_column_shapes_plot
from sdmetrics.reports.utils import aggregate_metric_results, discretize_and_apply_metric
from sdmetrics.reports.utils import (
aggregate_metric_results, discretize_and_apply_metric, validate_multi_table_inputs)


class QualityReport():
Expand Down Expand Up @@ -71,6 +72,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_multi_table_inputs(real_data, synthetic_data, metadata)

metadata = metadata.copy()
if 'relationships' in metadata:
for rel in metadata['relationships']:
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/reports/single_table/diagnostic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from sdmetrics.reports.single_table.plot_utils import (
get_column_boundaries_plot, get_column_coverage_plot, get_synthesis_plot)
from sdmetrics.reports.utils import (
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level)
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level,
validate_single_table_inputs)
from sdmetrics.single_table import (
BoundaryAdherence, CategoryCoverage, NewRowSynthesis, RangeCoverage)

Expand Down Expand Up @@ -80,6 +81,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_single_table_inputs(real_data, synthetic_data, metadata)

metrics = list(itertools.chain.from_iterable(self.METRICS.values()))
self._metric_args['NewRowSynthesis']['synthetic_sample_size'] = min(
min(len(real_data), len(synthetic_data)),
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/reports/single_table/quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

from sdmetrics.errors import IncomputableMetricError
from sdmetrics.reports.single_table.plot_utils import get_column_pairs_plot, get_column_shapes_plot
from sdmetrics.reports.utils import aggregate_metric_results, discretize_and_apply_metric
from sdmetrics.reports.utils import (
aggregate_metric_results, discretize_and_apply_metric, validate_single_table_inputs)
from sdmetrics.single_table import (
ContingencySimilarity, CorrelationSimilarity, KSComplement, TVComplement)

Expand Down Expand Up @@ -67,6 +68,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_single_table_inputs(real_data, synthetic_data, metadata)

metrics = list(itertools.chain.from_iterable(self.METRICS.values()))

for metric in tqdm.tqdm(metrics, desc='Creating report', disable=(not verbose)):
Expand Down
Loading

0 comments on commit 571a421

Please sign in to comment.