Skip to content

Commit

Permalink
make release-tag: Merge branch 'master' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Apr 12, 2023
2 parents 30a1729 + dd75dce commit 571a421
Show file tree
Hide file tree
Showing 24 changed files with 500 additions and 87 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Integration Tests

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
unit:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Style Checks

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
lint:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/minimum.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Unit Tests Minimum Versions

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
minimum:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/readme.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Test README

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
readme:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/unit.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Unit Tests

on:
- push
- pull_request
push:
pull_request:
types: [opened, reopened]

jobs:
unit:
Expand Down
17 changes: 16 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
# History

## v0.9.2 - 2023-03-07
## v0.9.3 - 2023-04-12

This release improves the clarity of warning/error messages. We also add functionality to discover version add-ons, update the workflows to optimize the runtime, and fix a bug in the `NewRowSynthesis` metric when computing the `synthetic_sample_size` for multi-table.

### New Features
* Add functionality to find version add-on - Issue [#321](https://github.com/sdv-dev/SDMetrics/issues/321) by @frances-h
* More detailed warning in QualityReport when there is a constant input - Issue [#316](https://github.com/sdv-dev/SDMetrics/issues/316) by @pvk-developer
* Make error more informative in QualityReport when tables cannot be merged - Issue [#317](https://github.com/sdv-dev/SDMetrics/issues/317) by @frances-h
* More detailed warning in QualityReport for unexpected category values - Issue [#315](https://github.com/sdv-dev/SDMetrics/issues/315) by @frances-h

### Bug Fixes
* Multi table DiagnosticReport sets synthetic_sample_size too low for NewRowSynthesis - Issue [#320](https://github.com/sdv-dev/SDMetrics/issues/320) by @pvk-developer


## v0.9.2 - 2023-03-08

This release fixes bugs in the `NewRowSynthesis` metric when too many columns were present. It also fixes bugs around datetime columns that are formatted as strings in both `get_column_pair_plot` and `get_column_plot`.

### Bug Fixes
Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% set version = '0.9.2' %}
{% set version = '0.9.3.dev1' %}

package:
name: "{{ name|lower }}"
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@

__author__ = 'MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__version__ = '0.9.2'
__version__ = '0.9.3.dev1'

import pandas as pd

from sdmetrics import (
column_pairs, demos, goal, multi_table, single_column, single_table, timeseries)
from sdmetrics._addons import _find_addons
from sdmetrics.demos import load_demo

__all__ = [
Expand All @@ -23,6 +24,8 @@
'timeseries',
]

_find_addons(group='sdmetrics_modules', parent_globals=globals())


def compute_metrics(metrics, real_data, synthetic_data, metadata=None, **kwargs):
"""Compute a collection of metrics on the given data.
Expand Down
26 changes: 26 additions & 0 deletions sdmetrics/_addons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""SDMetrics add-ons functionality."""
import warnings

from pkg_resources import iter_entry_points


def _find_addons(group, parent_globals):
"""Find and load add-ons based on the given group.
Args:
group (str):
The name of the entry points group to load.
parent_globals (dict):
The caller's global scope. Modules will be added
to the parent's global scope through their name.
"""
for entry_point in iter_entry_points(group=group):
try:
module = entry_point.load()
except Exception:
msg = f'Failed to load "{entry_point.name}" from "{entry_point.module}".'
warnings.warn(msg)
continue

if entry_point.name not in parent_globals:
parent_globals[entry_point.name] = module
29 changes: 24 additions & 5 deletions sdmetrics/column_pairs/statistical/correlation_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,21 @@ class CorrelationSimilarity(ColumnPairsMetric):
min_value = 0.0
max_value = 1.0

@staticmethod
def _generate_warning_msg(columns, prefix, warning_messages):
if len(columns) > 1:
cols = ', '.join(columns)
warning_messages.append(
f"The {prefix} in columns '{cols}' contain a constant value. "
'Correlation is undefined for constant data.'
)

elif len(columns):
warning_messages.append(
f"The {prefix} in column '{columns[0]}' contains a constant value. "
'Correlation is undefined for constant data.'
)

@classmethod
def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
"""Compare the breakdown of correlation similarity of two continuous columns.
Expand All @@ -53,11 +68,15 @@ def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
synthetic_data = pd.DataFrame(synthetic_data)

if (real_data.nunique() == 1).any() or (synthetic_data.nunique() == 1).any():
msg = (
'One or both of the input arrays is constant. '
'The CorrelationSimilarity metric is either undefined or infinite.'
)
warnings.warn(ConstantInputWarning(msg))
warning_messages = []
real_columns = list(real_data.loc[:, real_data.nunique() == 1].columns)
synthetic_columns = list(synthetic_data.loc[:, synthetic_data.nunique() == 1].columns)
cls._generate_warning_msg(real_columns, 'real data', warning_messages)
cls._generate_warning_msg(synthetic_columns, 'synthetic data', warning_messages)

for msg in warning_messages:
warnings.warn(ConstantInputWarning(msg))

return {'score': np.nan}

real_data = real_data.dropna()
Expand Down
43 changes: 30 additions & 13 deletions sdmetrics/multi_table/multi_single_table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""MultiTable metrics based on applying SingleTable metrics on all the tables."""

import warnings
from collections import defaultdict

import numpy as np
Expand All @@ -8,6 +9,7 @@
from sdmetrics.errors import IncomputableMetricError
from sdmetrics.multi_table.base import MultiTableMetric
from sdmetrics.utils import nested_attrs_meta
from sdmetrics.warnings import SDMetricsWarning


class MultiSingleTableMetric(MultiTableMetric, metaclass=nested_attrs_meta('single_table_metric')):
Expand Down Expand Up @@ -37,6 +39,17 @@ def __init__(self, single_table_metric):
self.single_table_metric = single_table_metric
self.compute = self._compute

@staticmethod
def _multitable_warning(caught_warnings, table_name):
    """Re-emit caught SDMetrics warnings with the table name injected.

    Non-SDMetrics warnings are dropped. For each ``SDMetricsWarning`` the
    single-table phrasing ("The real/synthetic data in ...") is rewritten to
    mention which table produced it, then the warning is raised again with
    the same category.

    Args:
        caught_warnings (list[warnings.WarningMessage]):
            Warnings recorded while computing a single table's metric.
        table_name (str):
            Name of the table the warnings originated from.
    """
    for caught in caught_warnings:
        if not issubclass(caught.category, SDMetricsWarning):
            continue

        text = str(caught.message)
        for lead in ('The real data in', 'The synthetic data in'):
            # Drop the trailing ' in' from the lead-in and splice the
            # table name: "The real data in table '<name>', ..."
            text = text.replace(lead, f"{lead[:-3]} in table '{table_name}',")

        warnings.warn(caught.category(text))

def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
"""Compute this metric.
Expand All @@ -49,7 +62,7 @@ def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_data (dict[str, pandas.DataFrame]):
The tables from the synthetic dataset.
metadata (dict):
Multi-table metadata dict. If not passed, it is build based on the
Multi-table metadata dict. If not passed, it is built based on the
real_data fields and dtypes.
**kwargs:
Any additional keyword arguments will be passed down
Expand All @@ -73,16 +86,20 @@ def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_table = synthetic_data[table_name]
table_meta = metadata['tables'][table_name]

try:
score_breakdown = self.single_table_metric.compute_breakdown(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score_breakdown
except AttributeError:
score = self.single_table_metric.compute(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score
except Exception as error:
errors[table_name] = error
with warnings.catch_warnings(record=True) as caught_warnings:
try:
score_breakdown = self.single_table_metric.compute_breakdown(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score_breakdown
except AttributeError:
score = self.single_table_metric.compute(
real_table, synthetic_table, table_meta, **kwargs)
scores[table_name] = score
except Exception as error:
errors[table_name] = error

if caught_warnings:
self._multitable_warning(caught_warnings, table_name)

if not scores:
raise IncomputableMetricError(f'Encountered the following errors: {errors}')
Expand All @@ -102,7 +119,7 @@ def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_data (dict[str, pandas.DataFrame]):
The tables from the synthetic dataset.
metadata (dict):
Multi-table metadata dict. If not passed, it is build based on the
Multi-table metadata dict. If not passed, it is built based on the
real_data fields and dtypes.
**kwargs:
Any additional keyword arguments will be passed down
Expand Down Expand Up @@ -141,7 +158,7 @@ def compute_breakdown(cls, real_data, synthetic_data, metadata=None, **kwargs):
synthetic_data (dict[str, pandas.DataFrame]):
The tables from the synthetic dataset.
metadata (dict):
Multi-table metadata dict. If not passed, it is build based on the
Multi-table metadata dict. If not passed, it is built based on the
real_data fields and dtypes.
**kwargs:
Any additional keyword arguments will be passed down
Expand Down
8 changes: 4 additions & 4 deletions sdmetrics/reports/multi_table/diagnostic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from sdmetrics.reports.single_table.plot_utils import (
get_column_boundaries_plot, get_column_coverage_plot, get_synthesis_plot)
from sdmetrics.reports.utils import (
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level)
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level,
validate_multi_table_inputs)


class DiagnosticReport():
Expand Down Expand Up @@ -82,16 +83,15 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_multi_table_inputs(real_data, synthetic_data, metadata)

metadata = metadata.copy()
if 'relationships' in metadata:
for rel in metadata['relationships']:
table_meta = metadata['tables'][rel['child_table_name']]
table_meta['columns'][rel['child_foreign_key']] = {'sdtype': 'id'}

metrics = list(itertools.chain.from_iterable(self.METRICS.values()))
self._metric_args['NewRowSynthesis']['synthetic_sample_size'] = min(
len(real_data), self._metric_args['NewRowSynthesis']['synthetic_sample_size'])

for metric in tqdm.tqdm(metrics, desc='Creating report', disable=(not verbose)):
metric_name = metric.__name__
try:
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/reports/multi_table/quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
TVComplement)
from sdmetrics.reports.multi_table.plot_utils import get_table_relationships_plot
from sdmetrics.reports.single_table.plot_utils import get_column_pairs_plot, get_column_shapes_plot
from sdmetrics.reports.utils import aggregate_metric_results, discretize_and_apply_metric
from sdmetrics.reports.utils import (
aggregate_metric_results, discretize_and_apply_metric, validate_multi_table_inputs)


class QualityReport():
Expand Down Expand Up @@ -71,6 +72,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_multi_table_inputs(real_data, synthetic_data, metadata)

metadata = metadata.copy()
if 'relationships' in metadata:
for rel in metadata['relationships']:
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/reports/single_table/diagnostic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from sdmetrics.reports.single_table.plot_utils import (
get_column_boundaries_plot, get_column_coverage_plot, get_synthesis_plot)
from sdmetrics.reports.utils import (
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level)
DIAGNOSTIC_REPORT_RESULT_DETAILS, aggregate_metric_results, print_results_for_level,
validate_single_table_inputs)
from sdmetrics.single_table import (
BoundaryAdherence, CategoryCoverage, NewRowSynthesis, RangeCoverage)

Expand Down Expand Up @@ -80,6 +81,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_single_table_inputs(real_data, synthetic_data, metadata)

metrics = list(itertools.chain.from_iterable(self.METRICS.values()))
self._metric_args['NewRowSynthesis']['synthetic_sample_size'] = min(
min(len(real_data), len(synthetic_data)),
Expand Down
5 changes: 4 additions & 1 deletion sdmetrics/reports/single_table/quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

from sdmetrics.errors import IncomputableMetricError
from sdmetrics.reports.single_table.plot_utils import get_column_pairs_plot, get_column_shapes_plot
from sdmetrics.reports.utils import aggregate_metric_results, discretize_and_apply_metric
from sdmetrics.reports.utils import (
aggregate_metric_results, discretize_and_apply_metric, validate_single_table_inputs)
from sdmetrics.single_table import (
ContingencySimilarity, CorrelationSimilarity, KSComplement, TVComplement)

Expand Down Expand Up @@ -67,6 +68,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
validate_single_table_inputs(real_data, synthetic_data, metadata)

metrics = list(itertools.chain.from_iterable(self.METRICS.values()))

for metric in tqdm.tqdm(metrics, desc='Creating report', disable=(not verbose)):
Expand Down
Loading

0 comments on commit 571a421

Please sign in to comment.