make release-tag: Merge branch 'main' into stable

amontanez24 committed Dec 5, 2023
2 parents 12e9082 + 66026b5 commit 27e9acd
Showing 26 changed files with 807 additions and 142 deletions.
24 changes: 24 additions & 0 deletions HISTORY.md
@@ -1,5 +1,29 @@
# Release Notes

## 1.8.0 - 2023-12-05

This release adds support for the new Diagnostic Report from SDMetrics. This report calculates scores for three basic but important properties of your data: data validity, data structure and, in the multi-table case, relationship validity. Data validity checks that the columns of your data are valid (e.g. correct range or values). Data structure checks that the synthetic data has the correct columns. Relationship validity checks that key references are correct and that the cardinality is within the ranges seen in the real data.
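As an illustration (not part of this diff), here is a minimal sketch of running the new report through SDV's single-table evaluation helper; the demo dataset, synthesizer choice and variable names are only examples:

```python
# Minimal sketch: run the new Diagnostic Report on a single-table demo.
from sdv.datasets.demo import download_demo
from sdv.evaluation.single_table import run_diagnostic
from sdv.single_table import GaussianCopulaSynthesizer

real_data, metadata = download_demo('single_table', 'fake_hotel_guests')

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(num_rows=len(real_data))

# For a single table, the report scores data validity and data structure.
report = run_diagnostic(real_data, synthetic_data, metadata)
print(report.get_properties())
```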

Additionally, a few bugs were fixed and synthesizer functionality was improved. It is now possible to access the loss values for the `TVAESynthesizer` and `CTGANSynthesizer` by using the `get_loss_values` method. The `get_parameters` method is now more detailed and returns all the parameters used to create a synthesizer. The metadata is now capable of detecting some common PII sdtypes. Finally, a bug that made every parent row generated by the `HMASynthesizer` have at least one child row was patched. This should improve cardinality.
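For example, a short sketch of the new accessors (assuming `real_data` and `metadata` are already defined):

```python
# Sketch: new accessors on a GAN-based synthesizer.
from sdv.single_table import CTGANSynthesizer

synthesizer = CTGANSynthesizer(metadata, epochs=100)
synthesizer.fit(real_data)

loss_values = synthesizer.get_loss_values()  # pd.DataFrame, one row per epoch
parameters = synthesizer.get_parameters()    # every instantiation parameter
```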

### Maintenance

* Address `SettingWithCopyWarning` (HMASynthesizer) - Issue [#1557](https://github.com/sdv-dev/SDV/issues/1557) by @pvk-developer
* Bump SDMetrics version - Issue [#1702](https://github.com/sdv-dev/SDV/issues/1702) by @amontanez24

### New Features

* Allow me to access loss values for GAN-based synthesizers - Issue [#1671](https://github.com/sdv-dev/SDV/issues/1671) by @frances-h
* Create a unified `get_parameters` method for all multi-table synthesizers - Issue [#1674](https://github.com/sdv-dev/SDV/issues/1674) by @frances-h
* Set credentials key as variables - Issue [#1680](https://github.com/sdv-dev/SDV/issues/1680) by @R-Palazzo
* Identifying PII Sdtypes in Metadata - Issue [#1683](https://github.com/sdv-dev/SDV/issues/1683) by @R-Palazzo
* Make SDV compatible with the latest SDMetrics - Issue [#1687](https://github.com/sdv-dev/SDV/issues/1687) by @fealho
* SingleTablePreset uses FrequencyEncoder - Issue [#1695](https://github.com/sdv-dev/SDV/issues/1695) by @fealho

### Bugs Fixed

* HMASynthesizer creates too much synthetic data (always creates a child for every parent row) - Issue [#1673](https://github.com/sdv-dev/SDV/issues/1673) by @frances-h

## 1.7.0 - 2023-11-16

This release adds an alert to the `CTGANSynthesizer` during preprocessing. The alert informs the user if the fitting of the synthesizer is likely to be slow on their schema. Additionally, it is now possible to enforce that sampled datetime values stay within the range of the fitted data!
2 changes: 1 addition & 1 deletion sdv/__init__.py
@@ -6,7 +6,7 @@

__author__ = 'DataCebo, Inc.'
__email__ = 'info@sdv.dev'
__version__ = '1.7.0'
__version__ = '1.8.0.dev1'


import sys
19 changes: 14 additions & 5 deletions sdv/datasets/demo.py
@@ -4,7 +4,6 @@
import json
import logging
import os
import urllib.request
from collections import defaultdict
from pathlib import Path
from zipfile import ZipFile
@@ -14,13 +13,15 @@
import pandas as pd
from botocore import UNSIGNED
from botocore.client import Config
from botocore.exceptions import ClientError

from sdv.metadata.multi_table import MultiTableMetadata
from sdv.metadata.single_table import SingleTableMetadata

LOGGER = logging.getLogger(__name__)
BUCKET = 'sdv-demo-datasets'
BUCKET_URL = 'https://sdv-demo-datasets.s3.amazonaws.com'
SIGNATURE_VERSION = UNSIGNED
METADATA_FILENAME = 'metadata.json'


@@ -38,19 +39,27 @@ def _validate_output_folder(output_folder_name):
)


def _get_data_from_bucket(object_key):
session = boto3.Session()
s3 = session.client('s3', config=Config(signature_version=SIGNATURE_VERSION))
response = s3.get_object(Bucket=BUCKET, Key=object_key)
return response['Body'].read()


def _download(modality, dataset_name):
dataset_url = f'{BUCKET_URL}/{modality.upper()}/{dataset_name}.zip'
object_key = f'{modality.upper()}/{dataset_name}.zip'
LOGGER.info(f'Downloading dataset {dataset_name} from {dataset_url}')
try:
response = urllib.request.urlopen(dataset_url)
except urllib.error.HTTPError:
file_content = _get_data_from_bucket(object_key)
except ClientError:
raise ValueError(
f"Invalid dataset name '{dataset_name}'. "
'Make sure you have the correct modality for the dataset name or '
"use 'get_available_demos' to get a list of demo datasets."
)

return io.BytesIO(response.read())
return io.BytesIO(file_content)


def _extract_data(bytes_io, output_folder_name):
@@ -162,7 +171,7 @@ def get_available_demos(modality):
* If ``modality`` is not ``'single_table'``, ``'multi_table'`` or ``'sequential'``.
"""
_validate_modalities(modality)
client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
client = boto3.client('s3', config=Config(signature_version=SIGNATURE_VERSION))
tables_info = defaultdict(list)
for item in client.list_objects(Bucket=BUCKET)['Contents']:
dataset_modality, dataset = item['Key'].split('/', 1)
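For context, a hedged sketch of the public helpers that exercise this unsigned-S3 code path (the dataset name is illustrative):

```python
# Sketch: public entry points that read from the bucket through the
# unsigned boto3 client above. The dataset name is illustrative.
from sdv.datasets.demo import download_demo, get_available_demos

print(get_available_demos(modality='single_table'))
data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests',
)
```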
5 changes: 0 additions & 5 deletions sdv/lite/single_table.py
@@ -4,7 +4,6 @@
import sys

import cloudpickle
import rdt.transformers

from sdv.single_table import GaussianCopulaSynthesizer

@@ -38,10 +37,6 @@ def _setup_fast_preset(self, metadata, locales):
enforce_rounding=False,
locales=locales
)
self._synthesizer._data_processor._update_transformers_by_sdtypes(
'categorical',
rdt.transformers.FrequencyEncoder(add_noise=True)
)

def __init__(self, metadata, name, locales=None):
if name not in PRESETS:
96 changes: 75 additions & 21 deletions sdv/metadata/single_table.py
@@ -52,6 +52,46 @@ class SingleTableMetadata:
'sequence_index',
'METADATA_SPEC_VERSION'
])

_REFERENCE_TO_SDTYPE = {
'phonenumber': 'phone_number',
'email': 'email',
'ssn': 'ssn',
'firstname': 'first_name',
'lastname': 'last_name',
'countrycode': 'country_code',
'administrativeunit': 'administrative_unit',
'state': 'administrative_unit',
'province': 'administrative_unit',
'stateabbr': 'state_abbr',
'city': 'city',
'postalcode': 'postcode',
'zipcode': 'postcode',
'postcode': 'postcode',
'streetaddress': 'street_address',
'line1': 'street_address',
'secondaryaddress': 'secondary_address',
'line2': 'secondary_address',
'latitude': 'latitude',
'longitude': 'longitude',
'ipv4': 'ipv4_address',
'ipv4address': 'ipv4_address',
'ipv6': 'ipv6_address',
'ipv6address': 'ipv6_address',
'ipaddress': 'ipv6_address',
'macaddress': 'mac_address',
'useragent': 'user_agent_string',
'useragentstring': 'user_agent_string',
'iban': 'iban',
'swift': 'swift11',
'swift11': 'swift11',
'swift8': 'swift8',
'creditcardnumber': 'credit_card_number',
'vin': 'vin',
'licenseplate': 'license_plate',
'license': 'license_plate',
}

METADATA_SPEC_VERSION = 'SINGLE_TABLE_V1'
_DEFAULT_SDTYPES = list(_SDTYPE_KWARGS) + list(SDTYPE_ANONYMIZERS)

@@ -250,6 +290,19 @@ def to_dict(self):

return deepcopy(metadata)

def _detect_pii_column(self, column_name):
"""Detect PII columns.
Args:
column_name (str):
The column name to be analyzed.
"""
cleaned_name = re.sub(r'[^a-zA-Z0-9]', '', column_name).lower()
return next((
sdtype for reference, sdtype in self._REFERENCE_TO_SDTYPE.items()
if reference in cleaned_name
), None)

def _determine_sdtype_for_numbers(self, data):
"""Determine the sdtype for a numerical column.
@@ -322,31 +375,32 @@ def _detect_columns(self, data):
clean_data = column_data.dropna()
dtype = clean_data.infer_objects().dtype.kind

sdtype = None
if dtype in self._DTYPES_TO_SDTYPES:
sdtype = self._DTYPES_TO_SDTYPES[dtype]
elif dtype in ['i', 'f']:
sdtype = self._determine_sdtype_for_numbers(column_data)

elif dtype == 'O':
sdtype = self._determine_sdtype_for_objects(column_data)

sdtype = self._detect_pii_column(field)
if sdtype is None:
raise InvalidMetadataError(
f"Unsupported data type for column '{field}' (kind: {dtype})."
"The valid data types are: 'object', 'int', 'float', 'datetime', 'bool'."
)

# Set the first ID column we detect to be the primary key
if sdtype == 'id':
if self.primary_key is None:
self.primary_key = field
else:
sdtype = 'unknown'
if dtype in self._DTYPES_TO_SDTYPES:
sdtype = self._DTYPES_TO_SDTYPES[dtype]
elif dtype in ['i', 'f']:
sdtype = self._determine_sdtype_for_numbers(column_data)

elif dtype == 'O':
sdtype = self._determine_sdtype_for_objects(column_data)

if sdtype is None:
raise InvalidMetadataError(
f"Unsupported data type for column '{field}' (kind: {dtype})."
"The valid data types are: 'object', 'int', 'float', 'datetime', 'bool'."
)

# Set the first ID column we detect to be the primary key
if sdtype == 'id':
if self.primary_key is None:
self.primary_key = field
else:
sdtype = 'unknown'

column_dict = {'sdtype': sdtype}

if sdtype == 'unknown':
if sdtype in self._REFERENCE_TO_SDTYPE.values() or sdtype == 'unknown':
column_dict['pii'] = True
elif sdtype == 'datetime' and dtype == 'O':
datetime_format = get_datetime_format(column_data.iloc[:100])
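A sketch of the resulting behavior through the public detection API (column names and values are illustrative):

```python
# Sketch: common PII column names are now detected and flagged.
import pandas as pd
from sdv.metadata import SingleTableMetadata

df = pd.DataFrame({
    'guest_email': ['a@example.com', 'b@example.com'],
    'billing_zipcode': ['02139', '10001'],
})
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# 'guest_email' matches the 'email' reference; 'billing_zipcode' matches
# 'zipcode' and maps to 'postcode'. Both columns are marked pii=True.
print(metadata.to_dict())
```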
60 changes: 50 additions & 10 deletions sdv/multi_table/base.py
@@ -115,7 +115,7 @@ def set_address_columns(self, table_name, column_names, anonymization_level='ful
self._table_synthesizers[table_name].set_address_columns(column_names, anonymization_level)

def get_table_parameters(self, table_name):
"""Return the parameters that will be used to instantiate the table's synthesizer.
"""Return the parameters for the given table's synthesizer.
Args:
table_name (str):
@@ -126,21 +126,33 @@ def get_table_parameters(self, table_name):
A dictionary representing the parameters that will be used to instantiate the
table's synthesizer.
"""
return self._table_parameters.get(table_name, {})
table_synthesizer = self._table_synthesizers.get(table_name)
if not table_synthesizer:
table_params = {'table_synthesizer': None, 'table_parameters': {}}
else:
table_params = {
'table_synthesizer': type(table_synthesizer).__name__,
'table_parameters': table_synthesizer.get_parameters()
}

def get_parameters(self, table_name):
"""Return the parameters used to instantiate the table's synthesizer.
return table_params

Args:
table_name (str):
Table name for which the parameters should be retrieved.
def get_parameters(self):
"""Return the parameters used to instantiate the synthesizer and all table synthesizers.
Returns:
parameters (dict):
A dictionary representing the parameters used to instantiate the table's
synthesizer.
A dictionary representing the parameters used to instantiate the synthesizer.
"""
return self._table_synthesizers.get(table_name).get_parameters()
parameters_dict = {
'locales': self.locales,
'verbose': self.verbose,
'tables': {
table: self.get_table_parameters(table) for table in self.metadata.tables
}
}

return parameters_dict

def set_table_parameters(self, table_name, table_parameters):
"""Update the table's synthesizer instantiation parameters.
@@ -406,6 +418,34 @@ def get_learned_distributions(self, table_name):
f"table because it uses the '{synthesizer.__class__.__name__}'."
)

def get_loss_values(self, table_name):
"""Get the loss values from a model for a table.
Return a pandas dataframe mapping of the loss values per epoch of GAN
based synthesizers
Args:
table_name (str):
Table name for which the parameters should be retrieved.
Returns:
pd.DataFrame:
Dataframe of loss values per epoch
"""
if table_name not in self._table_synthesizers:
raise ValueError(
f"Table '{table_name}' is not present in the metadata."
)

synthesizer = self._table_synthesizers[table_name]
if hasattr(synthesizer, 'get_loss_values'):
return synthesizer.get_loss_values()

raise SynthesizerInputError(
f"Loss values are not available for table '{table_name}' "
'because the table does not use a GAN-based model.'
)

def _validate_constraints_to_be_added(self, constraints):
for constraint_dict in constraints:
if 'table_name' not in constraint_dict.keys():
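A sketch of the reworked multi-table accessors (assumes multi-table `metadata` and a dict of DataFrames `tables` containing a 'guests' table):

```python
# Sketch: reworked multi-table accessors.
from sdv.multi_table import HMASynthesizer

synthesizer = HMASynthesizer(metadata)
synthesizer.fit(tables)

synthesizer.get_parameters()
# -> {'locales': ..., 'verbose': ..., 'tables': {
#        'guests': {'table_synthesizer': 'GaussianCopulaSynthesizer',
#                   'table_parameters': {...}},
#        ...}}

# Calling synthesizer.get_loss_values('guests') would raise
# SynthesizerInputError here, because the default per-table synthesizer
# (GaussianCopula) is not GAN-based.
```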
6 changes: 3 additions & 3 deletions sdv/multi_table/hma.py
@@ -1,7 +1,6 @@
"""Hierarchical Modeling Algorithms."""

import logging
import math
from copy import deepcopy

import numpy as np
@@ -110,7 +109,8 @@ def _get_num_extended_columns(self, table_name, parent_table, columns_per_table)
if num_data_columns == 0:
return num_rows_columns

distribution = self.get_table_parameters(table_name)['default_distribution']
table_parameters = self.get_table_parameters(table_name)['table_parameters']
distribution = table_parameters['default_distribution']
num_parameters_columns = num_rows_columns * num_data_columns
if distribution in {'beta', 'truncnorm'}:
num_parameters_columns *= 4
@@ -431,7 +431,7 @@ def _extract_parameters(self, parent_row, table_name, foreign_key):
num_rows = flat_parameters[num_rows_key]
flat_parameters[num_rows_key] = min(
self._max_child_rows[num_rows_key],
math.ceil(num_rows)
round(num_rows)
)

return flat_parameters.rename(new_keys).to_dict()
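A one-line illustration of why switching from `math.ceil` to `round` fixes issue #1673:

```python
import math

# ceil turns any positive estimated child count into at least 1, so every
# parent row always received a child row (issue #1673); round lets small
# estimates fall to 0, restoring realistic cardinality.
print(math.ceil(0.2))  # 1
print(round(0.2))      # 0
```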
(The remaining 19 changed files are not shown.)