Skip to content

Commit

Permalink
Add dtype comparison in TableStructure metric (used in Diagnostic r…
Browse files Browse the repository at this point in the history
…eport) (#633)
  • Loading branch information
R-Palazzo authored Sep 23, 2024
1 parent 2d6d3e7 commit 1b4957d
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 7 deletions.
11 changes: 6 additions & 5 deletions sdmetrics/single_table/table_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ def compute_breakdown(cls, real_data, synthetic_data):
synthetic_data (pandas.DataFrame):
The synthetic data.
"""
synthetic_columns = set(synthetic_data.columns)
real_columns = set(real_data.columns)
intersection_columns = real_columns & synthetic_columns
union_columns = real_columns | synthetic_columns
score = len(intersection_columns) / len(union_columns)
real_columns_dtypes = set(zip(real_data.columns, map(str, real_data.dtypes)))
synthetic_columns_dtypes = set(zip(synthetic_data.columns, map(str, synthetic_data.dtypes)))

intersection = real_columns_dtypes & synthetic_columns_dtypes
union = real_columns_dtypes | synthetic_columns_dtypes
score = len(intersection) / len(union)

return {'score': score}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def test_end_to_end_with_metrics_failing(self):
"""Test the ``DiagnosticReport`` report when some metrics crash.
This test makes fail the 'Boundary' property to check that the report still works.
The TableStructure should no longer be 1.0 since there is some dtype mismatch.
"""
real_data, synthetic_data, metadata = load_demo(modality='multi_table')
real_data['users']['age'].iloc[0] = 'error_1'
Expand All @@ -62,7 +63,7 @@ def test_end_to_end_with_metrics_failing(self):
# Assert
expected_properties = pd.DataFrame({
'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'],
'Score': [1.0, 1.0, 1.0],
'Score': [1.0, 0.6761904761904761, 1.0],
})
expected_details = pd.DataFrame({
'Table': [
Expand Down Expand Up @@ -119,7 +120,7 @@ def test_end_to_end_with_metrics_failing(self):
None,
],
})
assert results == 1.0
assert results == 0.892063492063492
pd.testing.assert_frame_equal(
report.get_properties(), expected_properties, check_exact=False, atol=2e-2
)
Expand Down
22 changes: 22 additions & 0 deletions tests/unit/single_table/test_table_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,28 @@ def test_compute_breakdown(self, real_data):
expected_result = {'score': 1.0}
assert result == expected_result

def test_compute_breakdown_with_different_dtypes(self, real_data):
"""Test the ``compute_breakdown`` method with different data types.
- Real data has 5 columns.
- Synthetic data is identical except 'col_1' has a different data type.
- Total unique (column_name, dtype) combinations: 6.
- Matching combinations: 4.
- Expected score: 4 / 6 = 2 / 3.
"""
# Setup
synthetic_data = real_data.copy()
synthetic_data['col_1'] = synthetic_data['col_1'].astype('float')

metric = TableStructure()

# Run
result = metric.compute_breakdown(real_data, synthetic_data)

# Assert
expected_result = {'score': 2 / 3}
assert result == expected_result

def test_compute_breakdown_with_missing_columns(self, real_data):
"""Test the ``compute_breakdown`` method with missing columns."""
# Setup
Expand Down

0 comments on commit 1b4957d

Please sign in to comment.