Skip to content

Commit

Permalink
convert datetimes in BaseReport
Browse files Browse the repository at this point in the history
  • Loading branch information
frances-h committed Sep 13, 2023
1 parent d50d953 commit 20c7845
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 6 deletions.
1 change: 1 addition & 0 deletions sdmetrics/reports/base_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
Whether or not to print report summary and progress.
"""
self.validate(real_data, synthetic_data, metadata)
self.convert_datetimes(real_data, synthetic_data, metadata)

scores = []
progress_bar = None
Expand Down
37 changes: 37 additions & 0 deletions tests/integration/reports/multi_table/test_diagnostic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,43 @@ def test_end_to_end(self):
}
assert results == expected_results

def test_end_to_end_with_object_datetimes(self):
    """Test the ``DiagnosticReport`` report with object datetimes."""
    # Setup: cast every real datetime column to strings so the report must
    # parse them using the metadata's ``datetime_format``.
    real_data, synthetic_data, metadata = load_demo(modality='multi_table')
    for table_name, table_meta in metadata['tables'].items():
        for col_name, col_meta in table_meta['columns'].items():
            if col_meta['sdtype'] != 'datetime':
                continue

            fmt = col_meta['datetime_format']
            real_data[table_name][col_name] = real_data[table_name][col_name].dt.strftime(fmt)

    report = DiagnosticReport()

    # Run
    report.generate(real_data, synthetic_data, metadata, verbose=False)
    results = report.get_results()
    properties = report.get_properties()

    # Assert: scores and messages must match the all-datetime-dtype run.
    expected_properties = pd.DataFrame({
        'Property': ['Coverage', 'Boundary', 'Synthesis'],
        'Score': [0.9573447196980541, 0.8666666666666667, 0.6333333333333333]
    })
    expected_results = {
        'SUCCESS': [
            'The synthetic data covers over 90% of the categories present in the real data',
            'The synthetic data covers over 90% of the numerical ranges present'
            ' in the real data'
        ],
        'WARNING': [
            'More than 10% the synthetic data does not follow the min/max boundaries'
            ' set by the real data',
            'More than 10% of the synthetic rows are copies of the real data'
        ],
        'DANGER': []
    }
    assert results == expected_results
    pd.testing.assert_frame_equal(properties, expected_properties)

def test_end_to_end_with_metrics_failing(self):
"""Test the ``DiagnosticReport`` report when some metrics crash.
Expand Down
26 changes: 26 additions & 0 deletions tests/integration/reports/multi_table/test_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,32 @@ def test_quality_report_end_to_end():
pd.testing.assert_frame_equal(properties, expected_properties)


def test_quality_report_with_object_datetimes():
    """Test the multi table QualityReport with object datetimes."""
    # Setup: turn every real datetime column into formatted strings so the
    # report has to convert them back via the metadata's datetime_format.
    real_data, synthetic_data, metadata = load_demo(modality='multi_table')
    for table_name, table_meta in metadata['tables'].items():
        for col_name, col_meta in table_meta['columns'].items():
            if col_meta['sdtype'] != 'datetime':
                continue

            fmt = col_meta['datetime_format']
            real_data[table_name][col_name] = real_data[table_name][col_name].dt.strftime(fmt)

    report = QualityReport()

    # Run
    report.generate(real_data, synthetic_data, metadata)
    score = report.get_score()
    properties = report.get_properties()

    # Assert: overall score and per-property scores are unchanged by the
    # object-dtype datetimes.
    expected_properties = pd.DataFrame({
        'Property': ['Column Shapes', 'Column Pair Trends', 'Cardinality'],
        'Score': [0.7922619047619048, 0.4249665433225429, 0.8],
    })
    assert score == 0.672409482694816
    pd.testing.assert_frame_equal(properties, expected_properties)


def test_quality_report_with_errors():
"""Test the multi table QualityReport with errors when computing metrics."""
# Setup
Expand Down
70 changes: 70 additions & 0 deletions tests/integration/reports/single_table/test_diagnostic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,76 @@ def test_end_to_end(self):
expected_details_boundary
)

def test_generate_with_object_datetimes(self):
    """Test the diagnostic report with object datetimes."""
    # Setup: stringify the real datetime columns so the report must parse
    # them from the metadata's ``datetime_format``.
    real_data, synthetic_data, metadata = load_demo(modality='single_table')
    for col_name, col_meta in metadata['columns'].items():
        if col_meta['sdtype'] != 'datetime':
            continue

        fmt = col_meta['datetime_format']
        real_data[col_name] = real_data[col_name].dt.strftime(fmt)

    report = DiagnosticReport()

    # Run
    report.generate(real_data, synthetic_data, metadata)

    # Assert: details for each property match the datetime-dtype run.
    expected_synthesis_details = pd.DataFrame(
        {
            'Metric': 'NewRowSynthesis',
            'Score': 1.0,
            'Num Matched Rows': 0,
            'Num New Rows': 215
        }, index=[0]
    )

    expected_coverage_details = pd.DataFrame({
        'Column': [
            'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'high_spec',
            'mba_spec', 'second_perc', 'gender', 'degree_perc', 'placed', 'experience_years',
            'employability_perc', 'mba_perc', 'work_experience', 'degree_type'
        ],
        'Metric': [
            'RangeCoverage', 'RangeCoverage', 'RangeCoverage', 'RangeCoverage',
            'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage', 'RangeCoverage',
            'CategoryCoverage', 'RangeCoverage', 'CategoryCoverage', 'RangeCoverage',
            'RangeCoverage', 'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage'
        ],
        'Score': [
            1.0, 1.0, 0.42333783783783785, 1.0, 0.9807348482826732, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 0.6666666666666667, 1.0, 1.0, 1.0, 1.0
        ]
    })

    expected_boundary_details = pd.DataFrame({
        'Column': [
            'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc',
            'degree_perc', 'experience_years', 'employability_perc', 'mba_perc'
        ],
        'Metric': ['BoundaryAdherence'] * 10,
        'Score': [
            0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0,
            0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0,
            0.8883720930232558, 0.8930232558139535
        ]
    })

    pd.testing.assert_frame_equal(report.get_details('Synthesis'), expected_synthesis_details)
    pd.testing.assert_frame_equal(report.get_details('Coverage'), expected_coverage_details)
    pd.testing.assert_frame_equal(report.get_details('Boundary'), expected_boundary_details)

def test_generate_multiple_times(self):
"""The results should be the same both times."""
# Setup
Expand Down
67 changes: 67 additions & 0 deletions tests/integration/reports/single_table/test_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,73 @@ def test_report_end_to_end(self):
)
assert report.get_score() == 0.7804181608907237

def test_quality_report_with_object_datetimes(self):
    """Test the quality report with object datetimes.

    The report must compute each property and the overall quality score.
    """
    # Setup: restrict to a handful of columns and stringify the real
    # datetime columns so the report must parse them from the metadata.
    column_names = [
        'student_id', 'degree_type', 'start_date', 'second_perc', 'work_experience'
    ]
    real_data, synthetic_data, metadata = load_demo(modality='single_table')
    for col_name, col_meta in metadata['columns'].items():
        if col_meta['sdtype'] != 'datetime':
            continue

        fmt = col_meta['datetime_format']
        real_data[col_name] = real_data[col_name].dt.strftime(fmt)

    metadata['columns'] = {
        key: val for key, val in metadata['columns'].items() if key in column_names
    }
    report = QualityReport()

    # Run
    report.generate(real_data[column_names], synthetic_data[column_names], metadata)

    # Assert: property details and the overall score are unchanged by the
    # object-dtype datetimes.
    expected_column_shapes = pd.DataFrame({
        'Column': ['start_date', 'second_perc', 'work_experience', 'degree_type'],
        'Metric': ['KSComplement', 'KSComplement', 'TVComplement', 'TVComplement'],
        'Score': [
            0.7011066184294531, 0.627906976744186, 0.9720930232558139, 0.9255813953488372
        ],
    })

    expected_cpt = pd.DataFrame({
        'Column 1': [
            'start_date', 'start_date', 'start_date', 'second_perc',
            'second_perc', 'work_experience'
        ],
        'Column 2': [
            'second_perc', 'work_experience', 'degree_type', 'work_experience',
            'degree_type', 'degree_type'
        ],
        'Metric': [
            'CorrelationSimilarity', 'ContingencySimilarity', 'ContingencySimilarity',
            'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
        ],
        'Score': [
            0.9854510263003199, 0.586046511627907, 0.6232558139534884, 0.7348837209302326,
            0.6976744186046512, 0.8976744186046511
        ],
        'Real Correlation': [
            0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan
        ],
        'Synthetic Correlation': [
            0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan
        ]
    })

    pd.testing.assert_frame_equal(
        report.get_details('Column Shapes'), expected_column_shapes
    )
    pd.testing.assert_frame_equal(
        report.get_details('Column Pair Trends'), expected_cpt
    )
    assert report.get_score() == 0.7804181608907237

def test_report_end_to_end_with_errors(self):
"""Test the quality report end to end with errors in the properties computation."""
# Setup
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/reports/test_base_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,8 @@ def test_generate(self):
})
metadata = {
'columns': {
'column1': {'sdtypes': 'numerical'},
'column2': {'sdtypes': 'categorical'}
'column1': {'sdtype': 'numerical'},
'column2': {'sdtype': 'categorical'}
}
}

Expand Down Expand Up @@ -238,10 +238,10 @@ def test_generate_verbose(self, mock_tqdm):
})
metadata = {
'columns': {
'column1': {'sdtypes': 'numerical'},
'column2': {'sdtypes': 'categorical'},
'column3': {'sdtypes': 'numerical'},
'column4': {'sdtypes': 'numerical'},
'column1': {'sdtype': 'numerical'},
'column2': {'sdtype': 'categorical'},
'column3': {'sdtype': 'numerical'},
'column4': {'sdtype': 'numerical'},
}
}

Expand Down

0 comments on commit 20c7845

Please sign in to comment.