Skip to content

Commit

Permalink
Provide meta information about the reports (#472)
Browse files Browse the repository at this point in the history
  • Loading branch information
frances-h authored Oct 24, 2023
1 parent 99cb1e4 commit 7f39ca7
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 2 deletions.
29 changes: 29 additions & 0 deletions sdmetrics/reports/base_report.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
"""Single table base report."""
import pickle
import sys
import time
import warnings
from copy import deepcopy
from datetime import datetime
from importlib.metadata import version

import numpy as np
import pandas as pd
Expand All @@ -22,6 +26,11 @@ def __init__(self):
self.is_generated = False
self._properties = {}
self._results_handler = None
self.report_info = {
'report_type': self.__class__.__name__,
'generated_date': None,
'sdmetrics_version': version('sdmetrics')
}

def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata):
"""Validate that the metadata matches the data.
Expand Down Expand Up @@ -104,11 +113,25 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
self.validate(real_data, synthetic_data, metadata)
self.convert_datetimes(real_data, synthetic_data, metadata)

self.report_info['generated_date'] = datetime.today().strftime('%Y-%m-%d')
if 'tables' in metadata:
self.report_info['num_tables'] = len(metadata['tables'])
self.report_info['num_rows_real_data'] = {
name: len(table) for name, table in real_data.items()
}
self.report_info['num_rows_synthetic_data'] = {
name: len(table) for name, table in synthetic_data.items()
}
else:
self.report_info['num_rows_real_data'] = len(real_data)
self.report_info['num_rows_synthetic_data'] = len(synthetic_data)

scores = []
progress_bar = None
if verbose:
sys.stdout.write('Generating report ...\n')

start_time = time.time()
for ind, (property_name, property_instance) in enumerate(self._properties.items()):
if verbose:
num_iterations = int(property_instance._get_num_iterations(metadata))
Expand All @@ -126,6 +149,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):

self._overall_score = np.nanmean(scores)
self.is_generated = True
end_time = time.time()
self.report_info['generation_time'] = end_time - start_time

self._handle_results(verbose)

Expand All @@ -143,6 +168,10 @@ def _check_property_name(self, property_name):
f" Valid property names are '{valid_property_names}'."
)

def get_info(self):
    """Return the meta information stored about this report.

    Returns:
        dict:
            A deep copy of ``report_info``, so changes made to the returned
            dictionary do not affect the report itself.
    """
    info_snapshot = deepcopy(self.report_info)
    return info_snapshot

def _check_report_generated(self):
if not self.is_generated:
raise ValueError('The report has not been generated. Please call `generate` first.')
Expand Down
28 changes: 28 additions & 0 deletions tests/integration/reports/multi_table/test_quality_report.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
from datetime import date, datetime

import numpy as np
Expand Down Expand Up @@ -90,7 +91,9 @@ def test_multi_table_quality_report():
# Run `generate`, `get_properties` and `get_score`,
# as well as `get_visualization` and `get_details` for every property:
# 'Column Shapes', 'Column Pair Trends', 'Cardinality'
generate_start_time = time.time()
report.generate(real_data, synthetic_data, metadata)
generate_end_time = time.time()
properties = report.get_properties()
property_names = list(properties['Property'])
score = report.get_score()
Expand Down Expand Up @@ -183,6 +186,21 @@ def test_multi_table_quality_report():
})
pd.testing.assert_frame_equal(details[5], expected_df_4)

# Assert report info saved
report_info = report.get_info()
assert report_info == report.report_info

expected_info_keys = {
'report_type', 'generated_date', 'sdmetrics_version', 'num_tables', 'num_rows_real_data',
'num_rows_synthetic_data', 'generation_time'
}
assert report_info.keys() == expected_info_keys
assert report_info['report_type'] == 'QualityReport'
assert report_info['num_tables'] == 2
assert report_info['num_rows_real_data'] == {'table1': 4, 'table2': 4}
assert report_info['num_rows_synthetic_data'] == {'table1': 4, 'table2': 4}
assert report_info['generation_time'] <= generate_end_time - generate_start_time


def test_quality_report_end_to_end():
"""Test the multi table QualityReport end to end."""
Expand All @@ -194,6 +212,7 @@ def test_quality_report_end_to_end():
report.generate(real_data, synthetic_data, metadata)
score = report.get_score()
properties = report.get_properties()
info = report.get_info()

# Assert
expected_properties = pd.DataFrame({
Expand All @@ -202,6 +221,15 @@ def test_quality_report_end_to_end():
})
assert score == 0.6249089638729638
pd.testing.assert_frame_equal(properties, expected_properties)
expected_info_keys = {
'report_type', 'generated_date', 'sdmetrics_version', 'num_tables', 'num_rows_real_data',
'num_rows_synthetic_data', 'generation_time'
}
assert info.keys() == expected_info_keys
assert info['report_type'] == 'QualityReport'
assert info['num_tables'] == 3
assert info['num_rows_real_data'] == {'sessions': 10, 'users': 10, 'transactions': 10}
assert info['num_rows_synthetic_data'] == {'sessions': 9, 'users': 10, 'transactions': 10}


def test_quality_report_with_object_datetimes():
Expand Down
16 changes: 16 additions & 0 deletions tests/integration/reports/single_table/test_quality_report.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import contextlib
import io
import re
import time
from datetime import date, datetime

import numpy as np
Expand Down Expand Up @@ -80,7 +81,9 @@ def test_report_end_to_end(self):
report = QualityReport()

# Run
generate_start_time = time.time()
report.generate(real_data[column_names], synthetic_data[column_names], metadata)
generate_end_time = time.time()

# Assert
expected_details_column_shapes_dict = {
Expand Down Expand Up @@ -126,6 +129,19 @@ def test_report_end_to_end(self):
)
assert report.get_score() == 0.7804181608907237

report_info = report.get_info()
assert report_info == report.report_info

expected_info_keys = {
'report_type', 'generated_date', 'sdmetrics_version', 'num_rows_real_data',
'num_rows_synthetic_data', 'generation_time'
}
assert report_info.keys() == expected_info_keys
assert report_info['report_type'] == 'QualityReport'
assert report_info['num_rows_real_data'] == 215
assert report_info['num_rows_synthetic_data'] == 215
assert report_info['generation_time'] <= generate_end_time - generate_start_time

def test_quality_report_with_object_datetimes(self):
"""Test the quality report with object datetimes.
Expand Down
157 changes: 155 additions & 2 deletions tests/unit/reports/test_base_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,21 @@ def test_generate_metadata_not_dict(self):
with pytest.raises(TypeError, match=expected_message):
base_report.generate(real_data, synthetic_data, metadata, verbose=False)

def test_generate(self):
@patch('sdmetrics.reports.base_report.datetime')
@patch('sdmetrics.reports.base_report.time')
@patch('sdmetrics.reports.base_report.version')
def test_generate(self, version_mock, time_mock, datetime_mock):
"""Test the ``generate`` method.
This test checks that the method calls the ``validate`` method and the ``get_score``
method for each property.
method for each property. Also tests that ``report_info`` is correctly
populated.
"""
# Setup
datetime_mock.today.return_value = pd.to_datetime('2020-01-05')
time_mock.time.side_effect = [5, 10]
version_mock.return_value = 'version'

base_report = BaseReport()
mock_validate = Mock()
mock_handle_results = Mock()
Expand Down Expand Up @@ -217,6 +225,102 @@ def test_generate(self):
base_report._properties['Property 2'].get_score.assert_called_with(
real_data, synthetic_data, metadata, progress_bar=None
)
expected_info = {
'report_type': 'BaseReport',
'generated_date': '2020-01-05',
'sdmetrics_version': 'version',
'num_rows_real_data': 3,
'num_rows_synthetic_data': 3,
'generation_time': 5
}
assert base_report.report_info == expected_info

@patch('sdmetrics.reports.base_report.datetime')
@patch('sdmetrics.reports.base_report.time')
@patch('sdmetrics.reports.base_report.version')
def test_generate_multi_table_details(self, version_mock, time_mock, datetime_mock):
    """Test that ``generate`` records the report info for multi-table data.

    The date, the clock and the package version are patched so that the resulting
    ``report_info`` (table count, per-table row counts and generation time) can be
    compared against exact values.
    """
    # Setup
    version_mock.return_value = 'version'
    time_mock.time.side_effect = [5, 10]
    datetime_mock.today.return_value = pd.to_datetime('2020-01-05')

    base_report = BaseReport()
    for stubbed_method in ('_handle_results', 'validate', 'convert_datetimes'):
        setattr(base_report, stubbed_method, Mock())

    property_names = ('Property 1', 'Property 2')
    for property_name in property_names:
        property_mock = Mock()
        property_mock.get_score.return_value = 1.0
        base_report._properties[property_name] = property_mock

    table1_values = {'column1': [1, 2, 3], 'column2': ['a', 'b', 'c']}
    table2_values = {'column3': ['x', 'y', 'z'], 'column4': [10, 9, 8]}
    real_data = {
        'table1': pd.DataFrame(table1_values),
        'table2': pd.DataFrame(table2_values)
    }
    synthetic_data = {
        'table1': pd.DataFrame(table1_values),
        'table2': pd.DataFrame(table2_values)
    }

    sdtypes_by_table = {
        'table1': {'column1': 'numerical', 'column2': 'categorical'},
        'table2': {'column3': 'categorical', 'column4': 'numerical'}
    }
    metadata = {
        'tables': {
            table_name: {
                'columns': {
                    column_name: {'sdtype': sdtype}
                    for column_name, sdtype in column_sdtypes.items()
                }
            }
            for table_name, column_sdtypes in sdtypes_by_table.items()
        }
    }

    # Run
    base_report.generate(real_data, synthetic_data, metadata, verbose=False)

    # Assert
    for property_name in property_names:
        base_report._properties[property_name].get_score.assert_called_with(
            real_data, synthetic_data, metadata, progress_bar=None
        )

    expected_row_counts = {'table1': 3, 'table2': 3}
    expected_info = {
        'report_type': 'BaseReport',
        'generated_date': '2020-01-05',
        'sdmetrics_version': 'version',
        'num_tables': 2,
        'num_rows_real_data': dict(expected_row_counts),
        'num_rows_synthetic_data': dict(expected_row_counts),
        'generation_time': 5
    }
    assert base_report.report_info == expected_info

def test__handle_results(self):
"""Test the ``_handle_results`` method."""
Expand Down Expand Up @@ -337,6 +441,55 @@ def test_get_properties(self):
}),
)

@patch('sdmetrics.reports.base_report.datetime')
@patch('sdmetrics.reports.base_report.time')
@patch('sdmetrics.reports.base_report.version')
def test_get_info(self, version_mock, time_mock, datetime_mock):
    """Test that ``get_info`` returns the info recorded by ``generate``.

    The date, the clock and the package version are patched so that the single-table
    report info can be compared against exact values.
    """
    # Setup
    version_mock.return_value = 'version'
    time_mock.time.side_effect = [5, 10]
    datetime_mock.today.return_value = pd.to_datetime('2020-01-05')

    base_report = BaseReport()
    base_report._handle_results = Mock()
    base_report.validate = Mock()
    for property_name in ('Property 1', 'Property 2'):
        property_mock = Mock()
        property_mock.get_score.return_value = 1.0
        base_report._properties[property_name] = property_mock

    column_values = {'column1': [1, 2, 3], 'column2': ['a', 'b', 'c']}
    real_data = pd.DataFrame(column_values)
    synthetic_data = pd.DataFrame(column_values)
    metadata = {
        'columns': {
            column_name: {'sdtype': sdtype}
            for column_name, sdtype in (('column1', 'numerical'), ('column2', 'categorical'))
        }
    }

    # Run
    base_report.generate(real_data, synthetic_data, metadata, verbose=False)
    report_info = base_report.get_info()

    # Assert
    assert report_info == {
        'report_type': 'BaseReport',
        'generated_date': '2020-01-05',
        'sdmetrics_version': 'version',
        'num_rows_real_data': 3,
        'num_rows_synthetic_data': 3,
        'generation_time': 5
    }

def test_get_visualization(self):
"""Test the ``get_visualization`` method."""
# Setup
Expand Down

0 comments on commit 7f39ca7

Please sign in to comment.