From ca2e8a63a9b86093f033013de7aedfc495d3ac64 Mon Sep 17 00:00:00 2001 From: Jonathan Karr Date: Thu, 21 Jan 2021 18:19:02 -0500 Subject: [PATCH] Moved array padding to common utils Added export to TSV, XLSX Added warnings for importing/exporting to CSV, TSV, XLSX that meta data (data type, shape) isn't supported Added support for multidimensional reports Added support for data types and shapes to HDF5 output --- biosimulators_utils/report/data_model.py | 2 + biosimulators_utils/report/io.py | 251 ++++++++++++++------- biosimulators_utils/report/warnings.py | 38 ++++ biosimulators_utils/sedml/utils.py | 51 +---- biosimulators_utils/sedml/warnings.py | 6 - biosimulators_utils/utils/core.py | 53 ++++- requirements.txt | 2 +- tests/report/test_report_io.py | 264 ++++++++++++++++++++++- tests/sedml/test_sedml_exec.py | 106 ++++----- tests/sedml/test_sedml_utils.py | 4 +- 10 files changed, 585 insertions(+), 192 deletions(-) create mode 100644 biosimulators_utils/report/warnings.py diff --git a/biosimulators_utils/report/data_model.py b/biosimulators_utils/report/data_model.py index 77606e42..0dc7fe04 100644 --- a/biosimulators_utils/report/data_model.py +++ b/biosimulators_utils/report/data_model.py @@ -66,3 +66,5 @@ class ReportFormat(str, enum.Enum): h5 = 'h5' hdf = 'h5' hdf5 = 'h5' + tsv = 'tsv' + xlsx = 'xlsx' diff --git a/biosimulators_utils/report/io.py b/biosimulators_utils/report/io.py index fe80b28f..630894b1 100644 --- a/biosimulators_utils/report/io.py +++ b/biosimulators_utils/report/io.py @@ -8,16 +8,16 @@ from ..config import get_config from ..sedml.data_model import Report # noqa: F401 -from ..sedml.utils import pad_arrays_to_consistent_shapes -from ..sedml.warnings import RepeatDataSetLabelsWarning +from ..utils.core import pad_arrays_to_consistent_shapes from ..warnings import warn from .data_model import DataSetResults, ReportFormat +from .warnings import RepeatDataSetLabelsWarning, MissingReportMetadataWarning, MissingDataWarning, ExtraDataWarning import glob +import h5py import numpy +import openpyxl import os import pandas -import tables -import warnings __all__ = [ 'ReportWriter', @@ -46,43 +46,74 @@ def run(self, report, results, base_path, rel_path, format=ReportFormat.h5): format (:obj:`ReportFormat`, optional): report format """ - data_set_labels = [data_set.label for data_set in report.data_sets] - if len(set(data_set_labels)) < len(report.data_sets): - warn('To facilitate machine interpretation, data sets should have unique ids.', - RepeatDataSetLabelsWarning) - - results_list = [] + results_array = [] + data_set_ids = [] data_set_labels = [] + data_set_data_types = [] + data_set_shapes = [] for data_set in report.data_sets: if data_set.id in results: - results_list.append(results[data_set.id]) + data_set_result = results[data_set.id] + results_array.append(data_set_result) + data_set_ids.append(data_set.id) data_set_labels.append(data_set.label) - results_list = pad_arrays_to_consistent_shapes(results_list) - results_df = pandas.DataFrame(numpy.array(results_list), index=data_set_labels) + if data_set_result is None: + data_set_data_types.append('__None__') + data_set_shapes.append('') + else: + data_set_data_types.append(data_set_result.dtype.name) + data_set_shapes.append(','.join(str(dim_len) for dim_len in data_set_result.shape)) + results_array = pad_arrays_to_consistent_shapes(results_array) + results_array = numpy.array(results_array) + + if format in [ReportFormat.csv, ReportFormat.tsv, ReportFormat.xlsx]: + if results_array.ndim > 2: + raise ValueError('Report has {} dimensions. Multidimensional reports cannot be exported to {}.'.format( + results_array.ndim, format.value.upper())) + + if len(set(data_set.label for data_set in report.data_sets)) < len(report.data_sets): + warn('To facilitate machine interpretation, data sets should have unique labels.', + RepeatDataSetLabelsWarning) + + warn('Reports exported to {} do not contain information about the data type or size of each data set.', + MissingReportMetadataWarning) + + results_df = pandas.DataFrame(results_array, index=data_set_labels) + + if format in [ReportFormat.csv, ReportFormat.tsv]: + filename = os.path.join(base_path, rel_path + '.' + format.value) + out_dir = os.path.dirname(filename) + if not os.path.isdir(out_dir): + os.makedirs(out_dir) - if format == ReportFormat.csv: - filename = os.path.join(base_path, rel_path + '.' + format.value) - out_dir = os.path.dirname(filename) - if not os.path.isdir(out_dir): - os.makedirs(out_dir) - results_df.to_csv(filename, - header=False) + results_df.to_csv(filename, header=False, sep=',' if format == ReportFormat.csv else '\t') + else: + filename = os.path.join(base_path, os.path.dirname(rel_path) + '.' + format.value) + out_dir = os.path.dirname(filename) + if not os.path.isdir(out_dir): + os.makedirs(out_dir) + + with pandas.ExcelWriter(filename, mode='a' if os.path.isfile(filename) else 'w', engine='openpyxl') as writer: + results_df.to_excel(writer, sheet_name=os.path.basename(rel_path), header=False) elif format == ReportFormat.h5: filename = os.path.join(base_path, get_config().H5_REPORTS_PATH) if not os.path.isdir(base_path): os.makedirs(base_path) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", tables.NaturalNameWarning) - results_df.to_hdf(filename, - key=rel_path, - format='table', - complevel=9, - complib='zlib', - mode='a', - append=False, - ) + with h5py.File(filename, 'a') as file: + try: + file[rel_path] + del file[rel_path] + except KeyError: + pass + + data_set = file.create_dataset(rel_path, data=results_array, + chunks=True, compression="gzip", compression_opts=9) + data_set.attrs['dataSetIds'] = data_set_ids + data_set.attrs['dataSetLabels'] = data_set_labels + data_set.attrs['dataSetDataTypes'] = data_set_data_types + data_set.attrs['dataSetShapes'] = data_set_shapes else: raise NotImplementedError('Report format {} is not supported'.format(format)) @@ -111,49 +142,111 @@ def run(self, report, base_path, rel_path, format=ReportFormat.h5): Returns: :obj:`DataSetResults`: report results """ - if format == ReportFormat.csv: - filename = os.path.join(base_path, rel_path + '.' + format.value) - df = pandas.read_csv(filename, - index_col=0, - header=None) + if format in [ReportFormat.csv, ReportFormat.tsv, ReportFormat.xlsx]: + warn('Reports exported to {} do not contain information about the data type or size of each data set.', + MissingReportMetadataWarning) + + if format in [ReportFormat.csv, ReportFormat.tsv]: + filename = os.path.join(base_path, rel_path + '.' + format.value) + df = pandas.read_csv(filename, + index_col=0, + header=None, + sep=',' if format == ReportFormat.csv else '\t') + else: + filename = os.path.join(base_path, os.path.dirname(rel_path) + '.' + format.value) + df = pandas.read_excel(filename, + sheet_name=os.path.basename(rel_path), + index_col=0, + header=None, + engine='openpyxl') df.columns = pandas.RangeIndex(start=0, stop=df.shape[1], step=1) + results = DataSetResults() + + data_set_labels = [data_set.label for data_set in report.data_sets] + if df.index.tolist() == data_set_labels: + data = df.to_numpy() + for i_data_set, data_set in enumerate(report.data_sets): + results[data_set.id] = data[i_data_set, :] + extra_data_sets = set() + + else: + data_set_label_to_index = {} + for i_data_set, data_set_label in enumerate(df.index): + if data_set_label not in data_set_label_to_index: + data_set_label_to_index[data_set_label] = i_data_set + else: + data_set_label_to_index[data_set_label] = None + + unreadable_data_sets = [] + for data_set in report.data_sets: + i_data_set = data_set_label_to_index.get(data_set.label, None) + if i_data_set is None: + # results[data_set.id] = None + unreadable_data_sets.append(data_set.id) + else: + results[data_set.id] = df.loc[data_set.label, :].to_numpy() + + if unreadable_data_sets: + warn('Some data sets could not be read because their labels are not unique:\n - {}'.format( + '\n'.join('`' + id + '`' for id in sorted(unreadable_data_sets))), RepeatDataSetLabelsWarning) + + data_set_id_to_label = {data_set.id: data_set.label for data_set in report.data_sets} + extra_data_sets = set(df.index) - set(data_set_id_to_label[id] for id in results.keys()) - set(unreadable_data_sets) + + file_data_set_ids = set(results.keys()) | extra_data_sets + elif format == ReportFormat.h5: filename = os.path.join(base_path, get_config().H5_REPORTS_PATH) - df = pandas.read_hdf(filename, - key=rel_path, - ) - - else: - raise NotImplementedError('Report format {} is not supported'.format(format)) - results = DataSetResults() - - data_set_labels = [data_set.label for data_set in report.data_sets] - unreadable_data_sets = [] - if df.index.tolist() == data_set_labels: + with h5py.File(filename, 'r') as file: + data_set = file[rel_path] + data_set_results = data_set[:] + file_data_set_ids = data_set.attrs['dataSetIds'] + data_set_data_types = data_set.attrs['dataSetDataTypes'] + data_set_shapes = [] + for data_set_shape in data_set.attrs['dataSetShapes']: + if data_set_shape: + data_set_shapes.append([int(dim_len) for dim_len in data_set_shape.split(',')]) + else: + data_set_shapes.append([]) + + results = DataSetResults() + data_set_id_to_index = {data_set_id: i_data_set for i_data_set, data_set_id in enumerate(file_data_set_ids)} + + data_set_ndim = data_set_results.ndim - 1 for data_set in report.data_sets: - results[data_set.id] = df.loc[data_set.label, :] + i_data_set = data_set_id_to_index.get(data_set.id, None) + if i_data_set is not None: + data_set_data_type = data_set_data_types[i_data_set] + if data_set_data_type == '__None__': + results[data_set.id] = None + else: + data_set_shape = data_set_shapes[i_data_set] + data_set_slice = [slice(0, dim_len) for dim_len in data_set_shape] + \ + [slice(0, 1)] * (data_set_ndim - len(data_set_shape)) + results[data_set.id] = ( + data_set_results[i_data_set][data_set_slice] + .reshape(data_set_shape) + .astype(data_set_data_type) + ) + + file_data_set_ids = set(file_data_set_ids) else: - data_set_label_to_index = {} - for i_data_set, data_set_label in enumerate(df.index): - if data_set_label not in data_set_label_to_index: - data_set_label_to_index[data_set_label] = i_data_set - else: - data_set_label_to_index[data_set_label] = None + raise NotImplementedError('Report format {} is not supported'.format(format)) - for data_set in report.data_sets: - i_data_set = data_set_label_to_index.get(data_set.label, None) - if i_data_set is None: - # results[data_set.id] = None - unreadable_data_sets.append(data_set.id) - else: - results[data_set.id] = df.loc[data_set.label, :] + report_data_set_ids = set(data_set.id for data_set in report.data_sets) + missing_data_set_ids = report_data_set_ids.difference(file_data_set_ids) + extra_data_set_ids = file_data_set_ids.difference(report_data_set_ids) + + if missing_data_set_ids: + warn('File does not contain data for the following data sets of the report:\n - {}'.format( + '\n'.join('`' + id + '`' for id in sorted(missing_data_set_ids))), MissingDataWarning) - if unreadable_data_sets: - warn('Some data sets could not be read because their labels are not unique:\n - {}'.format( - '\n'.join('`' + id + '`' for id in sorted(unreadable_data_sets))), RepeatDataSetLabelsWarning) + if extra_data_set_ids: + warn('File contains additional data that could not be mapped to data sets of the report:\n - {}'.format( + '\n'.join('`' + id + '`' for id in sorted(extra_data_set_ids))), ExtraDataWarning) return results @@ -169,23 +262,33 @@ def get_ids(self, base_path, format=ReportFormat.h5): format (:obj:`ReportFormat`, optional): report format Returns: - :obj:`set` of :obj:`str`: ids of reports + :obj:`list` of :obj:`str`: ids of reports """ - if format == ReportFormat.csv: - report_ids = set() + if format in [ReportFormat.csv, ReportFormat.tsv]: + report_ids = [] for path in glob.glob(os.path.join(base_path, '**/*.' + format.value), recursive=True): - report_ids.add(os.path.relpath(path, base_path)[0:-len(format.value)-1]) + report_ids.append(os.path.relpath(path, base_path)[0:-len(format.value)-1]) + return report_ids + + elif format == ReportFormat.xlsx: + report_ids = [] + for path in glob.glob(os.path.join(base_path, '**/*.' + format.value), recursive=True): + wb = openpyxl.load_workbook(path) + for sheet_name in wb.get_sheet_names(): + report_ids.append(os.path.join(os.path.relpath(path, base_path)[0:-len(format.value)-1], sheet_name)) return report_ids elif format == ReportFormat.h5: filename = os.path.join(base_path, get_config().H5_REPORTS_PATH) - reports_file = tables.open_file(filename, mode="r") - report_ids = set() - for node in reports_file.walk_nodes(): - base_path, _, rel_path = node._v_pathname.rpartition('/') - if rel_path == 'table': - report_ids.add(base_path[1:]) - reports_file.close() + with h5py.File(filename, 'r') as file: + report_ids = [] + + def append_report_id(name, object): + if isinstance(object, h5py.Dataset): + report_ids.append(name) + + file.visititems(append_report_id) + return report_ids else: diff --git a/biosimulators_utils/report/warnings.py b/biosimulators_utils/report/warnings.py new file mode 100644 index 00000000..eff30358 --- /dev/null +++ b/biosimulators_utils/report/warnings.py @@ -0,0 +1,38 @@ +""" Warnings for reports + +:Author: Jonathan Karr +:Date: 2021-01-21 +:Copyright: 2021, Center for Reproducible Biomedical Modeling +:License: MIT +""" + +from ..warnings import BioSimulatorsWarning + +__all__ = [ + 'RepeatDataSetLabelsWarning', + 'MissingReportMetadataWarning', + 'MissingDataWarning', + 'ExtraDataWarning', +] + + +class RepeatDataSetLabelsWarning(BioSimulatorsWarning): + """ Warning that multiple data sets with a report have the same label """ + pass # pragma: no cover + + +class MissingReportMetadataWarning(BioSimulatorsWarning): + """ Warning that an exported file of a report will not or does contain comprehensive metadata about the report + such as the data type and shape of each data set. + """ + pass # pragma: no cover + + +class MissingDataWarning(BioSimulatorsWarning): + """ Warning that a file does not contain data for one or more data sets of a report. """ + pass # pragma: no cover + + +class ExtraDataWarning(BioSimulatorsWarning): + """ Warning that a file contains additional data that could not be mapped to a data set of a report. """ + pass # pragma: no cover diff --git a/biosimulators_utils/sedml/utils.py b/biosimulators_utils/sedml/utils.py index a0dd0f0b..7a48c154 100644 --- a/biosimulators_utils/sedml/utils.py +++ b/biosimulators_utils/sedml/utils.py @@ -8,6 +8,7 @@ from ..log.data_model import Status from ..report.data_model import VariableResults, DataGeneratorResults # noqa: F401 +from ..utils.core import pad_arrays_to_consistent_shapes from ..warnings import warn from ..xml.utils import get_namespaces_for_xml_doc from .data_model import (SedDocument, Model, ModelChange, ModelAttributeChange, AddElementModelChange, # noqa: F401 @@ -38,7 +39,6 @@ 'calc_compute_model_change_new_value', 'calc_data_generator_results', 'calc_data_generators_results', - 'pad_arrays_to_consistent_shapes', 'compile_math', 'eval_math', 'remove_model_changes', @@ -659,55 +659,6 @@ def calc_data_generators_results(data_generators, variable_results, output, task return results, statuses, exception, task_contributes_to_data_generators -def pad_arrays_to_consistent_shapes(arrays): - """ Pad a list of NumPy arrays to a consistent shape - - Args: - arrays (:obj:`list` of :obj:`numpy.ndarray`): list of NumPy arrays - - Returns: - :obj:`list` of :obj:`numpy.ndarray`: list of padded arrays - """ - shapes = set() - for array in arrays: - if array is not None: - shape = array.shape - if not shape and array.size: - shape = (1,) - shapes.add(shape) - - if len(shapes) > 1: - warn('Arrays do not have consistent shapes', UserWarning) - - max_shape = [] - for shape in shapes: - max_shape = max_shape + [1 if max_shape else 0] * (len(shape) - len(max_shape)) - shape = list(shape) + [1 if shape else 0] * (len(max_shape) - len(shape)) - max_shape = [max(x, y) for x, y in zip(max_shape, shape)] - - padded_arrays = [] - for array in arrays: - if array is None: - array = numpy.full(max_shape, numpy.nan) - - shape = tuple(list(array.shape) - + [1 if array.size else 0] - * (len(max_shape) - array.ndim)) - array = array.reshape(shape) - - pad_width = tuple((0, x - y) for x, y in zip(max_shape, shape)) - - if pad_width: - array = numpy.pad(array, - pad_width, - mode='constant', - constant_values=numpy.nan) - - padded_arrays.append(array) - - return padded_arrays - - def compile_math(math): """ Compile a mathematical expression diff --git a/biosimulators_utils/sedml/warnings.py b/biosimulators_utils/sedml/warnings.py index bdb7125d..839af446 100644 --- a/biosimulators_utils/sedml/warnings.py +++ b/biosimulators_utils/sedml/warnings.py @@ -9,7 +9,6 @@ from ..warnings import BioSimulatorsWarning __all__ = [ - 'RepeatDataSetLabelsWarning', 'IllogicalSedmlWarning', 'InconsistentVariableShapesWarning', 'NoTasksWarning', @@ -21,11 +20,6 @@ ] -class RepeatDataSetLabelsWarning(BioSimulatorsWarning): - """ Warning that multiple data sets with a report have the same label """ - pass # pragma: no cover - - class IllogicalSedmlWarning(BioSimulatorsWarning): """ Warning that a SED document is illogical, such as when a report or plot contains no datasets, curves, or surfaces. diff --git a/biosimulators_utils/utils/core.py b/biosimulators_utils/utils/core.py index 64d9f0b7..5ec24fd5 100644 --- a/biosimulators_utils/utils/core.py +++ b/biosimulators_utils/utils/core.py @@ -7,13 +7,15 @@ """ from ..data_model import ValueType, OntologyTerm +from ..warnings import warn import json +import numpy import re __all__ = [ 'are_lists_equal', 'none_sorted', 'assert_exception', 'validate_value', 'validate_str_value', 'format_value', 'parse_value', - 'patch_dict', + 'patch_dict', 'pad_arrays_to_consistent_shapes', ] @@ -275,3 +277,52 @@ def patch_dict(dictionary, patch): else: props[key] = new_val + + +def pad_arrays_to_consistent_shapes(arrays): + """ Pad a list of NumPy arrays to a consistent shape + + Args: + arrays (:obj:`list` of :obj:`numpy.ndarray`): list of NumPy arrays + + Returns: + :obj:`list` of :obj:`numpy.ndarray`: list of padded arrays + """ + shapes = set() + for array in arrays: + if array is not None: + shape = array.shape + if not shape and array.size: + shape = (1,) + shapes.add(shape) + + if len(shapes) > 1: + warn('Arrays do not have consistent shapes', UserWarning) + + max_shape = [] + for shape in shapes: + max_shape = max_shape + [1 if max_shape else 0] * (len(shape) - len(max_shape)) + shape = list(shape) + [1 if shape else 0] * (len(max_shape) - len(shape)) + max_shape = [max(x, y) for x, y in zip(max_shape, shape)] + + padded_arrays = [] + for array in arrays: + if array is None: + array = numpy.full(max_shape, numpy.nan) + + shape = tuple(list(array.shape) + + [1 if array.size else 0] + * (len(max_shape) - array.ndim)) + array = array.astype('float64').reshape(shape) + + pad_width = tuple((0, x - y) for x, y in zip(max_shape, shape)) + + if pad_width: + array = numpy.pad(array, + pad_width, + mode='constant', + constant_values=numpy.nan) + + padded_arrays.append(array) + + return padded_arrays diff --git a/requirements.txt b/requirements.txt index 23b120b8..5e7f14c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ matplotlib mpmath networkx numpy +openpyxl pandas python_dateutil python_libcombine >= 0.2.9 @@ -13,6 +14,5 @@ python_libsedml >= 2.0.14 pyyaml requests simplejson -tables termcolor yamldown diff --git a/tests/report/test_report_io.py b/tests/report/test_report_io.py index 1a24f762..b185a3f8 100644 --- a/tests/report/test_report_io.py +++ b/tests/report/test_report_io.py @@ -1,9 +1,10 @@ from biosimulators_utils.report import data_model from biosimulators_utils.report import io +from biosimulators_utils.report.warnings import MissingDataWarning, ExtraDataWarning from biosimulators_utils.sedml.data_model import Report, DataSet import numpy +import numpy.testing import os -import pandas import shutil import tempfile import unittest @@ -16,11 +17,266 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.dirname) - def test_write_errors(self): + def test_read_write(self): + report_1 = Report( + id='report_1', + data_sets=[ + DataSet(id='w', label='W'), + DataSet(id='x', label='X'), + DataSet(id='y', label='Y'), + DataSet(id='z', label='Z'), + ], + ) + report_2 = Report( + id='report_2', + data_sets=[ + DataSet(id='a', label='A'), + DataSet(id='b', label='B'), + DataSet(id='c', label='C'), + DataSet(id='d', label='D'), + ], + ) + report_3 = Report( + id='report_3', + data_sets=[ + DataSet(id='a', label='A'), + DataSet(id='b', label='B'), + DataSet(id='c', label='C'), + DataSet(id='d', label='D'), + ], + ) + data_set_results_1 = data_model.DataSetResults({ + 'w': None, + 'x': numpy.array([1, 2, 3]), + 'y': numpy.array([4., numpy.nan]), + 'z': numpy.array(6.), + }) + data_set_results_2 = data_model.DataSetResults({ + 'a': numpy.array([1, 2]), + 'b': numpy.array([7., 8., 9.]), + 'c': numpy.array(True), + 'd': None, + }) + data_set_results_3 = data_model.DataSetResults({ + 'a': numpy.array([[1, 2], [3, 4], [5, 6]]), + 'b': numpy.array([7., 8., 9.]), + 'c': numpy.array(True), + 'd': None, + }) + + # CSV, TSV + for format in [data_model.ReportFormat.csv, data_model.ReportFormat.tsv, data_model.ReportFormat.xlsx]: + rel_path_1 = os.path.join(format.value, 'a/b/c.sedml', report_1.id) + rel_path_2 = os.path.join(format.value, 'a/d.sedml', report_2.id) + rel_path_3 = os.path.join(format.value, 'e.sedml', report_2.id) + + io.ReportWriter().run(report_1, data_set_results_1, self.dirname, rel_path_1, format=format) + io.ReportWriter().run(report_2, data_set_results_2, self.dirname, rel_path_2, format=format) + with self.assertRaisesRegex(ValueError, 'Multidimensional reports cannot be exported'): + io.ReportWriter().run(report_3, data_set_results_3, self.dirname, rel_path_3, format=format) + data_set_results_1_b = io.ReportReader().run(report_1, self.dirname, rel_path_1, format=format) + data_set_results_2_b = io.ReportReader().run(report_2, self.dirname, rel_path_2, format=format) + + self.assertEqual(set(io.ReportReader().get_ids(self.dirname, format=format)), set([rel_path_1, rel_path_2])) + + numpy.testing.assert_allclose(data_set_results_1_b['w'], numpy.array([numpy.nan, numpy.nan, numpy.nan])) + numpy.testing.assert_allclose(data_set_results_1_b['x'], numpy.array([1., 2., 3.])) + numpy.testing.assert_allclose(data_set_results_1_b['y'], numpy.array([4., numpy.nan, numpy.nan])) + numpy.testing.assert_allclose(data_set_results_1_b['z'], numpy.array([6., numpy.nan, numpy.nan])) + + self.assertEqual(data_set_results_1_b['w'].dtype.name, 'float64') + self.assertEqual(data_set_results_1_b['x'].dtype.name, 'float64') + self.assertEqual(data_set_results_1_b['y'].dtype.name, 'float64') + self.assertEqual(data_set_results_1_b['z'].dtype.name, 'float64') + + numpy.testing.assert_allclose(data_set_results_2_b['a'], numpy.array([1., 2., numpy.nan])) + numpy.testing.assert_allclose(data_set_results_2_b['b'], numpy.array([7., 8., 9.])) + numpy.testing.assert_allclose(data_set_results_2_b['c'], numpy.array([1., numpy.nan, numpy.nan])) + numpy.testing.assert_allclose(data_set_results_2_b['d'], numpy.array([numpy.nan, numpy.nan, numpy.nan])) + + self.assertEqual(data_set_results_2_b['a'].dtype.name, 'float64') + self.assertEqual(data_set_results_2_b['b'].dtype.name, 'float64') + self.assertEqual(data_set_results_2_b['c'].dtype.name, 'float64') + self.assertEqual(data_set_results_2_b['d'].dtype.name, 'float64') + + # HDF + for format in [data_model.ReportFormat.h5]: + rel_path_1 = os.path.join(format.value, 'a/b/c.sedml', report_1.id) + rel_path_2 = os.path.join(format.value, 'a/d.sedml', report_2.id) + rel_path_3 = os.path.join(format.value, 'e.sedml', report_2.id) + + io.ReportWriter().run(report_1, data_set_results_1, self.dirname, rel_path_1, format=format) + io.ReportWriter().run(report_2, data_set_results_2, self.dirname, rel_path_2, format=format) + io.ReportWriter().run(report_3, data_set_results_3, self.dirname, rel_path_3, format=format) + data_set_results_1_b = io.ReportReader().run(report_1, self.dirname, rel_path_1, format=format) + data_set_results_2_b = io.ReportReader().run(report_2, self.dirname, rel_path_2, format=format) + data_set_results_3_b = io.ReportReader().run(report_3, self.dirname, rel_path_3, format=format) + + self.assertEqual(set(io.ReportReader().get_ids(self.dirname, format=format)), set([rel_path_1, rel_path_2, rel_path_3])) + + self.assertEqual(data_set_results_1_b['w'], None) + numpy.testing.assert_allclose(data_set_results_1_b['x'], numpy.array([1, 2, 3])) + numpy.testing.assert_allclose(data_set_results_1_b['y'], numpy.array([4., numpy.nan])) + numpy.testing.assert_allclose(data_set_results_1_b['z'], numpy.array(6.)) + + self.assertEqual(data_set_results_1_b['x'].dtype.name, 'int64') + self.assertEqual(data_set_results_1_b['y'].dtype.name, 'float64') + self.assertEqual(data_set_results_1_b['z'].dtype.name, 'float64') + + numpy.testing.assert_allclose(data_set_results_2_b['a'], numpy.array([1, 2])) + numpy.testing.assert_allclose(data_set_results_2_b['b'], numpy.array([7., 8., 9.])) + numpy.testing.assert_allclose(data_set_results_2_b['c'], numpy.array(True)) + self.assertEqual(data_set_results_2_b['d'], None) + + self.assertEqual(data_set_results_2_b['a'].dtype.name, 'int64') + self.assertEqual(data_set_results_2_b['b'].dtype.name, 'float64') + self.assertEqual(data_set_results_2_b['c'].dtype.name, 'bool') + + numpy.testing.assert_allclose(data_set_results_3_b['a'], numpy.array([[1, 2], [3, 4], [5, 6]])) + numpy.testing.assert_allclose(data_set_results_3_b['b'], numpy.array([7., 8., 9.])) + numpy.testing.assert_allclose(data_set_results_3_b['c'], numpy.array(True)) + self.assertEqual(data_set_results_3_b['d'], None) + + self.assertEqual(data_set_results_3_b['a'].dtype.name, 'int64') + self.assertEqual(data_set_results_3_b['b'].dtype.name, 'float64') + self.assertEqual(data_set_results_3_b['c'].dtype.name, 'bool') + + def test_read_write_warnings(self): + report_1 = Report( + id='report_1', + data_sets=[ + DataSet(id='x', label='X'), + DataSet(id='y', label='Y'), + DataSet(id='z', label='Z'), + ], + ) + data_set_results_1 = data_model.DataSetResults({ + 'x': numpy.array([1., 2.]), + 'y': numpy.array([3., 4.]), + 'z': numpy.array([5., 6.]), + }) + + rel_path_1 = os.path.join('a/b/c.sedml', report_1.id) + + io.ReportWriter().run(report_1, data_set_results_1, self.dirname, rel_path_1, format=data_model.ReportFormat.h5) + + report_1.data_sets.append(DataSet(id='w', label='W')) + with self.assertWarns(MissingDataWarning): + io.ReportReader().run(report_1, self.dirname, rel_path_1, format=data_model.ReportFormat.h5) + + report_1.data_sets.pop() + report_1.data_sets.pop() + with self.assertWarns(ExtraDataWarning): + io.ReportReader().run(report_1, self.dirname, rel_path_1, format=data_model.ReportFormat.h5) + + def test_read_write_duplicate_labels(self): + # labels in same order + report_1 = Report( + id='report_1', + data_sets=[ + DataSet(id='x', label='A'), + DataSet(id='y', label='A'), + DataSet(id='z', label='A'), + ], + ) + data_set_results_1 = data_model.DataSetResults({ + 'x': numpy.array([1., 2.]), + 'y': numpy.array([3., 4.]), + 'z': numpy.array([5., 6.]), + }) + + rel_path_1 = os.path.join('a/b/c.sedml', report_1.id) + + io.ReportWriter().run(report_1, data_set_results_1, self.dirname, rel_path_1, format=data_model.ReportFormat.csv) + data_set_results_2 = io.ReportReader().run(report_1, self.dirname, rel_path_1, format=data_model.ReportFormat.csv) + + numpy.testing.assert_allclose(data_set_results_2['x'], numpy.array([1., 2.])) + numpy.testing.assert_allclose(data_set_results_2['y'], numpy.array([3., 4.])) + numpy.testing.assert_allclose(data_set_results_2['z'], numpy.array([5., 6.])) + + # labels in different order + report_1 = Report( + id='report_1', + data_sets=[ + DataSet(id='x', label='X'), + DataSet(id='y', label='X'), + DataSet(id='z', label='Z'), + ], + ) + data_set_results_1 = data_model.DataSetResults({ + 'x': numpy.array([1., 2.]), + 'y': numpy.array([3., 4.]), + 'z': numpy.array([5., 6.]), + }) + + rel_path_1 = os.path.join('a/b/c.sedml', report_1.id) + + io.ReportWriter().run(report_1, data_set_results_1, self.dirname, rel_path_1, format=data_model.ReportFormat.csv) + + report_2 = Report( + id='report_1', + data_sets=[ + DataSet(id='x', label='X'), + DataSet(id='z', label='Z'), + DataSet(id='y', label='X'), + ], + ) + data_set_results_2 = io.ReportReader().run(report_2, self.dirname, rel_path_1, format=data_model.ReportFormat.csv) + + self.assertEqual(set(data_set_results_2.keys()), set(['z'])) + numpy.testing.assert_allclose(data_set_results_2['z'], numpy.array([5., 6.])) + + def test_overwrite_report(self): + report_1 = Report( + id='report_1', + data_sets=[ + DataSet(id='x', label='X'), + DataSet(id='y', label='Y'), + DataSet(id='z', label='Z'), + ], + ) + data_set_results_1 = data_model.DataSetResults({ + 'x': numpy.array([1., 2.]), + 'y': numpy.array([3., 4.]), + 'z': numpy.array([5., 6.]), + }) + + rel_path_1 = os.path.join('a/b/c.sedml', report_1.id) + + io.ReportWriter().run(report_1, data_set_results_1, self.dirname, rel_path_1, format=data_model.ReportFormat.h5) + data_set_results_2 = io.ReportReader().run(report_1, self.dirname, rel_path_1, format=data_model.ReportFormat.h5) + + numpy.testing.assert_allclose(data_set_results_2['x'], numpy.array([1., 2.])) + numpy.testing.assert_allclose(data_set_results_2['y'], numpy.array([3., 4.])) + numpy.testing.assert_allclose(data_set_results_2['z'], numpy.array([5., 6.])) + + data_set_results_1 = data_model.DataSetResults({ + 'x': numpy.array([1., 2.]) + 1., + 'y': numpy.array([3., 4.]) + 1., + 'z': numpy.array([5., 6.]) + 1., + }) + + io.ReportWriter().run(report_1, data_set_results_1, self.dirname, rel_path_1, format=data_model.ReportFormat.h5) + data_set_results_2 = io.ReportReader().run(report_1, self.dirname, rel_path_1, format=data_model.ReportFormat.h5) + + numpy.testing.assert_allclose(data_set_results_2['x'], numpy.array([1., 2.]) + 1.) + numpy.testing.assert_allclose(data_set_results_2['y'], numpy.array([3., 4.]) + 1.) + numpy.testing.assert_allclose(data_set_results_2['z'], numpy.array([5., 6.]) + 1.) + + def test_write_error_handling(self): with self.assertRaisesRegex(NotImplementedError, 'is not supported'): io.ReportWriter().run(Report(), None, None, None, format='TSV') - def test_read_errors(self): + report = Report(data_sets=[DataSet(id='x', label='x')]) + + data_set_results = data_model.DataSetResults({'x': numpy.zeros((3, ))}) + io.ReportWriter().run(report, data_set_results, self.dirname, '.', format=data_model.ReportFormat.csv) + + data_set_results['x'] = data_set_results['x'].reshape((3, 1)) + with self.assertRaisesRegex(ValueError, 'Multidimensional reports cannot be exported'): + io.ReportWriter().run(report, data_set_results, self.dirname, '.', format=data_model.ReportFormat.csv) + + def test_read_error_handling(self): with self.assertRaisesRegex(NotImplementedError, 'is not supported'): io.ReportReader().run(Report(), None, None, format='TSV') @@ -46,7 +302,7 @@ def test_get_ids(self): io.ReportWriter().run(report, results, filename, 'a/b/report5', format=format) io.ReportWriter().run(report, results, filename, 'a/b/report6', format=format) - self.assertEqual(io.ReportReader().get_ids(filename, format=format), set([ + self.assertEqual(set(io.ReportReader().get_ids(filename, format=format)), set([ 'a/b/c.sedml/report1', 'a/b/c.sedml/report2', 'a/b/c.sedml/report3', diff --git a/tests/sedml/test_sedml_exec.py b/tests/sedml/test_sedml_exec.py index 5eb0f1ba..5a83f56d 100644 --- a/tests/sedml/test_sedml_exec.py +++ b/tests/sedml/test_sedml_exec.py @@ -5,19 +5,17 @@ from biosimulators_utils.plot.data_model import PlotFormat from biosimulators_utils.report.data_model import VariableResults, DataSetResults, ReportResults, ReportFormat from biosimulators_utils.report.io import ReportReader +from biosimulators_utils.report.warnings import RepeatDataSetLabelsWarning from biosimulators_utils.sedml import data_model from biosimulators_utils.sedml import exec from biosimulators_utils.sedml import io from biosimulators_utils.sedml.exceptions import SedmlExecutionError -from biosimulators_utils.sedml.warnings import (NoTasksWarning, NoOutputsWarning, RepeatDataSetLabelsWarning, - SedmlFeatureNotSupportedWarning, InconsistentVariableShapesWarning) +from biosimulators_utils.sedml.warnings import NoTasksWarning, NoOutputsWarning, InconsistentVariableShapesWarning from lxml import etree from unittest import mock import numpy import numpy.testing import os -import pandas -import requests import shutil import tempfile import unittest @@ -910,19 +908,21 @@ def execute_task(task, variables, log): pass out_dir = os.path.join(self.tmp_dir, 'results2') - # TODO: remove once multidimensional results supported - with self.assertRaisesRegex(SedmlExecutionError, 'Must pass 2-d input'): + with self.assertRaisesRegex(SedmlExecutionError, 'Multidimensional reports cannot be exported to CSV'): with self.assertWarnsRegex(UserWarning, 'do not have consistent shapes'): - report_results, _ = exec.exec_sed_doc(execute_task, doc, working_dir, out_dir) - # numpy.testing.assert_equal(report_results[doc.outputs[0].id][doc.outputs[0].data_sets[0].id], - # numpy.array(((1., numpy.nan, numpy.nan), (numpy.nan, numpy.nan, numpy.nan), (numpy.nan, numpy.nan, numpy.nan)))) - # numpy.testing.assert_equal(report_results[doc.outputs[0].id][doc.outputs[0].data_sets[1].id], - # numpy.array(((1., 2., numpy.nan), (numpy.nan, numpy.nan, numpy.nan), (numpy.nan, numpy.nan, numpy.nan)))) - # numpy.testing.assert_equal(report_results[doc.outputs[0].id][doc.outputs[0].data_sets[2].id], - # numpy.array(((1., 2., 3.), (4., 5., 6.), (7., 8., 9.)))) + exec.exec_sed_doc(execute_task, doc, working_dir, out_dir) + + with self.assertWarnsRegex(UserWarning, 'do not have consistent shapes'): + report_results, _ = exec.exec_sed_doc(execute_task, doc, working_dir, out_dir, report_formats=[ReportFormat.h5]) + numpy.testing.assert_equal(report_results[doc.outputs[0].id][doc.outputs[0].data_sets[0].id], + numpy.array((1.,))) + numpy.testing.assert_equal(report_results[doc.outputs[0].id][doc.outputs[0].data_sets[1].id], + numpy.array((1., 2.))) + numpy.testing.assert_equal(report_results[doc.outputs[0].id][doc.outputs[0].data_sets[2].id], + numpy.array(((1., 2., 3.), (4., 5., 6.), (7., 8., 9.)))) # warning: data set labels are not unique - doc.data_generators = [ + doc.data_generators=[ data_model.DataGenerator( id='data_gen_1', variables=[ @@ -947,7 +947,7 @@ def execute_task(task, variables, log): ), ] - doc.outputs = [ + doc.outputs=[ data_model.Report( id='report_1', data_sets=[ @@ -966,38 +966,38 @@ def execute_task(task, variables, log): ] def execute_task(task, variables, log): - results = VariableResults() - results[doc.data_generators[0].variables[0].id] = numpy.array((1., 2.)) - results[doc.data_generators[1].variables[0].id] = numpy.array((2., 3.)) + results=VariableResults() + results[doc.data_generators[0].variables[0].id]=numpy.array((1., 2.)) + results[doc.data_generators[1].variables[0].id]=numpy.array((2., 3.)) return results, log - working_dir = self.tmp_dir + working_dir=self.tmp_dir with open(os.path.join(working_dir, doc.models[0].source), 'w'): pass - out_dir = os.path.join(self.tmp_dir, 'results') - with self.assertWarnsRegex(RepeatDataSetLabelsWarning, 'should have unique ids'): + out_dir=os.path.join(self.tmp_dir, 'results') + with self.assertWarnsRegex(RepeatDataSetLabelsWarning, 'should have unique labels'): exec.exec_sed_doc(execute_task, doc, working_dir, out_dir) # error: unsupported outputs - doc.outputs = [ + doc.outputs=[ mock.Mock(id='unsupported') ] - working_dir = self.tmp_dir + working_dir=self.tmp_dir with open(os.path.join(working_dir, doc.models[0].source), 'w'): pass - log = SedDocumentLog(tasks={}, outputs={}) + log=SedDocumentLog(tasks={}, outputs={}) for task in doc.tasks: - log.tasks[task.id] = TaskLog(parent=log) + log.tasks[task.id]=TaskLog(parent=log) for output in doc.outputs: - log.outputs[output.id] = ReportLog(parent=log) + log.outputs[output.id]=ReportLog(parent=log) with self.assertRaisesRegex(SedmlExecutionError, 'are not supported'): exec.exec_sed_doc(execute_task, doc, working_dir, out_dir, log=log) def test_2d_plot(self): - doc = data_model.SedDocument() + doc=data_model.SedDocument() doc.models.append(data_model.Model( id='model', @@ -1082,21 +1082,21 @@ def test_2d_plot(self): ], )) - filename = os.path.join(self.tmp_dir, 'test.sedml') + filename=os.path.join(self.tmp_dir, 'test.sedml') io.SedmlSimulationWriter().run(doc, filename) def execute_task(task, variables, log=None): - results = VariableResults() - results[doc.data_generators[0].variables[0].id] = numpy.linspace(0., 10., 10 + 1) - results[doc.data_generators[1].variables[0].id] = 2 * results[doc.data_generators[0].variables[0].id] + results=VariableResults() + results[doc.data_generators[0].variables[0].id]=numpy.linspace(0., 10., 10 + 1) + results[doc.data_generators[1].variables[0].id]=2 * results[doc.data_generators[0].variables[0].id] return results, log - working_dir = os.path.dirname(filename) + working_dir=os.path.dirname(filename) with open(os.path.join(working_dir, doc.models[0].source), 'w'): pass - out_dir = os.path.join(self.tmp_dir, 'results') - _, log = exec.exec_sed_doc(execute_task, filename, working_dir, + out_dir=os.path.join(self.tmp_dir, 'results') + _, log=exec.exec_sed_doc(execute_task, filename, working_dir, out_dir, plot_formats=[PlotFormat.pdf]) self.assertTrue(os.path.isfile(os.path.join(out_dir, 'plot_2d_1.pdf'))) @@ -1135,9 +1135,9 @@ def execute_task(task, variables, log=None): os.remove(os.path.join(out_dir, 'plot_2d_2.pdf')) # error with a curve - doc.data_generators[0].math = 'time * var' + doc.data_generators[0].math='time * var' io.SedmlSimulationWriter().run(doc, filename) - log = init_sed_document_log(doc) + log=init_sed_document_log(doc) with self.assertRaisesRegex(SedmlExecutionError, "name 'var' is not defined"): exec.exec_sed_doc(execute_task, filename, working_dir, out_dir, log=log, plot_formats=[PlotFormat.pdf]) @@ -1176,14 +1176,14 @@ def execute_task(task, variables, log=None): # error with a task def execute_task(task, variables, log=None): - results = VariableResults() - results[doc.data_generators[0].variables[0].id] = None - results[doc.data_generators[1].variables[0].id] = 2 * numpy.linspace(0., 10., 10 + 1) + results=VariableResults() + results[doc.data_generators[0].variables[0].id]=None + results[doc.data_generators[1].variables[0].id]=2 * numpy.linspace(0., 10., 10 + 1) return results, log - doc.data_generators[0].math = 'time' + doc.data_generators[0].math='time' io.SedmlSimulationWriter().run(doc, filename) - log = init_sed_document_log(doc) + log=init_sed_document_log(doc) with self.assertRaisesRegex(SedmlExecutionError, "Some generators could not be produced:"): exec.exec_sed_doc(execute_task, filename, working_dir, out_dir, log=log, plot_formats=[PlotFormat.pdf]) @@ -1221,7 +1221,7 @@ def execute_task(task, variables, log=None): ) def test_3d_plot(self): - doc = data_model.SedDocument() + doc=data_model.SedDocument() doc.models.append(data_model.Model( id='model', @@ -1312,23 +1312,23 @@ def test_3d_plot(self): ], )) - filename = os.path.join(self.tmp_dir, 'test.sedml') + filename=os.path.join(self.tmp_dir, 'test.sedml') io.SedmlSimulationWriter().run(doc, filename) def execute_task(task, variables, log=None): - results = VariableResults() - x = numpy.arange(-5, 5, 0.25) - x, _ = numpy.meshgrid(x, x) - results[doc.data_generators[0].variables[0].id] = x - results[doc.data_generators[1].variables[0].id] = x + results=VariableResults() + x=numpy.arange(-5, 5, 0.25) + x, _=numpy.meshgrid(x, x) + results[doc.data_generators[0].variables[0].id]=x + results[doc.data_generators[1].variables[0].id]=x return results, log - working_dir = os.path.dirname(filename) + working_dir=os.path.dirname(filename) with open(os.path.join(working_dir, doc.models[0].source), 'w'): pass - out_dir = os.path.join(self.tmp_dir, 'results') - _, log = exec.exec_sed_doc(execute_task, filename, working_dir, + out_dir=os.path.join(self.tmp_dir, 'results') + _, log=exec.exec_sed_doc(execute_task, filename, working_dir, out_dir, plot_formats=[PlotFormat.pdf]) self.assertTrue(os.path.isfile(os.path.join(out_dir, 'plot_3d_1.pdf'))) @@ -1367,9 +1367,9 @@ def execute_task(task, variables, log=None): os.remove(os.path.join(out_dir, 'plot_3d_2.pdf')) # error with a surface - doc.data_generators[0].math = 'time * var' + doc.data_generators[0].math='time * var' io.SedmlSimulationWriter().run(doc, filename) - log = init_sed_document_log(doc) + log=init_sed_document_log(doc) with self.assertRaisesRegex(SedmlExecutionError, "name 'var' is not defined"): exec.exec_sed_doc(execute_task, filename, working_dir, out_dir, log=log, plot_formats=[PlotFormat.pdf]) diff --git a/tests/sedml/test_sedml_utils.py b/tests/sedml/test_sedml_utils.py index 836b4653..42280c02 100644 --- a/tests/sedml/test_sedml_utils.py +++ b/tests/sedml/test_sedml_utils.py @@ -448,7 +448,6 @@ def test_add_multiple_elements_to_multiple_targets(self): species = et.xpath("/sbml:sbml/sbml:model/sbml:listOfSpecies/sbml:species", namespaces=namespaces) parameters = et.xpath("/sbml:sbml/sbml:model/sbml:listOfSpecies/sbml:species/sbml:parameter", namespaces=namespaces) species_ids = [s.get('id') for s in species] - parameter_ids = [p.get('id') for p in parameters] # apply changes et = etree.parse(self.FIXTURE_FILENAME) @@ -656,7 +655,6 @@ def test_apply_compute_model_change_new_value(self): self.assertEqual(utils.get_value_of_variable_model_xml_targets(change.variables[0], models), 2.0) self.assertEqual(utils.get_value_of_variable_model_xml_targets(change.variables[1], models), 3.0) - model = data_model.Model(changes=[change]) doc = data_model.SedDocument(models=[change.variables[0].model, change.variables[1].model]) change.variables[0].model.source = 'https://models.com/model_1.xml' @@ -674,7 +672,7 @@ def test_apply_compute_model_change_new_value(self): # calc new value variable_values = {} with self.assertRaisesRegex(ValueError, 'is not defined'): - self.assertEqual(utils.calc_compute_model_change_new_value(change, variable_values), expected_value) + utils.calc_compute_model_change_new_value(change, variable_values) variable_values = { 'x': 2.,