Skip to content

Commit

Permalink
refactoring to support multi-dimensional reports
Browse files Browse the repository at this point in the history
  • Loading branch information
jonrkarr committed Jan 21, 2021
1 parent 68858db commit e09facd
Show file tree
Hide file tree
Showing 8 changed files with 229 additions and 166 deletions.
25 changes: 14 additions & 11 deletions biosimulators_utils/report/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import enum


__all__ = ['DataGeneratorResults', 'VariableResults', 'ReportResults', 'ReportFormat']
__all__ = ['VariableResults', 'DataGeneratorResults', 'DataSetResults', 'ReportResults', 'ReportFormat']


class VariableResults(dict):
Expand Down Expand Up @@ -38,21 +38,24 @@ class DataGeneratorResults(dict):
pass


class ReportResults(dict):
""" Dictionary that maps the ids of reports (e.g., :obj:`Report`) to their results (:obj:`pandas.DataFrame`)
class DataSetResults(dict):
""" Dictionary that maps the ids of data sets to their results (:obj:`numpy.ndarray`)
* Keys (:obj:`str`): ids of reports (e.g., :obj:`Report`)
* Values (:obj:`pandas.DataFrame`): result of each reports
* Keys (:obj:`str`): ids of data sets
* Values (:obj:`numpy.ndarray`): result of each data set
* Data:
* Steady-state tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 1)``
* One-step tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 2)``
* Uniform time course tasks of non-spatial models: results should be arrays of shape ``(number_of_points + 1)``
"""
pass

* Steady-state tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 1)``
* One-step tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 2)``
* Uniform time course tasks of non-spatial models: results should be arrays of shape ``(number of data sets, number_of_points + 1)``

* Indices (row labels)
class ReportResults(dict):
""" Dictionary that maps the ids of reports (e.g., :obj:`Report`) to their results (:obj:`DataSetResults`)
* Reports: equal to the ids of the data sets if each report
* Keys (:obj:`str`): ids of reports (e.g., :obj:`Report`)
* Values (:obj:`DataSetResults`): result of each report
"""
pass

Expand Down
87 changes: 68 additions & 19 deletions biosimulators_utils/report/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@
"""

from ..config import get_config
from .data_model import ReportFormat
from ..sedml.data_model import Report # noqa: F401
from ..sedml.utils import pad_arrays_to_consistent_shapes
from ..sedml.warnings import RepeatDataSetLabelsWarning
from ..warnings import warn
from .data_model import DataSetResults, ReportFormat
import glob
import numpy
import os
import pandas
import tables
Expand All @@ -23,11 +28,12 @@
class ReportWriter(object):
""" Class for writing reports of simulation results """

def run(self, results, base_path, rel_path, format=ReportFormat.h5):
def run(self, report, results, base_path, rel_path, format=ReportFormat.h5):
""" Save a report
Args:
results (:obj:`pandas.DataFrame`): report results
report (:obj:`Report`): report
results (:obj:`DataSetResults`): results of the data sets
base_path (:obj:`str`): path to save results
* CSV: parent directory to save results
Expand All @@ -40,14 +46,27 @@ def run(self, results, base_path, rel_path, format=ReportFormat.h5):
format (:obj:`ReportFormat`, optional): report format
"""
data_set_labels = [data_set.label for data_set in report.data_sets]
if len(set(data_set_labels)) < len(report.data_sets):
warn('To facilitate machine interpretation, data sets should have unique ids.',
RepeatDataSetLabelsWarning)

results_list = []
data_set_labels = []
for data_set in report.data_sets:
if data_set.id in results:
results_list.append(results[data_set.id])
data_set_labels.append(data_set.label)
results_list = pad_arrays_to_consistent_shapes(results_list)
results_df = pandas.DataFrame(numpy.array(results_list), index=data_set_labels)

if format == ReportFormat.csv:
filename = os.path.join(base_path, rel_path + '.' + format.value)
out_dir = os.path.dirname(filename)
if not os.path.isdir(out_dir):
os.makedirs(out_dir)
results.to_csv(filename,
header=False)
results_df.to_csv(filename,
header=False)

elif format == ReportFormat.h5:
filename = os.path.join(base_path, get_config().H5_REPORTS_PATH)
Expand All @@ -56,14 +75,14 @@ def run(self, results, base_path, rel_path, format=ReportFormat.h5):

with warnings.catch_warnings():
warnings.simplefilter("ignore", tables.NaturalNameWarning)
results.to_hdf(filename,
key=rel_path,
format='table',
complevel=9,
complib='zlib',
mode='a',
append=False,
)
results_df.to_hdf(filename,
key=rel_path,
format='table',
complevel=9,
complib='zlib',
mode='a',
append=False,
)

else:
raise NotImplementedError('Report format {} is not supported'.format(format))
Expand All @@ -72,10 +91,11 @@ def run(self, results, base_path, rel_path, format=ReportFormat.h5):
class ReportReader(object):
""" Class for reading reports of simulation results """

def run(self, base_path, rel_path, format=ReportFormat.h5):
def run(self, report, base_path, rel_path, format=ReportFormat.h5):
""" Read a report for a file
Args:
report (:obj:`Report`): report
base_path (:obj:`str`): path to read results from
* CSV: parent directory to read results from
Expand All @@ -89,25 +109,54 @@ def run(self, base_path, rel_path, format=ReportFormat.h5):
format (:obj:`ReportFormat`, optional): report format
Returns:
:obj:`pandas.DataFrame`: report results
:obj:`DataSetResults`: report results
"""
if format == ReportFormat.csv:
filename = os.path.join(base_path, rel_path + '.' + format.value)
df = pandas.read_csv(filename,
index_col=0,
header=None)
df.columns = pandas.RangeIndex(start=0, stop=df.shape[1], step=1)
return df

elif format == ReportFormat.h5:
filename = os.path.join(base_path, get_config().H5_REPORTS_PATH)
return pandas.read_hdf(filename,
key=rel_path,
)
df = pandas.read_hdf(filename,
key=rel_path,
)

else:
raise NotImplementedError('Report format {} is not supported'.format(format))

results = DataSetResults()

data_set_labels = [data_set.label for data_set in report.data_sets]
unreadable_data_sets = []
if df.index.tolist() == data_set_labels:
for data_set in report.data_sets:
results[data_set.id] = df.loc[data_set.label, :]

else:
data_set_label_to_index = {}
for i_data_set, data_set_label in enumerate(df.index):
if data_set_label not in data_set_label_to_index:
data_set_label_to_index[data_set_label] = i_data_set
else:
data_set_label_to_index[data_set_label] = None

for data_set in report.data_sets:
i_data_set = data_set_label_to_index.get(data_set.label, None)
if i_data_set is None:
# results[data_set.id] = None
unreadable_data_sets.append(data_set.id)
else:
results[data_set.id] = df.loc[data_set.label, :]

if unreadable_data_sets:
warn('Some data sets could not be read because their labels are not unique:\n - {}'.format(
'\n'.join('`' + id + '`' for id in sorted(unreadable_data_sets))), RepeatDataSetLabelsWarning)

return results

def get_ids(self, base_path, format=ReportFormat.h5):
""" Get the ids of the reports in a file
Expand Down
60 changes: 19 additions & 41 deletions biosimulators_utils/sedml/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,18 @@
from ..log.utils import init_sed_document_log
from ..plot.data_model import PlotFormat
from ..plot.io import write_plot_2d, write_plot_3d
from ..report.data_model import VariableResults, ReportResults, ReportFormat
from ..report.data_model import VariableResults, DataSetResults, ReportResults, ReportFormat # noqa: F401
from ..report.io import ReportWriter
from ..warnings import warn
from .data_model import SedDocument, Task, Report, Plot2D, Plot3D
from .exceptions import SedmlExecutionError
from .warnings import RepeatDataSetLabelsWarning
from .io import SedmlSimulationReader
from .utils import resolve_model_and_apply_xml_changes, get_variables_for_task, calc_data_generators_results
from .warnings import NoTasksWarning, NoOutputsWarning
import capturer
import copy
import datetime
import numpy
import os
import pandas
import sys
import termcolor
import types # noqa: F401
Expand Down Expand Up @@ -307,7 +304,7 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
Returns:
:obj:`tuple`:
* :obj:`pandas.DataFrame`: report
* :obj:`DataSetResults`: report
* :obj:`Status`: status
* :obj:`Exception`: exception for failure
* :obj:`bool`: whether :obj:`task` contributes a variable to the report
Expand All @@ -318,21 +315,18 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
data_generators.add(data_set.data_generator)

data_gen_results, data_gen_statuses, data_gen_exceptions, task_contributes_to_report = calc_data_generators_results(
data_generators, variable_results, report, task)
data_generators, variable_results, report, task, make_shapes_consistent=False)

# collect data sets
dataset_labels = []
dataset_results = []
data_set_results = {}

running = False
succeeded = True
failed = False

for data_set in report.data_sets:
dataset_labels.append(data_set.label)

data_gen_res = data_gen_results[data_set.data_generator.id]
dataset_results.append(data_gen_res)
data_set_results[data_set.id] = data_gen_res

data_gen_status = data_gen_statuses[data_set.data_generator.id]
log.data_sets[data_set.id] = data_gen_status
Expand All @@ -343,13 +337,9 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
else:
succeeded = False

if len(set(dataset_labels)) < len(dataset_labels):
warn('To facilitate machine interpretation, data sets should have unique ids.',
RepeatDataSetLabelsWarning)

output_df = pandas.DataFrame(numpy.array(dataset_results), index=dataset_labels)
for format in formats:
ReportWriter().run(output_df,
ReportWriter().run(report,
data_set_results,
base_out_path,
os.path.join(rel_out_path, report.id) if rel_out_path else report.id,
format=format)
Expand All @@ -366,7 +356,7 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
else:
status = Status.QUEUED

return output_df, status, data_gen_exceptions, task_contributes_to_report
return data_set_results, status, data_gen_exceptions, task_contributes_to_report


def exec_plot_2d(plot, variable_results, base_out_path, rel_out_path, formats, task, log):
Expand All @@ -375,25 +365,19 @@ def exec_plot_2d(plot, variable_results, base_out_path, rel_out_path, formats, t
Args:
plot (:obj:`Plot2D`): plot
variable_results (:obj:`VariableResults`): result of each data generator
base_out_path (:obj:`str`): path to store the outputs
* CSV: directory in which to save outputs to files
``{base_out_path}/{rel_out_path}/{report.id}.csv``
* HDF5: directory in which to save a single HDF5 file (``{base_out_path}/reports.h5``),
with reports at keys ``{rel_out_path}/{report.id}`` within the HDF5 file
rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the outputs
base_out_path (:obj:`str`): base path to store the plot. Complete path is
``{base_out_path}/{rel_out_path}/{plot.id}.pdf``
rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the plot
formats (:obj:`list` of :obj:`PlotFormat`, optional): plot format (e.g., pdf)
task (:obj:`Task`): task
log (:obj:`ReportLog`, optional): log of report
log (:obj:`ReportLog`, optional): log of plot
Returns:
:obj:`tuple`:
* :obj:`pandas.DataFrame`: results of data generators
* :obj:`Status`: status
* :obj:`Exception`: exception for failure
* :obj:`bool`: whether :obj:`task` contribute a variable to the report
* :obj:`bool`: whether :obj:`task` contributes a variable to the plot
"""
# calculate data generators
data_generators = set()
Expand Down Expand Up @@ -456,30 +440,24 @@ def exec_plot_2d(plot, variable_results, base_out_path, rel_out_path, formats, t


def exec_plot_3d(plot, variable_results, base_out_path, rel_out_path, formats, task, log):
""" Execute a 3D plot, generating the curves which are available
""" Execute a 3D plot, generating the surfaces which are available
Args:
plot (:obj:`Plot3D`): plot
variable_results (:obj:`VariableResults`): result of each data generator
base_out_path (:obj:`str`): path to store the outputs
* CSV: directory in which to save outputs to files
``{base_out_path}/{rel_out_path}/{report.id}.csv``
* HDF5: directory in which to save a single HDF5 file (``{base_out_path}/reports.h5``),
with reports at keys ``{rel_out_path}/{report.id}`` within the HDF5 file
rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the outputs
base_out_path (:obj:`str`): base path to store the plot. Complete path is
``{base_out_path}/{rel_out_path}/{plot.id}.pdf``
rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the plot
formats (:obj:`list` of :obj:`PlotFormat`, optional): plot format (e.g., pdf)
task (:obj:`Task`): task
log (:obj:`ReportLog`, optional): log of report
log (:obj:`ReportLog`, optional): log of plot
Returns:
:obj:`tuple`:
* :obj:`pandas.DataFrame`: results of data generators
* :obj:`Status`: status
* :obj:`Exception`: exception for failure
* :obj:`bool`: whether :obj:`task` contribute a variable to the report
* :obj:`bool`: whether :obj:`task` contributes a variable to the plot
"""
# calculate data generators
data_generators = set()
Expand Down
2 changes: 1 addition & 1 deletion biosimulators_utils/sedml/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,7 +1076,7 @@ def _read_variables(self, obj_sed, id_to_model_map, id_to_task_map):
var.symbol = var_sed.getSymbol() or None
var.target = var_sed.getTarget() or None

if var.target.startswith('#'):
if var.target and var.target.startswith('#'):
raise NotImplementedError('Variable targets to data descriptions are not supported.')

self._deserialize_reference(var_sed, var, 'task', 'Task', 'task', id_to_task_map)
Expand Down
Loading

0 comments on commit e09facd

Please sign in to comment.