refactoring to support multi-dimensional reports

biosimulators · Jan 21, 2021 · e09facd · e09facd
1 parent 68858db
commit e09facd
Show file tree

Hide file tree

Showing 8 changed files with 229 additions and 166 deletions.
diff --git a/biosimulators_utils/report/data_model.py b/biosimulators_utils/report/data_model.py
@@ -9,7 +9,7 @@
 import enum
 
 
-__all__ = ['DataGeneratorResults', 'VariableResults', 'ReportResults', 'ReportFormat']
+__all__ = ['VariableResults', 'DataGeneratorResults', 'DataSetResults', 'ReportResults', 'ReportFormat']
 
 
 class VariableResults(dict):
@@ -38,21 +38,24 @@ class DataGeneratorResults(dict):
     pass
 
 
-class ReportResults(dict):
-    """ Dictionary that maps the ids of reports (e.g., :obj:`Report`) to their results (:obj:`pandas.DataFrame`)
+class DataSetResults(dict):
+    """ Dictionary that maps the ids of data sets to their results (:obj:`numpy.ndarray`)
 
-    * Keys (:obj:`str`): ids of reports (e.g., :obj:`Report`)
-    * Values (:obj:`pandas.DataFrame`): result of each reports
+    * Keys (:obj:`str`): ids of data sets
+    * Values (:obj:`numpy.ndarray`): result of each data set
 
-        * Data:
+        * Steady-state tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 1)``
+        * One-step tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 2)``
+        * Uniform time course tasks of non-spatial models: results should be arrays of shape ``(number_of_points + 1)``
+    """
+    pass
 
-            * Steady-state tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 1)``
-            * One-step tasks of non-spatial models: results should be arrays of shape ``(number of data sets, 2)``
-            * Uniform time course tasks of non-spatial models: results should be arrays of shape ``(number of data sets, number_of_points + 1)``
 
-        * Indices (row labels)
+class ReportResults(dict):
+    """ Dictionary that maps the ids of reports (e.g., :obj:`Report`) to their results (:obj:`DataSetResults`)
 
-            * Reports: equal to the ids of the data sets if each report
+    * Keys (:obj:`str`): ids of reports (e.g., :obj:`Report`)
+    * Values (:obj:`DataSetResults`): result of each report
     """
     pass
 

diff --git a/biosimulators_utils/report/io.py b/biosimulators_utils/report/io.py
@@ -7,8 +7,13 @@
 """
 
 from ..config import get_config
-from .data_model import ReportFormat
+from ..sedml.data_model import Report  # noqa: F401
+from ..sedml.utils import pad_arrays_to_consistent_shapes
+from ..sedml.warnings import RepeatDataSetLabelsWarning
+from ..warnings import warn
+from .data_model import DataSetResults, ReportFormat
 import glob
+import numpy
 import os
 import pandas
 import tables
@@ -23,11 +28,12 @@
 class ReportWriter(object):
     """ Class for writing reports of simulation results """
 
-    def run(self, results, base_path, rel_path, format=ReportFormat.h5):
+    def run(self, report, results, base_path, rel_path, format=ReportFormat.h5):
         """ Save a report
 
         Args:
-            results (:obj:`pandas.DataFrame`): report results
+            report (:obj:`Report`): report
+            results (:obj:`DataSetResults`): results of the data sets
             base_path (:obj:`str`): path to save results
 
                 * CSV: parent directory to save results
@@ -40,14 +46,27 @@ def run(self, results, base_path, rel_path, format=ReportFormat.h5):
 
             format (:obj:`ReportFormat`, optional): report format
         """
+        data_set_labels = [data_set.label for data_set in report.data_sets]
+        if len(set(data_set_labels)) < len(report.data_sets):
+            warn('To facilitate machine interpretation, data sets should have unique ids.',
+                 RepeatDataSetLabelsWarning)
+
+        results_list = []
+        data_set_labels = []
+        for data_set in report.data_sets:
+            if data_set.id in results:
+                results_list.append(results[data_set.id])
+                data_set_labels.append(data_set.label)
+        results_list = pad_arrays_to_consistent_shapes(results_list)
+        results_df = pandas.DataFrame(numpy.array(results_list), index=data_set_labels)
 
         if format == ReportFormat.csv:
             filename = os.path.join(base_path, rel_path + '.' + format.value)
             out_dir = os.path.dirname(filename)
             if not os.path.isdir(out_dir):
                 os.makedirs(out_dir)
-            results.to_csv(filename,
-                           header=False)
+            results_df.to_csv(filename,
+                              header=False)
 
         elif format == ReportFormat.h5:
             filename = os.path.join(base_path, get_config().H5_REPORTS_PATH)
@@ -56,14 +75,14 @@ def run(self, results, base_path, rel_path, format=ReportFormat.h5):
 
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", tables.NaturalNameWarning)
-                results.to_hdf(filename,
-                               key=rel_path,
-                               format='table',
-                               complevel=9,
-                               complib='zlib',
-                               mode='a',
-                               append=False,
-                               )
+                results_df.to_hdf(filename,
+                                  key=rel_path,
+                                  format='table',
+                                  complevel=9,
+                                  complib='zlib',
+                                  mode='a',
+                                  append=False,
+                                  )
 
         else:
             raise NotImplementedError('Report format {} is not supported'.format(format))
@@ -72,10 +91,11 @@ def run(self, results, base_path, rel_path, format=ReportFormat.h5):
 class ReportReader(object):
     """ Class for reading reports of simulation results """
 
-    def run(self, base_path, rel_path, format=ReportFormat.h5):
+    def run(self, report, base_path, rel_path, format=ReportFormat.h5):
         """ Read a report for a file
 
         Args:
+            report (:obj:`Report`): report
             base_path (:obj:`str`): path to save results
 
                 * CSV: parent directory to save results
@@ -89,25 +109,54 @@ def run(self, base_path, rel_path, format=ReportFormat.h5):
             format (:obj:`ReportFormat`, optional): report format
 
         Returns:
-            :obj:`pandas.DataFrame`: report results
+            :obj:`DataSetResults`: report results
         """
         if format == ReportFormat.csv:
             filename = os.path.join(base_path, rel_path + '.' + format.value)
             df = pandas.read_csv(filename,
                                  index_col=0,
                                  header=None)
             df.columns = pandas.RangeIndex(start=0, stop=df.shape[1], step=1)
-            return df
 
         elif format == ReportFormat.h5:
             filename = os.path.join(base_path, get_config().H5_REPORTS_PATH)
-            return pandas.read_hdf(filename,
-                                   key=rel_path,
-                                   )
+            df = pandas.read_hdf(filename,
+                                 key=rel_path,
+                                 )
 
         else:
             raise NotImplementedError('Report format {} is not supported'.format(format))
 
+        results = DataSetResults()
+
+        data_set_labels = [data_set.label for data_set in report.data_sets]
+        unreadable_data_sets = []
+        if df.index.tolist() == data_set_labels:
+            for data_set in report.data_sets:
+                results[data_set.id] = df.loc[data_set.label, :]
+
+        else:
+            data_set_label_to_index = {}
+            for i_data_set, data_set_label in enumerate(df.index):
+                if data_set_label not in data_set_label_to_index:
+                    data_set_label_to_index[data_set_label] = i_data_set
+                else:
+                    data_set_label_to_index[data_set_label] = None
+
+            for data_set in report.data_sets:
+                i_data_set = data_set_label_to_index.get(data_set.label, None)
+                if i_data_set is None:
+                    # results[data_set.id] = None
+                    unreadable_data_sets.append(data_set.id)
+                else:
+                    results[data_set.id] = df.loc[data_set.label, :]
+
+        if unreadable_data_sets:
+            warn('Some data sets could not be read because their labels are not unique:\n  - {}'.format(
+                '\n'.join('`' + id + '`' for id in sorted(unreadable_data_sets))), RepeatDataSetLabelsWarning)
+
+        return results
+
     def get_ids(self, base_path, format=ReportFormat.h5):
         """ Get the ids of the reports in a file
 

diff --git a/biosimulators_utils/sedml/exec.py b/biosimulators_utils/sedml/exec.py
@@ -11,21 +11,18 @@
 from ..log.utils import init_sed_document_log
 from ..plot.data_model import PlotFormat
 from ..plot.io import write_plot_2d, write_plot_3d
-from ..report.data_model import VariableResults, ReportResults, ReportFormat
+from ..report.data_model import VariableResults, DataSetResults, ReportResults, ReportFormat  # noqa: F401
 from ..report.io import ReportWriter
 from ..warnings import warn
 from .data_model import SedDocument, Task, Report, Plot2D, Plot3D
 from .exceptions import SedmlExecutionError
-from .warnings import RepeatDataSetLabelsWarning
 from .io import SedmlSimulationReader
 from .utils import resolve_model_and_apply_xml_changes, get_variables_for_task, calc_data_generators_results
 from .warnings import NoTasksWarning, NoOutputsWarning
 import capturer
 import copy
 import datetime
-import numpy
 import os
-import pandas
 import sys
 import termcolor
 import types  # noqa: F401
@@ -307,7 +304,7 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
     Returns:
         :obj:`tuple`:
 
-            * :obj:`pandas.DataFrame`: report
+            * :obj:`DataSetResults`: report
             * :obj:`Status`: status
             * :obj:`Exception`: exception for failure
             * :obj:`bool`: whether :obj:`task` contribute a variable to the report
@@ -318,21 +315,18 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
         data_generators.add(data_set.data_generator)
 
     data_gen_results, data_gen_statuses, data_gen_exceptions, task_contributes_to_report = calc_data_generators_results(
-        data_generators, variable_results, report, task)
+        data_generators, variable_results, report, task, make_shapes_consistent=False)
 
     # collect data sets
-    dataset_labels = []
-    dataset_results = []
+    data_set_results = {}
 
     running = False
     succeeded = True
     failed = False
 
     for data_set in report.data_sets:
-        dataset_labels.append(data_set.label)
-
         data_gen_res = data_gen_results[data_set.data_generator.id]
-        dataset_results.append(data_gen_res)
+        data_set_results[data_set.id] = data_gen_res
 
         data_gen_status = data_gen_statuses[data_set.data_generator.id]
         log.data_sets[data_set.id] = data_gen_status
@@ -343,13 +337,9 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
         else:
             succeeded = False
 
-    if len(set(dataset_labels)) < len(dataset_labels):
-        warn('To facilitate machine interpretation, data sets should have unique ids.',
-             RepeatDataSetLabelsWarning)
-
-    output_df = pandas.DataFrame(numpy.array(dataset_results), index=dataset_labels)
     for format in formats:
-        ReportWriter().run(output_df,
+        ReportWriter().run(report,
+                           data_set_results,
                            base_out_path,
                            os.path.join(rel_out_path, report.id) if rel_out_path else report.id,
                            format=format)
@@ -366,7 +356,7 @@ def exec_report(report, variable_results, base_out_path, rel_out_path, formats,
     else:
         status = Status.QUEUED
 
-    return output_df, status, data_gen_exceptions, task_contributes_to_report
+    return data_set_results, status, data_gen_exceptions, task_contributes_to_report
 
 
 def exec_plot_2d(plot, variable_results, base_out_path, rel_out_path, formats, task, log):
@@ -375,25 +365,19 @@ def exec_plot_2d(plot, variable_results, base_out_path, rel_out_path, formats, t
     Args:
         plot (:obj:`Plot2D`): plot
         variable_results (:obj:`VariableResults`): result of each data generator
-        base_out_path (:obj:`str`): path to store the outputs
-
-            * CSV: directory in which to save outputs to files
-              ``{base_out_path}/{rel_out_path}/{report.id}.csv``
-            * HDF5: directory in which to save a single HDF5 file (``{base_out_path}/reports.h5``),
-              with reports at keys ``{rel_out_path}/{report.id}`` within the HDF5 file
-
-        rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the outputs
+        base_out_path (:obj:`str`): base path to store the plot. Complete path is
+            ``{base_out_path}/{rel_out_path}/{plot.id}.pdf``
+        rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the plot
         formats (:obj:`list` of :obj:`PlotFormat`, optional): plot format (e.g., pdf)
         task (:obj:`Task`): task
-        log (:obj:`ReportLog`, optional): log of report
+        log (:obj:`ReportLog`, optional): log of plot
 
     Returns:
         :obj:`tuple`:
 
-            * :obj:`pandas.DataFrame`: results of data generators
             * :obj:`Status`: status
             * :obj:`Exception`: exception for failure
-            * :obj:`bool`: whether :obj:`task` contribute a variable to the report
+            * :obj:`bool`: whether :obj:`task` contributes a variable to the plot
     """
     # calculate data generators
     data_generators = set()
@@ -456,30 +440,24 @@ def exec_plot_2d(plot, variable_results, base_out_path, rel_out_path, formats, t
 
 
 def exec_plot_3d(plot, variable_results, base_out_path, rel_out_path, formats, task, log):
-    """ Execute a 3D plot, generating the curves which are available
+    """ Execute a 3D plot, generating the surfaces which are available
 
     Args:
         plot (:obj:`Plot3D`): plot
         variable_results (:obj:`VariableResults`): result of each data generator
-        base_out_path (:obj:`str`): path to store the outputs
-
-            * CSV: directory in which to save outputs to files
-              ``{base_out_path}/{rel_out_path}/{report.id}.csv``
-            * HDF5: directory in which to save a single HDF5 file (``{base_out_path}/reports.h5``),
-              with reports at keys ``{rel_out_path}/{report.id}`` within the HDF5 file
-
-        rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the outputs
+        base_out_path (:obj:`str`): base path to store the plot. Complete path is
+          ``{base_out_path}/{rel_out_path}/{plot.id}.pdf``
+        rel_out_path (:obj:`str`, optional): path relative to :obj:`base_out_path` to store the plot
         formats (:obj:`list` of :obj:`PlotFormat`, optional): plot format (e.g., pdf)
         task (:obj:`Task`): task
-        log (:obj:`ReportLog`, optional): log of report
+        log (:obj:`ReportLog`, optional): log of plot
 
     Returns:
         :obj:`tuple`:
 
-            * :obj:`pandas.DataFrame`: results of data generators
             * :obj:`Status`: status
             * :obj:`Exception`: exception for failure
-            * :obj:`bool`: whether :obj:`task` contribute a variable to the report
+            * :obj:`bool`: whether :obj:`task` contributes a variable to the plot
     """
     # calculate data generators
     data_generators = set()

diff --git a/biosimulators_utils/sedml/io.py b/biosimulators_utils/sedml/io.py
@@ -1076,7 +1076,7 @@ def _read_variables(self, obj_sed, id_to_model_map, id_to_task_map):
             var.symbol = var_sed.getSymbol() or None
             var.target = var_sed.getTarget() or None
 
-            if var.target.startswith('#'):
+            if var.target and var.target.startswith('#'):
                 raise NotImplementedError('Variable targets to data descriptions are not supported.')
 
             self._deserialize_reference(var_sed, var, 'task', 'Task', 'task', id_to_task_map)