Skip to content

Commit

Permalink
Rename dataset to table (#117)
Browse files Browse the repository at this point in the history
* Rename dataset to table

Fixes #115

* Indicate where the data is being stored

* Fix bug due to new pydantic

* Clarify that readers exist

* Updated the cycle times notebook

* Minor addition to documentation
  • Loading branch information
WardLT authored Nov 21, 2024
1 parent e863a24 commit 9a11355
Show file tree
Hide file tree
Showing 12 changed files with 129 additions and 51 deletions.
2 changes: 1 addition & 1 deletion battdat/consistency/current.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class SignConventionChecker(ConsistencyChecker):
def check(self, dataset: BatteryDataset) -> List[str]:
    """Run the sign-convention check against each configured subset.

    Args:
        dataset: Dataset whose tables should be checked
    Returns:
        List of warning messages, one for each table that failed its check
    """
    warnings = []
    for subset in self.subsets_to_check:
        # check_subset returns None on success or a warning message on failure
        if (warning := self.check_subset(dataset.tables[subset])) is not None:
            warnings.append(warning)
    return warnings

Expand Down
56 changes: 34 additions & 22 deletions battdat/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import warnings
from pathlib import Path
from typing import Union, Optional, Collection, List, Dict, Set, Iterator, Tuple
from typing import Union, Optional, Collection, List, Dict, Set, Iterator, Tuple, Mapping

from pydantic import BaseModel, ValidationError
from tables import File
Expand All @@ -17,34 +17,32 @@
logger = logging.getLogger(__name__)


class BatteryDataset:
class BatteryDataset(Mapping[str, pd.DataFrame]):
"""Base class for all battery datasets.
Not to be created directly by users. Defines the functions to validate, read, and write from HDF5 or Parquet files.
Args:
datasets: Subsets which compose this larger dataset
tables: Subsets which compose this larger dataset
metadata: Metadata for the entire dataset
schemas: Schemas describing each subset
check_schemas: Whether to throw an error if datasets lack a schema
"""

datasets: Dict[str, pd.DataFrame]
"""List of tabular sub-datasets which are part of this dataset"""
metadata: BatteryMetadata
"""Information describing the source of a dataset"""
schemas: Dict[str, ColumnSchema]
"""Schemas describing each dataset"""
datasets: Dict[str, pd.DataFrame]
tables: Dict[str, pd.DataFrame]
"""Datasets available for users"""

def __init__(self,
datasets: Dict[str, pd.DataFrame],
tables: Dict[str, pd.DataFrame],
schemas: Dict[str, ColumnSchema],
metadata: BatteryMetadata = None,
check_schemas: bool = True):
self.schemas = schemas.copy()
self.datasets = datasets.copy()
self.tables = tables.copy()

# Assign default metadata
if metadata is None:
Expand All @@ -67,13 +65,27 @@ def __init__(self,
raise

# Check if schemas are missing for some datasets
missing_schema = set(self.datasets.keys()).difference(self.schemas)
missing_schema = set(self.tables.keys()).difference(self.schemas)
if len(missing_schema) > 0:
warn_msg = f'Missing schema for some datasets: {", ".join(missing_schema)}'
logger.warning(warn_msg)
if check_schemas:
raise ValueError(warn_msg)

def __getitem__(self, item: str) -> pd.DataFrame:
"""Access a specific table within the dataset"""
return self.tables[item]

def __contains__(self, item):
"""Whether the dataset contains a specific table"""
return item in self.tables

def __len__(self):
return len(self.tables)

def __iter__(self):
return iter(self.tables.items())

def validate_columns(self, allow_extra_columns: bool = True):
"""Determine whether the column types are appropriate
Expand All @@ -84,7 +96,7 @@ def validate_columns(self, allow_extra_columns: bool = True):
(ValueError): If the dataset fails validation
"""
for attr_name, schema in self.schemas.items():
if (data := self.datasets.get(attr_name)) is not None:
if (data := self.tables.get(attr_name)) is not None:
schema.validate_dataframe(data, allow_extra_columns)

def validate(self) -> List[str]:
Expand All @@ -101,7 +113,7 @@ def validate(self) -> List[str]:
output = []

for attr_name, schema in self.schemas.items():
if (data := self.datasets.get(attr_name)) is not None:
if (data := self.tables.get(attr_name)) is not None:
undefined = set(data.columns).difference(schema.column_names)
output.extend([f'Undefined column, {u}, in {attr_name}. Add a description into schemas.{attr_name}.extra_columns'
for u in undefined])
Expand Down Expand Up @@ -138,15 +150,15 @@ def to_hdf(self,
@classmethod
def from_hdf(cls,
path_or_buf: Union[str, Path, File],
subsets: Optional[Collection[str]] = None,
tables: Optional[Collection[str]] = None,
prefix: Union[str, int] = None) -> 'BatteryDataset':
"""Read the battery data from an HDF file
Use :meth:`all_cells_from_hdf` to read all datasets from a file.
Args:
path_or_buf: File path or HDFStore object
subsets : Which subsets of data to read from the data file (e.g., raw_data, cycle_stats)
tables : Which subsets of data to read from the data file (e.g., raw_data, cycle_stats)
prefix: (``str``) Prefix designating which battery extract from this file,
or (``int``) index within the list of available prefixes, sorted alphabetically.
The default is to read the default prefix (``None``).
Expand All @@ -156,7 +168,7 @@ def from_hdf(cls,
reader = HDF5Reader()
reader.output_class = cls
with as_hdf5_object(path_or_buf) as store:
return reader.read_from_hdf(store, prefix, subsets)
return reader.read_from_hdf(store, prefix, tables)

@classmethod
def all_cells_from_hdf(cls, path: Union[str, Path], subsets: Optional[Collection[str]] = None) -> Iterator[Tuple[str, 'CellDataset']]:
Expand All @@ -175,7 +187,7 @@ def all_cells_from_hdf(cls, path: Union[str, Path], subsets: Optional[Collection

with File(path, mode='r') as fp: # Only open once
for name in names:
yield name, cls.from_hdf(fp, prefix=name, subsets=subsets)
yield name, cls.from_hdf(fp, prefix=name, tables=subsets)

@staticmethod
def inspect_hdf(path_or_buf: Union[str, Path, File]) -> tuple[BatteryMetadata, Set[Optional[str]]]:
Expand Down Expand Up @@ -269,25 +281,25 @@ class CellDataset(BatteryDataset):
@property
def raw_data(self) -> Optional[pd.DataFrame]:
"""Time-series data capturing the state of the battery as a function of time"""
return self.datasets.get('raw_data')
return self.tables.get('raw_data')

@property
def cycle_stats(self) -> Optional[pd.DataFrame]:
"""Summary statistics of each cycle"""
return self.datasets.get('cycle_stats')
return self.tables.get('cycle_stats')

@property
def eis_data(self) -> Optional[pd.DataFrame]:
"""Electrochemical Impedance Spectroscopy (EIS) data"""
return self.datasets.get('eis_data')
return self.tables.get('eis_data')

def __init__(self,
metadata: Union[BatteryMetadata, dict] = None,
raw_data: Optional[pd.DataFrame] = None,
cycle_stats: Optional[pd.DataFrame] = None,
eis_data: Optional[pd.DataFrame] = None,
schemas: Optional[Dict[str, ColumnSchema]] = None,
datasets: Dict[str, pd.DataFrame] = None):
tables: Dict[str, pd.DataFrame] = None):
_schemas = {
'raw_data': RawData(),
'cycle_stats': CycleLevelData(),
Expand All @@ -297,10 +309,10 @@ def __init__(self,
_schemas.update(schemas)

_datasets = {'raw_data': raw_data, 'eis_data': eis_data, 'cycle_stats': cycle_stats}
if datasets is not None:
_datasets.update(datasets)
if tables is not None:
_datasets.update(tables)
super().__init__(
datasets=_datasets,
tables=_datasets,
schemas=_schemas,
metadata=metadata,
)
4 changes: 2 additions & 2 deletions battdat/io/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def read_from_hdf(self, file: File, prefix: Union[int, str, None], subsets: Opti

# Read out the battery metadata
metadata = BatteryMetadata.model_validate_json(file.root._v_attrs.metadata)
return self.output_class(metadata=metadata, datasets=data, schemas=schemas)
return self.output_class(metadata=metadata, tables=data, schemas=schemas)

def read_dataset(self, path: PathLike, metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataset:
"""Read the default dataset and all subsets from an HDF5 file
Expand Down Expand Up @@ -230,7 +230,7 @@ def write_to_hdf(self, dataset: BatteryDataset, file: File, prefix: Optional[str
# Note that we use the "table" format to allow for partial reads / querying
filters = Filters(complevel=self.complevel, complib=self.complib)
for key, schema in dataset.schemas.items():
if (data := dataset.datasets.get(key)) is not None:
if (data := dataset.tables.get(key)) is not None:
table = write_df_to_table(file, group, key, data, filters=filters)

# Write the schema, mark as dataset
Expand Down
4 changes: 2 additions & 2 deletions battdat/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def export(self, dataset: BatteryDataset, path: Path):
}
written = {}
for key, schema in dataset.schemas.items():
if (data := dataset.datasets.get(key)) is None:
if (data := dataset.tables.get(key)) is None:
continue

# Put the metadata for the battery and this specific table into the table's schema in the FileMetaData
Expand Down Expand Up @@ -148,5 +148,5 @@ def read_dataset(self, paths: Union[PathLike, Collection[PathLike]], metadata: O
return self.output_class(
metadata=BatteryMetadata.model_validate_json(metadata),
schemas=schemas,
datasets=data
tables=data
)
2 changes: 1 addition & 1 deletion battdat/postprocess/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def add_summaries(self, data: CellDataset):

# Add a cycle summary if not already available
if data.cycle_stats is None:
data.datasets['cycle_stats'] = pd.DataFrame({'cycle_number': sorted(set(data.raw_data['cycle_number']))})
data.tables['cycle_stats'] = pd.DataFrame({'cycle_number': sorted(set(data.raw_data['cycle_number']))})

# Perform the update
self._summarize(data.raw_data, data.cycle_stats)
Expand Down
4 changes: 3 additions & 1 deletion battdat/streaming/hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

@dataclass
class HDF5Writer(AbstractContextManager):
"""Tool to write raw time series data to an HDF5 file incrementally"""
"""Tool to write raw time series data to an HDF5 file incrementally
Writes data to the ``raw_data`` key of a different dataset."""

# Attributes defining where and how to write
hdf5_output: Union[Path, str, File]
Expand Down
69 changes: 65 additions & 4 deletions docs/user-guide/dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Every dataset holds three attributes:

#. :attr:`~battdat.data.BatteryDataset.metadata`: Information describing the source of the data
(see `Source Metadata <schemas/source-metadata.html>`_)
#. :attr:`~battdat.data.BatteryDataset.datasets`: A named collection of data tables
#. :attr:`~battdat.data.BatteryDataset.tables`: A named collection of data tables as Pandas :class:`~pandas.DataFrame`.
#. :attr:`~battdat.data.BatteryDataset.schemas`: Descriptions of the columns in each data table
(see `Column Schema <schemas/column-schema.html>`_)

Expand All @@ -24,9 +24,66 @@ Datasets describing a single cell may only include a single time series of the m
whereas a dataset describing an entire system may have time series for each cell in each module
and those for multiple power conversion systems.

Access the data tables within the dataset by indexing the dataset:

.. code-block:: python
dataset = BatteryDataset.from_hdf('example.h5')
# These two ways for accessing a table are equivalent
df = dataset['raw_data']
df = dataset.tables['raw_data']
df['voltage'].max() # Compute the maximum voltage
Creating a ``BatteryDataset``
-----------------------------

Load data from another file format using battdat's `dataset readers <io.html>`_.
If there is no available reader,
build by passing a collection of tables and their schemas along with the metadata to the constructor.
Once assembled, all component tables will be saved and loaded together.

.. code-block:: python
from battdat.schemas import BatteryMetadata
from battdat.schemas.column import RawData
from battdat.data import BatteryDataset
metadata = BatteryMetadata(name='2_cell_module')
col_schema = RawData() # Use the same schema for both tables
dataset = BatteryDataset(
    tables={'cell_1': cell1_df, 'cell_2': cell2_df},
    schemas={'cell_1': col_schema, 'cell_2': col_schema},
    metadata=metadata,
)
Check that your data and metadata agree using the :meth:`~battdat.data.BatteryDataset.validate` method.

.. code-block:: python
dataset.validate()
The validate function will raise errors if the tables do not match the column schema
and will return names of columns without descriptions, if desired.

Dataset Templates
+++++++++++++++++

``battdat`` provides subclasses of :class:`~battdat.data.BatteryDataset` for different types of battery data.
Each subclass provides suggested names for certain types of data (e.g., ``raw_data`` for measurements
during operation of a single cell).
during operation of a single cell) and predefines schema to use for each column.

Dataset templates, like :class:`~battdat.data.CellDataset`, require
neither supplying schemas for each table
nor passing the tables as part of a dictionary.

.. code-block:: python
from battdat.data import CellDataset
dataset = CellDataset(raw_data=df)
The current template classes are:

.. _type-table:
Expand All @@ -35,10 +92,14 @@ The current template classes are:
:header-rows: 1

* - Class
- Use Case
- Description
* - :class:`~battdat.data.CellDataset`
- Single battery cell with measurements of voltage, current, and other data at specific times
or averaged over entire cycles.
or averaged over entire cycles. Tables (and their schemas) include:

- ``raw_data`` (`RawData <schemas/column-schema.html#rawdata>`_): Measurements of system state at specific points in time.
- ``cycle_stats`` (`CycleStats <schemas/column-schema.html#cyclestats>`_): Descriptive statistics about state over entire cycles.
- ``eis_data`` (`EISData <schemas/column-schema.html#eisdata>`_): EIS measurements at different frequencies, over time.

Loading and Saving
------------------
Expand Down
2 changes: 1 addition & 1 deletion docs/user-guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ The :mod:`battdat.io` module provides tools to read and write from :class:`~batt
- ✔️
- ✖️
* - `HDF5 <formats.html#hdf5>`_
- :mod:`~battdat.io.hdf5`
- :mod:`~battdat.io.hdf`
- ✔️
- ✔️
* - MACCOR
Expand Down
Loading

0 comments on commit 9a11355

Please sign in to comment.