Skip to content

Commit

Permalink
Rename dataset to table (#117)
Browse files Browse the repository at this point in the history
* Rename dataset to table

Fixes #115

* Indicate where the data is being stored

* Fix bug due to new pydantic

* Clarify that readers exist

* Updated the cycle times notebook

* Minor addition to documentation
  • Loading branch information
WardLT authored Nov 21, 2024
1 parent e863a24 commit 9a11355
Show file tree
Hide file tree
Showing 12 changed files with 129 additions and 51 deletions.
2 changes: 1 addition & 1 deletion battdat/consistency/current.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class SignConventionChecker(ConsistencyChecker):
def check(self, dataset: BatteryDataset) -> List[str]:
    """Run the sign-convention check against each configured subset.

    Args:
        dataset: Dataset whose tables should be checked
    Returns:
        List of warning messages, one for each table that failed its check
    """
    warnings = []
    for subset in self.subsets_to_check:
        # check_subset returns None on success or a warning message on failure
        if (warning := self.check_subset(dataset.tables[subset])) is not None:
            warnings.append(warning)
    return warnings

Expand Down
56 changes: 34 additions & 22 deletions battdat/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import warnings
from pathlib import Path
from typing import Union, Optional, Collection, List, Dict, Set, Iterator, Tuple
from typing import Union, Optional, Collection, List, Dict, Set, Iterator, Tuple, Mapping

from pydantic import BaseModel, ValidationError
from tables import File
Expand All @@ -17,34 +17,32 @@
logger = logging.getLogger(__name__)


class BatteryDataset:
class BatteryDataset(Mapping[str, pd.DataFrame]):
"""Base class for all battery datasets.
Not to be created directly by users. Defines the functions to validate, read, and write from HDF5 or Parquet files.
Args:
datasets: Subsets which compose this larger dataset
tables: Subsets which compose this larger dataset
metadata: Metadata for the entire dataset
schemas: Schemas describing each subset
check_schemas: Whether to throw an error if datasets lack a schema
"""

datasets: Dict[str, pd.DataFrame]
"""List of tabular sub-datasets which are part of this dataset"""
metadata: BatteryMetadata
"""Information describing the source of a dataset"""
schemas: Dict[str, ColumnSchema]
"""Schemas describing each dataset"""
datasets: Dict[str, pd.DataFrame]
tables: Dict[str, pd.DataFrame]
"""Datasets available for users"""

def __init__(self,
datasets: Dict[str, pd.DataFrame],
tables: Dict[str, pd.DataFrame],
schemas: Dict[str, ColumnSchema],
metadata: BatteryMetadata = None,
check_schemas: bool = True):
self.schemas = schemas.copy()
self.datasets = datasets.copy()
self.tables = tables.copy()

# Assign default metadata
if metadata is None:
Expand All @@ -67,13 +65,27 @@ def __init__(self,
raise

# Check if schemas are missing for some datasets
missing_schema = set(self.datasets.keys()).difference(self.schemas)
missing_schema = set(self.tables.keys()).difference(self.schemas)
if len(missing_schema) > 0:
warn_msg = f'Missing schema for some datasets: {", ".join(missing_schema)}'
logger.warning(warn_msg)
if check_schemas:
raise ValueError(warn_msg)

def __getitem__(self, item: str) -> pd.DataFrame:
"""Access a specific table within the dataset"""
return self.tables[item]

def __contains__(self, item):
"""Whether the dataset contains a specific table"""
return item in self.tables

def __len__(self):
return len(self.tables)

def __iter__(self):
return iter(self.tables.items())

def validate_columns(self, allow_extra_columns: bool = True):
"""Determine whether the column types are appropriate
Expand All @@ -84,7 +96,7 @@ def validate_columns(self, allow_extra_columns: bool = True):
(ValueError): If the dataset fails validation
"""
for attr_name, schema in self.schemas.items():
if (data := self.datasets.get(attr_name)) is not None:
if (data := self.tables.get(attr_name)) is not None:
schema.validate_dataframe(data, allow_extra_columns)

def validate(self) -> List[str]:
Expand All @@ -101,7 +113,7 @@ def validate(self) -> List[str]:
output = []

for attr_name, schema in self.schemas.items():
if (data := self.datasets.get(attr_name)) is not None:
if (data := self.tables.get(attr_name)) is not None:
undefined = set(data.columns).difference(schema.column_names)
output.extend([f'Undefined column, {u}, in {attr_name}. Add a description into schemas.{attr_name}.extra_columns'
for u in undefined])
Expand Down Expand Up @@ -138,15 +150,15 @@ def to_hdf(self,
@classmethod
def from_hdf(cls,
path_or_buf: Union[str, Path, File],
subsets: Optional[Collection[str]] = None,
tables: Optional[Collection[str]] = None,
prefix: Union[str, int] = None) -> 'BatteryDataset':
"""Read the battery data from an HDF file
Use :meth:`all_cells_from_hdf` to read all datasets from a file.
Args:
path_or_buf: File path or HDFStore object
subsets : Which subsets of data to read from the data file (e.g., raw_data, cycle_stats)
tables : Which subsets of data to read from the data file (e.g., raw_data, cycle_stats)
prefix: (``str``) Prefix designating which battery extract from this file,
or (``int``) index within the list of available prefixes, sorted alphabetically.
The default is to read the default prefix (``None``).
Expand All @@ -156,7 +168,7 @@ def from_hdf(cls,
reader = HDF5Reader()
reader.output_class = cls
with as_hdf5_object(path_or_buf) as store:
return reader.read_from_hdf(store, prefix, subsets)
return reader.read_from_hdf(store, prefix, tables)

@classmethod
def all_cells_from_hdf(cls, path: Union[str, Path], subsets: Optional[Collection[str]] = None) -> Iterator[Tuple[str, 'CellDataset']]:
Expand All @@ -175,7 +187,7 @@ def all_cells_from_hdf(cls, path: Union[str, Path], subsets: Optional[Collection

with File(path, mode='r') as fp: # Only open once
for name in names:
yield name, cls.from_hdf(fp, prefix=name, subsets=subsets)
yield name, cls.from_hdf(fp, prefix=name, tables=subsets)

@staticmethod
def inspect_hdf(path_or_buf: Union[str, Path, File]) -> tuple[BatteryMetadata, Set[Optional[str]]]:
Expand Down Expand Up @@ -269,25 +281,25 @@ class CellDataset(BatteryDataset):
@property
def raw_data(self) -> Optional[pd.DataFrame]:
"""Time-series data capturing the state of the battery as a function of time"""
return self.datasets.get('raw_data')
return self.tables.get('raw_data')

@property
def cycle_stats(self) -> Optional[pd.DataFrame]:
"""Summary statistics of each cycle"""
return self.datasets.get('cycle_stats')
return self.tables.get('cycle_stats')

@property
def eis_data(self) -> Optional[pd.DataFrame]:
"""Electrochemical Impedance Spectroscopy (EIS) data"""
return self.datasets.get('eis_data')
return self.tables.get('eis_data')

def __init__(self,
metadata: Union[BatteryMetadata, dict] = None,
raw_data: Optional[pd.DataFrame] = None,
cycle_stats: Optional[pd.DataFrame] = None,
eis_data: Optional[pd.DataFrame] = None,
schemas: Optional[Dict[str, ColumnSchema]] = None,
datasets: Dict[str, pd.DataFrame] = None):
tables: Dict[str, pd.DataFrame] = None):
_schemas = {
'raw_data': RawData(),
'cycle_stats': CycleLevelData(),
Expand All @@ -297,10 +309,10 @@ def __init__(self,
_schemas.update(schemas)

_datasets = {'raw_data': raw_data, 'eis_data': eis_data, 'cycle_stats': cycle_stats}
if datasets is not None:
_datasets.update(datasets)
if tables is not None:
_datasets.update(tables)
super().__init__(
datasets=_datasets,
tables=_datasets,
schemas=_schemas,
metadata=metadata,
)
4 changes: 2 additions & 2 deletions battdat/io/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def read_from_hdf(self, file: File, prefix: Union[int, str, None], subsets: Opti

# Read out the battery metadata
metadata = BatteryMetadata.model_validate_json(file.root._v_attrs.metadata)
return self.output_class(metadata=metadata, datasets=data, schemas=schemas)
return self.output_class(metadata=metadata, tables=data, schemas=schemas)

def read_dataset(self, path: PathLike, metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataset:
"""Read the default dataset and all subsets from an HDF5 file
Expand Down Expand Up @@ -230,7 +230,7 @@ def write_to_hdf(self, dataset: BatteryDataset, file: File, prefix: Optional[str
# Note that we use the "table" format to allow for partial reads / querying
filters = Filters(complevel=self.complevel, complib=self.complib)
for key, schema in dataset.schemas.items():
if (data := dataset.datasets.get(key)) is not None:
if (data := dataset.tables.get(key)) is not None:
table = write_df_to_table(file, group, key, data, filters=filters)

# Write the schema, mark as dataset
Expand Down
4 changes: 2 additions & 2 deletions battdat/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def export(self, dataset: BatteryDataset, path: Path):
}
written = {}
for key, schema in dataset.schemas.items():
if (data := dataset.datasets.get(key)) is None:
if (data := dataset.tables.get(key)) is None:
continue

# Put the metadata for the battery and this specific table into the table's schema in the FileMetaData
Expand Down Expand Up @@ -148,5 +148,5 @@ def read_dataset(self, paths: Union[PathLike, Collection[PathLike]], metadata: O
return self.output_class(
metadata=BatteryMetadata.model_validate_json(metadata),
schemas=schemas,
datasets=data
tables=data
)
2 changes: 1 addition & 1 deletion battdat/postprocess/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def add_summaries(self, data: CellDataset):

# Add a cycle summary if not already available
if data.cycle_stats is None:
data.datasets['cycle_stats'] = pd.DataFrame({'cycle_number': sorted(set(data.raw_data['cycle_number']))})
data.tables['cycle_stats'] = pd.DataFrame({'cycle_number': sorted(set(data.raw_data['cycle_number']))})

# Perform the update
self._summarize(data.raw_data, data.cycle_stats)
Expand Down
4 changes: 3 additions & 1 deletion battdat/streaming/hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

@dataclass
class HDF5Writer(AbstractContextManager):
"""Tool to write raw time series data to an HDF5 file incrementally"""
"""Tool to write raw time series data to an HDF5 file incrementally
Writes data to the ``raw_data`` key of a different dataset."""

# Attributes defining where and how to write
hdf5_output: Union[Path, str, File]
Expand Down
69 changes: 65 additions & 4 deletions docs/user-guide/dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Every dataset holds three attributes:

#. :attr:`~battdat.data.BatteryDataset.metadata`: Information describing the source of the data
(see `Source Metadata <schemas/source-metadata.html>`_)
#. :attr:`~battdat.data.BatteryDataset.datasets`: A named collection of data tables
#. :attr:`~battdat.data.BatteryDataset.tables`: A named collection of data tables as Pandas :class:`~pandas.DataFrame`.
#. :attr:`~battdat.data.BatteryDataset.schemas`: Descriptions of the columns in each data table
(see `Column Schema <schemas/column-schema.html>`_)

Expand All @@ -24,9 +24,66 @@ Datasets describing a single cell may only include a single time series of the m
whereas a dataset describing an entire system may have time series for each cell in each module
and those for multiple power conversion systems.

Access the data tables within the dataset by indexing the dataset:

.. code-block:: python
dataset = BatteryDataset.from_hdf('example.h5')
# These two ways for accessing a table are equivalent
df = dataset['raw_data']
df = dataset.tables['raw_data']
df['voltage'].max() # Compute the maximum voltage
Creating a ``BatteryDataset``
-----------------------------

Load data from another file format using battdat's `dataset readers <io.html>`_.
If there is no available reader,
build by passing a collection of tables and their schemas along with the metadata to the constructor.
Once assembled, all component tables will be saved and loaded together.

.. code-block:: python
from battdat.schemas import BatteryMetadata
from battdat.schemas.column import RawData
from battdat.data import BatteryDataset
metadata = BatteryMetadata(name='2_cell_module')
col_schema = RawData() # Use the same schema for both tables
dataset = BatteryDataset(
    tables={'cell_1': cell1_df, 'cell_2': cell2_df},
    schemas={'cell_1': col_schema, 'cell_2': col_schema},
    metadata=metadata,
)
Check that your data and metadata agree using the :meth:`~battdat.data.BatteryDataset.validate` method.

.. code-block:: python
dataset.validate()
The validate function will raise errors if the tables do not match the column schema
and will return names of columns without descriptions, if desired.

Dataset Templates
+++++++++++++++++

``battdat`` provides subclasses of :class:`~battdat.data.BatteryDataset` for different types of battery data.
Each subclass provides suggested names for certain types of data (e.g., ``raw_data`` for measurements
during operation of a single cell).
during operation of a single cell) and predefines schema to use for each column.

Dataset templates, like :class:`~battdat.data.CellDataset`, require
neither supplying schemas for each table
nor passing the tables as part of a dictionary.

.. code-block:: python
from battdat.data import CellDataset
dataset = CellDataset(raw_data=df)
The current template classes are:

.. _type-table:
Expand All @@ -35,10 +92,14 @@ The current template classes are:
:header-rows: 1

* - Class
- Use Case
- Description
* - :class:`~battdat.data.CellDataset`
- Single battery cell with measurements of voltage, current, and other data at specific times
or averaged over entire cycles.
or averaged over entire cycles. Tables (and their schemas) include:

- ``raw_data`` (`RawData <schemas/column-schema.html#rawdata>`_): Measurements of system state at specific points in time.
- ``cycle_stats`` (`CycleStats <schemas/column-schema.html#cyclestats>`_): Descriptive statistics about state over entire cycles.
- ``eis_data`` (`EISData <schemas/column-schema.html#eisdata>`_): EIS measurements at different frequencies, over time.

Loading and Saving
------------------
Expand Down
2 changes: 1 addition & 1 deletion docs/user-guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ The :mod:`battdat.io` module provides tools to read and write from :class:`~batt
- ✔️
- ✖️
* - `HDF5 <formats.html#hdf5>`_
- :mod:`~battdat.io.hdf5`
- :mod:`~battdat.io.hdf`
- ✔️
- ✔️
* - MACCOR
Expand Down
Loading

0 comments on commit 9a11355

Please sign in to comment.