Allow array data types
WardLT committed Nov 22, 2024
1 parent f88609c commit aa11ff8
Showing 3 changed files with 32 additions and 11 deletions.
25 changes: 20 additions & 5 deletions battdat/io/hdf.py
@@ -28,13 +28,22 @@ def make_numpy_dtype_from_pandas(df: pd.DataFrame) -> np.dtype:
     output = []
     for name, dtype in df.dtypes.items():
         kind = dtype.kind
-        if kind in ['O', 'S', 'U']:
+        shape = ()
+
+        # Introspect objects to learn more
+        if kind == 'O':
+            example = np.array(df[name].iloc[0])
+            dtype = example.dtype
+            kind = dtype.kind
+            shape = example.shape
+
+        if kind in ['S', 'U']:
             max_len = df[name].apply(str).apply(len).max()
             output.append((name, np.dtype(f'S{max_len}')))
         elif kind in ['M', 'm', 'V']:
             raise ValueError(f'Data type not supported: {kind}')
         else:
             output.append((name, dtype))
+            output.append((name, dtype, shape))
     return np.dtype(output)
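
For orientation, a short sketch of what the updated function produces, mirroring the new test in tests/io/test_hdf.py below: an object column whose rows are equal-shaped nested lists becomes a subarray field in the structured dtype.

    import pandas as pd

    from battdat.io.hdf import make_numpy_dtype_from_pandas

    # The object column 'array' holds 1x1 nested lists of floats
    df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge'],
                       'array': [[[1.]], [[0.]]]})

    dtype = make_numpy_dtype_from_pandas(df)
    assert dtype.names == ('a', 'b', 'c', 'array')
    assert dtype['array'].shape == (1, 1)  # shape introspected from the first row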


@@ -47,7 +56,7 @@ def write_df_to_table(file: File, group: Group, name: str, df: pd.DataFrame, fil
         name: Name of the dataset
         df: DataFrame to write
         filters: Filters to apply to data entering table
-        expected_rows:
+        expected_rows: How many rows to expect. Default is to use the length of the dataframe
     Returns:
         Table object holding the dataset
     """
@@ -57,7 +66,7 @@ def write_df_to_table(file: File, group: Group, name: str, df: pd.DataFrame, fil
     desc, _ = descr_from_dtype(dtype)

     # Make the table then fill
-    table = file.create_table(group, name=name, description=desc, expectedrows=len(df), filters=filters)
+    table = file.create_table(group, name=name, description=desc, expectedrows=expected_rows or len(df), filters=filters)
     row = np.empty((1,), dtype=dtype)  # TODO (wardlt): Consider a batched write (pytables might batch internally)
     for _, df_row in df.iterrows():
         for c in dtype.names:
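
A usage sketch for the new argument (the file path is illustrative): `expected_rows` lets a caller pre-size the table, for example when several DataFrames will be appended to the same dataset.

    import pandas as pd
    import tables

    from battdat.io.hdf import write_df_to_table

    df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.]})
    with tables.open_file('example.h5', 'w') as file:  # illustrative path
        group = file.create_group('/', name='base')
        # Pre-size the table for 1000 rows even though this DataFrame has only 2
        table = write_df_to_table(file, group, 'table', df, expected_rows=1000)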
@@ -77,7 +86,13 @@ def read_df_from_table(table: Table) -> pd.DataFrame:
     array = np.empty((table.nrows,), dtype=table.dtype)
     for i, row in enumerate(table.iterrows()):
         array[i] = row.fetch_all_fields()
-    return pd.DataFrame(array)
+    as_dict = dict((c, array[c]) for c in array.dtype.names)
+
+    # Expand ndarrays into a list
+    for k, v in as_dict.items():
+        if v.ndim != 1:
+            as_dict[k] = list(v)
+    return pd.DataFrame(as_dict)


 @contextmanager
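
A round-trip sketch of the array handling, adapted from the updated test below: `pd.DataFrame` cannot consume a structured array with multidimensional fields directly, so the dict conversion expands each such field into a list of per-row ndarrays.

    import numpy as np
    import pandas as pd
    import tables

    from battdat.io.hdf import write_df_to_table, read_df_from_table

    df = pd.DataFrame({'b': [1., 3.], 'array': [[[1.]], [[0.]]]})
    with tables.open_file('roundtrip.h5', 'w') as file:  # illustrative path
        group = file.create_group('/', name='base')
        table = write_df_to_table(file, group, 'table', df)

        df_copy = read_df_from_table(table)
        assert df_copy['array'][0].shape == (1, 1)  # array cells survive the round trip
        assert np.allclose(df_copy['b'], [1., 3.])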
9 changes: 7 additions & 2 deletions docs/user-guide/dataset.rst
@@ -15,7 +15,7 @@ Every dataset holds three attributes:

 #. :attr:`~battdat.data.BatteryDataset.metadata`: Information describing the source of the data
    (see `Source Metadata <schemas/source-metadata.html>`_)
-#. :attr:`~battdat.data.BatteryDataset.tables`: A named collection of data tables as Pandas :class:`~pd.DataFrame`.
+#. :attr:`~battdat.data.BatteryDataset.tables`: A named collection of data tables as Pandas :class:`~pandas.DataFrame`.
 #. :attr:`~battdat.data.BatteryDataset.schemas`: Descriptions of the columns in each data table
    (see `Column Schema <schemas/column-schema.html>`_)
@@ -41,7 +41,7 @@ Creating a ``BatteryDataset``

 Load data from another file format using battdat's `dataset readers <io.html>`_.
 If there is no available reader,
-build by passing a collection of tables and their schemas along with the metadata to the constructor.
+build by passing a collection of tables as :class:`~pandas.DataFrame` and their schemas along with the metadata to the constructor.
 Once assembled, all component tables will be saved and loaded together.

 .. code-block:: python
@@ -58,6 +58,11 @@ Once assembled, all component tables will be saved and loaded together.
         metadata=metadata
     )

+Columns of the dataframes can be any `NumPy data type <https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html#numpy.dtype.kind>`_
+except timedeltas (m), timestamps (M), or voids (V).
+The battery data toolkit does not yet support storing these types in HDF5 or Parquet formats.
+Columns where all values are arrays of the same size are also supported.
+
 Check that your data and metadata agree using the :meth:`~battdat.data.BatteryDataset.validate` method.

 .. code-block:: python
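
To make the new paragraph concrete (a hedged sketch; the column names are illustrative): a column where every row holds an array of the same size is valid, while a timedelta column is not yet storable.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'test_time': [0., 1.],                      # float (kind 'f'): supported
        'spectrum': [np.zeros(4), np.ones(4)],      # equal-size arrays: supported
    })

    # Not yet supported for HDF5/Parquet storage: timedelta (kind 'm')
    # df['elapsed'] = pd.to_timedelta(df['test_time'], unit='s')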
9 changes: 5 additions & 4 deletions tests/io/test_hdf.py
@@ -4,20 +4,21 @@

 from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table

-example_df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge']})
+example_df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge'], 'array': [[[1.]], [[0.]]]})


 def test_dtype():
     dtype = make_numpy_dtype_from_pandas(example_df)
-    assert dtype.names == ('a', 'b', 'c')
+    assert dtype.names == ('a', 'b', 'c', 'array')
+    assert dtype['array'].shape == (1, 1)


 def test_store_df(tmpdir):
     with tables.open_file(tmpdir / "example.h5", "w") as file:
         group = file.create_group('/', name='base')
         table = write_df_to_table(file, group, 'table', example_df)
-        assert tuple(table[0]) == (1, 1., b'charge')
+        assert tuple(table[0]) == (1, 1., b'charge', np.ones((1, 1)))

         df_copy = read_df_from_table(table)
-        assert (df_copy.columns == ['a', 'b', 'c']).all()
+        assert (df_copy.columns == ['a', 'b', 'c', 'array']).all()
         assert np.allclose(df_copy['b'], [1., 3.])
