Allow array data types
WardLT committed Nov 22, 2024
1 parent f88609c commit aa11ff8
Showing 3 changed files with 32 additions and 11 deletions.
25 changes: 20 additions & 5 deletions battdat/io/hdf.py
@@ -28,13 +28,22 @@ def make_numpy_dtype_from_pandas(df: pd.DataFrame) -> np.dtype:
     output = []
     for name, dtype in df.dtypes.items():
         kind = dtype.kind
-        if kind in ['O', 'S', 'U']:
+        shape = ()
+
+        # Introspect objects to learn more
+        if kind == 'O':
+            example = np.array(df[name].iloc[0])
+            dtype = example.dtype
+            kind = dtype.kind
+            shape = example.shape
+
+        if kind in ['S', 'U']:
             max_len = df[name].apply(str).apply(len).max()
             output.append((name, np.dtype(f'S{max_len}')))
         elif kind in ['M', 'm', 'V']:
             raise ValueError(f'Data type not supported: {kind}')
         else:
             output.append((name, dtype))
+            output.append((name, dtype, shape))
     return np.dtype(output)
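
For orientation, a short sketch of what the updated function produces, mirroring the new test in tests/io/test_hdf.py below: an object column whose rows are equal-shaped nested lists becomes a subarray field in the structured dtype.

    import pandas as pd

    from battdat.io.hdf import make_numpy_dtype_from_pandas

    # The object column 'array' holds 1x1 nested lists of floats
    df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge'],
                       'array': [[[1.]], [[0.]]]})

    dtype = make_numpy_dtype_from_pandas(df)
    assert dtype.names == ('a', 'b', 'c', 'array')
    assert dtype['array'].shape == (1, 1)  # shape introspected from the first row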


@@ -47,7 +56,7 @@ def write_df_to_table(file: File, group: Group, name: str, df: pd.DataFrame, fil
         name: Name of the dataset
         df: DataFrame to write
         filters: Filters to apply to data entering table
-        expected_rows:
+        expected_rows: How many rows to expect. Default is to use the length of the dataframe
     Returns:
         Table object holding the dataset
     """
@@ -57,7 +66,7 @@ def write_df_to_table(file: File, group: Group, name: str, df: pd.DataFrame, fil
     desc, _ = descr_from_dtype(dtype)

     # Make the table then fill
-    table = file.create_table(group, name=name, description=desc, expectedrows=len(df), filters=filters)
+    table = file.create_table(group, name=name, description=desc, expectedrows=expected_rows or len(df), filters=filters)
     row = np.empty((1,), dtype=dtype)  # TODO (wardlt): Consider a batched write (pytables might batch internally)
     for _, df_row in df.iterrows():
         for c in dtype.names:
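
A usage sketch for the new argument (the file path is illustrative): `expected_rows` lets a caller pre-size the table, for example when several DataFrames will be appended to the same dataset.

    import pandas as pd
    import tables

    from battdat.io.hdf import write_df_to_table

    df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.]})
    with tables.open_file('example.h5', 'w') as file:  # illustrative path
        group = file.create_group('/', name='base')
        # Pre-size the table for 1000 rows even though this DataFrame has only 2
        table = write_df_to_table(file, group, 'table', df, expected_rows=1000)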
@@ -77,7 +86,13 @@ def read_df_from_table(table: Table) -> pd.DataFrame:
     array = np.empty((table.nrows,), dtype=table.dtype)
     for i, row in enumerate(table.iterrows()):
         array[i] = row.fetch_all_fields()
-    return pd.DataFrame(array)
+    as_dict = dict((c, array[c]) for c in array.dtype.names)
+
+    # Expand ndarrays into a list
+    for k, v in as_dict.items():
+        if v.ndim != 1:
+            as_dict[k] = list(v)
+    return pd.DataFrame(as_dict)


 @contextmanager
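
A round-trip sketch of the array handling, adapted from the updated test below: `pd.DataFrame` cannot consume a structured array with multidimensional fields directly, so the dict conversion expands each such field into a list of per-row ndarrays.

    import numpy as np
    import pandas as pd
    import tables

    from battdat.io.hdf import write_df_to_table, read_df_from_table

    df = pd.DataFrame({'b': [1., 3.], 'array': [[[1.]], [[0.]]]})
    with tables.open_file('roundtrip.h5', 'w') as file:  # illustrative path
        group = file.create_group('/', name='base')
        table = write_df_to_table(file, group, 'table', df)

        df_copy = read_df_from_table(table)
        assert df_copy['array'][0].shape == (1, 1)  # array cells survive the round trip
        assert np.allclose(df_copy['b'], [1., 3.])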
9 changes: 7 additions & 2 deletions docs/user-guide/dataset.rst
@@ -15,7 +15,7 @@ Every dataset holds three attributes:

 #. :attr:`~battdat.data.BatteryDataset.metadata`: Information describing the source of the data
    (see `Source Metadata <schemas/source-metadata.html>`_)
-#. :attr:`~battdat.data.BatteryDataset.tables`: A named collection of data tables as Pandas :class:`~pd.DataFrame`.
+#. :attr:`~battdat.data.BatteryDataset.tables`: A named collection of data tables as Pandas :class:`~pandas.DataFrame`.
 #. :attr:`~battdat.data.BatteryDataset.schemas`: Descriptions of the columns in each data table
    (see `Column Schema <schemas/column-schema.html>`_)
@@ -41,7 +41,7 @@ Creating a ``BatteryDataset``

 Load data from another file format using battdat's `dataset readers <io.html>`_.
 If there is no available reader,
-build by passing a collection of tables and their schemas along with the metadata to the constructor.
+build by passing a collection of tables as :class:`~pandas.DataFrame` and their schemas along with the metadata to the constructor.
 Once assembled, all component tables will be saved and loaded together.

 .. code-block:: python
@@ -58,6 +58,11 @@ Once assembled, all component tables will be saved and loaded together.
         metadata=metadata
     )

+Columns of the dataframes can be any `NumPy data type <https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html#numpy.dtype.kind>`_
+except timedeltas (m), timestamps (M), or voids (V).
+The battery data toolkit does not yet support storing these types in HDF5 or Parquet formats.
+Columns where all values are arrays of the same size are also supported.
+
 Check that your data and metadata agree using the :meth:`~battdat.data.BatteryDataset.validate` method.

 .. code-block:: python
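
To make the new paragraph concrete (a hedged sketch; the column names are illustrative): a column where every row holds an array of the same size is valid, while a timedelta column is not yet storable.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'test_time': [0., 1.],                      # float (kind 'f'): supported
        'spectrum': [np.zeros(4), np.ones(4)],      # equal-size arrays: supported
    })

    # Not yet supported for HDF5/Parquet storage: timedelta (kind 'm')
    # df['elapsed'] = pd.to_timedelta(df['test_time'], unit='s')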
9 changes: 5 additions & 4 deletions tests/io/test_hdf.py
@@ -4,20 +4,21 @@

 from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table

-example_df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge']})
+example_df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge'], 'array': [[[1.]], [[0.]]]})


 def test_dtype():
     dtype = make_numpy_dtype_from_pandas(example_df)
-    assert dtype.names == ('a', 'b', 'c')
+    assert dtype.names == ('a', 'b', 'c', 'array')
+    assert dtype['array'].shape == (1, 1)


 def test_store_df(tmpdir):
     with tables.open_file(tmpdir / "example.h5", "w") as file:
         group = file.create_group('/', name='base')
         table = write_df_to_table(file, group, 'table', example_df)
-        assert tuple(table[0]) == (1, 1., b'charge')
+        assert tuple(table[0]) == (1, 1., b'charge', np.ones((1, 1)))

         df_copy = read_df_from_table(table)
-        assert (df_copy.columns == ['a', 'b', 'c']).all()
+        assert (df_copy.columns == ['a', 'b', 'c', 'array']).all()
         assert np.allclose(df_copy['b'], [1., 3.])
