Skip to content

Commit

Permalink
Merge pull request #17 from YosefLab/h5ad-update
Browse files Browse the repository at this point in the history
H5ad update
  • Loading branch information
colganwi authored Jun 18, 2024
2 parents 0931bd8 + 0036edd commit a89913c
Show file tree
Hide file tree
Showing 12 changed files with 197 additions and 121 deletions.
20 changes: 19 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,22 @@ and this project adheres to [Semantic Versioning][].

### Added

- Basic tool, preprocessing and plotting functions
## [0.0.2] - 2024-06-18

### Changed

- Empty trees are now allowed to avoid error on subsetting (#13)
- How trees are stored in h5ad and zarr files (#16)
- Format of the label column when a leaf belongs to multiple trees: now a comma-separated string instead of a list (`[1,2]` -> `1,2`) (#16)

### Fixed

- Fixed issue with slow read/write of large trees

## [0.0.1] - 2024-05-13

### Added

- TreeData class for storing and manipulating trees
- Read/write trees to h5ad and zarr files
- Concatenate trees with similar API to AnnData
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ For questions and bug reports please use the [issue tracker][issue-tracker].
[nx.DiGraph]: https://networkx.org/documentation/stable/reference/classes/digraph.html
[scverse-discourse]: https://discourse.scverse.org/
[issue-tracker]: https://github.com/YosefLab/treedata/issues
[changelog]: https://treedata.readthedocs.io/latest/changelog.html
[changelog]: https://treedata.readthedocs.io/en/latest/changelog.html
[link-docs]: https://treedata.readthedocs.io
[link-getting-started]: https://treedata.readthedocs.io/en/latest/notebooks/getting-started.html
[link-api]: https://treedata.readthedocs.io/en/latest/api.html
78 changes: 25 additions & 53 deletions docs/_templates/autosummary/class.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,56 +6,28 @@
.. autoclass:: {{ objname }}

{% block attributes %}
{% if attributes %}
Attributes table
~~~~~~~~~~~~~~~~~~

.. autosummary::
{% for item in attributes %}
~{{ fullname }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}

{% block methods %}
{% if methods %}
Methods table
~~~~~~~~~~~~~

.. autosummary::
{% for item in methods %}
{%- if item != '__init__' %}
~{{ fullname }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}
{% endblock %}

{% block attributes_documentation %}
{% if attributes %}
Attributes
~~~~~~~~~~~

{% for item in attributes %}

.. autoattribute:: {{ [objname, item] | join(".") }}
{%- endfor %}

{% endif %}
{% endblock %}

{% block methods_documentation %}
{% if methods %}
Methods
~~~~~~~

{% for item in methods %}
{%- if item != '__init__' %}

.. automethod:: {{ [objname, item] | join(".") }}
{%- endif -%}
{%- endfor %}

{% endif %}
{% endblock %}
{% block attributes %}
{% if attributes %}
.. rubric:: Attributes

.. autosummary::
:toctree: .
{% for item in attributes %}
~{{ fullname }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}

{% block methods %}
{% if methods %}
.. rubric:: Methods

.. autosummary::
:toctree: .
{% for item in methods %}
{%- if item != '__init__' %}
~{{ fullname }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}
{% endblock %}
3 changes: 2 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

bibtex_bibfiles = ["references.bib"]
templates_path = ["_templates"]
nitpicky = True # Warn about broken links
nitpicky = False # Set to True to warn about broken links
needs_sphinx = "4.0"

html_context = {
Expand Down Expand Up @@ -74,6 +74,7 @@
napoleon_include_init_with_doc = False
napoleon_use_rtype = True # having a separate entry generally helps readability
napoleon_use_param = True
napoleon_custom_sections = [("Params", "Parameters")]
myst_heading_anchors = 6 # create anchors for h1-h6
myst_enable_extensions = [
"amsmath",
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ urls.Source = "https://github.com/YosefLab/treedata"
urls.Home-page = "https://github.com/YosefLab/treedata"
dependencies = [
"anndata",
"h5py",
"numpy",
"pandas",
"pathlib",
"pyarrow",
"networkx",
"session-info",
Expand Down
2 changes: 1 addition & 1 deletion src/treedata/_core/aligned_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def _validate_tree(self, tree: nx.DiGraph, key: str) -> nx.DiGraph:
def _update_tree_labels(self):
if self.parent._tree_label is not None:
if self.parent.allow_overlap:
mapping = self._leaf_to_tree
mapping = {k: ",".join(map(str, v)) for k, v in self._leaf_to_tree.items()}
else:
mapping = {k: v[0] for k, v in self._leaf_to_tree.items()}
getattr(self.parent, self.dim)[self.parent._tree_label] = getattr(self.parent, f"{self.dim}_names").map(
Expand Down
4 changes: 2 additions & 2 deletions src/treedata/_core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def concat(
) -> TreeData:
"""Concatenates TreeData objects along an axis.
Params
------
Parameters
----------
tdatas
The objects to be concatenated. If a Mapping is passed, keys are used for the `keys`
argument and values are concatenated.
Expand Down
27 changes: 21 additions & 6 deletions src/treedata/_core/read.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import annotations

import json
from collections.abc import MutableMapping, Sequence
from pathlib import Path
from typing import (
Literal,
)

import anndata as ad
import h5py
import zarr
from scipy import sparse

Expand All @@ -15,16 +17,14 @@
from treedata._utils import dict_to_digraph


def _tdata_from_adata(tdata) -> TreeData:
def _tdata_from_adata(tdata, treedata_attrs=None) -> TreeData:
"""Create a TreeData object parsing attribute from AnnData uns field."""
tdata.__class__ = TreeData
if "treedata_attrs" in tdata.uns.keys():
treedata_attrs = tdata.uns["treedata_attrs"]
if treedata_attrs is not None:
tdata._tree_label = treedata_attrs["label"] if "label" in treedata_attrs.keys() else None
tdata._allow_overlap = bool(treedata_attrs["allow_overlap"])
tdata._obst = AxisTrees(tdata, 0, vals={k: dict_to_digraph(v) for k, v in treedata_attrs["obst"].items()})
tdata._vart = AxisTrees(tdata, 1, vals={k: dict_to_digraph(v) for k, v in treedata_attrs["vart"].items()})
del tdata.uns["treedata_attrs"]
else:
tdata._tree_label = None
tdata._allow_overlap = False
Expand Down Expand Up @@ -71,7 +71,14 @@ def read_h5ad(
as_sparse_fmt=as_sparse_fmt,
chunk_size=chunk_size,
)
return _tdata_from_adata(adata)
with h5py.File(filename, "r") as f:
if "raw.treedata" in f:
treedata_attrs = json.loads(f["raw.treedata"][()])
else:
treedata_attrs = None
tdata = _tdata_from_adata(adata, treedata_attrs)

return tdata


def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> TreeData:
Expand All @@ -83,4 +90,12 @@ def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> TreeData:
The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class.
"""
adata = ad.read_zarr(store)
return _tdata_from_adata(adata)

with zarr.open(store, mode="r") as f:
if "raw.treedata" in f:
treedata_attrs = json.loads(f["raw.treedata"][()])
else:
treedata_attrs = None
tdata = _tdata_from_adata(adata, treedata_attrs)

return tdata
92 changes: 68 additions & 24 deletions src/treedata/_core/treedata.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
from __future__ import annotations

import warnings
import json
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from copy import deepcopy
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Literal,
)

import anndata as ad
import h5py
import networkx as nx
import numpy as np
import pandas as pd
from anndata._core.index import Index, Index1D
import zarr
from anndata._core.index import Index, Index1D, _subset
from anndata._io import write_h5ad, write_zarr
from scipy import sparse

from treedata._utils import digraph_to_dict
Expand Down Expand Up @@ -282,17 +287,45 @@ def _treedata_attrs(self) -> dict:
"allow_overlap": self.allow_overlap,
}

def _mutated_copy(self, **kwargs):
    """Build a new TreeData from this object, with kwargs overriding attributes.

    Any of ``obs``, ``var``, ``obsm``, ``varm``, ``obsp``, ``varp``, ``obst``,
    ``vart``, ``layers``, ``X``, ``uns`` and ``raw`` given in ``kwargs`` is
    passed to the constructor as-is; every other attribute is copied from
    ``self``. ``label`` and ``allow_overlap`` are always taken from ``self``.

    Raises
    ------
    NotImplementedError
        If the object is backed and ``X`` (or ``raw``, when ``self.raw`` is
        set) is not supplied through ``kwargs``.
    """
    # Backed data is not copied here, so the caller has to hand it in.
    if self.isbacked and ("X" not in kwargs or (self.raw is not None and "raw" not in kwargs)):
        raise NotImplementedError(
            "This function does not currently handle backed objects "
            "internally, this should be dealt with before."
        )

    copied_fields = ("obs", "var", "obsm", "varm", "obsp", "varp", "obst", "vart", "layers")
    init_args = {"label": self.label, "allow_overlap": self.allow_overlap}
    # Only touch (and copy) the attribute on self when no override was given.
    init_args.update(
        (name, kwargs[name] if name in kwargs else getattr(self, name).copy())
        for name in copied_fields
    )

    # X is optional: it is left out entirely when neither side provides one.
    if "X" in kwargs:
        init_args["X"] = kwargs["X"]
    elif self._has_X():
        init_args["X"] = self.X.copy()

    init_args["uns"] = kwargs["uns"] if "uns" in kwargs else deepcopy(self._uns)

    # raw is optional as well.
    if "raw" in kwargs:
        init_args["raw"] = kwargs["raw"]
    elif self.raw is not None:
        init_args["raw"] = self.raw.copy()

    return TreeData(**init_args)

def copy(self, filename: PathLike | None = None) -> TreeData:
"""Full copy, optionally on disk"""
adata = super().copy(filename=filename)
"""Full copy, optionally on disk."""
if not self.isbacked:
treedata_copy = TreeData(
adata,
obst=self.obst.copy(),
vart=self.vart.copy(),
label=self.label,
allow_overlap=self.allow_overlap,
)
if self.is_view and self._has_X():
return self._mutated_copy(X=_subset(self._adata_ref.X, (self._oidx, self._vidx)).copy())
else:
return self._mutated_copy()
else:
from .read import read_h5ad

Expand All @@ -303,10 +336,8 @@ def copy(self, filename: PathLike | None = None) -> TreeData:
"To load the object into memory, use `.to_memory()`."
)
mode = self.file._filemode
adata.uns["treedata_attrs"] = self._treedata_attrs()
adata.write_h5ad(filename)
treedata_copy = read_h5ad(filename, backed=mode)
return treedata_copy
self.write_h5ad(filename)
return read_h5ad(filename, backed=mode)

def transpose(self) -> TreeData:
"""Transpose whole object
Expand Down Expand Up @@ -347,13 +378,23 @@ def write_h5ad(
Sparse arrays in TreeData object to write as dense. Currently only
supports `X` and `raw/X`.
"""
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.uns["treedata_attrs"] = self._treedata_attrs()
super().write_h5ad(
filename=filename, compression=compression, compression_opts=compression_opts, as_dense=as_dense
if filename is None and not self.isbacked:
raise ValueError("Provide a filename!")
if filename is None:
filename = self.filename

write_h5ad(
Path(filename),
self,
compression=compression,
compression_opts=compression_opts,
as_dense=as_dense,
)
self.uns.pop("treedata_attrs")

with h5py.File(filename, "a") as f:
if "raw.treedata" in f:
del f["raw.treedata"]
f.create_dataset("raw.treedata", data=json.dumps(self._treedata_attrs()))

write = write_h5ad # a shortcut and backwards compat

Expand All @@ -371,9 +412,12 @@ def write_zarr(
chunks
Chunk shape.
"""
adata = self.to_adata()
adata.uns["treedata_attrs"] = self._treedata_attrs()
adata.write_zarr(store=store, chunks=chunks)
write_zarr(store, self.to_adata(), chunks=chunks)

# Store the tree attributes as a JSON string under the "raw.treedata" key,
# next to the AnnData content written above.
with zarr.open(store, mode="a") as f:
    # Test for the same key that is deleted and re-created below; the
    # previous check looked for "treedata", which did not match it.
    if "raw.treedata" in f:
        del f["raw.treedata"]
    f.create_dataset("raw.treedata", data=json.dumps(self._treedata_attrs()))

def to_memory(self, copy=False) -> TreeData:
"""Return a new AnnData object with all backed arrays loaded into memory.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_tree_label(X, tree, dim):
assert getattr(tdata, dim)["tree"].tolist() == ["0", "0", "1"]
# Test tree label with overlap
tdata = td.TreeData(X, obst={"0": tree, "1": tree}, label="tree", vart={"0": tree, "1": tree}, allow_overlap=True)
assert getattr(tdata, dim)["tree"].tolist() == [["0", "1"], ["0", "1"], []]
assert getattr(tdata, dim).loc["0", "tree"] == "0,1"
# Test label already present warning
df = pd.DataFrame({"tree": ["bad", "bad", "bad"]})
with pytest.warns(UserWarning):
Expand Down
Loading

0 comments on commit a89913c

Please sign in to comment.