Skip to content

Commit

Permalink
remove scanpy
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Jun 20, 2024
1 parent ba480f2 commit 68a889d
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 142 deletions.
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ python = "^3.12"
pandas = "^2.0.0"
xarray = "^2024.6.0"
anndata = "^0.10.7"
scanpy = "^1.10.1"
numpy = "^1.26"

[tool.poetry.dev-dependencies]
Expand Down
12 changes: 5 additions & 7 deletions tensordata/scRNA/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix, spmatrix
from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis
from scipy.sparse import csr_matrix

from .read_10x_mtx import read_10x_mtx

DATA_DIR = Path(__file__).parent / "scRNA"

Expand All @@ -23,7 +23,6 @@ def gate_thomson_cells(X) -> npt.ArrayLike:


def import_thomson(
gene_threshold=0.01,
anndata_path=Path("/opt") / "andrew" / "thomson_raw.h5ad",
) -> anndata.AnnData:
"""
Expand Down Expand Up @@ -69,7 +68,6 @@ def import_thomson(


def import_lupus(
gene_threshold=0.1,
anndata_path=Path("/opt") / "andrew" / "lupus" / "lupus.h5ad",
protein_path=Path("/opt")
/ "andrew"
Expand Down Expand Up @@ -137,7 +135,7 @@ def import_citeseq(
files = ["control", "ic_pod1", "ic_pod7", "sc_pod1", "sc_pod7"]

data = {
k: sc.read_10x_mtx(data_dir_path / k, gex_only=False, make_unique=True)
k: read_10x_mtx(data_dir_path / k, gex_only=False, make_unique=True)
for k in files
}

Expand All @@ -160,7 +158,7 @@ def import_HTAN(
data = {}

for filename in files:
data[filename.stem.split("_matrix")[0]] = sc.read_10x_mtx(
data[filename.stem.split("_matrix")[0]] = read_10x_mtx(
HTAN_path,
gex_only=False,
make_unique=True,
Expand Down
193 changes: 59 additions & 134 deletions tensordata/scRNA/read_10x_mtx.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from enum import Enum
from os import PathLike, fspath
from pathlib import Path
from typing import Literal

import anndata
import pandas as pd
from scipy.io import mmread
from scipy.sparse import csr_matrix


class Empty(Enum):
Expand All @@ -15,6 +18,24 @@ def __repr__(self) -> str:

_empty = Empty.token

text_exts = {
"csv",
"tsv",
"tab",
"data",
"txt", # these four are all equivalent
}
avail_exts = {
"anndata",
"xlsx",
"h5",
"h5ad",
"mtx",
"mtx.gz",
"soft.gz",
"loom",
} | text_exts


def read_10x_mtx(
path: Path | str,
Expand Down Expand Up @@ -80,20 +101,17 @@ def _read_10x_mtx(
*,
var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
make_unique: bool = True,
cache: bool = False,
cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
prefix: str = "",
is_legacy: bool,
) -> anndata.AnnData:
"""
Read mex from output from Cell Ranger v2- or v3+
"""
suffix = "" if is_legacy else ".gz"
adata = read(
path / f"{prefix}matrix.mtx{suffix}",
cache=cache,
cache_compression=cache_compression,
).T # transpose the data

filepath = path / f"{prefix}matrix.mtx{suffix}"
assert is_valid_filename(filepath)
adata = read_mtx(filepath).T
genes = pd.read_csv(
path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
header=None,
Expand All @@ -117,138 +135,45 @@ def _read_10x_mtx(
return adata


def read(
filename: Path | str,
backed: Literal["r", "r+"] | None = None,
*,
sheet: str | None = None,
ext: str | None = None,
delimiter: str | None = None,
first_column_names: bool = False,
backup_url: str | None = None,
cache: bool = False,
cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
**kwargs,
) -> anndata.AnnData:
"""\
Read file and return :class:`~anndata.AnnData` object.
def is_valid_filename(filename: Path, return_ext=False):
"""Check whether the argument is a filename."""
ext = filename.suffixes

# cases for gzipped/bzipped text files
if len(ext) == 2 and ext[0][1:] in text_exts and ext[1][1:] in ("gz", "bz2"):
return ext[0][1:] if return_ext else True
elif ext and ext[-1][1:] in avail_exts:
return ext[-1][1:] if return_ext else True
elif "".join(ext) == ".soft.gz":
return "soft.gz" if return_ext else True
elif "".join(ext) == ".mtx.gz":
return "mtx.gz" if return_ext else True
elif not return_ext:
return False
raise ValueError(
f"""\
{filename!r} does not end on a valid extension.
Please, provide one of the available extensions.
{avail_exts}
Text files with .gz and .bz2 extensions are also supported.\
"""
)


To speed up reading, consider passing ``cache=True``, which creates an hdf5
cache file.
def read_mtx(filename: PathLike, dtype: str = "float32") -> anndata.AnnData:
"""\
Read `.mtx` file.
Parameters
----------
filename
If the filename has no file extension, it is interpreted as a key for
generating a filename via ``sc.settings.writedir / (filename +
sc.settings.file_format_data)``. This is the same behavior as in
``sc.read(filename, ...)``.
backed
If ``'r'``, load :class:`~anndata.AnnData` in ``backed`` mode instead
of fully loading it into memory (`memory` mode). If you want to modify
backed attributes of the AnnData object, you need to choose ``'r+'``.
sheet
Name of sheet/table in hdf5 or Excel file.
ext
Extension that indicates the file type. If ``None``, uses extension of
filename.
delimiter
Delimiter that separates data within text file. If ``None``, will split at
arbitrary number of white spaces, which is different from enforcing
splitting at any single white space ``' '``.
first_column_names
Assume the first column stores row names. This is only necessary if
these are not strings: strings in the first column are automatically
assumed to be row names.
backup_url
Retrieve the file from an URL if not present on disk.
cache
If `False`, read from source, if `True`, read from fast 'h5ad' cache.
cache_compression
See the h5py :ref:`dataset_compression`.
(Default: `settings.cache_compression`)
kwargs
Parameters passed to :func:`~anndata.read_loom`.
Returns
-------
An :class:`~anndata.AnnData` object
The filename.
dtype
Numpy data type.
"""
filename = Path(filename) # allow passing strings
if is_valid_filename(filename):
return _read(
filename,
backed=backed,
sheet=sheet,
ext=ext,
delimiter=delimiter,
first_column_names=first_column_names,
backup_url=backup_url,
cache=cache,
cache_compression=cache_compression,
**kwargs,
)
# generate filename and read to dict
filekey = str(filename)
filename = settings.writedir / (filekey + "." + settings.file_format_data)
if not filename.exists():
raise ValueError(
f"Reading with filekey {filekey!r} failed, "
f"the inferred filename {filename!r} does not exist. "
)
return anndata.read_h5ad(filename, backed=backed)

def _read(
filename: Path,
*,
backed=None,
sheet=None,
ext=None,
delimiter=None,
first_column_names=None,
backup_url=None,
cache=False,
cache_compression=None,
suppress_cache_warning=False,
**kwargs,
):
if ext is not None and ext not in avail_exts:
raise ValueError(
"Please provide one of the available extensions.\n" f"{avail_exts}"
)
else:
ext = is_valid_filename(filename, return_ext=True)
is_present = _check_datafile_present_and_download(filename, backup_url=backup_url)
if not is_present:
logg.debug(f"... did not find original file {filename}")
# read hdf5 files
if ext in {"h5", "h5ad"}:
if sheet is None:
return read_h5ad(filename, backed=backed)
else:
logg.debug(f"reading sheet {sheet} from file {filename}")
return read_hdf(filename, sheet)
# read other file types
path_cache: Path = settings.cachedir / _slugify(filename).replace(
f".{ext}", ".h5ad"
)
if path_cache.suffix in {".gz", ".bz2"}:
path_cache = path_cache.with_suffix("")
if cache and path_cache.is_file():
logg.info(f"... reading from cache file {path_cache}")
return read_h5ad(path_cache)

if not is_present:
raise FileNotFoundError(f"Did not find file {filename}.")
logg.debug(f"reading {filename}")
if not cache and not suppress_cache_warning:
logg.hint(
"This might be very slow. Consider passing `cache=True`, "
"which enables much faster reading from a cache file."
)

adata = read_mtx(filename)

return adata
# could be rewritten accounting for dtype to be more performant
X = mmread(fspath(filename)).astype(dtype)

X = csr_matrix(X)
return anndata.AnnData(X)

0 comments on commit 68a889d

Please sign in to comment.