remove scanpy

meyer-lab · Jun 20, 2024 · 68a889d · 68a889d
1 parent ba480f2
commit 68a889d
Show file tree

Hide file tree

Showing 3 changed files with 64 additions and 142 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,6 @@ python = "^3.12"
 pandas = "^2.0.0"
 xarray = "^2024.6.0"
 anndata = "^0.10.7"
-scanpy = "^1.10.1"
 numpy = "^1.26"
 
 [tool.poetry.dev-dependencies]

diff --git a/tensordata/scRNA/main.py b/tensordata/scRNA/main.py
@@ -4,9 +4,9 @@
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
-import scanpy as sc
-from scipy.sparse import csr_matrix, spmatrix
-from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis
+from scipy.sparse import csr_matrix
+
+from .read_10x_mtx import read_10x_mtx
 
 DATA_DIR = Path(__file__).parent / "scRNA"
 
@@ -23,7 +23,6 @@ def gate_thomson_cells(X) -> npt.ArrayLike:
 
 
 def import_thomson(
-    gene_threshold=0.01,
     anndata_path=Path("/opt") / "andrew" / "thomson_raw.h5ad",
 ) -> anndata.AnnData:
     """
@@ -69,7 +68,6 @@ def import_thomson(
 
 
 def import_lupus(
-    gene_threshold=0.1,
     anndata_path=Path("/opt") / "andrew" / "lupus" / "lupus.h5ad",
     protein_path=Path("/opt")
     / "andrew"
@@ -137,7 +135,7 @@ def import_citeseq(
     files = ["control", "ic_pod1", "ic_pod7", "sc_pod1", "sc_pod7"]
 
     data = {
-        k: sc.read_10x_mtx(data_dir_path / k, gex_only=False, make_unique=True)
+        k: read_10x_mtx(data_dir_path / k, gex_only=False, make_unique=True)
         for k in files
     }
 
@@ -160,7 +158,7 @@ def import_HTAN(
     data = {}
 
     for filename in files:
-        data[filename.stem.split("_matrix")[0]] = sc.read_10x_mtx(
+        data[filename.stem.split("_matrix")[0]] = read_10x_mtx(
             HTAN_path,
             gex_only=False,
             make_unique=True,

diff --git a/tensordata/scRNA/read_10x_mtx.py b/tensordata/scRNA/read_10x_mtx.py
@@ -1,9 +1,12 @@
 from enum import Enum
+from os import PathLike, fspath
 from pathlib import Path
 from typing import Literal
 
 import anndata
 import pandas as pd
+from scipy.io import mmread
+from scipy.sparse import csr_matrix
 
 
 class Empty(Enum):
@@ -15,6 +18,24 @@ def __repr__(self) -> str:
 
 _empty = Empty.token
 
+text_exts = {
+    "csv",
+    "tsv",
+    "tab",
+    "data",
+    "txt",  # these four are all equivalent
+}
+avail_exts = {
+    "anndata",
+    "xlsx",
+    "h5",
+    "h5ad",
+    "mtx",
+    "mtx.gz",
+    "soft.gz",
+    "loom",
+} | text_exts
+
 
 def read_10x_mtx(
     path: Path | str,
@@ -80,20 +101,17 @@ def _read_10x_mtx(
     *,
     var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
     make_unique: bool = True,
-    cache: bool = False,
-    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
     prefix: str = "",
     is_legacy: bool,
 ) -> anndata.AnnData:
     """
     Read mex from output from Cell Ranger v2- or v3+
     """
     suffix = "" if is_legacy else ".gz"
-    adata = read(
-        path / f"{prefix}matrix.mtx{suffix}",
-        cache=cache,
-        cache_compression=cache_compression,
-    ).T  # transpose the data
+
+    filepath = path / f"{prefix}matrix.mtx{suffix}"
+    assert is_valid_filename(filepath)
+    adata = read_mtx(filepath).T
     genes = pd.read_csv(
         path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
         header=None,
@@ -117,138 +135,45 @@ def _read_10x_mtx(
     return adata
 
 
-def read(
-    filename: Path | str,
-    backed: Literal["r", "r+"] | None = None,
-    *,
-    sheet: str | None = None,
-    ext: str | None = None,
-    delimiter: str | None = None,
-    first_column_names: bool = False,
-    backup_url: str | None = None,
-    cache: bool = False,
-    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
-    **kwargs,
-) -> anndata.AnnData:
-    """\
-    Read file and return :class:`~anndata.AnnData` object.
+def is_valid_filename(filename: Path, return_ext=False):
+    """Check whether the argument is a filename."""
+    ext = filename.suffixes
+
+    # cases for gzipped/bzipped text files
+    if len(ext) == 2 and ext[0][1:] in text_exts and ext[1][1:] in ("gz", "bz2"):
+        return ext[0][1:] if return_ext else True
+    elif ext and ext[-1][1:] in avail_exts:
+        return ext[-1][1:] if return_ext else True
+    elif "".join(ext) == ".soft.gz":
+        return "soft.gz" if return_ext else True
+    elif "".join(ext) == ".mtx.gz":
+        return "mtx.gz" if return_ext else True
+    elif not return_ext:
+        return False
+    raise ValueError(
+        f"""\
+{filename!r} does not end on a valid extension.
+Please, provide one of the available extensions.
+{avail_exts}
+Text files with .gz and .bz2 extensions are also supported.\
+"""
+    )
+
 
-    To speed up reading, consider passing ``cache=True``, which creates an hdf5
-    cache file.
+def read_mtx(filename: PathLike, dtype: str = "float32") -> anndata.AnnData:
+    """\
+    Read `.mtx` file.
 
     Parameters
     ----------
     filename
-        If the filename has no file extension, it is interpreted as a key for
-        generating a filename via ``sc.settings.writedir / (filename +
-        sc.settings.file_format_data)``.  This is the same behavior as in
-        ``sc.read(filename, ...)``.
-    backed
-        If ``'r'``, load :class:`~anndata.AnnData` in ``backed`` mode instead
-        of fully loading it into memory (`memory` mode). If you want to modify
-        backed attributes of the AnnData object, you need to choose ``'r+'``.
-    sheet
-        Name of sheet/table in hdf5 or Excel file.
-    ext
-        Extension that indicates the file type. If ``None``, uses extension of
-        filename.
-    delimiter
-        Delimiter that separates data within text file. If ``None``, will split at
-        arbitrary number of white spaces, which is different from enforcing
-        splitting at any single white space ``' '``.
-    first_column_names
-        Assume the first column stores row names. This is only necessary if
-        these are not strings: strings in the first column are automatically
-        assumed to be row names.
-    backup_url
-        Retrieve the file from an URL if not present on disk.
-        
-    cache
-        If `False`, read from source, if `True`, read from fast 'h5ad' cache.
-    cache_compression
-        See the h5py :ref:`dataset_compression`.
-        (Default: `settings.cache_compression`)
-    kwargs
-        Parameters passed to :func:`~anndata.read_loom`.
-
-    Returns
-    -------
-    An :class:`~anndata.AnnData` object
+        The filename.
+    dtype
+        Numpy data type.
     """
-    filename = Path(filename)  # allow passing strings
-    if is_valid_filename(filename):
-        return _read(
-            filename,
-            backed=backed,
-            sheet=sheet,
-            ext=ext,
-            delimiter=delimiter,
-            first_column_names=first_column_names,
-            backup_url=backup_url,
-            cache=cache,
-            cache_compression=cache_compression,
-            **kwargs,
-        )
-    # generate filename and read to dict
-    filekey = str(filename)
-    filename = settings.writedir / (filekey + "." + settings.file_format_data)
-    if not filename.exists():
-        raise ValueError(
-            f"Reading with filekey {filekey!r} failed, "
-            f"the inferred filename {filename!r} does not exist. "
-        )
-    return anndata.read_h5ad(filename, backed=backed)
-
-def _read(
-    filename: Path,
-    *,
-    backed=None,
-    sheet=None,
-    ext=None,
-    delimiter=None,
-    first_column_names=None,
-    backup_url=None,
-    cache=False,
-    cache_compression=None,
-    suppress_cache_warning=False,
-    **kwargs,
-):
-    if ext is not None and ext not in avail_exts:
-        raise ValueError(
-            "Please provide one of the available extensions.\n" f"{avail_exts}"
-        )
-    else:
-        ext = is_valid_filename(filename, return_ext=True)
-    is_present = _check_datafile_present_and_download(filename, backup_url=backup_url)
-    if not is_present:
-        logg.debug(f"... did not find original file {filename}")
-    # read hdf5 files
-    if ext in {"h5", "h5ad"}:
-        if sheet is None:
-            return read_h5ad(filename, backed=backed)
-        else:
-            logg.debug(f"reading sheet {sheet} from file {filename}")
-            return read_hdf(filename, sheet)
-    # read other file types
-    path_cache: Path = settings.cachedir / _slugify(filename).replace(
-        f".{ext}", ".h5ad"
-    )
-    if path_cache.suffix in {".gz", ".bz2"}:
-        path_cache = path_cache.with_suffix("")
-    if cache and path_cache.is_file():
-        logg.info(f"... reading from cache file {path_cache}")
-        return read_h5ad(path_cache)
-
-    if not is_present:
-        raise FileNotFoundError(f"Did not find file {filename}.")
-    logg.debug(f"reading {filename}")
-    if not cache and not suppress_cache_warning:
-        logg.hint(
-            "This might be very slow. Consider passing `cache=True`, "
-            "which enables much faster reading from a cache file."
-        )
-
-    adata = read_mtx(filename)
 
-    return adata
+    # could be rewritten accounting for dtype to be more performant
+    X = mmread(fspath(filename)).astype(dtype)
 
+    X = csr_matrix(X)
+    return anndata.AnnData(X)