diff --git a/pyproject.toml b/pyproject.toml index 10d4381..153fa9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,6 @@ python = "^3.12" pandas = "^2.0.0" xarray = "^2024.6.0" anndata = "^0.10.7" -scanpy = "^1.10.1" numpy = "^1.26" [tool.poetry.dev-dependencies] diff --git a/tensordata/scRNA/main.py b/tensordata/scRNA/main.py index 83a6769..26d29bf 100644 --- a/tensordata/scRNA/main.py +++ b/tensordata/scRNA/main.py @@ -4,9 +4,9 @@ import numpy as np import numpy.typing as npt import pandas as pd -import scanpy as sc -from scipy.sparse import csr_matrix, spmatrix -from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis +from scipy.sparse import csr_matrix + +from .read_10x_mtx import read_10x_mtx DATA_DIR = Path(__file__).parent / "scRNA" @@ -23,7 +23,6 @@ def gate_thomson_cells(X) -> npt.ArrayLike: def import_thomson( - gene_threshold=0.01, anndata_path=Path("/opt") / "andrew" / "thomson_raw.h5ad", ) -> anndata.AnnData: """ @@ -69,7 +68,6 @@ def import_thomson( def import_lupus( - gene_threshold=0.1, anndata_path=Path("/opt") / "andrew" / "lupus" / "lupus.h5ad", protein_path=Path("/opt") / "andrew" @@ -137,7 +135,7 @@ def import_citeseq( files = ["control", "ic_pod1", "ic_pod7", "sc_pod1", "sc_pod7"] data = { - k: sc.read_10x_mtx(data_dir_path / k, gex_only=False, make_unique=True) + k: read_10x_mtx(data_dir_path / k, gex_only=False, make_unique=True) for k in files } @@ -160,7 +158,7 @@ def import_HTAN( data = {} for filename in files: - data[filename.stem.split("_matrix")[0]] = sc.read_10x_mtx( + data[filename.stem.split("_matrix")[0]] = read_10x_mtx( HTAN_path, gex_only=False, make_unique=True, diff --git a/tensordata/scRNA/read_10x_mtx.py b/tensordata/scRNA/read_10x_mtx.py index d249f95..8044909 100644 --- a/tensordata/scRNA/read_10x_mtx.py +++ b/tensordata/scRNA/read_10x_mtx.py @@ -1,9 +1,12 @@ from enum import Enum +from os import PathLike, fspath from pathlib import Path from typing import Literal import anndata import pandas as pd +from scipy.io import mmread +from scipy.sparse import csr_matrix class Empty(Enum): @@ -15,6 +18,24 @@ def __repr__(self) -> str: _empty = Empty.token +text_exts = { + "csv", + "tsv", + "tab", + "data", + "txt", # these four are all equivalent +} +avail_exts = { + "anndata", + "xlsx", + "h5", + "h5ad", + "mtx", + "mtx.gz", + "soft.gz", + "loom", +} | text_exts + def read_10x_mtx( path: Path | str, @@ -80,8 +101,6 @@ def _read_10x_mtx( *, var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols", make_unique: bool = True, - cache: bool = False, - cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty, prefix: str = "", is_legacy: bool, ) -> anndata.AnnData: @@ -89,11 +108,10 @@ def _read_10x_mtx( Read mex from output from Cell Ranger v2- or v3+ """ suffix = "" if is_legacy else ".gz" - adata = read( - path / f"{prefix}matrix.mtx{suffix}", - cache=cache, - cache_compression=cache_compression, - ).T # transpose the data + + filepath = path / f"{prefix}matrix.mtx{suffix}" + assert is_valid_filename(filepath) + adata = read_mtx(filepath).T genes = pd.read_csv( path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}", header=None, @@ -117,138 +135,45 @@ def _read_10x_mtx( return adata -def read( - filename: Path | str, - backed: Literal["r", "r+"] | None = None, - *, - sheet: str | None = None, - ext: str | None = None, - delimiter: str | None = None, - first_column_names: bool = False, - backup_url: str | None = None, - cache: bool = False, - cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty, - **kwargs, -) -> anndata.AnnData: - """\ - Read file and return :class:`~anndata.AnnData` object. +def is_valid_filename(filename: Path, return_ext=False): + """Check whether the argument is a filename.""" + ext = filename.suffixes + + # cases for gzipped/bzipped text files + if len(ext) == 2 and ext[0][1:] in text_exts and ext[1][1:] in ("gz", "bz2"): + return ext[0][1:] if return_ext else True + elif ext and ext[-1][1:] in avail_exts: + return ext[-1][1:] if return_ext else True + elif "".join(ext) == ".soft.gz": + return "soft.gz" if return_ext else True + elif "".join(ext) == ".mtx.gz": + return "mtx.gz" if return_ext else True + elif not return_ext: + return False + raise ValueError( + f"""\ +{filename!r} does not end on a valid extension. +Please, provide one of the available extensions. +{avail_exts} +Text files with .gz and .bz2 extensions are also supported.\ +""" + ) + - To speed up reading, consider passing ``cache=True``, which creates an hdf5 - cache file. +def read_mtx(filename: PathLike, dtype: str = "float32") -> anndata.AnnData: + """\ + Read `.mtx` file. Parameters ---------- filename - If the filename has no file extension, it is interpreted as a key for - generating a filename via ``sc.settings.writedir / (filename + - sc.settings.file_format_data)``. This is the same behavior as in - ``sc.read(filename, ...)``. - backed - If ``'r'``, load :class:`~anndata.AnnData` in ``backed`` mode instead - of fully loading it into memory (`memory` mode). If you want to modify - backed attributes of the AnnData object, you need to choose ``'r+'``. - sheet - Name of sheet/table in hdf5 or Excel file. - ext - Extension that indicates the file type. If ``None``, uses extension of - filename. - delimiter - Delimiter that separates data within text file. If ``None``, will split at - arbitrary number of white spaces, which is different from enforcing - splitting at any single white space ``' '``. - first_column_names - Assume the first column stores row names. This is only necessary if - these are not strings: strings in the first column are automatically - assumed to be row names. - backup_url - Retrieve the file from an URL if not present on disk. - - cache - If `False`, read from source, if `True`, read from fast 'h5ad' cache. - cache_compression - See the h5py :ref:`dataset_compression`. - (Default: `settings.cache_compression`) - kwargs - Parameters passed to :func:`~anndata.read_loom`. - - Returns - ------- - An :class:`~anndata.AnnData` object + The filename. + dtype + Numpy data type. """ - filename = Path(filename) # allow passing strings - if is_valid_filename(filename): - return _read( - filename, - backed=backed, - sheet=sheet, - ext=ext, - delimiter=delimiter, - first_column_names=first_column_names, - backup_url=backup_url, - cache=cache, - cache_compression=cache_compression, - **kwargs, - ) - # generate filename and read to dict - filekey = str(filename) - filename = settings.writedir / (filekey + "." + settings.file_format_data) - if not filename.exists(): - raise ValueError( - f"Reading with filekey {filekey!r} failed, " - f"the inferred filename {filename!r} does not exist. " - ) - return anndata.read_h5ad(filename, backed=backed) - -def _read( - filename: Path, - *, - backed=None, - sheet=None, - ext=None, - delimiter=None, - first_column_names=None, - backup_url=None, - cache=False, - cache_compression=None, - suppress_cache_warning=False, - **kwargs, -): - if ext is not None and ext not in avail_exts: - raise ValueError( - "Please provide one of the available extensions.\n" f"{avail_exts}" - ) - else: - ext = is_valid_filename(filename, return_ext=True) - is_present = _check_datafile_present_and_download(filename, backup_url=backup_url) - if not is_present: - logg.debug(f"... did not find original file {filename}") - # read hdf5 files - if ext in {"h5", "h5ad"}: - if sheet is None: - return read_h5ad(filename, backed=backed) - else: - logg.debug(f"reading sheet {sheet} from file {filename}") - return read_hdf(filename, sheet) - # read other file types - path_cache: Path = settings.cachedir / _slugify(filename).replace( - f".{ext}", ".h5ad" - ) - if path_cache.suffix in {".gz", ".bz2"}: - path_cache = path_cache.with_suffix("") - if cache and path_cache.is_file(): - logg.info(f"... reading from cache file {path_cache}") - return read_h5ad(path_cache) - - if not is_present: - raise FileNotFoundError(f"Did not find file {filename}.") - logg.debug(f"reading {filename}") - if not cache and not suppress_cache_warning: - logg.hint( - "This might be very slow. Consider passing `cache=True`, " - "which enables much faster reading from a cache file." - ) - - adata = read_mtx(filename) - return adata + # could be rewritten accounting for dtype to be more performant + X = mmread(fspath(filename)).astype(dtype) + X = csr_matrix(X) + return anndata.AnnData(X)