diff --git a/src/cell_type_mapper/anndata_iterator/anndata_iterator.py b/src/cell_type_mapper/anndata_iterator/anndata_iterator.py index 38fc7a66..ad3552b3 100644 --- a/src/cell_type_mapper/anndata_iterator/anndata_iterator.py +++ b/src/cell_type_mapper/anndata_iterator/anndata_iterator.py @@ -19,7 +19,6 @@ _load_disjoint_csr) from cell_type_mapper.utils.anndata_utils import ( - read_df_from_h5ad, infer_attrs ) @@ -154,7 +153,7 @@ def __init__( else: raise RuntimeError( "Do not know how to iterate over anndata " - f"file\n{h5ad_path}") + f"with attrs\n{attrs}") @property def layer(self): @@ -226,7 +225,6 @@ def _initialize_as_csc( boolean indicating whether or not to leave the h5 handle open (should be false when using cuda) """ - write_as_csr = True self.tmp_dir = tempfile.mkdtemp( dir=tmp_dir, prefix='anndata_iterator_') @@ -239,14 +237,12 @@ def _initialize_as_csc( file_size_bytes = file_stats.st_size fudge_factor = 1.1 # just in case - obs = read_df_from_h5ad(h5ad_path, df_name='obs') - var = read_df_from_h5ad(h5ad_path, df_name='var') - array_shape = (len(obs), len(var)) + array_shape = infer_attrs( + src_path=h5ad_path, + dataset=self.layer + )['shape'] if free_bytes < fudge_factor*file_size_bytes: - write_as_csr = False - - if not write_as_csr: raise RuntimeError( "Cannot write data as CSR\n" f"free_bytes {free_bytes}; file size {file_size_bytes}\n" diff --git a/src/cell_type_mapper/test_utils/anndata_utils.py b/src/cell_type_mapper/test_utils/anndata_utils.py index 6d7472ef..27b9a481 100644 --- a/src/cell_type_mapper/test_utils/anndata_utils.py +++ b/src/cell_type_mapper/test_utils/anndata_utils.py @@ -20,9 +20,13 @@ def create_h5ad_without_encoding_type( case where, for whatever reason, that metadata is missing from the h5ad file. - Note: this function will only copy over obs, var, and - the contents of X and layers/. It will ignore the other - data structures in src_path. + Note: this function will only copies + obs + var + X + layers/ + raw/ + uns """ obs = read_df_from_h5ad(src_path, df_name='obs') var = read_df_from_h5ad(src_path, df_name='var') diff --git a/src/cell_type_mapper/utils/anndata_utils.py b/src/cell_type_mapper/utils/anndata_utils.py index 15e438d8..14cc38b0 100644 --- a/src/cell_type_mapper/utils/anndata_utils.py +++ b/src/cell_type_mapper/utils/anndata_utils.py @@ -186,6 +186,10 @@ def copy_layer_to_x( original_h5ad_path=original_h5ad_path, new_h5ad_path=new_h5ad_path, layer_key=layer_key) + else: + raise RuntimeError( + f"Unclear how to parse encoding-type {encoding_type}" + ) def _copy_layer_to_x_dense( @@ -313,6 +317,11 @@ def shuffle_csr_h5ad_rows( dataset='X' ) + if attrs['encoding-type'] != 'csr_matrix': + raise RuntimeError( + f'{src_path} is not CSR encoded. Attrs for X are:\n' + f'{attrs}') + obs = read_df_from_h5ad( h5ad_path=src_path, df_name='obs') diff --git a/src/cell_type_mapper/validation/utils.py b/src/cell_type_mapper/validation/utils.py index 4da4859c..12853903 100644 --- a/src/cell_type_mapper/validation/utils.py +++ b/src/cell_type_mapper/validation/utils.py @@ -1,5 +1,3 @@ -import anndata -import gc import h5py import numpy as np import pandas as pd @@ -127,6 +125,11 @@ def is_data_ge_zero( elif 'csr' in attrs['encoding-type'] \ or 'csc' in attrs['encoding-type']: dtype = in_file[f'{layer_key}/data'].dtype + else: + raise RuntimeError( + "Unclear what to make of encoding-typ in attrs:\n" + f"{attrs}" + ) if np.issubdtype(dtype, np.integer): iinfo = np.iinfo(dtype) @@ -234,41 +237,6 @@ def map_gene_ids_in_var( return new_var, mapping_output['n_unmapped'] -def _get_minmax_x_using_anndata( - h5ad_path, - rows_at_a_time=100000, - layer='X'): - """ - If you cannot intuit how X is encoded in the h5ad file, just use - anndata's API - - Returns - ------- - (min_val, max_val) - """ - if layer != 'X': - raise NotImplementedError( - "No efficient way to get minmax from layers; only X") - - max_val = None - min_val = None - a_data = anndata.read_h5ad(h5ad_path, backed='r') - chunk_iterator = a_data.chunked_X(rows_at_a_time) - for chunk_package in chunk_iterator: - chunk = chunk_package[0] - this_max = chunk.max() - if max_val is None or this_max > max_val: - max_val = this_max - this_min = chunk.min() - if min_val is None or this_min < min_val: - min_val = this_min - - del a_data - gc.collect() - - return (min_val, max_val) - - def _get_minmax_from_dense(x_dataset): """ Get the minimum and maximum values from the X array if it is dense