Skip to content

Commit

Permalink
clean up code
Browse files Browse the repository at this point in the history
  • Loading branch information
danielsf committed Dec 4, 2024
1 parent 22e95b3 commit 5ead218
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 49 deletions.
14 changes: 5 additions & 9 deletions src/cell_type_mapper/anndata_iterator/anndata_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
_load_disjoint_csr)

from cell_type_mapper.utils.anndata_utils import (
read_df_from_h5ad,
infer_attrs
)

Expand Down Expand Up @@ -154,7 +153,7 @@ def __init__(
else:
raise RuntimeError(
"Do not know how to iterate over anndata "
f"file\n{h5ad_path}")
f"with attrs\n{attrs}")

@property
def layer(self):
Expand Down Expand Up @@ -226,7 +225,6 @@ def _initialize_as_csc(
boolean indicating whether or not to leave the h5 handle
open (should be false when using cuda)
"""
write_as_csr = True
self.tmp_dir = tempfile.mkdtemp(
dir=tmp_dir,
prefix='anndata_iterator_')
Expand All @@ -239,14 +237,12 @@ def _initialize_as_csc(
file_size_bytes = file_stats.st_size
fudge_factor = 1.1 # just in case

obs = read_df_from_h5ad(h5ad_path, df_name='obs')
var = read_df_from_h5ad(h5ad_path, df_name='var')
array_shape = (len(obs), len(var))
array_shape = infer_attrs(
src_path=h5ad_path,
dataset=self.layer
)['shape']

if free_bytes < fudge_factor*file_size_bytes:
write_as_csr = False

if not write_as_csr:
raise RuntimeError(
"Cannot write data as CSR\n"
f"free_bytes {free_bytes}; file size {file_size_bytes}\n"
Expand Down
10 changes: 7 additions & 3 deletions src/cell_type_mapper/test_utils/anndata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,13 @@ def create_h5ad_without_encoding_type(
case where, for whatever reason, that metadata is missing
from the h5ad file.
Note: this function will only copy over obs, var, and
the contents of X and layers/. It will ignore the other
data structures in src_path.
Note: this function only copies
obs
var
X
layers/
raw/
uns
"""
obs = read_df_from_h5ad(src_path, df_name='obs')
var = read_df_from_h5ad(src_path, df_name='var')
Expand Down
9 changes: 9 additions & 0 deletions src/cell_type_mapper/utils/anndata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@ def copy_layer_to_x(
original_h5ad_path=original_h5ad_path,
new_h5ad_path=new_h5ad_path,
layer_key=layer_key)
else:
raise RuntimeError(
f"Unclear how to parse encoding-type {encoding_type}"
)


def _copy_layer_to_x_dense(
Expand Down Expand Up @@ -313,6 +317,11 @@ def shuffle_csr_h5ad_rows(
dataset='X'
)

if attrs['encoding-type'] != 'csr_matrix':
raise RuntimeError(
f'{src_path} is not CSR encoded. Attrs for X are:\n'
f'{attrs}')

obs = read_df_from_h5ad(
h5ad_path=src_path, df_name='obs')

Expand Down
42 changes: 5 additions & 37 deletions src/cell_type_mapper/validation/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import anndata
import gc
import h5py
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -127,6 +125,11 @@ def is_data_ge_zero(
elif 'csr' in attrs['encoding-type'] \
or 'csc' in attrs['encoding-type']:
dtype = in_file[f'{layer_key}/data'].dtype
else:
raise RuntimeError(
"Unclear what to make of encoding-typ in attrs:\n"
f"{attrs}"
)

if np.issubdtype(dtype, np.integer):
iinfo = np.iinfo(dtype)
Expand Down Expand Up @@ -234,41 +237,6 @@ def map_gene_ids_in_var(
return new_var, mapping_output['n_unmapped']


def _get_minmax_x_using_anndata(
h5ad_path,
rows_at_a_time=100000,
layer='X'):
"""
If you cannot intuit how X is encoded in the h5ad file, just use
anndata's API
Returns
-------
(min_val, max_val)
"""
if layer != 'X':
raise NotImplementedError(
"No efficient way to get minmax from layers; only X")

max_val = None
min_val = None
a_data = anndata.read_h5ad(h5ad_path, backed='r')
chunk_iterator = a_data.chunked_X(rows_at_a_time)
for chunk_package in chunk_iterator:
chunk = chunk_package[0]
this_max = chunk.max()
if max_val is None or this_max > max_val:
max_val = this_max
this_min = chunk.min()
if min_val is None or this_min < min_val:
min_val = this_min

del a_data
gc.collect()

return (min_val, max_val)


def _get_minmax_from_dense(x_dataset):
"""
Get the minimum and maximum values from the X array if it is dense
Expand Down

0 comments on commit 5ead218

Please sign in to comment.