From ce7b5878ebb3c20789f0d5927f4ed83973877b90 Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Tue, 12 Jan 2021 10:21:25 +0100
Subject: [PATCH] made dataset identification more robust

---
 CHANGELOG.md                             |  4 ++
 docs/changelog.md                        |  4 ++
 scaden/preprocessing/bulk_simulation.py  | 65 ++++++++++--------------
 scaden/preprocessing/create_h5ad_file.py | 12 ++---
 setup.py                                 | 57 ++++++++++++---------
 5 files changed, 74 insertions(+), 68 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a53be87..d29cc1e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Scaden Changelog
 
+### Version 1.0.1
+
+* Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66)
+
 ### Version 1.0.0
 
 * Rebuild Scaden model and training to use TF2 Keras API instead of the old compatibility functions

diff --git a/docs/changelog.md b/docs/changelog.md
index 28e7a4a..1ad49f1 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,9 @@
 # Scaden Changelog
 
+### Version 1.0.1
+
+* Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66)
+
 ### Version 1.0.0
 
 * Rebuild Scaden model and training to use TF2 Keras API instead of the old compatibility functions

diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py
index 52ce3f7..08961ed 100644
--- a/scaden/preprocessing/bulk_simulation.py
+++ b/scaden/preprocessing/bulk_simulation.py
@@ -26,12 +26,7 @@ def create_fractions(no_celltypes):
     return fracs
 
 
-def create_subsample(x,
-                     y,
-                     sample_size,
-                     celltypes,
-                     available_celltypes,
-                     sparse=False):
+def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False):
     """
     Generate artifical bulk subsample with random fractions of celltypes
     If sparse is set to true, add random celltypes to the missing celltypes
@@ -46,9 +41,9 @@
     if sparse:
         no_keep = np.random.randint(1, len(available_celltypes))
-        keep = np.random.choice(list(range(len(available_celltypes))),
-                                size=no_keep,
-                                replace=False)
+        keep = np.random.choice(
+            list(range(len(available_celltypes))), size=no_keep, replace=False
+        )
         available_celltypes = [available_celltypes[i] for i in keep]
 
     no_avail_cts = len(available_celltypes)
 
@@ -68,8 +63,7 @@
     for i in range(no_avail_cts):
         ct = available_celltypes[i]
         cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
-        cells_fraction = np.random.randint(0, cells_sub.shape[0],
-                                           samp_fracs[i])
+        cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i])
         cells_sub = cells_sub.iloc[cells_fraction, :]
         artificial_samples.append(cells_sub)
 
@@ -99,8 +93,9 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
     pbar = tqdm(range(no_samples))
     pbar.set_description(desc="Normal samples")
     for _ in pbar:
-        sample, label = create_subsample(x, y, sample_size, celltypes,
-                                         available_celltypes)
+        sample, label = create_subsample(
+            x, y, sample_size, celltypes, available_celltypes
+        )
         X.append(sample)
         Y.append(label)
 
@@ -109,12 +104,9 @@
     pbar = tqdm(range(n_sparse))
     pbar.set_description(desc="Sparse samples")
     for _ in pbar:
-        sample, label = create_subsample(x,
-                                         y,
-                                         sample_size,
-                                         celltypes,
-                                         available_celltypes,
-                                         sparse=True)
+        sample, label = create_subsample(
+            x, y, sample_size, celltypes, available_celltypes, sparse=True
+        )
         X.append(sample)
         Y.append(label)
     X = pd.concat(X, axis=1).T
@@ -176,7 +168,7 @@ def load_celltypes(path, name):
     try:
         y = pd.read_table(path)
         # Check if has Celltype column
-        if not 'Celltype' in y.columns:
+        if not "Celltype" in y.columns:
             logger.error(
                 f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
             )
@@ -206,7 +198,7 @@ def load_dataset(name, dir, pattern):
     y = pd.read_table(os.path.join(dir, name + "_celltypes.txt"))
     # Check if has Celltype column
     print(y.columns)
-    if not 'Celltype' in y.columns:
+    if not "Celltype" in y.columns:
         logger.error(
             f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
         )
@@ -244,9 +236,7 @@ def merge_unkown_celltypes(y, unknown_celltypes):
     :return:
     """
     celltypes = list(y["Celltype"])
-    new_celltypes = [
-        "Unknown" if x in unknown_celltypes else x for x in celltypes
-    ]
+    new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes]
     y["Celltype"] = new_celltypes
     return y
 
@@ -316,8 +306,9 @@ def generate_signature(x, y):
     return signature_matrix
 
 
-def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
-                  unknown_celltypes):
+def simulate_bulk(
+    sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes
+):
     """
     Simulate artificial bulk samples from single cell datasets
     :param sample_size: number of cells per sample
@@ -329,19 +320,18 @@ def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
     """
 
     num_samples = int(
-        num_samples /
-        2)  # divide by two so half is sparse and half is normal samples
+        num_samples / 2
+    )  # divide by two so half is sparse and half is normal samples
 
     # List available datasets
     if not data_path.endswith("/"):
         data_path += "/"
     files = glob.glob(os.path.join(data_path, pattern))
     files = [os.path.basename(x) for x in files]
-    datasets = [x.split("_")[0] for x in files]
+    datasets = [x.replace(pattern.replace("*", ""), "") for x in files]
 
     if len(datasets) == 0:
-        logging.error(
-            "No datasets found! Have you specified the pattern correctly?")
+        logging.error("No datasets found! Have you specified the pattern correctly?")
         sys.exit(1)
 
     print("Datasets: " + str(datasets))
@@ -371,14 +361,11 @@ def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
     # Create datasets
     for i in range(len(xs)):
         print("Subsampling " + datasets[i] + "...")
-        tmpx, tmpy = create_subsample_dataset(xs[i], ys[i], sample_size,
-                                              celltypes, num_samples)
-        tmpx.to_csv(out_dir + datasets[i] + "_samples.txt",
-                    sep="\t",
-                    index=False)
-        tmpy.to_csv(out_dir + datasets[i] + "_labels.txt",
-                    sep="\t",
-                    index=False)
+        tmpx, tmpy = create_subsample_dataset(
+            xs[i], ys[i], sample_size, celltypes, num_samples
+        )
+        tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False)
+        tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False)
         gc.collect()
 
     print("Finished!")

diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py
index 86ae01e..bf447d8 100644
--- a/scaden/preprocessing/create_h5ad_file.py
+++ b/scaden/preprocessing/create_h5ad_file.py
@@ -71,7 +71,7 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"):
     # List available datasets
     files = glob.glob(data_dir + pattern)
     files = [os.path.basename(x) for x in files]
-    datasets = [x.split("_")[0] for x in files]
+    datasets = [x.replace(pattern.replace("*", ""), "") for x in files]
 
     # get celltypes
     celltypes = load_celltypes(data_dir)
@@ -95,15 +95,15 @@
         x = x.sort_index(axis=1)
 
         ratios = pd.DataFrame(y, columns=celltypes)
-        ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]),
-                                 index=ratios.index)
+        ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index)
 
         print("Processing " + str(train_file))
         x = pd.DataFrame(x)
         adata.append(
-            anndata.AnnData(X=x.to_numpy(),
-                            obs=ratios,
-                            var=pd.DataFrame(columns=[], index=list(x))))
+            anndata.AnnData(
+                X=x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(x))
+            )
+        )
 
     for i in range(1, len(adata)):
         print("Concatenating " + str(i))

diff --git a/setup.py b/setup.py
index 9ab0c45..0b8f036 100644
--- a/setup.py
+++ b/setup.py
@@ -2,32 +2,43 @@
 
 from setuptools import setup, find_packages
 
-version = '1.0.0'
+version = "1.0.1"
 
 with open("README.md", "r", encoding="UTF-8") as fh:
     long_description = fh.read()
-with open('LICENSE', encoding="UTF-8") as f:
+with open("LICENSE", encoding="UTF-8") as f:
     license = f.read()
 
-setup(name='scaden',
-      version=version,
-      description="Cell type deconvolution using single cell data",
-      long_description=long_description,
-      long_description_content_type="text/markdown",
-      keywords=[
-          'bioinformatics', 'deep learning', 'machine learning',
-          'single cell sequencing', 'deconvolution'
-      ],
-      author='Kevin Menden',
-      author_email='kevin.menden@t-online.de',
-      url='https://github.com/KevinMenden/scaden',
-      license="MIT License",
-      entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
-      packages=find_packages(),
-      include_package_data=True,
-      python_requires='>3.6.0',
-      install_requires=[
-          'pandas', 'numpy', 'scikit-learn', 'tensorflow>=2.0', 'anndata',
-          'tqdm', 'click', 'h5py~=2.10.0'
-      ])
+setup(
+    name="scaden",
+    version=version,
+    description="Cell type deconvolution using single cell data",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    keywords=[
+        "bioinformatics",
+        "deep learning",
+        "machine learning",
+        "single cell sequencing",
+        "deconvolution",
+    ],
+    author="Kevin Menden",
+    author_email="kevin.menden@t-online.de",
+    url="https://github.com/KevinMenden/scaden",
+    license="MIT License",
+    entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
+    packages=find_packages(),
+    include_package_data=True,
+    python_requires=">3.6.0",
+    install_requires=[
+        "pandas",
+        "numpy",
+        "scikit-learn",
+        "tensorflow>=2.0",
+        "anndata",
+        "tqdm",
+        "click",
+        "h5py~=2.10.0",
+    ],
+)
 
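
A note on the core change, for illustration only (this note and the sketch below are not part of the patch): the old expression x.split("_")[0] keeps only the text before the first underscore of a matched file name, so any dataset whose own name contains an underscore is truncated, and two such datasets can collapse onto the same truncated name and overwrite each other's "_samples.txt"/"_labels.txt" outputs. The new expression instead strips the literal, non-wildcard part of the glob pattern. A minimal runnable sketch, using the "*_samples.txt" pattern from create_h5ad_file.py and hypothetical file names:

    # Illustration only -- not part of the patch.
    pattern = "*_samples.txt"
    files = ["data6k_samples.txt", "human_brain_samples.txt"]  # hypothetical names

    # Old behaviour: cut at the first underscore.
    old = [x.split("_")[0] for x in files]
    print(old)  # ['data6k', 'human'] -- second dataset name is truncated

    # New behaviour: strip the literal part of the glob pattern.
    suffix = pattern.replace("*", "")  # "_samples.txt"
    new = [x.replace(suffix, "") for x in files]
    print(new)  # ['data6k', 'human_brain'] -- underscores in names survive

This assumes a simple "*suffix" glob, which is how the pattern is used in both bulk_simulation.py and create_h5ad_file.py; a wildcard in the middle of the pattern would leave the joined literal pieces behind.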
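Separately, the reformatted hunks above touch create_subsample but never show the body of create_fractions, so here is an assumed sketch of that fraction-drawing step, for readers who want to see how random cell-type fractions could be produced and scaled to per-sample cell counts (samp_fracs); the real implementation may differ:

    # Assumed sketch -- create_fractions' body is not shown in this diff.
    import numpy as np

    def create_fractions_sketch(no_celltypes):
        # One random positive weight per cell type, normalized to sum to 1
        # so the values can serve as mixture proportions.
        fracs = np.random.rand(no_celltypes)
        return fracs / np.sum(fracs)

    fracs = create_fractions_sketch(4)
    # Scale proportions to a concrete number of cells per bulk sample.
    samp_fracs = [int(f * 100) for f in fracs]
    print(fracs, samp_fracs)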