From 2c4af8bb83e64f2d1ca83b06760a38a446e3e252 Mon Sep 17 00:00:00 2001
From: eboileau
Date: Fri, 19 Mar 2021 09:25:37 +0100
Subject: [PATCH 01/19] ADD h5ad input format option

---
 scaden/__main__.py                      |  8 ++-
 scaden/preprocessing/bulk_simulation.py | 88 +++++++++++++++++--------
 scaden/simulate.py                      |  4 +-
 3 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/scaden/__main__.py b/scaden/__main__.py
index 4200625..a1c9d4d 100644
--- a/scaden/__main__.py
+++ b/scaden/__main__.py
@@ -193,7 +193,12 @@ def process(data_path, prediction_data, processed_path, var_cutoff):
     default="data",
     help="Prefix to append to training .h5ad file [default: data]",
 )
-def simulate(out, data, cells, n_samples, pattern, unknown, prefix):
+@click.option(
+    "--fmt",
+    default="txt",
+    help="Input format sc data txt or h5ad [default: txt]",
+)
+def simulate(out, data, cells, n_samples, pattern, unknown, prefix, fmt):
     """ Create artificial bulk RNA-seq data from scRNA-seq dataset(s)"""
     simulation(
         simulate_dir=out,
@@ -203,6 +208,7 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix):
         pattern=pattern,
         unknown_celltypes=unknown,
         out_prefix=prefix,
+        fmt=fmt
     )

diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py
index c5d80a6..2e91edd 100644
--- a/scaden/preprocessing/bulk_simulation.py
+++ b/scaden/preprocessing/bulk_simulation.py
@@ -182,46 +182,76 @@ def load_celltypes(path, name):
     return y
 
 
-def load_dataset(name, dir, pattern):
+def load_dataset(name, dir, pattern, fmt):
     """
     Load a dataset given its name and the directory
     :param name: name of the dataset
     :param dir: directory containing the data
-    :param sig_genes: the signature genes for filtering
+    :param pattern: pattern of the data files
+    :param fmt: input file format txt or h5ad
     :return: X, Y
     """
     pattern = pattern.replace("*", "")
     print("Loading " + name + " dataset ...")
-
-    # Try to load celltypes
-    try:
-        y = pd.read_table(os.path.join(dir, name + "_celltypes.txt"))
-        # Check if has Celltype column
-        print(y.columns)
-        if not "Celltype" in y.columns:
+
+    if fmt == 'txt':
+        # Try to load celltypes
+        try:
+            y = pd.read_table(os.path.join(dir, name + "_celltypes.txt"))
+            # Check if has Celltype column
+            print(y.columns)
+            if not "Celltype" in y.columns:
+                logger.error(
+                    f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
+                )
+                sys.exit()
+        except FileNotFoundError as e:
             logger.error(
-                f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
+                f"No celltypes file found for {name}. It should be called {name}_celltypes.txt."
             )
-            sys.exit()
-    except FileNotFoundError as e:
-        logger.error(
-            f"No celltypes file found for {name}. It should be called {name}_celltypes.txt."
-        )
-        sys.exit(e)
+            sys.exit(e)
 
-    # Try to load data file
-    try:
-        x = pd.read_table(os.path.join(dir, name + pattern), index_col=0)
-    except FileNotFoundError as e:
-        logger.error(
-            f"No counts file found for {name}. Was looking for file {name + pattern}"
-        )
+        # Try to load data file
+        try:
+            x = pd.read_table(os.path.join(dir, name + pattern), index_col=0)
+        except FileNotFoundError as e:
+            logger.error(
+                f"No counts file found for {name}. Was looking for file {name + pattern}"
+            )
 
-    # Check that celltypes and count file have same number of cells
-    if not y.shape[0] == x.shape[0]:
+        # Check that celltypes and count file have same number of cells
+        if not y.shape[0] == x.shape[0]:
+            logger.error(
+                f"Different number of cells in {name}_celltypes and {name + pattern}! Make sure the data has been processed correctly."
+            )
+            sys.exit(1)
+    elif fmt == 'h5ad':
+        from anndata import read_h5ad
+        try:
+            xy = read_h5ad(os.path.join(dir, name + pattern))
+        except FileNotFoundError as e:
+            logger.error(
+                f"No h5ad file found for {name}. Was looking for file {name + pattern}"
+            )
+            sys.exit(e)
+        # cell types
+        try:
+            y = pd.DataFrame(xy.obs.Celltype)
+            y.reset_index(inplace=True, drop=True)
+        except:
+            logger.error(
+                f"Celltype attribute not found for {name}"
+            )
+            sys.exit(e)
+        # counts
+        x = pd.DataFrame(xy.X.todense())
+        x.index = xy.obs_names
+        x.columns = xy.var_names
+        del xy
+    else:
         logger.error(
-            f"Different number of cells in {name}_celltypes and {name + pattern}! Make sure the data has been processed correctly."
-        )
+            f"Unsuported file format {fmt}!"
+            )
         sys.exit(1)
 
     return (x, y)

@@ -307,7 +337,7 @@ def generate_signature(x, y):
 
 
 def simulate_bulk(
-    sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes
+    sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes, fmt
 ):
     """
     Simulate artificial bulk samples from single cell datasets
@@ -339,7 +369,7 @@ def simulate_bulk(
     # Load datasets
     xs, ys = [], []
     for i, n in enumerate(datasets):
-        x, y = load_dataset(n, data_path, pattern=pattern)
+        x, y = load_dataset(n, data_path, pattern=pattern, fmt=fmt)
         xs.append(x)
         ys.append(y)

diff --git a/scaden/simulate.py b/scaden/simulate.py
index 617775f..4c3f67d 100644
--- a/scaden/simulate.py
+++ b/scaden/simulate.py
@@ -8,12 +8,12 @@
 
 
 def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern,
-               unknown_celltypes, out_prefix):
+               unknown_celltypes, out_prefix, fmt):
 
     # Perform the bulk simulation
     unknown_celltypes = list(unknown_celltypes)
     simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern,
-                  unknown_celltypes)
+                  unknown_celltypes, fmt)
 
     # Create the h5ad training data file
     out_name = os.path.join(simulate_dir, out_prefix + ".h5ad")
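The h5ad branch added above reduces to this standalone sketch (file name illustrative; assumes a sparse count matrix and an AnnData object whose .obs carries a Celltype column):

    import pandas as pd
    from anndata import read_h5ad

    xy = read_h5ad("dataset_counts.h5ad")  # illustrative path
    y = pd.DataFrame(xy.obs.Celltype).reset_index(drop=True)  # cell type labels
    # .todense() assumes a sparse count matrix, as in the patch above
    x = pd.DataFrame(xy.X.todense(), index=xy.obs_names, columns=xy.var_names)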
From a45b009b254160f3d3f78af60a77d74e7ed957fb Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 07:35:01 +0100
Subject: [PATCH 02/19] added comment to "scaden example"

---
 scaden/__main__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scaden/__main__.py b/scaden/__main__.py
index 4200625..452f1ee 100644
--- a/scaden/__main__.py
+++ b/scaden/__main__.py
@@ -220,4 +220,5 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix):
     "--samples", "-n", default=10, help="Number of bulk samples [default: 10]"
 )
 def example(cells, genes, samples, out):
+    """ Generate an example dataset """
     exampleData(n_cells=cells, n_genes=genes, n_samples=samples, out_dir=out)

From ebe0029b664c8531f0c50ddebb40a7b92ca4f504 Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 08:58:41 +0100
Subject: [PATCH 03/19] started refactoring of bulk simulation

---
 scaden/__main__.py                       |  1 -
 scaden/preprocessing/bulk_simulation.py  | 25 ++++++++++++------------
 scaden/preprocessing/bulk_simulator.py   |  9 +++++++++
 scaden/preprocessing/create_h5ad_file.py |  1 -
 4 files changed, 22 insertions(+), 14 deletions(-)
 create mode 100644 scaden/preprocessing/bulk_simulator.py

diff --git a/scaden/__main__.py b/scaden/__main__.py
index 452f1ee..3bc4f77 100644
--- a/scaden/__main__.py
+++ b/scaden/__main__.py
@@ -212,7 +212,6 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix):
 
 
 @cli.command()
-@click.option("--out", "-o", default="./", help="Directory to store output files in")
 @click.option("--cells", "-c", default=10, help="Number of cells [default: 10]")
 @click.option("--genes", "-g", default=100, help="Number of genes [default: 100]")
 @click.option("--out", "-o", default="./", help="Output directory [default: ./]")

diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py
index c5d80a6..d31e6ae 100644
--- a/scaden/preprocessing/bulk_simulation.py
+++ b/scaden/preprocessing/bulk_simulation.py
@@ -70,18 +70,18 @@ def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=F
     df_samp = pd.concat(artificial_samples, axis=0)
     df_samp = df_samp.sum(axis=0)
 
-    return (df_samp, fracs_complete)
+    return df_samp, fracs_complete
 
 
 def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
     """
     Generate many artifial bulk samples with known fractions
     This function will create normal and sparse samples (no_samples)
-    :param cells:
-    :param labels:
-    :param sample_size:
-    :param no_celltypes:
-    :param no_samples:
+    :param x: count matrix
+    :param y: cell type labels
+    :param sample_size: number of cells per sample
+    :param celltypes: list of cell types
+    :param no_samples: number of samples to simulate
     :return: dataset and corresponding labels
     """
     X = []
@@ -112,7 +112,7 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
     X = pd.concat(X, axis=1).T
     Y = pd.DataFrame(Y, columns=celltypes)
 
-    return (X, Y)
+    return X, Y
 
 
 def filter_for_celltypes(x, y, celltypes):
@@ -127,12 +127,12 @@ def filter_for_celltypes(x, y, celltypes):
     keep = [elem in celltypes for elem in cts]
     x = x.loc[keep, :]
     y = y.loc[keep, :]
-    return (x, y)
+    return x, y
 
 
 def shuffle_dataset(x, y):
     """
-    Shuffle dataset while keeping x and y in synch
+    Shuffle dataset while keeping x and y in sync
     :param x:
     :param y:
     :return:
@@ -140,7 +140,7 @@ def shuffle_dataset(x, y):
     idx = np.random.permutation(x.index)
     x_shuff = x.reindex(idx)
     y_shuff = y.reindex(idx)
-    return (x_shuff, y_shuff)
+    return x_shuff, y_shuff
 
 
 def filter_matrix_signature(mat, genes):
@@ -187,7 +187,7 @@ def load_dataset(name, dir, pattern):
     Load a dataset given its name and the directory
     :param name: name of the dataset
     :param dir: directory containing the data
-    :param sig_genes: the signature genes for filtering
+    :param pattern: the pattern for matchin the data
     :return: X, Y
     """
     pattern = pattern.replace("*", "")
@@ -224,7 +224,7 @@ def load_dataset(name, dir, pattern):
     )
     sys.exit(1)
 
-    return (x, y)
+    return x, y
 
 
 def merge_unkown_celltypes(y, unknown_celltypes):
@@ -260,6 +260,7 @@ def get_common_genes(xs, type="intersection"):
     Get common genes for all matrices xs
     Can either be the union or the intersection (default) of all genes
     :param xs: cell x gene matrices
+    :param type: how to calculate the signature gene list
     :return: list of common genes
     """
     genes = []

diff --git a/scaden/preprocessing/bulk_simulator.py b/scaden/preprocessing/bulk_simulator.py
new file mode 100644
index 0000000..a683fc0
--- /dev/null
+++ b/scaden/preprocessing/bulk_simulator.py
@@ -0,0 +1,9 @@
+class BulkSimulator(object):
+    def __init__(self, sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes):
+        self.sample_size = sample_size
+        self.num_samples = num_samples
+        self.data_path = data_path
+        self.out_dir = out_dir
+        self.pattern = pattern
+        self.unknown_celltypes = unknown_celltypes
+

diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py
index 906f324..0938e80 100644
--- a/scaden/preprocessing/create_h5ad_file.py
+++ b/scaden/preprocessing/create_h5ad_file.py
@@ -78,7 +78,6 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"):
     logger.info(f"Celltypes: {celltypes}")
     logger.info(f"Found datasets: {datasets}")
     adata = []
-    me_dict = {}
 
     # Create adata datasets for each
     for i, train_file in enumerate(datasets):

From a709216213b9a22675c8da4e52af5632327643d7 Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 11:33:06 +0100
Subject: [PATCH 04/19] created BulkSimulator class

---
 scaden/__main__.py                      |  19 +-
 scaden/preprocessing/__init__.py        |   1 +
 scaden/preprocessing/bulk_simulation.py |   8 +-
 scaden/preprocessing/bulk_simulator.py  | 267 +++++++++++++++++++++++-
 scaden/simulate.py                      |  17 +-
 5 files changed, 294 insertions(+), 18 deletions(-)

diff --git a/scaden/__main__.py b/scaden/__main__.py
index a8e3dd4..a0ee03d 100644
--- a/scaden/__main__.py
+++ b/scaden/__main__.py
@@ -158,11 +158,6 @@ def process(data_path, prediction_data, processed_path, var_cutoff):
     )
 
 
-"""
-Simulate dataset
-"""
-
-
 @cli.command()
 @click.option("--out", "-o", default="./", help="Directory to store output files in")
 @click.option("--data", "-d", default=".", help="Path to scRNA-seq dataset(s)")
@@ -194,11 +189,12 @@ def process(data_path, prediction_data, processed_path, var_cutoff):
     help="Prefix to append to training .h5ad file [default: data]",
 )
 @click.option(
-    "--fmt",
+    "--data-format",
+    "-f",
     default="txt",
-    help="Input format sc data txt or h5ad [default: txt]",
+    help="Data format of scRNA-seq data, can be 'txt' or 'h5ad' [default: 'txt']",
 )
-def simulate(out, data, cells, n_samples, pattern, unknown, prefix, fmt):
+def simulate(out, data, cells, n_samples, pattern, unknown, prefix, data_format):
     """ Create artificial bulk RNA-seq data from scRNA-seq dataset(s)"""
     simulation(
         simulate_dir=out,
@@ -208,10 +204,15 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix, data_format):
         pattern=pattern,
         unknown_celltypes=unknown,
         out_prefix=prefix,
-        fmt=fmt
+        fmt=data_format
     )
 
 
+"""
+Simulate dataset
+"""
+
+
 """
 Generate example data
 """

diff --git a/scaden/preprocessing/__init__.py b/scaden/preprocessing/__init__.py
index e69de29..9b5b492 100644
--- a/scaden/preprocessing/__init__.py
+++ b/scaden/preprocessing/__init__.py
@@ -0,0 +1 @@
+from .bulk_simulator import BulkSimulator

diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py
index 2e91edd..5944b66 100644
--- a/scaden/preprocessing/bulk_simulation.py
+++ b/scaden/preprocessing/bulk_simulation.py
@@ -193,7 +193,7 @@ def load_dataset(name, dir, pattern, fmt):
     """
     pattern = pattern.replace("*", "")
     print("Loading " + name + " dataset ...")
-    
+
     if fmt == 'txt':
         # Try to load celltypes
         try:
@@ -250,8 +250,8 @@ def load_dataset(name, dir, pattern, fmt):
         del xy
     else:
         logger.error(
-            f"Unsuported file format {fmt}!"
-            )
+            f"Unsuported file format {fmt}!"
+        )
         sys.exit(1)
 
     return (x, y)
@@ -337,7 +337,7 @@ def generate_signature(x, y):
 
 
 def simulate_bulk(
-    sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes, fmt
+        sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes, fmt
 ):
     """
     Simulate artificial bulk samples from single cell datasets

diff --git a/scaden/preprocessing/bulk_simulator.py b/scaden/preprocessing/bulk_simulator.py
index a683fc0..2b87ec3 100644
--- a/scaden/preprocessing/bulk_simulator.py
+++ b/scaden/preprocessing/bulk_simulator.py
@@ -1,9 +1,272 @@
+import logging
+import glob
+import os
+import sys
+
+import pandas as pd
+import anndata as ad
+import numpy as np
+
+from rich.progress import BarColumn, Progress
+
+logger = logging.getLogger(__name__)
+
+
+def create_fractions(no_celltypes):
+    """
+    Create random fractions
+    :param no_celltypes: number of fractions to create
+    :return: list of random fractions of length no_celltypes
+    """
+    fracs = np.random.rand(no_celltypes)
+    fracs_sum = np.sum(fracs)
+    fracs = np.divide(fracs, fracs_sum)
+    return fracs
+
+
 class BulkSimulator(object):
+    """
+    BulkSimulator class for the simulation of artificial bulk samples
+    from scRNA-seq datasets
+
+    :param sample_size: number of cells per sample
+    :param num_samples: number of sample to simulate
+    :param data_path: path to the data directory
+    :param out_dir: output directory
+    :param pattern of the data files
+    :param unknown_celltypes: which celltypes to merge into the unknown class
+    :param fmt: the format of the input files, can be txt or h5ad
+    """
+
-    def __init__(self, sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes):
+    def __init__(self, sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes, fmt):
         self.sample_size = sample_size
-        self.num_samples = num_samples
+        self.num_samples = num_samples // 2
         self.data_path = data_path
         self.out_dir = out_dir
         self.pattern = pattern
         self.unknown_celltypes = unknown_celltypes
+        self.format = fmt
+
+    def simulate(self):
+        """ simulate artificial bulk datasets
+        """
+
+        # List available datasets
+        if not self.data_path.endswith("/"):
+            self.data_path += "/"
+        files = glob.glob(os.path.join(self.data_path, self.pattern))
+        files = [os.path.basename(x) for x in files]
+        datasets = [x.replace(self.pattern.replace("*", ""), "") for x in files]
+
+        if len(datasets) == 0:
+            logging.error("No datasets found! Have you specified the pattern correctly?")
+            sys.exit(1)
+
+        logger.info("Datasets: [cyan]" + str(datasets) + "[/]")
+
+        # Loop over datasets and simulate bulk data
+        for i, dataset in enumerate(datasets):
+            logger.info(f"[bold u]Simulating data from {dataset}")
+            self.simulate_dataset(dataset)
+
+        logger.info("[bold green]Finished data simulation!")
+
+    def simulate_dataset(self, dataset):
+        """
+        Simulate bulk data from a single scRNA-seq dataset
+        @param dataset:
+        @type dataset:
+        @return:
+        @rtype:
+        """
+
+        # load the dataset
+        data_x, data_y = self.load_dataset(dataset)
+
+        # Merge unknown celltypes
+        logger.info(f"Merging unknown cell types: {self.unknown_celltypes}")
+        data_y = self.merge_unknown_celltypes(data_y)
+
+        logger.info(f"Subsampling [bold cyan]{dataset}[/] ...")
+
+        # Extract celltypes
+        celltypes = list(set(data_y["Celltype"].tolist()))
+
+        tmp_x, tmp_y = self.create_subsample_dataset(data_x, data_y, celltypes=celltypes)
+        tmp_x.to_csv(self.out_dir + dataset + "_samples.txt", sep="\t", index=False)
+        tmp_y.to_csv(self.out_dir + dataset + "_labels.txt", sep="\t", index=False)
+
+    def load_dataset(self, dataset):
+        """
+        Load a dataset
+        @param dataset:
+        @type dataset:
+        @return:
+        @rtype:
+        """
+        pattern = self.pattern.replace("*", "")
+        logger.info(f"Loading [cyan]{dataset}[/] dataset ...")
+        dataset_counts = dataset + pattern
+        dataset_celltypes = dataset + "_celltypes.txt"
+
+        # Load data in .txt format
+        if self.format == 'txt':
+            # Try to load celltypes
+            try:
+                y = pd.read_table(os.path.join(self.data_path, dataset_celltypes))
+                # Check if has Celltype column
+                if "Celltype" not in y.columns:
+                    logger.error(
+                        f"No 'Celltype' column found in {dataset}_celltypes.txt! Please make sure to include this "
+                        f"column. "
+                    )
+                    sys.exit()
+            except FileNotFoundError as e:
+                logger.error(
+                    f"No celltypes file found for [cyan]{dataset}[/]. It should be called [cyan]{dataset}_celltypes.txt."
+                )
+                sys.exit(e)
+
+            # Try to load data file
+            try:
+                x = pd.read_table(os.path.join(self.data_path, dataset_counts), index_col=0)
+            except FileNotFoundError as e:
+                logger.error(
+                    f"No counts file found for [cyan]{dataset}[/]. Was looking for file [cyan]{dataset_counts}[/]"
+                )
+                sys.exit(e)
+
+            # Check that celltypes and count file have the same number of cells
+            if not y.shape[0] == x.shape[0]:
+                logger.error(
+                    f"Different number of cells in {dataset}_celltypes and {dataset_counts}! Make sure the data has "
+                    f"been processed correctly. "
+                )
+                sys.exit(1)
+
+        # Load data in .h5ad format
+        elif self.format == 'h5ad':
+            try:
+                data_h5ad = ad.read_h5ad(os.path.join(self.data_path, dataset_counts))
+            except FileNotFoundError as e:
+                logger.error(
+                    f"No h5ad file found for [cyan]{dataset}[/]. Was looking for file [cyan]{dataset_counts}"
+                )
+                sys.exit(e)
+            # cell types
+            try:
+                y = pd.DataFrame(data_h5ad.obs.Celltype)
+                y.reset_index(inplace=True, drop=True)
+            except Exception as e:
+                logger.error(
+                    f"Celltype attribute not found for [cyan]{dataset}"
+                )
+                sys.exit(e)
+            # counts
+            x = pd.DataFrame(data_h5ad.X.todense())
+            x.index = data_h5ad.obs_names
+            x.columns = data_h5ad.var_names
+            del data_h5ad
+        else:
+            logger.error(
+                f"Unsupported file format {self.format}!"
+            )
+            sys.exit(1)
+
+        return x, y
+
+    def merge_unknown_celltypes(self, y):
+        """
+        Merge all unknown celltypes together
+        :param y: list of cell type labels
+        :return: list of cell types with unknown cell types merged into "Unknown" type
+        """
+        celltypes = list(y["Celltype"])
+        new_celltypes = ["Unknown" if x in self.unknown_celltypes else x for x in celltypes]
+        y["Celltype"] = new_celltypes
+        return y
+
+    def create_subsample_dataset(self, x, y, celltypes):
+        """
+        Generate many artifial bulk samples with known fractions
+        This function will create normal and sparse samples (no_samples)
+        @param x:
+        @param y:
+        @param celltypes:
+        @return:
+        """
+        sim_x = []
+        sim_y = []
+
+        # Create normal samples
+        progress_bar = Progress(
+            "[bold blue]{task.description}",
+            "[bold cyan]{task.fields[samples]}",
+            BarColumn(bar_width=None)
+        )
+        with progress_bar:
+            normal_samples_progress = progress_bar.add_task("Normal samples", total=self.num_samples, samples=0)
+            sparse_samples_progress = progress_bar.add_task("Sparse samples", total=self.num_samples, samples=0)
+
+            # Create normal samples
+            for i in range(self.num_samples):
+                progress_bar.update(normal_samples_progress, advance=1, samples=i+1)
+                sample, label = self.create_subsample(x, y, celltypes)
+                sim_x.append(sample)
+                sim_y.append(label)
+
+            # Create sparase samples
+            for i in range(self.num_samples):
+                progress_bar.update(sparse_samples_progress, advance=1, samples=i+1)
+                sample, label = self.create_subsample(x, y, celltypes, sparse=True)
+                sim_x.append(sample)
+                sim_y.append(label)
+
+        sim_x = pd.concat(sim_x, axis=1).T
+        sim_y = pd.DataFrame(sim_y, columns=celltypes)
+
+        return sim_x, sim_y
+
+    def create_subsample(self, x, y, celltypes, sparse=False):
+        """
+        Generate artifical bulk subsample with random fractions of celltypes
+        If sparse is set to true, add random celltypes to the missing celltypes
+
+        @param x:
+        @param y:
+        @param celltypes:
+        @param sparse:
+        @return:
+        """
+
+        if sparse:
+            no_keep = np.random.randint(1, len(celltypes))
+            keep = np.random.choice(
+                list(range(len(celltypes))), size=no_keep, replace=False
+            )
+            available_celltypes = [celltypes[i] for i in keep]
+
+        no_avail_cts = len(celltypes)
+
+        # Create fractions for available celltypes
+        fracs = create_fractions(no_celltypes=no_avail_cts)
+        samp_fracs = np.multiply(fracs, self.sample_size)
+        samp_fracs = list(map(int, samp_fracs))
+
+        # Make complete fractions
+        fracs_complete = [0] * len(celltypes)
+        for i, act in enumerate(celltypes):
+            idx = celltypes.index(act)
+            fracs_complete[idx] = fracs[i]
+
+        artificial_samples = []
+        for i in range(no_avail_cts):
+            ct = celltypes[i]
+            cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
+            cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i])
+            cells_sub = cells_sub.iloc[cells_fraction, :]
+            artificial_samples.append(cells_sub)
+
+        df_sample = pd.concat(artificial_samples, axis=0)
+        df_sample = df_sample.sum(axis=0)
+
+        return df_sample, fracs_complete

diff --git a/scaden/simulate.py b/scaden/simulate.py
index 4c3f67d..bed5d13 100644
--- a/scaden/simulate.py
+++ b/scaden/simulate.py
@@ -1,5 +1,6 @@
 from scaden.preprocessing.create_h5ad_file import create_h5ad_file
 from scaden.preprocessing.bulk_simulation import simulate_bulk
+from scaden.preprocessing import BulkSimulator
 import os
 
 """
 Simulation of artificial bulk RNA-seq samples from scRNA-seq data
@@ -10,12 +11,20 @@
 def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern,
                unknown_celltypes, out_prefix, fmt):
 
-    # Perform the bulk simulation
     unknown_celltypes = list(unknown_celltypes)
-    simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern,
-                  unknown_celltypes, fmt)
+    bulk_simulator = BulkSimulator(sample_size=sample_size,
+                                   num_samples=num_samples,
+                                   data_path=data_dir,
+                                   out_dir=simulate_dir,
+                                   pattern=pattern,
+                                   unknown_celltypes=unknown_celltypes,
+                                   fmt=fmt)
+
+    bulk_simulator.simulate()
+
+    # Perform the bulk simulation
+    #simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern,
+    #              unknown_celltypes, fmt)
 
     # Create the h5ad training data file
     out_name = os.path.join(simulate_dir, out_prefix + ".h5ad")
     create_h5ad_file(simulate_dir, out_name, unknown_celltypes)
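How the new class is meant to be driven can be seen in simulate.py above; as a minimal sketch (directory, pattern, and sample counts are illustrative values, not defaults):

    from scaden.preprocessing import BulkSimulator

    bulk_simulator = BulkSimulator(
        sample_size=100,               # cells per artificial bulk sample
        num_samples=1000,              # internally halved: half normal, half sparse samples
        data_path="./scRNA_data/",     # illustrative directory
        out_dir="./simulated/",
        pattern="*_counts.txt",
        unknown_celltypes=["unknown"],
        fmt="txt",
    )
    bulk_simulator.simulate()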
From 8787ac9d155d5aa326b4f454a03e20fae099b839 Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 12:39:45 +0100
Subject: [PATCH 05/19] refactored data merging functionality

---
 scaden/preprocessing/bulk_simulator.py | 68 ++++++++++++++++++++++----
 scaden/simulate.py                     |  6 ++-
 2 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/scaden/preprocessing/bulk_simulator.py b/scaden/preprocessing/bulk_simulator.py
index 2b87ec3..213cbb6 100644
--- a/scaden/preprocessing/bulk_simulator.py
+++ b/scaden/preprocessing/bulk_simulator.py
@@ -2,6 +2,7 @@ import logging
 import glob
 import os
 import sys
+import gc
 
 import pandas as pd
 import anndata as ad
 import numpy as np
@@ -37,7 +38,16 @@ class BulkSimulator(object):
     :param fmt: the format of the input files, can be txt or h5ad
     """
 
-    def __init__(self, sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes, fmt):
+    def __init__(self, sample_size=100,
+                 num_samples=1000,
+                 data_path="./",
+                 out_dir="./",
+                 pattern="*_counts.txt",
+                 unknown_celltypes=None,
+                 fmt="txt"):
+        if unknown_celltypes is None:
+            unknown_celltypes = ["unknown"]
+
         self.sample_size = sample_size
         self.num_samples = num_samples // 2
         self.data_path = data_path
@@ -45,6 +55,8 @@ class BulkSimulator(object):
         self.pattern = pattern
         self.unknown_celltypes = unknown_celltypes
         self.format = fmt
+        self.datasets = []
+        self.dataset_files = []
 
     def simulate(self):
         """ simulate artificial bulk datasets
@@ -55,16 +67,18 @@ class BulkSimulator(object):
             self.data_path += "/"
         files = glob.glob(os.path.join(self.data_path, self.pattern))
         files = [os.path.basename(x) for x in files]
-        datasets = [x.replace(self.pattern.replace("*", ""), "") for x in files]
+        self.datasets = [x.replace(self.pattern.replace("*", ""), "") for x in files]
+        self.dataset_files = [x + ".h5ad" for x in self.datasets]
 
-        if len(datasets) == 0:
+        if len(self.datasets) == 0:
            logging.error("No datasets found! Have you specified the pattern correctly?")
            sys.exit(1)
 
-        logger.info("Datasets: [cyan]" + str(datasets) + "[/]")
+        logger.info("Datasets: [cyan]" + str(self.datasets) + "[/]")
 
         # Loop over datasets and simulate bulk data
-        for i, dataset in enumerate(datasets):
+        for i, dataset in enumerate(self.datasets):
+            gc.collect()
             logger.info(f"[bold u]Simulating data from {dataset}")
             self.simulate_dataset(dataset)
 
@@ -90,10 +104,17 @@ class BulkSimulator(object):
         # Extract celltypes
         celltypes = list(set(data_y["Celltype"].tolist()))
 
         tmp_x, tmp_y = self.create_subsample_dataset(data_x, data_y, celltypes=celltypes)
-        tmp_x.to_csv(self.out_dir + dataset + "_samples.txt", sep="\t", index=False)
-        tmp_y.to_csv(self.out_dir + dataset + "_labels.txt", sep="\t", index=False)
+
+        tmp_x = tmp_x.sort_index(axis=1)
+        ratios = pd.DataFrame(tmp_y, columns=celltypes)
+        ratios["ds"] = pd.Series(np.repeat(dataset, tmp_y.shape[0]), index=ratios.index)
+
+        ann_data = ad.AnnData(X=tmp_x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(tmp_x)))
+        ann_data.uns['unknown'] = self.unknown_celltypes
+        ann_data.uns['cell_types'] = celltypes
+
+        ann_data.write(self.out_dir + dataset + ".h5ad")
 
     def load_dataset(self, dataset):
@@ -237,7 +258,7 @@ class BulkSimulator(object):
         @param sparse:
         @return:
         """
-
+        # todo fix available cell types again
         if sparse:
             no_keep = np.random.randint(1, len(celltypes))
             keep = np.random.choice(
@@ -270,3 +291,32 @@ class BulkSimulator(object):
         df_sample = df_sample.sum(axis=0)
 
         return df_sample, fracs_complete
+
+    @staticmethod
+    def merge_datasets(data_dir="./", files=None, out_name="data.h5ad"):
+        """
+
+        @param out_name: name of the merged .h5ad file
+        @param data_dir: directory to look for datasets
+        @param files: list of files to merge
+        @return:
+        """
+        if not files:
+            files = glob.glob(data_dir + "*.h5ad")
+
+        logger.info(f"Merging datasets: {files} into [bold cyan]{out_name}")
+
+        # load first file
+        adata = ad.read_h5ad(files[0])
+
+        for i in range(1, len(files)):
+            adata = adata.concatenate(ad.read_h5ad(files[i]), uns_merge="same")
+
+        # TODO remove NA values
+        #adata.obs.fillna(0, axis=1, inplace=True)
+        print(adata.obs['celltype6'])
+        print(type(adata.obs))
+        print(adata.X.shape)
+        print(adata.var_names)
+
+        adata.write(out_name)

diff --git a/scaden/simulate.py b/scaden/simulate.py
index bed5d13..84053b2 100644
--- a/scaden/simulate.py
+++ b/scaden/simulate.py
@@ -22,10 +22,12 @@ def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern,
 
     bulk_simulator.simulate()
 
+    bulk_simulator.merge_datasets(data_dir=bulk_simulator.out_dir, files=bulk_simulator.dataset_files)
+
     # Perform the bulk simulation
     #simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern,
     #              unknown_celltypes, fmt)
 
     # Create the h5ad training data file
-    out_name = os.path.join(simulate_dir, out_prefix + ".h5ad")
-    create_h5ad_file(simulate_dir, out_name, unknown_celltypes)
+    #out_name = os.path.join(simulate_dir, out_prefix + ".h5ad")
+    #create_h5ad_file(simulate_dir, out_name, unknown_celltypes)

From d0f8615973188be69c7a2a6d1bf275e90c51ae1c Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 13:31:54 +0100
Subject: [PATCH 06/19] use rich instead of tqdm for progress bar

---
 scaden/__main__.py                               |   3 +-
 scaden/model/scaden.py                           |  48 ++-
 scaden/preprocessing/bulk_simulation.py          | 402 ------------------
 scaden/preprocessing/create_h5ad_file.py         | 121 ------
 scaden/simulate.py                               |  20 +-
 scaden/{preprocessing => simulation}/__init__.py |   0
 scaden/{preprocessing => simulation}/bulk_simulator.py | 34 +-
 setup.py                                         |   1 -
 8 files changed, 54 insertions(+), 575 deletions(-)
 delete mode 100644 scaden/preprocessing/bulk_simulation.py
 delete mode 100644 scaden/preprocessing/create_h5ad_file.py
 rename scaden/{preprocessing => simulation}/__init__.py (100%)
 rename scaden/{preprocessing => simulation}/bulk_simulator.py (92%)

diff --git a/scaden/__main__.py b/scaden/__main__.py
index a0ee03d..c188d56 100644
--- a/scaden/__main__.py
+++ b/scaden/__main__.py
@@ -180,7 +180,8 @@ def process(data_path, prediction_data, processed_path, var_cutoff):
     "-u",
     multiple=True,
     default=["unknown"],
-    help="Specifiy cell types to merge into the unknown category. Specify this flag for every cell type you want to merge in unknown. [default: unknown]",
+    help="Specifiy cell types to merge into the unknown category. Specify this flag for every cell type you want to "
+    "merge in unknown. [default: unknown]",
 )
 @click.option(
     "--prefix",

diff --git a/scaden/model/scaden.py b/scaden/model/scaden.py
index 03b1d4c..33da7a7 100644
--- a/scaden/model/scaden.py
+++ b/scaden/model/scaden.py
@@ -10,9 +10,8 @@
 import pandas as pd
 from anndata import read_h5ad
 import collections
-from .functions import dummy_labels, sample_scaling
-from tqdm import tqdm
-
+from .functions import sample_scaling
+from rich.progress import Progress, BarColumn
 
 logger = logging.getLogger(__name__)
 
@@ -297,25 +296,35 @@ class Scaden(object):
         )
 
         # Training loop
-        pbar = tqdm(range(self.num_steps))
-        for step, _ in enumerate(pbar):
+        progress_bar = Progress(
+            "[bold blue]{task.description}",
+            "[bold cyan]Step: {task.fields[step]}, Loss: {task.fields[loss]}",
+            BarColumn(bar_width=None)
+        )
+
+        with progress_bar:
+            training_progress = progress_bar.add_task(self.model_name,
+                                                      total=self.num_steps,
+                                                      step=0,
+                                                      loss=1)
+
+            for step in range(self.num_steps):
 
-            x, y = self.data_iter.get_next()
+                x, y = self.data_iter.get_next()
 
-            with tf.GradientTape() as tape:
-                self.logits = self.model(x, training=True)
-                loss = self.compute_loss(self.logits, y)
+                with tf.GradientTape() as tape:
+                    self.logits = self.model(x, training=True)
+                    loss = self.compute_loss(self.logits, y)
 
-            grads = tape.gradient(loss, self.model.trainable_weights)
+                grads = tape.gradient(loss, self.model.trainable_weights)
 
-            optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
+                optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
 
-            desc = f"Step: {step}, Loss: {loss:.4f}"
-            pbar.set_description(desc=desc)
+                progress_bar.update(training_progress, advance=1, step=step, loss=f"{loss:.4f}")
 
-            # Collect garbage after 100 steps - otherwise runs out of memory
-            if step % 100 == 0:
-                gc.collect()
+                # Collect garbage after 100 steps - otherwise runs out of memory
+                if step % 100 == 0:
+                    gc.collect()
 
         # Save the trained model
         self.model.save(self.model_dir)
@@ -326,11 +335,10 @@ class Scaden(object):
             os.path.join(self.model_dir, "genes.txt"), sep="\t"
         )
 
-    def predict(self, input_path, out_name="scaden_predictions.txt"):
+    def predict(self, input_path):
         """
         Perform prediction with a pre-trained model
-        :param out_dir: path to store results in
-        :param training_data: the dataset used for training
+        :param input_path: prediction data path
         :return:
         """
         # Load signature genes and celltype labels
@@ -347,4 +355,4 @@ class Scaden(object):
         pred_df = pd.DataFrame(
             predictions, columns=self.labels, index=self.sample_names
         )
-        return pred_df
\ No newline at end of file
+        return pred_df

diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py
deleted file mode 100644
index 5944b66..0000000
--- a/scaden/preprocessing/bulk_simulation.py
+++ /dev/null
@@ -1,402 +0,0 @@
-#########################################################################
-## Simulation of artificial bulk RNA-seq datasets from scRNA-seq data  #
-########################################################################
-import logging
-import sys
-import os
-import gc
-import glob
-import pandas as pd
-import numpy as np
-from tqdm import tqdm
-from pathlib import Path
-
-logger = logging.getLogger(__name__)
-
-
-def create_fractions(no_celltypes):
-    """
-    Create random fractions
-    :param no_celltypes: number of fractions to create
-    :return: list of random fracs of length no_cellttypes
-    """
-    fracs = np.random.rand(no_celltypes)
-    fracs_sum = np.sum(fracs)
-    fracs = np.divide(fracs, fracs_sum)
-    return fracs
-
-
-def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False):
-    """
-    Generate artifical bulk subsample with random fractions of celltypes
-    If sparse is set to true, add random celltypes to the missing celltypes
-    :param cells: scRNA-seq expression matrix
-    :param labels: celltype labels of single cell data (same order)
-    :param sample_size: number of cells to use
-    :param celltypes: all celltypes available in all datasets
-    :param available_celltypes: celltypes available in currently processed dataset
-    :param sparse: whether to create a sparse datasets (missing celltypes)
-    :return: expression of artificial subsample along with celltype fractions
-    """
-
-    if sparse:
-        no_keep = np.random.randint(1, len(available_celltypes))
-        keep = np.random.choice(
-            list(range(len(available_celltypes))), size=no_keep, replace=False
-        )
-        available_celltypes = [available_celltypes[i] for i in keep]
-
-    no_avail_cts = len(available_celltypes)
-
-    # Create fractions for available celltypes
-    fracs = create_fractions(no_celltypes=no_avail_cts)
-    samp_fracs = np.multiply(fracs, sample_size)
-    samp_fracs = list(map(int, samp_fracs))
-
-    # Make complete fracions
-    fracs_complete = [0] * len(celltypes)
-    for i, act in enumerate(available_celltypes):
-        idx = celltypes.index(act)
-        fracs_complete[idx] = fracs[i]
-
-    artificial_samples = []
-    for i in range(no_avail_cts):
-        ct = available_celltypes[i]
-        cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
-        cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i])
-        cells_sub = cells_sub.iloc[cells_fraction, :]
-        artificial_samples.append(cells_sub)
-
-    df_samp = pd.concat(artificial_samples, axis=0)
-    df_samp = df_samp.sum(axis=0)
-
-    return (df_samp, fracs_complete)
-
-
-def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
-    """
-    Generate many artifial bulk samples with known fractions
-    This function will create normal and sparse samples (no_samples)
-    :param cells:
-    :param labels:
-    :param sample_size:
-    :param no_celltypes:
-    :param no_samples:
-    :return: dataset and corresponding labels
-    """
-    X = []
-    Y = []
-
-    available_celltypes = list(set(y["Celltype"].tolist()))
-
-    # Create normal samples
-    pbar = tqdm(range(no_samples))
-    pbar.set_description(desc="Normal samples")
-    for _ in pbar:
-        sample, label = create_subsample(
-            x, y, sample_size, celltypes, available_celltypes
-        )
-        X.append(sample)
-        Y.append(label)
-
-    # Create sparse samples
-    n_sparse = int(no_samples)
-    pbar = tqdm(range(n_sparse))
-    pbar.set_description(desc="Sparse samples")
-    for _ in pbar:
-        sample, label = create_subsample(
-            x, y, sample_size, celltypes, available_celltypes, sparse=True
-        )
-        X.append(sample)
-        Y.append(label)
-    X = pd.concat(X, axis=1).T
-    Y = pd.DataFrame(Y, columns=celltypes)
-
-    return (X, Y)
-
-
-def filter_for_celltypes(x, y, celltypes):
-    """
-    Filter data for cells belonging to specified celltypes
-    :param x:
-    :param y:
-    :param celltypes:
-    :return:
-    """
-    cts = list(y["Celltype"])
-    keep = [elem in celltypes for elem in cts]
-    x = x.loc[keep, :]
-    y = y.loc[keep, :]
-    return (x, y)
-
-
-def shuffle_dataset(x, y):
-    """
-    Shuffle dataset while keeping x and y in synch
-    :param x:
-    :param y:
-    :return:
-    """
-    idx = np.random.permutation(x.index)
-    x_shuff = x.reindex(idx)
-    y_shuff = y.reindex(idx)
-    return (x_shuff, y_shuff)
-
-
-def filter_matrix_signature(mat, genes):
-    """
-    Filter expression matrix using given genes
-    Accounts for the possibility that some genes might not be in the matrix, for these genes
-    a column with zeros is added to the matrix
-    :param mat:
-    :param genes:
-    :return: filtered matrix
-    """
-    n_cells = mat.shape[0]
-    avail_genes = mat.columns
-    filt_genes = [g for g in genes if g in avail_genes]
-    missing_genes = [g for g in genes if g not in avail_genes]
-    mat = mat[filt_genes]
-    for mg in missing_genes:
-        mat[mg] = np.zeros(n_cells)
-    mat = mat[genes]
-    return mat
-
-
-def load_celltypes(path, name):
-    """ Load the cell type information """
-    try:
-        y = pd.read_table(path)
-        # Check if has Celltype column
-        if not "Celltype" in y.columns:
-            logger.error(
-                f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
-            )
-            sys.exit()
-    except FileNotFoundError as e:
-        logger.error(
-            f"No celltypes file found for {name}. It should be called {name}_celltypes.txt."
-        )
-        sys.exit(e)
-
-    return y
-
-
-def load_dataset(name, dir, pattern, fmt):
-    """
-    Load a dataset given its name and the directory
-    :param name: name of the dataset
-    :param dir: directory containing the data
-    :param pattern: pattern of the data files
-    :param fmt: input file format txt or h5ad
-    :return: X, Y
-    """
-    pattern = pattern.replace("*", "")
-    print("Loading " + name + " dataset ...")
-
-    if fmt == 'txt':
-        # Try to load celltypes
-        try:
-            y = pd.read_table(os.path.join(dir, name + "_celltypes.txt"))
-            # Check if has Celltype column
-            print(y.columns)
-            if not "Celltype" in y.columns:
-                logger.error(
-                    f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
-                )
-                sys.exit()
-        except FileNotFoundError as e:
-            logger.error(
-                f"No celltypes file found for {name}. It should be called {name}_celltypes.txt."
-            )
-            sys.exit(e)
-
-        # Try to load data file
-        try:
-            x = pd.read_table(os.path.join(dir, name + pattern), index_col=0)
-        except FileNotFoundError as e:
-            logger.error(
-                f"No counts file found for {name}. Was looking for file {name + pattern}"
-            )
-
-        # Check that celltypes and count file have same number of cells
-        if not y.shape[0] == x.shape[0]:
-            logger.error(
-                f"Different number of cells in {name}_celltypes and {name + pattern}! Make sure the data has been processed correctly."
-            )
-            sys.exit(1)
-    elif fmt == 'h5ad':
-        from anndata import read_h5ad
-        try:
-            xy = read_h5ad(os.path.join(dir, name + pattern))
-        except FileNotFoundError as e:
-            logger.error(
-                f"No h5ad file found for {name}. Was looking for file {name + pattern}"
-            )
-            sys.exit(e)
-        # cell types
-        try:
-            y = pd.DataFrame(xy.obs.Celltype)
-            y.reset_index(inplace=True, drop=True)
-        except:
-            logger.error(
-                f"Celltype attribute not found for {name}"
-            )
-            sys.exit(e)
-        # counts
-        x = pd.DataFrame(xy.X.todense())
-        x.index = xy.obs_names
-        x.columns = xy.var_names
-        del xy
-    else:
-        logger.error(
-            f"Unsuported file format {fmt}!"
-        )
-        sys.exit(1)
-
-    return (x, y)
-
-
-def merge_unkown_celltypes(y, unknown_celltypes):
-    """
-    Merge all unknown celltypes together
-    :param x:
-    :param y:
-    :param unknown_celltypes:
-    :return:
-    """
-    celltypes = list(y["Celltype"])
-    new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes]
-    y["Celltype"] = new_celltypes
-    return y
-
-
-def collect_celltypes(ys):
-    """
-    Collect all available celltypes given all dataset labels
-    :param ys: list of dataset labels
-    :return: list of available celltypes
-    """
-    ct_list = [list(set(y["Celltype"].tolist())) for y in ys]
-    celltypes = set()
-    for ct in ct_list:
-        celltypes = celltypes.union(set(ct))
-    celltypes = list(celltypes)
-    return celltypes
-
-
-def get_common_genes(xs, type="intersection"):
-    """
-    Get common genes for all matrices xs
-    Can either be the union or the intersection (default) of all genes
-    :param xs: cell x gene matrices
-    :return: list of common genes
-    """
-    genes = []
-    for x in xs:
-        genes.append(list(x.columns))
-
-    genes = [set(g) for g in genes]
-    com_genes = genes[0]
-    if type == "union":
-        for gi in range(1, len(genes)):
-            com_genes = com_genes.union(genes[gi])
-    elif type == "intersection":
-        for gi in range(1, len(genes)):
-            com_genes = com_genes.intersection(genes[gi])
-
-    else:
-        logging.critical("Wrong type selected to get common genes. Exiting.")
-        sys.exit()
-
-    if len(com_genes) == 0:
-        logging.critical("No common genes found. Exiting.")
-        sys.exit()
-
-    return list(com_genes)
-
-
-def generate_signature(x, y):
-    """
-    Generate signature of matrix using celltypes y
-    :param x: expression matrix
-    :param y: celltypes
-    :return: mean-expression per celltype
-    """
-
-    signature_matrix = []
-    celltypes = list(set(y["Celltype"]))
-    for ct in celltypes:
-        ct_exp = x.loc[np.array(y["Celltype"] == ct), :]
-        ct_exp = ct_exp.mean(axis=0)
-        signature_matrix.append(ct_exp)
-
-    signature_matrix = pd.concat(signature_matrix, axis=1)
-    signature_matrix.columns = celltypes
-    return signature_matrix
-
-
-def simulate_bulk(
-    sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes, fmt
-):
-    """
-    Simulate artificial bulk samples from single cell datasets
-    :param sample_size: number of cells per sample
-    :param num_samples: number of sample to simulate
-    :param data_path: path to the data directory
-    :param out_dir: output directory
-    :param pattern of the data files
-    :param unknown_celltypes: which celltypes to merge into the unknown class
-    """
-
-    num_samples = int(
-        num_samples / 2
-    )  # divide by two so half is sparse and half is normal samples
-
-    # List available datasets
-    if not data_path.endswith("/"):
-        data_path += "/"
-    files = glob.glob(os.path.join(data_path, pattern))
-    files = [os.path.basename(x) for x in files]
-    datasets = [x.replace(pattern.replace("*", ""), "") for x in files]
-
-    if len(datasets) == 0:
-        logging.error("No datasets found! Have you specified the pattern correctly?")
-        sys.exit(1)
-
-    logger.info("Datasets: [cyan]" + str(datasets) + "[/]")
-
-    # Load datasets
-    xs, ys = [], []
-    for i, n in enumerate(datasets):
-        x, y = load_dataset(n, data_path, pattern=pattern, fmt=fmt)
-        xs.append(x)
-        ys.append(y)
-
-    # Get common gene list
-    all_genes = get_common_genes(xs, type="intersection")
-    logger.info("No. of common genes: " + str(len(all_genes)))
-    xs = [filter_matrix_signature(m, all_genes) for m in xs]
-
-    # Merge unknown celltypes
-    logger.info("Merging unknown cell types: " + str(unknown_celltypes))
-    for i in range(len(ys)):
-        ys[i] = merge_unkown_celltypes(ys[i], unknown_celltypes)
-
-    # Collect all available celltypes
-    celltypes = collect_celltypes(ys)
-    logger.info("Available celltypes: " + str(celltypes))
-    pd.DataFrame(celltypes).to_csv(out_dir + "celltypes.txt", sep="\t")
-
-    # Create datasets
-    for i in range(len(xs)):
-        logger.info("Subsampling " + datasets[i] + "...")
-
-        tmpx, tmpy = create_subsample_dataset(
-            xs[i], ys[i], sample_size, celltypes, num_samples
-        )
-        tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False)
-        tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False)
-        gc.collect()
-
-    logger.info("[bold green]Finished data simulation!")

diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py
deleted file mode 100644
index 0938e80..0000000
--- a/scaden/preprocessing/create_h5ad_file.py
+++ /dev/null
@@ -1,121 +0,0 @@
-"""
-Combine artificial bulk datasets and optionally other datasets into h5ad files
-usable for scaden training
-
-When using additional datasets, they should be in similar format and best have the same output cell types.
-"""
-import gc
-import anndata
-import glob
-import os
-import sys
-import logging
-import pandas as pd
-import numpy as np
-
-logger = logging.getLogger(__name__)
-
-
-def parse_data(x_path, y_path):
-    """
-    Parse data and labels and divide them into training and testset
-    :param x_path:
-    :param y_path:
-    :return: training and test data and labels
-    """
-    # Load the data
-    try:
-        x = pd.read_table(x_path, sep="\t")
-        y = pd.read_table(y_path, sep="\t")
-    except FileNotFoundError as e:
-        logging.error(f" Could not find simulated data files: {e}")
-        sys.exit()
-    labels = list(y.columns)
-
-    # Transform Y to numpy array and split in train and testset
-    yseries = []
-
-    for i in range(y.shape[0]):
-        yseries.append(list(y.iloc[i]))
-    y = np.array(yseries)
-
-    return x, y, labels
-
-
-def load_celltypes(brain_training_dir):
-
-    celltypes_path = os.path.join(brain_training_dir, "celltypes.txt")
-    celltypes = pd.read_csv(celltypes_path, sep="\t")
-    celltypes = list(celltypes.iloc[:, 1])
-    return celltypes
-
-
-def sort_celltypes(ratios, labels, ref_labels):
-    """
-    Bring ratios in correct order of cell types
-    :param ratios:
-    :param labels:
-    :param ref_labels:
-    :return:
-    """
-    idx = [labels.index(x) for x in ref_labels]
-    ratios = ratios[:, idx]
-    return ratios
-
-
-def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"):
-    """
-    Create h5ad file from simulated data
-    """
-    logger.info("[bold]Creating h5ad file")
-    # List available datasets
-    files = glob.glob(data_dir + pattern)
-    files = [os.path.basename(x) for x in files]
-    datasets = [x.replace(pattern.replace("*", ""), "") for x in files]
-
-    # get celltypes
-    celltypes = load_celltypes(data_dir)
-    logger.info(f"Celltypes: {celltypes}")
-    logger.info(f"Found datasets: {datasets}")
-    adata = []
-
-    # Create adata datasets for each
-    for i, train_file in enumerate(datasets):
-
-        rna_file = os.path.join(data_dir, train_file + "_samples.txt")
-        ratios_file = os.path.join(data_dir, train_file + "_labels.txt")
-
-        x, y, labels = parse_data(rna_file, ratios_file)
-
-        # sort y
-        y = sort_celltypes(y, labels, celltypes)
-        test = [labels.index(x) for x in celltypes]
-        labels = [labels[i] for i in test]
-
-        x = x.sort_index(axis=1)
-        ratios = pd.DataFrame(y, columns=celltypes)
-        ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index)
-
-        logger.info("Processing " + str(train_file))
-        x = pd.DataFrame(x)
-        adata.append(
-            anndata.AnnData(
-                X=x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(x))
-            )
-        )
-
-    for i in range(1, len(adata)):
-        logger.info("Concatenating " + str(i))
-        adata[0] = adata[0].concatenate(adata[1])
-        del adata[1]
-        gc.collect()
-        logger.info(len(adata))
-    adata = adata[0]
-
-    # add cell types and signature genes
-    adata.uns["cell_types"] = celltypes
-    adata.uns["unknown"] = unknown
-
-    # save data
-    adata.write(out_path)
-    logger.info(f"Created h5ad file: {out_path}")

diff --git a/scaden/simulate.py b/scaden/simulate.py
index 84053b2..ef4e621 100644
--- a/scaden/simulate.py
+++ b/scaden/simulate.py
@@ -1,7 +1,5 @@
-from scaden.preprocessing.create_h5ad_file import create_h5ad_file
-from scaden.preprocessing.bulk_simulation import simulate_bulk
-from scaden.preprocessing import BulkSimulator
-import os
+from scaden.simulation import BulkSimulator
+
 """
 Simulation of artificial bulk RNA-seq samples from scRNA-seq data
 and subsequenbt formatting in .h5ad file for training with Scaden
@@ -10,22 +8,20 @@ def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern,
                unknown_celltypes=unknown_celltypes,
                fmt=fmt)
 
+    # Perform dataset simulation
     bulk_simulator.simulate()
 
-    bulk_simulator.merge_datasets(data_dir=bulk_simulator.out_dir, files=bulk_simulator.dataset_files)
-
-    # Perform the bulk simulation
-    #simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern,
-    #              unknown_celltypes, fmt)
-
-    # Create the h5ad training data file
-    #out_name = os.path.join(simulate_dir, out_prefix + ".h5ad")
-    #create_h5ad_file(simulate_dir, out_name, unknown_celltypes)
+    # Merge the resulting datasets
+    bulk_simulator.merge_datasets(data_dir=bulk_simulator.out_dir,
+                                  files=bulk_simulator.dataset_files,
+                                  out_name=out_prefix + ".h5ad")

diff --git a/scaden/preprocessing/__init__.py b/scaden/simulation/__init__.py
similarity index 100%
rename from scaden/preprocessing/__init__.py
rename to scaden/simulation/__init__.py

diff --git a/scaden/preprocessing/bulk_simulator.py b/scaden/simulation/bulk_simulator.py
similarity index 92%
rename from scaden/preprocessing/bulk_simulator.py
rename to scaden/simulation/bulk_simulator.py
index 213cbb6..3fc3607 100644
--- a/scaden/preprocessing/bulk_simulator.py
+++ b/scaden/simulation/bulk_simulator.py
@@ -258,39 +258,39 @@ class BulkSimulator(object):
         @param sparse:
         @return:
         """
-        # todo fix available cell types again
+        available_celltypes = celltypes
         if sparse:
-            no_keep = np.random.randint(1, len(celltypes))
+            no_keep = np.random.randint(1, len(available_celltypes))
             keep = np.random.choice(
-                list(range(len(celltypes))), size=no_keep, replace=False
+                list(range(len(available_celltypes))), size=no_keep, replace=False
             )
-            available_celltypes = [celltypes[i] for i in keep]
+            available_celltypes = [available_celltypes[i] for i in keep]
 
-        no_avail_cts = len(celltypes)
+        no_avail_cts = len(available_celltypes)
 
         # Create fractions for available celltypes
         fracs = create_fractions(no_celltypes=no_avail_cts)
         samp_fracs = np.multiply(fracs, self.sample_size)
         samp_fracs = list(map(int, samp_fracs))
 
-        # Make complete fractions
+        # Make complete fracions
         fracs_complete = [0] * len(celltypes)
-        for i, act in enumerate(celltypes):
+        for i, act in enumerate(available_celltypes):
             idx = celltypes.index(act)
             fracs_complete[idx] = fracs[i]
 
         artificial_samples = []
         for i in range(no_avail_cts):
-            ct = celltypes[i]
+            ct = available_celltypes[i]
             cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
             cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i])
             cells_sub = cells_sub.iloc[cells_fraction, :]
             artificial_samples.append(cells_sub)
 
-        df_sample = pd.concat(artificial_samples, axis=0)
-        df_sample = df_sample.sum(axis=0)
+        df_samp = pd.concat(artificial_samples, axis=0)
+        df_samp = df_samp.sum(axis=0)
 
-        return df_sample, fracs_complete
+        return df_samp, fracs_complete

diff --git a/setup.py b/setup.py
index 004ccd9..4ae23dd 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,6 @@
         "scikit-learn",
         "tensorflow>=2.0",
         "anndata",
-        "tqdm",
         "click",
         "h5py~=2.10.0",
         "rich",

From 896347e6355abcd66a72f10cd25b63ff2655900d Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 13:36:41 +0100
Subject: [PATCH 07/19] fixed combined celltypes

---
 scaden/simulation/bulk_simulator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scaden/simulation/bulk_simulator.py b/scaden/simulation/bulk_simulator.py
index 3fc3607..fee9e64 100644
--- a/scaden/simulation/bulk_simulator.py
+++ b/scaden/simulation/bulk_simulator.py
@@ -301,6 +301,7 @@ class BulkSimulator(object):
         @param files: list of files to merge
         @return:
         """
+        non_celltype_obs = ["ds", "batch"]
         if not files:
             files = glob.glob(data_dir + "*.h5ad")
 
@@ -313,8 +314,7 @@ class BulkSimulator(object):
             adata = adata.concatenate(ad.read_h5ad(files[i]), uns_merge="same")
 
         combined_celltypes = list(adata.obs.columns)
-        combined_celltypes.remove("ds")
-        combined_celltypes.remove("batch")
+        combined_celltypes = [x for x in combined_celltypes if not x in non_celltype_obs]
         for ct in combined_celltypes:
             adata.obs[ct].fillna(0, inplace=True)

From 7e5f1ae8237bf1860540fea260dd4b63ada8ff2c Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 13:37:08 +0100
Subject: [PATCH 08/19] bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 4ae23dd..3233eb5 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 from setuptools import setup, find_packages
 
-version = "1.0.2"
+version = "1.1.0"
 
 
 with open("README.md", "r", encoding="UTF-8") as fh:
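The dataset merging refactored in patches 5-7 boils down to the following sketch (file names illustrative): datasets simulated from different scRNA-seq sources can contain different cell types, so after concatenation the missing fraction columns hold NaN and are filled with 0:

    import anndata as ad

    adata = ad.read_h5ad("data1.h5ad")  # illustrative file names
    adata = adata.concatenate(ad.read_h5ad("data2.h5ad"), uns_merge="same")

    # "ds" and "batch" are bookkeeping columns, not cell type fractions
    celltypes = [c for c in adata.obs.columns if c not in ("ds", "batch")]
    for ct in celltypes:
        adata.obs[ct].fillna(0, inplace=True)  # absent cell type -> fraction 0
    adata.uns["cell_types"] = celltypes
    adata.write("data.h5ad")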
From 09ee0a4bfd24045d6faa77af8cac8992385170ef Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 13:41:18 +0100
Subject: [PATCH 09/19] fixed out_name bug

---
 scaden/predict.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scaden/predict.py b/scaden/predict.py
index b0f567d..78cbb14 100644
--- a/scaden/predict.py
+++ b/scaden/predict.py
@@ -52,7 +52,7 @@ def prediction(model_dir, data_path, out_name, seed=0):
     )
     # Predict ratios
     preds_256 = cdn256.predict(
-        input_path=data_path, out_name="scaden_predictions_m256.txt"
+        input_path=data_path
     )
 
     # Mid model predictions
@@ -65,7 +65,7 @@ def prediction(model_dir, data_path, out_name, seed=0):
     )
     # Predict ratios
     preds_512 = cdn512.predict(
-        input_path=data_path, out_name="scaden_predictions_m512.txt"
+        input_path=data_path
    )
 
     # Large model predictions
@@ -78,7 +78,7 @@ def prediction(model_dir, data_path, out_name, seed=0):
     )
     # Predict ratios
     preds_1024 = cdn1024.predict(
-        input_path=data_path, out_name="scaden_predictions_m1024.txt"
+        input_path=data_path
     )
 
     # Average predictions

From c5a12a78d9e35b650dc55b9b9803e89c6d050e72 Mon Sep 17 00:00:00 2001
From: kevinmenden
Date: Sat, 20 Mar 2021 14:30:29 +0100
Subject: [PATCH 10/19] add number of celltype as argument

---
 scaden/__main__.py | 16 ++++++----------
 scaden/example.py  |  9 ++++++---
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/scaden/__main__.py b/scaden/__main__.py
index c188d56..2822d7e 100644
--- a/scaden/__main__.py
+++ b/scaden/__main__.py
@@ -157,7 +157,9 @@ def process(data_path, prediction_data, processed_path, var_cutoff):
         var_cutoff=var_cutoff,
     )
 
-
+"""
+Simulate dataset
+"""
 @cli.command()
 @click.option("--out", "-o", default="./", help="Directory to store output files in")
 @click.option("--data", "-d", default=".", help="Path to scRNA-seq dataset(s)")
@@ -209,23 +211,17 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix, data_format):
     )
 
 
-"""
-Simulate dataset
-"""
-
-
 """
 Generate example data
 """
-
-
 @cli.command()
 @click.option("--cells", "-c", default=10, help="Number of cells [default: 10]")
+@click.option("--types", "-t", default=5, help="Number of cell types [default: 5]")
 @click.option("--genes", "-g", default=100, help="Number of genes [default: 100]")
 @click.option("--out", "-o", default="./", help="Output directory [default: ./]")
 @click.option(
     "--samples", "-n", default=10, help="Number of bulk samples [default: 10]"
 )
-def example(cells, genes, samples, out):
+def example(cells, genes, samples, out, types):
     """ Generate an example dataset """
-    exampleData(n_cells=cells, n_genes=genes, n_samples=samples, out_dir=out)
+    exampleData(n_cells=cells, n_genes=genes, n_samples=samples, out_dir=out, n_types=types)

diff --git a/scaden/example.py b/scaden/example.py
index 9e44bd5..925ad78 100644
--- a/scaden/example.py
+++ b/scaden/example.py
@@ -2,23 +2,26 @@
 Generate random example data which allows for testing and to give users
 examples for the input format
 """
-import string
 import random
 import os
 import logging
 import pandas as pd
 import numpy as np
+import sys
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 
-def exampleData(n_cells=10, n_genes=100, n_samples=10, out_dir="./"):
+def exampleData(n_cells=10, n_genes=100, n_samples=10, n_types=5, out_dir="./"):
     """
     Generate an example scRNA-seq count file
     :param n: number of cells
     :param g: number of genes
     """
+    if n_types > n_cells:
+        logger.error("You can't specify more cell types than cells!")
+        sys.exit(1)
 
     # Generate example scRNA-seq data
     counts = np.random.randint(low=0, high=1000, size=(n_cells, n_genes))
@@ -28,7 +31,7 @@ def exampleData(n_cells=10, n_genes=100, n_samples=10, out_dir="./"):
     df = pd.DataFrame(counts, columns=gene_names)
 
     # Generate example celltype labels
-    celltypes = ["celltype"] * np.random.randint(low=2, high=n_cells - 1)
+    celltypes = ["celltype"] * n_types
     for i in range(len(celltypes)):
         celltypes[i] = celltypes[i] + str(i)
     celltype_list = random.choices(celltypes, k=n_cells)
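A quick way to exercise the new option (illustrative values):

    # CLI equivalent: scaden example --cells 50 --genes 200 --types 5
    from scaden.example import exampleData

    exampleData(n_cells=50, n_genes=200, n_samples=10, n_types=5, out_dir="./")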
From ee4fd14983de93824e8688fd1484f9fb2e86b66c Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Sat, 20 Mar 2021 14:34:34 +0100 Subject: [PATCH 11/19] added blog section --- docs/blog.md | 6 ++++++ mkdocs.yml | 1 + 2 files changed, 7 insertions(+) create mode 100644 docs/blog.md diff --git a/docs/blog.md b/docs/blog.md new file mode 100644 index 0000000..ea4d09f --- /dev/null +++ b/docs/blog.md @@ -0,0 +1,6 @@ +# Scaden Blog +Apart from the changelog, this is a more informal section where I will inform about new features +that have been (or will be) implemented in Scaden. + +# Scaden v1.1.0 - Performance Improvements (21.03.2021) + diff --git a/mkdocs.yml b/mkdocs.yml index e78c266..598578f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,5 +4,6 @@ nav: - Installation: installation.md - Usage: usage.md - Datasets: datasets.md + - Blog: blog.md - Changelog: changelog.md theme: readthedocs From 972be66c92c8f4cf8f34957209fb744bacf5cae8 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Sat, 20 Mar 2021 14:50:29 +0100 Subject: [PATCH 12/19] updated downloads badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1b0f11..5a6b46a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ![MIT](https://img.shields.io/badge/License-MIT-black) ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) ![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) -![Downloads](https://static.pepy.tech/personalized-badge/scaden?period=total&units=international_system&left_color=blue&right_color=green&left_text=Downloads) +[![Downloads](https://pepy.tech/badge/scaden)](https://pepy.tech/project/scaden) ![Docker](https://github.com/kevinmenden/scaden/workflows/Docker/badge.svg) ![Scaden CI](https://github.com/kevinmenden/scaden/workflows/Scaden%20CI/badge.svg) From f1f8abc837eb29246dbea59f922a8e45784d82e7 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Sat, 20 Mar 2021 14:51:44 +0100 Subject: [PATCH 13/19] updated version badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a6b46a..a9956f9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ![Scaden](docs/img/scaden_logo.png) -![Scaden version](https://img.shields.io/badge/scaden-v1.0.2-cyan) +![Scaden version](https://img.shields.io/badge/scaden-v1.1.0-cyan) ![MIT](https://img.shields.io/badge/License-MIT-black) ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) From d42a69fd745fd7fef81514c19923171f1e685bbf Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Sun, 21 Mar 2021 07:58:43 +0100 Subject: [PATCH 14/19] remove bioconda installation notes --- README.md | 4 ++-- docs/installation.md | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a9956f9..e4cd234 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ ![MIT](https://img.shields.io/badge/License-MIT-black) ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue)
-![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) [![Downloads](https://pepy.tech/badge/scaden)](https://pepy.tech/project/scaden) ![Docker](https://github.com/kevinmenden/scaden/workflows/Docker/badge.svg) ![Scaden CI](https://github.com/kevinmenden/scaden/workflows/Scaden%20CI/badge.svg) @@ -39,7 +38,8 @@ To install Scaden via pip, simply run the following command: ### Bioconda -You can also install Scaden via bioconda, using: +Bioconda installation is currently not supported for the newest Scaden versions, but this will hopefully change soon. +It is therefore highly recommended to install via pip. `conda install -c bioconda scaden` diff --git a/docs/installation.md b/docs/installation.md index e7f6c21..6af04f0 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -10,7 +10,8 @@ To install Scaden via pip, simply run the following command: ## Bioconda -You can also install Scaden via bioconda, using:: +Bioconda installation is currently not supported for the newest Scaden versions, but this will hopefully change soon. +It is therefore highly recommended to install via pip. `conda install -c bioconda scaden` From 8bbbc1d8ac1a03736928017800c7476109a5f525 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Sun, 21 Mar 2021 08:16:16 +0100 Subject: [PATCH 15/19] fixed badges --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index e4cd234..bb7c5db 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ ![Scaden version](https://img.shields.io/badge/scaden-v1.1.0-cyan) - ![MIT](https://img.shields.io/badge/License-MIT-black) ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) [![Downloads](https://pepy.tech/badge/scaden)](https://pepy.tech/project/scaden) From eb5e46c9830b8f7cabe2b18736b2282b24bc961b Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Tue, 23 Mar 2021 20:42:34 +0100 Subject: [PATCH 16/19] removed TF logging messages --- scaden/__main__.py | 4 ++-- scaden/model/scaden.py | 12 ++++++------ scaden/train.py | 2 -- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/scaden/__main__.py b/scaden/__main__.py index 2822d7e..3fff2ec 100644 --- a/scaden/__main__.py +++ b/scaden/__main__.py @@ -5,6 +5,7 @@ import rich.logging import logging import os +import tensorflow as tf from scaden.train import training from scaden.predict import prediction from scaden.process import processing @@ -30,8 +31,7 @@ ) ) -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" - +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" def main(): text = """ diff --git a/scaden/model/scaden.py b/scaden/model/scaden.py index 33da7a7..2fda9ab 100644 --- a/scaden/model/scaden.py +++ b/scaden/model/scaden.py @@ -12,8 +12,10 @@ import collections from .functions import sample_scaling from rich.progress import Progress, BarColumn -logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) +tf.get_logger().setLevel('ERROR') +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" class Scaden(object): """ @@ -299,14 +301,11 @@ def train(self, input_path, train_datasets): progress_bar = Progress( "[bold blue]{task.description}", "[bold cyan]Step: {task.fields[step]}, Loss: {task.fields[loss]}", - BarColumn(bar_width=None) + BarColumn(bar_width=None), ) + training_progress = progress_bar.add_task(self.model_name, total=self.num_steps, step=0, loss=1) with progress_bar: - training_progress = progress_bar.add_task(self.model_name, - total=self.num_steps, - step=0, - loss=1) for step in range(self.num_steps): @@ -326,6 +325,7 @@ def train(self, 
input_path, train_datasets): if step % 100 == 0: gc.collect() + # Save the trained model self.model.save(self.model_dir) pd.DataFrame(self.labels).to_csv( diff --git a/scaden/train.py b/scaden/train.py index 7f082c1..efe5f78 100644 --- a/scaden/train.py +++ b/scaden/train.py @@ -10,8 +10,6 @@ # Imports import logging -import tensorflow as tf -from anndata import read_h5ad from scaden.model.architectures import architectures from scaden.model.scaden import Scaden From 11678f0c5bdbf3653a2d2b4602dcd50d509d9d20 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 24 Mar 2021 13:00:39 +0100 Subject: [PATCH 17/19] updated docs --- CHANGELOG.md | 9 +++++++++ docs/blog.md | 5 +++++ docs/changelog.md | 9 +++++++++ docs/usage.md | 3 ++- scaden/__main__.py | 24 ++++++++++++++++++++++-- scaden/merge.py | 18 ++++++++++++++++++ scaden/simulate.py | 2 +- 7 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 scaden/merge.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9683b0b..e1bb857 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Scaden Changelog +## Version 1.1.0 + +* Reduced memory usage of `scaden simulate` significantly by performing simulation for one dataset at a time. +* Using `.h5ad` format to store simulated data +* Allow reading data in `.h5ad` format for improved performance (courtesy of @eboileau) +* Improved logging and using rich progress bar for training +* Gene subsetting is now done only when merging datasets, which will allow generating different combinations +of simulated datasets + ### Version 1.0.2 * General improvement of logging using the 'rich' library for colorized output diff --git a/docs/blog.md b/docs/blog.md index ea4d09f..93cc34c 100644 --- a/docs/blog.md +++ b/docs/blog.md @@ -4,3 +4,8 @@ that have been (or will be) implemented in Scaden. # Scaden v1.1.0 - Performance Improvements (21.03.2021) +Scaden v1.1.0 brings significantly improved memory consumption for the data simulation step, which was asked for +quite frequently. Now, instead of using about 4 GB of memory to simulate a small dataset, Scaden only uses 1 GB. This will +allow you to create datasets from large collections of scRNA-seq datasets without needing excessive memory. Furthermore, +Scaden now stores the simulated data in `.h5ad` format with the full list of genes. This way you can simulate from a +scRNA-seq dataset once and combine it with other datasets in the future. diff --git a/docs/changelog.md b/docs/changelog.md index 9683b0b..e1bb857 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,14 @@ # Scaden Changelog +## Version 1.1.0 + +* Reduced memory usage of `scaden simulate` significantly by performing simulation for one dataset at a time. +* Using `.h5ad` format to store simulated data +* Allow reading data in `.h5ad` format for improved performance (courtesy of @eboileau) +* Improved logging and using rich progress bar for training +* Gene subsetting is now done only when merging datasets, which will allow generating different combinations +of simulated datasets + ### Version 1.0.2 * General improvement of logging using the 'rich' library for colorized output diff --git a/docs/usage.md b/docs/usage.md index 21b06b3..f890b1e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -126,7 +126,8 @@ This command will create the artificial samples in the current working directory For Scaden to work properly, your input files have to be correctly formatted. As long as you use Scaden's inbuilt functionality to generate the training data, you should have no problem with formatting there. The prediction file, however, you have to format yourself. This should be a file of shape m X n, where m are your features (genes) and n your samples. So each row corresponds to a gene, and each column to a sample. Leave the column name for the genes empty (just put a `\t` there). This is a rather standard format to store gene expression tables, so you should have not much work assuring that the -format fits. +format fits. Since version `v1.1.0` it is also possible to load data for simulation in `.h5ad` format for improved performance. In this case, the AnnData object should have +a `Celltype` column in the `obs` field.
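For illustration, a minimal `.h5ad` input of this shape could be put together as follows (a sketch with made-up counts, cell types, and file name, not a file shipped with Scaden; the matrix is stored as a sparse matrix, as is common for scRNA-seq data):

```python
import anndata as ad
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Made-up single-cell data: 5 cells x 3 genes
counts = csr_matrix(np.random.randint(0, 1000, size=(5, 3)).astype(np.float32))
obs = pd.DataFrame({"Celltype": ["B", "T", "T", "NK", "B"]})  # the required annotation
var = pd.DataFrame(index=["gene1", "gene2", "gene3"])

adata = ad.AnnData(X=counts, obs=obs, var=var)
adata.write("example_counts.h5ad")  # name it to match your --pattern, e.g. *_counts.h5ad
```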
Your data can either be raw counts or normalized, just make sure that they are not in logarithmic space already. When loading a prediction file, Scaden applies its scaling procedure to it, which involves taking the logarithm of your counts. So as long as they are not already in logarithmic space, Scaden will be able to handle both raw and normalized counts / expression values. diff --git a/scaden/__main__.py b/scaden/__main__.py index 3fff2ec..9977ccd 100644 --- a/scaden/__main__.py +++ b/scaden/__main__.py @@ -11,7 +11,7 @@ from scaden.process import processing from scaden.simulate import simulation from scaden.example import exampleData - +from scaden.merge import merge_datasets """ author: Kevin Menden @@ -33,6 +33,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" + def main(): text = """ ____ _ @@ -146,7 +147,7 @@ def predict(data_path, model_dir, outname, seed): "--var_cutoff", default=0.1, help="Filter out genes with a variance less than the specified cutoff. A low cutoff is recommended," - "this should only remove genes that are obviously uninformative.", + "this should only remove genes that are obviously uninformative.", ) def process(data_path, prediction_data, processed_path, var_cutoff): """ Process a dataset for training """ @@ -157,9 +158,12 @@ def process(data_path, prediction_data, processed_path, var_cutoff): var_cutoff=var_cutoff, ) + """ Simulate dataset """ + + @cli.command() @click.option("--out", "-o", default="./", help="Directory to store output files in") @click.option("--data", "-d", default=".", help="Path to scRNA-seq dataset(s)") @@ -211,9 +215,25 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix, data_format) ) +""" +Merge simulated datasets +""" + + +@cli.command() +@click.option("--data", "-d", default=".", help="Directory containing simulated datasets (in .h5ad format)") +@click.option("--prefix", "-p", default="data", help="Prefix of output file [default: data]") +@click.option("--files", "-f", default=None, help="Comma-separated list of filenames to merge") +def merge(data, prefix, files): + """ Merge simulated datasets into one training dataset """ + merge_datasets(data_dir=data, prefix=prefix, files=files) + + """ Generate example data """ + + @cli.command() @click.option("--cells", "-c", default=10, help="Number of cells [default: 10]") @click.option("--types", "-t", default=5, help="Number of cell types [default: 5]") diff --git a/scaden/merge.py b/scaden/merge.py new file mode 100644 index 0000000..13f1302 --- /dev/null +++ b/scaden/merge.py @@ -0,0 +1,18 @@ +from scaden.simulation import BulkSimulator + +""" +Merge simulated datasets +""" + + +def merge_datasets(data_dir, prefix, files=None): + + bulk_simulator = BulkSimulator() + + if files: + files = files.split(",") + + # Merge the
resulting datasets + bulk_simulator.merge_datasets(data_dir=data_dir, + files=files, + out_name=prefix + ".h5ad") diff --git a/scaden/simulate.py b/scaden/simulate.py index ef4e621..566a5b3 100644 --- a/scaden/simulate.py +++ b/scaden/simulate.py @@ -22,6 +22,6 @@ def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern, bulk_simulator.simulate() # Merge the resulting datasets - bulk_simulator.merge_datasets(data_dir=bulk_simulator.out_dir, + bulk_simulator.merge_datasets(data_dir=simulate_dir, files=bulk_simulator.dataset_files, out_name=out_prefix + ".h5ad") From c984d9146439597e94e7bd1c300cef872491f98a Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Thu, 25 Mar 2021 06:52:16 +0100 Subject: [PATCH 18/19] fixed data simulation bug --- scaden/simulation/bulk_simulator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scaden/simulation/bulk_simulator.py b/scaden/simulation/bulk_simulator.py index fee9e64..80cf959 100644 --- a/scaden/simulation/bulk_simulator.py +++ b/scaden/simulation/bulk_simulator.py @@ -68,7 +68,7 @@ def simulate(self): files = glob.glob(os.path.join(self.data_path, self.pattern)) files = [os.path.basename(x) for x in files] self.datasets = [x.replace(self.pattern.replace("*", ""), "") for x in files] - self.dataset_files = [x + ".h5ad" for x in self.datasets] + self.dataset_files = [os.path.join(self.out_dir, x + ".h5ad") for x in self.datasets] if len(self.datasets) == 0: logging.error("No datasets found! Have you specified the pattern correctly?") @@ -114,7 +114,7 @@ def simulate_dataset(self, dataset): ann_data.uns['unknown'] = self.unknown_celltypes ann_data.uns['cell_types'] = celltypes - ann_data.write(self.out_dir + dataset + ".h5ad") + ann_data.write(os.path.join(self.out_dir, dataset + ".h5ad")) def load_dataset(self, dataset): """ @@ -303,7 +303,7 @@ def merge_datasets(data_dir="./", files=None, out_name="data.h5ad"): """ non_celltype_obs = ["ds", "batch"] if not files: - files = glob.glob(data_dir + "*.h5ad") + files = glob.glob(os.path.join(data_dir, "*.h5ad")) logger.info(f"Merging datasets: {files} into [bold cyan]{out_name}") From 238c2dce4c3c040d72d4284417ed2ecb8cd6d210 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Thu, 25 Mar 2021 07:06:33 +0100 Subject: [PATCH 19/19] updated documentation --- CHANGELOG.md | 1 + docs/blog.md | 15 +++++++++------ docs/changelog.md | 2 ++ docs/index.md | 4 ++++ docs/usage.md | 8 +++++++- 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1bb857..560b1eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ * Improved logging and using rich progress bar for training * Gene subsetting is now done only when merging datasets, which will allow generating different combinations of simulated datasets +* Added `scaden merge` command which allows merging of previously created datasets ### Version 1.0.2 diff --git a/docs/blog.md b/docs/blog.md index 93cc34c..8bf375f 100644 --- a/docs/blog.md +++ b/docs/blog.md @@ -2,10 +2,13 @@ Apart from the changelog, this is a more informal section where I will inform about new features that have been (or will be) implemented in Scaden. -# Scaden v1.1.0 - Performance Improvements (21.03.2021) +# Scaden v1.1.0 - Performance Improvements and `scaden merge` tool (21.03.2021) + +Scaden v1.1.0 brings significantly improved memory consumption for the data simulation step, which was a frequently asked-for feature.
+Now, instead of using about 4 GB of memory to simulate a small dataset, Scaden only uses 1 GB. Memory usage does not increase +with the number of datasets anymore. This will allow you to create datasets from large collections of scRNA-seq datasets without +needing excessive memory. Furthermore, Scaden now stores the simulated data in `.h5ad` format with the full list of genes. +This way you can simulate from a scRNA-seq dataset once and combine it with other datasets in the future. To help with this, +I've added the `scaden merge` command, which takes a list of datasets or a directory with `.h5ad` datasets and creates +a new training dataset from it. -Scaden v1.1.0 brings significantly improved memory consumption for the data simulation step, which was asked for -quite frequently. Now, instead of using about 4 GB of memory to simulate a small dataset, Scaden only uses 1 GB. This will -allow you to create datasets from large collections of scRNA-seq datasets without needing excessive memory. Furthermore, -Scaden now stores the simulated data in `.h5ad` format with the full list of genes. This way you can simulate from a -scRNA-seq dataset once and combine it with other datasets in the future. diff --git a/docs/changelog.md b/docs/changelog.md index e1bb857..e5fa106 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -8,6 +8,8 @@ * Improved logging and using rich progress bar for training * Gene subsetting is now done only when merging datasets, which will allow generating different combinations of simulated datasets +* Added `scaden merge` command which allows merging of previously created datasets + ### Version 1.0.2 diff --git a/docs/index.md b/docs/index.md index f1d7083..00661f1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,3 +8,7 @@ at the [DZNE Tübingen](https://www.dzne.de/en/about-us/sites/tuebingen/) and th A paper describing Scaden has been published in Science Advances: [Deep-learning based cell composition analysis from tissue expression profiles](https://advances.sciencemag.org/content/6/30/eaba2619) + +For information about how to install Scaden, go to the [Installation](installation.md) section. Look in the [Usage](usage.md) +section for general help with Scaden usage. In the [Datasets](datasets.md) section you'll find a list of prepared training datasets. +You can also have a look in the [Blog](blog.md) section, where I summarize new features that are added to Scaden. \ No newline at end of file diff --git a/docs/usage.md b/docs/usage.md index f890b1e..50d5032 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -120,7 +120,13 @@ An example for a pattern would be `*_counts.txt`. This pattern would find the fo Make sure to include an `*` in your pattern! -This command will create the artificial samples in the current working directory. You can also specificy an output directory using the `--out` parameter. Scaden will also directly create a .h5ad file in this directory, which is the file you will need for training. By default, this file will be called `data.h5ad`, however you can change the prefix using the `--prefix` flag. +This command will create the artificial samples in the current working directory. You can also specify an output directory using the `--out` parameter. +Scaden will also directly create a .h5ad file in this directory, which is the file you will need for training. +By default, this file will be called `data.h5ad`, however you can change the prefix using the `--prefix` flag. + +Alternatively, you can manually merge `.h5ad` files that have been created with `scaden simulate` from v1.1.0 on using +the `scaden merge` command. Either point it to a directory of `.h5ad` files, or give it a comma-separated list of files to merge. Type `scaden merge --help` for details.
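The same merge can also be scripted from Python through the `merge_datasets` wrapper added in this patch series (a sketch; the directory and prefix are made-up examples):

```python
from scaden.merge import merge_datasets

# Merge all .h5ad files found in ./simulations/ into training_data.h5ad;
# pass files="dataset1.h5ad,dataset2.h5ad" instead to pick specific files
merge_datasets(data_dir="./simulations/", prefix="training_data")
```

The merged training file keeps the combined cell types in its `uns['cell_types']` field, as set in `BulkSimulator.merge_datasets` above.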
## File Formats For Scaden to work properly, your input files have to be correctly formatted. As long as you use Scaden's inbuilt functionality to generate the training data, you should have no problem