diff --git a/CHANGELOG.md b/CHANGELOG.md index 9683b0b..560b1eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Scaden Changelog +### Version 1.1.0 + +* Reduced memory usage of `scaden simulate` significantly by performing simulation for one dataset at a time. +* Using `.h5ad` format to store simulated data +* Allow reading data in `.h5ad` format for improved performance (courtesy of @eboileau) +* Improved logging and using rich progress bar for training +* Gene subsetting is now done only when merging datasets, which makes it possible to generate different combinations
+of simulated datasets
+* Added `scaden merge` command, which allows merging of previously created datasets
+
 ### Version 1.0.2

* General improvement of logging using the 'rich' library for colorized output diff --git a/README.md b/README.md index e1b0f11..bb7c5db 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,10 @@ ![Scaden](docs/img/scaden_logo.png) -![Scaden version](https://img.shields.io/badge/scaden-v1.0.2-cyan) - +![Scaden version](https://img.shields.io/badge/scaden-v1.1.0-cyan) ![MIT](https://img.shields.io/badge/License-MIT-black) ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) -![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) -![Downloads](https://static.pepy.tech/personalized-badge/scaden?period=total&units=international_system&left_color=blue&right_color=green&left_text=Downloads) +[![Downloads](https://pepy.tech/badge/scaden)](https://pepy.tech/project/scaden) ![Docker](https://github.com/kevinmenden/scaden/workflows/Docker/badge.svg) ![Scaden CI](https://github.com/kevinmenden/scaden/workflows/Scaden%20CI/badge.svg) @@ -39,7 +37,8 @@ To install Scaden via pip, simply run the following command: ### Bioconda -You can also install Scaden via bioconda, using: +Bioconda installation is currently not supported for the newest Scaden versions, but this will hopefully change soon. +It is therefore highly recommended to install via pip. `conda install -c bioconda scaden` diff --git a/docs/blog.md b/docs/blog.md new file mode 100644 index 0000000..8bf375f --- /dev/null +++ b/docs/blog.md @@ -0,0 +1,14 @@ +# Scaden Blog +Apart from the changelog, this is a more informal section where I write about new features
+that have been (or will be) implemented in Scaden.
+
+## Scaden v1.1.0 - Performance Improvements and `scaden merge` tool (21.03.2021)
+
+Scaden v1.1.0 brings significantly reduced memory consumption for the data simulation step, a frequently requested improvement.
+Now, instead of using about 4 GB of memory to simulate a small dataset, Scaden only uses 1 GB. Memory usage no longer increases
+with the number of datasets. This makes it possible to create training data from large collections of scRNA-seq datasets without
+needing excessive memory. Furthermore, Scaden now stores the simulated data in `.h5ad` format with the full list of genes.
+This way you can simulate from a scRNA-seq dataset once and combine it with other datasets in the future. To help with this,
+I've added the `scaden merge` command, which takes a list of datasets or a directory with `.h5ad` datasets and creates
+a new training dataset from it.
+ diff --git a/docs/changelog.md b/docs/changelog.md index 9683b0b..e5fa106 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,16 @@ # Scaden Changelog +### Version 1.1.0 + +* Reduced memory usage of `scaden simulate` significantly by performing simulation for one dataset at a time.
+* Using `.h5ad` format to store simulated data +* Allow reading data in `.h5ad` format for improved performance (courtesy of @eboileau) +* Improved logging and using rich progress bar for training +* Gene subsetting is now done only when merging datasets, which makes it possible to generate different combinations
+of simulated datasets
+* Added `scaden merge` command, which allows merging of previously created datasets
+
+ ### Version 1.0.2 * General improvement of logging using the 'rich' library for colorized output diff --git a/docs/index.md b/docs/index.md index f1d7083..00661f1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,3 +8,7 @@ at the [DZNE Tübingen](https://www.dzne.de/en/about-us/sites/tuebingen/) and th A paper describing Scaden has been published in Science Advances: [Deep-learning based cell composition analysis from tissue expression profiles](https://advances.sciencemag.org/content/6/30/eaba2619)
+
+For information about how to install Scaden, go to the [Installation](installation.md) section. Look in the [Usage](usage.md)
+section for general help with Scaden usage. In the [Datasets](datasets.md) section you'll find a list of prepared training datasets.
+You can also have a look at the [Blog](blog.md) section, where I summarize new features added to Scaden. \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index e7f6c21..6af04f0 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -10,7 +10,8 @@ To install Scaden via pip, simply run the following command: ## Bioconda -You can also install Scaden via bioconda, using:: +Bioconda installation is currently not supported for the newest Scaden versions, but this will hopefully change soon. +It is therefore highly recommended to install via pip. `conda install -c bioconda scaden` diff --git a/docs/usage.md b/docs/usage.md index 21b06b3..50d5032 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -120,13 +120,20 @@ An example for a pattern would be `*_counts.txt`. This pattern would find the fo Make sure to include an `*` in your pattern!

-This command will create the artificial samples in the current working directory. You can also specificy an output directory using the `--out` parameter. Scaden will also directly create a .h5ad file in this directory, which is the file you will need for training. By default, this file will be called `data.h5ad`, however you can change the prefix using the `--prefix` flag. +This command will create the artificial samples in the current working directory. You can also specify an output directory using the `--out` parameter.
+Scaden will also directly create a .h5ad file in this directory, which is the file you will need for training.
+By default, this file will be called `data.h5ad`; you can change the prefix using the `--prefix` flag.
+
+Alternatively, you can manually merge `.h5ad` files that have been created with `scaden simulate` (v1.1.0 or later) using
+the `scaden merge` command. Either point it to a directory of `.h5ad` files, or give it a comma-separated list of files
+to merge. Type `scaden merge --help` for details.
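+For illustration, assuming two simulated datasets `pbmc1.h5ad` and `pbmc2.h5ad` sit in the current directory (these file
+names are made up), a merge could look like this:
+
+```bash
+# Merge all .h5ad files found in the current directory into merged.h5ad
+scaden merge --data . --prefix merged
+
+# Or merge an explicit, comma-separated list of files
+scaden merge --files pbmc1.h5ad,pbmc2.h5ad --prefix merged
+```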
## File Formats For Scaden to work properly, your input files have to be correctly formatted. As long as you use Scaden's built-in functionality to generate the training data, you should have no problem with formatting there. The prediction file, however, you have to format yourself. This should be a file of shape m X n, where m are your features (genes) and n your samples. So each row corresponds to a gene, and each column to a sample. Leave the column name for the genes empty (just put a `\t` there). This is a rather standard format to store gene expression tables, so you should not have much work assuring that the -format fits. +format fits. Since version `v1.1.0` it is also possible to load data for simulation in `.h5ad` format for improved performance. In this case, the AnnData object should have
+a `Celltype` column in the `obs` field.
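+A minimal sketch of how such a file could be created (the names and sizes here are made up; the counts go into a sparse
+matrix, since the loader converts `X` with `.todense()`):
+
+```python
+import anndata as ad
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+
+# Toy data: 100 cells x 500 genes with random integer counts
+counts = csr_matrix(np.random.randint(0, 100, size=(100, 500)).astype(np.float32))
+obs = pd.DataFrame({"Celltype": ["T cell"] * 50 + ["B cell"] * 50})
+var = pd.DataFrame(index=[f"gene_{i}" for i in range(500)])
+
+adata = ad.AnnData(X=counts, obs=obs, var=var)
+adata.write("pbmc1_counts.h5ad")
+```
+
+A dataset written this way would then be picked up by a call like `scaden simulate --pattern "*_counts.h5ad" -f h5ad`.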
Your data can either be raw counts or normalized, just make sure that they are not in logarithmic space already. When loading a prediction file, Scaden applies its scaling procedure to it, which involves taking the logarithm of your counts. So as long as they are not already in logarithmic space, Scaden will be able to handle both raw and normalized counts / expression values. diff --git a/mkdocs.yml b/mkdocs.yml index e78c266..598578f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,5 +4,6 @@ nav: - Installation: installation.md - Usage: usage.md - Datasets: datasets.md + - Blog: blog.md - Changelog: changelog.md theme: readthedocs diff --git a/scaden/__main__.py b/scaden/__main__.py index 4200625..9977ccd 100644 --- a/scaden/__main__.py +++ b/scaden/__main__.py @@ -5,12 +5,13 @@ import rich.logging import logging import os +import tensorflow as tf from scaden.train import training from scaden.predict import prediction from scaden.process import processing from scaden.simulate import simulation from scaden.example import exampleData - +from scaden.merge import merge_datasets """ author: Kevin Menden @@ -30,7 +31,7 @@ ) ) -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" def main(): @@ -146,7 +147,7 @@ def predict(data_path, model_dir, outname, seed): "--var_cutoff", default=0.1, help="Filter out genes with a variance less than the specified cutoff. A low cutoff is recommended," - "this should only remove genes that are obviously uninformative.", + "this should only remove genes that are obviously uninformative.", ) def process(data_path, prediction_data, processed_path, var_cutoff): """ Process a dataset for training """ @@ -185,7 +186,8 @@ def process(data_path, prediction_data, processed_path, var_cutoff): "-u", multiple=True, default=["unknown"], - help="Specifiy cell types to merge into the unknown category. Specify this flag for every cell type you want to merge in unknown. [default: unknown]", + help="Specify cell types to merge into the unknown category. Specify this flag for every cell type you want to " + "merge into unknown. [default: unknown]", ) @click.option( "--prefix", "-p", default="data", help="Prefix to append to training .h5ad file [default: data]", ) -def simulate(out, data, cells, n_samples, pattern, unknown, prefix): +@click.option( + "--data-format", + "-f", + default="txt", + help="Data format of scRNA-seq data, can be 'txt' or 'h5ad' [default: 'txt']", +) +def simulate(out, data, cells, n_samples, pattern, unknown, prefix, data_format): """ Create artificial bulk RNA-seq data from scRNA-seq dataset(s)""" simulation( simulate_dir=out, @@ -203,21 +211,37 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix): pattern=pattern, unknown_celltypes=unknown, out_prefix=prefix, + fmt=data_format ) +""" +Merge simulated datasets +""" + + +@cli.command() +@click.option("--data", "-d", default=".", help="Directory containing simulated datasets (in .h5ad format)") +@click.option("--prefix", "-p", default="data", help="Prefix of output file [default: data]") +@click.option("--files", "-f", default=None, help="Comma-separated list of filenames to merge") +def merge(data, prefix, files): + """ Merge simulated datasets into one training dataset """ + merge_datasets(data_dir=data, prefix=prefix, files=files) + + """ Generate example data """ @cli.command() -@click.option("--out", "-o", default="./", help="Directory to store output files in") @click.option("--cells", "-c", default=10, help="Number of cells [default: 10]") +@click.option("--types", "-t", default=5, help="Number of cell types [default: 5]") @click.option("--genes", "-g", default=100, help="Number of genes [default: 100]") +@click.option("--out", "-o", default="./", help="Output directory [default: ./]") @click.option( "--samples", "-n", default=10, help="Number of bulk samples [default: 10]" ) -def example(cells, genes, samples, out): - exampleData(n_cells=cells, n_genes=genes, n_samples=samples, out_dir=out) +def example(cells, genes, samples, out, types): + """ Generate an example dataset """ + exampleData(n_cells=cells, n_genes=genes, n_samples=samples, out_dir=out, n_types=types) diff --git a/scaden/example.py b/scaden/example.py index 9e44bd5..925ad78 100644 --- a/scaden/example.py +++ b/scaden/example.py @@ -2,23 +2,26 @@ Generate random example data which allows for testing and to give users examples for the input format """ -import string import random import os import logging import pandas as pd import numpy as np +import sys logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -def exampleData(n_cells=10, n_genes=100, n_samples=10, out_dir="./"): +def exampleData(n_cells=10, n_genes=100, n_samples=10, n_types=5, out_dir="./"): """ Generate an example scRNA-seq count file :param n: number of cells :param g: number of genes """ + if n_types > n_cells: + logger.error("You can't specify more cell types than cells!") + sys.exit(1) # Generate example scRNA-seq data counts = np.random.randint(low=0, high=1000, size=(n_cells, n_genes)) @@ -28,7 +31,7 @@ def exampleData(n_cells=10, n_genes=100, n_samples=10, out_dir="./"): df = pd.DataFrame(counts, columns=gene_names) # Generate example celltype labels - celltypes = ["celltype"] * np.random.randint(low=2, high=n_cells - 1) + celltypes = ["celltype"] * n_types for i in range(len(celltypes)): celltypes[i] = celltypes[i] + str(i) celltype_list = random.choices(celltypes, k=n_cells) diff --git a/scaden/merge.py b/scaden/merge.py new file mode 100644 index 0000000..13f1302
--- /dev/null +++ b/scaden/merge.py @@ -0,0 +1,18 @@ +from scaden.simulation import BulkSimulator + +""" +Merge simulated datasets +""" + + +def merge_datasets(data_dir, prefix, files=None): + + bulk_simulator = BulkSimulator() + + if files: + files = files.split(",") + + # Merge the resulting datasets + bulk_simulator.merge_datasets(data_dir=data_dir, + files=files, + out_name=prefix + ".h5ad") diff --git a/scaden/model/scaden.py b/scaden/model/scaden.py index 03b1d4c..2fda9ab 100644 --- a/scaden/model/scaden.py +++ b/scaden/model/scaden.py @@ -10,11 +10,12 @@ import pandas as pd from anndata import read_h5ad import collections -from .functions import dummy_labels, sample_scaling -from tqdm import tqdm +from .functions import sample_scaling +from rich.progress import Progress, BarColumn logger = logging.getLogger(__name__) - +tf.get_logger().setLevel('ERROR') +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" class Scaden(object): """ @@ -297,25 +298,33 @@ def train(self, input_path, train_datasets): ) # Training loop - pbar = tqdm(range(self.num_steps)) - for step, _ in enumerate(pbar): + progress_bar = Progress( + "[bold blue]{task.description}", + "[bold cyan]Step: {task.fields[step]}, Loss: {task.fields[loss]}", + BarColumn(bar_width=None), + ) + + training_progress = progress_bar.add_task(self.model_name, total=self.num_steps, step=0, loss=1) + with progress_bar: + + for step in range(self.num_steps): + + x, y = self.data_iter.get_next() - x, y = self.data_iter.get_next() + with tf.GradientTape() as tape: + self.logits = self.model(x, training=True) + loss = self.compute_loss(self.logits, y) - with tf.GradientTape() as tape: - self.logits = self.model(x, training=True) - loss = self.compute_loss(self.logits, y) + grads = tape.gradient(loss, self.model.trainable_weights) - grads = tape.gradient(loss, self.model.trainable_weights) + optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) - optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) + progress_bar.update(training_progress, advance=1, step=step, loss=f"{loss:.4f}") - desc = f"Step: {step}, Loss: {loss:.4f}" - pbar.set_description(desc=desc) + # Collect garbage after 100 steps - otherwise runs out of memory + if step % 100 == 0: + gc.collect() - # Collect garbage after 100 steps - otherwise runs out of memory - if step % 100 == 0: - gc.collect() # Save the trained model self.model.save(self.model_dir) @@ -326,11 +335,10 @@ def train(self, input_path, train_datasets): os.path.join(self.model_dir, "genes.txt"), sep="\t" ) - def predict(self, input_path, out_name="scaden_predictions.txt"): + def predict(self, input_path): """ Perform prediction with a pre-trained model - :param out_dir: path to store results in - :param training_data: the dataset used for training + :param input_path: prediction data path :return: """ # Load signature genes and celltype labels @@ -347,4 +355,4 @@ pred_df = pd.DataFrame( predictions, columns=self.labels, index=self.sample_names ) - return pred_df \ No newline at end of file + return pred_df diff --git a/scaden/predict.py b/scaden/predict.py index b0f567d..78cbb14 100644 --- a/scaden/predict.py +++ b/scaden/predict.py @@ -52,7 +52,7 @@ def prediction(model_dir, data_path, out_name, seed=0): ) # Predict ratios preds_256 = cdn256.predict( - input_path=data_path, out_name="scaden_predictions_m256.txt" + input_path=data_path ) # Mid model predictions @@ -65,7 +65,7 @@
) # Predict ratios preds_512 = cdn512.predict( - input_path=data_path, out_name="scaden_predictions_m512.txt" + input_path=data_path ) # Large model predictions @@ -78,7 +78,7 @@ def prediction(model_dir, data_path, out_name, seed=0): ) # Predict ratios preds_1024 = cdn1024.predict( - input_path=data_path, out_name="scaden_predictions_m1024.txt" + input_path=data_path ) # Average predictions diff --git a/scaden/preprocessing/__init__.py b/scaden/preprocessing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py deleted file mode 100644 index c5d80a6..0000000 --- a/scaden/preprocessing/bulk_simulation.py +++ /dev/null @@ -1,372 +0,0 @@ -######################################################################### -## Simulation of artificial bulk RNA-seq datasets from scRNA-seq data # -######################################################################## -import logging -import sys -import os -import gc -import glob -import pandas as pd -import numpy as np -from tqdm import tqdm -from pathlib import Path - -logger = logging.getLogger(__name__) - - -def create_fractions(no_celltypes): - """ - Create random fractions - :param no_celltypes: number of fractions to create - :return: list of random fracs of length no_cellttypes - """ - fracs = np.random.rand(no_celltypes) - fracs_sum = np.sum(fracs) - fracs = np.divide(fracs, fracs_sum) - return fracs - - -def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False): - """ - Generate artifical bulk subsample with random fractions of celltypes - If sparse is set to true, add random celltypes to the missing celltypes - :param cells: scRNA-seq expression matrix - :param labels: celltype labels of single cell data (same order) - :param sample_size: number of cells to use - :param celltypes: all celltypes available in all datasets - :param available_celltypes: celltypes available in currently processed dataset - :param sparse: whether to create a sparse datasets (missing celltypes) - :return: expression of artificial subsample along with celltype fractions - """ - - if sparse: - no_keep = np.random.randint(1, len(available_celltypes)) - keep = np.random.choice( - list(range(len(available_celltypes))), size=no_keep, replace=False - ) - available_celltypes = [available_celltypes[i] for i in keep] - - no_avail_cts = len(available_celltypes) - - # Create fractions for available celltypes - fracs = create_fractions(no_celltypes=no_avail_cts) - samp_fracs = np.multiply(fracs, sample_size) - samp_fracs = list(map(int, samp_fracs)) - - # Make complete fracions - fracs_complete = [0] * len(celltypes) - for i, act in enumerate(available_celltypes): - idx = celltypes.index(act) - fracs_complete[idx] = fracs[i] - - artificial_samples = [] - for i in range(no_avail_cts): - ct = available_celltypes[i] - cells_sub = x.loc[np.array(y["Celltype"] == ct), :] - cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i]) - cells_sub = cells_sub.iloc[cells_fraction, :] - artificial_samples.append(cells_sub) - - df_samp = pd.concat(artificial_samples, axis=0) - df_samp = df_samp.sum(axis=0) - - return (df_samp, fracs_complete) - - -def create_subsample_dataset(x, y, sample_size, celltypes, no_samples): - """ - Generate many artifial bulk samples with known fractions - This function will create normal and sparse samples (no_samples) - :param cells: - :param labels: - :param sample_size: - :param no_celltypes: - :param no_samples: - 
:return: dataset and corresponding labels - """ - X = [] - Y = [] - - available_celltypes = list(set(y["Celltype"].tolist())) - - # Create normal samples - pbar = tqdm(range(no_samples)) - pbar.set_description(desc="Normal samples") - for _ in pbar: - sample, label = create_subsample( - x, y, sample_size, celltypes, available_celltypes - ) - X.append(sample) - Y.append(label) - - # Create sparse samples - n_sparse = int(no_samples) - pbar = tqdm(range(n_sparse)) - pbar.set_description(desc="Sparse samples") - for _ in pbar: - sample, label = create_subsample( - x, y, sample_size, celltypes, available_celltypes, sparse=True - ) - X.append(sample) - Y.append(label) - X = pd.concat(X, axis=1).T - Y = pd.DataFrame(Y, columns=celltypes) - - return (X, Y) - - -def filter_for_celltypes(x, y, celltypes): - """ - Filter data for cells belonging to specified celltypes - :param x: - :param y: - :param celltypes: - :return: - """ - cts = list(y["Celltype"]) - keep = [elem in celltypes for elem in cts] - x = x.loc[keep, :] - y = y.loc[keep, :] - return (x, y) - - -def shuffle_dataset(x, y): - """ - Shuffle dataset while keeping x and y in synch - :param x: - :param y: - :return: - """ - idx = np.random.permutation(x.index) - x_shuff = x.reindex(idx) - y_shuff = y.reindex(idx) - return (x_shuff, y_shuff) - - -def filter_matrix_signature(mat, genes): - """ - Filter expression matrix using given genes - Accounts for the possibility that some genes might not be in the matrix, for these genes - a column with zeros is added to the matrix - :param mat: - :param genes: - :return: filtered matrix - """ - n_cells = mat.shape[0] - avail_genes = mat.columns - filt_genes = [g for g in genes if g in avail_genes] - missing_genes = [g for g in genes if g not in avail_genes] - mat = mat[filt_genes] - for mg in missing_genes: - mat[mg] = np.zeros(n_cells) - mat = mat[genes] - return mat - - -def load_celltypes(path, name): - """ Load the cell type information """ - try: - y = pd.read_table(path) - # Check if has Celltype column - if not "Celltype" in y.columns: - logger.error( - f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column." - ) - sys.exit() - except FileNotFoundError as e: - logger.error( - f"No celltypes file found for {name}. It should be called {name}_celltypes.txt." - ) - sys.exit(e) - - return y - - -def load_dataset(name, dir, pattern): - """ - Load a dataset given its name and the directory - :param name: name of the dataset - :param dir: directory containing the data - :param sig_genes: the signature genes for filtering - :return: X, Y - """ - pattern = pattern.replace("*", "") - print("Loading " + name + " dataset ...") - - # Try to load celltypes - try: - y = pd.read_table(os.path.join(dir, name + "_celltypes.txt")) - # Check if has Celltype column - print(y.columns) - if not "Celltype" in y.columns: - logger.error( - f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column." - ) - sys.exit() - except FileNotFoundError as e: - logger.error( - f"No celltypes file found for {name}. It should be called {name}_celltypes.txt." - ) - sys.exit(e) - - # Try to load data file - try: - x = pd.read_table(os.path.join(dir, name + pattern), index_col=0) - except FileNotFoundError as e: - logger.error( - f"No counts file found for {name}. 
Was looking for file {name + pattern}" - ) - - # Check that celltypes and count file have same number of cells - if not y.shape[0] == x.shape[0]: - logger.error( - f"Different number of cells in {name}_celltypes and {name + pattern}! Make sure the data has been processed correctly." - ) - sys.exit(1) - - return (x, y) - - -def merge_unkown_celltypes(y, unknown_celltypes): - """ - Merge all unknown celltypes together - :param x: - :param y: - :param unknown_celltypes: - :return: - """ - celltypes = list(y["Celltype"]) - new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes] - y["Celltype"] = new_celltypes - return y - - -def collect_celltypes(ys): - """ - Collect all available celltypes given all dataset labels - :param ys: list of dataset labels - :return: list of available celltypes - """ - ct_list = [list(set(y["Celltype"].tolist())) for y in ys] - celltypes = set() - for ct in ct_list: - celltypes = celltypes.union(set(ct)) - celltypes = list(celltypes) - return celltypes - - -def get_common_genes(xs, type="intersection"): - """ - Get common genes for all matrices xs - Can either be the union or the intersection (default) of all genes - :param xs: cell x gene matrices - :return: list of common genes - """ - genes = [] - for x in xs: - genes.append(list(x.columns)) - - genes = [set(g) for g in genes] - com_genes = genes[0] - if type == "union": - for gi in range(1, len(genes)): - com_genes = com_genes.union(genes[gi]) - elif type == "intersection": - for gi in range(1, len(genes)): - com_genes = com_genes.intersection(genes[gi]) - - else: - logging.critical("Wrong type selected to get common genes. Exiting.") - sys.exit() - - if len(com_genes) == 0: - logging.critical("No common genes found. Exiting.") - sys.exit() - - return list(com_genes) - - -def generate_signature(x, y): - """ - Generate signature of matrix using celltypes y - :param x: expression matrix - :param y: celltypes - :return: mean-expression per celltype - """ - - signature_matrix = [] - celltypes = list(set(y["Celltype"])) - for ct in celltypes: - ct_exp = x.loc[np.array(y["Celltype"] == ct), :] - ct_exp = ct_exp.mean(axis=0) - signature_matrix.append(ct_exp) - - signature_matrix = pd.concat(signature_matrix, axis=1) - signature_matrix.columns = celltypes - return signature_matrix - - -def simulate_bulk( - sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes -): - """ - Simulate artificial bulk samples from single cell datasets - :param sample_size: number of cells per sample - :param num_samples: number of sample to simulate - :param data_path: path to the data directory - :param out_dir: output directory - :param pattern of the data files - :param unknown_celltypes: which celltypes to merge into the unknown class - """ - - num_samples = int( - num_samples / 2 - ) # divide by two so half is sparse and half is normal samples - - # List available datasets - if not data_path.endswith("/"): - data_path += "/" - files = glob.glob(os.path.join(data_path, pattern)) - files = [os.path.basename(x) for x in files] - datasets = [x.replace(pattern.replace("*", ""), "") for x in files] - - if len(datasets) == 0: - logging.error("No datasets found! 
Have you specified the pattern correctly?") - sys.exit(1) - - logger.info("Datasets: [cyan]" + str(datasets) + "[/]") - - # Load datasets - xs, ys = [], [] - for i, n in enumerate(datasets): - x, y = load_dataset(n, data_path, pattern=pattern) - xs.append(x) - ys.append(y) - - # Get common gene list - all_genes = get_common_genes(xs, type="intersection") - logger.info("No. of common genes: " + str(len(all_genes))) - xs = [filter_matrix_signature(m, all_genes) for m in xs] - - # Merge unknown celltypes - logger.info("Merging unknown cell types: " + str(unknown_celltypes)) - for i in range(len(ys)): - ys[i] = merge_unkown_celltypes(ys[i], unknown_celltypes) - - # Collect all available celltypes - celltypes = collect_celltypes(ys) - logger.info("Available celltypes: " + str(celltypes)) - pd.DataFrame(celltypes).to_csv(out_dir + "celltypes.txt", sep="\t") - - # Create datasets - for i in range(len(xs)): - logger.info("Subsampling " + datasets[i] + "...") - - tmpx, tmpy = create_subsample_dataset( - xs[i], ys[i], sample_size, celltypes, num_samples - ) - tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False) - tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False) - gc.collect() - - logger.info("[bold green]Finished data simulation!") diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py deleted file mode 100644 index 906f324..0000000 --- a/scaden/preprocessing/create_h5ad_file.py +++ /dev/null @@ -1,122 +0,0 @@ -""" -Combine artificial bulk datasets and optionally other datasets into h5ad files -usable for scaden training - -When using additional datasets, they should be in similar format and best have the same output cell types. -""" -import gc -import anndata -import glob -import os -import sys -import logging -import pandas as pd -import numpy as np - -logger = logging.getLogger(__name__) - - -def parse_data(x_path, y_path): - """ - Parse data and labels and divide them into training and testset - :param x_path: - :param y_path: - :return: training and test data and labels - """ - # Load the data - try: - x = pd.read_table(x_path, sep="\t") - y = pd.read_table(y_path, sep="\t") - except FileNotFoundError as e: - logging.error(f" Could not find simulated data files: {e}") - sys.exit() - labels = list(y.columns) - - # Transform Y to numpy array and split in train and testset - yseries = [] - - for i in range(y.shape[0]): - yseries.append(list(y.iloc[i])) - y = np.array(yseries) - - return x, y, labels - - -def load_celltypes(brain_training_dir): - - celltypes_path = os.path.join(brain_training_dir, "celltypes.txt") - celltypes = pd.read_csv(celltypes_path, sep="\t") - celltypes = list(celltypes.iloc[:, 1]) - return celltypes - - -def sort_celltypes(ratios, labels, ref_labels): - """ - Bring ratios in correct order of cell types - :param ratios: - :param labels: - :param ref_labels: - :return: - """ - idx = [labels.index(x) for x in ref_labels] - ratios = ratios[:, idx] - return ratios - - -def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"): - """ - Create h5ad file from simulated data - """ - logger.info("[bold]Creating h5ad file") - # List available datasets - files = glob.glob(data_dir + pattern) - files = [os.path.basename(x) for x in files] - datasets = [x.replace(pattern.replace("*", ""), "") for x in files] - - # get celltypes - celltypes = load_celltypes(data_dir) - logger.info(f"Celltypes: {celltypes}") - logger.info(f"Found datasets: {datasets}") - adata = [] - me_dict = {} - - # 
Create adata datasets for each - for i, train_file in enumerate(datasets): - - rna_file = os.path.join(data_dir, train_file + "_samples.txt") - ratios_file = os.path.join(data_dir, train_file + "_labels.txt") - - x, y, labels = parse_data(rna_file, ratios_file) - - # sort y - y = sort_celltypes(y, labels, celltypes) - test = [labels.index(x) for x in celltypes] - labels = [labels[i] for i in test] - - x = x.sort_index(axis=1) - ratios = pd.DataFrame(y, columns=celltypes) - ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index) - - logger.info("Processing " + str(train_file)) - x = pd.DataFrame(x) - adata.append( - anndata.AnnData( - X=x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(x)) - ) - ) - - for i in range(1, len(adata)): - logger.info("Concatenating " + str(i)) - adata[0] = adata[0].concatenate(adata[1]) - del adata[1] - gc.collect() - logger.info(len(adata)) - adata = adata[0] - - # add cell types and signature genes - adata.uns["cell_types"] = celltypes - adata.uns["unknown"] = unknown - - # save data - adata.write(out_path) - logger.info(f"Created h5ad file: {out_path}") diff --git a/scaden/simulate.py b/scaden/simulate.py index 617775f..566a5b3 100644 --- a/scaden/simulate.py +++ b/scaden/simulate.py @@ -1,6 +1,5 @@ -from scaden.preprocessing.create_h5ad_file import create_h5ad_file -from scaden.preprocessing.bulk_simulation import simulate_bulk -import os +from scaden.simulation import BulkSimulator + """ Simulation of artificial bulk RNA-seq samples from scRNA-seq data and subsequent formatting in .h5ad file for training with Scaden @@ -8,13 +7,21 @@ def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern, - unknown_celltypes, out_prefix): + unknown_celltypes, out_prefix, fmt): - # Perform the bulk simulation unknown_celltypes = list(unknown_celltypes) - simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern, - unknown_celltypes) + bulk_simulator = BulkSimulator(sample_size=sample_size, + num_samples=num_samples, + data_path=data_dir, + out_dir=simulate_dir, + pattern=pattern, + unknown_celltypes=unknown_celltypes, + fmt=fmt) + + # Perform dataset simulation + bulk_simulator.simulate() - # Create the h5ad training data file - out_name = os.path.join(simulate_dir, out_prefix + ".h5ad") - create_h5ad_file(simulate_dir, out_name, unknown_celltypes) + # Merge the resulting datasets + bulk_simulator.merge_datasets(data_dir=simulate_dir, + files=bulk_simulator.dataset_files, + out_name=out_prefix + ".h5ad") diff --git a/scaden/simulation/__init__.py b/scaden/simulation/__init__.py new file mode 100644 index 0000000..9b5b492 --- /dev/null +++ b/scaden/simulation/__init__.py @@ -0,0 +1 @@ +from .bulk_simulator import BulkSimulator diff --git a/scaden/simulation/bulk_simulator.py b/scaden/simulation/bulk_simulator.py new file mode 100644 index 0000000..80cf959 --- /dev/null +++ b/scaden/simulation/bulk_simulator.py @@ -0,0 +1,322 @@ +import logging +import glob +import os +import sys +import gc + +import pandas as pd +import anndata as ad +import numpy as np + +from rich.progress import BarColumn, Progress
logger = logging.getLogger(__name__)


def create_fractions(no_celltypes):
    """
    Create random fractions
    :param no_celltypes: number of fractions to create
    :return: list of random fractions of length no_celltypes
    """
    fracs = np.random.rand(no_celltypes)
    fracs_sum = np.sum(fracs)
    fracs = np.divide(fracs, fracs_sum)
    return fracs


class BulkSimulator(object):
    """
    BulkSimulator class
for the simulation of artificial bulk samples + from scRNA-seq datasets + + :param sample_size: number of cells per sample + :param num_samples: number of samples to simulate + :param data_path: path to the data directory + :param out_dir: output directory + :param pattern: file pattern of the datasets + :param unknown_celltypes: which celltypes to merge into the unknown class + :param fmt: the format of the input files, can be txt or h5ad + """ + + def __init__(self, sample_size=100, + num_samples=1000, + data_path="./", + out_dir="./", + pattern="*_counts.txt", + unknown_celltypes=None, + fmt="txt"): + if unknown_celltypes is None: + unknown_celltypes = ["unknown"] + + self.sample_size = sample_size + self.num_samples = num_samples // 2 + self.data_path = data_path + self.out_dir = out_dir + self.pattern = pattern + self.unknown_celltypes = unknown_celltypes + self.format = fmt + self.datasets = [] + self.dataset_files = [] + + def simulate(self): + """ Simulate artificial bulk datasets + """ + + # List available datasets + if not self.data_path.endswith("/"): + self.data_path += "/" + files = glob.glob(os.path.join(self.data_path, self.pattern)) + files = [os.path.basename(x) for x in files] + self.datasets = [x.replace(self.pattern.replace("*", ""), "") for x in files] + self.dataset_files = [os.path.join(self.out_dir, x + ".h5ad") for x in self.datasets] + + if len(self.datasets) == 0: + logging.error("No datasets found! Have you specified the pattern correctly?") + sys.exit(1) + + logger.info("Datasets: [cyan]" + str(self.datasets) + "[/]") + + # Loop over datasets and simulate bulk data + for i, dataset in enumerate(self.datasets): + gc.collect() + logger.info(f"[bold u]Simulating data from {dataset}") + self.simulate_dataset(dataset) + + logger.info("[bold green]Finished data simulation!") + + def simulate_dataset(self, dataset): + """ + Simulate bulk data from a single scRNA-seq dataset + @param dataset: name of the dataset to simulate from + @type dataset: str + @return: None; the simulated samples are written to <out_dir>/<dataset>.h5ad + @rtype: None + """ + + # Load the dataset + data_x, data_y = self.load_dataset(dataset) + + # Merge unknown celltypes + logger.info(f"Merging unknown cell types: {self.unknown_celltypes}") + data_y = self.merge_unknown_celltypes(data_y) + + logger.info(f"Subsampling [bold cyan]{dataset}[/] ...") + + # Extract celltypes + celltypes = list(set(data_y["Celltype"].tolist())) + tmp_x, tmp_y = self.create_subsample_dataset(data_x, data_y, celltypes=celltypes) + + tmp_x = tmp_x.sort_index(axis=1) + ratios = pd.DataFrame(tmp_y, columns=celltypes) + ratios["ds"] = pd.Series(np.repeat(dataset, tmp_y.shape[0]), index=ratios.index) + + ann_data = ad.AnnData(X=tmp_x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(tmp_x))) + ann_data.uns['unknown'] = self.unknown_celltypes + ann_data.uns['cell_types'] = celltypes + + ann_data.write(os.path.join(self.out_dir, dataset + ".h5ad")) + + def load_dataset(self, dataset): + """ + Load a dataset + @param dataset: name of the dataset to load + @type dataset: str + @return: count matrix x and celltype labels y + @rtype: (pd.DataFrame, pd.DataFrame) + """ + pattern = self.pattern.replace("*", "") + logger.info(f"Loading [cyan]{dataset}[/] dataset ...") + dataset_counts = dataset + pattern + dataset_celltypes = dataset + "_celltypes.txt" + + # Load data in .txt format + if self.format == 'txt': + # Try to load celltypes + try: + y = pd.read_table(os.path.join(self.data_path, dataset_celltypes)) + # Check if it has a Celltype column + if "Celltype" not in y.columns: + logger.error( + f"No 'Celltype' column found in {dataset}_celltypes.txt! Please make sure to include this " + f"column.
" + ) + sys.exit() + except FileNotFoundError as e: + logger.error( + f"No celltypes file found for [cyan]{dataset}[/]. It should be called [cyan]{dataset}_celltypes.txt." + ) + sys.exit(e) + + # Try to load data file + try: + x = pd.read_table(os.path.join(self.data_path, dataset_counts), index_col=0) + except FileNotFoundError as e: + logger.error( + f"No counts file found for [cyan]{dataset}[/]. Was looking for file [cyan]{dataset_counts}[/]" + ) + sys.exit(e) + + # Check that celltypes and count file have the same number of cells + if not y.shape[0] == x.shape[0]: + logger.error( + f"Different number of cells in {dataset}_celltypes and {dataset_counts}! Make sure the data has " + f"been processed correctly. " + ) + sys.exit(1) + + # Load data in .h5ad format + elif self.format == 'h5ad': + try: + data_h5ad = ad.read_h5ad(os.path.join(self.data_path, dataset_counts)) + except FileNotFoundError as e: + logger.error( + f"No h5ad file found for [cyan]{dataset}[/]. Was looking for file [cyan]{dataset_counts}" + ) + sys.exit(e) + # cell types + try: + y = pd.DataFrame(data_h5ad.obs.Celltype) + y.reset_index(inplace=True, drop=True) + except Exception as e: + logger.error( + f"Celltype attribute not found for [cyan]{dataset}" + ) + sys.exit(e) + # counts + x = pd.DataFrame(data_h5ad.X.todense()) + x.index = data_h5ad.obs_names + x.columns = data_h5ad.var_names + del data_h5ad + else: + logger.error( + f"Unsupported file format {self.format}!" + ) + sys.exit(1) + + return x, y + + def merge_unknown_celltypes(self, y): + """ + Merge all unknown celltypes together + :param y: list of cell type labels + :return: list of cell types with unknown cell types merged into "Unknown" type + """ + celltypes = list(y["Celltype"]) + new_celltypes = ["Unknown" if x in self.unknown_celltypes else x for x in celltypes] + y["Celltype"] = new_celltypes + return y + + def create_subsample_dataset(self, x, y, celltypes): + """ + Generate many artifial bulk samples with known fractions + This function will create normal and sparse samples (no_samples) + @param x: + @param y: + @param celltypes: + @return: + """ + sim_x = [] + sim_y = [] + + # Create normal samples + progress_bar = Progress( + "[bold blue]{task.description}", + "[bold cyan]{task.fields[samples]}", + BarColumn(bar_width=None) + ) + with progress_bar: + normal_samples_progress = progress_bar.add_task("Normal samples", total=self.num_samples, samples=0) + sparse_samples_progress = progress_bar.add_task("Sparse samples", total=self.num_samples, samples=0) + + # Create normal samples + for i in range(self.num_samples): + progress_bar.update(normal_samples_progress, advance=1, samples=i+1) + sample, label = self.create_subsample(x, y, celltypes) + sim_x.append(sample) + sim_y.append(label) + + # Create sparase samples + for i in range(self.num_samples): + progress_bar.update(sparse_samples_progress, advance=1, samples=i+1) + sample, label = self.create_subsample(x, y, celltypes, sparse=True) + sim_x.append(sample) + sim_y.append(label) + + sim_x = pd.concat(sim_x, axis=1).T + sim_y = pd.DataFrame(sim_y, columns=celltypes) + + return sim_x, sim_y + + def create_subsample(self, x, y, celltypes, sparse=False): + """ + Generate artifical bulk subsample with random fractions of celltypes + If sparse is set to true, add random celltypes to the missing celltypes + + @param x: + @param y: + @param celltypes: + @param sparse: + @return: + """ + available_celltypes = celltypes + if sparse: + no_keep = np.random.randint(1, len(available_celltypes)) + keep = 
np.random.choice( + list(range(len(available_celltypes))), size=no_keep, replace=False + ) + available_celltypes = [available_celltypes[i] for i in keep] + + no_avail_cts = len(available_celltypes) + + # Create fractions for available celltypes + fracs = create_fractions(no_celltypes=no_avail_cts) + samp_fracs = np.multiply(fracs, self.sample_size) + samp_fracs = list(map(int, samp_fracs)) + + # Make complete fractions + fracs_complete = [0] * len(celltypes) + for i, act in enumerate(available_celltypes): + idx = celltypes.index(act) + fracs_complete[idx] = fracs[i] + + artificial_samples = [] + for i in range(no_avail_cts): + ct = available_celltypes[i] + cells_sub = x.loc[np.array(y["Celltype"] == ct), :] + cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i]) + cells_sub = cells_sub.iloc[cells_fraction, :] + artificial_samples.append(cells_sub) + + df_samp = pd.concat(artificial_samples, axis=0) + df_samp = df_samp.sum(axis=0) + + return df_samp, fracs_complete + + @staticmethod + def merge_datasets(data_dir="./", files=None, out_name="data.h5ad"): + """ + Merge multiple simulated datasets into one training dataset + + @param data_dir: directory to look for datasets + @param files: list of files to merge + @param out_name: name of the merged .h5ad file + @return: None; the merged dataset is written to out_name + """ + non_celltype_obs = ["ds", "batch"] + if not files: + files = glob.glob(os.path.join(data_dir, "*.h5ad")) + + logger.info(f"Merging datasets: {files} into [bold cyan]{out_name}") + + # Load the first file + adata = ad.read_h5ad(files[0]) + + for i in range(1, len(files)): + adata = adata.concatenate(ad.read_h5ad(files[i]), uns_merge="same") + + combined_celltypes = list(adata.obs.columns) + combined_celltypes = [x for x in combined_celltypes if x not in non_celltype_obs] + for ct in combined_celltypes: + adata.obs[ct].fillna(0, inplace=True) + + adata.uns['cell_types'] = combined_celltypes + adata.write(out_name) diff --git a/scaden/train.py b/scaden/train.py index 7f082c1..efe5f78 100644 --- a/scaden/train.py +++ b/scaden/train.py @@ -10,8 +10,6 @@ # Imports import logging -import tensorflow as tf -from anndata import read_h5ad from scaden.model.architectures import architectures from scaden.model.scaden import Scaden diff --git a/setup.py b/setup.py index 004ccd9..3233eb5 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -version = "1.0.2" +version = "1.1.0" with open("README.md", "r", encoding="UTF-8") as fh: @@ -38,7 +38,6 @@ "scikit-learn", "tensorflow>=2.0", "anndata", - "tqdm", "click", "h5py~=2.10.0", "rich",