From be7d08a93443338e7608bb1cf7f856793384cc7e Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 07:00:44 +0100 Subject: [PATCH 1/9] fixed docker to install from pip --- Dockerfile | 9 +++++---- environment.yml | 10 ---------- 2 files changed, 5 insertions(+), 14 deletions(-) delete mode 100644 environment.yml diff --git a/Dockerfile b/Dockerfile index 212a600..481b06a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ -FROM continuumio/miniconda3 +FROM ubuntu -COPY environment.yml / -RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/scaden/bin:$PATH \ No newline at end of file +RUN apt-get update && apt-get upgrade -y +RUN apt-get install python3 -y +RUN apt-get install python3-pip -y +RUN pip3 install scaden \ No newline at end of file diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 515d843..0000000 --- a/environment.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: scaden -channels: - - bioconda - - r - - defaults - - conda-forge -dependencies: - - scaden=0.9.4=py_0 -prefix: /home/kevin/anaconda3/envs/scaden - From 0d9777cf72ace74a048d1ad3f7edd2b279d27c8d Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 07:04:21 +0100 Subject: [PATCH 2/9] fixed typos --- .vscode/settings.json | 4 ++-- scaden/model/scaden.py | 2 +- scaden/preprocessing/bulk_simulation.py | 4 ++-- scaden/preprocessing/create_h5ad_file.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 338af49..6a2331d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,6 @@ { "python.pythonPath": "/home/kevin/anaconda3/envs/scaden/bin/python", - "python.linting.pylintEnabled": false, + "python.linting.pylintEnabled": true, "python.linting.enabled": true, - "python.linting.flake8Enabled": true + "python.linting.flake8Enabled": false } \ No newline at end of file diff --git a/scaden/model/scaden.py b/scaden/model/scaden.py index 
ab252a8..d224f92 100644 --- a/scaden/model/scaden.py +++ b/scaden/model/scaden.py @@ -295,7 +295,7 @@ def train(self, input_path, train_datasets): pd.DataFrame(self.sig_genes).to_csv(self.model_dir + "/genes.txt", sep="\t") - def predict(self, input_path, out_name="cdn_predictions.txt"): + def predict(self, input_path, out_name="scaden_predictions.txt"): """ Perform prediction with a pre-trained model :param out_dir: path to store results in diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py index 3db2d3c..8c83f5a 100644 --- a/scaden/preprocessing/bulk_simulation.py +++ b/scaden/preprocessing/bulk_simulation.py @@ -175,7 +175,7 @@ def load_dataset(name, dir, pattern): try: y = pd.read_table(dir + name + "_celltypes.txt") except FileNotFoundError as e: - logger.error(f"No celltypes file found for {name}. It should be called {name}_celltypes.txt.") + logger.error(f" No celltypes file found for {name}. It should be called {name}_celltypes.txt.") sys.exit() x = pd.read_table(dir + name + pattern, index_col=0) @@ -285,7 +285,7 @@ def simulate_bulk( datasets = [x.split("_")[0] for x in files] if len(datasets) == 0: - logging.error("No datasetes fround! Have you specified the pattern correctly?") + logging.error("No datasets fround! 
Have you specified the pattern correctly?") sys.exit() print("Datasets: " + str(datasets)) diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py index 3ef423b..6ce1656 100644 --- a/scaden/preprocessing/create_h5ad_file.py +++ b/scaden/preprocessing/create_h5ad_file.py @@ -27,7 +27,7 @@ def parse_data(x_path, y_path): x = pd.read_table(x_path, sep="\t") y = pd.read_table(y_path, sep="\t") except FileNotFoundError as e: - logging.error(f"Could not find simulated data files: {e}") + logging.error(f" Could not find simulated data files: {e}") sys.exit() labels = list(y.columns) From 82ae2e4f303bad4f5f58e492e1b741ac212e14fe Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 07:22:12 +0100 Subject: [PATCH 3/9] Improved documentation --- docs/usage.md | 13 +++++++++---- setup.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index ab71666..a64bfda 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -96,10 +96,9 @@ Once you have done this, you can use Scaden's command `scaden simulate` to gener The first step is to process your scRNA-seq dataset(s) you want to use for training. I used Scanpy for this, and would therefore recommend to do the same, but you can of course use other software for this purpose. I've uploaded the scripts I used to preprocess the data used for the Scaden paper [here](https://doi.org/10.6084/m9.figshare.8234030.v1). Mainly you have to normalize your count data -and create a file containing the cell type labels. The file for the cell type labels should be of size (n x 2), where n is the number of cells -you have in your data. The two columns correspond to a label for your cells, and a 'Celltype' column. In fact, the only necessary column is the 'Celltype' -column, which Scaden uses to extract the information. The count data should be of size (n x g), where g is the number of genes and n is the number of samples. 
-The order must be the same as for the cell type labels. +and create a file containing the cell type labels. +The file for the cell type labels should be of size (n x 1), where n is the number of cells +you have in your data. The single column in this file should be labeled 'Celltype'. You can have extra columns if you like, as long as you have a 'Celltype' column which specifies the cell type label in the correct order. The count data should be of size (n x g), where g is the number of genes and n is the number of samples. The order must be the same as for the cell type labels. #### Bulk simulation Once the data is processed, you can use the command `scaden simulate` to generate your artificial bulk samples for training. @@ -116,6 +115,12 @@ As example, you can generate 1000 artificial bulk samples from 100 cells per sam scaden simulate --cells 100 --n_samples 1000 --data --pattern ``` +An example for a pattern would be `*_counts.txt`. This pattern would find the following dataset: +* `dataset_counts.txt` +* `dataset_celltypes.txt` + +Make sure to include an `*` in your pattern! + This command will create the artificial samples in the current working directory. You can also specificy an output directory using the `--out` parameter. Scaden will also directly create a .h5ad file in this directory, which is the file you will need for training. By default, this file will be called `data.h5ad`, however you can change the prefix using the `--prefix` flag. 
diff --git a/setup.py b/setup.py index 03cd660..b2a448a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -version = '0.9.5' +version = '0.9.6' with open("README.md", "r", encoding="UTF-8") as fh: From ae00bb71ddc3d7647eac52c510b8f12c0c8c9ef7 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 07:30:32 +0100 Subject: [PATCH 4/9] added better warning to simulation --- docs/installation.md | 14 ++++++-------- scaden/preprocessing/bulk_simulation.py | 23 +++++++++++++++++------ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index c656db0..23afed2 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -3,18 +3,16 @@ Scaden be easily installed on a Linux system, and should also work on Mac. There are currently two options for installing Scaden, either using [Bioconda](https://bioconda.github.io/) or via [pip](https://pypi.org/). -## Bioconda -Installation via Bioconda is the preferred route of installation, and we highly recommend using conda. To install Scaden, use: - -`conda install -c bioconda scaden` +## pip +To install Scaden via pip, simply run the following command: -It is always recommended to create a separate conda environment for installation. 
+`pip install scaden` -## pip -If you don't want to use conda, you can also install Scaden using pip: +## Bioconda +You can also install Scaden via bioconda, using: -`pip install scaden` +`conda install -c bioconda scaden` ## Docker diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py index 8c83f5a..8cc495c 100644 --- a/scaden/preprocessing/bulk_simulation.py +++ b/scaden/preprocessing/bulk_simulation.py @@ -160,6 +160,22 @@ def filter_matrix_signature(mat, genes): mat = mat[genes] return mat +def load_celltypes(path, name): + """ Load the cell type information """ + try: + y = pd.read_table(path) + # Check if has Celltype column + if not 'Celltype' in y.columns: + logger.error(f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column.") + sys.exit() + except FileNotFoundError as e: + logger.error(f"No celltypes file found for {name}. It should be called {name}_celltypes.txt.") + sys.exit(e) + + return y + + + def load_dataset(name, dir, pattern): """ @@ -172,12 +188,7 @@ def load_dataset(name, dir, pattern): pattern = pattern.replace("*", "") print("Loading " + name + " dataset ...") - try: - y = pd.read_table(dir + name + "_celltypes.txt") - except FileNotFoundError as e: - logger.error(f" No celltypes file found for {name}. 
It should be called {name}_celltypes.txt.") - sys.exit() - + y = load_celltypes(dir + name + "_celltypes.txt", name) x = pd.read_table(dir + name + pattern, index_col=0) return (x, y) From dcb3da8f730a20dde44892ada744735c225981a7 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 08:34:10 +0100 Subject: [PATCH 5/9] updated changelog --- docs/index.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/index.md b/docs/index.md index be88f3c..bc2b7f5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,27 +15,31 @@ A pre-print describing the method is available on Biorxiv: ## Changelog -### Version 0.9.5 +#### Version 0.9.6 ++ fixed Dockerfile (switched to pip installation) ++ added better error messages to `simulate` command + +#### Version 0.9.5 + added `scaden simulate` command to perform bulk simulation and training file creation + added `--seed` parameter to allow reproducible Scaden runs -### Version 0.9.4 +#### Version 0.9.4 + fixed dependencies (added python>=3.6 requirement) -### Version 0.9.3 +#### Version 0.9.3 + upgrade to Tensorflow 2 + cleaned up dependencies -### Version 0.9.2 +#### Version 0.9.2 + RAM usage improvement -### Version 0.9.1 +#### Version 0.9.1 + Added automatic removal of duplicate genes in Mixture file + Changed name of final prediction file + Added Scaden logo to main script -### Version 0.9.0 +#### Version 0.9.0 This is the initial release version of Scaden. While this version contains full functionality for pre-processing, training and prediction, it does not contain thorough error messages, plotting functionality and a solid helper function for generation training data. These are all features planned for the release of v.1.0.0. 
From 7dd37d8110449009e5b03ccf82264ddbc5c8283e Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 08:35:44 +0100 Subject: [PATCH 6/9] switched to separate changelog --- docs/changelog.md | 30 ++++++++++++++++++++++++++++++ docs/index.md | 36 ------------------------------------ mkdocs.yml | 1 + 3 files changed, 31 insertions(+), 36 deletions(-) create mode 100644 docs/changelog.md diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..bb5e632 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,30 @@ +# Changelog + +### Version 0.9.6 ++ fixed Dockerfile (switched to pip installation) ++ added better error messages to `simulate` command + +### Version 0.9.5 ++ added `scaden simulate` command to perform bulk simulation and training file creation ++ added `--seed` parameter to allow reproducible Scaden runs + +### Version 0.9.4 ++ fixed dependencies (added python>=3.6 requirement) + +### Version 0.9.3 ++ upgrade to Tensorflow 2 ++ cleaned up dependencies + +### Version 0.9.2 ++ RAM usage improvement + +### Version 0.9.1 ++ Added automatic removal of duplicate genes in Mixture file ++ Changed name of final prediction file ++ Added Scaden logo to main script + +### Version 0.9.0 +This is the initial release version of Scaden. While this version contains full functionality for pre-processing, training and prediction, it does not +contain thorough error messages, plotting functionality and a solid helper function for generating training data. These are all features +planned for the release of v.1.0.0. +The core functionality of Scaden is, however, implemented and fully operational. Please check the [Usage](usage) section to learn how to use Scaden. 
\ No newline at end of file diff --git a/docs/index.md b/docs/index.md index bc2b7f5..dc26879 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,39 +8,3 @@ at the [DZNE Tübingen](https://www.dzne.de/en/about-us/sites/tuebingen/) and th A pre-print describing the method is available on Biorxiv: [Deep-learning-based cell composition analysis from tissue expression profiles](https://www.biorxiv.org/content/10.1101/659227v1) - - - - - -## Changelog - -#### Version 0.9.6 -+ fixed Dockerfile (switched to pip installation) -+ added better error messages to `simulate` command - -#### Version 0.9.5 -+ added `scaden simulate` command to perform bulk simulation and training file creation -+ added `--seed` parameter to allow reproducible Scaden runs - -#### Version 0.9.4 -+ fixed dependencies (added python>=3.6 requirement) - -#### Version 0.9.3 -+ upgrade to Tensorflow 2 -+ cleaned up dependencies - -#### Version 0.9.2 -+ RAM usage improvement - -#### Version 0.9.1 -+ Added automatic removal of duplicate genes in Mixture file -+ Changed name of final prediction file -+ Added Scaden logo to main script - - -#### Version 0.9.0 -This is the initial release version of Scaden. While this version contains full functionality for pre-processing, training and prediction, it does not -contain thorough error messages, plotting functionality and a solid helper function for generation training data. These are all features -planned for the release of v.1.0.0. -The core functionality of Scaden is, however, implemented and fully operational. Please check the [Usage](usage) section to learn how to use Scaden. 
\ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 5b64afe..e78c266 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,4 +4,5 @@ nav: - Installation: installation.md - Usage: usage.md - Datasets: datasets.md + - Changelog: changelog.md theme: readthedocs From 8df65cc07d95d1d7a4c5eabbc5d7105a03791a91 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 11:52:14 +0100 Subject: [PATCH 7/9] added downloads badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 38151de..4cc63ec 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ ![MIT](https://anaconda.org/bioconda/scaden/badges/license.svg) ![Install with Bioconda](https://anaconda.org/bioconda/scaden/badges/installer/conda.svg) +![Downloads](https://static.pepy.tech/personalized-badge/scaden?period=total&units=international_system&left_color=blue&right_color=green&left_text=Downloads) ## Single-cell assisted deconvolutional network Scaden is a deep-learning based algorithm for cell type deconvolution of bulk RNA-seq samples. 
It was developed From d79807cc6863f394ef66047b16c87a6c68de12d3 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 12:07:04 +0100 Subject: [PATCH 8/9] added more badges --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4cc63ec..d565492 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,13 @@ ![Scaden](docs/img/scaden_logo.png) -![MIT](https://anaconda.org/bioconda/scaden/badges/license.svg) -![Install with Bioconda](https://anaconda.org/bioconda/scaden/badges/installer/conda.svg) + +![Scaden version](https://img.shields.io/badge/scaden-v0.9.6-cyan) +![MIT](https://img.shields.io/badge/License-MIT-black) +![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) +![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) +![Docker build](https://img.shields.io/docker/cloud/build/kevinmenden/scaden) ![Downloads](https://static.pepy.tech/personalized-badge/scaden?period=total&units=international_system&left_color=blue&right_color=green&left_text=Downloads) + ## Single-cell assisted deconvolutional network Scaden is a deep-learning based algorithm for cell type deconvolution of bulk RNA-seq samples. 
It was developed From f5757cc53eae222b637a85a8268ad955e8f99452 Mon Sep 17 00:00:00 2001 From: kevinmenden Date: Wed, 16 Dec 2020 17:38:28 +0100 Subject: [PATCH 9/9] cleaned up dependencies --- docs/changelog.md | 1 + setup.py | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index bb5e632..2f0e14a 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -3,6 +3,7 @@ ### Version 0.9.6 + fixed Dockerfile (switched to pip installation) + added better error messages to `simulate` command ++ cleaned up dependencies ### Version 0.9.5 + added `scaden simulate` command to perform bulk simulation and training file creation diff --git a/setup.py b/setup.py index b2a448a..e6c6e4a 100644 --- a/setup.py +++ b/setup.py @@ -30,13 +30,10 @@ 'pandas', 'numpy', 'scikit-learn', - 'scipy', 'tensorflow>=2.0', 'anndata', 'tqdm', - 'click' - ], - extras_require = { - 'scanpy': ["scanpy", "matplotlib", "seaborn"] - } + 'click', + 'h5py~=2.10.0' + ] )