From fea2335ee3e7b550433a99194d6bd35d3c1ba225 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Mon, 8 Jun 2020 12:27:35 +0100 Subject: [PATCH] Reorganise the files in the repository Address other review comments --- LICENSE | 2 +- README.md | 148 ++++++------------ .../Evaluating_pipeline.md | 0 README_nextflow.md => docs/README_nextflow.md | 0 docs/Technical_notes.md | 24 +++ remapping_pipeline.nf => main.nf | 14 +- requirements.txt | 4 +- tests/test_pipeline.sh | 2 +- .../replace_refs.py | 0 .../reverse_strand.py | 0 10 files changed, 84 insertions(+), 110 deletions(-) rename Evaluating_pipeline.md => docs/Evaluating_pipeline.md (100%) rename README_nextflow.md => docs/README_nextflow.md (100%) create mode 100644 docs/Technical_notes.md rename remapping_pipeline.nf => main.nf (96%) rename replace_refs.py => variant_remapping_tools/replace_refs.py (100%) rename reverse_strand.py => variant_remapping_tools/reverse_strand.py (100%) diff --git a/LICENSE b/LICENSE index 261eeb9..3143468 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2020 EMBL-EBI Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 2edb99f..54425b0 100644 --- a/README.md +++ b/README.md @@ -4,124 +4,72 @@ Pipeline for remapping VCF variants between two arbitrary assemblies in FASTA fo **Method**: creates reads from the flanking sequences of each variant, then maps them to the new assembly using bowtie2. -Currently, it only supports SNPs, and not indels. +Currently, it only SNPs and short indels but has not been tested with larger or more complex variants -**Prerequisites:** -- `reverse_strand.py`, included (uses [pysam](https://pysam.readthedocs.io/en/latest/api.html), allows reverse strand -allele correction (see Note below for further explanation), and filtering based on bowtie2's alignment score) -- `replace_refs.py`, included (uses old REF allele in the case of new REF=ALT, these then get swapped (see Note 2)) +## Prerequisites +To run this pipeline you will need to install and configure [Nextflow](https://www.nextflow.io/docs/latest/getstarted.html#installation). +The pipeline uses other software that needs to be downloaded and installed locally. You can obtain them manually or use [Miniconda](https://docs.conda.io/en/latest/miniconda.html) + +### Installation using conda +```bash +git clone https://github.com/EBIvariation/variant-remapping.git +conda env create -f conda.yml +conda activate variant-remapping +pip install -r requirements.txt +``` + +### Installation without conda +Download, manually install the following program and make sure the executable are in your `PATH` - [vcf2bed](https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/vcf2bed.html) - [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) - [samtools](http://www.htslib.org/download/) - [bedtools](https://bedtools.readthedocs.io/en/latest/) - [bcftools](http://www.htslib.org/download/) +- [Python](https://www.python.org/downloads/) + +Then run +```bash +git clone https://github.com/EBIvariation/variant-remapping.git +pip install -r requirements.txt +``` -**Note: reverse strand correction:** -When a variant is mapped to the reverse strand, the corresponding allele in the output VCF file is reversed, aka -converted to the forward strand, as alleles in VCF files are always described on the forward strand. For example: -INPUT VCF: -Old genome: `G (REF) > A (ALT)` -This maps onto the new genome on the reverse strand: -OUTPUT VCF: -New genome: `C (REF) > T (ALT)` +## Testing the installation +Run the test script to check that you have all the right dependencies installed properly +```bash +tests/test_pipeline.sh +``` -**Note 2: what happens when the new REF=ALT?** -Example: Original variant on the old reference sequence: `G (REF) > T (ALT)` -Once remapped to the new reference sequence, we get: `T (REF) > T (ALT)` -What happens is `replace_refs.py` will search for the old REF allele, and replace the new REF with it. So we get: -`G (REF) > T (ALT)` -And then `bcftools norm` will check to see if any REF alleles are different to the new reference genome, and if so, it -will swap them: -`T (REF) > G (ALT)` -This makes sense because we know that T is now the REF allele, which means the original G allele was technically a -variant of this new reference genome. +## Executing the pipeline +```bash +nextflow run main.nf + --oldgenome + --newgenome + --vcffile + --outfile + [--flankingseq 50] + [--scorecutoff 0.6] + [--diffcutoff 0.04] +``` -## Input -- Old genome assembly file (FASTA format): the genome you have variants for. -- New genome assembly file (FASTA format): the genome you want to remap the variants to. -- Variants file (VCF format): contains the list of variants you want to remap. -- The length of the flanking sequences that generate the reads. -- Percentage of the flanking sequences that should be used as Alignment Score cut-off threshold. -- Percentage of the flanking sequences that should be used as AS-XS difference cut-off threshold. +### Input +- `--oldgenome`: Old genome assembly file (FASTA format): the genome you have variants for. +- `--newgenome`: New genome assembly file (FASTA format): the genome you want to remap the variants to. +- `--vcffile`: Variants file (VCF format): contains the list of variants you want to remap. +- `--flankingseq`: The length of the flanking sequences that generate the reads. +- `--scorecutoff`: Percentage of the flanking sequences that should be used as Alignment Score cut-off threshold. +- `--diffcutoff`: Percentage of the flanking sequences that should be used as AS-XS difference cut-off threshold. -## Output -A VCF file containing: +### Output +`--outfile` specify a VCF file containing: - remapped coordinates (position on the new assembly) - rsIDs - the correct chromosome/contig names - the new REF alleles (reverse strand mapping taken into account) - the ALT, QUAL, FILT and INFO columns of the input VCF -Console output: -``` -"Total number of input variants:" -[number] -"Total number of variants processed (after filters):" -[number] -"Number of remapped variants:" -[number] -"Percentage of remapped variants:" -[percentage] -``` - -## Usage -First, make sure the scripts are executable: -`chmod a+x remapping_commands.sh reverse_strand.py replace_refs.py` - -Command usage: (see Input for details) -``` -./remapping_commands.sh \ - -g [old genome] \ - -n [new genome] \ - -v [vcf file] \ - -f [flanking sequence length] \ - -s [score percentage cut-off] \ - -d [AS-XS difference percentage cut-off] \ - -o [outfile name] -``` - -You can use `time` at the beginning of the previous command to get the runtime and CPU time. - -**Example:** +### Example: "I want to remap the variants in `droso_variants_renamed.vcf` from `droso_dm3.fasta` to `droso_dm6.fasta` (its accession is: `GCA_000001215.4`), with flanking sequences of 50 bases, which will create 101-base reads. The alignment score cut-off will be -(50 x 0.6) = -30, meaning that reads with alignment scores lower than -30 will not be kept. The AS-XS difference threshold will be 0.04, meaning that AS-XS with a difference of less than 50 * 0.04 = 2 will not be kept. The remapped variants will be in `test.vcf`." - -**Command:** -``` -time ./remapping_commands.sh \ - -g droso_dm3.fasta \ - -n droso_dm6.fasta \ - -v droso_variants_renamed.vcf \ - -f 50 \ - -s 0.6 \ - -o test.vcf -``` -**Output:** -``` ------------------------1) Flanking sequence generation----------------------- ----------------------------2) Mapping with bowtie2--------------------------- -Mapping results: -1077 reads; of these: - 1077 (100.00%) were unpaired; of these: - 7 (0.65%) aligned 0 times - 48 (4.46%) aligned exactly 1 time - 1022 (94.89%) aligned >1 times -99.35% overall alignment rate -------------------------------3) Data extraction----------------------------- ------------------------------------Results----------------------------------- -Total number of input variants: -1094 -Total number of variants processed (after filters): -1077 -Number of remapped variants: -1070 -Percentage of remapped variants: -99.300% - -real 0m1.363s -user 0m1.091s -sys 0m0.171s -``` \ No newline at end of file diff --git a/Evaluating_pipeline.md b/docs/Evaluating_pipeline.md similarity index 100% rename from Evaluating_pipeline.md rename to docs/Evaluating_pipeline.md diff --git a/README_nextflow.md b/docs/README_nextflow.md similarity index 100% rename from README_nextflow.md rename to docs/README_nextflow.md diff --git a/docs/Technical_notes.md b/docs/Technical_notes.md new file mode 100644 index 0000000..69dd6b9 --- /dev/null +++ b/docs/Technical_notes.md @@ -0,0 +1,24 @@ + +- `reverse_strand.py`, included (uses [pysam](https://pysam.readthedocs.io/en/latest/api.html), allows reverse strand +allele correction (see Note below for further explanation), and filtering based on bowtie2's alignment score) +- `replace_refs.py`, included (uses old REF allele in the case of new REF=ALT, these then get swapped (see Note 2)) + +**Note: reverse strand correction:** +When a variant is mapped to the reverse strand, the corresponding allele in the output VCF file is reversed, aka +converted to the forward strand, as alleles in VCF files are always described on the forward strand. For example: +INPUT VCF: +Old genome: `G (REF) > A (ALT)` +This maps onto the new genome on the reverse strand: +OUTPUT VCF: +New genome: `C (REF) > T (ALT)` + +**Note 2: what happens when the new REF=ALT?** +Example: Original variant on the old reference sequence: `G (REF) > T (ALT)` +Once remapped to the new reference sequence, we get: `T (REF) > T (ALT)` +What happens is `replace_refs.py` will search for the old REF allele, and replace the new REF with it. So we get: +`G (REF) > T (ALT)` +And then `bcftools norm` will check to see if any REF alleles are different to the new reference genome, and if so, it +will swap them: +`T (REF) > G (ALT)` +This makes sense because we know that T is now the REF allele, which means the original G allele was technically a +variant of this new reference genome. diff --git a/remapping_pipeline.nf b/main.nf similarity index 96% rename from remapping_pipeline.nf rename to main.nf index 659853e..2953341 100755 --- a/remapping_pipeline.nf +++ b/main.nf @@ -131,7 +131,7 @@ process chromSizes { output: - path "genome.fa.chrom.sizes" into chrom_sizes + path "genome.fa.chrom.sizes" into oldgenome_chrom_sizes """ cut -f1,2 genome.fa.fai > genome.fa.chrom.sizes @@ -178,7 +178,7 @@ process flankingRegionBed { input: path "variants.bed" from variants_bed - path "genome.chrom.sizes" from chrom_sizes + path "genome.chrom.sizes" from oldgenome_chrom_sizes output: path "flanking.filtered.bed" into flanking_bed @@ -320,7 +320,9 @@ process reverseStrand { """ # Ensure that we will use the reverse_strand.py from this repo - ${baseDir}/reverse_strand.py -i reads_aligned.sorted.bam -p old_ref_alleles.txt -o variants_remapped.vcf -f $params.flankingseq -s $params.scorecutoff -d $params.diffcutoff + ${baseDir}/variant_remapping_tools/reverse_strand.py -i reads_aligned.sorted.bam \ + -p old_ref_alleles.txt -o variants_remapped.vcf -f $params.flankingseq \ + -s $params.scorecutoff -d $params.diffcutoff """ } @@ -339,8 +341,8 @@ process insertReferenceAllele { ''' # Add interval for variant position in bed format (required by getfasta) - # Reprints all the columns, adding an extra column before the pos column as the pos-1, as bedtools getfasta requires a - # bed file + # Reprints all the columns, adding an extra column before the pos column as the pos-1, + # as bedtools getfasta requires a bed file # Input: # [chr] [pos] [rsID] [ALT] [QUAL] [FILT] [INFO] # Output: @@ -425,7 +427,7 @@ process fixRefAllele { """ # Test each variant to see if REF = ALT, if so, replace the current REF with the corresponding old REF (this is to # deal with variants such as G > G) - ${baseDir}/replace_refs.py -i vcf_out_with_header.vcf -r old_ref_alleles_with_header.txt -o pre_final_vcf.vcf + ${baseDir}/variant_remapping_tools/replace_refs.py -i vcf_out_with_header.vcf -r old_ref_alleles_with_header.txt -o pre_final_vcf.vcf """ } diff --git a/requirements.txt b/requirements.txt index 5225d2b..572cecc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -pysam -biopython \ No newline at end of file +pysam==0.15.4 +biopython==1.77 \ No newline at end of file diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index d1cf7fb..2ebdd44 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -5,7 +5,7 @@ set -Eeuo pipefail SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" SOURCE_DIR=$(dirname $SCRIPT_DIR) -nextflow run ${SOURCE_DIR}/remapping_pipeline.nf \ +nextflow run ${SOURCE_DIR}/main.nf \ --oldgenome ${SCRIPT_DIR}/resources/genome.fa \ --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ --vcffile ${SCRIPT_DIR}/resources/source.vcf \ diff --git a/replace_refs.py b/variant_remapping_tools/replace_refs.py similarity index 100% rename from replace_refs.py rename to variant_remapping_tools/replace_refs.py diff --git a/reverse_strand.py b/variant_remapping_tools/reverse_strand.py similarity index 100% rename from reverse_strand.py rename to variant_remapping_tools/reverse_strand.py