From 1c0b5bcdbf252ef6a86d8d5b38bd5946f1d1c0bc Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 11:36:25 +0100 Subject: [PATCH 01/10] add labels to nextflow for SLURM migration --- main.nf | 16 +++++++++++++++- prepare_genome.nf | 7 ++++++- variant_to_realignment.nf | 29 ++++++++++++++++------------- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/main.nf b/main.nf index 7ee6f8e..a7f84ee 100755 --- a/main.nf +++ b/main.nf @@ -1,6 +1,5 @@ #!/usr/bin/env nextflow - // Enable syntax extension // See https://www.nextflow.io/docs/latest/dsl2.html nextflow.enable.dsl=2 @@ -46,6 +45,7 @@ outfile_dir = file(params.outfile).getParent() * Uncompress VCF file */ process uncompressInputVCF { + label 'short_time', 'med_mem' input: path "source.vcf" @@ -69,6 +69,7 @@ process uncompressInputVCF { * filter VCF file to remove variant too close the edges of chromosome because we can't get flanking regions */ process filterInputVCF { + label 'default_time', 'med_mem' input: path "source.vcf" @@ -94,6 +95,7 @@ process filterInputVCF { * Store the original VCF header for later use */ process storeVCFHeader { + label 'short_time', 'small_mem' input: path "source.vcf" @@ -114,6 +116,7 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long * This process convert the original Header to the remapped header and concatenate it with the remapped VCF records */ process generateRemappedVCF { + label 'short_time', 'small_mem' input: path "vcf_header.txt" @@ -148,6 +151,7 @@ process generateRemappedVCF { * This process adds the original header to unmapped variant VCF records and output the results */ process generateUnmappedVCF { + label 'short_time', 'small_mem' publishDir outfile_dir, overwrite: true, @@ -170,6 +174,7 @@ process generateUnmappedVCF { * Sort VCF file */ process sortVCF { + label 'default_time', 'med_mem' input: path "variants_remapped.vcf" @@ -187,6 +192,7 @@ process sortVCF { * Run bcftools norm to swap the REF and ALT alleles if the REF doesn't match the new assembly */ process normalise { + label 'default_time', 'med_mem' input: path "variants_remapped_sorted.vcf.gz" @@ -202,6 +208,7 @@ process normalise { process collectNovelReferenceAlleles { + label 'short_time', 'small_mem' publishDir outfile_dir, overwrite: true, @@ -224,6 +231,7 @@ process collectNovelReferenceAlleles { * Create file containing remapping stats */ process outputStats { + label 'short_time', 'small_mem' publishDir outfile_dir, overwrite: true, @@ -244,6 +252,8 @@ process outputStats { * Concatenate the unmapped variants */ process combineUnmappedVCF { + label 'short_time', 'small_mem' + input: path "variants1.vcf" path "variants2.vcf" @@ -258,6 +268,8 @@ process combineUnmappedVCF { process combineVCF { + label 'short_time', 'small_mem' + input: path "variants1.vcf" path "variants2.vcf" @@ -271,6 +283,8 @@ process combineVCF { } process combineYaml { + label 'short_time', 'small_mem' + input: path "initial_yml" path "round1.yml" diff --git a/prepare_genome.nf b/prepare_genome.nf index 0c60077..44cf2df 100755 --- a/prepare_genome.nf +++ b/prepare_genome.nf @@ -9,8 +9,11 @@ nextflow.enable.dsl=2 * Index the new reference genome using bowtie_build */ process bowtieGenomeIndex { + label 'med_time' + // Memory required is 10 times the size of the fasta in Bytes or at least 1GB - memory Math.max(file(params.newgenome).size() * 10, 1073741824) + ' B' + // Overwrite base_memory so that the standard retry strategy is used + ext base_memory: { Math.max(file(params.newgenome).size() * 10, 1073741824) } input: path "genome_fasta" @@ -25,6 +28,7 @@ process bowtieGenomeIndex { process samtoolsFaidx { + label 'med_time', 'med_mem' input: path "genome_basename" @@ -41,6 +45,7 @@ process samtoolsFaidx { * Extract chomosome/contig sizes */ process chromSizes { + label 'short_time', 'small_mem' input: path "genome.fa.fai" diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 5083809..6b49fdb 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -1,6 +1,5 @@ #!/usr/bin/env nextflow - // Enable syntax extension // See https://www.nextflow.io/docs/latest/dsl2.html nextflow.enable.dsl=2 @@ -11,6 +10,7 @@ nextflow.enable.dsl=2 * "strand" column. */ process convertVCFToBed { + label 'default_time', 'med_mem' input: path "source.vcf" @@ -38,6 +38,7 @@ process convertVCFToBed { * Based on variants BED, generate the BED file for each flank. */ process flankingRegionBed { + label 'default_time', 'med_mem' input: path "variants.bed" @@ -67,8 +68,7 @@ process flankingRegionBed { * Extract the actual flanking region in fasta format. */ process flankingRegionFasta { - - memory '4 GB' + label 'default_time', 'med_mem' input: path "flanking_r1.bed" @@ -91,8 +91,7 @@ process flankingRegionFasta { * Extract information about the original variants and put it in the fasta header */ process extractVariantInfoToFastaHeader { - - memory '6GB' + label 'default_time', 'med_mem' input: path "flanking_r1.bed" @@ -127,6 +126,7 @@ process extractVariantInfoToFastaHeader { * Split fasta entries into multiple chunks */ process split_fasta { + label 'short_time', 'small_mem' input: path interleaved_fasta @@ -150,13 +150,11 @@ process split_fasta { * Align sequence with minimap2 */ process alignWithMinimap { + label 'med_time' - // Memory required is 5 times the size of the fasta in Bytes or at least 1GB - // Retry on kill (exit status 130) with twice the amount of memory - memory { Math.max(file(params.newgenome).size() * 10, 2000000000) * task.attempt + ' B' } - - errorStrategy { task.exitStatus == 130 ? 'retry' : 'terminate' } - maxRetries 3 + // Memory required is 10 times the size of the fasta in Bytes or at least 2GB + // Overwrite base_memory so that the standard retry strategy is used + ext base_memory: { Math.max(file(params.newgenome).size() * 10, 2000000000) } input: // reads contains paired interleaved (first and second read in the same file) @@ -168,7 +166,6 @@ process alignWithMinimap { output: path "reads_aligned.bam", emit: reads_aligned_bam - script: if (flanklength < 500) """ @@ -199,6 +196,7 @@ process alignWithMinimap { * Sort BAM file by name */ process sortByName { + label 'default_time', 'med_mem' input: path "reads_aligned.bam" @@ -215,9 +213,11 @@ process sortByName { * Align sequence with bowtie2 */ process alignWithBowtie { + label 'med_time' // Memory required is 5 times the size of the fasta in Bytes or at least 1GB - memory Math.max(file(params.newgenome).size() * 5, 1073741824) + ' B' + // Overwrite base_memory so that the standard retry strategy is used + ext base_memory: { Math.max(file(params.newgenome).size() * 5, 1073741824) } input: path "variant_read1.fa" @@ -242,6 +242,7 @@ process alignWithBowtie { * Take the reads and process them to get the remapped variants */ process readsToRemappedVariants { + label 'default_time', 'med_mem' input: path "reads.bam" @@ -276,6 +277,8 @@ process readsToRemappedVariants { * */ process merge_variants { + label 'short_time', 'small_mem' + input: path "remapped*.vcf" path "unmapped*.vcf" From f0778875d79722bf9939f4186b08df17eb34a278 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 13:19:49 +0100 Subject: [PATCH 02/10] try updating conda to fix test run --- .github/workflows/variant_remapping.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml index 4a1d6d2..6e04561 100644 --- a/.github/workflows/variant_remapping.yml +++ b/.github/workflows/variant_remapping.yml @@ -29,6 +29,7 @@ jobs: echo "/tmp/nextflow" >> $GITHUB_PATH cd - # $CONDA is an environment variable pointing to the root of the miniconda directory + $CONDA/bin/conda update conda $CONDA/bin/conda env update -q --file conda.yml --name base $CONDA/bin/conda run pip install -q -r requirements.txt From e44a322a10da2820e7eface17358dfa2a79dabdc Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 13:33:32 +0100 Subject: [PATCH 03/10] conda install correct python version in action --- .github/workflows/variant_remapping.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml index 6e04561..3e9f873 100644 --- a/.github/workflows/variant_remapping.yml +++ b/.github/workflows/variant_remapping.yml @@ -29,7 +29,8 @@ jobs: echo "/tmp/nextflow" >> $GITHUB_PATH cd - # $CONDA is an environment variable pointing to the root of the miniconda directory - $CONDA/bin/conda update conda + $CONDA/bin/conda update conda # https://github.com/conda/conda/issues/13560 + $CONDA/bin/conda install -y python=${{ matrix.python-version }} # https://github.com/actions/setup-python/issues/833 $CONDA/bin/conda env update -q --file conda.yml --name base $CONDA/bin/conda run pip install -q -r requirements.txt From 35358438e2de190d04ce7df6a5a0cbc2d2b6e08d Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 13:39:23 +0100 Subject: [PATCH 04/10] bump python version --- .github/workflows/variant_remapping.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml index 3e9f873..fa046e3 100644 --- a/.github/workflows/variant_remapping.yml +++ b/.github/workflows/variant_remapping.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7] + python-version: [3.8] steps: - uses: actions/checkout@v2 From 9d38801b0f8112d026def14bc1ab8b18e29f282e Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 13:47:37 +0100 Subject: [PATCH 05/10] update channel priority --- conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda.yml b/conda.yml index 0939745..52efd73 100755 --- a/conda.yml +++ b/conda.yml @@ -1,8 +1,8 @@ name: variant-remapping channels: - - defaults - conda-forge - bioconda + - defaults dependencies: - bedtools - minimap2 From 4ab46ed0785b59eafc026fbb2e0b75e85264ec20 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 15:55:52 +0100 Subject: [PATCH 06/10] pin versions in conda env --- conda.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/conda.yml b/conda.yml index 52efd73..1698268 100755 --- a/conda.yml +++ b/conda.yml @@ -4,8 +4,8 @@ channels: - bioconda - defaults dependencies: - - bedtools - - minimap2 - - samtools - - bcftools - - tabix + - bedtools ==2.30 + - minimap2 ==2.17 + - samtools ==1.12 + - bcftools ==1.14 + - tabix ==0.2.6 From 380b867bfe6cdb1ea018db0b1009b01578e7a8ca Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 15:58:20 +0100 Subject: [PATCH 07/10] remove whitespace from header in test --- tests/test_pipeline_empty.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh index a40b4f4..7d0032d 100755 --- a/tests/test_pipeline_empty.sh +++ b/tests/test_pipeline_empty.sh @@ -20,7 +20,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf" ##INFO= ##FORMAT= ##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 EOT nextflow run ${SOURCE_DIR}/main.nf \ From 80a2d4f22fd51cf0cc818c245ae59d9a86c89578 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 5 Jun 2024 16:01:51 +0100 Subject: [PATCH 08/10] undo version pinning --- conda.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/conda.yml b/conda.yml index 1698268..52efd73 100755 --- a/conda.yml +++ b/conda.yml @@ -4,8 +4,8 @@ channels: - bioconda - defaults dependencies: - - bedtools ==2.30 - - minimap2 ==2.17 - - samtools ==1.12 - - bcftools ==1.14 - - tabix ==0.2.6 + - bedtools + - minimap2 + - samtools + - bcftools + - tabix From 1c2c8a1bb238a7a49b243d013f4aecd6beae21af Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 12 Jun 2024 10:51:37 +0100 Subject: [PATCH 09/10] remove unneeded comments --- .github/workflows/variant_remapping.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml index fa046e3..59af196 100644 --- a/.github/workflows/variant_remapping.yml +++ b/.github/workflows/variant_remapping.yml @@ -29,8 +29,8 @@ jobs: echo "/tmp/nextflow" >> $GITHUB_PATH cd - # $CONDA is an environment variable pointing to the root of the miniconda directory - $CONDA/bin/conda update conda # https://github.com/conda/conda/issues/13560 - $CONDA/bin/conda install -y python=${{ matrix.python-version }} # https://github.com/actions/setup-python/issues/833 + $CONDA/bin/conda update conda + $CONDA/bin/conda install -y python=${{ matrix.python-version }} $CONDA/bin/conda env update -q --file conda.yml --name base $CONDA/bin/conda run pip install -q -r requirements.txt From fe250f7f4266b6c89e33ecaf97371733562088d1 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 13 Jun 2024 13:11:30 +0100 Subject: [PATCH 10/10] add example nextflow config and documentation --- README.md | 8 ++++++ tests/resources/config.yml | 5 ---- tests/resources/nextflow.config | 46 +++++++++++++++++++++++++++++++++ tests/test_pipeline.sh | 2 +- tests/test_pipeline_empty.sh | 3 ++- 5 files changed, 57 insertions(+), 7 deletions(-) delete mode 100644 tests/resources/config.yml create mode 100644 tests/resources/nextflow.config diff --git a/README.md b/README.md index 8d690a4..74c3a3d 100644 --- a/README.md +++ b/README.md @@ -64,3 +64,11 @@ Other files are created alongside the main output: - `_nra_variants.vcf` variants successfully remap that landed in a position where the reference allele changed. The output contains the original variant and the original reference allele as alternate. - `_unmapped.vcf` original variant that could not be successfully remap - `_count.yml` YAML file containing counts associated with each round of remapping + +## Configuration + +The pipeline relies on Nextflow configuration to set memory and runtime requirements. This is not required for all users, but it is recommended particularly for HPC and cloud environments. + +There is an [example config](tests/resources/nextflow.config) used for tests that you can modify for your own needs. The main features are the use of labels to group processes into different categories based on their resource needs (small/medium/large), and the use of `base_memory` and `base_time` variables that some processes use to fine-tune their requirements. + +For more about Nextflow configuration, see the [documentation](https://www.nextflow.io/docs/latest/config.html). diff --git a/tests/resources/config.yml b/tests/resources/config.yml deleted file mode 100644 index 4162a2c..0000000 --- a/tests/resources/config.yml +++ /dev/null @@ -1,5 +0,0 @@ -executor { - $local { - memory = '6 GB' - } -} \ No newline at end of file diff --git a/tests/resources/nextflow.config b/tests/resources/nextflow.config new file mode 100644 index 0000000..c0305ea --- /dev/null +++ b/tests/resources/nextflow.config @@ -0,0 +1,46 @@ + +executor { + name = 'local' +} + +process.ext.base_memory = 6.GB +process.ext.base_time = 10.minutes + +process { + executor = 'local' + + // Dynamic resource allocation with retries + errorStrategy = 'retry' + maxRetries = 1 + memory = { task.ext.base_memory * task.attempt } + time = { task.ext.base_time * task.attempt } + + // Labels for specific runtimes + withLabel: short_time { + ext.base_time = 5.minutes + } + withLabel: default_time { + ext.base_time = 10.minutes + } + withLabel: med_time { + ext.base_time = 30.minutes + } + withLabel: long_time { + ext.base_time = 1.hour + } + + // Labels for specific memory usage + withLabel: small_mem { + ext.base_memory = 1.GB + } + withLabel: default_mem { + ext.base_memory = 6.GB + } + withLabel: med_mem { + ext.base_memory = 8.GB + } + withLabel: big_mem { + ext.base_memory = 10.GB + } + +} diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 8180be7..c82e85d 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -33,7 +33,7 @@ chr1 3710 . T A 50 PASS . GT:GQ 1/1:0 EOT nextflow run ${SOURCE_DIR}/main.nf \ --config ${SCRIPT_DIR}/resources/config.yml \ +-config ${SCRIPT_DIR}/resources/nextflow.config \ --oldgenome ${SCRIPT_DIR}/resources/genome.fa \ --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ --vcffile ${SCRIPT_DIR}/resources/source.vcf \ diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh index 7d0032d..c853cdc 100755 --- a/tests/test_pipeline_empty.sh +++ b/tests/test_pipeline_empty.sh @@ -24,7 +24,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf" EOT nextflow run ${SOURCE_DIR}/main.nf \ --config ${SCRIPT_DIR}/resources/config.yml \ +-config ${SCRIPT_DIR}/resources/nextflow.config \ --oldgenome ${SCRIPT_DIR}/resources/genome.fa \ --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ --vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \ @@ -52,6 +52,7 @@ rm -rf work .nextflow* \ ${SCRIPT_DIR}/resources/source_empty.vcf \ ${SCRIPT_DIR}/resources/expected_remap.vcf \ ${SCRIPT_DIR}/resources/remap_empty.vcf \ + ${SCRIPT_DIR}/resources/remap_empty_nra_variants.vcf \ ${SCRIPT_DIR}/resources/remap_empty_counts.yml \ ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \ ${SCRIPT_DIR}/resources/new_genome.fa.* \