Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3330: Add labels to nextflow for SLURM migration #40

Merged
merged 10 commits into from
Jun 14, 2024
4 changes: 3 additions & 1 deletion .github/workflows/variant_remapping.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7]
python-version: [3.8]

steps:
- uses: actions/checkout@v2
Expand All @@ -29,6 +29,8 @@ jobs:
echo "/tmp/nextflow" >> $GITHUB_PATH
cd -
# $CONDA is an environment variable pointing to the root of the miniconda directory
$CONDA/bin/conda update conda
$CONDA/bin/conda install -y python=${{ matrix.python-version }}
$CONDA/bin/conda env update -q --file conda.yml --name base
$CONDA/bin/conda run pip install -q -r requirements.txt

Expand Down
2 changes: 1 addition & 1 deletion conda.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: variant-remapping
channels:
- defaults
- conda-forge
- bioconda
- defaults
dependencies:
- bedtools
- minimap2
Expand Down
16 changes: 15 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env nextflow


// Enable syntax extension
// See https://www.nextflow.io/docs/latest/dsl2.html
nextflow.enable.dsl=2
Expand Down Expand Up @@ -46,6 +45,7 @@ outfile_dir = file(params.outfile).getParent()
* Uncompress VCF file
*/
process uncompressInputVCF {
label 'short_time', 'med_mem'

input:
path "source.vcf"
Expand All @@ -69,6 +69,7 @@ process uncompressInputVCF {
 * Filter the VCF file to remove variants too close to the edges of a chromosome, because we can't get flanking regions for them
*/
process filterInputVCF {
label 'default_time', 'med_mem'

input:
path "source.vcf"
Expand All @@ -94,6 +95,7 @@ process filterInputVCF {
* Store the original VCF header for later use
*/
process storeVCFHeader {
label 'short_time', 'small_mem'

input:
path "source.vcf"
Expand All @@ -114,6 +116,7 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long
 * This process converts the original header to the remapped header and concatenates it with the remapped VCF records
*/
process generateRemappedVCF {
label 'short_time', 'small_mem'

input:
path "vcf_header.txt"
Expand Down Expand Up @@ -148,6 +151,7 @@ process generateRemappedVCF {
 * This process adds the original header to unmapped variant VCF records and outputs the result
*/
process generateUnmappedVCF {
label 'short_time', 'small_mem'

publishDir outfile_dir,
overwrite: true,
Expand All @@ -170,6 +174,7 @@ process generateUnmappedVCF {
* Sort VCF file
*/
process sortVCF {
label 'default_time', 'med_mem'

input:
path "variants_remapped.vcf"
Expand All @@ -187,6 +192,7 @@ process sortVCF {
* Run bcftools norm to swap the REF and ALT alleles if the REF doesn't match the new assembly
*/
process normalise {
label 'default_time', 'med_mem'

input:
path "variants_remapped_sorted.vcf.gz"
Expand All @@ -202,6 +208,7 @@ process normalise {


process collectNovelReferenceAlleles {
label 'short_time', 'small_mem'

publishDir outfile_dir,
overwrite: true,
Expand All @@ -224,6 +231,7 @@ process collectNovelReferenceAlleles {
* Create file containing remapping stats
*/
process outputStats {
label 'short_time', 'small_mem'

publishDir outfile_dir,
overwrite: true,
Expand All @@ -244,6 +252,8 @@ process outputStats {
* Concatenate the unmapped variants
*/
process combineUnmappedVCF {
label 'short_time', 'small_mem'

input:
path "variants1.vcf"
path "variants2.vcf"
Expand All @@ -258,6 +268,8 @@ process combineUnmappedVCF {


process combineVCF {
label 'short_time', 'small_mem'

input:
path "variants1.vcf"
path "variants2.vcf"
Expand All @@ -271,6 +283,8 @@ process combineVCF {
}

process combineYaml {
label 'short_time', 'small_mem'

input:
path "initial_yml"
path "round1.yml"
Expand Down
7 changes: 6 additions & 1 deletion prepare_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ nextflow.enable.dsl=2
* Index the new reference genome using bowtie_build
*/
process bowtieGenomeIndex {
label 'med_time'

// Memory required is 10 times the size of the fasta in Bytes or at least 1GB
memory Math.max(file(params.newgenome).size() * 10, 1073741824) + ' B'
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 10, 1073741824) }

input:
path "genome_fasta"
Expand All @@ -25,6 +28,7 @@ process bowtieGenomeIndex {


process samtoolsFaidx {
label 'med_time', 'med_mem'

input:
path "genome_basename"
Expand All @@ -41,6 +45,7 @@ process samtoolsFaidx {
 * Extract chromosome/contig sizes
*/
process chromSizes {
label 'short_time', 'small_mem'

input:
path "genome.fa.fai"
Expand Down
2 changes: 1 addition & 1 deletion tests/test_pipeline_empty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf"
##INFO=<ID=COMMENT,Number=1,Type=String,Description="Comment">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
EOT

nextflow run ${SOURCE_DIR}/main.nf \
Expand Down
29 changes: 16 additions & 13 deletions variant_to_realignment.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env nextflow


// Enable syntax extension
// See https://www.nextflow.io/docs/latest/dsl2.html
nextflow.enable.dsl=2
Expand All @@ -11,6 +10,7 @@ nextflow.enable.dsl=2
* "strand" column.
*/
process convertVCFToBed {
label 'default_time', 'med_mem'

input:
path "source.vcf"
Expand Down Expand Up @@ -38,6 +38,7 @@ process convertVCFToBed {
* Based on variants BED, generate the BED file for each flank.
*/
process flankingRegionBed {
label 'default_time', 'med_mem'

input:
path "variants.bed"
Expand Down Expand Up @@ -67,8 +68,7 @@ process flankingRegionBed {
* Extract the actual flanking region in fasta format.
*/
process flankingRegionFasta {

memory '4 GB'
label 'default_time', 'med_mem'

input:
path "flanking_r1.bed"
Expand All @@ -91,8 +91,7 @@ process flankingRegionFasta {
* Extract information about the original variants and put it in the fasta header
*/
process extractVariantInfoToFastaHeader {

memory '6GB'
label 'default_time', 'med_mem'

input:
path "flanking_r1.bed"
Expand Down Expand Up @@ -127,6 +126,7 @@ process extractVariantInfoToFastaHeader {
* Split fasta entries into multiple chunks
*/
process split_fasta {
label 'short_time', 'small_mem'

input:
path interleaved_fasta
Expand All @@ -150,13 +150,11 @@ process split_fasta {
* Align sequence with minimap2
*/
process alignWithMinimap {
label 'med_time'

// Memory required is 5 times the size of the fasta in Bytes or at least 1GB
// Retry on kill (exit status 130) with twice the amount of memory
memory { Math.max(file(params.newgenome).size() * 10, 2000000000) * task.attempt + ' B' }

errorStrategy { task.exitStatus == 130 ? 'retry' : 'terminate' }
maxRetries 3
// Memory required is 10 times the size of the fasta in Bytes or at least 2GB
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 10, 2000000000) }
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now working with updated configs, see run on Seqera

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is good, but I'm wondering how anyone else who does not have our Nextflow config would use this.
We could add some default nextflow config to the repo or add to the documentation how the memory requirement should be managed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes this is a good point, a config placed alongside the pipelines would (I think) take precedence over the one in the home directory which I'm not sure is a good idea for us. But including an example config somewhere and documenting the usage is definitely a good idea.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added, here


input:
// reads contains paired interleaved (first and second read in the same file)
Expand All @@ -168,7 +166,6 @@ process alignWithMinimap {
output:
path "reads_aligned.bam", emit: reads_aligned_bam


script:
if (flanklength < 500)
"""
Expand Down Expand Up @@ -199,6 +196,7 @@ process alignWithMinimap {
* Sort BAM file by name
*/
process sortByName {
label 'default_time', 'med_mem'

input:
path "reads_aligned.bam"
Expand All @@ -215,9 +213,11 @@ process sortByName {
* Align sequence with bowtie2
*/
process alignWithBowtie {
label 'med_time'

// Memory required is 5 times the size of the fasta in Bytes or at least 1GB
memory Math.max(file(params.newgenome).size() * 5, 1073741824) + ' B'
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 5, 1073741824) }

input:
path "variant_read1.fa"
Expand All @@ -242,6 +242,7 @@ process alignWithBowtie {
* Take the reads and process them to get the remapped variants
*/
process readsToRemappedVariants {
label 'default_time', 'med_mem'

input:
path "reads.bam"
Expand Down Expand Up @@ -276,6 +277,8 @@ process readsToRemappedVariants {
*
*/
process merge_variants {
label 'short_time', 'small_mem'

input:
path "remapped*.vcf"
path "unmapped*.vcf"
Expand Down
Loading