Update stitch (#6832)

* Update stitch * Add bam test and sort snapshot * Set R version for repetability * Add htslib to environment for conda * Fix environment * Add rsync to environment * Fix linting * Update path
nf-core · Oct 25, 2024 · 55e4111 · 55e4111
1 parent 7face45
commit 55e4111
Show file tree

Hide file tree

Showing 11 changed files with 382 additions and 279 deletions.
diff --git a/modules/nf-core/stitch/environment.yml b/modules/nf-core/stitch/environment.yml
@@ -2,4 +2,7 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
+  - conda-forge::r-base=4.3.1
+  - conda-forge::rsync=3.2.7
   - bioconda::r-stitch=1.6.10
+  - bioconda::htslib=1.18
diff --git a/modules/nf-core/stitch/main.nf b/modules/nf-core/stitch/main.nf
@@ -8,8 +8,8 @@ process STITCH {
         'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }"
 
     input:
-    tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen)
-    tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist)
+    tuple val(meta), path(collected_crams), path(collected_crais), path(cramlist)
+    tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen)
     tuple val(meta3), path(fasta), path(fasta_fai)
     val seed
 

diff --git a/modules/nf-core/stitch/meta.yml b/modules/nf-core/stitch/meta.yml
@@ -23,6 +23,24 @@ tools:
       identifier: biotools:stitch-snijderlab
 input:
   - - meta:
+        type: map
+        description: |
+          Groovy Map containing information about the set of samples
+          e.g. `[ id:'test' ]`
+    - collected_crams:
+        type: file
+        description: List of sorted BAM/CRAM/SAM file
+        pattern: "*.{bam,cram,sam}"
+    - collected_crais:
+        type: file
+        description: List of BAM/CRAM/SAM index files
+        pattern: "*.{bai,crai,sai}"
+    - cramlist:
+        type: file
+        description: |
+          Text file with the path to the cram files to use in imputation, one per line. Since the cram files are staged to the working directory for the process, this file should just contain the file names without any pre-pending path.
+        pattern: "*.txt"
+  - - meta2:
         type: map
         description: |
           Groovy Map containing information about the set of positions to run the imputation over
@@ -55,24 +73,6 @@ input:
         description: Number of generations since founding of the population to use for
           imputation. Refer to the documentation for the `--nGen` argument of STITCH
           for more information.
-  - - meta2:
-        type: map
-        description: |
-          Groovy Map containing information about the set of samples
-          e.g. `[ id:'test' ]`
-    - collected_crams:
-        type: file
-        description: List of sorted BAM/CRAM/SAM file
-        pattern: "*.{bam,cram,sam}"
-    - collected_crais:
-        type: file
-        description: List of BAM/CRAM/SAM index files
-        pattern: "*.{bai,crai,sai}"
-    - cramlist:
-        type: file
-        description: |
-          Text file with the path to the cram files to use in imputation, one per line. Since the cram files are staged to the working directory for the process, this file should just contain the file names without any pre-pending path.
-        pattern: "*.txt"
   - - meta3:
         type: map
         description: |

diff --git a/modules/nf-core/stitch/tests/main.nf.test b/modules/nf-core/stitch/tests/main.nf.test
@@ -0,0 +1,178 @@
+def pathbam = "file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/"
+def pathgenome = "file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/"
+// positions and essential parameters
+def posfile             = "${pathgenome}dbsnp_138.hg38.first_10_biallelic_sites.tsv', checkIfExists: true)"
+def stitch_params       = "[ [ id: 'test_positions' ], $posfile, [], [], 'chr21', 2, 1 ]"
+
+// sequencing data in cram format
+def crams_val = "[${pathbam}cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true), ${pathbam}cram/test2.paired_end.recalibrated.sorted.cram', checkIfExists: true)]"
+def crais_val = "[${pathbam}cram/test.paired_end.recalibrated.sorted.cram.crai', checkIfExists: true), ${pathbam}cram/test2.paired_end.recalibrated.sorted.cram.crai', checkIfExists: true)]"
+def reads_cram = "[ [ id: 'test_reads' ], $crams_val, $crais_val ]"
+
+// sequencing data in bam format
+def bams_val = "[${pathbam}bam/test.paired_end.recalibrated.sorted.bam', checkIfExists: true), ${pathbam}bam/test2.paired_end.recalibrated.sorted.bam', checkIfExists: true)]"
+def bais_val = "[${pathbam}bam/test.paired_end.recalibrated.sorted.bam.bai', checkIfExists: true), ${pathbam}bam/test2.paired_end.recalibrated.sorted.bam.bai', checkIfExists: true)]"
+def reads_bam = "[ [ id:'test_reads' ], $bams_val, $bais_val ]"
+
+// reference genome
+def reference = "[[ id:'test_reference' ], ${pathgenome}genome.fasta', checkIfExists: true), ${pathgenome}genome.fasta.fai', checkIfExists: true)]"
+
+// for reproducibility
+def seed = 1
+
+nextflow_process {
+    name "Test Process STITCH"
+    script "../main.nf"
+    process "STITCH"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "stitch"
+
+    test("test_no_seed") {
+        when {
+            process {
+                """
+                filelist = Channel.fromPath( $crams_val )
+                    .map { it[-1] as String } // get only filename
+                    .collectFile( name: "cramlist.txt", newLine: true, sort: true )
+                
+                input[0] = Channel.of( $reads_cram ).combine( filelist )
+                input[1] = $stitch_params
+                input[2] = $reference
+                input[3] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.input.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.rdata.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.plots.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.vcf.collect{ file(it[1]).name },
+                    process.out.versions
+                ).match() }
+            )
+        }
+    }
+
+    test("test_with_seed") {
+        when {
+            process {
+                """
+                filelist = Channel.fromPath( $crams_val )
+                    .map { it[-1] as String } // get only filename
+                    .collectFile( name: "cramlist.txt", newLine: true, sort: true )
+                input[0] = Channel.of( $reads_cram ).combine( filelist )
+                input[1] = $stitch_params
+                input[2] = $reference
+                input[3] = $seed
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.input.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.rdata.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.plots.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.vcf.collect{ file(it[1]).name },
+                    process.out.versions
+                ).match() }
+            )
+        }
+    }
+
+    test("test_two_stage_imputation") {
+        setup {
+            run ("STITCH", alias: "STITCH_GENERATE_INPUTS") {
+                script "../main.nf"
+                config "./stitch_generate_input.config"
+                process {
+                """
+                filelist = Channel.fromPath( $crams_val )
+                    .map { it[-1] as String } // get only filename
+                    .collectFile( name: "cramlist.txt", newLine: true, sort: true )
+                input[0] = Channel.of( $reads_cram ).combine( filelist )
+                input[1] = $stitch_params
+                input[2] = $reference
+                input[3] = $seed
+                """
+                }
+            }
+        }
+
+        when {
+            config "./stitch_impute_only.config"
+            process {
+                """
+                ch_input_2step = Channel.of( $stitch_params )
+                    .map {
+                        meta, positions, target, rdata, chromosome_name, K, nGen ->
+                        [ meta, positions ]
+                    }
+                    .combine(
+                        STITCH_GENERATE_INPUTS.out.input
+                        .join ( STITCH_GENERATE_INPUTS.out.rdata )
+                    )
+                    .map {
+                        meta, positions, metaT, target, rdata ->
+                        [ metaT, positions, target, rdata, "chr21", 2, 1 ]
+                    }
+                input[0] = [[id: null], [], [], []]
+                input[1] = ch_input_2step
+                input[2] = [[id: null], [], []]
+                input[3] = $seed
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.input.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.rdata.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.plots.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.vcf.collect{ file(it[1]).name },
+                    process.out.versions
+                ).match() }
+            )
+        }
+    }
+
+    test("test_with_bam") {
+        when {
+            process {
+                """
+                filelist = Channel.fromPath( $bams_val )
+                    .map { it[-1] as String } // get only filename
+                    .collectFile( name: "cramlist.txt", newLine: true, sort: true )
+                input[0] = Channel.of( $reads_bam ).combine( filelist )
+                input[1] = $stitch_params
+                input[2] = $reference
+                input[3] = $seed
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.input.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.rdata.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.plots.collect{ file(it[1]).listFiles().sort().name },
+                    process.out.vcf.collect{ file(it[1]).name },
+                    process.out.versions
+                ).match() }
+            )
+        }
+    }
+
+}