From 53a97948affb07725328a1bba2fec79b331374a6 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jonathan.manning@seqera.io>
Date: Sat, 24 Feb 2024 21:38:33 +0000
Subject: [PATCH] Reorder rnaseq preprocessing, fix minor issues, test
 sortmerna (#4982)

* Trimming should come first in preprocessing

* Update tests to run sortmerna

* sortmerna working in subworkflow

* Don't need test data updates

* Appease eclint
---
 .../nf-core/preprocess_rnaseq/main.nf         | 52 ++++++++++---------
 .../preprocess_rnaseq/tests/main.nf.test      | 28 +++++++---
 .../preprocess_rnaseq/tests/main.nf.test.snap | 20 +++----
 3 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/subworkflows/nf-core/preprocess_rnaseq/main.nf b/subworkflows/nf-core/preprocess_rnaseq/main.nf
index 4ea083a219e..8e0b7b0d9e2 100644
--- a/subworkflows/nf-core/preprocess_rnaseq/main.nf
+++ b/subworkflows/nf-core/preprocess_rnaseq/main.nf
@@ -88,26 +88,6 @@ workflow PREPROCESS_RNASEQ {
 
     ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first().ifEmpty(null))
 
-    //
-    // MODULE: Remove ribosomal RNA reads
-    //
-    if (remove_ribo_rna) {
-        ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines())
-            .map { row -> file(row, checkIfExists: true) }
-            .collect()
-
-        SORTMERNA (
-            ch_filtered_reads,
-            ch_sortmerna_fastas
-        )
-        .reads
-        .set { ch_filtered_reads }
-
-        ch_multiqc_files = ch_multiqc_files.mix(SORTMERNA.out.log.map{ it[1] })
-
-        ch_versions = ch_versions.mix(SORTMERNA.out.versions.first())
-    }
-
     //
     // SUBWORKFLOW: Read QC, extract UMI and trim adapters with TrimGalore!
     //
@@ -128,7 +108,6 @@ workflow PREPROCESS_RNASEQ {
         ch_multiqc_files = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip
             .mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_zip)
             .mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log)
-            .map{ it[1] }
             .mix(ch_multiqc_files)
     }
 
@@ -155,7 +134,6 @@ workflow PREPROCESS_RNASEQ {
         ch_multiqc_files = FASTQ_FASTQC_UMITOOLS_FASTP.out.fastqc_raw_zip
             .mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.fastqc_trim_zip)
             .mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.trim_json.map{tuple(it[0], [it[1]])})
-            .map{ it[1] }
             .mix(ch_multiqc_files)
     }
 
@@ -196,11 +174,35 @@ workflow PREPROCESS_RNASEQ {
             [ [], [] ],
             false
         )
-        .primary_fastq
-        .set { ch_filtered_reads }
+
+        BBMAP_BBSPLIT.out.primary_fastq
+            .set { ch_filtered_reads }
+
         ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions.first())
     }
 
+    //
+    // MODULE: Remove ribosomal RNA reads
+    //
+    if (remove_ribo_rna) {
+        ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines())
+            .map { row -> file(row, checkIfExists: true) }
+            .collect()
+
+        SORTMERNA (
+            ch_filtered_reads,
+            ch_sortmerna_fastas
+        )
+
+        SORTMERNA.out.reads
+            .set { ch_filtered_reads }
+
+        ch_multiqc_files = ch_multiqc_files
+            .mix(SORTMERNA.out.log)
+
+        ch_versions = ch_versions.mix(SORTMERNA.out.versions.first())
+    }
+
     // Branch FastQ channels if 'auto' specified to infer strandedness
     ch_filtered_reads
         .branch {
@@ -248,7 +250,7 @@ workflow PREPROCESS_RNASEQ {
     reads           = ch_strand_inferred_fastq
     trim_read_count = ch_trim_read_count
 
-    multiqc_files   = ch_multiqc_files
+    multiqc_files   = ch_multiqc_files.transpose().map{it[1]}
     versions        = ch_versions                     // channel: [ versions.yml ]
 }
 
diff --git a/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test b/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test
index 78c0ce74a7c..1b49859b151 100644
--- a/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test
+++ b/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test
@@ -18,24 +18,31 @@ nextflow_workflow {
     tag "subworkflows/fastq_fastqc_umitools_fastp"
     tag "subworkflows/fastq_subsample_fq_salmon"
 
+
+
     test("homo_sapiens paired-end [fastq] fastp") {
 
         when {
             workflow {
                 """
-                input[0] = Channel.of([
+                ch_reads = Channel.of([
                     [ id:'test', single_end:false, strandedness:'auto' ], // meta map
                     [
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true),
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true)
                     ]
-                ]) // ch_reads
+                ])
+
+                ch_ribo_db = file('ribo_db.txt')
+                ch_ribo_db.append('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta')
+
+                input[0] = ch_reads
                 input[1] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)) // ch_fasta
                 input[2] = Channel.of(file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/transcriptome.fasta", checkIfExists: true)) // ch_transcript_fasta
                 input[3] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // ch_gtf
                 input[4] = []              // ch_salmon_index
                 input[5] = []              // ch_bbsplit_index
-                input[6] = []              // ch_ribo_db
+                input[6] = ch_ribo_db      // ch_ribo_db
                 input[7] = true            // skip_bbsplit
                 input[8] = false           // skip_fastqc
                 input[9] = false           // skip_trimming
@@ -44,7 +51,7 @@ nextflow_workflow {
                 input[12] = 'fastp'        // trimmer
                 input[13] = 10             // min_trimmed_reads
                 input[14] = true           // save_trimmed
-                input[15] = false          // remove_ribo_rna
+                input[15] = true           // remove_ribo_rna
                 input[16] = false          // with_umi
                 input[17] = 0              // umi_discard_read
                 """
@@ -72,19 +79,24 @@ nextflow_workflow {
         when {
             workflow {
                 """
-                input[0] = Channel.of([
+                ch_reads = Channel.of([
                     [ id:'test', single_end:false, strandedness:'auto' ], // meta map
                     [
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true),
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true)
                     ]
-                ]) // ch_reads
+                ])
+
+                ch_ribo_db = file('ribo_db.txt')
+                ch_ribo_db.append('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta')
+
+                input[0] = ch_reads
                 input[1] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)) // ch_fasta
                 input[2] = Channel.of(file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/transcriptome.fasta", checkIfExists: true)) // ch_transcript_fasta
                 input[3] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // ch_gtf
                 input[4] = []              // ch_salmon_index
                 input[5] = []              // ch_bbsplit_index
-                input[6] = []              // ch_ribo_db
+                input[6] = ch_ribo_db      // ch_ribo_db
                 input[7] = true            // skip_bbsplit
                 input[8] = false           // skip_fastqc
                 input[9] = false           // skip_trimming
@@ -93,7 +105,7 @@ nextflow_workflow {
                 input[12] = 'fastp'        // trimmer
                 input[13] = 10             // min_trimmed_reads
                 input[14] = true           // save_trimmed
-                input[15] = false          // remove_ribo_rna
+                input[15] = true           // remove_ribo_rna
                 input[16] = false          // with_umi
                 input[17] = 0              // umi_discard_read
                 """
diff --git a/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test.snap b/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test.snap
index 691e07ac0b2..21bdb2492e3 100644
--- a/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test.snap
+++ b/subworkflows/nf-core/preprocess_rnaseq/tests/main.nf.test.snap
@@ -16,7 +16,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:05:51.060371"
+        "timestamp": "2024-02-24T16:19:13.057802"
     },
     "trimgalore_test_pe_reads_2_lines": {
         "content": "eccf3e9e74589ff01c77fce7f4548e41",
@@ -24,7 +24,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:16:44.427598"
+        "timestamp": "2024-02-24T17:44:07.667653"
     },
     "fastp_test_pe_reads_1_size": {
         "content": [
@@ -34,7 +34,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:05:51.019935"
+        "timestamp": "2024-02-24T17:43:46.173892"
     },
     "trimgalore_test_pe_reads_1_size": {
         "content": [
@@ -44,7 +44,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:16:44.398923"
+        "timestamp": "2024-02-24T17:44:07.642318"
     },
     "trimgalore_test_pe_reads_1_lines": {
         "content": "3868fc1caf09367141d2bbf47e158823",
@@ -52,7 +52,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:16:44.395858"
+        "timestamp": "2024-02-24T17:44:07.641186"
     },
     "fastp_test_pe_reads_2_lines": {
         "content": "eccf3e9e74589ff01c77fce7f4548e41",
@@ -60,7 +60,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:05:51.05632"
+        "timestamp": "2024-02-24T17:43:46.235022"
     },
     "fastp_test_pe_reads_2_size": {
         "content": [
@@ -70,7 +70,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:05:51.058326"
+        "timestamp": "2024-02-24T17:43:46.242006"
     },
     "trimgalore_test_pe_reads_2_size": {
         "content": [
@@ -80,7 +80,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:16:44.430226"
+        "timestamp": "2024-02-24T17:44:07.668644"
     },
     "fastp_test_pe_reads_1_lines": {
         "content": "3868fc1caf09367141d2bbf47e158823",
@@ -88,7 +88,7 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:05:51.015562"
+        "timestamp": "2024-02-24T17:43:46.161535"
     },
     "trimgalore_read_count": {
         "content": [
@@ -107,6 +107,6 @@
             "nf-test": "0.8.4",
             "nextflow": "23.10.1"
         },
-        "timestamp": "2024-02-07T11:16:44.432645"
+        "timestamp": "2024-02-24T17:44:07.669435"
     }
 }
\ No newline at end of file