From 878f96a330ff7c2aa97b399acc160a24553058e9 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 12 Dec 2024 14:00:33 +0000 Subject: [PATCH] Generic UMI handling subworkflow (#7208) * Add starting files * bam_dedup_umi working * undo mistaken file change * Add swf test config * Add snapshot --- subworkflows/nf-core/bam_dedup_umi/main.nf | 131 +++++++++++++ subworkflows/nf-core/bam_dedup_umi/meta.yml | 181 ++++++++++++++++++ .../nf-core/bam_dedup_umi/tests/main.nf.test | 99 ++++++++++ .../bam_dedup_umi/tests/main.nf.test.snap | 109 +++++++++++ .../bam_dedup_umi/tests/nextflow.config | 38 ++++ 5 files changed, 558 insertions(+) create mode 100644 subworkflows/nf-core/bam_dedup_umi/main.nf create mode 100644 subworkflows/nf-core/bam_dedup_umi/meta.yml create mode 100644 subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test create mode 100644 subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test.snap create mode 100644 subworkflows/nf-core/bam_dedup_umi/tests/nextflow.config diff --git a/subworkflows/nf-core/bam_dedup_umi/main.nf b/subworkflows/nf-core/bam_dedup_umi/main.nf new file mode 100644 index 00000000000..fcaa6f2ea79 --- /dev/null +++ b/subworkflows/nf-core/bam_dedup_umi/main.nf @@ -0,0 +1,131 @@ +// +// BAM deduplication with UMI processing +// + +include { BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE as BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umicollapse' +include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umitools' +include { BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE as BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umicollapse' +include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umitools' +include { BAM_SORT_STATS_SAMTOOLS } from '../../../subworkflows/nf-core/bam_sort_stats_samtools' +include { UMITOOLS_PREPAREFORRSEM } from '../../../modules/nf-core/umitools/prepareforrsem' +include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' + +workflow BAM_DEDUP_UMI { + take: + ch_genome_bam // channel: [ val(meta), path(bam), path(bai) ] + ch_fasta // channel: [ val(meta), path(fasta) ] + umi_dedup_tool // string: 'umicollapse' or 'umitools' + umitools_dedup_stats // boolean: whether to generate UMI-tools dedup stats + bam_csi_index // boolean: whether to generate CSI index + ch_transcriptome_bam // channel: [ val(meta), path(bam) ] + ch_transcript_fasta // channel: [ val(meta), path(fasta) ] + + main: + ch_versions = Channel.empty() + + if (umi_dedup_tool != "umicollapse" && umi_dedup_tool != "umitools"){ + error("Unknown umi_dedup_tool '${umi_dedup_tool}'") + } + + // Genome BAM deduplication + if (umi_dedup_tool == "umicollapse") { + BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME ( + ch_genome_bam + ) + UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME + ch_dedup_log = UMI_DEDUP_GENOME.out.dedup_stats + + } else if (umi_dedup_tool == "umitools") { + BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME ( + ch_genome_bam, + umitools_dedup_stats + ) + UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME + ch_dedup_log = UMI_DEDUP_GENOME.out.deduplog + } + + // Co-ordinate sort, index and run stats on transcriptome BAM. This takes + // some preparation- we have to coordinate sort the BAM, run the + // deduplication, then restore name sorting and run a script from umitools + // to prepare for rsem or salmon + + // 1. Coordinate sort + + BAM_SORT_STATS_SAMTOOLS ( + ch_transcriptome_bam, + ch_transcript_fasta + ) + ch_sorted_transcriptome_bam = BAM_SORT_STATS_SAMTOOLS.out.bam + .join(BAM_SORT_STATS_SAMTOOLS.out.bai) + + // 2. Transcriptome BAM deduplication + if (umi_dedup_tool == "umicollapse") { + BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME ( + ch_sorted_transcriptome_bam + ) + UMI_DEDUP_TRANSCRIPTOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME + ch_dedup_log = ch_dedup_log.mix(UMI_DEDUP_GENOME.out.dedup_stats) + + } else if (umi_dedup_tool == "umitools") { + BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME ( + ch_sorted_transcriptome_bam, + umitools_dedup_stats + ) + UMI_DEDUP_TRANSCRIPTOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME + ch_dedup_log = ch_dedup_log.mix(UMI_DEDUP_GENOME.out.deduplog) + } + + // 3. Restore name sorting + SAMTOOLS_SORT ( + UMI_DEDUP_TRANSCRIPTOME.out.bam, + ch_fasta + ) + + // 4. Run prepare_for_rsem.py on paired-end BAM files + // This fixes paired-end reads in name sorted BAM files + // See: https://github.com/nf-core/rnaseq/issues/828 + ended_transcriptome_dedup_bam = SAMTOOLS_SORT.out.bam + .branch { + meta, bam -> + single_end: meta.single_end + return [ meta, bam ] + paired_end: !meta.single_end + return [ meta, bam ] + } + + UMITOOLS_PREPAREFORRSEM ( + ended_transcriptome_dedup_bam.paired_end + .map { meta, bam -> [ meta, bam, [] ] } + ) + + ch_dedup_transcriptome_bam = ended_transcriptome_dedup_bam.single_end + .mix(UMITOOLS_PREPAREFORRSEM.out.bam) + + // Collect files useful for MultiQC into one helpful emission. Don't + // automatically add transcriptome stats- difficult to separate in multiqc + // without a bit more work + + ch_multiqc_files = ch_dedup_log + .mix(UMI_DEDUP_GENOME.out.stats) + .mix(UMI_DEDUP_GENOME.out.flagstat) + .mix(UMI_DEDUP_GENOME.out.idxstats) + .transpose() + .map{it[1]} + + // Record versions + + ch_versions = UMI_DEDUP_GENOME.out.versions + .mix(BAM_SORT_STATS_SAMTOOLS.out.versions) + .mix(UMITOOLS_PREPAREFORRSEM.out.versions) + + emit: + bam = UMI_DEDUP_GENOME.out.bam // channel: [ val(meta), path(bam) ] + bai = bam_csi_index ? UMI_DEDUP_GENOME.out.csi : UMI_DEDUP_GENOME.out.bai // channel: [ val(meta), path(bai) ] + dedup_log = ch_dedup_log // channel: [ val(meta), path(log) ] + stats = UMI_DEDUP_GENOME.out.stats.mix(UMI_DEDUP_TRANSCRIPTOME.out.stats) // channel: [ val(meta), path(stats)] + flagstat = UMI_DEDUP_GENOME.out.flagstat.mix(UMI_DEDUP_TRANSCRIPTOME.out.flagstat) // channel: [ val(meta), path(flagstat)] + idxstats = UMI_DEDUP_GENOME.out.idxstats.mix(UMI_DEDUP_TRANSCRIPTOME.out.idxstats) // channel: [ val(meta), path(idxstats)] + multiqc_files = ch_multiqc_files // channel: file + transcriptome_bam = ch_dedup_transcriptome_bam // channel: [ val(meta), path(bam) ] + versions = ch_versions // channel: [ path(versions.yml) ] +} diff --git a/subworkflows/nf-core/bam_dedup_umi/meta.yml b/subworkflows/nf-core/bam_dedup_umi/meta.yml new file mode 100644 index 00000000000..f88fa2f1c97 --- /dev/null +++ b/subworkflows/nf-core/bam_dedup_umi/meta.yml @@ -0,0 +1,181 @@ +name: "bam_dedup_umi" +description: BAM deduplication with UMI processing for both genome and transcriptome alignments +keywords: + - deduplication + - UMI + - BAM + - genome + - transcriptome + - umicollapse + - umitools + +components: + - umitools/prepareforrsem + - samtools/sort + - bam_dedup_stats_samtools_umicollapse + - bam_dedup_stats_samtools_umitools + - bam_sort_stats_samtools + +input: + - ch_genome_bam: + description: Channel with genome BAM files + structure: + - meta: + type: map + description: Metadata map + - bam: + type: file + description: BAM file + pattern: "*.bam" + - bai: + type: file + description: BAM index file + pattern: "*.bai" + - ch_fasta: + description: Channel with genome FASTA file + structure: + - meta: + type: map + description: Metadata map + - fasta: + type: file + description: Genome FASTA file + pattern: "*.{fa,fasta}" + - umi_dedup_tool: + description: UMI deduplication tool to use + structure: + - value: + type: string + description: Either 'umicollapse' or 'umitools' + - umitools_dedup_stats: + description: Whether to generate UMI-tools deduplication stats + structure: + - value: + type: boolean + description: True or False + - bam_csi_index: + description: Whether to generate CSI index + structure: + - value: + type: boolean + description: True or False + - ch_transcriptome_bam: + description: Channel with transcriptome BAM files + structure: + - meta: + type: map + description: Metadata map + - bam: + type: file + description: BAM file + pattern: "*.bam" + - ch_transcript_fasta: + description: Channel with transcript FASTA file + structure: + - meta: + type: map + description: Metadata map + - fasta: + type: file + description: Transcript FASTA file + pattern: "*.{fa,fasta}" + +output: + - bam: + description: Channel containing deduplicated genome BAM files + structure: + - meta: + type: map + description: Metadata map + - bam: + type: file + description: Deduplicated BAM file + pattern: "*.bam" + - bai: + description: Channel containing indexed BAM (BAI) files + structure: + - meta: + type: map + description: Metadata map + - bai: + type: file + description: BAM index file + pattern: "*.bai" + - csi: + description: Channel containing CSI files (if bam_csi_index is true) + structure: + - meta: + type: map + description: Metadata map + - csi: + type: file + description: CSI index file + pattern: "*.csi" + - dedup_log: + description: Channel containing deduplication log files + structure: + - meta: + type: map + description: Metadata map + - log: + type: file + description: Deduplication log file + pattern: "*.log" + - stats: + description: Channel containing BAM statistics files + structure: + - meta: + type: map + description: Metadata map + - stats: + type: file + description: BAM statistics file + pattern: "*.stats" + - flagstat: + description: Channel containing flagstat files + structure: + - meta: + type: map + description: Metadata map + - flagstat: + type: file + description: Flagstat file + pattern: "*.flagstat" + - idxstats: + description: Channel containing idxstats files + structure: + - meta: + type: map + description: Metadata map + - idxstats: + type: file + description: Idxstats file + pattern: "*.idxstats" + - multiqc_files: + description: Channel containing files for MultiQC + structure: + - file: + type: file + description: File for MultiQC + - transcriptome_bam: + description: Channel containing deduplicated transcriptome BAM files + structure: + - meta: + type: map + description: Metadata map + - bam: + type: file + description: Deduplicated transcriptome BAM file + pattern: "*.bam" + - versions: + description: Channel containing software versions file + structure: + - versions: + type: file + description: File containing versions of the software used + pattern: "versions.yml" + +authors: + - "@pinin4fjords" +maintainers: + - "@pinin4fjords" diff --git a/subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test b/subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test new file mode 100644 index 00000000000..b8e9da9abda --- /dev/null +++ b/subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test @@ -0,0 +1,99 @@ +nextflow_workflow { + + name "Test Workflow BAM_DEDUP_UMI" + script "../main.nf" + workflow "BAM_DEDUP_UMI" + config "./nextflow.config" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/bam_dedup_umi" + tag "bam_dedup_stats_samtools_umicollapse" + tag "bam_dedup_stats_samtools_umitools" + tag "bam_sort_stats_samtools" + tag "umitools/prepareforrsem" + tag "samtools/sort" + + test("sarscov2_bam_bai - umitools - with transcriptome bams") { + + when { + workflow { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam.bai', checkIfExists: true) + ] + input[1] = [ + [id:'genome'], + file(params.modules_testdata_base_path + '/genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = 'umitools' + input[3] = false + input[4] = false + input[5] = [ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam', checkIfExists: true) ] + input[6] = [ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/transcriptome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + bam(workflow.out.bam[0][1]).getReadsMD5(), + bam(workflow.out.transcriptome_bam[0][1]).getReadsMD5(), + workflow.out.stats, + workflow.out.flagstat, + workflow.out.idxstats, + workflow.out.versions + ).match() }, + { assert path(workflow.out.bai.get(0).get(1)).exists() } + ) + } + + } + + test("sarscov2_bam_bai - umicollapse - no transcriptome bams") { + + when { + workflow { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam.bai', checkIfExists: true) + ] + input[1] = [ + [id:'genome'], + file(params.modules_testdata_base_path + '/genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = 'umicollapse' + input[3] = false + input[4] = false + input[5] = Channel.empty() + input[6] = Channel.empty() + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + bam(workflow.out.bam[0][1]).getReadsMD5(), + workflow.out.stats, + workflow.out.flagstat, + workflow.out.idxstats, + workflow.out.versions + ).match() }, + { assert path(workflow.out.bai.get(0).get(1)).exists() } + ) + } + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test.snap b/subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test.snap new file mode 100644 index 00000000000..94151f04bb4 --- /dev/null +++ b/subworkflows/nf-core/bam_dedup_umi/tests/main.nf.test.snap @@ -0,0 +1,109 @@ +{ + "sarscov2_bam_bai - umicollapse - no transcriptome bams": { + "content": [ + "c1917631c47d16320d002b867e226a2e", + [ + [ + { + "id": "test" + }, + "test.umi_dedup.genome.sorted.bam.stats:md5,a03e635b85a846a2650c7b747926c7ed" + ] + ], + [ + [ + { + "id": "test" + }, + "test.umi_dedup.genome.sorted.bam.flagstat:md5,18d602435a02a4d721b78d1812622159" + ] + ], + [ + [ + { + "id": "test" + }, + "test.umi_dedup.genome.sorted.bam.idxstats:md5,85d20a901eef23ca50c323638a2eb602" + ] + ], + [ + "versions.yml:md5,3fe6727d1a90130a932400464233ab5d", + "versions.yml:md5,5d91176bd409dc54816e7e3a7773b5ef", + "versions.yml:md5,7eaf0c81f627dde8e2710a57e19b4d87", + "versions.yml:md5,d6bba04463b79564f053e7e5033025dd", + "versions.yml:md5,f90574f645fd8876ba52b89b2272afc1" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-12T13:04:25.48875" + }, + "sarscov2_bam_bai - umitools - with transcriptome bams": { + "content": [ + "c1917631c47d16320d002b867e226a2e", + "fe38ce657d4208a850702bdbfaa062ca", + [ + [ + { + "id": "test" + }, + "test.umi_dedup.genome.sorted.bam.stats:md5,a03e635b85a846a2650c7b747926c7ed" + ], + [ + { + "id": "test" + }, + "test.umi_dedup.transcriptome.sorted.bam.stats:md5,720f03daf649606a50c2cccbf135edc5" + ] + ], + [ + [ + { + "id": "test" + }, + "test.umi_dedup.genome.sorted.bam.flagstat:md5,18d602435a02a4d721b78d1812622159" + ], + [ + { + "id": "test" + }, + "test.umi_dedup.transcriptome.sorted.bam.flagstat:md5,18d602435a02a4d721b78d1812622159" + ] + ], + [ + [ + { + "id": "test" + }, + "test.umi_dedup.genome.sorted.bam.idxstats:md5,85d20a901eef23ca50c323638a2eb602" + ], + [ + { + "id": "test" + }, + "test.umi_dedup.transcriptome.sorted.bam.idxstats:md5,85d20a901eef23ca50c323638a2eb602" + ] + ], + [ + "versions.yml:md5,017dce06e6fadba312ce46808e2b19ff", + "versions.yml:md5,0ca2bd583dd1951c792b526f768db343", + "versions.yml:md5,1e761d924c3e0f35284799aa566e7c1a", + "versions.yml:md5,43e34c4518b8d546b632b372bc282ac4", + "versions.yml:md5,9d7724974d6dc288b485b5509c89297b", + "versions.yml:md5,a4562f4cc5d04041a79ba57761e231d9", + "versions.yml:md5,acfef82b716e50563fb22f47f33de00a", + "versions.yml:md5,afd68b9511aecba9cc5d6f7dd4c34bbd", + "versions.yml:md5,c2fda58ea2b120a510502b9db24d8a16", + "versions.yml:md5,d531cd7e744b2f95659bdb544f2cd8a5", + "versions.yml:md5,e22174a7d3003395e8e8c34ebfba1719" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-12T13:04:12.182087" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_dedup_umi/tests/nextflow.config b/subworkflows/nf-core/bam_dedup_umi/tests/nextflow.config new file mode 100644 index 00000000000..cd10b5c785a --- /dev/null +++ b/subworkflows/nf-core/bam_dedup_umi/tests/nextflow.config @@ -0,0 +1,38 @@ +process { + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMI(COLLAPSE|TOOLS)_GENOME:UMI(COLLAPSE|TOOLS_DEDUP)' { + ext.prefix = { "${meta.id}_umi_dedup.genome" } + } + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMI(COLLAPSE|TOOLS)_TRANSCRIPTOME:UMI(COLLAPSE|TOOLS_DEDUP)' { + ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted" } + } + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMI(COLLAPSE|TOOLS)_GENOME:UMI(COLLAPSE|TOOLS_DEDUP)' { + ext.prefix = { "${meta.id}.umi_dedup.genome.sorted" } + } + + withName: 'UMITOOLS_PREPAREFORRSEM' { + ext.prefix = { "${meta.id}_preparedforrsem" } + } + + withName: 'BAM_DEDUP_UMI:SAMTOOLS_SORT' { + ext.args = '-n' + ext.prefix = { "${meta.id}.umi_dedup.transcriptome" } + } + + withName: 'BAM_DEDUP_UMI:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}.transcriptome.sorted" } + } + + withName: 'BAM_DEDUP_UMI:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.transcriptome.sorted.bam" } + } + + withName: 'BAM_DEDUP_UMI:BAM_DEDUP_STATS_SAMTOOLS_UMI(COLLAPSE|TOOLS)_TRANSCRIPTOME:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted.bam" } + } + + withName: 'BAM_DEDUP_UMI:BAM_DEDUP_STATS_SAMTOOLS_UMI(COLLAPSE|TOOLS)_GENOME:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.umi_dedup.genome.sorted.bam" } + } +}