Skip to content

Commit

Permalink
Generic UMI handling subworkflow (#7208)
Browse files Browse the repository at this point in the history
* Add starting files

* bam_dedup_umi working

* undo mistaken file change

* Add swf test config

* Add snapshot
  • Loading branch information
pinin4fjords authored Dec 12, 2024
1 parent 9a19690 commit 878f96a
Show file tree
Hide file tree
Showing 5 changed files with 558 additions and 0 deletions.
131 changes: 131 additions & 0 deletions subworkflows/nf-core/bam_dedup_umi/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
//
// BAM deduplication with UMI processing
//

include { BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE as BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umicollapse'
include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umitools'
include { BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE as BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umicollapse'
include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umitools'
include { BAM_SORT_STATS_SAMTOOLS } from '../../../subworkflows/nf-core/bam_sort_stats_samtools'
include { UMITOOLS_PREPAREFORRSEM } from '../../../modules/nf-core/umitools/prepareforrsem'
include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main'

// UMI-aware deduplication of a genome BAM and a transcriptome BAM for the same
// samples, using either UMICollapse or UMI-tools (selected by `umi_dedup_tool`).
//
// Genome BAMs are deduplicated directly. Transcriptome BAMs are first
// coordinate sorted and indexed, then deduplicated, then re-sorted with
// SAMTOOLS_SORT and (for paired-end samples only) passed through
// UMITOOLS_PREPAREFORRSEM.
workflow BAM_DEDUP_UMI {
    take:
    ch_genome_bam         // channel: [ val(meta), path(bam), path(bai) ]
    ch_fasta              // channel: [ val(meta), path(fasta) ]
    umi_dedup_tool        // string: 'umicollapse' or 'umitools'
    umitools_dedup_stats  // boolean: whether to generate UMI-tools dedup stats
    bam_csi_index         // boolean: whether to generate CSI index
    ch_transcriptome_bam  // channel: [ val(meta), path(bam) ]
    ch_transcript_fasta   // channel: [ val(meta), path(fasta) ]

    main:

    // Fail fast on an unsupported tool name so the branches below can assume
    // one of the two known values.
    if (umi_dedup_tool != "umicollapse" && umi_dedup_tool != "umitools"){
        error("Unknown umi_dedup_tool '${umi_dedup_tool}'")
    }

    // Genome BAM deduplication. The two dedup subworkflows name their log
    // output differently (dedup_stats vs deduplog), so the log channel is
    // picked up inside each branch.
    if (umi_dedup_tool == "umicollapse") {
        BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME (
            ch_genome_bam
        )
        UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME
        ch_dedup_log = UMI_DEDUP_GENOME.out.dedup_stats

    } else if (umi_dedup_tool == "umitools") {
        BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME (
            ch_genome_bam,
            umitools_dedup_stats
        )
        UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME
        ch_dedup_log = UMI_DEDUP_GENOME.out.deduplog
    }

    // Transcriptome BAM deduplication takes some preparation: coordinate sort
    // and index the BAM, run the deduplication, then restore name sorting and
    // run a script from umitools to prepare for rsem or salmon.

    // 1. Coordinate sort, index and run stats
    BAM_SORT_STATS_SAMTOOLS (
        ch_transcriptome_bam,
        ch_transcript_fasta
    )
    ch_sorted_transcriptome_bam = BAM_SORT_STATS_SAMTOOLS.out.bam
        .join(BAM_SORT_STATS_SAMTOOLS.out.bai)

    // 2. Transcriptome BAM deduplication. The transcriptome log is appended to
    //    the genome log so that `dedup_log` carries both.
    //    NOTE(review): ch_dedup_log also feeds multiqc_files below; genome and
    //    transcriptome logs presumably need distinct file prefixes to avoid
    //    name clashes downstream -- confirm in the module configuration.
    if (umi_dedup_tool == "umicollapse") {
        BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME (
            ch_sorted_transcriptome_bam
        )
        UMI_DEDUP_TRANSCRIPTOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME
        ch_dedup_log = ch_dedup_log.mix(UMI_DEDUP_TRANSCRIPTOME.out.dedup_stats)

    } else if (umi_dedup_tool == "umitools") {
        BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME (
            ch_sorted_transcriptome_bam,
            umitools_dedup_stats
        )
        UMI_DEDUP_TRANSCRIPTOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME
        ch_dedup_log = ch_dedup_log.mix(UMI_DEDUP_TRANSCRIPTOME.out.deduplog)
    }

    // 3. Restore name sorting
    //    NOTE(review): the sort order is not set here; presumably name sorting
    //    is requested through the process configuration -- confirm.
    //    NOTE(review): this passes the genome ch_fasta for a transcriptome
    //    BAM; ch_transcript_fasta may have been intended -- confirm.
    SAMTOOLS_SORT (
        UMI_DEDUP_TRANSCRIPTOME.out.bam,
        ch_fasta
    )

    // 4. Run prepare_for_rsem.py on paired-end BAM files
    //    This fixes paired-end reads in name sorted BAM files
    //    See: https://github.com/nf-core/rnaseq/issues/828
    ended_transcriptome_dedup_bam = SAMTOOLS_SORT.out.bam
        .branch {
            meta, bam ->
                single_end: meta.single_end
                    return [ meta, bam ]
                paired_end: !meta.single_end
                    return [ meta, bam ]
        }

    // The module is given an empty list as the third tuple element.
    UMITOOLS_PREPAREFORRSEM (
        ended_transcriptome_dedup_bam.paired_end
            .map { meta, bam -> [ meta, bam, [] ] }
    )

    // Single-end BAMs bypass prepare_for_rsem and are merged back in unchanged.
    ch_dedup_transcriptome_bam = ended_transcriptome_dedup_bam.single_end
        .mix(UMITOOLS_PREPAREFORRSEM.out.bam)

    // Collect files useful for MultiQC into one helpful emission. Don't
    // automatically add transcriptome stats: they are difficult to separate
    // in MultiQC without a bit more work.
    // transpose() + map drop the meta and flatten to bare files.
    ch_multiqc_files = ch_dedup_log
        .mix(UMI_DEDUP_GENOME.out.stats)
        .mix(UMI_DEDUP_GENOME.out.flagstat)
        .mix(UMI_DEDUP_GENOME.out.idxstats)
        .transpose()
        .map{it[1]}

    // Record versions from every step that runs in this subworkflow,
    // including the transcriptome dedup and the name re-sort.
    ch_versions = UMI_DEDUP_GENOME.out.versions
        .mix(BAM_SORT_STATS_SAMTOOLS.out.versions)
        .mix(UMI_DEDUP_TRANSCRIPTOME.out.versions)
        .mix(SAMTOOLS_SORT.out.versions)
        .mix(UMITOOLS_PREPAREFORRSEM.out.versions)

    emit:
    bam               = UMI_DEDUP_GENOME.out.bam                                                  // channel: [ val(meta), path(bam) ]
    bai               = bam_csi_index ? UMI_DEDUP_GENOME.out.csi : UMI_DEDUP_GENOME.out.bai       // channel: [ val(meta), path(bai) ] (CSI index when bam_csi_index is true)
    dedup_log         = ch_dedup_log                                                              // channel: [ val(meta), path(log) ]
    stats             = UMI_DEDUP_GENOME.out.stats.mix(UMI_DEDUP_TRANSCRIPTOME.out.stats)         // channel: [ val(meta), path(stats)]
    flagstat          = UMI_DEDUP_GENOME.out.flagstat.mix(UMI_DEDUP_TRANSCRIPTOME.out.flagstat)   // channel: [ val(meta), path(flagstat)]
    idxstats          = UMI_DEDUP_GENOME.out.idxstats.mix(UMI_DEDUP_TRANSCRIPTOME.out.idxstats)   // channel: [ val(meta), path(idxstats)]
    multiqc_files     = ch_multiqc_files                                                          // channel: file
    transcriptome_bam = ch_dedup_transcriptome_bam                                                // channel: [ val(meta), path(bam) ]
    versions          = ch_versions                                                               // channel: [ path(versions.yml) ]
}
181 changes: 181 additions & 0 deletions subworkflows/nf-core/bam_dedup_umi/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
name: "bam_dedup_umi"
description: BAM deduplication with UMI processing for both genome and transcriptome alignments
keywords:
- deduplication
- UMI
- BAM
- genome
- transcriptome
- umicollapse
- umitools

components:
- umitools/prepareforrsem
- samtools/sort
- bam_dedup_stats_samtools_umicollapse
- bam_dedup_stats_samtools_umitools
- bam_sort_stats_samtools

input:
- ch_genome_bam:
description: Channel with genome BAM files
structure:
- meta:
type: map
description: Metadata map
- bam:
type: file
description: BAM file
pattern: "*.bam"
- bai:
type: file
description: BAM index file
pattern: "*.bai"
- ch_fasta:
description: Channel with genome FASTA file
structure:
- meta:
type: map
description: Metadata map
- fasta:
type: file
description: Genome FASTA file
pattern: "*.{fa,fasta}"
- umi_dedup_tool:
description: UMI deduplication tool to use
structure:
- value:
type: string
description: Either 'umicollapse' or 'umitools'
- umitools_dedup_stats:
description: Whether to generate UMI-tools deduplication stats
structure:
- value:
type: boolean
description: True or False
- bam_csi_index:
description: Whether to generate CSI index
structure:
- value:
type: boolean
description: True or False
- ch_transcriptome_bam:
description: Channel with transcriptome BAM files
structure:
- meta:
type: map
description: Metadata map
- bam:
type: file
description: BAM file
pattern: "*.bam"
- ch_transcript_fasta:
description: Channel with transcript FASTA file
structure:
- meta:
type: map
description: Metadata map
- fasta:
type: file
description: Transcript FASTA file
pattern: "*.{fa,fasta}"

output:
- bam:
description: Channel containing deduplicated genome BAM files
structure:
- meta:
type: map
description: Metadata map
- bam:
type: file
description: Deduplicated BAM file
pattern: "*.bam"
- bai:
description: Channel containing BAM index files (BAI by default, CSI when bam_csi_index is true)
structure:
- meta:
type: map
description: Metadata map
- bai:
type: file
description: BAM index file
pattern: "*.bai"
- csi:
description: Channel containing CSI files (if bam_csi_index is true)
structure:
- meta:
type: map
description: Metadata map
- csi:
type: file
description: CSI index file
pattern: "*.csi"
- dedup_log:
description: Channel containing deduplication log files
structure:
- meta:
type: map
description: Metadata map
- log:
type: file
description: Deduplication log file
pattern: "*.log"
- stats:
description: Channel containing BAM statistics files
structure:
- meta:
type: map
description: Metadata map
- stats:
type: file
description: BAM statistics file
pattern: "*.stats"
- flagstat:
description: Channel containing flagstat files
structure:
- meta:
type: map
description: Metadata map
- flagstat:
type: file
description: Flagstat file
pattern: "*.flagstat"
- idxstats:
description: Channel containing idxstats files
structure:
- meta:
type: map
description: Metadata map
- idxstats:
type: file
description: Idxstats file
pattern: "*.idxstats"
- multiqc_files:
description: Channel containing files for MultiQC
structure:
- file:
type: file
description: File for MultiQC
- transcriptome_bam:
description: Channel containing deduplicated transcriptome BAM files
structure:
- meta:
type: map
description: Metadata map
- bam:
type: file
description: Deduplicated transcriptome BAM file
pattern: "*.bam"
- versions:
description: Channel containing software versions file
structure:
- versions:
type: file
description: File containing versions of the software used
pattern: "versions.yml"

authors:
- "@pinin4fjords"
maintainers:
- "@pinin4fjords"
Loading

0 comments on commit 878f96a

Please sign in to comment.