Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add dorado #256

Open
wants to merge 12 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@ jobs:
strategy:
matrix:
profiles:
- "test_nodx_vc"
- "test_nodx_stringtie"
- "test_nodx_noaln"
- "test_nodx_rnamod"
- "test_bc_nodx"
- "test_nobc_dx"
- "test_nobc_nodx_vc"
- "test_nobc_nodx_stringtie"
- "test_nobc_nodx_noaln"
- "test_nobc_nodx_rnamod"
steps:
- name: Check out pipeline code
uses: actions/checkout@v3
Expand Down
14 changes: 7 additions & 7 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def check_samplesheet(file_in, updated_path, file_out):
barcode = "barcode%s" % (barcode.zfill(2))

## Check input file extension
nanopolish_fast5 = ""
fast5 = ""
if input_file:
if input_file.find(" ") != -1:
print_error("Input file contains spaces!", "Line", line)
Expand All @@ -115,12 +115,12 @@ def check_samplesheet(file_in, updated_path, file_out):
if updated_path != "not_changed":
input_file = "/".join([updated_path, input_file.split("/")[-1]])
list_dir = os.listdir(input_file)
nanopolish_fast5 = input_file
fast5 = input_file
if not (all(fname.endswith(".fast5") for fname in list_dir)):
if "fast5" in list_dir and "fastq" in list_dir:
nanopolish_fast5 = input_file + "/fast5"
fast5 = input_file + "/fast5"
## CHECK FAST5 DIRECTORY
if not (all(fname.endswith(".fast5") for fname in os.listdir(nanopolish_fast5))):
if not (all(fname.endswith(".fast5") for fname in os.listdir(fast5))):
print_error("fast5 directory contains non-fast5 files.")
## CHECK PROVIDED BASECALLED FASTQ
fastq_path = input_file + "/fastq"
Expand All @@ -139,8 +139,8 @@ def check_samplesheet(file_in, updated_path, file_out):
'{input_file} path does not end with ".fastq.gz", ".fq.gz", or ".bam" and is not an existing directory with correct fast5 and/or fastq inputs.'
)

## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, nanopolish_fast5 ]}}
sample_info = [barcode, input_file, nanopolish_fast5]
## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, fast5 ]}}
sample_info = [barcode, input_file, fast5]
if group not in sample_info_dict:
sample_info_dict[group] = {}
if replicate not in sample_info_dict[group]:
Expand All @@ -161,7 +161,7 @@ def check_samplesheet(file_in, updated_path, file_out):
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "barcode", "reads", "nanopolish_fast5"]) + "\n")
fout.write(",".join(["sample", "barcode", "reads", "fast5"]) + "\n")
for sample in sorted(sample_info_dict.keys()):
## Check that replicate ids are in format 1..<NUM_REPS>
uniq_rep_ids = set(sample_info_dict[sample].keys())
Expand Down
49 changes: 28 additions & 21 deletions conf/test.config
Original file line number Diff line number Diff line change
@@ -1,33 +1,40 @@
/*
* -------------------------------------------------
* Nextflow config file for running tests
* -------------------------------------------------
* Defines bundled input files and everything required
* to run a fast and simple test. Use as follows:
* nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
*/
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources
max_cpus = 2
max_memory = 6.GB
max_time = 12.h
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '12.h'

// Input data to perform demultipexing
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
run_nanolyse = true
protocol = 'DNA'
// Input data to perform both basecalling and demultiplexing
input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
protocol = 'cDNA'
flowcell = 'FLO-MIN106'
kit = 'SQK-DCS109'
barcode_kit = 'NBD103/NBD104'
input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
skip_bigwig = true
skip_bigbed = true
trim_barcodes = true
dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'
dorado_device = 'cpu'
run_nanolyse = true
skip_quantification = true
skip_fusion_analysis= true
skip_modification_analysis=true
aligner = 'graphmap2'

// This variable is just for reference and isn't actually required for the tests
// Files are downloaded and staged using the "GetTestData" process
input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded_multi/'
}
33 changes: 33 additions & 0 deletions conf/test_bc_nodx.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* -------------------------------------------------
* Nextflow config file for running tests
* -------------------------------------------------
* Defines bundled input files and everything required
* to run a fast and simple test. Use as follows:
* nextflow run nf-core/nanoseq -profile test_bc_nodx,<docker/singularity>
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on Travis
max_cpus = 2
max_memory = 6.GB
max_time = 12.h

// Input data to perform basecalling and to skip demultiplexing
input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
protocol = 'cDNA'
flowcell = 'FLO-MIN106'
kit = 'SQK-DCS108'
dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'
dorado_device = 'cpu'
skip_bigbed = true
skip_bigwig = true
skip_demultiplexing = true
skip_quantification = true
skip_fusion_analysis= true
skip_modification_analysis=true
}
33 changes: 33 additions & 0 deletions conf/test_nobc_dx.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* -------------------------------------------------
* Nextflow config file for running tests
* -------------------------------------------------
* Defines bundled input files and everything required
* to run a fast and simple test. Use as follows:
* nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources
max_cpus = 2
max_memory = 6.GB
max_time = 12.h

// Input data to perform demultiplexing
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
skip_basecalling = true
run_nanolyse = true
protocol = 'DNA'
barcode_kit = 'NBD103/NBD104'
input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
skip_bigwig = true
skip_bigbed = true
skip_quantification = true
skip_fusion_analysis= true
skip_modification_analysis=true
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17550000.fa'
gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17500000.gtf'
skip_basecalling = true
protocol = 'directRNA'
skip_demultiplexing = true
skip_alignment = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_rnamod.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.fa'
gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.gtf'
skip_basecalling = true
protocol = 'directRNA'
run_nanolyse = true
skip_bigbed = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ params {
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
protocol = 'directRNA'
skip_basecalling = true
skip_demultiplexing = true
skip_fusion_analysis= true
skip_modification_analysis=true
Expand Down
1 change: 1 addition & 0 deletions conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ params {
// Input data to skip demultiplexing and variant call
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_vc.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
skip_basecalling = true
protocol = 'DNA'
skip_quantification = true
skip_demultiplexing = true
Expand Down
39 changes: 39 additions & 0 deletions conf/test_withpull.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data to perform both basecalling and demultiplexing
input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
protocol = 'cDNA'
flowcell = 'FLO-MIN106'
kit = 'SQK-DCS109'
barcode_kit = 'EXP-NBD103'
trim_barcodes=true
output_demultiplex_fast5 = true
Comment on lines +29 to +30
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Harshil Align™️!

run_nanolyse = true
skip_quantification = true
skip_fusion_analysis= true
skip_modification_analysis=true

// This variable is just for reference and isn't actually required for the tests
// Files are downloaded and staged using the "GetTestData" process
input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded/'
}
29 changes: 29 additions & 0 deletions modules/local/dorado.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
process DORADO {
tag "$meta.id"
label 'process_medium'

container "docker.io/ontresearch/dorado"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to double check, is it OK to use this license-wise?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And would this still work with Singularity?


input:
tuple val(meta), path(pod5_path)
val dorado_device
val dorado_model

output:
tuple val(meta), path("*.fastq.gz") , emit: fastq
path "versions.yml" , emit: versions

script:
"""
dorado download --model $dorado_model
dorado basecaller $dorado_model $pod5_path --device $dorado_device --emit-fastq > basecall.fastq
Comment on lines +18 to +19
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any options a user could theoretically add? Missing ext.args, for example.


cat <<-END_VERSIONS > versions.yml
"${task.process}":
dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//')
END_VERSIONS

gzip basecall.fastq
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably go before the emissions. Also, should the file be forced to be basecall.fastq for downstream purposes? Otherwise I would recommend using the ${prefix}.fastq system

"""
}

27 changes: 27 additions & 0 deletions modules/local/fast5_to_pod5.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
process FAST5_TO_POD5 {
tag "$meta.id"
label 'process_medium'

conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0"
container "docker.io/yuukiiwa/pod5:0.2.4"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could be a biocontainer


input:
tuple val(meta), path(input_path)

output:
tuple val(meta), path("pod5/") , emit: pod5
yuukiiwa marked this conversation as resolved.
Show resolved Hide resolved

when:
task.ext.when == null || task.ext.when

script:
output_name = "pod5/converted.pod5"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this needs to be hardcoded, why not put directly in the command?

"""
pod5 convert fast5 $input_path --output $output_name

cat <<-END_VERSIONS > versions.yml
"${task.process}":
pod5: \$(echo \$(pod5 --version 2>&1) | sed -r 's/..............//')
END_VERSIONS
"""
}
4 changes: 2 additions & 2 deletions modules/local/get_test_data.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ process GET_TEST_DATA {
container "docker.io/yuukiiwa/git:latest"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Huh, interesting, this isn't how we normally retrieve test data with nf-core (either via URLs, or upstream step in the ci.yml), is there a reason why you do it like this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's in the container?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The container contains the git command. This is not required for most nf-core/test-dataset retrieval, but it is required here as I need to stage the entire fast5 directory containing many fast5 files to test the basecalling and RNA modification detection functionalities


output:
path "test-datasets/fast5/$barcoded/*" , emit: ch_input_fast5s_path
path "test-datasets/fast5/$barcoded/" , emit: ch_input_fast5_dir_path
path "test-datasets/modification_fast5_fastq/", emit: ch_input_dir_path
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded" : "barcoded"
barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded_multi" : "barcoded_multi"
yuukiiwa marked this conversation as resolved.
Show resolved Hide resolved
"""
git clone https://github.com/nf-core/test-datasets.git --branch nanoseq --single-branch

Expand Down
2 changes: 1 addition & 1 deletion modules/local/nanopolish_index_eventalign.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ process NANOPOLISH_INDEX_EVENTALIGN {
script:
sample_summary = "$meta.id" +"_summary.txt"
sample_eventalign = "$meta.id" +"_eventalign.txt"
fast5 = "$meta.nanopolish_fast5"
fast5 = "$meta.fast5"
"""
nanopolish index -d $fast5 $fastq
nanopolish eventalign --reads $fastq --bam $bam --genome $genome --scale-events --signal-index --summary $sample_summary --threads $task.cpus > $sample_eventalign
Expand Down
2 changes: 1 addition & 1 deletion modules/local/samplesheet_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ process SAMPLESHEET_CHECK {
task.ext.when == null || task.ext.when

script: // This script is bundled with the pipeline, in nf-core/nanoseq/bin/
updated_path = workflow.profile.contains('test_nodx_rnamod') ? "$input_path" : "not_changed"
updated_path = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "$input_path" : "not_changed"
"""
check_samplesheet.py \\
$samplesheet \\
Expand Down
21 changes: 13 additions & 8 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,18 @@ params {
gtf = null


// Options: Demultiplexing
// Options: Basecalling and Demultiplexing
input_path = null
flowcell = null
kit = null
barcode_kit = null
barcode_both_ends = false
trim_barcodes = false
gpu_device = 'auto'
gpu_cluster_options = null
dorado_model = null
dorado_device = 'cuda:all'
qcat_min_score = 60
qcat_detect_middle = false
skip_basecalling = false
skip_demultiplexing = false

// Options: Raw read cleaning
Expand Down Expand Up @@ -221,12 +224,14 @@ profiles {
executor.cpus = 16
executor.memory = 60.GB
}
test { includeConfig 'conf/test.config' }
test_full { includeConfig 'conf/test_full.config' }
test { includeConfig 'conf/test.config' }
test_nodx_stringtie { includeConfig 'conf/test_nodx_stringtie.config' }
test_nodx_noaln { includeConfig 'conf/test_nodx_noaln.config' }
test_nodx_vc { includeConfig 'conf/test_nodx_vc.config' }
test_nodx_rnamod { includeConfig 'conf/test_nodx_rnamod.config' }
test_bc_nodx { includeConfig 'conf/test_bc_nodx.config' }
test_nobc_dx { includeConfig 'conf/test_nobc_dx.config' }
test_nobc_nodx_stringtie { includeConfig 'conf/test_nobc_nodx_stringtie.config' }
test_nobc_nodx_noaln { includeConfig 'conf/test_nobc_nodx_noaln.config' }
test_nobc_nodx_vc { includeConfig 'conf/test_nobc_nodx_vc.config' }
test_nobc_nodx_rnamod { includeConfig 'conf/test_nobc_nodx_rnamod.config' }
}


Expand Down
Loading
Loading