nf-core · yuukiiwa · Oct 17, 2023 · Oct 17, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -49,10 +49,12 @@ jobs:
  strategy:
  matrix:
  profiles:
- - "test_nodx_vc"
- - "test_nodx_stringtie"
- - "test_nodx_noaln"
- - "test_nodx_rnamod"
+ - "test_bc_nodx"
+ - "test_nobc_dx"
+ - "test_nobc_nodx_vc"
+ - "test_nobc_nodx_stringtie"
+ - "test_nobc_nodx_noaln"
+ - "test_nobc_nodx_rnamod"
  steps:
  - name: Check out pipeline code
  uses: actions/checkout@v3

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -101,7 +101,7 @@ def check_samplesheet(file_in, updated_path, file_out):
  barcode = "barcode%s" % (barcode.zfill(2))
 
  ## Check input file extension
- nanopolish_fast5 = ""
+ fast5 = ""
  if input_file:
  if input_file.find(" ") != -1:
  print_error("Input file contains spaces!", "Line", line)
@@ -115,12 +115,12 @@ def check_samplesheet(file_in, updated_path, file_out):
  if updated_path != "not_changed":
  input_file = "/".join([updated_path, input_file.split("/")[-1]])
  list_dir = os.listdir(input_file)
- nanopolish_fast5 = input_file
+ fast5 = input_file
  if not (all(fname.endswith(".fast5") for fname in list_dir)):
  if "fast5" in list_dir and "fastq" in list_dir:
- nanopolish_fast5 = input_file + "/fast5"
+ fast5 = input_file + "/fast5"
  ## CHECK FAST5 DIRECTORY
- if not (all(fname.endswith(".fast5") for fname in os.listdir(nanopolish_fast5))):
+ if not (all(fname.endswith(".fast5") for fname in os.listdir(fast5))):
  print_error("fast5 directory contains non-fast5 files.")
  ## CHECK PROVIDED BASECALLED FASTQ
  fastq_path = input_file + "/fastq"
@@ -139,8 +139,8 @@ def check_samplesheet(file_in, updated_path, file_out):
  '{input_file} path does not end with ".fastq.gz", ".fq.gz", or ".bam" and is not an existing directory with correct fast5 and/or fastq inputs.'
  )
 
- ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, nanopolish_fast5 ]}}
- sample_info = [barcode, input_file, nanopolish_fast5]
+ ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, fast5 ]}}
+ sample_info = [barcode, input_file, fast5]
  if group not in sample_info_dict:
  sample_info_dict[group] = {}
  if replicate not in sample_info_dict[group]:
@@ -161,7 +161,7 @@ def check_samplesheet(file_in, updated_path, file_out):
  out_dir = os.path.dirname(file_out)
  make_dir(out_dir)
  with open(file_out, "w") as fout:
- fout.write(",".join(["sample", "barcode", "reads", "nanopolish_fast5"]) + "\n")
+ fout.write(",".join(["sample", "barcode", "reads", "fast5"]) + "\n")
  for sample in sorted(sample_info_dict.keys()):
  ## Check that replicate ids are in format 1..<NUM_REPS>
  uniq_rep_ids = set(sample_info_dict[sample].keys())

diff --git a/conf/test.config b/conf/test.config
@@ -1,33 +1,40 @@
 /*
- * -------------------------------------------------
- * Nextflow config file for running tests
- * -------------------------------------------------
- * Defines bundled input files and everything required
- * to run a fast and simple test. Use as follows:
- * nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
- */
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+ nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
 
 params {
  config_profile_name = 'Test profile'
  config_profile_description = 'Minimal test dataset to check pipeline function'
 
- // Limit resources
- max_cpus  = 2
- max_memory  = 6.GB
- max_time  = 12.h
+ // Limit resources so that this can run on GitHub Actions
+ max_cpus = 2
+ max_memory = '6.GB'
+ max_time = '12.h'
 
- // Input data to perform demultipexing
- input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
- fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
- gtf  = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
- run_nanolyse = true
- protocol = 'DNA'
+ // Input data to perform both basecalling and demultiplexing
+ input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+ fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+ protocol = 'cDNA'
+ flowcell  = 'FLO-MIN106'
+ kit  = 'SQK-DCS109'
  barcode_kit = 'NBD103/NBD104'
- input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
- skip_bigwig = true
- skip_bigbed = true
+ trim_barcodes = true
+ dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'
+ dorado_device = 'cpu'
+ run_nanolyse = true
  skip_quantification = true
  skip_fusion_analysis= true
  skip_modification_analysis=true
- aligner = 'graphmap2'
+
+ // This variable is just for reference and isn't actually required for the tests
+ // Files are downloaded and staged using the "GetTestData" process
+ input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded_multi/'
 }
diff --git a/conf/test_bc_nodx.config b/conf/test_bc_nodx.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/nanoseq -profile test_bc_nodx,<docker/singularity>
+ */
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+
+ // Limit resources so that this can run on GitHub Actions
+ max_cpus = 2
+ max_memory = 6.GB
+ max_time = 12.h
+
+ // Input data to perform basecalling and to skip demultiplexing
+ input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv'
+ fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+ protocol = 'cDNA'
+ flowcell = 'FLO-MIN106'
+ kit = 'SQK-DCS108'
+ dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'
+ dorado_device = 'cpu'
+ skip_bigbed = true
+ skip_bigwig = true
+ skip_demultiplexing = true
+ skip_quantification = true
+ skip_fusion_analysis= true
+ skip_modification_analysis=true
+}
diff --git a/conf/test_nobc_dx.config b/conf/test_nobc_dx.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
+ */
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+
+ // Limit resources
+ max_cpus = 2
+ max_memory = 6.GB
+ max_time = 12.h
+
+ // Input data to perform demultiplexing
+ input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
+ fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
+ gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
+ skip_basecalling = true
+ run_nanolyse = true
+ protocol = 'DNA'
+ barcode_kit = 'NBD103/NBD104'
+ input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
+ skip_bigwig = true
+ skip_bigbed = true
+ skip_quantification = true
+ skip_fusion_analysis= true
+ skip_modification_analysis=true
+}
diff --git a/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config b/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config
@@ -20,6 +20,7 @@ params {
  input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln.csv'
  fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17550000.fa'
  gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17500000.gtf'
+ skip_basecalling = true
  protocol = 'directRNA'
  skip_demultiplexing = true
  skip_alignment = true

diff --git a/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config b/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config
@@ -20,6 +20,7 @@ params {
  input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_rnamod.csv'
  fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.fa'
  gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.gtf'
+ skip_basecalling = true
  protocol = 'directRNA'
  run_nanolyse = true
  skip_bigbed = true

diff --git a/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config b/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config
@@ -21,6 +21,7 @@ params {
  fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
  gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
  protocol = 'directRNA'
+ skip_basecalling = true
  skip_demultiplexing = true
  skip_fusion_analysis= true
  skip_modification_analysis=true

diff --git a/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config b/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config
@@ -19,6 +19,7 @@ params {
  // Input data to skip demultiplexing and variant call
  input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_vc.csv'
  fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+ skip_basecalling = true
  protocol = 'DNA'
  skip_quantification = true
  skip_demultiplexing = true

diff --git a/conf/test_withpull.config b/conf/test_withpull.config
@@ -0,0 +1,39 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+ nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+
+ // Limit resources so that this can run on GitHub Actions
+ max_cpus = 2
+ max_memory = '6.GB'
+ max_time = '6.h'
+
+ // Input data to perform both basecalling and demultiplexing
+ input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+ fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+ protocol = 'cDNA'
+ flowcell = 'FLO-MIN106'
+ kit = 'SQK-DCS109'
+ barcode_kit = 'EXP-NBD103'
+ trim_barcodes=true
+ output_demultiplex_fast5 = true
+ run_nanolyse = true
+ skip_quantification = true
+ skip_fusion_analysis= true
+ skip_modification_analysis=true
+
+ // This variable is just for reference and isn't actually required for the tests
+ // Files are downloaded and staged using the "GetTestData" process
+ input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded/'
+}
diff --git a/modules/local/dorado.nf b/modules/local/dorado.nf
@@ -0,0 +1,35 @@
+// Basecall a sample's POD5 input with dorado and emit the reads as a gzipped FASTQ.
+process DORADO {
+ tag "$meta.id"
+ label 'process_medium'
+
+ // NOTE(review): image is not pinned to a tag, so the dorado version can drift between runs - consider pinning
+ container "docker.io/ontresearch/dorado"
+
+ input:
+ tuple val(meta), path(pod5_path)
+ val dorado_device
+ val dorado_model
+
+ output:
+ tuple val(meta), path("*.fastq.gz") , emit: fastq
+ path "versions.yml" , emit: versions
+
+ // Same guard as the other local modules, so the step can be switched off through task.ext.when
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ """
+ dorado download --model $dorado_model
+ dorado basecaller $dorado_model $pod5_path --device $dorado_device --emit-fastq > basecall.fastq
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//')
+ END_VERSIONS
+
+ gzip basecall.fastq
+ """
+}
+
diff --git a/modules/local/fast5_to_pod5.nf b/modules/local/fast5_to_pod5.nf
@@ -0,0 +1,31 @@
+// Convert a sample's FAST5 input into POD5, written to pod5/converted.pod5.
+process FAST5_TO_POD5 {
+ tag "$meta.id"
+ label 'process_medium'
+
+ // NOTE(review): this conda spec lists R/bambu packages and no pod5 - presumably copied from another module; confirm before relying on conda
+ conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0"
+ container "docker.io/yuukiiwa/pod5:0.2.4"
+
+ input:
+ tuple val(meta), path(input_path)
+
+ output:
+ tuple val(meta), path("pod5/") , emit: pod5
+ // The script below writes versions.yml, so it has to be declared here to be collected
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ output_name = "pod5/converted.pod5"
+ """
+ pod5 convert fast5 $input_path --output $output_name
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ pod5: \$(echo \$(pod5 --version 2>&1) | sed -r 's/..............//')
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/get_test_data.nf b/modules/local/get_test_data.nf
@@ -4,15 +4,15 @@ process GET_TEST_DATA {
  container "docker.io/yuukiiwa/git:latest"
 
  output:
- path "test-datasets/fast5/$barcoded/*" , emit: ch_input_fast5s_path
+ path "test-datasets/fast5/$barcoded/"  , emit: ch_input_fast5_dir_path
  path "test-datasets/modification_fast5_fastq/", emit: ch_input_dir_path
  path "versions.yml" , emit: versions
 
  when:
  task.ext.when == null || task.ext.when
 
  script:
- barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded" : "barcoded"
+ barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded_multi" : "barcoded_multi"
  """
  git clone https://github.com/nf-core/test-datasets.git --branch nanoseq --single-branch
 

diff --git a/modules/local/nanopolish_index_eventalign.nf b/modules/local/nanopolish_index_eventalign.nf
@@ -20,7 +20,7 @@ process NANOPOLISH_INDEX_EVENTALIGN {
  script:
  sample_summary = "$meta.id" +"_summary.txt"
  sample_eventalign = "$meta.id" +"_eventalign.txt"
- fast5 = "$meta.nanopolish_fast5"
+ fast5 = "$meta.fast5"
  """
  nanopolish index -d $fast5 $fastq
  nanopolish eventalign --reads $fastq --bam $bam --genome $genome --scale-events --signal-index --summary $sample_summary --threads $task.cpus > $sample_eventalign

diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
@@ -19,7 +19,7 @@ process SAMPLESHEET_CHECK {
  task.ext.when == null || task.ext.when
 
  script: // This script is bundled with the pipeline, in nf-core/nanoseq/bin/
- updated_path = workflow.profile.contains('test_nodx_rnamod') ? "$input_path" : "not_changed"
+ updated_path = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "$input_path" : "not_changed"
  """
  check_samplesheet.py \\
  $samplesheet \\

diff --git a/nextflow.config b/nextflow.config
@@ -19,15 +19,18 @@ params {
  gtf = null
 
 
- // Options: Demultiplexing
+ // Options: Basecalling and Demultiplexing
  input_path = null
+ flowcell = null
+ kit = null
  barcode_kit = null
  barcode_both_ends = false
  trim_barcodes = false
- gpu_device  = 'auto'
- gpu_cluster_options = null
+ dorado_model = null
+ dorado_device  = 'cuda:all'
  qcat_min_score = 60
  qcat_detect_middle = false
+ skip_basecalling = false
  skip_demultiplexing = false
 
  // Options: Raw read cleaning
@@ -221,12 +224,14 @@ profiles {
  executor.cpus = 16
  executor.memory = 60.GB
  }
+ test { includeConfig 'conf/test.config' }
  test_full { includeConfig 'conf/test_full.config' }
- test { includeConfig 'conf/test.config' }
- test_nodx_stringtie { includeConfig 'conf/test_nodx_stringtie.config' }
- test_nodx_noaln { includeConfig 'conf/test_nodx_noaln.config' }
- test_nodx_vc { includeConfig 'conf/test_nodx_vc.config' }
- test_nodx_rnamod { includeConfig 'conf/test_nodx_rnamod.config' }
+ test_bc_nodx { includeConfig 'conf/test_bc_nodx.config' }
+ test_nobc_dx { includeConfig 'conf/test_nobc_dx.config' }
+ test_nobc_nodx_stringtie { includeConfig 'conf/test_nobc_nodx_stringtie.config' }
+ test_nobc_nodx_noaln { includeConfig 'conf/test_nobc_nodx_noaln.config' }
+ test_nobc_nodx_vc { includeConfig 'conf/test_nobc_nodx_vc.config' }
+ test_nobc_nodx_rnamod { includeConfig 'conf/test_nobc_nodx_rnamod.config' }
 }