From dfab8f55bdb4495d8473f85eb197c85af85cec13 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 7 Jul 2023 12:31:21 -0500 Subject: [PATCH 01/11] Update Influenza ref seqs DB to use all Orthomyxoviridae viruses from NCBI FTP site --- .github/workflows/ci.yml | 46 ++- bin/parse_influenza_blast_results.py | 103 +++--- conf/base.config | 5 + conf/modules_illumina.config | 120 ++++--- conf/modules_nanopore.config | 500 ++++++++++++++------------- modules/local/misc.nf | 34 -- modules/local/zstd_decompress.nf | 30 ++ nextflow.config | 4 +- workflows/illumina.nf | 21 +- workflows/nanopore.nf | 16 +- 10 files changed, 473 insertions(+), 406 deletions(-) create mode 100644 modules/local/zstd_decompress.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 10d33ae..b06d54d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,6 +10,10 @@ on: env: NXF_ANSI_LOG: false + # URLs to Influenza ref data should be updated in step with nextflow.config + # default ncbi_influenza_fasta and ncbi_influenza_metadata params + FASTA_ZST_URL: https://api.figshare.com/v2/file/download/41415330 + CSV_ZST_URL: https://api.figshare.com/v2/file/download/41415333 concurrency: group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" @@ -58,23 +62,32 @@ jobs: make -j2 make install which seqtk - - name: Cache subsampled influenza.fna.gz + - name: Cache subsampled influenza.fna uses: actions/cache@v3 id: cache-influenza-fna with: - path: influenza-10k.fna.gz + path: influenza-10k.fna.zst key: influenza-fna - name: Subsample NCBI influenza.fna if: steps.cache-influenza-fna.outputs.cache-hit != 'true' run: | - curl --silent -SLk https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz > influenza.fna.gz - echo "Subsample 10k seqs from influenza.fna.gz with seqtk" - seqtk sample -s 789 influenza.fna.gz 10000 | gzip -ck > influenza-10k.fna.gz + curl --silent -SLk ${FASTA_ZST_URL} | zstdcat | seqtk sample -s 789 - 10000 | zstd -ck > influenza-10k.fna.zst + - name: Cache influenza.csv + uses: actions/cache@v3 + id: cache-influenza-csv + with: + path: influenza.csv.zst + key: influenza-csv + - name: Download influenza.csv + if: steps.cache-influenza-csv.outputs.cache-hit != 'true' + run: | + curl --silent -SLk ${CSV_ZST_URL} > influenza.csv.zst - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} \ -profile test_illumina,docker \ - --ncbi_influenza_fasta influenza-10k.fna.gz + --ncbi_influenza_fasta influenza-10k.fna.zst \ + --ncbi_influenza_metadata influenza.csv.zst - name: Upload Artifact if: success() uses: actions/upload-artifact@v1.0.0 @@ -155,25 +168,34 @@ jobs: echo "ERR6359501-10k,$(realpath reads/ERR6359501-10k.fastq)" | tee -a samplesheet.csv echo "ERR6359501,$(realpath run1)" | tee -a samplesheet.csv echo "ERR6359501,$(realpath run2)" | tee -a samplesheet.csv - - name: Cache subsampled influenza.fna.gz + - name: Cache subsampled influenza.fna uses: actions/cache@v3 id: cache-influenza-fna with: - path: influenza-10k.fna.gz + path: influenza-10k.fna.zst key: influenza-fna - name: Subsample NCBI influenza.fna if: steps.cache-influenza-fna.outputs.cache-hit != 'true' run: | - curl --silent -SLk https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz > influenza.fna.gz - echo "Subsample 10k seqs from influenza.fna.gz with seqtk" - seqtk sample -s 789 influenza.fna.gz 10000 | gzip -ck > influenza-10k.fna.gz + curl --silent -SLk ${FASTA_ZST_URL} | zstdcat | seqtk sample -s 789 - 10000 | zstd -ck > influenza-10k.fna.zst + - name: Cache influenza.csv + uses: actions/cache@v3 + id: cache-influenza-csv + with: + path: influenza.csv.zst + key: influenza-csv + - name: Download influenza.csv + if: steps.cache-influenza-csv.outputs.cache-hit != 'true' + run: | + curl --silent -SLk ${CSV_ZST_URL} > influenza.csv.zst - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} \ -profile test_nanopore,docker \ --platform nanopore \ --input samplesheet.csv \ - --ncbi_influenza_fasta influenza-10k.fna.gz + --ncbi_influenza_fasta influenza-10k.fna.zst \ + --ncbi_influenza_metadata influenza.csv.zst - name: Upload pipeline_info/ if: success() uses: actions/upload-artifact@v1.0.0 diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index 90881b7..c2bdc6c 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -59,8 +59,8 @@ blast_results_report_columns = [ ("sample", "Sample"), ("sample_segment", "Sample Genome Segment Number"), - ("accession", "Reference NCBI Accession"), - ("subtype", "Reference Subtype"), + ("#Accession", "Reference NCBI Accession"), + ("Genotype", "Reference Subtype"), ("pident", "BLASTN Percent Identity"), ("length", "BLASTN Alignment Length"), ("mismatch", "BLASTN Mismatches"), @@ -75,19 +75,17 @@ ("slen", "Reference Sequence Length"), ("qcovs", "Sample Sequence Coverage of Reference Sequence"), ("stitle", "Reference Sequence ID"), - ("segment", "Reference Genome Segment Number"), - ("virus_name", "Reference Virus Name"), - ("host", "Reference Host"), - ("country", "Reference Country"), - ("date", "Reference Collection Date"), - ("age", "Reference Patient Age"), - ("gender", "Reference Patient Gender"), - ("group_id", "Reference Group ID"), + ("Segment", "Reference Genome Segment Number"), + ("GenBank_Title", "Reference Virus Name"), + ("Host", "Reference Host"), + ("Geo_Location", "Reference Geo Location"), + ("Collection_Date", "Reference Collection Date"), + ("Release_Date", "Reference Release Date"), ] subtype_results_summary_columns = [ "sample", - "subtype", + "Genotype", "H_top_accession", "H_type", "H_virus_name", @@ -100,7 +98,7 @@ columns_H_summary_results = [ "sample", - "subtype", + "Genotype", "H_top_accession", "H_NCBI_Influenza_DB_proportion_matches", "H_NCBI_Influenza_DB_subtype_matches", @@ -121,7 +119,7 @@ columns_N_summary_results = [ "sample", - "subtype", + "Genotype", "N_top_accession", "N_NCBI_Influenza_DB_proportion_matches", "N_NCBI_Influenza_DB_subtype_matches", @@ -142,7 +140,7 @@ subtype_results_summary_final_names = { "sample": "Sample", - "subtype": "Subtype Prediction", + "Genotype": "Subtype Prediction", "N_type": "N: type prediction", "N_top_accession": "N: top match accession", "N_virus_name": "N: top match virus name", @@ -209,7 +207,7 @@ def parse_blast_result( f"and Min Alignment length > {min_aln_length}" ) df_filtered = df_filtered.with_columns([ - pl.col('saccver').str.strip().alias("accession"), + pl.col('saccver').str.strip().alias("#Accession"), pl.lit(sample_name, dtype=pl.Categorical).alias("sample"), pl.col('qaccver').str.extract(r".+_(\d)$").cast(pl.Categorical).alias("sample_segment"), pl.col("stitle").str.extract(regex_subtype_pattern).alias("subtype_from_match_title").cast(pl.Categorical) @@ -217,14 +215,14 @@ def parse_blast_result( logging.info( f"{sample_name} | Merging NCBI Influenza DB genome metadata with BLAST results on accession." ) - df_merge = df_filtered.join(df_metadata, on="accession", how="left") + df_merge = df_filtered.join(df_metadata, on="#Accession", how="left") del df_filtered del df_metadata df_merge = df_merge.with_columns( - pl.when(pl.col("subtype").is_null()) + pl.when(pl.col("Genotype").is_null()) .then(pl.col("subtype_from_match_title")) - .otherwise(pl.col("subtype")) - .alias("subtype") + .otherwise(pl.col("Genotype")) + .alias("Genotype") ) df_merge = df_merge.sort( by=["sample_segment", "bitscore"], descending=[False, True] @@ -240,7 +238,7 @@ def parse_blast_result( subtype_results_summary = {"sample": sample_name} if not get_top_ref: is_iav = True - if df_top_seg_matches.select(pl.col("subtype").is_null().all())[0, 0]: + if df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0]: is_iav = False H_results = None N_results = None @@ -250,7 +248,7 @@ def parse_blast_result( if "6" in segments: N_results = find_h_or_n_type(df_merge, "6", is_iav) subtype_results_summary.update(N_results) - subtype_results_summary["subtype"] = get_subtype_value(H_results, N_results, is_iav) + subtype_results_summary["Genotype"] = get_subtype_value(H_results, N_results, is_iav) return df_top_seg_matches, subtype_results_summary @@ -296,9 +294,9 @@ def find_h_or_n_type(df_merge, seg, is_iav): reg_h_or_n_type = "[Nn]" df_segment = df_merge.filter(pl.col("sample_segment") == seg) if is_iav: - type_counts = df_segment["subtype"].value_counts(sort=True) - type_counts = type_counts.filter(~pl.col("subtype").is_null()) - df_type_counts = type_counts.with_columns(pl.lit(type_counts["subtype"].str.extract(reg_h_or_n_type + r"(\d+)"). + type_counts = df_segment["Genotype"].value_counts(sort=True) + type_counts = type_counts.filter(~pl.col("Genotype").is_null()) + df_type_counts = type_counts.with_columns(pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)"). alias(type_name))) df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null()) logging.debug(f"{df_type_counts}") @@ -313,7 +311,7 @@ def find_h_or_n_type(df_merge, seg, is_iav): f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})" ) df_segment = df_segment.with_columns( - pl.lit(df_segment["subtype"].str.contains(r".*" + reg_h_or_n_type + top_type + r".*") + pl.lit(df_segment["Genotype"].str.contains(r".*" + reg_h_or_n_type + top_type + r".*") .fill_null(False) .alias("type_mask"))) df_seg_top_type = df_segment.filter(pl.col("type_mask") == True).drop("type_mask") @@ -332,12 +330,12 @@ def find_h_or_n_type(df_merge, seg, is_iav): f"{h_or_n}_top_gaps": top_result["gapopen"], f"{h_or_n}_top_bitscore": top_result["bitscore"], f"{h_or_n}_top_align_length": top_result["length"], - f"{h_or_n}_top_accession": top_result["accession"], - f"{h_or_n}_top_host": top_result["host"], - f"{h_or_n}_top_country": top_result["country"], - f"{h_or_n}_top_date": top_result["date"], + f"{h_or_n}_top_accession": top_result["#Accession"], + f"{h_or_n}_top_host": top_result["Host"], + f"{h_or_n}_top_country": top_result["Geo_Location"], + f"{h_or_n}_top_date": top_result["Collection_Date"], f"{h_or_n}_top_seq_length": top_result["slen"], - f"{h_or_n}_virus_name": top_result["virus_name"], + f"{h_or_n}_virus_name": top_result["GenBank_Title"], f"{h_or_n}_NCBI_Influenza_DB_subtype_matches": top_type_count, f"{h_or_n}_NCBI_Influenza_DB_total_matches": total_count, f"{h_or_n}_NCBI_Influenza_DB_proportion_matches": top_type_count / total_count if is_iav else "N/A", @@ -370,33 +368,33 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, ) logging.info(f'Parsing Influenza metadata file "{flu_metadata}"') + md_cols = [ - ("accession", str), - ("host", pl.Categorical), - ("segment", pl.Categorical), - ("subtype", str), - ("country", pl.Categorical), - ("date", pl.Categorical), - ("seq_length", pl.UInt16), - ("virus_name", pl.Categorical), - ("age", pl.Categorical), - ("gender", pl.Categorical), - ("group_id", pl.Categorical), + ("#Accession", str), + ("Release_Date", pl.Categorical), + ("Genus", pl.Categorical), + ("Length", pl.UInt16), + ("Genotype", pl.Categorical), + ("Segment", pl.Categorical), + ("Publications", str), + ("Geo_Location", pl.Categorical), + ("Host", pl.Categorical), + ("Isolation_Source", pl.Categorical), + ("Collection_Date", pl.Categorical), + ("GenBank_Title", str), ] df_md = pl.read_csv( flu_metadata, - has_header=False, - separator="\t", - new_columns=[name for name, _ in md_cols], + has_header=True, dtypes={name: t for name, t in md_cols}, ) - unique_subtypes = df_md.select("subtype").unique() - unique_subtypes = unique_subtypes.filter(~pl.col("subtype").is_null()) + unique_subtypes = df_md.select("Genotype").unique() + unique_subtypes = unique_subtypes.filter(~pl.col("Genotype").is_null()) logging.info( f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. There are {len(unique_subtypes)} unique subtypes. " ) - regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["subtype"])) + r")\)" + regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["Genotype"])) + r")\)" results = [ parse_blast_result(blast_result, df_md, regex_subtype_pattern, get_top_ref, top=top, pident_threshold=pident_threshold, @@ -445,8 +443,15 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, df_blast = df_blast.rename( mapping={k: v for k, v in blast_results_report_columns} ) - df_ref_id = df_blast.select(pl.col(['Sample', 'Sample Genome Segment Number', - 'Reference NCBI Accession', 'BLASTN Bitscore', 'Reference Sequence ID'])) + df_ref_id = df_blast.select( + pl.col([ + 'Sample', + 'Sample Genome Segment Number', + 'Reference NCBI Accession', + 'BLASTN Bitscore', + 'Reference Sequence ID' + ]) + ) df_ref_id = df_ref_id.with_columns( pl.when(pl.col("Reference NCBI Accession").is_null()) .then(pl.col("Reference Sequence ID")) diff --git a/conf/base.config b/conf/base.config index 8e75895..e7e0f06 100644 --- a/conf/base.config +++ b/conf/base.config @@ -16,6 +16,11 @@ process { maxErrors = '-1' // Groupable resource requirements for processes + withLabel:process_single { + cpus = 1 + memory = { check_max( 100.MB * task.attempt, 'memory' ) } + time = { check_max( 1.h * task.attempt, 'time' ) } + } withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 4.GB * task.attempt, 'memory' ) } diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config index d4f35d7..d6c7c65 100644 --- a/conf/modules_illumina.config +++ b/conf/modules_illumina.config @@ -1,63 +1,69 @@ - +// Illumina subworkflow process configuration process { - withName: 'IRMA' { - publishDir = [ - [ - path: { "${params.outdir}/irma"}, - mode: params.publish_dir_mode - ], - [ - path: { "${params.outdir}/consensus/irma/" }, - pattern: "*.consensus.fasta", - mode: params.publish_dir_mode - ] - ] - } + withName: 'IRMA' { + publishDir = [ + [ + path: { "${params.outdir}/irma"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ], + [ + path: { "${params.outdir}/consensus/irma/" }, + pattern: "*.consensus.fasta", + mode: params.publish_dir_mode + ] + ] + } - withName: 'BLAST_MAKEBLASTDB' { - ext.args = '-dbtype nucl' - publishDir = [ - [ - path: { "${params.outdir}/blast"}, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BLAST_MAKEBLASTDB' { + ext.args = '-dbtype nucl' + publishDir = [ + [ + path: { "${params.outdir}/blast/db"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'BLAST_BLASTN' { - ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' - publishDir = [ - [ - path: { "${params.outdir}/blast"}, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BLAST_BLASTN' { + ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' + publishDir = [ + [ + path: { "${params.outdir}/blast"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } + withName: 'SUBTYPING_REPORT' { + publishDir = [ + [ + path: { "${params.outdir}/"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'SUBTYPING_REPORT' { - publishDir = [ - [ - path: { "${params.outdir}/"}, - mode: params.publish_dir_mode - ] - ] - } + withName: 'ZSTD_DECOMPRESS_.*' { + publishDir = [ + [ + path: { "${params.outdir}/ncbi-influenza-db"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'GUNZIP_NCBI_FLU_FASTA' { - publishDir = [ - [ - path: { "${params.outdir}/flu_fasta"}, - mode: params.publish_dir_mode - ] - ] - } - withName: 'CAT_ILLUMINA_FASTQ' { - publishDir = [ - [ - path: { "${params.outdir}/fastq"}, - mode: params.publish_dir_mode - ] - ] - } -} \ No newline at end of file + withName: 'CAT_ILLUMINA_FASTQ' { + publishDir = [ + [ + path: { "${params.outdir}/fastq"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } +} diff --git a/conf/modules_nanopore.config b/conf/modules_nanopore.config index 4ae1c19..df0b951 100644 --- a/conf/modules_nanopore.config +++ b/conf/modules_nanopore.config @@ -1,266 +1,284 @@ - +// Nanopore subworkflow process configuration process { - withName: 'IRMA' { - publishDir = [ - [ - path: { "${params.outdir}/irma"}, - mode: params.publish_dir_mode - ], - [ - path: { "${params.outdir}/consensus/irma/" }, - pattern: "*.irma.consensus.fasta", - mode: params.publish_dir_mode - ] - ] - } + withName: 'IRMA' { + publishDir = [ + [ + path: { "${params.outdir}/irma"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ], + [ + path: { "${params.outdir}/consensus/irma/" }, + pattern: "*.irma.consensus.fasta", + mode: params.publish_dir_mode + ] + ] + } + + withName: 'BLAST_MAKEBLASTDB_NCBI' { + ext.args = '-dbtype nucl' + publishDir = [ + [ + path: { "${params.outdir}/blast/db/ncbi" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'BLAST_MAKEBLASTDB_NCBI' { - ext.args = '-dbtype nucl' - publishDir = [ - [ - path: { "${params.outdir}/blast/db/ncbi" }, - mode: params.publish_dir_mode - ] - ] - } - withName: 'BLAST_MAKEBLASTDB_REFDB' { - ext.args = '-dbtype nucl' - publishDir = [ - [ - path: { "${params.outdir}/blast/db/ref_db" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BLAST_MAKEBLASTDB_REFDB' { + ext.args = '-dbtype nucl' + publishDir = [ + [ + path: { "${params.outdir}/blast/db/ref_db" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'BLAST_BLASTN_IRMA' { - ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' - publishDir = [ - [ - path: { "${params.outdir}/blast/blastn/irma" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BLAST_BLASTN_IRMA' { + ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' + publishDir = [ + [ + path: { "${params.outdir}/blast/blastn/irma" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'BLAST_BLASTN_CONSENSUS' { - ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' - publishDir = [ - [ - path: { "${params.outdir}/blast/blastn/consensus" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BLAST_BLASTN_CONSENSUS' { + ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' + publishDir = [ + [ + path: { "${params.outdir}/blast/blastn/consensus" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'BLAST_BLASTN_CONSENSUS_REF_DB' { - ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' - publishDir = [ - [ - path: { "${params.outdir}/blast/blastn/against_ref_db" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BLAST_BLASTN_CONSENSUS_REF_DB' { + ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6' + publishDir = [ + [ + path: { "${params.outdir}/blast/blastn/against_ref_db" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'BCF_CONSENSUS' { - publishDir = [ - [ - path: { "${params.outdir}/consensus/bcftools/${sample}" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BCF_CONSENSUS' { + publishDir = [ + [ + path: { "${params.outdir}/consensus/bcftools/${sample}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'BCFTOOLS_STATS' { - publishDir = [ - [ - path: { "${params.outdir}/variants/${sample}" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'BCFTOOLS_STATS' { + publishDir = [ + [ + path: { "${params.outdir}/variants/${sample}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'CAT_CONSENSUS' { - publishDir = [ - [ - path: { "${params.outdir}/consensus/bcftools/"}, - pattern: "*.consensus.fasta", - mode: params.publish_dir_mode - ] - ] - } + withName: 'CAT_CONSENSUS' { + publishDir = [ + [ + path: { "${params.outdir}/consensus/bcftools/"}, + pattern: "*.consensus.fasta", + mode: params.publish_dir_mode + ] + ] + } - withName: 'COVERAGE_PLOT' { - publishDir = [ - [ - path: { "${params.outdir}/coverage_plots/${sample}" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'COVERAGE_PLOT' { + publishDir = [ + [ + path: { "${params.outdir}/coverage_plots/${sample}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'MEDAKA' { - publishDir = [ - [ - path: { "${params.outdir}/variants/${sample}" }, - pattern: "*.{vcf,log}", - mode: params.publish_dir_mode - ], - [ - path: { "${params.outdir}/variants/${sample}/medaka"}, - mode: params.publish_dir_mode, - enable: true - ] - ] - } + withName: 'MEDAKA' { + publishDir = [ + [ + path: { "${params.outdir}/variants/${sample}" }, + pattern: "*.{vcf,log}", + mode: params.publish_dir_mode + ], + [ + path: { "${params.outdir}/variants/${sample}/medaka"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode, + enable: true + ] + ] + } - withName: 'CLAIR3' { - publishDir = [ - [ - path: { "${params.outdir}/variants/${sample}"}, - pattern: "*.{vcf.gz,log}", - mode: params.publish_dir_mode - ], - [ - path: { "${params.outdir}/variants/${sample}/clair3"}, - mode: params.publish_dir_mode, - enable: true - ] - ] - } + withName: 'CLAIR3' { + publishDir = [ + [ + path: { "${params.outdir}/variants/${sample}"}, + pattern: "*.{vcf.gz,log}", + mode: params.publish_dir_mode + ], + [ + path: { "${params.outdir}/variants/${sample}/clair3"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode, + enable: true + ] + ] + } - withName: 'MINIMAP2' { - publishDir = [ - [ - path: { "${params.outdir}/mapping/${sample}"}, - mode: params.publish_dir_mode - ] - ] - } + withName: 'MINIMAP2' { + publishDir = [ + [ + path: { "${params.outdir}/mapping/${sample}"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'MOSDEPTH_GENOME' { - publishDir = [ - [ - path: { "${params.outdir}/mosdepth/${sample}"}, - mode: params.publish_dir_mode - ] - ] - } + withName: 'MOSDEPTH_GENOME' { + publishDir = [ + [ + path: { "${params.outdir}/mosdepth/${sample}"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'PULL_TOP_REF_ID' { - publishDir = [ - [ - path: { "${params.outdir}/reference_sequences/${meta.id}"}, - pattern: "*.csv", - mode: params.publish_dir_mode - ] - ] - } + withName: 'PULL_TOP_REF_ID' { + publishDir = [ + [ + path: { "${params.outdir}/reference_sequences/${meta.id}"}, + pattern: "*.csv", + mode: params.publish_dir_mode + ] + ] + } - withName: 'CHECK_REF_FASTA' { - publishDir = [ - [ - path: { "${params.outdir}/reference_sequences/"}, - pattern: "*.fasta", - mode: params.publish_dir_mode - ] - ] - } + withName: 'CHECK_REF_FASTA' { + publishDir = [ + [ + path: { "${params.outdir}/reference_sequences/"}, + pattern: "*.fasta", + mode: params.publish_dir_mode + ] + ] + } - withName: 'SEQTK_SEQ' { - publishDir = [ - [ - path: { "${params.outdir}/reference_sequences/${sample}"}, - pattern: "*.fasta", - mode: params.publish_dir_mode - ] - ] - } + withName: 'SEQTK_SEQ' { + publishDir = [ + [ + path: { "${params.outdir}/reference_sequences/${sample}"}, + pattern: "*.fasta", + mode: params.publish_dir_mode + ] + ] + } - withName: 'SUBTYPING_REPORT_BCF_CONSENSUS' { - publishDir = [ - [ - path: { "${params.outdir}/"}, - pattern: "*.{xlsx,log}", - mode: params.publish_dir_mode - ] - ] - } + withName: 'SUBTYPING_REPORT_BCF_CONSENSUS' { + publishDir = [ + [ + path: { "${params.outdir}/"}, + pattern: "*.{xlsx,log}", + mode: params.publish_dir_mode + ] + ] + } - withName: 'BLASTN_REPORT' { - publishDir = [ - [ - path: { "${params.outdir}/mismatch_report"}, - pattern: "*.{xlsx}", - mode: params.publish_dir_mode - ] - ] - } + withName: 'BLASTN_REPORT' { + publishDir = [ + [ + path: { "${params.outdir}/mismatch_report"}, + pattern: "*.{xlsx}", + mode: params.publish_dir_mode + ] + ] + } - withName: 'SUBTYPING_REPORT_IRMA_CONSENSUS' { - publishDir = [ - [ - path: { "${params.outdir}/irma"}, - pattern: "*.{xlsx,log}", - mode: params.publish_dir_mode - ] - ] - } + withName: 'SUBTYPING_REPORT_IRMA_CONSENSUS' { + publishDir = [ + [ + path: { "${params.outdir}/irma"}, + pattern: "*.{xlsx,log}", + mode: params.publish_dir_mode + ] + ] + } - withName: 'VCF_FILTER_FRAMESHIFT' { - publishDir = [ - [ - path: { "${params.outdir}/variants/${sample}" }, - pattern: "*.vcf", - mode: params.publish_dir_mode - ] - ] - } + withName: 'VCF_FILTER_FRAMESHIFT' { + publishDir = [ + [ + path: { "${params.outdir}/variants/${sample}" }, + pattern: "*.vcf", + mode: params.publish_dir_mode + ] + ] + } - withName: 'GUNZIP_NCBI_FLU_FASTA' { - publishDir = [ - [ - path: { "${params.outdir}/flu_fasta" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'ZSTD_DECOMPRESS_.*' { + publishDir = [ + [ + path: { "${params.outdir}/ncbi-influenza-db"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'READ_COUNT_FAIL_TSV' { - publishDir = [ - [ - path: { "${params.outdir}/read_count" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'READ_COUNT_FAIL_TSV' { + publishDir = [ + [ + path: { "${params.outdir}/read_count" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'READ_COUNT_PASS_TSV' { - publishDir = [ - [ - path: { "${params.outdir}/read_count" }, - mode: params.publish_dir_mode - ] - ] - } + withName: 'READ_COUNT_PASS_TSV' { + publishDir = [ + [ + path: { "${params.outdir}/read_count" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } - withName: 'SOFTWARE_VERSIONS' { - publishDir = [ - [ - path: { "${params.outdir}/pipeline_info" }, - pattern: "software_versions.yml", - mode: params.publish_dir_mode - ] - ] - } + withName: 'SOFTWARE_VERSIONS' { + publishDir = [ + [ + path: { "${params.outdir}/pipeline_info" }, + pattern: "software_versions.yml", + mode: params.publish_dir_mode + ] + ] + } - withName: 'MULTIQC' { - publishDir = [ - [ - path: { "${params.outdir}/MultiQC" }, - mode: params.publish_dir_mode - ] - ] - } -} \ No newline at end of file + withName: 'MULTIQC' { + publishDir = [ + [ + path: { "${params.outdir}/MultiQC" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + mode: params.publish_dir_mode + ] + ] + } +} diff --git a/modules/local/misc.nf b/modules/local/misc.nf index 61c3a68..16a1090 100644 --- a/modules/local/misc.nf +++ b/modules/local/misc.nf @@ -58,40 +58,6 @@ process CAT_DB { """ } -process GUNZIP_NCBI_FLU_FASTA { - tag "$archive" - label 'process_low' - - conda (params.enable_conda ? "conda-forge::sed=4.7" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" - } else { - container "biocontainers/biocontainers:v1.2.0_cv1" - } - - input: - path archive - - output: - path "*.fna", emit: fna - path "versions.yml" , emit: versions - - script: - def software = getSoftwareName(task.process) - // replace FASTA headers - // >gi|{gi}|gb|{accession}|{description} - // with - // >{accession} {description} - // for easier parsing and processing - """ - zcat $archive | sed -E 's/^>gi\\|[0-9]+\\|gb\\|(\\w+)\\|(.*)/>\\1 \\2/' > influenza.fna - cat <<-END_VERSIONS > versions.yml - "${task.process}": - zcat: \$(echo \$(zcat --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') - END_VERSIONS - """ -} - process CAT_CONSENSUS { tag "$sample" conda (params.enable_conda ? 'bioconda::shiptv=0.4.0' : null) diff --git a/modules/local/zstd_decompress.nf b/modules/local/zstd_decompress.nf new file mode 100644 index 0000000..aecbe87 --- /dev/null +++ b/modules/local/zstd_decompress.nf @@ -0,0 +1,30 @@ +process ZSTD_DECOMPRESS { + + conda 'conda-forge::zstd=1.5.2' + // TODO: using clair3 container here for zstd and since it might be used if running the Nanopore workflow, but should move to multi-package-container with just zstd and maybe curl to combine data fetch functionality + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/clair3:1.0.3--py39h8492097_0' + } else { + container 'quay.io/biocontainers/clair3:1.0.3--py39h8492097_0' + } + + input: + path(zstd_file, stageAs: "input*/*") + val(filename) + + output: + path(decompressed_file), emit: file + path('versions.yml'), emit: versions + + script: + def basename = file(zstd_file).getBaseName() + decompressed_file = filename ? "${basename}-${filename}" : basename + """ + zstdcat $zstd_file > $decompressed_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + zstd: \$(echo \$(zstd --version 2>&1) | sed 's/^.* v//; s/,.*//') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index b3bd9bf..fccf72f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,8 +23,8 @@ params { min_aln_length = 700 max_top_blastn = 3 // reference data - ncbi_influenza_fasta = 'https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz' - ncbi_influenza_metadata = 'https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz' + ncbi_influenza_fasta = 'https://api.figshare.com/v2/file/download/41415330' + ncbi_influenza_metadata = 'https://api.figshare.com/v2/file/download/41415333' // Slurm scheduler options slurm_queue = '' slurm_queue_size = 100 diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 427f554..34f51f9 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -20,6 +20,8 @@ include { GUNZIP_NCBI_FLU_FASTA } from '../modules/local/misc' include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb' include { BLAST_BLASTN } from '../modules/local/blastn' include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq' +include { FETCH_INFLUENZA_REF_DB } from '../modules/local/fetch_influenza_ref_db' +include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress' //============================================================================= // Workflow Params Setup @@ -35,9 +37,14 @@ if (params.irma_module) { //============================================================================= workflow ILLUMINA { - - GUNZIP_NCBI_FLU_FASTA(ch_influenza_db_fasta) - BLAST_MAKEBLASTDB(GUNZIP_NCBI_FLU_FASTA.out.fna) + ch_versions = Channel.empty() + // Decompress reference data + ZSTD_DECOMPRESS_FASTA(ch_influenza_db_fasta, "influenza.fasta") + ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_FASTA.out.versions) + ZSTD_DECOMPRESS_CSV(ch_influenza_metadata, "influenza.csv") + ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_CSV.out.versions) + BLAST_MAKEBLASTDB(ZSTD_DECOMPRESS_FASTA.out.file) + ch_versions = ch_versions.mix(BLAST_MAKEBLASTDB.out.versions) CHECK_SAMPLE_SHEET(Channel.fromPath( params.input, checkIfExists: true)) .splitCsv(header: ['sample', 'fastq1', 'fastq2', 'single_end'], sep: ',', skip: 1) @@ -71,11 +78,17 @@ workflow ILLUMINA { // Credit to nf-core/viralrecon. Source: https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/workflows/illumina.nf#L221 // Concatenate FastQ files from same sample if required CAT_ILLUMINA_FASTQ(ch_input) + ch_versions = ch_versions.mix(CAT_ILLUMINA_FASTQ.out.versions) IRMA(CAT_ILLUMINA_FASTQ.out.reads, irma_module) + ch_versions = ch_versions.mix(IRMA.out.versions) BLAST_BLASTN(IRMA.out.consensus, BLAST_MAKEBLASTDB.out.db) + ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions) ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] }) - SUBTYPING_REPORT(ch_influenza_metadata, ch_blast) + SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out, ch_blast) + ch_versions = ch_versions.mix(SUBTYPING_REPORT.out.versions) + + SOFTWARE_VERSIONS(ch_versions.unique().collectFile(name: 'collated_versions.yml')) } diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf index 0d13d9a..2dd21bd 100644 --- a/workflows/nanopore.nf +++ b/workflows/nanopore.nf @@ -19,7 +19,7 @@ include { BCF_CONSENSUS; BCFTOOLS_STATS } from '../modules include { CLAIR3 } from '../modules/local/clair3' include { MOSDEPTH_GENOME } from '../modules/local/mosdepth' include { CAT_NANOPORE_FASTQ } from '../modules/local/misc' -include { GUNZIP_NCBI_FLU_FASTA } from '../modules/local/misc' +include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress' include { CAT_DB } from '../modules/local/misc' include { CAT_CONSENSUS } from '../modules/local/misc' include { SEQTK_SEQ } from '../modules/local/seqtk_seq' @@ -131,10 +131,12 @@ workflow NANOPORE { .map { sample, fqgz, fq, count -> [ [id: sample], fqgz, fq ] } .set { ch_reads } - GUNZIP_NCBI_FLU_FASTA(ch_influenza_db_fasta) - ch_versions = ch_versions.mix(GUNZIP_NCBI_FLU_FASTA.out.versions) + ZSTD_DECOMPRESS_FASTA(ch_influenza_db_fasta, "influenza.fasta") + ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_FASTA.out.versions) + ZSTD_DECOMPRESS_CSV(ch_influenza_metadata, "influenza.csv") + ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_CSV.out.versions) - ch_input_ref_db = GUNZIP_NCBI_FLU_FASTA.out.fna + ch_input_ref_db = ZSTD_DECOMPRESS_FASTA.out.file if (params.ref_db){ ch_ref_fasta = file(params.ref_db, type: 'file') @@ -159,11 +161,11 @@ workflow NANOPORE { //Generate suptype prediction report if (!params.skip_irma_subtyping_report){ ch_blast_irma = BLAST_BLASTN_IRMA.out.txt.collect({ it[1] }) - SUBTYPING_REPORT_IRMA_CONSENSUS(ch_influenza_metadata, ch_blast_irma) + SUBTYPING_REPORT_IRMA_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blast_irma) } // Prepare top ncbi accession id for each segment of each sample sample (id which has top bitscore) - PULL_TOP_REF_ID(BLAST_BLASTN_IRMA.out.txt, ch_influenza_metadata) + PULL_TOP_REF_ID(BLAST_BLASTN_IRMA.out.txt, ZSTD_DECOMPRESS_CSV.out.file) ch_versions = ch_versions.mix(PULL_TOP_REF_ID.out.versions) PULL_TOP_REF_ID.out.accession_id @@ -242,7 +244,7 @@ workflow NANOPORE { ch_versions = ch_versions.mix(BLAST_BLASTN_CONSENSUS.out.versions) ch_blastn_consensus = BLAST_BLASTN_CONSENSUS.out.txt.collect({ it[1] }) - SUBTYPING_REPORT_BCF_CONSENSUS(ch_influenza_metadata, ch_blastn_consensus) + SUBTYPING_REPORT_BCF_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blastn_consensus) ch_versions = ch_versions.mix(SUBTYPING_REPORT_BCF_CONSENSUS.out.versions) if (params.ref_db){ From d36b980c57d29c5bd64fc8be6b59da82f06b06e6 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 7 Jul 2023 13:51:17 -0500 Subject: [PATCH 02/11] Remove/replace references to GUNZIP_NCBI_FLU_FASTA --- workflows/illumina.nf | 1 - workflows/nanopore.nf | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 34f51f9..663bf4d 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -16,7 +16,6 @@ ch_influenza_metadata = file(params.ncbi_influenza_metadata) include { IRMA } from '../modules/local/irma' include { CHECK_SAMPLE_SHEET } from '../modules/local/check_sample_sheet' include { SUBTYPING_REPORT } from '../modules/local/subtyping_report' -include { GUNZIP_NCBI_FLU_FASTA } from '../modules/local/misc' include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb' include { BLAST_BLASTN } from '../modules/local/blastn' include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq' diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf index 2dd21bd..be3f1e9 100644 --- a/workflows/nanopore.nf +++ b/workflows/nanopore.nf @@ -142,7 +142,7 @@ workflow NANOPORE { ch_ref_fasta = file(params.ref_db, type: 'file') CHECK_REF_FASTA(ch_ref_fasta) ch_versions = ch_versions.mix(CHECK_REF_FASTA.out.versions) - CAT_DB(GUNZIP_NCBI_FLU_FASTA.out.fna, CHECK_REF_FASTA.out.fasta) + CAT_DB(ZSTD_DECOMPRESS_FASTA.out.file, CHECK_REF_FASTA.out.fasta) ch_input_ref_db = CAT_DB.out.fasta } From 7371191598cf93db5704ebdbac0bf6440a3f56ee Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 7 Jul 2023 13:53:19 -0500 Subject: [PATCH 03/11] remove missing proc from illumina.nf --- workflows/illumina.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 663bf4d..2ccc9e5 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -19,7 +19,6 @@ include { SUBTYPING_REPORT } from '../modules/local/subtyping_report' include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb' include { BLAST_BLASTN } from '../modules/local/blastn' include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq' -include { FETCH_INFLUENZA_REF_DB } from '../modules/local/fetch_influenza_ref_db' include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress' //============================================================================= From 00f040916e726b96a6773f1d02cfd6d38f25b849 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 7 Jul 2023 13:59:59 -0500 Subject: [PATCH 04/11] fix illumina.nf --- workflows/illumina.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 2ccc9e5..e821739 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -85,7 +85,7 @@ workflow ILLUMINA { ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions) ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] }) - SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out, ch_blast) + SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out.file, ch_blast) ch_versions = ch_versions.mix(SUBTYPING_REPORT.out.versions) SOFTWARE_VERSIONS(ch_versions.unique().collectFile(name: 'collated_versions.yml')) From e38c9cadeb891b6e1518c29497d09cdc9539d72d Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 7 Jul 2023 14:09:46 -0500 Subject: [PATCH 05/11] fix illumina.nf --- workflows/illumina.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/illumina.nf b/workflows/illumina.nf index e821739..564f33c 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -21,6 +21,8 @@ include { BLAST_BLASTN } from '../modules/local/blastn' include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq' include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress' +include { CUSTOM_DUMPSOFTWAREVERSIONS as SOFTWARE_VERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' + //============================================================================= // Workflow Params Setup //============================================================================= From 0d9db7383edc06b76020f35760906e172b171892 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 7 Jul 2023 15:32:45 -0500 Subject: [PATCH 06/11] fix parse_influenza_blast_results.py issue with Categorical type and CAT_ILLUMINA_FASTQ versions.yml wonkiness --- bin/parse_influenza_blast_results.py | 2 +- conf/modules_illumina.config | 10 +++ modules/local/cat_illumina_fastq.nf | 105 ++++++++++++++------------- workflows/illumina.nf | 6 +- 4 files changed, 70 insertions(+), 53 deletions(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index c2bdc6c..b8c03d0 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -374,7 +374,7 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, ("Release_Date", pl.Categorical), ("Genus", pl.Categorical), ("Length", pl.UInt16), - ("Genotype", pl.Categorical), + ("Genotype", str), ("Segment", pl.Categorical), ("Publications", str), ("Geo_Location", pl.Categorical), diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config index d6c7c65..e6560a5 100644 --- a/conf/modules_illumina.config +++ b/conf/modules_illumina.config @@ -66,4 +66,14 @@ process { ] ] } + + withName: 'SOFTWARE_VERSIONS' { + publishDir = [ + [ + path: { "${params.outdir}/pipeline_info" }, + pattern: "software_versions.yml", + mode: params.publish_dir_mode + ] + ] + } } diff --git a/modules/local/cat_illumina_fastq.nf b/modules/local/cat_illumina_fastq.nf index fe62e55..0afe2bd 100644 --- a/modules/local/cat_illumina_fastq.nf +++ b/modules/local/cat_illumina_fastq.nf @@ -39,20 +39,21 @@ process CAT_ILLUMINA_FASTQ { } if (meta.single_end) { if (fqList.size >= 1 || fqgzList.size >= 1) { - """ - touch ${prefix}.merged.fastq.gz - if [[ ${fqList.size} > 0 ]]; then - cat ${readList.join(' ')} | gzip -ck >> ${prefix}.merged.fastq.gz - fi - if [[ ${fqgzList.size} > 0 ]]; then - cat ${readList.join(' ')} >> ${prefix}.merged.fastq.gz - fi + """ + touch ${prefix}.merged.fastq.gz + if [[ ${fqList.size} > 0 ]]; then + cat ${readList.join(' ')} | gzip -ck >> ${prefix}.merged.fastq.gz + fi + if [[ ${fqgzList.size} > 0 ]]; then + cat ${readList.join(' ')} >> ${prefix}.merged.fastq.gz + fi - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --help 2>&1) | sed 's/ (.*//') + gzip: \$(echo \$(gzip --help 2>&1) | sed 's/ (.*//') + END_VERSIONS + """ } } else { if (readList.size >= 2) { @@ -60,43 +61,49 @@ process CAT_ILLUMINA_FASTQ { def read1gz = [] def read2 = [] def read2gz = [] - fqList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } - fqgzList.eachWithIndex{ v, ix -> ( ix & 1 ? read2gz : read1gz ) << v } - """ - # append 1:N:0:. or 2:N:0:. to forward and reverse reads if "[12]:N:.*" - # not present in the FASTQ header for compatability with IRMA assembly - touch ${prefix}_1.merged.fastq.gz - touch ${prefix}_2.merged.fastq.gz - if [[ ${read1.size} > 0 ]]; then - cat ${read1.join(' ')} \\ - | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ - | gzip -ck \\ - >> ${prefix}_1.merged.fastq.gz - fi - if [[ ${read1gz.size} > 0 ]]; then - zcat ${read1gz.join(' ')} \\ - | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ - | gzip -ck \\ - >> ${prefix}_1.merged.fastq.gz - fi - if [[ ${read2.size} > 0 ]]; then - cat ${read2.join(' ')} \\ - | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ - | gzip -ck \\ - >> ${prefix}_2.merged.fastq.gz - fi - if [[ ${read2gz.size} > 0 ]]; then - zcat ${read2gz.join(' ')} \\ - | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ - | gzip -ck \\ - >> ${prefix}_2.merged.fastq.gz - fi + fqList.eachWithIndex { v, ix -> ( ix & 1 ? read2 : read1 ) << v } + fqgzList.eachWithIndex { v, ix -> ( ix & 1 ? read2gz : read1gz ) << v } + // append 1:N:0:. or 2:N:0:. to forward and reverse reads if "[12]:N:.*" + // not present in the FASTQ header for compatability with IRMA assembly +""" +touch ${prefix}_1.merged.fastq.gz - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') - END_VERSIONS - """ +touch ${prefix}_2.merged.fastq.gz + +if [[ ${read1.size} > 0 ]]; then + cat ${read1.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_1.merged.fastq.gz +fi + +if [[ ${read1gz.size} > 0 ]]; then + zcat ${read1gz.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_1.merged.fastq.gz +fi + +if [[ ${read2.size} > 0 ]]; then + cat ${read2.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_2.merged.fastq.gz +fi + +if [[ ${read2gz.size} > 0 ]]; then + zcat ${read2gz.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_2.merged.fastq.gz +fi + +cat <<-END_VERSIONS > versions.yml +"${task.process}": + cat: \$(echo \$(cat --help 2>&1) | sed 's/ (.*//') + gzip: \$(echo \$(gzip --help 2>&1) | sed 's/ (.*//') +END_VERSIONS +""" } } } diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 564f33c..342e6a5 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -78,13 +78,13 @@ workflow ILLUMINA { // Credit to nf-core/viralrecon. Source: https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/workflows/illumina.nf#L221 // Concatenate FastQ files from same sample if required CAT_ILLUMINA_FASTQ(ch_input) - ch_versions = ch_versions.mix(CAT_ILLUMINA_FASTQ.out.versions) + ch_versions = ch_versions.mix(CAT_ILLUMINA_FASTQ.out.versions.first().ifEmpty(null)) IRMA(CAT_ILLUMINA_FASTQ.out.reads, irma_module) - ch_versions = ch_versions.mix(IRMA.out.versions) + ch_versions = ch_versions.mix(IRMA.out.versions.first().ifEmpty(null)) BLAST_BLASTN(IRMA.out.consensus, BLAST_MAKEBLASTDB.out.db) - ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions) + ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions.first().ifEmpty(null)) ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] }) SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out.file, ch_blast) From 78c5c4b21c5a90bf8e7e2266c612158651577a13 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Mon, 10 Jul 2023 15:18:31 -0500 Subject: [PATCH 07/11] fix merge conflict typo --- bin/parse_influenza_blast_results.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index bb4c184..584c35a 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -389,7 +389,6 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, df_md = pl.read_csv( flu_metadata, has_header=True, - has_header=False, dtypes=dict(md_cols), ) From f7acabac11bb597c8db70796ba29ead17f483778 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Mon, 10 Jul 2023 16:05:50 -0500 Subject: [PATCH 08/11] parse_influenza_blast_results.py: fix missing var, remove unused threads cli opt --- bin/parse_influenza_blast_results.py | 14 +++++++++++--- modules/local/subtyping_report.nf | 11 ++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index 584c35a..dc06e30 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -296,6 +296,7 @@ def find_h_or_n_type(df_merge, seg, is_iav): if is_iav: type_counts = df_segment["Genotype"].value_counts(sort=True) type_counts = type_counts.filter(~pl.col("Genotype").is_null()) + reg_h_or_n_type = "[Hh]" if h_or_n == "H" else "[Nn]" df_type_counts = type_counts.with_columns(pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)").alias(type_name))) df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null()) logging.debug(f"{df_type_counts}") @@ -355,12 +356,19 @@ def find_h_or_n_type(df_merge, seg, is_iav): "--pident-threshold", default=0.85, help="BLAST percent identity threshold" ) @click.option('--min-aln-length', default=50, help="Min BLAST alignment length threshold") -@click.option("--threads", default=4, help="Number of BLAST result parsing threads.") @click.option("--get-top-ref", default=False, help="Get top ref accession id from ncbi database.") @click.option("--sample-name", default="", help="Sample Name.") @click.argument("blast_results", nargs=-1) -def report(flu_metadata, blast_results, excel_report, top, pident_threshold, - min_aln_length, threads, get_top_ref, sample_name): +def report( + flu_metadata, + blast_results, + excel_report, + top, + pident_threshold, + min_aln_length, + get_top_ref, + sample_name +): from rich.traceback import install install(show_locals=True, width=120, word_wrap=True) logging.basicConfig( diff --git a/modules/local/subtyping_report.nf b/modules/local/subtyping_report.nf index 95280af..1c3c625 100644 --- a/modules/local/subtyping_report.nf +++ b/modules/local/subtyping_report.nf @@ -1,9 +1,3 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process SUBTYPING_REPORT { memory { // Dynamically determine how much memory is required for this task based on @@ -33,17 +27,16 @@ process SUBTYPING_REPORT { path(blastn_results) output: - path('iav-subtyping-report.xlsx'), emit: report + path('nf-flu-subtyping-report.xlsx'), emit: report path('parse_influenza_blast_results.log'), emit: log path "versions.yml", emit: versions script: """ parse_influenza_blast_results.py \\ - --threads ${task.cpus} \\ --flu-metadata $genomeset \\ --top ${params.max_top_blastn} \\ - --excel-report iav-subtyping-report.xlsx \\ + --excel-report nf-flu-subtyping-report.xlsx \\ --pident-threshold $params.pident_threshold \\ $blastn_results ln -s .command.log parse_influenza_blast_results.log From dedcfb5a72a15d285800437d879ac9dcf33c685c Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Tue, 11 Jul 2023 11:09:07 -0500 Subject: [PATCH 09/11] fix pull_top_ref_id.nf --- bin/parse_influenza_blast_results.py | 84 +++++++++++++++++----------- modules/local/pull_top_ref_id.nf | 3 +- 2 files changed, 52 insertions(+), 35 deletions(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index dc06e30..18c4b0b 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -11,13 +11,13 @@ import logging import re from collections import defaultdict -from typing import Dict, List, Optional, Tuple import click import numpy as np import pandas as pd import polars as pl from rich.logging import RichHandler +from typing import Dict, List, Optional, Tuple LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s [in %(filename)s:%(lineno)d]" logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) @@ -297,7 +297,8 @@ def find_h_or_n_type(df_merge, seg, is_iav): type_counts = df_segment["Genotype"].value_counts(sort=True) type_counts = type_counts.filter(~pl.col("Genotype").is_null()) reg_h_or_n_type = "[Hh]" if h_or_n == "H" else "[Nn]" - df_type_counts = type_counts.with_columns(pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)").alias(type_name))) + df_type_counts = type_counts.with_columns( + pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)").alias(type_name))) df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null()) logging.debug(f"{df_type_counts}") type_to_count = defaultdict(int) @@ -369,47 +370,31 @@ def report( get_top_ref, sample_name ): - from rich.traceback import install - install(show_locals=True, width=120, word_wrap=True) - logging.basicConfig( - format="%(message)s", - datefmt="[%Y-%m-%d %X]", - level=logging.DEBUG, - handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)], - ) + init_logging() logging.info(f'Parsing Influenza metadata file "{flu_metadata}"') - md_cols = [ - ("#Accession", str), - ("Release_Date", pl.Categorical), - ("Genus", pl.Categorical), - ("Length", pl.UInt16), - ("Genotype", str), - ("Segment", pl.Categorical), - ("Publications", str), - ("Geo_Location", pl.Categorical), - ("Host", pl.Categorical), - ("Isolation_Source", pl.Categorical), - ("Collection_Date", pl.Categorical), - ("GenBank_Title", str), - ] - df_md = pl.read_csv( - flu_metadata, - has_header=True, - dtypes=dict(md_cols), - ) + df_md = read_refseq_metadata(flu_metadata) unique_subtypes = df_md.select("Genotype").unique() unique_subtypes = unique_subtypes.filter(~pl.col("Genotype").is_null()) logging.info( - f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. There are {len(unique_subtypes)} unique subtypes. " + f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. " + f"There are {len(unique_subtypes)} unique subtypes." ) regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["Genotype"])) + r")\)" results = [ - parse_blast_result(blast_result, df_md, regex_subtype_pattern, get_top_ref, top=top, - pident_threshold=pident_threshold, - min_aln_length=min_aln_length) for blast_result in blast_results] + parse_blast_result( + blast_result, + df_md, + regex_subtype_pattern, + get_top_ref, + top=top, + pident_threshold=pident_threshold, + min_aln_length=min_aln_length + ) + for blast_result in blast_results + ] if not get_top_ref: dfs_blast = [] @@ -474,6 +459,39 @@ def report( df_ref_id.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True) +def read_refseq_metadata(flu_metadata): + md_cols = [ + ("#Accession", str), + ("Release_Date", pl.Categorical), + ("Genus", pl.Categorical), + ("Length", pl.UInt16), + ("Genotype", str), + ("Segment", pl.Categorical), + ("Publications", str), + ("Geo_Location", pl.Categorical), + ("Host", pl.Categorical), + ("Isolation_Source", pl.Categorical), + ("Collection_Date", pl.Categorical), + ("GenBank_Title", str), + ] + return pl.read_csv( + flu_metadata, + has_header=True, + dtypes=dict(md_cols), + ) + + +def init_logging(): + from rich.traceback import install + install(show_locals=True, width=120, word_wrap=True) + logging.basicConfig( + format="%(message)s", + datefmt="[%Y-%m-%d %X]", + level=logging.DEBUG, + handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)], + ) + + def get_col_widths(df, index=False): """Calculate column widths based on column headers and contents""" if index: diff --git a/modules/local/pull_top_ref_id.nf b/modules/local/pull_top_ref_id.nf index 6e9f0d7..59ccf64 100644 --- a/modules/local/pull_top_ref_id.nf +++ b/modules/local/pull_top_ref_id.nf @@ -1,6 +1,6 @@ process PULL_TOP_REF_ID { tag "$meta.id" - label 'process_medium' + label 'process_low' conda (params.enable_conda ? 'conda-forge::python=3.10 conda-forge::biopython=1.80 conda-forge::openpyxl=3.1.0 conda-forge::pandas=1.5.3 conda-forge::rich=12.6.0 conda-forge::typer=0.7.0 conda-forge::xlsxwriter=3.0.8 conda-forge::polars=0.17.9 conda-forge::pyarrow=11.0.0' : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { @@ -20,7 +20,6 @@ process PULL_TOP_REF_ID { script: """ parse_influenza_blast_results.py \\ - --threads ${task.cpus} \\ --flu-metadata $genomeset \\ --get-top-ref True \\ --top 1 \\ From b21b89dc48ebad97dc93541fe0c0ad71ec4ca2d9 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Tue, 11 Jul 2023 12:12:32 -0500 Subject: [PATCH 10/11] Update ci.yml --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b06d54d..4648ac8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -202,12 +202,12 @@ jobs: with: name: nanopore-test-results-pipline_info-${{ matrix.nxf_ver }} path: results/pipeline_info - - name: Upload iav-subtyping-report.xlsx + - name: Upload nf-flu-subtyping-report.xlsx if: success() uses: actions/upload-artifact@v1.0.0 with: name: nanopore-test-results-subtyping-report-${{ matrix.nxf_ver }} - path: results/iav-subtyping-report.xlsx + path: results/nf-flu-subtyping-report.xlsx - name: Upload multiqc_report.html if: success() uses: actions/upload-artifact@v1.0.0 From 2ed8bf572eaabae94836ea0db576afb7f6abf335 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Tue, 11 Jul 2023 15:18:14 -0500 Subject: [PATCH 11/11] Update docs --- CHANGELOG.md | 8 ++++++++ README.md | 13 +++++++------ docs/output.md | 2 +- docs/usage.md | 8 ++++---- nextflow.config | 2 +- nextflow_schema.json | 8 ++++---- 6 files changed, 25 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8833b1..54fa4fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[3.3.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.0)] - 2023-07-11 + +This release migrates to more recently updated Influenza virus sequences since the last update for the [NCBI Influenza DB FTP data](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/) was in 2020-10-13. By default, all Orthomyxoviridae virus sequences were parsed from the daily updated NCBI Viruses [`AllNucleotide.fa`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/) and [`AllNuclMetadata.csv.gz`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNuclMetadata/AllNuclMetadata.csv.gz) and uploaded to [Figshare](https://figshare.com/articles/dataset/2023-06-14_-_NCBI_Viruses_-_Orthomyxoviridae/23608782) as Zstd compressed files. nf-flu no longer uses the [influenza.fna.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz) and [genomeset.dat.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz) files for Influenza sequences and metadata, respectively. + +### Fixes + +* More up-to-date Influenza sequences database used by default (#24) + ## [[3.2.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.2.1)] - 2023-07-07 ### Fixes diff --git a/README.md b/README.md index 473efca..9e77960 100644 --- a/README.md +++ b/README.md @@ -17,13 +17,14 @@ After reference sequence selection, the pipeline performs read mapping to each r ## Pipeline summary -1. Download latest [NCBI Influenza DB][] sequences and metadata (or use user-specified files) -2. Merge reads of re-sequenced samples ([`cat`](http://www.linfo.org/cat.html)) (if needed) +1. Download latest [NCBI Orthomyxoviridae sequences](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Undef&id=11308&lvl=3&keep=1&srchmode=1&unlock) and metadata (parsed from [NCBI Viruses FTP data](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/)). +2. Merge reads of re-sequenced samples ([`cat`](http://www.linfo.org/cat.html)) (if needed). 3. Assembly of Influenza gene segments with [IRMA][] using the built-in FLU module -4. Nucleotide [BLAST][] search against [NCBI Influenza DB][] -5. Automatically select top match references for segments -6. H/N subtype prediction and Excel XLSX report generation based on BLAST results -7. Perform Variant calling and genome assembly for all segments. +4. Nucleotide [BLAST][] search against [NCBI Influenza DB][] sequences +5. H/N subtype prediction and Excel XLSX report generation based on BLAST results. +6. Automatically select top match reference sequences for segments +7. Read mapping, variant calling and consensus sequence generation for each segment against top reference sequence based on BLAST results. +8. MultiQC report generation. ## Quick Start diff --git a/docs/output.md b/docs/output.md index d0a904e..7f69321 100644 --- a/docs/output.md +++ b/docs/output.md @@ -78,7 +78,7 @@ The primary output from [IRMA][] are the consensus sequences for gene segments, -Nucleotide [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi) (`blastn`) is used to query [IRMA][] assembled gene segment sequences against the [NCBI Influenza DB][] sequences (and optionally, against user-specified sequences (`--ref_db`) to predict the H and N subtype of each sample if possible (i.e. if segments 4 (hemagglutinin) and/or 6 (neuraminidase) were assembled) and to determine the closest matching reference sequence for each segment for reference mapped assembly. +Nucleotide [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi) (`blastn`) is used to query [IRMA][] assembled gene segment sequences against [Influenza sequences from NCBI](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/) (and optionally, against user-specified sequences (`--ref_db`) to predict the H and N subtype of each sample if possible (i.e. if segments 4 (hemagglutinin) and/or 6 (neuraminidase) were assembled) and to determine the closest matching reference sequence for each segment for reference mapped assembly. ### Coverage Plots diff --git a/docs/usage.md b/docs/usage.md index eef8cee..d59ce7f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -261,17 +261,17 @@ Maximum of top blastn result reported - Optional - Type: string -- Default: `https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz` +- Default: `https://api.figshare.com/v2/file/download/41415330` -Path/URL to NCBI Influenza DB sequences FASTA file. +Path/URL to Zstandard compressed NCBI Influenza virus sequences FASTA file. #### `--ncbi_influenza_metadata` - Optional - Type: string -- Default: `https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz` +- Default: `https://api.figshare.com/v2/file/download/41415333` -Path/URL to NCBI Influenza DB metadata file. +Path/URL to Zstandard compressed NCBI Influenza virus sequences metadata CSV file. ### Generic options diff --git a/nextflow.config b/nextflow.config index ea61c7f..2f4bd39 100644 --- a/nextflow.config +++ b/nextflow.config @@ -153,7 +153,7 @@ manifest { description = 'Influenza A virus genome assembly pipeline' homePage = 'https://github.com/CFIA-NCFAD/nf-flu' author = 'Peter Kruczkiewicz, Hai Nguyen' - version = '3.2.1' + version = '3.3.0' nextflowVersion = '>=21.10' mainScript = 'main.nf' doi = '10.5281/zenodo.7011213' diff --git a/nextflow_schema.json b/nextflow_schema.json index 32515d3..58b0e32 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -213,14 +213,14 @@ }, "ncbi_influenza_fasta": { "type": "string", - "default": "https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz", - "description": "Path/URL to NCBI Influenza DB sequences FASTA file.", + "default": "https://api.figshare.com/v2/file/download/41415330", + "description": "Path/URL to Zstandard compressed NCBI Influenza virus sequences FASTA file.", "fa_icon": "fas fa-file-alt" }, "ncbi_influenza_metadata": { "type": "string", - "default": "https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz", - "description": "Path/URL to NCBI Influenza DB metadata file.", + "default": "https://api.figshare.com/v2/file/download/41415333", + "description": "Path/URL to Zstandard compressed NCBI Influenza virus sequences metadata CSV file.", "fa_icon": "fas fa-file-csv" } },