From 8af77cbe6302acaae9ba29c81fe65f9c3125897d Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Thu, 18 Jul 2024 15:32:54 -0500 Subject: [PATCH] Add VADR v1.6.3 Influenza consensus sequence annotation * Add table2asn v1.28.943 for converting Feature Table files to Genbank * Add bin/sub_seqids_for_table2asn.py to prepare Feature Table output for table2asn since table2asn truncates long seq IDs * Add bin/post_table2asn.py to sub in original seq IDs in Genbank and output GFF, amino acid FASTA and nucleotide FASTA of CDS, mature peptide and signal peptide gene features --- bin/post_table2asn.py | 127 ++++++++++++++++++++++++++++++++ bin/sub_seqids_for_table2asn.py | 72 ++++++++++++++++++ conf/modules_nanopore.config | 22 ++++++ modules/local/misc.nf | 8 +- modules/local/table2asn.nf | 88 ++++++++++++++++++++++ modules/local/vadr.nf | 31 ++++++++ workflows/nanopore.nf | 14 ++++ 7 files changed, 358 insertions(+), 4 deletions(-) create mode 100755 bin/post_table2asn.py create mode 100755 bin/sub_seqids_for_table2asn.py create mode 100644 modules/local/table2asn.nf create mode 100644 modules/local/vadr.nf diff --git a/bin/post_table2asn.py b/bin/post_table2asn.py new file mode 100755 index 0000000..2bced54 --- /dev/null +++ b/bin/post_table2asn.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python + +from pathlib import Path +import typer +from rich.logging import RichHandler +import logging +import re +import os +import os +from Bio.SeqRecord import SeqRecord +from Bio.SeqIO.InsdcIO import _insdc_location_string +from gfflu.io import get_translation, write_gff +from Bio import SeqIO + + +REGEX_SUBNAME = re.compile(r'(SEQUENCE-\d+)') + + +def init_logging(): + from rich.traceback import install + + install(show_locals=True, width=120, word_wrap=True) + logging.basicConfig( + format="%(message)s", + datefmt="[%Y-%m-%d %X]", + level=logging.DEBUG, + handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)], + ) + + +def get_namesub(namesub_txt: Path) -> dict[str, str]: + namesub = {} + with open(namesub_txt) as fh: + for line in fh: + line = line.strip() + if line == '': continue + sub_name, original_name = line.split('\t', maxsplit=1) + namesub[sub_name] = original_name + return namesub + + +def write_aa_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None: + """Write out FASTA file with amino acid sequences""" + with open(out_file, "w") as out_handle: + for rec in recs: + for feature in rec.features: + if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}: + continue + translation = get_translation(rec.seq, feature.location) + try: + gene = feature.qualifiers["gene"][0] + except KeyError: + gene = "" + try: + product = feature.qualifiers["product"][0] + except KeyError: + product = "" + location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq)) + out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{translation}\n") + +def write_cds_nt_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None: + """Write out FASTA file with CDS nt sequences""" + with open(out_file, "w") as out_handle: + for rec in recs: + for feature in rec.features: + if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}: + continue + subseq = str(feature.location.extract(rec.seq)) + try: + gene = feature.qualifiers["gene"][0] + except KeyError: + gene = "" + try: + product = feature.qualifiers["product"][0] + except KeyError: + product = "" + location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq)) + out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{subseq}\n") + + +def output_subbed_genbank( + gbk_path: Path, + gbk_out: Path, + namesub: dict[str, str], +) -> None: + with open(gbk_path) as fh, open(gbk_out, 'w') as fout: + for line in fh: + match = REGEX_SUBNAME.search(line) + if match: + subname = match.group(1) + original_name = namesub[subname] + line = line.replace(subname, original_name) + fout.write(line) + + +def main( + table2asn_genbank: Path, + namesub_txt: Path, + outdir: Path = typer.Option(Path('./'), '--outdir', '-o'), + prefix: str = typer.Option('SAMPLE', '--prefix', '-p'), +): + init_logging() + logging.info(f'{table2asn_genbank=}') + logging.info(f'{namesub_txt=}') + logging.info(f'{outdir=}') + logging.info(f'{prefix=}') + namesub = get_namesub(namesub_txt) + logging.info(f'Read {len(namesub)} from "{namesub_txt}"') + gbk_out = outdir / f'{prefix}.gbk' + output_subbed_genbank(table2asn_genbank, gbk_out, namesub) + logging.info(f'Wrote Genbank with original names to "{gbk_out}"') + recs = list(SeqIO.parse(gbk_out, 'genbank')) + logging.info(f"Read {len(recs)} sequence records from '{gbk_out}'. Writing GFF, amino acid and nucleotide FASTAs for sequence features (CDS, mature peptides, signal peptides)...") + gff_out = outdir / f'{prefix}.gff' + write_gff(recs, gff_out) + logging.info(f'Wrote GFF to "{gff_out}"') + faa_out = outdir / f'{prefix}.faa' + write_aa_fasta(recs, faa_out) + logging.info(f'Wrote amino acid FASTA to "{faa_out}"') + ffn_out = outdir / f'{prefix}.ffn' + write_cds_nt_fasta(recs, ffn_out) + logging.info(f'Wrote nucleotide FASTA of gene features to "{faa_out}"') + logging.info('Done!') + + +if __name__ == "__main__": + typer.run(main) diff --git a/bin/sub_seqids_for_table2asn.py b/bin/sub_seqids_for_table2asn.py new file mode 100755 index 0000000..146f607 --- /dev/null +++ b/bin/sub_seqids_for_table2asn.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +from pathlib import Path + +import typer +from rich.logging import RichHandler +import logging + + +def init_logging(): + from rich.traceback import install + + install(show_locals=True, width=120, word_wrap=True) + logging.basicConfig( + format="%(message)s", + datefmt="[%Y-%m-%d %X]", + level=logging.DEBUG, + handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)], + ) + + +def main( + input_fasta: Path = typer.Option(..., '--input-fasta', '-f'), + input_feature_table: Path = typer.Option(..., '--input-feature-table', '-t'), + outdir: Path = typer.Option(Path('./'), '--outdir', '-o'), + prefix: str = typer.Option('SAMPLE', '--prefix', '-p'), +): + init_logging() + logging.info(f'{input_fasta=}') + logging.info(f'{input_feature_table=}') + logging.info(f'{outdir=}') + logging.info(f'{prefix=}') + ft_seqids = [] + ft_out = outdir / f'{prefix}.tbl' + with open(input_feature_table) as fh, open(ft_out, 'w') as fout: + seqid_count = 0 + for line in fh: + if line.startswith('>Feature '): + seqid = line.replace('>Feature ', '').strip() + seqid_count += 1 + new_seqid = f'SEQUENCE-{seqid_count}' + ft_seqids.append((seqid, new_seqid)) + fout.write(line.replace(seqid, new_seqid)) + else: + fout.write(line.replace(seqid, new_seqid)) + logging.info(f"table2asn compatible feature table written to '{ft_out}'") + + namesub_path = outdir / f'{prefix}.namesub.txt' + with open(namesub_path, 'w') as fout: + for seqid, new_seqid in ft_seqids: + fout.write(f'{new_seqid}\t{seqid}\n') + logging.info(f"Text file with original and table2asn seqids written to '{namesub_path}'") + fasta_seqids = [] + fasta_out = outdir / f'{prefix}.fa' + seqid_to_new_seqid = {x: y for x, y in ft_seqids} + logging.info(f'{seqid_to_new_seqid=}') + with open(input_fasta) as fh, open(fasta_out, 'w') as fout: + for line in fh: + if line.startswith('>'): + seqid, *_ = line[1:].strip().split(' ') + try: + fout.write(line.replace(seqid, seqid_to_new_seqid[seqid])) + except KeyError as ex: + logging.error(f'{seqid=} not in feature table seqids!') + # raise ex + else: + fout.write(line) + logging.info(f"table2asn compatible fasta written to '{fasta_out}'") + + +if __name__ == "__main__": + typer.run(main) diff --git a/conf/modules_nanopore.config b/conf/modules_nanopore.config index ec5b98b..eba39e9 100644 --- a/conf/modules_nanopore.config +++ b/conf/modules_nanopore.config @@ -244,6 +244,28 @@ process { ] } + withName: 'VADR' { + ext.args = '--mkey flu -r --atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3' + publishDir = [ + [ + path: { "${params.outdir}/annotation/vadr" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ] + } + + withName: 'POST_TABLE2ASN' { + ext.args = '' + publishDir = [ + [ + path: { "${params.outdir}/annotation/${sample}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ] + } + withName: 'VCF_FILTER_FRAMESHIFT' { publishDir = [ [ diff --git a/modules/local/misc.nf b/modules/local/misc.nf index d4f8ca5..02cf17b 100644 --- a/modules/local/misc.nf +++ b/modules/local/misc.nf @@ -58,11 +58,11 @@ process CAT_DB { process CAT_CONSENSUS { tag "$sample" - conda 'bioconda::shiptv=0.4.0' + conda 'bioconda::shiptv=0.4.1' if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.0--pyh5e36f6f_0' + container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.1--pyh5e36f6f_0' } else { - container 'quay.io/biocontainers/shiptv:0.4.0--pyh5e36f6f_0' + container 'quay.io/biocontainers/shiptv:0.4.1--pyh5e36f6f_0' } input: @@ -70,7 +70,7 @@ process CAT_CONSENSUS { output: tuple val(sample), path('*.consensus.blastn.fasta'), emit: fasta - path('*.consensus.fasta'), emit: consensus_fasta + tuple val(sample), path('*.consensus.fasta'), emit: consensus_fasta path "versions.yml" , emit: versions script: diff --git a/modules/local/table2asn.nf b/modules/local/table2asn.nf new file mode 100644 index 0000000..65d234b --- /dev/null +++ b/modules/local/table2asn.nf @@ -0,0 +1,88 @@ +process PRE_TABLE2ASN { + tag "$sample" + conda 'bioconda::shiptv=0.4.1' + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.1--pyh5e36f6f_0' + } else { + container 'quay.io/biocontainers/shiptv:0.4.1--pyh5e36f6f_0' + } + + input: + tuple val(sample), path(feature_table, stageAs: "input*/*"), path(fasta, stageAs: "input*/*") + + output: + tuple val(sample), path("${sample}.tbl"), path("${sample}.fa"), path("${sample}.namesub.txt"), emit: table2asn_input + path "versions.yml", emit: versions + + script: + """ + sub_seqids_for_table2asn.py -t $feature_table -f $fasta -o ./ -p $sample + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} + +process TABLE2ASN { + tag "$sample" + conda 'bioconda::table2asn=1.28.943' + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/table2asn:1.28.943--h48fe88c_0' + } else { + container 'quay.io/biocontainers/table2asn:1.28.943--h48fe88c_0' + } + + input: + tuple val(sample), path(feature_table), path(fasta, stageAs: 'input*/*'), path(namesub) + + output: + tuple val(sample), path("${prefix}.gbf"), path(namesub), optional: true, emit: genbank + path "versions.yml", emit: versions + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${sample}" + """ + # ensure that Genbank output file has the right name + ln -s $fasta ${prefix}.fa + # suppress non-zero exit codes due to warnings from table2asn + table2asn -indir ./ -outdir ./ -V b -c - -i ${prefix}.fa || true + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + table2asn: \$(table2asn -version | sed 's/table2asn: //') + END_VERSIONS + """ +} + +process POST_TABLE2ASN { + tag "$sample" + conda 'bioconda::gfflu=0.0.2' + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/gfflu:0.0.2--pyhdfd78af_0' + } else { + container 'quay.io/biocontainers/gfflu:0.0.2--pyhdfd78af_0' + } + + input: + tuple val(sample), path(genbank, stageAs: 'input*/*'), path(namesub, stageAs: 'input*/*') + + output: + tuple val(sample), path("${sample}.gbk"), emit: genbank + tuple val(sample), path("${sample}.gff"), emit: gff + tuple val(sample), path("${sample}.faa"), emit: cds_aa_fasta + tuple val(sample), path("${sample}.ffn"), emit: cds_nt_fasta + path "versions.yml", emit: versions + + script: + """ + post_table2asn.py $genbank $namesub -p $sample -o ./ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/vadr.nf b/modules/local/vadr.nf new file mode 100644 index 0000000..c11e93e --- /dev/null +++ b/modules/local/vadr.nf @@ -0,0 +1,31 @@ +process VADR { + tag "$sample" + label 'process_low' + + conda 'pkru22::vadr=1.6.3' + container 'staphb/vadr:1.6.3-hav-flu2' + + input: + tuple val(sample), path(fasta) + + output: + tuple val(sample), path("${prefix}/*.vadr.pass.tbl"), optional: true, emit: feature_table + tuple val(sample), path("${prefix}/*.vadr.pass.fa"), optional: true, emit: pass_fasta + tuple val(sample), path("${prefix}/"), emit: vadr_outdir + path "versions.yml", emit: versions + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${sample}" + """ + v-annotate.pl \\ + $args \\ + $fasta \\ + ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vadr: \$(v-annotate.pl -h | perl -ne 'print "\$1\\n" if /^# VADR (\\d+\\.\\d+\\.\\d+)/') + END_VERSIONS + """ +} diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf index 584c74b..91641ae 100644 --- a/workflows/nanopore.nf +++ b/workflows/nanopore.nf @@ -32,6 +32,8 @@ include { BLAST_BLASTN as BLAST_BLASTN_IRMA } from '../modules include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS } from '../modules/local/blastn' include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS_REF_DB } from '../modules/local/blastn' include { CUSTOM_DUMPSOFTWAREVERSIONS as SOFTWARE_VERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { VADR } from '../modules/local/vadr' +include { PRE_TABLE2ASN; TABLE2ASN; POST_TABLE2ASN } from '../modules/local/table2asn' include { MULTIQC } from '../modules/local/multiqc' def pass_sample_reads = [:] @@ -240,6 +242,18 @@ workflow NANOPORE { CAT_CONSENSUS(ch_final_consensus) ch_versions = ch_versions.mix(CAT_CONSENSUS.out.versions) + VADR(CAT_CONSENSUS.out.consensus_fasta) + ch_versions = ch_versions.mix(VADR.out.versions) + VADR.out.feature_table + .combine(VADR.out.pass_fasta, by: 0) + .set { ch_pre_table2asn } + PRE_TABLE2ASN(ch_pre_table2asn) + ch_versions = ch_versions.mix(PRE_TABLE2ASN.out.versions) + TABLE2ASN(PRE_TABLE2ASN.out.table2asn_input) + ch_versions = ch_versions.mix(TABLE2ASN.out.versions) + POST_TABLE2ASN(TABLE2ASN.out.genbank) + ch_versions = ch_versions.mix(POST_TABLE2ASN.out.versions) + CAT_CONSENSUS.out.fasta .map { [[id:it[0]], it[1]] } .set { ch_cat_consensus }