From 8af77cbe6302acaae9ba29c81fe65f9c3125897d Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Thu, 18 Jul 2024 15:32:54 -0500
Subject: [PATCH] Add VADR v1.6.3 Influenza consensus sequence annotation * Add
 table2asn v1.28.943 for converting Feature Table files to Genbank * Add
 bin/sub_seqids_for_table2asn.py to prepare Feature Table output   for
 table2asn since table2asn truncates long seq IDs * Add bin/post_table2asn.py
 to sub in original seq IDs in Genbank and output GFF, amino acid FASTA and
 nucleotide FASTA of CDS, mature peptide and signal peptide gene features

---
 bin/post_table2asn.py           | 127 ++++++++++++++++++++++++++++++++
 bin/sub_seqids_for_table2asn.py |  72 ++++++++++++++++++
 conf/modules_nanopore.config    |  22 ++++++
 modules/local/misc.nf           |   8 +-
 modules/local/table2asn.nf      |  88 ++++++++++++++++++++++
 modules/local/vadr.nf           |  31 ++++++++
 workflows/nanopore.nf           |  14 ++++
 7 files changed, 358 insertions(+), 4 deletions(-)
 create mode 100755 bin/post_table2asn.py
 create mode 100755 bin/sub_seqids_for_table2asn.py
 create mode 100644 modules/local/table2asn.nf
 create mode 100644 modules/local/vadr.nf

diff --git a/bin/post_table2asn.py b/bin/post_table2asn.py
new file mode 100755
index 0000000..2bced54
--- /dev/null
+++ b/bin/post_table2asn.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+
+from pathlib import Path
+import typer
+from rich.logging import RichHandler
+import logging
+import re
+import os
+import os
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqIO.InsdcIO import _insdc_location_string
+from gfflu.io import get_translation, write_gff
+from Bio import SeqIO
+
+
+REGEX_SUBNAME = re.compile(r'(SEQUENCE-\d+)')
+
+
+def init_logging():
+    from rich.traceback import install
+
+    install(show_locals=True, width=120, word_wrap=True)
+    logging.basicConfig(
+        format="%(message)s",
+        datefmt="[%Y-%m-%d %X]",
+        level=logging.DEBUG,
+        handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)],
+    )
+
+
+def get_namesub(namesub_txt: Path) -> dict[str, str]:
+    namesub = {}
+    with open(namesub_txt) as fh:
+        for line in fh:
+            line = line.strip()
+            if line == '': continue
+            sub_name, original_name = line.split('\t', maxsplit=1)
+            namesub[sub_name] = original_name
+    return namesub
+
+
+def write_aa_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None:
+    """Write out FASTA file with amino acid sequences"""
+    with open(out_file, "w") as out_handle:
+        for rec in recs:
+            for feature in rec.features:
+                if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}:
+                    continue
+                translation = get_translation(rec.seq, feature.location)
+                try:
+                    gene = feature.qualifiers["gene"][0]
+                except KeyError:
+                    gene = ""
+                try:
+                    product = feature.qualifiers["product"][0]
+                except KeyError:
+                    product = ""
+                location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq))
+                out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{translation}\n")
+
+def write_cds_nt_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None:
+    """Write out FASTA file with CDS nt sequences"""
+    with open(out_file, "w") as out_handle:
+        for rec in recs:
+            for feature in rec.features:
+                if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}:
+                    continue
+                subseq = str(feature.location.extract(rec.seq))
+                try:
+                    gene = feature.qualifiers["gene"][0]
+                except KeyError:
+                    gene = ""
+                try:
+                    product = feature.qualifiers["product"][0]
+                except KeyError:
+                    product = ""
+                location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq))
+                out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{subseq}\n")
+
+
+def output_subbed_genbank(
+    gbk_path: Path, 
+    gbk_out: Path, 
+    namesub: dict[str, str],
+) -> None:
+    with open(gbk_path) as fh, open(gbk_out, 'w') as fout:
+        for line in fh:
+            match = REGEX_SUBNAME.search(line)
+            if match:
+                subname = match.group(1)
+                original_name = namesub[subname]
+                line = line.replace(subname, original_name)
+            fout.write(line)
+
+
+def main(
+    table2asn_genbank: Path,
+    namesub_txt: Path,
+    outdir: Path = typer.Option(Path('./'), '--outdir', '-o'),
+    prefix: str = typer.Option('SAMPLE', '--prefix', '-p'),
+):
+    init_logging()
+    logging.info(f'{table2asn_genbank=}')
+    logging.info(f'{namesub_txt=}')
+    logging.info(f'{outdir=}')
+    logging.info(f'{prefix=}')
+    namesub = get_namesub(namesub_txt)
+    logging.info(f'Read {len(namesub)} from "{namesub_txt}"')
+    gbk_out = outdir / f'{prefix}.gbk'
+    output_subbed_genbank(table2asn_genbank, gbk_out, namesub)
+    logging.info(f'Wrote Genbank with original names to "{gbk_out}"')
+    recs = list(SeqIO.parse(gbk_out, 'genbank'))
+    logging.info(f"Read {len(recs)} sequence records from '{gbk_out}'. Writing GFF, amino acid and nucleotide FASTAs for sequence features (CDS, mature peptides, signal peptides)...")
+    gff_out = outdir / f'{prefix}.gff'
+    write_gff(recs, gff_out)
+    logging.info(f'Wrote GFF to "{gff_out}"')
+    faa_out = outdir / f'{prefix}.faa'
+    write_aa_fasta(recs, faa_out)
+    logging.info(f'Wrote amino acid FASTA to "{faa_out}"')
+    ffn_out = outdir / f'{prefix}.ffn'
+    write_cds_nt_fasta(recs, ffn_out)
+    logging.info(f'Wrote nucleotide FASTA of gene features to "{faa_out}"')
+    logging.info('Done!')
+
+
+if __name__ == "__main__":
+    typer.run(main)
diff --git a/bin/sub_seqids_for_table2asn.py b/bin/sub_seqids_for_table2asn.py
new file mode 100755
index 0000000..146f607
--- /dev/null
+++ b/bin/sub_seqids_for_table2asn.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+from pathlib import Path
+
+import typer
+from rich.logging import RichHandler
+import logging
+
+
+def init_logging():
+    from rich.traceback import install
+
+    install(show_locals=True, width=120, word_wrap=True)
+    logging.basicConfig(
+        format="%(message)s",
+        datefmt="[%Y-%m-%d %X]",
+        level=logging.DEBUG,
+        handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)],
+    )
+
+
+def main(
+    input_fasta: Path = typer.Option(..., '--input-fasta', '-f'),
+    input_feature_table: Path = typer.Option(..., '--input-feature-table', '-t'),
+    outdir: Path = typer.Option(Path('./'), '--outdir', '-o'),
+    prefix: str = typer.Option('SAMPLE', '--prefix', '-p'),
+):
+    init_logging()
+    logging.info(f'{input_fasta=}')
+    logging.info(f'{input_feature_table=}')
+    logging.info(f'{outdir=}')
+    logging.info(f'{prefix=}')
+    ft_seqids = []
+    ft_out = outdir / f'{prefix}.tbl'
+    with open(input_feature_table) as fh, open(ft_out, 'w') as fout:
+        seqid_count = 0
+        for line in fh:
+            if line.startswith('>Feature '):
+                seqid = line.replace('>Feature ', '').strip()
+                seqid_count += 1
+                new_seqid = f'SEQUENCE-{seqid_count}'
+                ft_seqids.append((seqid, new_seqid))
+                fout.write(line.replace(seqid, new_seqid))
+            else:
+                fout.write(line.replace(seqid, new_seqid))
+    logging.info(f"table2asn compatible feature table written to '{ft_out}'")
+
+    namesub_path = outdir / f'{prefix}.namesub.txt'
+    with open(namesub_path, 'w') as fout:
+        for seqid, new_seqid in ft_seqids:
+            fout.write(f'{new_seqid}\t{seqid}\n')
+    logging.info(f"Text file with original and table2asn seqids written to '{namesub_path}'")
+    fasta_seqids = []
+    fasta_out = outdir / f'{prefix}.fa'
+    seqid_to_new_seqid = {x: y for x, y in ft_seqids}
+    logging.info(f'{seqid_to_new_seqid=}')
+    with open(input_fasta) as fh, open(fasta_out, 'w') as fout:
+        for line in fh:
+            if line.startswith('>'):
+                seqid, *_ = line[1:].strip().split(' ')
+                try:
+                    fout.write(line.replace(seqid, seqid_to_new_seqid[seqid]))
+                except KeyError as ex:
+                    logging.error(f'{seqid=} not in feature table seqids!')
+                    # raise ex
+            else:
+                fout.write(line)
+    logging.info(f"table2asn compatible fasta written to '{fasta_out}'")
+
+
+if __name__ == "__main__":
+    typer.run(main)
diff --git a/conf/modules_nanopore.config b/conf/modules_nanopore.config
index ec5b98b..eba39e9 100644
--- a/conf/modules_nanopore.config
+++ b/conf/modules_nanopore.config
@@ -244,6 +244,28 @@ process {
     ]
   }
 
+  withName: 'VADR' {
+    ext.args = '--mkey flu -r --atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3'
+    publishDir = [
+      [
+        path: { "${params.outdir}/annotation/vadr" },
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+      ]
+    ]
+  }
+
+  withName: 'POST_TABLE2ASN' {
+    ext.args = ''
+    publishDir = [
+      [
+        path: { "${params.outdir}/annotation/${sample}" },
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+      ]
+    ]
+  }
+
   withName: 'VCF_FILTER_FRAMESHIFT' {
     publishDir = [
       [
diff --git a/modules/local/misc.nf b/modules/local/misc.nf
index d4f8ca5..02cf17b 100644
--- a/modules/local/misc.nf
+++ b/modules/local/misc.nf
@@ -58,11 +58,11 @@ process CAT_DB {
 
 process CAT_CONSENSUS {
   tag "$sample"
-  conda 'bioconda::shiptv=0.4.0'
+  conda 'bioconda::shiptv=0.4.1'
   if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-    container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.0--pyh5e36f6f_0'
+    container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.1--pyh5e36f6f_0'
   } else {
-    container 'quay.io/biocontainers/shiptv:0.4.0--pyh5e36f6f_0'
+    container 'quay.io/biocontainers/shiptv:0.4.1--pyh5e36f6f_0'
   }
 
   input:
@@ -70,7 +70,7 @@ process CAT_CONSENSUS {
 
   output:
   tuple val(sample), path('*.consensus.blastn.fasta'), emit: fasta
-  path('*.consensus.fasta'), emit: consensus_fasta
+  tuple val(sample), path('*.consensus.fasta'), emit: consensus_fasta
   path "versions.yml" , emit: versions
 
   script:
diff --git a/modules/local/table2asn.nf b/modules/local/table2asn.nf
new file mode 100644
index 0000000..65d234b
--- /dev/null
+++ b/modules/local/table2asn.nf
@@ -0,0 +1,88 @@
+process PRE_TABLE2ASN {
+  tag "$sample"
+  conda 'bioconda::shiptv=0.4.1'
+  if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+    container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.1--pyh5e36f6f_0'
+  } else {
+    container 'quay.io/biocontainers/shiptv:0.4.1--pyh5e36f6f_0'
+  }
+
+  input:
+  tuple val(sample), path(feature_table, stageAs: "input*/*"), path(fasta, stageAs: "input*/*")
+
+  output:
+  tuple val(sample), path("${sample}.tbl"), path("${sample}.fa"), path("${sample}.namesub.txt"), emit: table2asn_input
+  path "versions.yml", emit: versions
+
+  script:
+  """
+  sub_seqids_for_table2asn.py -t $feature_table -f $fasta -o ./ -p $sample
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+     python: \$(python --version | sed 's/Python //g')
+  END_VERSIONS
+  """
+}
+
+process TABLE2ASN {
+  tag "$sample"
+  conda 'bioconda::table2asn=1.28.943'
+  if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+    container 'https://depot.galaxyproject.org/singularity/table2asn:1.28.943--h48fe88c_0'
+  } else {
+    container 'quay.io/biocontainers/table2asn:1.28.943--h48fe88c_0'
+  }
+
+  input:
+  tuple val(sample), path(feature_table), path(fasta, stageAs: 'input*/*'), path(namesub)
+
+  output:
+  tuple val(sample), path("${prefix}.gbf"), path(namesub), optional: true, emit: genbank
+  path "versions.yml", emit: versions
+
+  script:
+  def args = task.ext.args ?: ''
+  prefix = task.ext.prefix ?: "${sample}"
+  """
+  # ensure that Genbank output file has the right name
+  ln -s $fasta ${prefix}.fa
+  # suppress non-zero exit codes due to warnings from table2asn
+  table2asn -indir ./ -outdir ./ -V b -c - -i ${prefix}.fa || true
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+      table2asn: \$(table2asn -version | sed 's/table2asn: //')
+  END_VERSIONS
+  """
+}
+
+process POST_TABLE2ASN {
+  tag "$sample"
+  conda 'bioconda::gfflu=0.0.2'
+  if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+    container 'https://depot.galaxyproject.org/singularity/gfflu:0.0.2--pyhdfd78af_0'
+  } else {
+    container 'quay.io/biocontainers/gfflu:0.0.2--pyhdfd78af_0'
+  }
+
+  input:
+  tuple val(sample), path(genbank, stageAs: 'input*/*'), path(namesub, stageAs: 'input*/*')
+
+  output:
+  tuple val(sample), path("${sample}.gbk"), emit: genbank
+  tuple val(sample), path("${sample}.gff"), emit: gff
+  tuple val(sample), path("${sample}.faa"), emit: cds_aa_fasta
+  tuple val(sample), path("${sample}.ffn"), emit: cds_nt_fasta
+  path "versions.yml", emit: versions
+
+  script:
+  """
+  post_table2asn.py $genbank $namesub -p $sample -o ./
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+     python: \$(python --version | sed 's/Python //g')
+  END_VERSIONS
+  """
+}
diff --git a/modules/local/vadr.nf b/modules/local/vadr.nf
new file mode 100644
index 0000000..c11e93e
--- /dev/null
+++ b/modules/local/vadr.nf
@@ -0,0 +1,31 @@
+process VADR {
+  tag "$sample"
+  label 'process_low'
+
+  conda 'pkru22::vadr=1.6.3'
+  container 'staphb/vadr:1.6.3-hav-flu2'
+
+  input:
+  tuple val(sample), path(fasta)
+
+  output:
+  tuple val(sample), path("${prefix}/*.vadr.pass.tbl"), optional: true, emit: feature_table
+  tuple val(sample), path("${prefix}/*.vadr.pass.fa"), optional: true, emit: pass_fasta
+  tuple val(sample), path("${prefix}/"), emit: vadr_outdir
+  path "versions.yml", emit: versions
+
+  script:
+  def args = task.ext.args ?: ''
+  prefix = task.ext.prefix ?: "${sample}"
+  """
+  v-annotate.pl \\
+    $args \\
+    $fasta \\
+    ${prefix}
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+      vadr: \$(v-annotate.pl -h | perl -ne 'print "\$1\\n" if /^# VADR (\\d+\\.\\d+\\.\\d+)/')
+  END_VERSIONS
+  """
+}
diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf
index 584c74b..91641ae 100644
--- a/workflows/nanopore.nf
+++ b/workflows/nanopore.nf
@@ -32,6 +32,8 @@ include { BLAST_BLASTN as BLAST_BLASTN_IRMA                   } from '../modules
 include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS              } from '../modules/local/blastn'
 include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS_REF_DB       } from '../modules/local/blastn'
 include { CUSTOM_DUMPSOFTWAREVERSIONS  as SOFTWARE_VERSIONS   } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
+include { VADR                                                } from '../modules/local/vadr'
+include { PRE_TABLE2ASN; TABLE2ASN; POST_TABLE2ASN            } from '../modules/local/table2asn'
 include { MULTIQC                                             } from '../modules/local/multiqc'
 
 def pass_sample_reads = [:]
@@ -240,6 +242,18 @@ workflow NANOPORE {
   CAT_CONSENSUS(ch_final_consensus)
   ch_versions = ch_versions.mix(CAT_CONSENSUS.out.versions)
 
+  VADR(CAT_CONSENSUS.out.consensus_fasta)
+  ch_versions = ch_versions.mix(VADR.out.versions)
+  VADR.out.feature_table
+    .combine(VADR.out.pass_fasta, by: 0)
+    .set { ch_pre_table2asn }
+  PRE_TABLE2ASN(ch_pre_table2asn)
+  ch_versions = ch_versions.mix(PRE_TABLE2ASN.out.versions)
+  TABLE2ASN(PRE_TABLE2ASN.out.table2asn_input)
+  ch_versions = ch_versions.mix(TABLE2ASN.out.versions)
+  POST_TABLE2ASN(TABLE2ASN.out.genbank)
+  ch_versions = ch_versions.mix(POST_TABLE2ASN.out.versions)
+
   CAT_CONSENSUS.out.fasta
     .map { [[id:it[0]], it[1]] }
     .set { ch_cat_consensus }