Skip to content

Commit

Permalink
Add VADR v1.6.3 Influenza consensus sequence annotation
Browse files Browse the repository at this point in the history
* Add table2asn v1.28.943 for converting Feature Table files to Genbank
* Add bin/sub_seqids_for_table2asn.py to prepare Feature Table output
  for table2asn since table2asn truncates long seq IDs
* Add bin/post_table2asn.py to sub in original seq IDs in Genbank and output GFF, amino acid FASTA and nucleotide FASTA of CDS, mature peptide and signal peptide gene features
  • Loading branch information
peterk87 committed Jul 18, 2024
1 parent a3c05ca commit 8af77cb
Show file tree
Hide file tree
Showing 7 changed files with 358 additions and 4 deletions.
127 changes: 127 additions & 0 deletions bin/post_table2asn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env python

from pathlib import Path
import typer
from rich.logging import RichHandler
import logging
import re
import os
import os
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.InsdcIO import _insdc_location_string
from gfflu.io import get_translation, write_gff
from Bio import SeqIO


REGEX_SUBNAME = re.compile(r'(SEQUENCE-\d+)')


def init_logging():
from rich.traceback import install

install(show_locals=True, width=120, word_wrap=True)
logging.basicConfig(
format="%(message)s",
datefmt="[%Y-%m-%d %X]",
level=logging.DEBUG,
handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)],
)


def get_namesub(namesub_txt: Path) -> dict[str, str]:
namesub = {}
with open(namesub_txt) as fh:
for line in fh:
line = line.strip()
if line == '': continue
sub_name, original_name = line.split('\t', maxsplit=1)
namesub[sub_name] = original_name
return namesub


def write_aa_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None:
"""Write out FASTA file with amino acid sequences"""
with open(out_file, "w") as out_handle:
for rec in recs:
for feature in rec.features:
if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}:
continue
translation = get_translation(rec.seq, feature.location)
try:
gene = feature.qualifiers["gene"][0]
except KeyError:
gene = ""
try:
product = feature.qualifiers["product"][0]
except KeyError:
product = ""
location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq))
out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{translation}\n")

def write_cds_nt_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None:
"""Write out FASTA file with CDS nt sequences"""
with open(out_file, "w") as out_handle:
for rec in recs:
for feature in rec.features:
if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}:
continue
subseq = str(feature.location.extract(rec.seq))
try:
gene = feature.qualifiers["gene"][0]
except KeyError:
gene = ""
try:
product = feature.qualifiers["product"][0]
except KeyError:
product = ""
location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq))
out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{subseq}\n")


def output_subbed_genbank(
gbk_path: Path,
gbk_out: Path,
namesub: dict[str, str],
) -> None:
with open(gbk_path) as fh, open(gbk_out, 'w') as fout:
for line in fh:
match = REGEX_SUBNAME.search(line)
if match:
subname = match.group(1)
original_name = namesub[subname]
line = line.replace(subname, original_name)
fout.write(line)


def main(
table2asn_genbank: Path,
namesub_txt: Path,
outdir: Path = typer.Option(Path('./'), '--outdir', '-o'),
prefix: str = typer.Option('SAMPLE', '--prefix', '-p'),
):
init_logging()
logging.info(f'{table2asn_genbank=}')
logging.info(f'{namesub_txt=}')
logging.info(f'{outdir=}')
logging.info(f'{prefix=}')
namesub = get_namesub(namesub_txt)
logging.info(f'Read {len(namesub)} from "{namesub_txt}"')
gbk_out = outdir / f'{prefix}.gbk'
output_subbed_genbank(table2asn_genbank, gbk_out, namesub)
logging.info(f'Wrote Genbank with original names to "{gbk_out}"')
recs = list(SeqIO.parse(gbk_out, 'genbank'))
logging.info(f"Read {len(recs)} sequence records from '{gbk_out}'. Writing GFF, amino acid and nucleotide FASTAs for sequence features (CDS, mature peptides, signal peptides)...")
gff_out = outdir / f'{prefix}.gff'
write_gff(recs, gff_out)
logging.info(f'Wrote GFF to "{gff_out}"')
faa_out = outdir / f'{prefix}.faa'
write_aa_fasta(recs, faa_out)
logging.info(f'Wrote amino acid FASTA to "{faa_out}"')
ffn_out = outdir / f'{prefix}.ffn'
write_cds_nt_fasta(recs, ffn_out)
logging.info(f'Wrote nucleotide FASTA of gene features to "{faa_out}"')
logging.info('Done!')


if __name__ == "__main__":
typer.run(main)
72 changes: 72 additions & 0 deletions bin/sub_seqids_for_table2asn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env python

from pathlib import Path

import typer
from rich.logging import RichHandler
import logging


def init_logging():
from rich.traceback import install

install(show_locals=True, width=120, word_wrap=True)
logging.basicConfig(
format="%(message)s",
datefmt="[%Y-%m-%d %X]",
level=logging.DEBUG,
handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)],
)


def main(
input_fasta: Path = typer.Option(..., '--input-fasta', '-f'),
input_feature_table: Path = typer.Option(..., '--input-feature-table', '-t'),
outdir: Path = typer.Option(Path('./'), '--outdir', '-o'),
prefix: str = typer.Option('SAMPLE', '--prefix', '-p'),
):
init_logging()
logging.info(f'{input_fasta=}')
logging.info(f'{input_feature_table=}')
logging.info(f'{outdir=}')
logging.info(f'{prefix=}')
ft_seqids = []
ft_out = outdir / f'{prefix}.tbl'
with open(input_feature_table) as fh, open(ft_out, 'w') as fout:
seqid_count = 0
for line in fh:
if line.startswith('>Feature '):
seqid = line.replace('>Feature ', '').strip()
seqid_count += 1
new_seqid = f'SEQUENCE-{seqid_count}'
ft_seqids.append((seqid, new_seqid))
fout.write(line.replace(seqid, new_seqid))
else:
fout.write(line.replace(seqid, new_seqid))
logging.info(f"table2asn compatible feature table written to '{ft_out}'")

namesub_path = outdir / f'{prefix}.namesub.txt'
with open(namesub_path, 'w') as fout:
for seqid, new_seqid in ft_seqids:
fout.write(f'{new_seqid}\t{seqid}\n')
logging.info(f"Text file with original and table2asn seqids written to '{namesub_path}'")
fasta_seqids = []
fasta_out = outdir / f'{prefix}.fa'
seqid_to_new_seqid = {x: y for x, y in ft_seqids}
logging.info(f'{seqid_to_new_seqid=}')
with open(input_fasta) as fh, open(fasta_out, 'w') as fout:
for line in fh:
if line.startswith('>'):
seqid, *_ = line[1:].strip().split(' ')
try:
fout.write(line.replace(seqid, seqid_to_new_seqid[seqid]))
except KeyError as ex:
logging.error(f'{seqid=} not in feature table seqids!')
# raise ex
else:
fout.write(line)
logging.info(f"table2asn compatible fasta written to '{fasta_out}'")


if __name__ == "__main__":
typer.run(main)
22 changes: 22 additions & 0 deletions conf/modules_nanopore.config
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,28 @@ process {
]
}

withName: 'VADR' {
ext.args = '--mkey flu -r --atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3'
publishDir = [
[
path: { "${params.outdir}/annotation/vadr" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
]
}

withName: 'POST_TABLE2ASN' {
ext.args = ''
publishDir = [
[
path: { "${params.outdir}/annotation/${sample}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
]
}

withName: 'VCF_FILTER_FRAMESHIFT' {
publishDir = [
[
Expand Down
8 changes: 4 additions & 4 deletions modules/local/misc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,19 @@ process CAT_DB {

process CAT_CONSENSUS {
tag "$sample"
conda 'bioconda::shiptv=0.4.0'
conda 'bioconda::shiptv=0.4.1'
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.0--pyh5e36f6f_0'
container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.1--pyh5e36f6f_0'
} else {
container 'quay.io/biocontainers/shiptv:0.4.0--pyh5e36f6f_0'
container 'quay.io/biocontainers/shiptv:0.4.1--pyh5e36f6f_0'
}

input:
tuple val(sample), path(consensus)

output:
tuple val(sample), path('*.consensus.blastn.fasta'), emit: fasta
path('*.consensus.fasta'), emit: consensus_fasta
tuple val(sample), path('*.consensus.fasta'), emit: consensus_fasta
path "versions.yml" , emit: versions

script:
Expand Down
88 changes: 88 additions & 0 deletions modules/local/table2asn.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
process PRE_TABLE2ASN {
tag "$sample"
conda 'bioconda::shiptv=0.4.1'
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.1--pyh5e36f6f_0'
} else {
container 'quay.io/biocontainers/shiptv:0.4.1--pyh5e36f6f_0'
}

input:
tuple val(sample), path(feature_table, stageAs: "input*/*"), path(fasta, stageAs: "input*/*")

output:
tuple val(sample), path("${sample}.tbl"), path("${sample}.fa"), path("${sample}.namesub.txt"), emit: table2asn_input
path "versions.yml", emit: versions

script:
"""
sub_seqids_for_table2asn.py -t $feature_table -f $fasta -o ./ -p $sample
cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""
}

process TABLE2ASN {
tag "$sample"
conda 'bioconda::table2asn=1.28.943'
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container 'https://depot.galaxyproject.org/singularity/table2asn:1.28.943--h48fe88c_0'
} else {
container 'quay.io/biocontainers/table2asn:1.28.943--h48fe88c_0'
}

input:
tuple val(sample), path(feature_table), path(fasta, stageAs: 'input*/*'), path(namesub)

output:
tuple val(sample), path("${prefix}.gbf"), path(namesub), optional: true, emit: genbank
path "versions.yml", emit: versions

script:
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: "${sample}"
"""
# ensure that Genbank output file has the right name
ln -s $fasta ${prefix}.fa
# suppress non-zero exit codes due to warnings from table2asn
table2asn -indir ./ -outdir ./ -V b -c - -i ${prefix}.fa || true
cat <<-END_VERSIONS > versions.yml
"${task.process}":
table2asn: \$(table2asn -version | sed 's/table2asn: //')
END_VERSIONS
"""
}

process POST_TABLE2ASN {
tag "$sample"
conda 'bioconda::gfflu=0.0.2'
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container 'https://depot.galaxyproject.org/singularity/gfflu:0.0.2--pyhdfd78af_0'
} else {
container 'quay.io/biocontainers/gfflu:0.0.2--pyhdfd78af_0'
}

input:
tuple val(sample), path(genbank, stageAs: 'input*/*'), path(namesub, stageAs: 'input*/*')

output:
tuple val(sample), path("${sample}.gbk"), emit: genbank
tuple val(sample), path("${sample}.gff"), emit: gff
tuple val(sample), path("${sample}.faa"), emit: cds_aa_fasta
tuple val(sample), path("${sample}.ffn"), emit: cds_nt_fasta
path "versions.yml", emit: versions

script:
"""
post_table2asn.py $genbank $namesub -p $sample -o ./
cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""
}
31 changes: 31 additions & 0 deletions modules/local/vadr.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
process VADR {
tag "$sample"
label 'process_low'

conda 'pkru22::vadr=1.6.3'
container 'staphb/vadr:1.6.3-hav-flu2'

input:
tuple val(sample), path(fasta)

output:
tuple val(sample), path("${prefix}/*.vadr.pass.tbl"), optional: true, emit: feature_table
tuple val(sample), path("${prefix}/*.vadr.pass.fa"), optional: true, emit: pass_fasta
tuple val(sample), path("${prefix}/"), emit: vadr_outdir
path "versions.yml", emit: versions

script:
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: "${sample}"
"""
v-annotate.pl \\
$args \\
$fasta \\
${prefix}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
vadr: \$(v-annotate.pl -h | perl -ne 'print "\$1\\n" if /^# VADR (\\d+\\.\\d+\\.\\d+)/')
END_VERSIONS
"""
}
Loading

0 comments on commit 8af77cb

Please sign in to comment.