-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add VADR v1.6.3 Influenza consensus sequence annotation
* Add table2asn v1.28.943 for converting Feature Table files to Genbank * Add bin/sub_seqids_for_table2asn.py to prepare Feature Table output for table2asn since table2asn truncates long seq IDs * Add bin/post_table2asn.py to sub in original seq IDs in Genbank and output GFF, amino acid FASTA and nucleotide FASTA of CDS, mature peptide and signal peptide gene features
- Loading branch information
Showing
7 changed files
with
358 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#!/usr/bin/env python | ||
|
||
from pathlib import Path | ||
import typer | ||
from rich.logging import RichHandler | ||
import logging | ||
import re | ||
import os | ||
import os | ||
from Bio.SeqRecord import SeqRecord | ||
from Bio.SeqIO.InsdcIO import _insdc_location_string | ||
from gfflu.io import get_translation, write_gff | ||
from Bio import SeqIO | ||
|
||
|
||
REGEX_SUBNAME = re.compile(r'(SEQUENCE-\d+)') | ||
|
||
|
||
def init_logging(): | ||
from rich.traceback import install | ||
|
||
install(show_locals=True, width=120, word_wrap=True) | ||
logging.basicConfig( | ||
format="%(message)s", | ||
datefmt="[%Y-%m-%d %X]", | ||
level=logging.DEBUG, | ||
handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)], | ||
) | ||
|
||
|
||
def get_namesub(namesub_txt: Path) -> dict[str, str]: | ||
namesub = {} | ||
with open(namesub_txt) as fh: | ||
for line in fh: | ||
line = line.strip() | ||
if line == '': continue | ||
sub_name, original_name = line.split('\t', maxsplit=1) | ||
namesub[sub_name] = original_name | ||
return namesub | ||
|
||
|
||
def write_aa_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None: | ||
"""Write out FASTA file with amino acid sequences""" | ||
with open(out_file, "w") as out_handle: | ||
for rec in recs: | ||
for feature in rec.features: | ||
if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}: | ||
continue | ||
translation = get_translation(rec.seq, feature.location) | ||
try: | ||
gene = feature.qualifiers["gene"][0] | ||
except KeyError: | ||
gene = "" | ||
try: | ||
product = feature.qualifiers["product"][0] | ||
except KeyError: | ||
product = "" | ||
location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq)) | ||
out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{translation}\n") | ||
|
||
def write_cds_nt_fasta(recs: list[SeqRecord], out_file: os.PathLike) -> None: | ||
"""Write out FASTA file with CDS nt sequences""" | ||
with open(out_file, "w") as out_handle: | ||
for rec in recs: | ||
for feature in rec.features: | ||
if feature.type not in {"CDS", "mat_peptide", "sig_peptide"}: | ||
continue | ||
subseq = str(feature.location.extract(rec.seq)) | ||
try: | ||
gene = feature.qualifiers["gene"][0] | ||
except KeyError: | ||
gene = "" | ||
try: | ||
product = feature.qualifiers["product"][0] | ||
except KeyError: | ||
product = "" | ||
location_str = _insdc_location_string(location=feature.location, rec_length=len(rec.seq)) | ||
out_handle.write(f">{rec.id}|{feature.type}|{gene}|{location_str} {product}\n{subseq}\n") | ||
|
||
|
||
def output_subbed_genbank( | ||
gbk_path: Path, | ||
gbk_out: Path, | ||
namesub: dict[str, str], | ||
) -> None: | ||
with open(gbk_path) as fh, open(gbk_out, 'w') as fout: | ||
for line in fh: | ||
match = REGEX_SUBNAME.search(line) | ||
if match: | ||
subname = match.group(1) | ||
original_name = namesub[subname] | ||
line = line.replace(subname, original_name) | ||
fout.write(line) | ||
|
||
|
||
def main( | ||
table2asn_genbank: Path, | ||
namesub_txt: Path, | ||
outdir: Path = typer.Option(Path('./'), '--outdir', '-o'), | ||
prefix: str = typer.Option('SAMPLE', '--prefix', '-p'), | ||
): | ||
init_logging() | ||
logging.info(f'{table2asn_genbank=}') | ||
logging.info(f'{namesub_txt=}') | ||
logging.info(f'{outdir=}') | ||
logging.info(f'{prefix=}') | ||
namesub = get_namesub(namesub_txt) | ||
logging.info(f'Read {len(namesub)} from "{namesub_txt}"') | ||
gbk_out = outdir / f'{prefix}.gbk' | ||
output_subbed_genbank(table2asn_genbank, gbk_out, namesub) | ||
logging.info(f'Wrote Genbank with original names to "{gbk_out}"') | ||
recs = list(SeqIO.parse(gbk_out, 'genbank')) | ||
logging.info(f"Read {len(recs)} sequence records from '{gbk_out}'. Writing GFF, amino acid and nucleotide FASTAs for sequence features (CDS, mature peptides, signal peptides)...") | ||
gff_out = outdir / f'{prefix}.gff' | ||
write_gff(recs, gff_out) | ||
logging.info(f'Wrote GFF to "{gff_out}"') | ||
faa_out = outdir / f'{prefix}.faa' | ||
write_aa_fasta(recs, faa_out) | ||
logging.info(f'Wrote amino acid FASTA to "{faa_out}"') | ||
ffn_out = outdir / f'{prefix}.ffn' | ||
write_cds_nt_fasta(recs, ffn_out) | ||
logging.info(f'Wrote nucleotide FASTA of gene features to "{faa_out}"') | ||
logging.info('Done!') | ||
|
||
|
||
if __name__ == "__main__": | ||
typer.run(main) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#!/usr/bin/env python | ||
|
||
from pathlib import Path | ||
|
||
import typer | ||
from rich.logging import RichHandler | ||
import logging | ||
|
||
|
||
def init_logging(): | ||
from rich.traceback import install | ||
|
||
install(show_locals=True, width=120, word_wrap=True) | ||
logging.basicConfig( | ||
format="%(message)s", | ||
datefmt="[%Y-%m-%d %X]", | ||
level=logging.DEBUG, | ||
handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)], | ||
) | ||
|
||
|
||
def main( | ||
input_fasta: Path = typer.Option(..., '--input-fasta', '-f'), | ||
input_feature_table: Path = typer.Option(..., '--input-feature-table', '-t'), | ||
outdir: Path = typer.Option(Path('./'), '--outdir', '-o'), | ||
prefix: str = typer.Option('SAMPLE', '--prefix', '-p'), | ||
): | ||
init_logging() | ||
logging.info(f'{input_fasta=}') | ||
logging.info(f'{input_feature_table=}') | ||
logging.info(f'{outdir=}') | ||
logging.info(f'{prefix=}') | ||
ft_seqids = [] | ||
ft_out = outdir / f'{prefix}.tbl' | ||
with open(input_feature_table) as fh, open(ft_out, 'w') as fout: | ||
seqid_count = 0 | ||
for line in fh: | ||
if line.startswith('>Feature '): | ||
seqid = line.replace('>Feature ', '').strip() | ||
seqid_count += 1 | ||
new_seqid = f'SEQUENCE-{seqid_count}' | ||
ft_seqids.append((seqid, new_seqid)) | ||
fout.write(line.replace(seqid, new_seqid)) | ||
else: | ||
fout.write(line.replace(seqid, new_seqid)) | ||
logging.info(f"table2asn compatible feature table written to '{ft_out}'") | ||
|
||
namesub_path = outdir / f'{prefix}.namesub.txt' | ||
with open(namesub_path, 'w') as fout: | ||
for seqid, new_seqid in ft_seqids: | ||
fout.write(f'{new_seqid}\t{seqid}\n') | ||
logging.info(f"Text file with original and table2asn seqids written to '{namesub_path}'") | ||
fasta_seqids = [] | ||
fasta_out = outdir / f'{prefix}.fa' | ||
seqid_to_new_seqid = {x: y for x, y in ft_seqids} | ||
logging.info(f'{seqid_to_new_seqid=}') | ||
with open(input_fasta) as fh, open(fasta_out, 'w') as fout: | ||
for line in fh: | ||
if line.startswith('>'): | ||
seqid, *_ = line[1:].strip().split(' ') | ||
try: | ||
fout.write(line.replace(seqid, seqid_to_new_seqid[seqid])) | ||
except KeyError as ex: | ||
logging.error(f'{seqid=} not in feature table seqids!') | ||
# raise ex | ||
else: | ||
fout.write(line) | ||
logging.info(f"table2asn compatible fasta written to '{fasta_out}'") | ||
|
||
|
||
if __name__ == "__main__": | ||
typer.run(main) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
process PRE_TABLE2ASN { | ||
tag "$sample" | ||
conda 'bioconda::shiptv=0.4.1' | ||
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { | ||
container 'https://depot.galaxyproject.org/singularity/shiptv:0.4.1--pyh5e36f6f_0' | ||
} else { | ||
container 'quay.io/biocontainers/shiptv:0.4.1--pyh5e36f6f_0' | ||
} | ||
|
||
input: | ||
tuple val(sample), path(feature_table, stageAs: "input*/*"), path(fasta, stageAs: "input*/*") | ||
|
||
output: | ||
tuple val(sample), path("${sample}.tbl"), path("${sample}.fa"), path("${sample}.namesub.txt"), emit: table2asn_input | ||
path "versions.yml", emit: versions | ||
|
||
script: | ||
""" | ||
sub_seqids_for_table2asn.py -t $feature_table -f $fasta -o ./ -p $sample | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
python: \$(python --version | sed 's/Python //g') | ||
END_VERSIONS | ||
""" | ||
} | ||
|
||
process TABLE2ASN { | ||
tag "$sample" | ||
conda 'bioconda::table2asn=1.28.943' | ||
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { | ||
container 'https://depot.galaxyproject.org/singularity/table2asn:1.28.943--h48fe88c_0' | ||
} else { | ||
container 'quay.io/biocontainers/table2asn:1.28.943--h48fe88c_0' | ||
} | ||
|
||
input: | ||
tuple val(sample), path(feature_table), path(fasta, stageAs: 'input*/*'), path(namesub) | ||
|
||
output: | ||
tuple val(sample), path("${prefix}.gbf"), path(namesub), optional: true, emit: genbank | ||
path "versions.yml", emit: versions | ||
|
||
script: | ||
def args = task.ext.args ?: '' | ||
prefix = task.ext.prefix ?: "${sample}" | ||
""" | ||
# ensure that Genbank output file has the right name | ||
ln -s $fasta ${prefix}.fa | ||
# suppress non-zero exit codes due to warnings from table2asn | ||
table2asn -indir ./ -outdir ./ -V b -c - -i ${prefix}.fa || true | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
table2asn: \$(table2asn -version | sed 's/table2asn: //') | ||
END_VERSIONS | ||
""" | ||
} | ||
|
||
process POST_TABLE2ASN { | ||
tag "$sample" | ||
conda 'bioconda::gfflu=0.0.2' | ||
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { | ||
container 'https://depot.galaxyproject.org/singularity/gfflu:0.0.2--pyhdfd78af_0' | ||
} else { | ||
container 'quay.io/biocontainers/gfflu:0.0.2--pyhdfd78af_0' | ||
} | ||
|
||
input: | ||
tuple val(sample), path(genbank, stageAs: 'input*/*'), path(namesub, stageAs: 'input*/*') | ||
|
||
output: | ||
tuple val(sample), path("${sample}.gbk"), emit: genbank | ||
tuple val(sample), path("${sample}.gff"), emit: gff | ||
tuple val(sample), path("${sample}.faa"), emit: cds_aa_fasta | ||
tuple val(sample), path("${sample}.ffn"), emit: cds_nt_fasta | ||
path "versions.yml", emit: versions | ||
|
||
script: | ||
""" | ||
post_table2asn.py $genbank $namesub -p $sample -o ./ | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
python: \$(python --version | sed 's/Python //g') | ||
END_VERSIONS | ||
""" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
process VADR { | ||
tag "$sample" | ||
label 'process_low' | ||
|
||
conda 'pkru22::vadr=1.6.3' | ||
container 'staphb/vadr:1.6.3-hav-flu2' | ||
|
||
input: | ||
tuple val(sample), path(fasta) | ||
|
||
output: | ||
tuple val(sample), path("${prefix}/*.vadr.pass.tbl"), optional: true, emit: feature_table | ||
tuple val(sample), path("${prefix}/*.vadr.pass.fa"), optional: true, emit: pass_fasta | ||
tuple val(sample), path("${prefix}/"), emit: vadr_outdir | ||
path "versions.yml", emit: versions | ||
|
||
script: | ||
def args = task.ext.args ?: '' | ||
prefix = task.ext.prefix ?: "${sample}" | ||
""" | ||
v-annotate.pl \\ | ||
$args \\ | ||
$fasta \\ | ||
${prefix} | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
vadr: \$(v-annotate.pl -h | perl -ne 'print "\$1\\n" if /^# VADR (\\d+\\.\\d+\\.\\d+)/') | ||
END_VERSIONS | ||
""" | ||
} |
Oops, something went wrong.