Skip to content

Commit

Permalink
Merge pull request #32 from FOI-Bioinformatics/updates_202011
Browse files Browse the repository at this point in the history
Updates 202011
  • Loading branch information
davve2 authored Nov 16, 2020
2 parents e763d3a + a48b6c2 commit afe1319
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 35 deletions.
18 changes: 12 additions & 6 deletions flextaxd/create_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from importlib import import_module
import shutil


latest_genome_reps = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/gtdb_genomes_reps.tar.gz"

## If script is executed run pipeline of selected options
def main():
Expand Down Expand Up @@ -89,6 +89,8 @@ def __init__(self, message,expression=""):
download_opts = parser.add_argument_group('download_opts', "Download and file handling")
download_opts.add_argument('-p', '--processes',metavar="",type=int, default = 8, help="Use multiple cores for downloading genomes and kraken if -kp is not set")
download_opts.add_argument('--download', action='store_true', help="Download additional sequences")
download_opts.add_argument('-r', '--representative', action='store_true', help="Download GTDB representative genomes")
download_opts.add_argument('--rep_path', metavar="URL", default=latest_genome_reps, help="Specify GTDB representative version URL full path")
download_opts.add_argument('--force_download', action='store_true', help="Download sequences from genbank if not in refseq (WARNING: might include genome withdrawals)")
download_opts.add_argument('--genomes_path', metavar="",default=None, help='path to genomes')

Expand Down Expand Up @@ -163,7 +165,6 @@ def __init__(self, message,expression=""):
skip=False
if os.path.exists("{db_path}/library/library.fna".format(db_path=args.db_name)):
ans = input("Database library file already exist, (u)se library, (o)verwrite (c)ancel? (u o,c): ")
print(ans)
if ans in ["o", "O"]:
logger.info("Overwrite current build progress")
shutil.rmtree("{db_path}".format(db_path=args.db_name))
Expand All @@ -179,11 +180,16 @@ def __init__(self, message,expression=""):
process_directory_obj = process_directory(args.database)
genomes, missing = process_directory_obj.process_folder(args.genomes_path)
''' 2. Download missing files'''
if args.download:
if args.download or args.representative:
download = dynamic_import("modules", "DownloadGenomes")
download_obj = download(args.processes,outdir=args.outdir,force=args.force_download)
still_missing = download_obj.run(missing)
if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
download_obj = download(args.processes,outdir=args.outdir,force=args.force_download,download_path=args.genomes_path)
new_genome_path, missing = download_obj.run(missing,args.rep_path)
if not new_genome_path:
still_missing = missing
if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
else:
new_genomes, missing = process_directory_obj.process_folder(new_genome_path)
genomes += new_genomes
else:
if len(missing) > 0:
logger.info("Genome annotations with no matching source: {nr}".format(nr=len(missing)))
Expand Down
11 changes: 6 additions & 5 deletions flextaxd/custom_taxonomy_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@
each column (not <tab>|<tab>).
'''

__version__ = "0.3.4"
__version__ = "0.3.5"
__author__ = "David Sundell"
__credits__ = ["David Sundell"]
__license__ = "GPLv3"
__maintainer__ = "FOI bioinformatics group"
__email__ = ["bioinformatics@foi.se","david.sundell@foi.se"]
__date__ = "2020-10-28"
__date__ = "2020-11-12"
__status__ = "Beta"
__pkgname__="custom_taxonomy_databases"
__github__="https://github.com/FOI-Bioinformatics/flextaxd"
Expand Down Expand Up @@ -252,11 +252,12 @@ def __init__(self, message,expression=""):
read_obj.parse_taxonomy() ## Parse taxonomy file

'''Parse genome2taxid file''' ## Fix at some point only one function should be needed
if not args.genomeid2taxid:
if not args.genomeid2taxid and not args.taxonomy_type == "QIIME":
logger.warning("Warning no genomeid2taxid file given!")
elif args.taxonomy_type == "NCBI" and args.genomeid2taxid:

if args.taxonomy_type == "NCBI" and args.genomeid2taxid:
read_obj.parse_genomeid2taxid(args.genomes_path,args.genomeid2taxid)
elif args.taxonomy_type == "CanSNPer":
elif args.genomeid2taxid: ## Always parse genomeid2taxid if present
read_obj.parse_genomeid2taxid(args.genomeid2taxid)

logger.info("Nodes in taxonomy tree {n} number of taxonomies {k}".format(n=read_obj.length, k=read_obj.ids))
Expand Down
9 changes: 8 additions & 1 deletion flextaxd/modules/CreateKrakenDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@
from subprocess import Popen,PIPE,check_output,CalledProcessError
from .database.DatabaseConnection import DatabaseFunctions
from time import sleep
from gzip import BadGzipFile

'''gzip have changed their error format between python version 3.7 and 3.8, this is at least a temporary fix for that'''
try:
from gzip import BadGzipFile
except ImportError:
class BadGzipFile(OSError):
"""Exception raised in some cases for invalid gzip files."""


import logging
logger = logging.getLogger(__name__)
Expand Down
76 changes: 74 additions & 2 deletions flextaxd/modules/DownloadGenomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,20 @@

from modules.functions import download_genomes
from multiprocessing import Process,Manager,Pool
from subprocess import Popen,PIPE
import os

class DownloadGenomes(object):
"""DownloadGenomes takes a list of genome names (GCF or GCA) and download (if possible) files from NCBI refseq and or genbank"""

def __init__(self,processes=20,outdir="./",force=False):
def __init__(self,processes=20,outdir="./",force=False,download_path="./"):
super(DownloadGenomes, self).__init__()
self.not_downloaded = [] ## Place holder for files not possible to download
self.download_map = [] ## Place holder for files in subprocess
self.processes = processes ## Number of simultanous downloads allowed
self.genome_names = []
self.genome_path = {}
self.download_path = download_path
self.outdir = outdir
self.force=force
if self.force:
Expand Down Expand Up @@ -53,7 +56,56 @@ def write_missing(self,missing):
print(gen["genome_id"], end="\n", file=of)
return

def run(self, files):
def download_represenatives(self,genome_path,url="https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/bac120_marker_genes_reps.tar.gz"):
'''Specific function for GTDB download using their tar files instead of downloading from NCBI
Parameters
str - url
str - path to genome directory
Returns
path
'''
logger.debug(genome_path)
input_file_name = url.split("/")[-1]
self.location = genome_path.rstrip("/")+"/representatives"
self.representative_file = url.rsplit("/")[-1]
'''Check if exists already if so ask to replace or continue without downloading'''
if os.path.exists(self.location+"/"+input_file_name):
ans = input("A represenative file already exist, (u)se file, (o)verwrite (c)ancel? (u o,c): ")
if ans in ["o", "O"]:
logger.info("Overwrite current progress")
os.remove("{file}".format(file=input_file_name))
elif ans.strip() in ["u", "U"]:
logger.info("Resume database build")
return input_file_name
else:
exit("Cancel execution!")
logger.info("Download represenative genomes from {url}".format(url=url))
#as suggested by @SilentGhost the `self.location` and `url` should be separate argument
args = ['wget', '-nd', '-r', '-l', '1', '-p', '-P', self.location, url]
logger.debug(" ".join(args))
logger.info("Waiting for download process to finish (this may take a while)!")
p = Popen(args, stdout=PIPE)
(output, err) = p.communicate()
p_status = p.wait()
return input_file_name

def parse_representatives(self,downloaded_file):
'''Parse representative genomes
untar the representative genomes directory to access the source genomes
Parameters
str - path to tar file
Returns
path - path to unzipped folder
'''
args = ["tar", "-xf", downloaded_file]
logger.info("Untar file {f}".format(f=downloaded_file))
logger.debug(" ".join(args))
p = Popen(args, stdout=PIPE,cwd=self.location)
(output, err) = p.communicate()
p_status = p.wait()
return self.location

def download_files(self,files):
'''Download list of GCF and or GCA files from NCBI
Parameters
Expand Down Expand Up @@ -106,3 +158,23 @@ def run(self, files):
if len(self.not_downloaded) > 0:
self.write_missing(self.not_downloaded)
return self.not_downloaded

def run(self, files, representatives=False):
'''Download list of GCF and or GCA files from NCBI or download represenative genomes
Parameters
list - list of GCF/GCA ids
bool - download representative genomes instead of list
Returns
list - list of files not downloaded
'''
if representatives:
tarfile = self.download_represenatives(genome_path=self.download_path, url=representatives)
folder_path = self.parse_representatives(tarfile)
'''Process the downloaded folder'''
return folder_path,[]
else:
if len(files) > 0:
not_downloaded = self.download_files(files)
return False,not_downloaded
4 changes: 2 additions & 2 deletions flextaxd/modules/ModifyTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def parse_modification(self, input,modtype="database"):
logger.debug("new: {new}".format(new=len(self.new_nodes)))
logger.debug("ovl: {ovl}".format(ovl=len(self.existing_nodes & self.new_nodes)))

if self.replace & len(self.existing_nodes) > 0: ## remove nodes connected to old nodes that is not replaced
if self.replace and len(self.existing_nodes) > 0: ## remove nodes connected to old nodes that is not replaced
self.non_overlapping_old_links = set(self.taxonomydb.get_links((self.existing_nodes & self.new_nodes) - set([self.taxonomydb.get_id(self.parent)]))) ## Remove all links related to new nodes

self.overlapping_links = self.existing_links & self.new_links ## (links existing in both networks)
Expand Down Expand Up @@ -364,7 +364,7 @@ def clean_database(self, ncbi=False):
self.annotated_nodes = set(self.taxonomydb.get_genomes(cols="genome,id").keys())
an = len(self.annotated_nodes)
logger.info("Annotated nodes: {an}".format(an=an))
if an == 0:
if an == 0 and not ncbi:
raise InputError("Database has no annotations, the whole database would be cleaned")
logger.info("Get all links in database")
self.all_links = set(self.taxonomydb.get_links())
Expand Down
26 changes: 19 additions & 7 deletions flextaxd/modules/ReadTaxonomyNCBI.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,13 @@ def __init__(self, taxonomy_file=False, database=False):
self.length = 0
self.ids = 0
self.accessionfile = False
#self.add_rank("no rank")
#self.add_link(child=1, parent=1,rank="no rank")

def write_missing(self,missing):
'''Write missing genomes to file'''
with open("FlexTaxD.not_added", "w") as of:
for gen in missing:
print(gen, end="\n", file=of)
return

def set_accession_file(self,file):
self.accessionfile = file
Expand Down Expand Up @@ -89,10 +94,10 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file):
if filename.strip(".gz").endswith(".fna"):
filepath = os.path.join(root, filename)
self.parse_genebank_file(filepath,filename)
logger.info("genomes folder read {n} sequence files found".format(n=len(self.refseqid_to_GCF)))
logger.info("genomes folder read, {n} sequence files found".format(n=len(self.refseqid_to_GCF)))
if not annotation_file.endswith("accession2taxid.gz"):
raise TypeError("The supplied annotation file does not seem to be the ncbi nucl_gb.accession2taxid.gz")
count = 0
annotated_genome = set()
with zopen(annotation_file,"r") as f:
headers = f.readline().split(b"\t")
for row in f:
Expand All @@ -101,8 +106,15 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file):
try:
genebankid = self.refseqid_to_GCF[refseqid]
self.database.add_genome(genome=genebankid,_id=taxid.decode("utf-8"))
annotated_genome.add(refseqid)
except:
count +=1
pass
self.database.commit()
logger.info("Genomes not matching any annotation {len}".format(len=count))
return
missing = set(self.refseqid_to_GCF.keys()) - annotated_genome
missing = [self.refseqid_to_GCF[m] for m in missing] ## Translate to GCF ids
if logging.root.level <=20: ## Equal to --verbose
logger.info("Printing non added genome id´s (GCF) to ./FlexTaxD.not_added")
self.write_missing(missing)
logger.debug(missing) ## If debug also print genomes to terminal
logger.info("Genomes not matching any annotation {len}".format(len=len(missing)))
return missing
35 changes: 23 additions & 12 deletions flextaxd/modules/ReadTaxonomyQIIME.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Read QIIME formatted taxonomy files and holds a dictionary with taxonomy tree and name translation
'''

from .ReadTaxonomy import ReadTaxonomy
from .ReadTaxonomy import ReadTaxonomy,InputError
from .database.DatabaseConnection import DatabaseFunctions
import logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -102,33 +102,44 @@ def qiime_to_tree(self, sep="\t"):
self.errors = 0
self.added = 0
taxid_start = self.taxid_base
raiseWarning = False
with open(self.input) as f:
'''Each row defines a genome annotation file connected to a tree level'''
for row in f:
if row.strip() != "": ## If there are trailing empty lines in the file
data = row.strip().split("\t")
try:
if data[0].startswith(("RS","GB")):
'''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this'''
genome_id = data[0].split("_",1)[1].strip() ## Genome ID
else:
genome_id = data[0].strip()
#print(genome_id)
except IndexError:
logger.debug("Row {row} could not be parsed".format(row=data))
self.errors +=1
if len(data) != 2: ## If the QIIME file is annotated it will contain two columns
if len(data) > 2:
raise InputError("The input format is not correct for QIIME. THe QIIME source format requires one [QIIME-tree] or two columns [gene_id\\tQIIME-tree]")
raiseWarning = True
genome_id = False
else: ## Annotation is given in column one
try:
if data[0].startswith(("RS","GB")):
'''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this'''
genome_id = data[0].split("_",1)[1].strip() ## Genome ID
else:
genome_id = data[0].strip()
#print(genome_id)
except IndexError:
logger.debug("Row {row} could not be parsed".format(row=data))
self.errors +=1

### Walk through tree and make sure all nodes back to root are annotated!
taxonomy = list(reversed(data[-1].split(";")))
taxonomy_i = self.parse_tree(taxonomy)
if taxonomy_i:
if taxonomy_i and genome_id:
test = self.database.add_genome(genome=genome_id,_id=taxonomy_i)
if not str(test).startswith("UNIQUE"):
self.added +=1
elif not genome_id: ## No genome id suggest a QIIME file with no annotation
pass
else:
logger.debug("Warning taxonomy: {taxonomy} could not be parsed!!")
self.missed +=1
self.database.commit()
self.length = self.taxid_base - taxid_start
if raiseWarning: logger.warning("Warning no genomeid2taxid file given and your QIIME formatted file does not seem to contain annotations!")
logger.info("Genomes added to database: {genomes}".format(genomes=self.added))
logger.debug("Genomes not added to database {missed} errors {errors}".format(missed=self.missed,errors=self.errors))
logger.info("New taxonomy ids assigned {taxidnr}".format(taxidnr=self.length))

0 comments on commit afe1319

Please sign in to comment.