diff --git a/flextaxd/create_databases.py b/flextaxd/create_databases.py index 5b0f8a7..3ba1910 100644 --- a/flextaxd/create_databases.py +++ b/flextaxd/create_databases.py @@ -18,7 +18,7 @@ from importlib import import_module import shutil - +latest_genome_reps = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/gtdb_genomes_reps.tar.gz" ## If script is executed run pipeline of selected options def main(): @@ -89,6 +89,8 @@ def __init__(self, message,expression=""): download_opts = parser.add_argument_group('download_opts', "Download and file handling") download_opts.add_argument('-p', '--processes',metavar="",type=int, default = 8, help="Use multiple cores for downloading genomes and kraken if -kp is not set") download_opts.add_argument('--download', action='store_true', help="Download additional sequences") + download_opts.add_argument('-r', '--representative', action='store_true', help="Download GTDB representative genomes") + download_opts.add_argument('--rep_path', metavar="URL", default=latest_genome_reps, help="Specify GTDB representative version URL full path") download_opts.add_argument('--force_download', action='store_true', help="Download sequences from genbank if not in refseq (WARNING: might include genome withdrawals)") download_opts.add_argument('--genomes_path', metavar="",default=None, help='path to genomes') @@ -163,7 +165,6 @@ def __init__(self, message,expression=""): skip=False if os.path.exists("{db_path}/library/library.fna".format(db_path=args.db_name)): ans = input("Database library file already exist, (u)se library, (o)verwrite (c)ancel? (u o,c): ") - print(ans) if ans in ["o", "O"]: logger.info("Overwrite current build progress") shutil.rmtree("{db_path}".format(db_path=args.db_name)) @@ -179,11 +180,16 @@ def __init__(self, message,expression=""): process_directory_obj = process_directory(args.database) genomes, missing = process_directory_obj.process_folder(args.genomes_path) ''' 2. Download missing files''' - if args.download: + if args.download or args.representative: download = dynamic_import("modules", "DownloadGenomes") - download_obj = download(args.processes,outdir=args.outdir,force=args.force_download) - still_missing = download_obj.run(missing) - if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing))) + download_obj = download(args.processes,outdir=args.outdir,force=args.force_download,download_path=args.genomes_path) + new_genome_path, missing = download_obj.run(missing,args.rep_path) + if not new_genome_path: + still_missing = missing + if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing))) + else: + new_genomes, missing = process_directory_obj.process_folder(new_genome_path) + genomes += new_genomes else: if len(missing) > 0: logger.info("Genome annotations with no matching source: {nr}".format(nr=len(missing))) diff --git a/flextaxd/custom_taxonomy_databases.py b/flextaxd/custom_taxonomy_databases.py index 3365a00..0297117 100644 --- a/flextaxd/custom_taxonomy_databases.py +++ b/flextaxd/custom_taxonomy_databases.py @@ -20,13 +20,13 @@ each column (not |). ''' -__version__ = "0.3.4" +__version__ = "0.3.5" __author__ = "David Sundell" __credits__ = ["David Sundell"] __license__ = "GPLv3" __maintainer__ = "FOI bioinformatics group" __email__ = ["bioinformatics@foi.se","david.sundell@foi.se"] -__date__ = "2020-10-28" +__date__ = "2020-11-12" __status__ = "Beta" __pkgname__="custom_taxonomy_databases" __github__="https://github.com/FOI-Bioinformatics/flextaxd" @@ -252,11 +252,12 @@ def __init__(self, message,expression=""): read_obj.parse_taxonomy() ## Parse taxonomy file '''Parse genome2taxid file''' ## Fix at some point only one function should be needed - if not args.genomeid2taxid: + if not args.genomeid2taxid and not args.taxonomy_type == "QIIME": logger.warning("Warning no genomeid2taxid file given!") - elif args.taxonomy_type == "NCBI" and args.genomeid2taxid: + + if args.taxonomy_type == "NCBI" and args.genomeid2taxid: read_obj.parse_genomeid2taxid(args.genomes_path,args.genomeid2taxid) - elif args.taxonomy_type == "CanSNPer": + elif args.genomeid2taxid: ## Always parse genomeid2taxid if present read_obj.parse_genomeid2taxid(args.genomeid2taxid) logger.info("Nodes in taxonomy tree {n} number of taxonomies {k}".format(n=read_obj.length, k=read_obj.ids)) diff --git a/flextaxd/modules/CreateKrakenDatabase.py b/flextaxd/modules/CreateKrakenDatabase.py index 55a41d3..f435c99 100644 --- a/flextaxd/modules/CreateKrakenDatabase.py +++ b/flextaxd/modules/CreateKrakenDatabase.py @@ -14,7 +14,14 @@ from subprocess import Popen,PIPE,check_output,CalledProcessError from .database.DatabaseConnection import DatabaseFunctions from time import sleep -from gzip import BadGzipFile + +'''gzip have changed their error format between python version 3.7 and 3.8, this is at least a temporary fix for that''' +try: + from gzip import BadGzipFile +except ImportError: + class BadGzipFile(OSError): + """Exception raised in some cases for invalid gzip files.""" + import logging logger = logging.getLogger(__name__) diff --git a/flextaxd/modules/DownloadGenomes.py b/flextaxd/modules/DownloadGenomes.py index 55c621b..d0f4f87 100644 --- a/flextaxd/modules/DownloadGenomes.py +++ b/flextaxd/modules/DownloadGenomes.py @@ -8,17 +8,20 @@ from modules.functions import download_genomes from multiprocessing import Process,Manager,Pool +from subprocess import Popen,PIPE +import os class DownloadGenomes(object): """DownloadGenomes takes a list of genome names (GCF or GCA) and download (if possible) files from NCBI refseq and or genbank""" - def __init__(self,processes=20,outdir="./",force=False): + def __init__(self,processes=20,outdir="./",force=False,download_path="./"): super(DownloadGenomes, self).__init__() self.not_downloaded = [] ## Place holder for files not possible to download self.download_map = [] ## Place holder for files in subprocess self.processes = processes ## Number of simultanous downloads allowed self.genome_names = [] self.genome_path = {} + self.download_path = download_path self.outdir = outdir self.force=force if self.force: @@ -53,7 +56,56 @@ def write_missing(self,missing): print(gen["genome_id"], end="\n", file=of) return - def run(self, files): + def download_represenatives(self,genome_path,url="https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/bac120_marker_genes_reps.tar.gz"): + '''Specific function for GTDB download using their tar files instead of downloading from NCBI + Parameters + str - url + str - path to genome directory + Returns + path + ''' + logger.debug(genome_path) + input_file_name = url.split("/")[-1] + self.location = genome_path.rstrip("/")+"/representatives" + self.representative_file = url.rsplit("/")[-1] + '''Check if exists already if so ask to replace or continue without downloading''' + if os.path.exists(self.location+"/"+input_file_name): + ans = input("A represenative file already exist, (u)se file, (o)verwrite (c)ancel? (u o,c): ") + if ans in ["o", "O"]: + logger.info("Overwrite current progress") + os.remove("{file}".format(file=input_file_name)) + elif ans.strip() in ["u", "U"]: + logger.info("Resume database build") + return input_file_name + else: + exit("Cancel execution!") + logger.info("Download represenative genomes from {url}".format(url=url)) + #as suggested by @SilentGhost the `self.location` and `url` should be separate argument + args = ['wget', '-nd', '-r', '-l', '1', '-p', '-P', self.location, url] + logger.debug(" ".join(args)) + logger.info("Waiting for download process to finish (this may take a while)!") + p = Popen(args, stdout=PIPE) + (output, err) = p.communicate() + p_status = p.wait() + return input_file_name + + def parse_representatives(self,downloaded_file): + '''Parse representative genomes + untar the representative genomes directory to access the source genomes + Parameters + str - path to tar file + Returns + path - path to unzipped folder + ''' + args = ["tar", "-xf", downloaded_file] + logger.info("Untar file {f}".format(f=downloaded_file)) + logger.debug(" ".join(args)) + p = Popen(args, stdout=PIPE,cwd=self.location) + (output, err) = p.communicate() + p_status = p.wait() + return self.location + + def download_files(self,files): '''Download list of GCF and or GCA files from NCBI Parameters @@ -106,3 +158,23 @@ def run(self, files): if len(self.not_downloaded) > 0: self.write_missing(self.not_downloaded) return self.not_downloaded + + def run(self, files, representatives=False): + '''Download list of GCF and or GCA files from NCBI or download represenative genomes + + Parameters + list - list of GCF/GCA ids + bool - download representative genomes instead of list + + Returns + list - list of files not downloaded + ''' + if representatives: + tarfile = self.download_represenatives(genome_path=self.download_path, url=representatives) + folder_path = self.parse_representatives(tarfile) + '''Process the downloaded folder''' + return folder_path,[] + else: + if len(files) > 0: + not_downloaded = self.download_files(files) + return False,not_downloaded diff --git a/flextaxd/modules/ModifyTree.py b/flextaxd/modules/ModifyTree.py index daac01b..964d89b 100644 --- a/flextaxd/modules/ModifyTree.py +++ b/flextaxd/modules/ModifyTree.py @@ -263,7 +263,7 @@ def parse_modification(self, input,modtype="database"): logger.debug("new: {new}".format(new=len(self.new_nodes))) logger.debug("ovl: {ovl}".format(ovl=len(self.existing_nodes & self.new_nodes))) - if self.replace & len(self.existing_nodes) > 0: ## remove nodes connected to old nodes that is not replaced + if self.replace and len(self.existing_nodes) > 0: ## remove nodes connected to old nodes that is not replaced self.non_overlapping_old_links = set(self.taxonomydb.get_links((self.existing_nodes & self.new_nodes) - set([self.taxonomydb.get_id(self.parent)]))) ## Remove all links related to new nodes self.overlapping_links = self.existing_links & self.new_links ## (links existing in both networks) @@ -364,7 +364,7 @@ def clean_database(self, ncbi=False): self.annotated_nodes = set(self.taxonomydb.get_genomes(cols="genome,id").keys()) an = len(self.annotated_nodes) logger.info("Annotated nodes: {an}".format(an=an)) - if an == 0: + if an == 0 and not ncbi: raise InputError("Database has no annotations, the whole database would be cleaned") logger.info("Get all links in database") self.all_links = set(self.taxonomydb.get_links()) diff --git a/flextaxd/modules/ReadTaxonomyNCBI.py b/flextaxd/modules/ReadTaxonomyNCBI.py index 6a838cc..d4173f7 100644 --- a/flextaxd/modules/ReadTaxonomyNCBI.py +++ b/flextaxd/modules/ReadTaxonomyNCBI.py @@ -20,8 +20,13 @@ def __init__(self, taxonomy_file=False, database=False): self.length = 0 self.ids = 0 self.accessionfile = False - #self.add_rank("no rank") - #self.add_link(child=1, parent=1,rank="no rank") + + def write_missing(self,missing): + '''Write missing genomes to file''' + with open("FlexTaxD.not_added", "w") as of: + for gen in missing: + print(gen, end="\n", file=of) + return def set_accession_file(self,file): self.accessionfile = file @@ -89,10 +94,10 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file): if filename.strip(".gz").endswith(".fna"): filepath = os.path.join(root, filename) self.parse_genebank_file(filepath,filename) - logger.info("genomes folder read {n} sequence files found".format(n=len(self.refseqid_to_GCF))) + logger.info("genomes folder read, {n} sequence files found".format(n=len(self.refseqid_to_GCF))) if not annotation_file.endswith("accession2taxid.gz"): raise TypeError("The supplied annotation file does not seem to be the ncbi nucl_gb.accession2taxid.gz") - count = 0 + annotated_genome = set() with zopen(annotation_file,"r") as f: headers = f.readline().split(b"\t") for row in f: @@ -101,8 +106,15 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file): try: genebankid = self.refseqid_to_GCF[refseqid] self.database.add_genome(genome=genebankid,_id=taxid.decode("utf-8")) + annotated_genome.add(refseqid) except: - count +=1 + pass self.database.commit() - logger.info("Genomes not matching any annotation {len}".format(len=count)) - return + missing = set(self.refseqid_to_GCF.keys()) - annotated_genome + missing = [self.refseqid_to_GCF[m] for m in missing] ## Translate to GCF ids + if logging.root.level <=20: ## Equal to --verbose + logger.info("Printing non added genome id´s (GCF) to ./FlexTaxD.not_added") + self.write_missing(missing) + logger.debug(missing) ## If debug also print genomes to terminal + logger.info("Genomes not matching any annotation {len}".format(len=len(missing))) + return missing diff --git a/flextaxd/modules/ReadTaxonomyQIIME.py b/flextaxd/modules/ReadTaxonomyQIIME.py index 1526375..337f7dc 100644 --- a/flextaxd/modules/ReadTaxonomyQIIME.py +++ b/flextaxd/modules/ReadTaxonomyQIIME.py @@ -4,7 +4,7 @@ Read QIIME formatted taxonomy files and holds a dictionary with taxonomy tree and name translation ''' -from .ReadTaxonomy import ReadTaxonomy +from .ReadTaxonomy import ReadTaxonomy,InputError from .database.DatabaseConnection import DatabaseFunctions import logging logger = logging.getLogger(__name__) @@ -102,33 +102,44 @@ def qiime_to_tree(self, sep="\t"): self.errors = 0 self.added = 0 taxid_start = self.taxid_base + raiseWarning = False with open(self.input) as f: '''Each row defines a genome annotation file connected to a tree level''' for row in f: if row.strip() != "": ## If there are trailing empty lines in the file data = row.strip().split("\t") - try: - if data[0].startswith(("RS","GB")): - '''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this''' - genome_id = data[0].split("_",1)[1].strip() ## Genome ID - else: - genome_id = data[0].strip() - #print(genome_id) - except IndexError: - logger.debug("Row {row} could not be parsed".format(row=data)) - self.errors +=1 + if len(data) != 2: ## If the QIIME file is annotated it will contain two columns + if len(data) > 2: + raise InputError("The input format is not correct for QIIME. THe QIIME source format requires one [QIIME-tree] or two columns [gene_id\\tQIIME-tree]") + raiseWarning = True + genome_id = False + else: ## Annotation is given in column one + try: + if data[0].startswith(("RS","GB")): + '''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this''' + genome_id = data[0].split("_",1)[1].strip() ## Genome ID + else: + genome_id = data[0].strip() + #print(genome_id) + except IndexError: + logger.debug("Row {row} could not be parsed".format(row=data)) + self.errors +=1 + ### Walk through tree and make sure all nodes back to root are annotated! taxonomy = list(reversed(data[-1].split(";"))) taxonomy_i = self.parse_tree(taxonomy) - if taxonomy_i: + if taxonomy_i and genome_id: test = self.database.add_genome(genome=genome_id,_id=taxonomy_i) if not str(test).startswith("UNIQUE"): self.added +=1 + elif not genome_id: ## No genome id suggest a QIIME file with no annotation + pass else: logger.debug("Warning taxonomy: {taxonomy} could not be parsed!!") self.missed +=1 self.database.commit() self.length = self.taxid_base - taxid_start + if raiseWarning: logger.warning("Warning no genomeid2taxid file given and your QIIME formatted file does not seem to contain annotations!") logger.info("Genomes added to database: {genomes}".format(genomes=self.added)) logger.debug("Genomes not added to database {missed} errors {errors}".format(missed=self.missed,errors=self.errors)) logger.info("New taxonomy ids assigned {taxidnr}".format(taxidnr=self.length))