Merge pull request #32 from FOI-Bioinformatics/updates_202011

Updates 202011
FOI-Bioinformatics · Nov 16, 2020 · afe1319 · afe1319
2 parents e763d3a + a48b6c2
commit afe1319
Show file tree

Hide file tree

Showing 7 changed files with 144 additions and 35 deletions.
diff --git a/flextaxd/create_databases.py b/flextaxd/create_databases.py
@@ -18,7 +18,7 @@
 from importlib import import_module
 import shutil
 
-
+latest_genome_reps = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/gtdb_genomes_reps.tar.gz"
 
 ## If script is executed run pipeline of selected options
 def main():
@@ -89,6 +89,8 @@ def __init__(self,  message,expression=""):
 	download_opts = parser.add_argument_group('download_opts', "Download and file handling")
 	download_opts.add_argument('-p', '--processes',metavar="",type=int, default = 8, help="Use multiple cores for downloading genomes and kraken if -kp is not set")
 	download_opts.add_argument('--download', action='store_true', help="Download additional sequences")
+	download_opts.add_argument('-r', '--representative',   action='store_true', help="Download GTDB representative genomes")
+	download_opts.add_argument('--rep_path', metavar="URL", default=latest_genome_reps, help="Specify GTDB representative version URL full path")
 	download_opts.add_argument('--force_download', action='store_true', help="Download sequences from genbank if not in refseq (WARNING: might include genome withdrawals)")
 	download_opts.add_argument('--genomes_path', metavar="",default=None,  help='path to genomes')
 
@@ -163,7 +165,6 @@ def __init__(self,  message,expression=""):
 	skip=False
 	if os.path.exists("{db_path}/library/library.fna".format(db_path=args.db_name)):
 		ans = input("Database library file already exist, (u)se library, (o)verwrite (c)ancel? (u o,c): ")
-		print(ans)
 		if ans in ["o", "O"]:
 			logger.info("Overwrite current build progress")
 			shutil.rmtree("{db_path}".format(db_path=args.db_name))
@@ -179,11 +180,16 @@ def __init__(self,  message,expression=""):
 		process_directory_obj = process_directory(args.database)
 		genomes, missing = process_directory_obj.process_folder(args.genomes_path)
 		''' 2. Download missing files'''
-		if args.download:
+		if args.download or args.representative:
 			download = dynamic_import("modules", "DownloadGenomes")
-			download_obj = download(args.processes,outdir=args.outdir,force=args.force_download)
-			still_missing = download_obj.run(missing)
-			if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
+			download_obj = download(args.processes,outdir=args.outdir,force=args.force_download,download_path=args.genomes_path)
+			new_genome_path, missing = download_obj.run(missing,args.rep_path)
+			if not new_genome_path:
+				still_missing = missing
+				if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
+			else:
+				new_genomes, missing = process_directory_obj.process_folder(new_genome_path)
+				genomes += new_genomes
 		else:
 			if len(missing) > 0:
 				logger.info("Genome annotations with no matching source: {nr}".format(nr=len(missing)))

diff --git a/flextaxd/custom_taxonomy_databases.py b/flextaxd/custom_taxonomy_databases.py
@@ -20,13 +20,13 @@
 each column (not <tab>|<tab>).
 '''
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 __author__ = "David Sundell"
 __credits__ = ["David Sundell"]
 __license__ = "GPLv3"
 __maintainer__ = "FOI bioinformatics group"
 __email__ = ["bioinformatics@foi.se","david.sundell@foi.se"]
-__date__ = "2020-10-28"
+__date__ = "2020-11-12"
 __status__ = "Beta"
 __pkgname__="custom_taxonomy_databases"
 __github__="https://github.com/FOI-Bioinformatics/flextaxd"
@@ -252,11 +252,12 @@ def __init__(self,  message,expression=""):
             read_obj.parse_taxonomy()                                                           ## Parse taxonomy file
 
             '''Parse genome2taxid file'''                                                       ## Fix at some point only one function should be needed
-            if not args.genomeid2taxid:
+            if not args.genomeid2taxid and not args.taxonomy_type == "QIIME":
                 logger.warning("Warning no genomeid2taxid file given!")
-            elif args.taxonomy_type == "NCBI" and args.genomeid2taxid:
+
+            if args.taxonomy_type == "NCBI" and args.genomeid2taxid:
                 read_obj.parse_genomeid2taxid(args.genomes_path,args.genomeid2taxid)
-            elif args.taxonomy_type == "CanSNPer":
+            elif args.genomeid2taxid: ## Always parse genomeid2taxid if present
                 read_obj.parse_genomeid2taxid(args.genomeid2taxid)
 
             logger.info("Nodes in taxonomy tree {n} number of taxonomies {k}".format(n=read_obj.length, k=read_obj.ids))

diff --git a/flextaxd/modules/CreateKrakenDatabase.py b/flextaxd/modules/CreateKrakenDatabase.py
@@ -14,7 +14,14 @@
 from subprocess import Popen,PIPE,check_output,CalledProcessError
 from .database.DatabaseConnection import DatabaseFunctions
 from time import sleep
-from gzip import BadGzipFile
+
+'''gzip have changed their error format between python version 3.7 and 3.8, this is at least a temporary fix for that'''
+try:
+	from gzip import BadGzipFile
+except ImportError:
+	class BadGzipFile(OSError):
+	    """Exception raised in some cases for invalid gzip files."""
+
 
 import logging
 logger = logging.getLogger(__name__)

diff --git a/flextaxd/modules/DownloadGenomes.py b/flextaxd/modules/DownloadGenomes.py
@@ -8,17 +8,20 @@
 
 from modules.functions import download_genomes
 from multiprocessing import Process,Manager,Pool
+from subprocess import Popen,PIPE
+import os
 
 class DownloadGenomes(object):
 	"""DownloadGenomes takes a list of genome names (GCF or GCA) and download (if possible) files from NCBI refseq and or genbank"""
 
-	def __init__(self,processes=20,outdir="./",force=False):
+	def __init__(self,processes=20,outdir="./",force=False,download_path="./"):
 		super(DownloadGenomes, self).__init__()
 		self.not_downloaded = []        ## Place holder for files not possible to download
 		self.download_map   = []        ## Place holder for files in subprocess
 		self.processes = processes      ## Number of simultanous downloads allowed
 		self.genome_names = []
 		self.genome_path = {}
+		self.download_path = download_path
 		self.outdir = outdir
 		self.force=force
 		if self.force:
@@ -53,7 +56,56 @@ def write_missing(self,missing):
 				print(gen["genome_id"], end="\n", file=of)
 		return
 
-	def run(self, files):
+	def download_represenatives(self,genome_path,url="https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/bac120_marker_genes_reps.tar.gz"):
+		'''Specific function for GTDB download using their tar files instead of downloading from NCBI
+			Parameters
+				str - url
+				str - path to genome directory
+			Returns
+				path
+		'''
+		logger.debug(genome_path)
+		input_file_name = url.split("/")[-1]
+		self.location = genome_path.rstrip("/")+"/representatives"
+		self.representative_file = url.rsplit("/")[-1]
+		'''Check if exists already if so ask to replace or continue without downloading'''
+		if os.path.exists(self.location+"/"+input_file_name):
+			ans = input("A represenative file already exist, (u)se file, (o)verwrite (c)ancel? (u o,c): ")
+			if ans in ["o", "O"]:
+				logger.info("Overwrite current progress")
+				os.remove("{file}".format(file=input_file_name))
+			elif ans.strip() in ["u", "U"]:
+				logger.info("Resume database build")
+				return input_file_name
+			else:
+				exit("Cancel execution!")
+		logger.info("Download represenative genomes from {url}".format(url=url))
+		#as suggested by @SilentGhost the `self.location` and `url` should be separate argument
+		args = ['wget', '-nd', '-r', '-l', '1', '-p', '-P', self.location, url]
+		logger.debug(" ".join(args))
+		logger.info("Waiting for download process to finish (this may take a while)!")
+		p = Popen(args, stdout=PIPE)
+		(output, err) = p.communicate() 
+		p_status = p.wait()
+		return input_file_name
+
+	def parse_representatives(self,downloaded_file):
+		'''Parse representative genomes
+			untar the representative genomes directory to access the source genomes
+			Parameters
+				str - path to tar file
+			Returns
+				path - path to unzipped folder
+		'''
+		args = ["tar", "-xf", downloaded_file]
+		logger.info("Untar file {f}".format(f=downloaded_file))
+		logger.debug(" ".join(args))
+		p = Popen(args, stdout=PIPE,cwd=self.location)
+		(output, err) = p.communicate()
+		p_status = p.wait()
+		return self.location
+
+	def download_files(self,files):
 		'''Download list of GCF and or GCA files from NCBI
 
 		Parameters
@@ -106,3 +158,23 @@ def run(self, files):
 		if len(self.not_downloaded) > 0:
 			self.write_missing(self.not_downloaded)
 		return self.not_downloaded
+
+	def run(self, files, representatives=False):
+		'''Download list of GCF and or GCA files from NCBI or download represenative genomes
+
+		Parameters
+			list - list of GCF/GCA ids
+			bool - download representative genomes instead of list
+
+		Returns
+			list - list of files not downloaded
+		'''
+		if representatives:
+			tarfile = self.download_represenatives(genome_path=self.download_path, url=representatives)
+			folder_path = self.parse_representatives(tarfile)
+			'''Process the downloaded folder'''
+			return folder_path,[]
+		else:
+			if len(files) > 0:
+				not_downloaded = self.download_files(files)
+				return False,not_downloaded
diff --git a/flextaxd/modules/ModifyTree.py b/flextaxd/modules/ModifyTree.py
@@ -263,7 +263,7 @@ def parse_modification(self, input,modtype="database"):
 		logger.debug("new: {new}".format(new=len(self.new_nodes)))
 		logger.debug("ovl: {ovl}".format(ovl=len(self.existing_nodes & self.new_nodes)))
 
-		if self.replace & len(self.existing_nodes) > 0:  ## remove nodes connected to old nodes that is not replaced
+		if self.replace and len(self.existing_nodes) > 0:  ## remove nodes connected to old nodes that is not replaced
 			self.non_overlapping_old_links = set(self.taxonomydb.get_links((self.existing_nodes & self.new_nodes) - set([self.taxonomydb.get_id(self.parent)])))  ## Remove all links related to new nodes
 
 		self.overlapping_links = self.existing_links & self.new_links ## (links existing in both networks)
@@ -364,7 +364,7 @@ def clean_database(self, ncbi=False):
 		self.annotated_nodes = set(self.taxonomydb.get_genomes(cols="genome,id").keys())
 		an = len(self.annotated_nodes)
 		logger.info("Annotated nodes: {an}".format(an=an))
-		if an == 0:
+		if an == 0 and not ncbi:
 			raise InputError("Database has no annotations, the whole database would be cleaned")
 		logger.info("Get all links in database")
 		self.all_links = set(self.taxonomydb.get_links())

diff --git a/flextaxd/modules/ReadTaxonomyNCBI.py b/flextaxd/modules/ReadTaxonomyNCBI.py
@@ -20,8 +20,13 @@ def __init__(self, taxonomy_file=False, database=False):
 		self.length = 0
 		self.ids = 0
 		self.accessionfile = False
-		#self.add_rank("no rank")
-		#self.add_link(child=1, parent=1,rank="no rank")
+
+	def write_missing(self,missing):
+		'''Write missing genomes to file'''
+		with open("FlexTaxD.not_added", "w") as of:
+			for gen in missing:
+				print(gen, end="\n", file=of)
+		return
 
 	def set_accession_file(self,file):
 		self.accessionfile = file
@@ -89,10 +94,10 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file):
 				if filename.strip(".gz").endswith(".fna"):
 					filepath = os.path.join(root, filename)
 					self.parse_genebank_file(filepath,filename)
-		logger.info("genomes folder read {n} sequence files found".format(n=len(self.refseqid_to_GCF)))
+		logger.info("genomes folder read, {n} sequence files found".format(n=len(self.refseqid_to_GCF)))
 		if not annotation_file.endswith("accession2taxid.gz"):
 			raise TypeError("The supplied annotation file does not seem to be the ncbi nucl_gb.accession2taxid.gz")
-		count = 0
+		annotated_genome = set()
 		with zopen(annotation_file,"r") as f:
 			headers = f.readline().split(b"\t")
 			for row in f:
@@ -101,8 +106,15 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file):
 					try:
 						genebankid = self.refseqid_to_GCF[refseqid]
 						self.database.add_genome(genome=genebankid,_id=taxid.decode("utf-8"))
+						annotated_genome.add(refseqid)
 					except:
-						count +=1
+						pass
 			self.database.commit()
-		logger.info("Genomes not matching any annotation {len}".format(len=count))
-		return
+		missing = set(self.refseqid_to_GCF.keys()) - annotated_genome
+		missing = [self.refseqid_to_GCF[m] for m in missing] ## Translate to GCF ids
+		if logging.root.level <=20: ## Equal to --verbose
+			logger.info("Printing non added genome id´s (GCF) to ./FlexTaxD.not_added")
+			self.write_missing(missing)
+		logger.debug(missing)  ## If debug also print genomes to terminal
+		logger.info("Genomes not matching any annotation {len}".format(len=len(missing)))
+		return missing
diff --git a/flextaxd/modules/ReadTaxonomyQIIME.py b/flextaxd/modules/ReadTaxonomyQIIME.py
@@ -4,7 +4,7 @@
 Read QIIME formatted taxonomy files and holds a dictionary with taxonomy tree and name translation
 '''
 
-from .ReadTaxonomy import ReadTaxonomy
+from .ReadTaxonomy import ReadTaxonomy,InputError
 from .database.DatabaseConnection import DatabaseFunctions
 import logging
 logger = logging.getLogger(__name__)
@@ -102,33 +102,44 @@ def qiime_to_tree(self, sep="\t"):
 		self.errors = 0
 		self.added = 0
 		taxid_start = self.taxid_base
+		raiseWarning = False
 		with open(self.input) as f:
 			'''Each row defines a genome annotation file connected to a tree level'''
 			for row in f:
 				if row.strip() != "":  ## If there are trailing empty lines in the file
 					data = row.strip().split("\t")
-					try:
-						if data[0].startswith(("RS","GB")):
-							'''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this'''
-							genome_id = data[0].split("_",1)[1].strip()   ## Genome ID
-						else:
-							genome_id = data[0].strip()
-						#print(genome_id)
-					except IndexError:
-						logger.debug("Row {row} could not be parsed".format(row=data))
-						self.errors +=1
+					if len(data) != 2:  ## If the QIIME file is annotated it will contain two columns
+						if len(data) > 2:
+							raise InputError("The input format is not correct for QIIME. THe QIIME source format requires one [QIIME-tree] or two columns [gene_id\\tQIIME-tree]")
+						raiseWarning = True
+						genome_id = False
+					else: 				## Annotation is given in column one
+						try:
+							if data[0].startswith(("RS","GB")):
+								'''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this'''
+								genome_id = data[0].split("_",1)[1].strip()   ## Genome ID
+							else:
+								genome_id = data[0].strip()
+							#print(genome_id)
+						except IndexError:
+							logger.debug("Row {row} could not be parsed".format(row=data))
+							self.errors +=1
+
 					### Walk through tree and make sure all nodes back to root are annotated!
 					taxonomy = list(reversed(data[-1].split(";")))
 					taxonomy_i = self.parse_tree(taxonomy)
-					if taxonomy_i:
+					if taxonomy_i and genome_id:
 						test = self.database.add_genome(genome=genome_id,_id=taxonomy_i)
 						if not str(test).startswith("UNIQUE"):
 							self.added +=1
+					elif not genome_id: ## No genome id suggest a QIIME file with no annotation
+						pass
 					else:
 						logger.debug("Warning taxonomy: {taxonomy} could not be parsed!!")
 						self.missed +=1
 		self.database.commit()
 		self.length = self.taxid_base - taxid_start
+		if raiseWarning: logger.warning("Warning no genomeid2taxid file given and your QIIME formatted file does not seem to contain annotations!")
 		logger.info("Genomes added to database: {genomes}".format(genomes=self.added))
 		logger.debug("Genomes not added to database {missed} errors {errors}".format(missed=self.missed,errors=self.errors))
 		logger.info("New taxonomy ids assigned {taxidnr}".format(taxidnr=self.length))