diff --git a/src/pynteny/cli.py b/src/pynteny/cli.py index 37a3612..75648d1 100644 --- a/src/pynteny/cli.py +++ b/src/pynteny/cli.py @@ -517,6 +517,22 @@ def download() -> argparse.ArgumentParser: action="store_true", help="force-download database again if already downloaded", ) + optional.add_argument( + "-pgap", + "--pgap", + dest="pgap", + default=False, + action="store_true", + help="download PGAP database (default)", + ) + optional.add_argument( + "-pfam", + "--pfam", + dest="pfam", + default=False, + action="store_true", + help="download PFAM database", + ) optional.add_argument( "-l", "--log", diff --git a/src/pynteny/config.json b/src/pynteny/config.json index 2b2f043..90345dd 100644 --- a/src/pynteny/config.json +++ b/src/pynteny/config.json @@ -1,9 +1,11 @@ { "database_dir": "", "upack_PGAP_database": false, - "data_downloaded": false, + "upack_PFAM_database": false, + "PGAP_data_downloaded": false, + "PFAM_data_downloaded": false, "PGAP_database": "", "PGAP_meta_file": "", - "streamlit_process": "", - "streamlit_log": "" + "PFAM_database": "", + "PFAM_meta_file": "" } \ No newline at end of file diff --git a/src/pynteny/hmm.py b/src/pynteny/hmm.py index cb976fb..549b3df 100644 --- a/src/pynteny/hmm.py +++ b/src/pynteny/hmm.py @@ -10,7 +10,6 @@ import logging import os import sys -from typing import Callable from collections import defaultdict from pathlib import Path import tempfile @@ -250,12 +249,13 @@ def get_meta_info_for_HMM(self, hmm_name: str) -> dict: class PGAP(HMMDatabase): """Tools to parse PGAP hmm database metadata""" - def __init__(self): + def __init__(self, *args, **kwargs): """Initialize class PGAP""" - super().__init__() + super().__init__(*args, **kwargs) self._meta = self._meta.rename(columns={"#ncbi_accession": "accession"}) - self._meta = self.remove_missing_HMMs_from_metadata(meta_outfile=None) + # self._meta = self.remove_missing_HMMs_from_metadata(meta_outfile=None) + @staticmethod def remove_missing_HMMs_from_metadata(self, meta_outfile: Path = None) -> None: """Remove HMMs from metadata that are not in HMM directory @@ -290,10 +290,6 @@ def remove_missing_HMMs_from_metadata(self, meta_outfile: Path = None) -> None: class PFAM(HMMDatabase): """Tools to preprocess the PFAM-A hmm database""" - def __init__(self): - """Initialize class PFAM""" - super().__init__() - @classmethod def from_gz_file( cls, hmm_gz_file: Path, hmm_outdir: Path = None, meta_outfile: Path = None @@ -359,95 +355,83 @@ def construct_meta_file(self, meta_outfile: Path = None) -> None: self._meta = pd.read_csv(meta_outfile, sep="\t") -class Downloader: - """Tools to download and preprocess HMM databases""" - - def __init__(self, download_dir: Path): - """Initialize class Downloader - - Args: - output_dir (Path): path to output directory. - """ - self._download_dir = Path(download_dir) - if self._download_dir.exists(): - logger.warning( - f"{self._download_dir} already exists. Downloader may overwrite files." - ) - - def download_pgap(self, unpack: bool = False) -> None: - """Download PGAP database - - Args: - unpack (bool, optional): if True then PGAP database will be extracted - """ - - data_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.HMM.tgz" - meta_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv" - logger.info("Downloading PGAP database") - try: - PGAP_file = self._download_dir / "hmm_PGAP.HMM.tgz" - meta_file = self._download_dir / "hmm_PGAP.tsv" - download_file(data_url, PGAP_file) - download_file(meta_url, meta_file) - except Exception: - logger.exception( - "Failed to download PGAP database. Please check your internet connection." - ) - sys.exit(1) - if unpack: - self.extract_pgap_to_directory(PGAP_file) - logger.info("Database downloaded successfully\n") - - def download_pfam(self, unpack: bool = False) -> None: - """Download PFAM database - - Args: - unpack (bool, optional): if True then PFAM database will be extracted - """ - pfam_file = self.download_dir / "Pfam-A.gz" - # hmm_outdir = self._output_dir.parent / "pfam_hmms" - # meta_outfile = hmm_outdir / f"{pfam_file.stem}_meta.tsv" - logger.info("Downloading PFAM-A hmm database") - try: - url = ( - "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz" - ) - download_file(url, pfam_file) - except Exception: - logger.exception( - "Failed to download PFAM-A database. Please check your internet connection." - ) - sys.exit(1) - if unpack: - self.extract_pfam_to_directory(pfam_file) - logger.info("Database downloaded successfully") - - def extract_pgap_to_directory(self, pgap_tar: Path) -> None: - """Extract PGAP hmm database (tar.gz) to downlaod directory +def download_pgap(download_dir: Path, unpack: bool = False) -> tuple[Path, Path]: + """Download PGAP database - Args: - pgap_tar (Path): path to compressed PGAP database. - """ - pgap_tar = Path(pgap_tar) - if not is_tar_file(pgap_tar): - logger.warning(f"{pgap_tar} is not a tar file. Skipping extraction") - return - logger.info("Extracting hmm files to target directory") - extract_tar_file(pgap_tar, self._download_dir) - flatten_directory(self._download_dir) - logger.info("PGAP database unpacked successfully") - - def extract_pfam_to_directory(self, pfam_gz: Path) -> None: - """Extract PFAM hmm database (gz) to downlaod directory + Args: + download_dir (Path): path to output directory. + unpack (bool, optional): if True then PGAP database will be extracted + """ + if download_dir.exists(): + logger.warning( + f"{download_dir} already exists. Downloader may overwrite files." + ) - Args: - pfam_gz (Path): path to compressed PFAM database. - """ - pfam_gz = Path(pfam_gz) - if not pfam_gz.is_file(): - logger.warning(f"{pfam_gz} is not a file. Skipping extraction") - return - logger.info("Extracting hmm files to target directory") - extract_gz_file(pfam_gz, self._download_dir) - flatten_directory(self.download_dir) - logger.info("PGAP database unpacked successfully") + data_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.HMM.tgz" + meta_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv" + PGAP_file = download_dir / "hmm_PGAP.HMM.tgz" + meta_file = download_dir / "hmm_PGAP.tsv" + download_file(data_url, PGAP_file) + download_file(meta_url, meta_file) + if unpack: + destination_path = download_dir / "pgap_hmms" + extract_pgap_to_directory(PGAP_file, destination_dir=destination_path) + return destination_path, meta_file + else: + return PGAP_file, meta_file + + +def download_pfam(download_dir: Path, unpack: bool = False) -> Path: + """Download PFAM database + + Args: + unpack (bool, optional): if True then PFAM database will be extracted + """ + if download_dir.exists(): + logger.warning( + f"{download_dir} already exists. Downloader may overwrite files." + ) + PFAM_file = download_dir / "Pfam-A.gz" + logger.info("Downloading PFAM-A hmm database") + url = "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz" + download_file(url, PFAM_file) + if unpack: + destination_path = download_dir / "pfam_hmms" + extract_pfam_to_directory(PFAM_file, destination_dir=destination_path) + return destination_path + else: + return PFAM_file + + +def extract_pgap_to_directory(pgap_tar: Path, destination_dir: Path) -> None: + """Extract PGAP hmm database (tar.gz) to downlaod directory + + Args: + pgap_tar (Path): path to compressed PGAP database. + """ + pgap_tar = Path(pgap_tar) + if not is_tar_file(pgap_tar): + logger.warning(f"{pgap_tar} is not a tar file. Skipping extraction") + return + logger.info("Extracting hmm files to target directory") + extract_tar_file(pgap_tar, destination_dir) + flatten_directory(destination_dir) + os.remove(pgap_tar) + logger.info("PGAP database unpacked successfully") + + +def extract_pfam_to_directory(pfam_gz: Path, destination_dir: Path) -> None: + """Extract PFAM hmm database (gz) to downlaod directory + + Args: + pfam_gz (Path): path to compressed PFAM database. + """ + pfam_gz = Path(pfam_gz) + if not pfam_gz.is_file(): + logger.warning(f"{pfam_gz} is not a file. Skipping extraction") + return + logger.info("Extracting hmm files to target directory") + extract_gz_file(pfam_gz, destination_dir) + flatten_directory(destination_dir) + os.remove(pfam_gz) + logger.info("PGAP database unpacked successfully") diff --git a/src/pynteny/subcommands.py b/src/pynteny/subcommands.py index 7d9035d..623b2f8 100644 --- a/src/pynteny/subcommands.py +++ b/src/pynteny/subcommands.py @@ -17,12 +17,11 @@ import pynteny.parsers.syntenyparser as syntenyparser from pynteny.filter import SyntenyHits, filter_FASTA_by_synteny_structure -from pynteny.hmm import PGAP, PFAM, Downloader +from pynteny.hmm import PGAP, PFAM, download_pgap, download_pfam from pynteny.preprocessing import Database from pynteny.utils import ( CommandArgs, ConfigParser, - download_file, is_tar_file, ) @@ -74,7 +73,10 @@ def synteny_search(args: Union[CommandArgs, ArgumentParser]) -> SyntenyHits: ) sys.exit(1) if args.hmm_dir is None: - if not config.get_field("data_downloaded"): + if not ( + config.get_field("PGAP_data_downloaded") + or config.get_field("PFAM_data_downloaded") + ): logger.warning( "HMM database not found. Downloading PGAP database from NCBI" ) @@ -84,7 +86,10 @@ def synteny_search(args: Union[CommandArgs, ArgumentParser]) -> SyntenyHits: args.hmm_dir = Path(config.get_field("PGAP_database")) if args.gene_ids: if args.hmm_meta is None: - if not config.get_field("data_downloaded"): + if not ( + config.get_field("PGAP_data_downloaded") + or config.get_field("PFAM_data_downloaded") + ): logger.error( "Please download hmm database first or provide path to hmm metadata file." ) @@ -201,7 +206,10 @@ def parse_gene_ids(args: Union[CommandArgs, ArgumentParser]) -> str: logger = init_logger(args) config = ConfigParser.get_default_config() if args.hmm_meta is None: - if not config.get_field("data_downloaded"): + if not ( + config.get_field("PGAP_data_downloaded") + or config.get_field("PFAM_data_downloaded") + ): logger.error( "Please download hmm database meta file or provide path to existing one first." ) @@ -229,7 +237,19 @@ def download_hmms(args: Union[CommandArgs, ArgumentParser]) -> None: """ logger = init_logger(args) config = ConfigParser.get_default_config() - if (config.get_field("data_downloaded")) and (not args.force): + if (config.get_field("PGAP_data_downloaded")) and (args.pgap) and (not args.force): + logger.info("PGAP HMM database already downloaded. Skipping download") + elif ( + (config.get_field("PFAM_data_downloaded")) and (args.pfam) and (not args.force) + ): + logger.info("PFAM HMM database already downloaded. Skipping download") + elif ( + (config.get_field("PGAP_data_downloaded")) + and (args.pgap) + and (config.get_field("PFAM_data_downloaded")) + and (args.pfam) + and (not args.force) + ): logger.info("HMM databases already downloaded. Skipping download") sys.exit(1) if args.outdir is None: @@ -241,36 +261,46 @@ def download_hmms(args: Union[CommandArgs, ArgumentParser]) -> None: download_dir.mkdir(parents=True, exist_ok=True) config.update_config("database_dir", download_dir.as_posix()) - config.update_config("unpack_PGAP_database", args.unpack) - - downloader = Downloader(download_dir) - - # data_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.HMM.tgz" - # meta_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv" - # logger.info("Downloading PGAP database") - # try: - # PGAP_file = download_dir / "hmm_PGAP.HMM.tgz" - # meta_file = download_dir / "hmm_PGAP.tsv" - # download_file(data_url, PGAP_file) - # download_file(meta_url, meta_file) - # logger.info("Database dowloaded successfully\n") - # config.update_config("data_downloaded", True) - # config.update_config("PGAP_database", PGAP_file.as_posix()) - # config.update_config("PGAP_meta_file", meta_file.as_posix()) - # except Exception: - # logger.exception( - # "Failed to download PGAP database. Please check your internet connection." - # ) - # sys.exit(1) - # logger.info("Removing missing entries from PGAP metadata file") - # PGAP(meta_file).remove_missing_HMMs_from_metadata(PGAP_file, meta_file) - # if args.unpack: - # logger.info("Unpacking PGAP database") - # unpacked_PGAP_dir = download_dir / "hmm_PGAP" - # PGAP.extract_PGAP_to_directory(PGAP_file, output_dir=unpacked_PGAP_dir) - # os.remove(PGAP_file) - # config.update_config("PGAP_database", unpacked_PGAP_dir.as_posix()) - # logger.info("PGAP database unpacked successfully") + + if args.pgap: + logger.info("Downloading PGAP database") + try: + PGAP_path, PGAP_meta_file = download_pgap(download_dir, unpack=args.unpack) + PGAP(PGAP_path, PGAP_meta_file).remove_missing_HMMs_from_metadata( + PGAP_meta_file + ) + config.update_config("unpack_PGAP_database", args.unpack) + logger.info("PGAP database downloaded successfully\n") + config.update_config("PGAP_data_downloaded", True) + config.update_config("PGAP_database", PGAP_path.as_posix()) + config.update_config("PGAP_meta_file", PGAP_meta_file.as_posix()) + except Exception: + logger.exception( + "Failed to download PGAP database. Please check your internet connection." + ) + sys.exit(1) + + if args.pfam: + logger.info("Downloading PFAM-A database") + try: + PFAM_meta_file = download_dir / "hmm_PFAM.tsv" + PFAM_path = download_dir / "PFAM_hmms" + PFAM_gz_file = download_pfam(download_dir, unpack=True) + pfam = PFAM.from_gz_file( + PFAM_gz_file, + hmm_outdir=PFAM_path, + meta_outfile=PFAM_meta_file, + ) + config.update_config("unpack_PFAM_database", True) + logger.info("PFAM database downloaded successfully\n") + config.update_config("PFAM_data_downloaded", True) + config.update_config("PFAM_database", PFAM_path.as_posix()) + config.update_config("PFAM_meta_file", PFAM_meta_file.as_posix()) + except Exception: + logger.exception( + "Failed to download PFAM-A database. Please check your internet connection." + ) + sys.exit(1) logging.shutdown() diff --git a/src/pynteny/utils.py b/src/pynteny/utils.py index 7d4aaf8..9bc4c09 100644 --- a/src/pynteny/utils.py +++ b/src/pynteny/utils.py @@ -55,10 +55,13 @@ def initialize_config_file() -> Path: config = { "database_dir": "", "upack_PGAP_database": False, - "data_downloaded": False, + "upack_PFAM_database": False, + "PGAP_data_downloaded": False, + "PFAM_data_downloaded": False, "PGAP_database": "", "PGAP_meta_file": "", - "streamlit_process": "", + "PFAM_database": "", + "PFAM_meta_file": "", } with open(config_file, "w", encoding="UTF-8") as f: json.dump(config, f, indent=4)