Skip to content

Commit

Permalink
Merge pull request #201 from jmchilton/idc_2
Browse files Browse the repository at this point in the history
Enhancements to the IDC scripts
  • Loading branch information
mvdbeek authored Jan 31, 2024
2 parents bacbf87 + cf586a9 commit 99154c3
Show file tree
Hide file tree
Showing 16 changed files with 929 additions and 28 deletions.
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@ def get_var(var_name):
install_tool_deps=ephemeris.install_tool_deps:main
install-tool-deps=ephemeris.install_tool_deps:main
set-library-permissions=ephemeris.set_library_permissions:main
"""
_idc-lint=ephemeris._idc_lint:main
_idc-split-data-manager-genomes=ephemeris._idc_split_data_manager_genomes:main
_idc-data-managers-to-tools=ephemeris._idc_data_managers_to_tools:main
"""

PACKAGE_DATA = {
# Be sure to update MANIFEST.in for source dist.
}
Expand Down
12 changes: 11 additions & 1 deletion src/ephemeris/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

import yaml
from bioblend import galaxy

Expand All @@ -11,6 +13,14 @@
RAW_CONTENT_URL = f"https://raw.github.com/{PROJECT_USERAME}/{PROJECT_NAME}/master/"


def get_or_create_history(history_name: str, gi: galaxy.GalaxyInstance):
    """Fetch a history by name, creating it when the lookup returns nothing.

    Args:
        history_name: Name used for the lookup and, if needed, for the new history.
        gi: Galaxy connection whose ``histories`` client performs both calls.

    Returns:
        The first element of ``gi.histories.get_histories(name=history_name)``
        when that result is non-empty, otherwise whatever
        ``gi.histories.create_history(name=history_name)`` returns.
    """
    matching = gi.histories.get_histories(name=history_name)
    # Several histories may come back for one name; the first one is reused.
    return matching[0] if matching else gi.histories.create_history(name=history_name)


def check_url(url, log=None):
if not url.startswith("http"):
if log:
Expand All @@ -32,7 +42,7 @@ def get_galaxy_connection(args, file=None, log=None, login_required=True):

url = args.galaxy or file_content.get("galaxy_instance")
galaxy_url = check_url(url, log)
api_key = args.api_key or file_content.get("api_key")
api_key = args.api_key or file_content.get("api_key") or os.environ.get("EPHEMERIS_API_KEY")

if args.user and args.password:
return galaxy.GalaxyInstance(url=galaxy_url, email=args.user, password=args.password)
Expand Down
85 changes: 85 additions & 0 deletions src/ephemeris/_config_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from pathlib import Path
from typing import (
Dict,
List,
Optional,
Union,
)

import yaml
from pydantic import (
BaseModel,
Extra,
)

StrOrPath = Union[Path, str]


class RepositoryInstallTarget(BaseModel):
    """One repository entry of a ``tools.yml``-style install list.

    Only ``name`` and ``owner`` are required. Every other field is given an
    explicit ``None`` default instead of relying on pydantic's implicit
    treatment of a bare ``Optional[...]`` annotation, so the fields are
    visibly optional and stay optional regardless of how the installed
    pydantic version interprets annotation-only ``Optional`` fields.
    """

    name: str
    owner: str
    tool_shed_url: Optional[str] = None
    tool_panel_section_id: Optional[str] = None
    tool_panel_section_label: Optional[str] = None
    revisions: Optional[List[str]] = None
    install_tool_dependencies: Optional[bool] = None
    install_repository_dependencies: Optional[bool] = None
    install_resolver_dependencies: Optional[bool] = None


class RepositoryInstallTargets(BaseModel):
    """Top-level document of a ``tools.yml``-style install configuration.

    ``tools`` is the required list of repositories. ``api_key`` and
    ``galaxy_instance`` are optional and default to ``None`` explicitly rather
    than through pydantic's implicit handling of bare ``Optional[...]``
    annotations.
    """

    api_key: Optional[str] = None
    galaxy_instance: Optional[str] = None
    tools: List[RepositoryInstallTarget]


class DataManager(BaseModel, extra=Extra.forbid):
    """A single data manager entry; keys other than the declared fields are rejected."""

    # Tags attached to this data manager (required, list of strings).
    tags: List[str]
    # Tool ID of the data manager tool (required).
    tool_id: str


class DataManagers(BaseModel, extra=Extra.forbid):
    """Mapping of string keys to ``DataManager`` entries, validated as the document root."""

    # Custom root type: the whole parsed document is the mapping itself.
    __root__: Dict[str, DataManager]


class Genome(BaseModel):
    """One genome entry of the IDC genomes configuration.

    ``id`` and ``description`` are required. All remaining fields are optional
    and carry an explicit ``None`` default instead of depending on pydantic's
    implicit handling of bare ``Optional[...]`` annotations.
    """

    id: str  # The unique id of the data in Galaxy
    description: str  # The description of the data, including its taxonomy, version and date
    dbkey: Optional[str] = None
    # The source of the data. Can be: 'ucsc', an NCBI accession number or a URL to a fasta file.
    source: Optional[str] = None

    # The following fields are currently purely for human consumption and unused by
    # IDC infrastructure.
    doi: Optional[str] = None  # Any DOI associated with the data
    blob: Optional[str] = None  # A blob for any other pertinent information
    checksum: Optional[str] = None  # A SHA256 checksum of the original
    version: Optional[str] = None  # Any version information associated with the data

    # Description of actions (data managers) to run on target genome.
    # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools
    indexers: Optional[List[str]] = None
    # unimplemented: but if we implement classes of indexers, these will be ones to skip
    skiplist: Optional[List[str]] = None


class Genomes(BaseModel):
    """Top-level genomes document: a required ``genomes`` list of ``Genome`` entries."""

    genomes: List[Genome]


def _read_yaml(path: StrOrPath):
    """Parse the YAML file at ``path`` and return the resulting Python object.

    The file is decoded as UTF-8 explicitly so that parsing does not depend on
    the platform's default text encoding.

    Args:
        path: Location of the YAML file, as a string or ``Path``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)


def read_data_managers(path: StrOrPath) -> DataManagers:
    """Load the YAML file at ``path`` and validate it as a ``DataManagers`` root mapping."""
    raw_mapping = _read_yaml(path)
    return DataManagers(__root__=raw_mapping)


def read_genomes(path: StrOrPath) -> Genomes:
    """Load the YAML file at ``path`` and validate its top-level keys as a ``Genomes`` model."""
    document = _read_yaml(path)
    return Genomes(**document)


def read_tools(path: StrOrPath) -> RepositoryInstallTargets:
    """Load the YAML file at ``path`` and validate its top-level keys as ``RepositoryInstallTargets``."""
    document = _read_yaml(path)
    return RepositoryInstallTargets(**document)
105 changes: 105 additions & 0 deletions src/ephemeris/_idc_data_managers_to_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python
"""Helper script for IDC - not yet meant for public consumption.
This script takes a data_managers.yml configuration describing the
set of data managers the IDC configuration targets and builds a
tools.yml file from it for use with shed_tools.
"""
import argparse
import logging
from typing import (
Dict,
List,
NamedTuple,
)

import yaml

from ._config_models import (
read_data_managers,
RepositoryInstallTargets,
)
from .common_parser import (
add_log_file_argument,
add_verbosity_argument,
)
from .ephemeris_log import (
disable_external_library_logging,
setup_global_logger,
)


class DataManager(NamedTuple):
    """Immutable record for one configured data manager."""

    # Tool ID copied from the data manager's configuration entry.
    tool_id: str
    # Key under which the entry appears in the data managers configuration.
    repository_name: str
    # Tags from the configuration entry (an empty list when none are set).
    tags: List[str]


def read_data_managers_configuration(path: str) -> Dict[str, DataManager]:
    """Read the data managers configuration at ``path`` into ``DataManager`` records.

    Args:
        path: Location of the data managers YAML configuration.

    Returns:
        A dict keyed by the configuration's own keys (repository names), each
        mapped to a ``DataManager`` built from the corresponding entry.
    """
    configured = read_data_managers(path).__root__
    return {
        repo_name: DataManager(
            tool_id=entry.tool_id,
            repository_name=repo_name,
            # Falsy tags are normalised to an empty list.
            tags=entry.tags or [],
        )
        for repo_name, entry in configured.items()
    }


def build_shed_install_conf(path: str) -> dict:
    """Build a shed-install configuration dict from a data managers configuration.

    Args:
        path: Location of the data managers YAML configuration.

    Returns:
        ``{"tools": [...]}`` with one entry per configured data manager, in
        configuration order.
    """

    def _install_entry(tool_id: str) -> dict:
        # The third and fourth "/"-separated components of the tool ID are
        # used as the repository owner and repository name respectively.
        segments = tool_id.split("/")
        owner, name = segments[2], segments[3]
        return {
            "name": name,
            "owner": owner,
            "tool_panel_section_label": None,
            "tool_shed_url": "toolshed.g2.bx.psu.edu",
        }

    configured = read_data_managers_configuration(path)
    return {"tools": [_install_entry(dm.tool_id) for dm in configured.values()]}


def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None:
    """Generate the shed-install configuration and write it to ``output_path`` as YAML.

    Args:
        data_manager_conf_path: Location of the data managers YAML configuration.
        output_path: File to (over)write with the generated configuration.
    """
    install_conf = build_shed_install_conf(data_manager_conf_path)

    # The model instance is discarded; constructing it is only a validity
    # check on the generated dict before anything is written to disk.
    RepositoryInstallTargets(**install_conf)

    with open(output_path, "w") as out:
        yaml.safe_dump(install_conf, out)


def _parser():
    """Build and return the argument parser for this script.

    The parser is created with ``add_help=False`` and holds a "General options"
    group (verbosity and log file arguments) plus the two configuration path
    options with their default file names.
    """
    parser = argparse.ArgumentParser(add_help=False)

    general_options = parser.add_argument_group("General options")
    for register in (add_verbosity_argument, add_log_file_argument):
        register(general_options)

    for flag, default in (
        ("--data-managers-conf", "data_managers.yml"),
        ("--shed-install-output-conf", "tools.yml"),
    ):
        parser.add_argument(flag, default=default)
    return parser


def main():
    """Command-line entry point: parse arguments, set up logging, write the tools file."""
    disable_external_library_logging()
    args = _parser().parse_args()
    log = setup_global_logger(name=__name__, log_file=args.log_file)
    # Verbose runs log at DEBUG, everything else at INFO.
    log.setLevel(logging.DEBUG if args.verbose else logging.INFO)
    write_shed_install_conf(args.data_managers_conf, args.shed_install_output_conf)


if __name__ == "__main__":
    main()
44 changes: 44 additions & 0 deletions src/ephemeris/_idc_lint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
from pathlib import Path

import yaml

from ._config_models import (
read_data_managers,
read_genomes,
)


def read_yaml(path: Path):
    """Parse the YAML file at ``path`` and return the resulting Python object.

    The file is decoded as UTF-8 explicitly so that parsing does not depend on
    the platform's default text encoding.

    Args:
        path: Location of the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)


def lint_idc_directory(directory: Path):
    """Check the IDC configuration files in ``directory`` for consistency.

    The directory must contain ``genomes.yml`` and ``data_managers.yml``. Every
    data manager tool ID must start with ``toolshed.g2.bx.psu.edu/`` and every
    indexer listed on a genome must be a key of the data managers file.

    Args:
        directory: Directory holding the two configuration files.

    Raises:
        AssertionError: If either configuration file is missing.
        Exception: If a data manager tool ID is not from the main tool shed, or
            a genome references an indexer that is not a configured data manager.
    """
    genomes_path = directory / "genomes.yml"
    data_managers_path = directory / "data_managers.yml"
    # Raised explicitly rather than via ``assert`` so the checks still run under
    # ``python -O`` and report which file is missing. AssertionError is kept so
    # the exception type seen by existing callers does not change.
    for required_path in (genomes_path, data_managers_path):
        if not required_path.exists():
            raise AssertionError(f"Expected IDC configuration file {required_path} does not exist")
    data_managers = read_data_managers(data_managers_path).__root__
    genomes = read_genomes(genomes_path)

    for data_manager in data_managers.values():
        data_manager_tool_id = data_manager.tool_id
        if not data_manager_tool_id.startswith("toolshed.g2.bx.psu.edu/"):
            raise Exception(
                f"Expected a data manager repository from main Galaxy tool shed but discovered tool ID {data_manager_tool_id}"
            )

    for genome in genomes.genomes:
        # Each genome is echoed to stdout as it is checked.
        print(genome)
        for indexer in genome.indexers or []:
            if indexer not in data_managers:
                raise Exception(f"Failed to find data manager {indexer} referenced for genome {genome}")


def main():
    """Command-line entry point: lint the IDC configuration in the current directory."""
    current_directory = Path(os.curdir)
    lint_idc_directory(current_directory)


if __name__ == "__main__":
    main()
Loading

0 comments on commit 99154c3

Please sign in to comment.