From a6a21c35dcb932244586c9ec0de106037a5ecde3 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 09:50:45 -0400 Subject: [PATCH] Add docker volume to create directory to store pickled obo files. Adjust startup script to generate obo graph if not found in mounted host directory. Add shell script to build image, make host directory, and run the container. --- docker/Dockerfile | 15 +-------- docker/README.md | 7 +++++ docker/build_run_docker.sh | 5 +++ ...nerate_graph.py => generate_obo_graphs.py} | 4 ++- docker/startup.sh | 10 ++++++ mira/dkg/construct.py | 31 ++++++------------- 6 files changed, 36 insertions(+), 36 deletions(-) create mode 100755 docker/build_run_docker.sh rename docker/{generate_graph.py => generate_obo_graphs.py} (91%) diff --git a/docker/Dockerfile b/docker/Dockerfile index 03b54bfa0..181e710d1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,16 +1,3 @@ -# Create an initial docker image to generate the graph and transfer it to the -# second docker image -# We do this to avoid involving additional imports in the second docker image -FROM python:3.10-slim AS graph-builder - -WORKDIR /graphs -RUN apt-get update && apt-get install -y git -RUN pip install pyobo networkx obonet - -# Copy and run the script to generate the pickled graph -COPY generate_graph.py /graphs/generate_graph.py -RUN python generate_graph.py - FROM ubuntu:focal WORKDIR /sw @@ -42,6 +29,7 @@ RUN wget -O /sw/nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/bui sed -i 's/#dbms.security.auth_enabled/dbms.security.auth_enabled/' /etc/neo4j/neo4j.conf && \ neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true --nodes /sw/nodes.tsv.gz --relationships /sw/edges.tsv.gz +COPY generate_obo_graphs.py /sw/generate_obo_graphs.py # Python packages RUN python -m pip install --upgrade pip && \ python -m pip install git+https://github.com/gyorilab/mira.git@main#egg=mira[web,uvicorn,dkg-client,dkg-construct] && \ @@ 
-56,6 +44,5 @@ RUN python -m pip install --upgrade pip && \ RUN wget -O /sw/sir_flux_span.json https://raw.githubusercontent.com/gyorilab/mira/main/tests/sir_flux_span.json RUN mkdir -p /graphs -COPY --from=graph-builder /graphs/relabeled_obo_graph.pkl /graphs/relabeled_obo_graph.pkl COPY startup.sh startup.sh ENTRYPOINT ["/bin/bash", "/sw/startup.sh"] diff --git a/docker/README.md b/docker/README.md index d009ec30d..787fe4c5f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -40,6 +40,13 @@ docker run -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 mira: This exposes a REST API at `http://localhost:8771`. This also exposes Neo4j's bolt port at port 7687. + +Running the `build_run_docker.sh` script builds the docker image, +creates the directory `docker/mounted_graph_storage` to store the pickled obo +graphs, and starts the container. When you first run the script and +start the container, it will take a few minutes to generate and store the +pickled graphs. + ## MIRA Metaregistry The MIRA metaregistry contains the prefixes and their associated metadata for all use cases. diff --git a/docker/build_run_docker.sh b/docker/build_run_docker.sh new file mode 100755 index 000000000..0e4dc4e8b --- /dev/null +++ b/docker/build_run_docker.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +docker build --tag mira_epi_dkg:latest . 
+mkdir -p mounted_graph_storage +docker run --detach -v ./mounted_graph_storage:/graphs -p 7474:7474 -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 --name mira mira_epi_dkg:latest \ No newline at end of file diff --git a/docker/generate_graph.py b/docker/generate_obo_graphs.py similarity index 91% rename from docker/generate_graph.py rename to docker/generate_obo_graphs.py index 231caab00..3dcb28cf3 100644 --- a/docker/generate_graph.py +++ b/docker/generate_obo_graphs.py @@ -10,11 +10,13 @@ def download_convert_ncbitaxon_obo_to_graph(): version = get_version(resource_prefix) # Checks to see if the pickled ncbitaxon obo graph exists in the container - cached_relabeled_obo_graph_path = Path("/graphs/relabeled_obo_graph.pkl") + cached_relabeled_obo_graph_path = Path("/graphs/ncbitaxon_obo_graph.pkl") if not cached_relabeled_obo_graph_path.exists(): _, obo_path = _ensure_ontology_path(resource_prefix, force=False, version=version) obo_graph = read_obo(obo_path) + + # Normalize node indices relabeled_graph = networkx.relabel_nodes(obo_graph, lambda node_index: node_index.lower()) diff --git a/docker/startup.sh b/docker/startup.sh index 3f412a327..9ee790f36 100755 --- a/docker/startup.sh +++ b/docker/startup.sh @@ -1,4 +1,14 @@ #!/bin/bash + +# Check if the ncbitaxon pickled graph file exists +if [ ! -f /graphs/ncbitaxon_obo_graph.pkl ]; then + echo "Pickled ncbitaxon obo graph file not found. Generating it" + python /sw/generate_obo_graphs.py +else + echo "Pickled ncbitaxon obo graph file already exists in the container in + /graphs/" +fi + neo4j start sleep 100 neo4j status diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 44eb37b13..595a1db6a 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -434,14 +434,11 @@ def extract_ontology_subtree(curie: str, add_subtree: bool = False): under the corresponding entry's subtree in its respective ontology. Relation information is also extracted with this option. 
- Running this method for the first time for each specific resource will - take a long time (minutes) as the obo resource file has to be downloaded, - converted to a networkx graph, have their node indices normalized, and - pickled. - - Subsequent runs of this method will take a few seconds as the pickled + Execution of this method will take a few seconds as the pickled graph object has to be loaded. + Currently we only support the addition of ncbitaxon terms. + Parameters ---------- curie : @@ -463,21 +460,13 @@ def extract_ontology_subtree(curie: str, add_subtree: bool = False): resource_prefix = curie.split(":")[0] if resource_prefix == "ncbitaxon": type = "class" - version = get_version(resource_prefix) - cached_relabeled_obo_graph_path = prefix_directory_join(resource_prefix, - name="relabeled_obo_graph.pkl", - version=version) - if not cached_relabeled_obo_graph_path.exists(): - _, obo_path = _ensure_ontology_path(resource_prefix, force=False, - version=version) - obo_graph = read_obo(obo_path) - relabeled_graph = networkx.relabel_nodes(obo_graph, - lambda node_index: node_index.lower()) - with open(cached_relabeled_obo_graph_path,'wb') as relabeled_graph_file: - pickle.dump(relabeled_graph, relabeled_graph_file) - else: - with open(cached_relabeled_obo_graph_path,'rb') as relabeled_graph_file: - relabeled_graph = pickle.load(relabeled_graph_file) + cached_relabeled_obo_graph_path = (Path(__file__).resolve().parents[2] + / "docker" / + "mounted_graph_storage" / + "ncbitaxon_obo_graph.pkl") + + with open(cached_relabeled_obo_graph_path,'rb') as relabeled_graph_file: + relabeled_graph = pickle.load(relabeled_graph_file) else: return nodes, edges