Skip to content

Commit

Permalink
Incorporate xrefs into DKG
Browse files (browse the repository at this point in the history)
  • Loading branch information
cthoyt authored and bgyori committed Jan 24, 2024
1 parent 8ce0d70 commit 0642aa0
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 16 deletions.
27 changes: 16 additions & 11 deletions mira/dkg/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import pyobo
import pystow
from bioontologies import obograph
from bioontologies.obograph import Xref
from bioregistry import manager
from pydantic import BaseModel, Field
from pyobo.struct import part_of
Expand Down Expand Up @@ -231,7 +232,7 @@ def main(
config=config,
refresh=refresh,
do_upload=do_upload,
add_xref_edges=add_xref_edges,
add_xref_edges=True,
summaries=summaries
)

Expand Down Expand Up @@ -642,6 +643,10 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
]
_results_pickle_path.write_bytes(pickle.dumps(parse_results))

if parse_results.graph_document is None:
click.secho(f"No graphs in {prefix}, skipping", fg="red")
continue

_graphs = parse_results.graph_document.graphs
click.secho(
f"{manager.get_name(prefix)} ({len(_graphs)} graphs)", fg="green", bold=True
Expand Down Expand Up @@ -759,29 +764,29 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:

if add_xref_edges:
for xref in node.xrefs:
try:
xref_curie = xref.curie
except ValueError:
if not isinstance(xref, Xref):
raise TypeError(f"Invalid type: {type(xref)}: {xref}")
if not xref.value:
continue
if xref_curie.split(":", 1)[0] in obograph.PROVENANCE_PREFIXES:
if xref.value.prefix in obograph.PROVENANCE_PREFIXES:
# Don't add provenance information as xrefs
continue
edges.append(
(
node.curie,
xref.curie,
xref.value.curie,
"xref",
"oboinowl:hasDbXref",
prefix,
graph_id,
version or "",
)
)
if xref_curie not in nodes:
if xref.value.curie not in nodes:
node_sources[node.replaced_by].add(prefix)
nodes[xref_curie] = NodeInfo(
curie=xref.curie,
prefix=xref.prefix,
nodes[xref.value.curie] = NodeInfo(
curie=xref.value.curie,
prefix=xref.value.prefix,
label="",
synonyms="",
deprecated="false",
Expand All @@ -798,7 +803,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:

for provenance in node.get_provenance():
if ":" in provenance.identifier:
tqdm.write(f"Malformed provenance for {node.curie}")
tqdm.write(f"Malformed provenance for {node.curie}: {provenance}")
provenance_curie = provenance.curie
node_sources[provenance_curie].add(prefix)
if provenance_curie not in nodes:
Expand Down
10 changes: 8 additions & 2 deletions mira/dkg/construct_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
def _construct_embeddings(upload: bool, use_case_paths: UseCasePaths) -> None:
with TemporaryDirectory() as directory:
path = os.path.join(directory, use_case_paths.EDGES_PATH.stem)
with gzip.open(use_case_paths.EDGES_PATH, "rb") as f_in, open(path, "wb") as f_out:
with gzip.open(use_case_paths.EDGES_PATH, "rb") as f_in, open(
path, "wb"
) as f_out:
shutil.copyfileobj(f_in, f_out)
graph = Graph.from_csv(
edge_path=path,
Expand All @@ -26,12 +28,16 @@ def _construct_embeddings(upload: bool, use_case_paths: UseCasePaths) -> None:
directed=True,
name="MIRA-DKG",
)
# TODO remove disconnected nodes
# graph = graph.remove_disconnected_nodes()
embedding = SecondOrderLINEEnsmallen(embedding_size=32).fit_transform(graph)
df = embedding.get_all_node_embedding()[0].sort_index()
df.index.name = "node"
df.to_csv(use_case_paths.EMBEDDINGS_PATH, sep="\t")
if upload:
upload_s3(use_case_paths.EMBEDDINGS_PATH, use_case=use_case_paths.use_case)
upload_s3(
use_case_paths.EMBEDDINGS_PATH, use_case=use_case_paths.use_case
)


@click.command()
Expand Down
11 changes: 8 additions & 3 deletions mira/dkg/construct_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,15 @@ def _construct_rdf(upload: bool, *, use_case_paths: UseCasePaths):
graph.add((_ref(s), p_ref, _ref(o)))

tqdm.write("serializing to turtle")
with gzip.open(use_case_paths.RDF_TTL_PATH, "wb") as file:
graph.serialize(file, format="turtle")
tqdm.write("done")
try:
with gzip.open(use_case_paths.RDF_TTL_PATH, "wb") as file:
graph.serialize(file, format="turtle")
except Exception as e:
click.secho("Failed to serialize RDF", fg="red")
click.echo(str(e))
return

tqdm.write("done")
if upload:
upload_s3(use_case_paths.RDF_TTL_PATH, use_case=use_case_paths.use_case)

Expand Down

0 comments on commit 0642aa0

Please sign in to comment.