Skip to content

Commit

Permalink
Support extraction of multiple relationships types using Pronto (#15)
Browse files Browse the repository at this point in the history
merges #15
closes #14

Updates nxontology.imports to load multiple relationship types.

* pronto_to_multidigraph
* multidigraph_to_digraph
* read_gene_ontology
* README: example with multiple edge type support
  • Loading branch information
dhimmel committed Nov 2, 2021
1 parent edde267 commit b7679f2
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 6 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,28 @@ using pronto to load the ontology:
>>> # Gene Ontology domains are disconnected, expect 3 components
>>> nx.number_weakly_connected_components(nxo.graph)
3
>>> # Note however that the default from_file reader only uses "is a" relationships.
>>> # We can preserve all GO relationship types as follows
>>> from collections import Counter
>>> import pronto
>>> from nxontology import NXOntology
>>> from nxontology.imports import pronto_to_multidigraph, multidigraph_to_digraph
>>> go_pronto = pronto.Ontology(handle=url)
>>> go_multidigraph = pronto_to_multidigraph(go_pronto)
>>> Counter(key for _, _, key in go_multidigraph.edges(keys=True))
Counter({'is a': 71509,
'part of': 7187,
'regulates': 3216,
'negatively regulates': 2768,
'positively regulates': 2756})
>>> go_digraph = multidigraph_to_digraph(go_multidigraph)
>>> go_nxo = NXOntology(go_digraph)
>>> # Notice the similarity increases due to the full set of edges
>>> round(go_nxo.similarity("GO:0042552", "GO:0022008").lin, 3)
0.699
>>> # Note that there is also a dedicated reader for the Gene Ontology
>>> from nxontology.imports import read_gene_ontology
>>> read_gene_ontology(release="2021-02-01")
```

Users can also create their own `networkx.DiGraph` to use this package.
Expand Down
146 changes: 145 additions & 1 deletion nxontology/imports.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import logging
from datetime import date
from os import PathLike
from typing import AnyStr, BinaryIO, Union
from typing import AnyStr, BinaryIO, Counter, List, Optional, Tuple, Union, cast

import networkx as nx
from pronto import Ontology as Prontology # type: ignore [attr-defined]
from pronto.term import Term

from nxontology import NXOntology
from nxontology.exceptions import NodeNotFound
from nxontology.ontology import Node


def pronto_to_nxontology(onto: Prontology) -> NXOntology[str]:
Expand Down Expand Up @@ -62,3 +66,143 @@ def from_file(handle: Union[BinaryIO, str, "PathLike[AnyStr]"]) -> NXOntology[st
"""
onto = Prontology(handle=handle)
return pronto_to_nxontology(onto)


def _pronto_edges_for_term(
term: Term, default_rel_type: str = "is a"
) -> List[Tuple[Node, Node, str]]:
"""
Extract edges including "is a" relationships for a Pronto term.
https://github.com/althonos/pronto/issues/119#issuecomment-956541286
"""
rels = list()
source_id = cast(Node, term.id)
for target in term.superclasses(distance=1, with_self=False):
rels.append((source_id, cast(Node, target.id), default_rel_type))
for rel_type, targets in term.relationships.items():
for target in sorted(targets):
rels.append(
(
cast(Node, term.id),
cast(Node, target.id),
rel_type.name or rel_type.id,
)
)
return rels


def pronto_to_multidigraph(
onto: Prontology, default_rel_type: str = "is a"
) -> nx.MultiDiGraph:
"""
Convert a `pronto.Ontology` to a `networkx.MultiDiGraph`,
including all relationship types including "is a".
The output MultiDiGraph is not directly compatable with NXOntology,
since NXOntology assumes a DiGraph (without parallel edges) and likely
encodes edges in the reverse direction, such that edges go from
superterm to subterm.
## References
- https://github.com/althonos/pronto/issues/119
- https://github.com/related-sciences/nxontology/issues/14
"""
graph = nx.MultiDiGraph()
for term in onto.terms():
if term.obsolete:
# obsolete was unreliable in pronto < v2.4.0
# https://github.com/althonos/pronto/issues/122
continue
if term.id in graph:
logging.warning(f"Skipping node already in graph: {term}")
continue
graph.add_node(
term.id,
identifier=term.id,
label=term.name,
namespace=term.namespace,
)
for term in onto.terms():
for source, target, key in _pronto_edges_for_term( # type: ignore [var-annotated]
term, default_rel_type
):
for node in source, target:
if node not in graph:
logging.warning(
f"Skipping edge: node does not exist in graph: {node}"
)
if graph.has_edge(source, target, key):
logging.warning(
f"Skipping edge already in graph: {source} --> {target} (key={key!r})"
)
graph.add_edge(source, target, key=key)
rel_counts = Counter(key for _, _, key in graph.edges(keys=True))
logging.info(f"MultiDiGraph relationship counts:\n{rel_counts}")
return graph


def multidigraph_to_digraph(
graph: nx.MultiDiGraph, rel_types: Optional[List[str]] = None, reverse: bool = True
) -> nx.DiGraph:
"""
Convert a networkx MultiDiGraph to a DiGraph by aggregating edges accross relationship types.
Can be used to convert the output of `pronto_to_multidigraph` or the obonet package to a DiGraph
suitable for input to NXOntology. In such cases, you will likely want to set reverse=True to reverse
edges directions such that edges point from superterms to subterms.
When rel_types is None, all relationship types are preserved. If rel_types is defined,
then the MultiDiGraph is first filtered for edges with that key (relationship type).
"""
if rel_types is not None:
graph.remove_edges_from(
[
(u, v, key)
for u, v, key in graph.edges(keys=True, data=False)
if key not in rel_types
]
)
if reverse:
graph = graph.reverse(copy=True)
digraph = nx.DiGraph(graph)
for source, target in digraph.edges(data=False):
digraph[source][target]["rel_types"] = sorted(graph[source][target])
return digraph


def read_gene_ontology(
release: str = "current",
source_file: str = "go-basic.json.gz",
rel_types: Optional[List[str]] = [
"is a",
"part of",
"regulates",
"negatively regulates",
"positively regulates",
],
) -> NXOntology[str]:
"""
Load the Gene Ontology into NXOntology,
keeping the relationship types from the basic release.
GO Basic is [described as](http://geneontology.org/docs/download-ontology/):
> This is the basic version of the GO,
> filtered such that the graph is guaranteed to be acyclic
> and annotations can be propagated up the graph.
> The relations included are
> is a, part of, regulates, negatively regulates and positively regulates.
> This version excludes relationships that cross the 3 GO hierarchies.
> This version should be used with most GO-based annotation tools.
"""
if release == "current":
url = f"http://current.geneontology.org/ontology/{source_file}"
else:
date.fromisoformat(release) # check that release is a valid date
url = f"http://release.geneontology.org/{release}/ontology/{source_file}"
go_pronto = Prontology(handle=url)
go_multidigraph = pronto_to_multidigraph(go_pronto, default_rel_type="is a")
go_digraph = multidigraph_to_digraph(
go_multidigraph, rel_types=rel_types, reverse=True
)
go_nxo: NXOntology[str] = NXOntology(go_digraph)
go_nxo.graph.graph["source_url"] = url
return go_nxo
63 changes: 58 additions & 5 deletions nxontology/tests/imports_test.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
import networkx as nx
import pytest
from pronto import Ontology # type: ignore [attr-defined]

from nxontology.imports import from_file, from_obo_library
from nxontology.imports import (
from_file,
from_obo_library,
multidigraph_to_digraph,
pronto_to_multidigraph,
read_gene_ontology,
)

taxrank_formats = [
"owl",
"obo",
]


@pytest.mark.parametrize(
"format",
[
"owl",
"obo",
],
taxrank_formats,
)
def test_from_obo_library_taxrank(format: str) -> None:
"""
Expand Down Expand Up @@ -37,3 +47,46 @@ def test_from_file_go() -> None:
assert info.data["namespace"] == "biological_process"
# has edge from "axon ensheathment" to "myelination"
assert nxo.graph.has_edge("GO:0008366", "GO:0042552")


@pytest.mark.parametrize(
"format",
taxrank_formats,
)
def test_pronto_to_multidigraph(format: str) -> None:
"""
http://www.obofoundry.org/ontology/taxrank.html
"""
slug = f"taxrank.{format}"
onto = Ontology.from_obo_library(slug)
graph = pronto_to_multidigraph(onto, default_rel_type="is_a")
# subterm --> superterm: opposite of NXOntology
assert graph.has_edge(u="TAXRANK:0000034", v="TAXRANK:0000000", key="is_a")


def test_multigraph_to_digraph():
mdg = nx.MultiDiGraph()
mdg.add_edge("a", "b", key="rel_type 1")
mdg.add_edge("a", "b", key="rel_type 2")
mdg.add_edge("b", "c", key="rel_type 1")
dg = multidigraph_to_digraph(mdg)
assert dg.number_of_nodes() == 3
assert dg.number_of_edges() == 2
assert dg["b"]["a"]["rel_types"] == ["rel_type 1", "rel_type 2"]
assert dg["c"]["b"]["rel_types"] == ["rel_type 1"]
dg = multidigraph_to_digraph(mdg, reverse=False)
assert dg.has_edge("a", "b")
assert not dg.has_edge("b", "a")
dg = multidigraph_to_digraph(mdg, rel_types=["rel_type 2"])
assert dg.number_of_nodes() == 3
assert dg.number_of_edges() == 1
assert dg.has_edge("b", "a")


def test_read_gene_ontology():
nxo = read_gene_ontology(release="2021-02-01")
assert (
nxo.graph.graph["source_url"]
== "http://release.geneontology.org/2021-02-01/ontology/go-basic.json.gz"
)
assert "regulates" in nxo.graph["GO:0006310"]["GO:0000018"]["rel_types"]

0 comments on commit b7679f2

Please sign in to comment.