diff --git a/README.md b/README.md index 7706b58..0eeb7ab 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,28 @@ using pronto to load the ontology: >>> # Gene Ontology domains are disconnected, expect 3 components >>> nx.number_weakly_connected_components(nxo.graph) 3 +>>> # Note however that the default from_file reader only uses "is a" relationships. +>>> # We can preserve all GO relationship types as follows +>>> from collections import Counter +>>> import pronto +>>> from nxontology import NXOntology +>>> from nxontology.imports import pronto_to_multidigraph, multidigraph_to_digraph +>>> go_pronto = pronto.Ontology(handle=url) +>>> go_multidigraph = pronto_to_multidigraph(go_pronto) +>>> Counter(key for _, _, key in go_multidigraph.edges(keys=True)) +Counter({'is a': 71509, + 'part of': 7187, + 'regulates': 3216, + 'negatively regulates': 2768, + 'positively regulates': 2756}) +>>> go_digraph = multidigraph_to_digraph(go_multidigraph) +>>> go_nxo = NXOntology(go_digraph) +>>> # Notice the similarity increases due to the full set of edges +>>> round(go_nxo.similarity("GO:0042552", "GO:0022008").lin, 3) +0.699 +>>> # Note that there is also a dedicated reader for the Gene Ontology +>>> from nxontology.imports import read_gene_ontology +>>> read_gene_ontology(release="2021-02-01") ``` Users can also create their own `networkx.DiGraph` to use this package. diff --git a/nxontology/imports.py b/nxontology/imports.py index db3d12b..bcc334b 100644 --- a/nxontology/imports.py +++ b/nxontology/imports.py @@ -1,11 +1,15 @@ import logging +from datetime import date from os import PathLike -from typing import AnyStr, BinaryIO, Union +from typing import AnyStr, BinaryIO, Counter, List, Optional, Tuple, Union, cast +import networkx as nx from pronto import Ontology as Prontology # type: ignore [attr-defined] +from pronto.term import Term from nxontology import NXOntology from nxontology.exceptions import NodeNotFound +from nxontology.ontology import Node def pronto_to_nxontology(onto: Prontology) -> NXOntology[str]: @@ -62,3 +66,143 @@ def from_file(handle: Union[BinaryIO, str, "PathLike[AnyStr]"]) -> NXOntology[st """ onto = Prontology(handle=handle) return pronto_to_nxontology(onto) + + +def _pronto_edges_for_term( + term: Term, default_rel_type: str = "is a" +) -> List[Tuple[Node, Node, str]]: + """ + Extract edges including "is a" relationships for a Pronto term. + https://github.com/althonos/pronto/issues/119#issuecomment-956541286 + """ + rels = list() + source_id = cast(Node, term.id) + for target in term.superclasses(distance=1, with_self=False): + rels.append((source_id, cast(Node, target.id), default_rel_type)) + for rel_type, targets in term.relationships.items(): + for target in sorted(targets): + rels.append( + ( + cast(Node, term.id), + cast(Node, target.id), + rel_type.name or rel_type.id, + ) + ) + return rels + + +def pronto_to_multidigraph( + onto: Prontology, default_rel_type: str = "is a" +) -> nx.MultiDiGraph: + """ + Convert a `pronto.Ontology` to a `networkx.MultiDiGraph`, + including all relationship types including "is a". + The output MultiDiGraph is not directly compatable with NXOntology, + since NXOntology assumes a DiGraph (without parallel edges) and likely + encodes edges in the reverse direction, such that edges go from + superterm to subterm. + + ## References + + - https://github.com/althonos/pronto/issues/119 + - https://github.com/related-sciences/nxontology/issues/14 + """ + graph = nx.MultiDiGraph() + for term in onto.terms(): + if term.obsolete: + # obsolete was unreliable in pronto < v2.4.0 + # https://github.com/althonos/pronto/issues/122 + continue + if term.id in graph: + logging.warning(f"Skipping node already in graph: {term}") + continue + graph.add_node( + term.id, + identifier=term.id, + label=term.name, + namespace=term.namespace, + ) + for term in onto.terms(): + for source, target, key in _pronto_edges_for_term( # type: ignore [var-annotated] + term, default_rel_type + ): + for node in source, target: + if node not in graph: + logging.warning( + f"Skipping edge: node does not exist in graph: {node}" + ) + if graph.has_edge(source, target, key): + logging.warning( + f"Skipping edge already in graph: {source} --> {target} (key={key!r})" + ) + graph.add_edge(source, target, key=key) + rel_counts = Counter(key for _, _, key in graph.edges(keys=True)) + logging.info(f"MultiDiGraph relationship counts:\n{rel_counts}") + return graph + + +def multidigraph_to_digraph( + graph: nx.MultiDiGraph, rel_types: Optional[List[str]] = None, reverse: bool = True +) -> nx.DiGraph: + """ + Convert a networkx MultiDiGraph to a DiGraph by aggregating edges accross relationship types. + Can be used to convert the output of `pronto_to_multidigraph` or the obonet package to a DiGraph + suitable for input to NXOntology. In such cases, you will likely want to set reverse=True to reverse + edges directions such that edges point from superterms to subterms. + + When rel_types is None, all relationship types are preserved. If rel_types is defined, + then the MultiDiGraph is first filtered for edges with that key (relationship type). + """ + if rel_types is not None: + graph.remove_edges_from( + [ + (u, v, key) + for u, v, key in graph.edges(keys=True, data=False) + if key not in rel_types + ] + ) + if reverse: + graph = graph.reverse(copy=True) + digraph = nx.DiGraph(graph) + for source, target in digraph.edges(data=False): + digraph[source][target]["rel_types"] = sorted(graph[source][target]) + return digraph + + +def read_gene_ontology( + release: str = "current", + source_file: str = "go-basic.json.gz", + rel_types: Optional[List[str]] = [ + "is a", + "part of", + "regulates", + "negatively regulates", + "positively regulates", + ], +) -> NXOntology[str]: + """ + Load the Gene Ontology into NXOntology, + keeping the relationship types from the basic release. + GO Basic is [described as](http://geneontology.org/docs/download-ontology/): + + > This is the basic version of the GO, + > filtered such that the graph is guaranteed to be acyclic + > and annotations can be propagated up the graph. + > The relations included are + > is a, part of, regulates, negatively regulates and positively regulates. + > This version excludes relationships that cross the 3 GO hierarchies. + > This version should be used with most GO-based annotation tools. + """ + if release == "current": + url = f"http://current.geneontology.org/ontology/{source_file}" + else: + date.fromisoformat(release) # check that release is a valid date + url = f"http://release.geneontology.org/{release}/ontology/{source_file}" + go_pronto = Prontology(handle=url) + go_multidigraph = pronto_to_multidigraph(go_pronto, default_rel_type="is a") + go_digraph = multidigraph_to_digraph( + go_multidigraph, rel_types=rel_types, reverse=True + ) + go_nxo: NXOntology[str] = NXOntology(go_digraph) + go_nxo.graph.graph["source_url"] = url + return go_nxo diff --git a/nxontology/tests/imports_test.py b/nxontology/tests/imports_test.py index 523676c..c36f2a7 100644 --- a/nxontology/tests/imports_test.py +++ b/nxontology/tests/imports_test.py @@ -1,14 +1,24 @@ +import networkx as nx import pytest +from pronto import Ontology # type: ignore [attr-defined] -from nxontology.imports import from_file, from_obo_library +from nxontology.imports import ( + from_file, + from_obo_library, + multidigraph_to_digraph, + pronto_to_multidigraph, + read_gene_ontology, +) + +taxrank_formats = [ + "owl", + "obo", +] @pytest.mark.parametrize( "format", - [ - "owl", - "obo", - ], + taxrank_formats, ) def test_from_obo_library_taxrank(format: str) -> None: """ @@ -37,3 +47,46 @@ def test_from_file_go() -> None: assert info.data["namespace"] == "biological_process" # has edge from "axon ensheathment" to "myelination" assert nxo.graph.has_edge("GO:0008366", "GO:0042552") + + +@pytest.mark.parametrize( + "format", + taxrank_formats, +) +def test_pronto_to_multidigraph(format: str) -> None: + """ + http://www.obofoundry.org/ontology/taxrank.html + """ + slug = f"taxrank.{format}" + onto = Ontology.from_obo_library(slug) + graph = pronto_to_multidigraph(onto, default_rel_type="is_a") + # subterm --> superterm: opposite of NXOntology + assert graph.has_edge(u="TAXRANK:0000034", v="TAXRANK:0000000", key="is_a") + + +def test_multigraph_to_digraph(): + mdg = nx.MultiDiGraph() + mdg.add_edge("a", "b", key="rel_type 1") + mdg.add_edge("a", "b", key="rel_type 2") + mdg.add_edge("b", "c", key="rel_type 1") + dg = multidigraph_to_digraph(mdg) + assert dg.number_of_nodes() == 3 + assert dg.number_of_edges() == 2 + assert dg["b"]["a"]["rel_types"] == ["rel_type 1", "rel_type 2"] + assert dg["c"]["b"]["rel_types"] == ["rel_type 1"] + dg = multidigraph_to_digraph(mdg, reverse=False) + assert dg.has_edge("a", "b") + assert not dg.has_edge("b", "a") + dg = multidigraph_to_digraph(mdg, rel_types=["rel_type 2"]) + assert dg.number_of_nodes() == 3 + assert dg.number_of_edges() == 1 + assert dg.has_edge("b", "a") + + +def test_read_gene_ontology(): + nxo = read_gene_ontology(release="2021-02-01") + assert ( + nxo.graph.graph["source_url"] + == "http://release.geneontology.org/2021-02-01/ontology/go-basic.json.gz" + ) + assert "regulates" in nxo.graph["GO:0006310"]["GO:0000018"]["rel_types"]