diff --git a/README.md b/README.md index 0eeb7ab..d7d5b80 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ Counter({'is a': 71509, 'regulates': 3216, 'negatively regulates': 2768, 'positively regulates': 2756}) ->>> go_digraph = multidigraph_to_digraph(go_multidigraph) +>>> go_digraph = multidigraph_to_digraph(go_multidigraph, reduce=True) >>> go_nxo = NXOntology(go_digraph) >>> # Notice the similarity increases due to the full set of edges >>> round(go_nxo.similarity("GO:0042552", "GO:0022008").lin, 3) diff --git a/nxontology/imports.py b/nxontology/imports.py index bcc334b..5820a75 100644 --- a/nxontology/imports.py +++ b/nxontology/imports.py @@ -142,7 +142,10 @@ def pronto_to_multidigraph( def multidigraph_to_digraph( - graph: nx.MultiDiGraph, rel_types: Optional[List[str]] = None, reverse: bool = True + graph: nx.MultiDiGraph, + rel_types: Optional[List[str]] = None, + reverse: bool = True, + reduce: bool = False, ) -> nx.DiGraph: """ Convert a networkx MultiDiGraph to a DiGraph by aggregating edges accross relationship types. @@ -152,7 +155,13 @@ def multidigraph_to_digraph( When rel_types is None, all relationship types are preserved. If rel_types is defined, then the MultiDiGraph is first filtered for edges with that key (relationship type). + + If reduce is True, perform a transitive reduction on the DiGraph + to produce a minimum equivalent graph that removes redundant relationships + — i.e. those that are already captured by a more specific ancestral path. + The default is reduce=False since the reduction can be a computationally expensive step. """ + logging.info(f"Received MultiDiGraph with {graph.number_of_edges():,} edges.") if rel_types is not None: graph.remove_edges_from( [ @@ -161,11 +170,23 @@ def multidigraph_to_digraph( if key not in rel_types ] ) + logging.info( + f"Filtered MultiDiGraph to {graph.number_of_edges():,} edges of the following types: {rel_types}." + ) if reverse: graph = graph.reverse(copy=True) digraph = nx.DiGraph(graph) + if reduce: + n_edges_before = digraph.number_of_edges() + digraph = nx.transitive_reduction(digraph) + logging.info( + f"Reduced DiGraph by removing {n_edges_before - digraph.number_of_edges():,} redundant edges." + ) for source, target in digraph.edges(data=False): digraph[source][target]["rel_types"] = sorted(graph[source][target]) + logging.info( + f"Converted MultiDiGraph to DiGraph with {digraph.number_of_nodes():,} nodes and {digraph.number_of_edges():,} edges." + ) return digraph @@ -179,6 +200,7 @@ def read_gene_ontology( "negatively regulates", "positively regulates", ], + reduce: bool = True, ) -> NXOntology[str]: """ Load the Gene Ontology into NXOntology, @@ -198,10 +220,14 @@ def read_gene_ontology( else: date.fromisoformat(release) # check that release is a valid date url = f"http://release.geneontology.org/{release}/ontology/{source_file}" + logging.info(f"Loading Gene Ontology into Pronto from <{url}>.") go_pronto = Prontology(handle=url) go_multidigraph = pronto_to_multidigraph(go_pronto, default_rel_type="is a") go_digraph = multidigraph_to_digraph( - go_multidigraph, rel_types=rel_types, reverse=True + go_multidigraph, + rel_types=rel_types, + reverse=True, + reduce=reduce, ) go_nxo: NXOntology[str] = NXOntology(go_digraph) go_nxo.graph.graph["source_url"] = url diff --git a/nxontology/tests/imports_test.py b/nxontology/tests/imports_test.py index c36f2a7..ba007d6 100644 --- a/nxontology/tests/imports_test.py +++ b/nxontology/tests/imports_test.py @@ -90,3 +90,11 @@ def test_read_gene_ontology(): == "http://release.geneontology.org/2021-02-01/ontology/go-basic.json.gz" ) assert "regulates" in nxo.graph["GO:0006310"]["GO:0000018"]["rel_types"] + # Transitive reduction should remove this edge + # from "defense response to insect" to "negative regulation of defense response to insect" + # since it is redundant with a more specific ancestral path. + # https://github.com/related-sciences/nxontology/pull/16 + assert not nxo.graph.has_edge("GO:0002213", "GO:1900366") + # GO:0002213 --> GO:2000068 --> GO:1900366 is more specific + assert nxo.graph.has_edge("GO:0002213", "GO:2000068") + assert nxo.graph.has_edge("GO:2000068", "GO:1900366")