Fixes for issues #3, #4, #5, and #6.

geneontology · Mar 30, 2018 · 4fb735e · 4fb735e
1 parent 44bf9ac
commit 4fb735e
Show file tree

Hide file tree

Showing 2 changed files with 135 additions and 91 deletions.
diff --git a/pathway_connections.py b/pathway_connections.py
@@ -77,7 +77,6 @@ def __init__(self, id_a, id_b, mechanism, effect, direct, relation, pmid, linenu
 
         self.individuals = {}
         self.enabled_by_stmt_a = None
-        self.regulated_activity_uris = []
 
     def print(self):
         print("[UniProtKB:{ida}] <- enabled_by – [{mechanism}] – [{relation}]-> [{regulated_activity}] – enabled_by-> [UniProtKB:{idb}]".format(ida=self.id_a,
@@ -98,15 +97,17 @@ def declare_a(self, model):
             model.declare_class(self.full_id_a())
 
         # Individuals
-        if self.full_id_a() not in self.individuals:
+        # if self.full_id_a() not in self.individuals:
+        if self.full_id_a() not in model.individuals:
             if self.a_is_complex():
                 uri_a = self.complex_a.declare_entities(model)
             else:
                 uri_a = model.declare_individual(self.full_id_a())
-            self.individuals[self.full_id_a()] = uri_a
+            # self.individuals[self.full_id_a()] = uri_a
+        self.individuals[self.full_id_a()] = model.individuals[self.full_id_a()]
 
-        self.mechanism["uri"] = model.declare_individual(self.mechanism["term"])
-        self.individuals[self.mechanism["term"]] = self.mechanism["uri"]
+        # self.mechanism["uri"] = model.declare_individual(self.mechanism["term"])  # Segregate from singular entity declaration
+        # self.individuals[self.mechanism["term"]] = self.mechanism["uri"]
 
         return model
 
@@ -183,53 +184,54 @@ def full_statement_bnode_in_model(self, model):
                     return candidate_reg_triple
 
 class PathwayConnectionSet():
-    def __init__(self, filename):
+    def __init__(self, filename=None):
         self.connections = []
         linenum = 0
 
-        with open(filename, "r") as f:
-            data = list(csv.DictReader(f, delimiter="\t"))
-            for line in data:
-                linenum += 1
-
-                # If up-regulates (including any variants of this), use RO:0002629 if DIRECT, and use RO:0002213 if not DIRECT
-                relation = None
-                if line["EFFECT"].startswith("up-regulates"):
-                    if line["DIRECT"] == "YES":
-                        relation = "RO:0002629"
-                    elif line["DIRECT"] == "NO":
-                        relation = "RO:0002213"
-                # If down-regulates (including any variants of this), use RO:0002630 if DIRECT, and use RO:0002212 if not DIRECT
-                if line["EFFECT"].startswith("down-regulates"):
-                    if line["DIRECT"] == "YES":
-                        relation = "RO:0002630"
-                    elif line["DIRECT"] == "NO":
-                        relation = "RO:0002212"
-                # If unknown, use RO:0002211
-                if line["EFFECT"] == "unknown":
-                    relation = "RO:0002211"
-                # If form_complex, ignore these lines for now
-                if line["EFFECT"] == "form_complex":
-                    continue
-
-                pc = PathwayConnection(
-                    line["IDA"],
-                    line["IDB"],
-                    line["MECHANISM"],
-                    line["EFFECT"],
-                    line["DIRECT"],
-                    relation,
-                    [line["PMID"]],
-                    linenum
-                )
-
-                # if not (pc.id_a.startswith("SIGNOR") or pc.id_b.startswith("SIGNOR") or line["TYPEA"] == "phenotype" or line["TYPEB"] == "phenotype"):
-                acceptable_types = ['protein','complex']
-                if line["TYPEA"] in acceptable_types and line["TYPEB"] in acceptable_types:
-                    if self.find(pc):
-                        self.append_reference(pc)
-                    else:
-                        self.append(pc)
+        if filename:
+            with open(filename, "r") as f:
+                data = list(csv.DictReader(f, delimiter="\t"))
+                for line in data:
+                    linenum += 1
+
+                    # If up-regulates (including any variants of this), use RO:0002629 if DIRECT, and use RO:0002213 if not DIRECT
+                    relation = None
+                    if line["EFFECT"].startswith("up-regulates"):
+                        if line["DIRECT"] == "YES":
+                            relation = "RO:0002629"
+                        elif line["DIRECT"] == "NO":
+                            relation = "RO:0002213"
+                    # If down-regulates (including any variants of this), use RO:0002630 if DIRECT, and use RO:0002212 if not DIRECT
+                    if line["EFFECT"].startswith("down-regulates"):
+                        if line["DIRECT"] == "YES":
+                            relation = "RO:0002630"
+                        elif line["DIRECT"] == "NO":
+                            relation = "RO:0002212"
+                    # If unknown, use RO:0002211
+                    if line["EFFECT"] == "unknown":
+                        relation = "RO:0002211"
+                    # If form_complex, ignore these lines for now
+                    if line["EFFECT"] == "form_complex":
+                        continue
+
+                    pc = PathwayConnection(
+                        line["IDA"],
+                        line["IDB"],
+                        line["MECHANISM"],
+                        line["EFFECT"],
+                        line["DIRECT"],
+                        relation,
+                        [line["PMID"]],
+                        linenum
+                    )
+
+                    # if not (pc.id_a.startswith("SIGNOR") or pc.id_b.startswith("SIGNOR") or line["TYPEA"] == "phenotype" or line["TYPEB"] == "phenotype"):
+                    acceptable_types = ['protein','complex']
+                    if line["TYPEA"] in acceptable_types and line["TYPEB"] in acceptable_types:
+                        if self.find(pc):
+                            self.append_reference(pc)
+                        else:
+                            self.append(pc)
 
 
     def append(self, pathway_connection):
@@ -264,4 +266,22 @@ def find_other_regulated_activity(self, id_b):
             if pc.mechanism["term"] != "GO:0003674":
                 filtered_reg_pcs.append(pc)
         if len(filtered_reg_pcs) > 0:
-            return filtered_reg_pcs[0]
+            return filtered_reg_pcs[0]
+
+    def find_all_by_id_a_and_id_b(self, pathway_connection):
+        found_connections = PathwayConnectionSet()
+        for pc in self.connections:
+            if pc.id_a == pathway_connection.id_a and pc.id_b == pathway_connection.id_b:
+                found_connections.append(pc)
+        return found_connections
+
+    def find_by_mech_term(self, term):
+        for pc in self.connections:
+            if pc.mechanism["term"] == term:
+                return pc
+
+    def remove_list(self, pc_list):
+        new_connection_list = self.connections
+        for dead_pc in pc_list:
+            new_connection_list = [pc for pc in new_connection_list if not pc.equals(dead_pc)]
+        self.connections = new_connection_list
diff --git a/pathway_importer.py b/pathway_importer.py
@@ -8,6 +8,7 @@
 
 ro = OboRO()
 ENABLED_BY = URIRef(expand_uri(ro.enabled_by))
+HAS_INPUT = URIRef(expand_uri("RO:0002233"))
 
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', "--filename", type=str, required=True,
@@ -42,11 +43,18 @@ def test_label_finding(model):
                 print(labels)
         axiom_counter += 1
 
-
 def main():
+
+    ## Organize connection objects
+    ## Declare entity A GPs and MFs
+    ## Add "Has_input" relations between MF and entity B GPs
+    ##      If entity B not declared, declare it
+    ## Connect regulation relations to all MF's enabled by entity B
+    ##      If no MF for entity B, add root MF enabled by B
+
     args = parser.parse_args()
 
-    model = GoCamModel("connect_all_genes.ttl")
+    model = GoCamModel("delete_uncertain_multi_activities.ttl")
     # p_connections = PathwayConnectionSet("SIGNOR-G2-M_trans_02_03_18.tsv")
     p_connections = PathwayConnectionSet(args.filename)
     linenum = 1
@@ -57,6 +65,25 @@ def main():
     print(total_pcs)
     skipped_count = 0
 
+    # Toss out connections according to precedence rules:
+    # protein kinase activity should be chosen over protein binding
+    # This should be separate from/before any OWL individuals are declared
+    for pc in p_connections.connections:
+        # Get list of all pcs with pc.id_a and pc.id_b
+        # If len of list is > 1
+        #   Look for protein kinase activity (GO:0004672), delete others if remaining are all protein binding (GO:0005515)
+        pc_list = p_connections.find_all_by_id_a_and_id_b(pc)
+        if len(pc_list.connections) > 1:
+            the_good_one = pc_list.find_by_mech_term("GO:0004672")
+            the_bad_one = pc_list.find_by_mech_term("GO:0005515")
+            # adfkafkeae # WHY IS THIS NOT CATCHING ATM TO ABL1 MFs?
+            if the_good_one is not None and the_bad_one is not None:
+                pc_list.connections.remove(the_good_one)
+                p_connections.remove_list(pc_list.connections)
+            elif the_good_one is not None:
+                [uncertain_pc.print() for uncertain_pc in pc_list.connections]
+                p_connections.remove_list(pc_list.connections)
+
     # fill in regulated activities
     for pc in p_connections.connections:
         regulated_activity_pc = p_connections.find_other_regulated_activity(pc.id_b)
@@ -74,38 +101,36 @@ def main():
         else:
             pc.regulated_activity["term"] = regulated_activity_term
             pc.regulated_activity["uri"] = regulated_activity_term_uri
-            # pc.individuals[pc.regulated_activity["term"]] = regulated_activity_term_uri
-
-        # model = pc.declare_entities(model)
-
-        # enabled_by_stmt_a = model.writer.emit(model.individuals[pc.mechanism_go_term], ENABLED_BY, model.individuals[pc.full_id_a()])
-        # if pc.mechanism["term"] in pc.individuals and not model_contains_statement(model, pc.individuals[pc.mechanism["term"]], ENABLED_BY, pc.full_id_a()):
-        # full_statement = pc.full_statement_bnode_in_model(model)
-        # if pc.mechanism["term"] in pc.individuals and full_statement is None:
-        # if full_statement is None:
-        # print("Hey " + pc.full_id_a())
-        # if pc.id_a == "Q13315" and pc.id_b == "P38398":
-        #     print("Dang " + pc.pmid[0])
-        # model = pc.declare_entities(model)
 
         # enabled_by_stmt_a_triple = (pc.mechanism["uri"], ENABLED_BY, pc.individuals[pc.full_id_a()])
         if pc.a_is_complex():
             entity_a = pc.complex_a.uri_in_model(model)
         else:
             entity_a = pc.full_id_a()
-        if entity_a is not None:
-            enabled_by_stmt_a_triples = model.triples_by_ids(pc.mechanism["term"], ENABLED_BY, entity_a)
-        else:
-            enabled_by_stmt_a_triples = []
-        if len(enabled_by_stmt_a_triples) == 0:
-            # If triple A doesn't exist for entities, declare individuals and create it
-            # enabled_by_stmt_a = model.writer.emit(pc.mechanism["term"], ENABLED_BY, pc.full_id_a())
-            model = pc.declare_a(model)
-            pc.enabled_by_stmt_a = model.writer.emit(pc.mechanism["uri"], ENABLED_BY, pc.individuals[pc.full_id_a()])
-            model.add_axiom(pc.enabled_by_stmt_a)
+        # Don't care about existing "statements", just look for existing entity A GP and always create new enabled by statement
+        # if entity_a is not None:
+        #     enabled_by_stmt_a_triples = model.triples_by_ids(pc.mechanism["term"], ENABLED_BY, entity_a)
+        # else:
+        #     enabled_by_stmt_a_triples = []
+        # if len(enabled_by_stmt_a_triples) == 0:
+
+        # If triple A doesn't exist for entities, declare individuals and create it
+        # enabled_by_stmt_a = model.writer.emit(pc.mechanism["term"], ENABLED_BY, pc.full_id_a())
+        if entity_a is None or not isinstance(entity_a, URIRef):
+            for uri_a in model.uri_list_for_individual(pc.full_id_a()):
+                pc.individuals[pc.full_id_a()] = uri_a
+            if pc.full_id_a() not in pc.individuals:
+                model = pc.declare_a(model)
         else:
-            pc.enabled_by_stmt_a = enabled_by_stmt_a_triples[0]
+            pc.individuals[pc.full_id_a()] = entity_a
+        pc.mechanism["uri"] = model.declare_individual(pc.mechanism["term"])
+        pc.individuals[pc.mechanism["term"]] = pc.mechanism["uri"]
+        pc.enabled_by_stmt_a = model.writer.emit(pc.mechanism["uri"], ENABLED_BY, pc.individuals[pc.full_id_a()])
+        axiom_a = model.add_axiom(pc.enabled_by_stmt_a)
+        model.add_evidence(axiom_a, "EXP", ["PMID:" + pmid for pmid in pc.pmid])
 
+        # else:
+        #     pc.enabled_by_stmt_a = enabled_by_stmt_a_triples[0]
 
     # Now that the a's are declared, go check on the b's.
     for pc in p_connections.connections:
@@ -118,28 +143,29 @@ def main():
             enabled_by_stmt_b_triples = model.triples_by_ids(None, ENABLED_BY, entity_b)
         else:
             enabled_by_stmt_b_triples = []
+        # If pointing to activity
         regulated_activity_uris = []
         for b_triple in enabled_by_stmt_b_triples:
             regulated_activity_uris.append(b_triple[0])
+        # If pointing to GP
+        entity_b_uris = model.uri_list_for_individual(pc.full_id_b())
         if len(regulated_activity_uris) == 0:
             model = pc.declare_b(model)
             enabled_by_stmt_b = model.writer.emit(pc.regulated_activity["uri"], ENABLED_BY,
                                                   pc.individuals[pc.full_id_b()])
-            model.add_axiom(enabled_by_stmt_b)
+            axiom_b = model.add_axiom(enabled_by_stmt_b)
+            # model.add_evidence(axiom_b, "EXP", ["PMID:" + pmid for pmid in pc.pmid])  # Maybe don't want to add evidence to B since we're assuming these statements?
             regulated_activity_uris.append(enabled_by_stmt_b[0])
-        # if enabled_by_stmt_b_triple in model.writer.writer.graph:
-        #     enabled_by_stmt_b = next(model.writer.writer.graph.triples(enabled_by_stmt_b_triple))
-        # else:
-        #     enabled_by_stmt_b = model.writer.emit(enabled_by_stmt_b_triple[0], enabled_by_stmt_b_triple[1], enabled_by_stmt_b_triple[2])
-        #     axiom_b = model.add_axiom(enabled_by_stmt_b)
+            entity_b_uris.append(pc.individuals[pc.full_id_b()])
+
+        for entity_b_uri in entity_b_uris:
+            model.writer.emit(pc.enabled_by_stmt_a[0], HAS_INPUT, entity_b_uri)
+            relation_axiom = model.writer.emit_axiom(pc.enabled_by_stmt_a[0], HAS_INPUT, entity_b_uri)
 
         # Connect the two activities
         # Decouple this from ENABLED_BY statements to allow multiple regulation relations from one GP-MF node - issue #2
-        # source_id = model.individuals[pc.mechanism_go_term]
         source_id = pc.enabled_by_stmt_a[0]
         property_id = URIRef(expand_uri(pc.relation))
-        # target_id = model.individuals[pc.regulated_activity_term]
-        # target_id = pc.regulated_activity["uri"]    # This should be an array of target activities (individual_list)
         # if not model_contains_statement(model, source_id, property_id, pc.regulated_activity["term"]):  # Make into for loop
         for reg_activity_uri in regulated_activity_uris:
             target_id = reg_activity_uri
@@ -148,16 +174,14 @@ def main():
             # Add axiom (Source=MF term URI, Property=relation code, Target=MF term URI)
             relation_axiom = model.writer.emit_axiom(source_id, property_id, target_id)
             model.add_evidence(relation_axiom, "EXP", ["PMID:" + pmid for pmid in pc.pmid])
-        # else:
-        #     print("2")
-
-        # pc.print()
 
     with open(model.filepath, 'wb') as f:
         model.writer.writer.serialize(destination=f)
 
     print(skipped_count)
 
+    grouped = map(lambda x:x.id_a, p_connections.connections)
+    print(grouped)
+
 if __name__ == '__main__':
-    main()
-    print("hey")
+    main()