Skip to content

Commit

Permalink
Merge pull request #1460 from bgyori/sqlite
Browse files Browse the repository at this point in the history
Add SQLite-based ontology
  • Loading branch information
bgyori authored Nov 19, 2024
2 parents fa3e35a + 190e02a commit b451d07
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 1 deletion.
2 changes: 1 addition & 1 deletion indra/ontology/bio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@

indra_ontology_url = get_config('INDRA_ONTOLOGY_URL')
bio_ontology = BioOntology() if not indra_ontology_url else \
VirtualOntology(url=indra_ontology_url)
VirtualOntology(url=indra_ontology_url)
184 changes: 184 additions & 0 deletions indra/ontology/bio/sqlite_ontology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""This module implements an SQLite back end to the
INDRA BioOntology."""

import os
import json
import sqlite3
import logging
from collections import defaultdict
from indra.ontology.ontology_graph import IndraOntology
from indra.ontology.bio.ontology import CACHE_DIR
from indra.ontology.bio import bio_ontology


logger = logging.getLogger(__name__)


# Default path of the SQLite ontology database file: a file named
# 'bio_ontology.db' inside the bio ontology's CACHE_DIR.
DEFAULT_SQLITE_ONTOLOGY = os.path.join(CACHE_DIR, 'bio_ontology.db')


class SqliteOntology(IndraOntology):
    """An ontology that answers queries from an SQLite database.

    On construction, the database at `db_path` is built with
    `build_sqlite_ontology` if it does not exist yet, and a connection
    to it is opened. Hierarchy and node property queries are then
    answered with SQL lookups against that database.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file. Defaults to
        DEFAULT_SQLITE_ONTOLOGY.

    Attributes
    ----------
    db_path : str
        Path to the SQLite database file in use.
    conn : sqlite3.Connection
        The open database connection.
    cur : sqlite3.Cursor
        The cursor on `conn` used for all queries.
    """

    def __init__(self, db_path=DEFAULT_SQLITE_ONTOLOGY):
        super().__init__()
        self.db_path = db_path
        # This is a no-op if the database file already exists
        build_sqlite_ontology(db_path)
        # Keep a reference to the connection so that it can be closed
        # explicitly via close()
        self.conn = sqlite3.connect(db_path)
        self.cur = self.conn.cursor()

    def close(self):
        """Close the cursor and the underlying database connection.

        The query methods of this object cannot be used after the
        connection has been closed.
        """
        self.cur.close()
        self.conn.close()

    def _lookup_terms(self, query, ns, id):
        """Return the (namespace, ID) tuples stored for a lookup query.

        Parameters
        ----------
        query : str
            An SQL query with two placeholders (ID first, then namespace)
            selecting a single column that contains comma-separated
            `ns:id` labels.
        ns : str
            The namespace of the entry to look up.
        id : str
            The ID of the entry to look up.

        Returns
        -------
        list of tuple
            One (namespace, ID) tuple per stored label; empty if there is
            no matching row or the stored value is empty.
        """
        self.cur.execute(query, (id, ns))
        row = self.cur.fetchone()
        if row is None or not row[0]:
            return []
        # Split on the first colon only since IDs can themselves
        # contain colons
        return [tuple(label.split(':', 1)) for label in row[0].split(',')]

    def isa_or_partof(self, ns1, id1, ns2, id2):
        """Return True if a child/parent row exists for the two entries.

        Parameters
        ----------
        ns1 : str
            Namespace of the child entry.
        id1 : str
            ID of the child entry.
        ns2 : str
            Namespace of the parent entry.
        id2 : str
            ID of the parent entry.

        Returns
        -------
        bool
            True if the relationships table contains a row with
            (ns1, id1) as child and (ns2, id2) as parent.
        """
        q = """SELECT 1 FROM relationships
               WHERE child_id=? AND child_ns=? AND parent_id=? AND parent_ns=?
               LIMIT 1;"""
        self.cur.execute(q, (id1, ns1, id2, ns2))
        return self.cur.fetchone() is not None

    def child_rel(self, ns, id, rel_types):
        """Yield the (namespace, ID) tuples stored as children of an entry.

        Parameters
        ----------
        ns : str
            Namespace of the parent entry.
        id : str
            ID of the parent entry.
        rel_types : set of str
            Accepted for interface compatibility but not used: the
            child_lookup table does not record relation types.

        Yields
        ------
        tuple
            A (namespace, ID) tuple for each stored child.
        """
        q = """SELECT children FROM child_lookup
               WHERE parent_id=? AND parent_ns=?
               LIMIT 1;"""
        yield from self._lookup_terms(q, ns, id)

    def get_parents(self, ns, id):
        """Return the list of (namespace, ID) parents of an entry."""
        return list(self.parent_rel(ns, id, {'isa', 'partof'}))

    def get_children(self, ns, id, ns_filter=None):
        """Return the list of (namespace, ID) children of an entry.

        Parameters
        ----------
        ns : str
            Namespace of the parent entry.
        id : str
            ID of the parent entry.
        ns_filter : Optional[collection of str]
            If given and non-empty, only children whose namespace is
            in this collection are returned.

        Returns
        -------
        list of tuple
            The (namespace, ID) tuples of the children.
        """
        children = list(self.child_rel(ns, id, {'isa', 'partof'}))
        if ns_filter:
            children = [(cns, cid) for cns, cid in children
                        if cns in ns_filter]
        return children

    def parent_rel(self, ns, id, rel_types):
        """Yield the (namespace, ID) tuples stored as parents of an entry.

        Parameters
        ----------
        ns : str
            Namespace of the child entry.
        id : str
            ID of the child entry.
        rel_types : set of str
            Accepted for interface compatibility but not used: the
            parent_lookup table does not record relation types.

        Yields
        ------
        tuple
            A (namespace, ID) tuple for each stored parent.
        """
        q = """SELECT parents FROM parent_lookup
               WHERE child_id=? AND child_ns=?
               LIMIT 1;"""
        yield from self._lookup_terms(q, ns, id)

    def get_node_property(self, ns, id, property):
        """Return a given property of an entry, or None if unavailable.

        Parameters
        ----------
        ns : str
            Namespace of the entry.
        id : str
            ID of the entry.
        property : str
            The name of the property to look up.

        Returns
        -------
        The value stored under `property` in the entry's JSON-encoded
        properties, or None if the entry has no row in the
        node_properties table or does not have the property.
        """
        q = """SELECT properties FROM node_properties
               WHERE id=? AND ns=?
               LIMIT 1;"""
        self.cur.execute(q, (id, ns))
        res = self.cur.fetchone()
        if res is None:
            return None
        props = json.loads(res[0])
        return props.get(property)

    def get_id_from_name(self, ns, name):
        """Return None; name-based lookup is not implemented here."""
        return None


def build_sqlite_ontology(db_path=DEFAULT_SQLITE_ONTOLOGY, force=False):
    """Build the SQLite database representing the INDRA BioOntology.

    The database contains a `relationships` table with the child/parent
    pairs of the bio ontology's transitive closure, `child_lookup` and
    `parent_lookup` tables mapping each entry to a comma-separated list
    of its `ns:id` children or parents, and a `node_properties` table
    with the JSON-encoded properties of each node.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file to create. Defaults to
        DEFAULT_SQLITE_ONTOLOGY.
    force : bool
        If True, any existing file at `db_path` is removed and the
        database is rebuilt. If False (default) and the file already
        exists, nothing is done.

    Raises
    ------
    Exception
        Any error raised while building is propagated after the
        partially written database file has been removed.
    """
    # If the database already exists and we are not forcing a rebuild, return
    if os.path.exists(db_path) and not force:
        return

    if force:
        try:
            logger.info('Removing existing SQLite ontology at %s', db_path)
            os.remove(db_path)
        except FileNotFoundError:
            pass

    # Initialize the bio ontology and build the transitive closure
    bio_ontology.initialize()
    bio_ontology._build_transitive_closure()

    # Set up connection
    conn = sqlite3.connect(db_path)
    logger.info('Building SQLite ontology at %s', db_path)
    try:
        _populate_sqlite_ontology(conn.cursor())
        conn.commit()
    except BaseException:
        # A partially written file would pass the existence check above
        # on the next call and be used as if it were complete, so it is
        # removed here. BaseException is caught so that this also
        # happens when the build is interrupted from the keyboard.
        conn.close()
        try:
            os.remove(db_path)
        except OSError:
            pass
        raise
    conn.close()
    logger.info('Finished building SQLite ontology')


def _populate_sqlite_ontology(cur):
    """Create and fill all ontology tables using the given cursor.

    The bio ontology is expected to be initialized with its transitive
    closure built. Committing is left to the caller.
    """
    # First, we create the relationships table and populate
    # it with child/parent pairs
    cur.execute("""CREATE TABLE relationships (
                       child_id TEXT NOT NULL,
                       child_ns TEXT NOT NULL,
                       parent_id TEXT NOT NULL,
                       parent_ns TEXT NOT NULL,
                       UNIQUE (child_id, child_ns, parent_id, parent_ns)
                   );""")

    # Insert into the database in chunks
    chunk_size = 10000
    tc = sorted(bio_ontology.transitive_closure)
    all_children = defaultdict(set)
    all_parents = defaultdict(set)
    for start in range(0, len(tc), chunk_size):
        rows = []
        for child, parent in tc[start:start + chunk_size]:
            # Split on the first colon only since IDs can themselves
            # contain colons
            cns, cid = child.split(':', 1)
            pns, pid = parent.split(':', 1)
            rows.append((cid, cns, pid, pns))
            all_children[(pid, pns)].add(child)
            all_parents[(cid, cns)].add(parent)
        cur.executemany("""INSERT INTO relationships (child_id,
                               child_ns, parent_id, parent_ns)
                           VALUES (?, ?, ?, ?);""", rows)
    # NOTE(review): the UNIQUE constraint above likely already provides
    # an index over the same columns, which would make this explicit
    # index redundant - confirm before removing.
    cur.execute("""CREATE INDEX idx_child_parent ON relationships
                   (child_id, child_ns, parent_id, parent_ns);""")

    # Next, create child and parent lookup tables and populate them
    cur.execute("""CREATE TABLE child_lookup (
                       parent_id TEXT NOT NULL,
                       parent_ns TEXT NOT NULL,
                       children TEXT NOT NULL,
                       UNIQUE (parent_id, parent_ns)
                   );""")
    cur.execute("""CREATE TABLE parent_lookup (
                       child_id TEXT NOT NULL,
                       child_ns TEXT NOT NULL,
                       parents TEXT NOT NULL,
                       UNIQUE (child_id, child_ns)
                   );""")
    # The labels are sorted so that the stored strings do not depend
    # on set iteration order and are the same from one build to the next
    cur.executemany(
        "INSERT INTO child_lookup (parent_id, parent_ns, children) "
        "VALUES (?, ?, ?);",
        ((pid, pns, ','.join(sorted(children)))
         for (pid, pns), children in all_children.items()))
    cur.executemany(
        "INSERT INTO parent_lookup (child_id, child_ns, parents) "
        "VALUES (?, ?, ?);",
        ((cid, cns, ','.join(sorted(parents)))
         for (cid, cns), parents in all_parents.items()))
    # Now add indices to the lookup tables
    cur.execute("""CREATE INDEX idx_child_lookup ON child_lookup
                   (parent_id, parent_ns);""")
    cur.execute("""CREATE INDEX idx_parent_lookup ON parent_lookup
                   (child_id, child_ns);""")

    # Create node property table
    # Here we just keep track of the namespace and ID,
    # and then put all the data into a json string
    cur.execute("""CREATE TABLE node_properties (
                       id TEXT NOT NULL,
                       ns TEXT NOT NULL,
                       properties TEXT NOT NULL,
                       UNIQUE (id, ns)
                   );""")
    for node in bio_ontology.nodes:
        ns, db_id = bio_ontology.get_ns_id(node)
        props = json.dumps(bio_ontology.nodes[node])
        cur.execute("INSERT INTO node_properties (id, ns, properties) "
                    "VALUES (?, ?, ?);", (db_id, ns, props))

0 comments on commit b451d07

Please sign in to comment.