Skip to content

Commit

Permalink
new: more adapter housekeeping (#26)
Browse files Browse the repository at this point in the history
* new: debug log on individual node/edge levels

* Update README.md

* cleanup

* config cleanup

* fix: typo

* new: __insert_adb_docs

also: cleans up node/edge insertion in ArangoDB to DGL

* Update setup.cfg

* Update conftest.py

* cleanup: __fetch_adb_docs

The previous implementation was complicating things for no reason.

* cleanup & fix: etypes_to_edefinitions

also:

* fix: mypy

* fix: metagraph example

* new: remove ValueError exception

I was under the impression that an ArangoDB edge collection with multiple "from" and "to" vertex types could not be converted to DGL. This commit makes that conversion possible.

* fix: black

* fix: tests

* Update adapter.py

* fix: black

* cleanup

* Update conftest.py

* fix: isort
  • Loading branch information
aMahanna authored Jul 21, 2022
1 parent 93a7dc3 commit 5d98756
Show file tree
Hide file tree
Showing 14 changed files with 230 additions and 227 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
- name: Run mypy
run: mypy ${{env.PACKAGE_DIR}} ${{env.TESTS_DIR}}
- name: Run pytest
run: py.test -s --cov=${{env.PACKAGE_DIR}} --cov-report xml --cov-report term-missing -v --color=yes --no-cov-on-fail --code-highlight=yes
run: pytest --cov=${{env.PACKAGE_DIR}} --cov-report xml --cov-report term-missing -v --color=yes --no-cov-on-fail --code-highlight=yes
- name: Publish to coveralls.io
if: matrix.python == '3.8'
env:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
- name: Run mypy
run: mypy ${{env.PACKAGE_DIR}} ${{env.TESTS_DIR}}
- name: Run pytest
run: py.test --cov=${{env.PACKAGE_DIR}} --cov-report xml --cov-report term-missing -v --color=yes --no-cov-on-fail --code-highlight=yes
run: pytest --cov=${{env.PACKAGE_DIR}} --cov-report xml --cov-report term-missing -v --color=yes --no-cov-on-fail --code-highlight=yes
- name: Publish to coveralls.io
if: matrix.python == '3.8'
env:
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ Also available as an ArangoDB Lunch & Learn session: [Graph & Beyond Course #2.8
from arango import ArangoClient # Python-Arango driver
from dgl.data import KarateClubDataset # Sample graph from DGL

from adbdgl_adapter import ADBDGL_Adapter

# Let's assume that the ArangoDB "fraud detection" dataset is imported to this endpoint
db = ArangoClient(hosts="http://localhost:8529").db("_system", username="root", password="")

Expand All @@ -66,12 +68,14 @@ dgl_fraud_graph_2 = adbdgl_adapter.arangodb_collections_to_dgl(
# Use Case 1.3: ArangoDB to DGL via Metagraph
metagraph = {
"vertexCollections": {
"account": {"Balance", "account_type", "customer_id", "rank"},
"customer": {"Name", "rank"},
"account": {"Balance", "rank"},
"customer": {"rank"},
"Class": {},
},
"edgeCollections": {
"transaction": {"transaction_amt", "sender_bank_id", "receiver_bank_id"},
"accountHolder": {},
"Relationship": {},
},
}
dgl_fraud_graph_3 = adbdgl_adapter.arangodb_to_dgl("fraud-detection", metagraph)
Expand Down
5 changes: 4 additions & 1 deletion adbdgl_adapter/abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from arango.graph import Graph as ArangoDBGraph
from dgl import DGLGraph
from dgl.heterograph import DGLHeteroGraph
from torch.functional import Tensor
from torch import Tensor

from .typings import ArangoMetagraph, DGLCanonicalEType, Json

Expand Down Expand Up @@ -55,6 +55,9 @@ def __prepare_adb_attributes(self) -> None:
def __fetch_adb_docs(self) -> None:
# Placeholder in the abstract adapter interface: this base version does no work
# and always raises NotImplementedError (excluded from coverage via the pragma).
raise NotImplementedError # pragma: no cover

def __insert_adb_docs(self) -> None:
# Placeholder in the abstract adapter interface: this base version does no work
# and always raises NotImplementedError (excluded from coverage via the pragma).
raise NotImplementedError # pragma: no cover

@property
def DEFAULT_CANONICAL_ETYPE(self) -> List[DGLCanonicalEType]:
# Read-only constant: a one-element list holding the ("_N", "_E", "_N")
# canonical edge type (source node type, edge type, destination node type).
# NOTE(review): presumably this is the canonical etype DGL assigns to graphs
# created without explicit node/edge type names — confirm against DGL docs.
return [("_N", "_E", "_N")]
Expand Down
169 changes: 92 additions & 77 deletions adbdgl_adapter/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
from arango.result import Result
from dgl import DGLGraph, DGLHeteroGraph, heterograph
from dgl.view import HeteroEdgeDataView, HeteroNodeDataView
from torch import tensor
from torch.functional import Tensor
from torch import Tensor, tensor

from .abc import Abstract_ADBDGL_Adapter
from .controller import ADBDGL_Controller
Expand Down Expand Up @@ -102,58 +101,56 @@ def arangodb_to_dgl(
},
}
"""
logger.debug(f"Starting arangodb_to_dgl({name}, ...):")
logger.debug(f"--arangodb_to_dgl('{name}')--")

# Maps ArangoDB vertex IDs to DGL node IDs
adb_map: Dict[str, Dict[str, Any]] = dict()
adb_map: Dict[str, Json] = dict()

# Dictionaries for constructing a heterogeneous graph.
data_dict: DGLDataDict = dict()
ndata: DefaultDict[Any, Any] = defaultdict(lambda: defaultdict(list))
edata: DefaultDict[Any, Any] = defaultdict(lambda: defaultdict(list))

ndata: DefaultDict[str, DefaultDict[str, List[Any]]]
ndata = defaultdict(lambda: defaultdict(list))

edata: DefaultDict[str, DefaultDict[str, List[Any]]]
edata = defaultdict(lambda: defaultdict(list))

adb_v: Json
for v_col, atribs in metagraph["vertexCollections"].items():
logger.debug(f"Preparing '{v_col}' vertices")
for i, adb_v in enumerate(
self.__fetch_adb_docs(v_col, atribs, query_options)
):
adb_map[adb_v["_id"]] = {
"id": i,
"col": v_col,
}
for i, adb_v in enumerate(self.__fetch_adb_docs(v_col, query_options)):
adb_id = adb_v["_id"]
logger.debug(f"V{i}: {adb_id}")

adb_map[adb_id] = {"id": i, "col": v_col}
self.__prepare_dgl_features(ndata, atribs, adb_v, v_col)

adb_e: Json
from_col: Set[str] = set()
to_col: Set[str] = set()
edge_dict: DefaultDict[DGLCanonicalEType, DefaultDict[str, List[Any]]]
for e_col, atribs in metagraph["edgeCollections"].items():
logger.debug(f"Preparing '{e_col}' edges")
from_nodes: List[int] = []
to_nodes: List[int] = []
for adb_e in self.__fetch_adb_docs(e_col, atribs, query_options):

edge_dict = defaultdict(lambda: defaultdict(list))

for i, adb_e in enumerate(self.__fetch_adb_docs(e_col, query_options)):
logger.debug(f'E{i}: {adb_e["_id"]}')

from_node = adb_map[adb_e["_from"]]
to_node = adb_map[adb_e["_to"]]
edge_type = (from_node["col"], e_col, to_node["col"])

from_col.add(from_node["col"])
to_col.add(to_node["col"])
if len(from_col | to_col) > 2:
raise ValueError( # pragma: no cover
f"""Can't convert to DGL:
too many '_from' & '_to' collections in {e_col}
"""
)
edge_data = edge_dict[edge_type]
edge_data["from_nodes"].append(from_node["id"])
edge_data["to_nodes"].append(to_node["id"])

from_nodes.append(from_node["id"])
to_nodes.append(to_node["id"])
self.__prepare_dgl_features(edata, atribs, adb_e, edge_type)

self.__prepare_dgl_features(edata, atribs, adb_e, e_col)

data_dict[(from_col.pop(), e_col, to_col.pop())] = (
tensor(from_nodes),
tensor(to_nodes),
)
for edge_type, edges in edge_dict.items():
logger.debug(f"Inserting {edge_type} edges")
data_dict[edge_type] = (
tensor(edges["from_nodes"]),
tensor(edges["to_nodes"]),
)

dgl_g: DGLHeteroGraph = heterograph(data_dict)
has_one_ntype = len(dgl_g.ntypes) == 1
Expand Down Expand Up @@ -237,7 +234,7 @@ def dgl_to_arangodb(
:return: The ArangoDB Graph API wrapper.
:rtype: arango.graph.Graph
"""
logger.debug(f"Starting dgl_to_arangodb({name}, ...):")
logger.debug(f"--dgl_to_arangodb('{name}')--")

is_default = dgl_g.canonical_etypes == self.DEFAULT_CANONICAL_ETYPE
logger.debug(f"Is graph '{name}' using default canonical_etypes? {is_default}")
Expand All @@ -264,17 +261,18 @@ def dgl_to_arangodb(
has_one_ecol = len(adb_e_cols) == 1
logger.debug(f"Is graph '{name}' homogenous? {has_one_vcol and has_one_ecol}")

adb_documents: DefaultDict[str, List[Json]] = defaultdict(list)
node: Tensor
v_col_docs: List[Json] = [] # to-be-inserted ArangoDB vertices
for ntype in dgl_g.ntypes:
v_col = adb_v_cols[0] if is_default else ntype
logger.debug(f"Preparing {dgl_g.number_of_nodes(ntype)} '{v_col}' nodes")

for v_col in adb_v_cols:
v_col_docs = adb_documents[v_col]
ntype = None if is_default else v_col
features = dgl_g.node_attr_schemes(ntype).keys()

node: Tensor
logger.debug(f"Preparing {dgl_g.number_of_nodes(ntype)} '{v_col}' nodes")
for node in dgl_g.nodes(ntype):
for i, node in enumerate(dgl_g.nodes(ntype)):
dgl_node_id = node.item()
logger.debug(f"N{i}: {dgl_node_id}")

adb_vertex = {"_key": str(dgl_node_id)}
self.__prepare_adb_attributes(
dgl_g.ndata,
Expand All @@ -287,45 +285,44 @@ def dgl_to_arangodb(

v_col_docs.append(adb_vertex)

from_col: str
to_col: str
self.__insert_adb_docs(v_col, v_col_docs, import_options)
v_col_docs.clear()

from_n: Tensor
to_n: Tensor
for e_col in adb_e_cols:
e_col_docs = adb_documents[e_col]
etype = None if is_default else e_col
features = dgl_g.edge_attr_schemes(etype).keys()
e_col_docs: List[Json] = [] # to-be-inserted ArangoDB edges
for c_etype in dgl_g.canonical_etypes:
logger.debug(f"Preparing {dgl_g.number_of_edges(c_etype)} {c_etype} edges")

features = dgl_g.edge_attr_schemes(c_etype).keys()

canonical_etype = None
if is_default:
e_col = adb_e_cols[0]
from_col = to_col = adb_v_cols[0]
else:
canonical_etype = dgl_g.to_canonical_etype(e_col)
from_col, _, to_col = canonical_etype
from_col, e_col, to_col = c_etype

for i, (from_n, to_n) in enumerate(zip(*dgl_g.edges(etype=c_etype))):
logger.debug(f"E{i}: ({from_n}, {to_n})")

logger.debug(f"Preparing {dgl_g.number_of_edges(etype)} '{e_col}' edges")
for index, (from_n, to_n) in enumerate(zip(*dgl_g.edges(etype=etype))):
adb_edge = {
"_key": str(index),
"_from": f"{from_col}/{str(from_n.item())}",
"_to": f"{to_col}/{str(to_n.item())}",
}
self.__prepare_adb_attributes(
dgl_g.edata,
features,
index,
i,
adb_edge,
e_col,
has_one_ecol,
canonical_etype,
c_etype,
)

e_col_docs.append(adb_edge)

for col, doc_list in adb_documents.items(): # import documents into ArangoDB
logger.debug(f"Inserting {len(doc_list)} documents into '{col}'")
result = self.__db.collection(col).import_bulk(doc_list, **import_options)
logger.debug(result)
self.__insert_adb_docs(e_col, e_col_docs, import_options)
e_col_docs.clear()

logger.info(f"Created ArangoDB '{name}' Graph")
return adb_graph
Expand All @@ -352,13 +349,21 @@ def etypes_to_edefinitions(
}
]
"""

edge_type_map: DefaultDict[str, DefaultDict[str, Set[str]]]
edge_type_map = defaultdict(lambda: defaultdict(set))
for edge_type in canonical_etypes:
from_col, e_col, to_col = edge_type
edge_type_map[e_col]["from"].add(from_col)
edge_type_map[e_col]["to"].add(to_col)

edge_definitions: List[Json] = []
for dgl_from, dgl_e, dgl_to in canonical_etypes:
for e_col, v_cols in edge_type_map.items():
edge_definitions.append(
{
"from_vertex_collections": [dgl_from],
"edge_collection": dgl_e,
"to_vertex_collections": [dgl_to],
"from_vertex_collections": list(v_cols["from"]),
"edge_collection": e_col,
"to_vertex_collections": list(v_cols["to"]),
}
)

Expand All @@ -369,7 +374,7 @@ def __prepare_dgl_features(
features_data: DefaultDict[Any, Any],
attributes: Set[str],
doc: Json,
col: str,
col: Union[str, DGLCanonicalEType],
) -> None:
"""Convert a set of ArangoDB attributes into valid DGL features
Expand All @@ -379,8 +384,9 @@ def __prepare_dgl_features(
:type attributes: Set[str]
:param doc: The current ArangoDB document
:type doc: adbdgl_adapter.typings.Json
:param col: The collection the current document belongs to
:type col: str
:param col: The collection the current document belongs to. For edge
collections, the entire DGL Canonical eType is specified (src, e, dst)
:type col: str | Tuple[str, str, str]
"""
key: str
for key in attributes:
Expand Down Expand Up @@ -446,15 +452,11 @@ def __prepare_adb_attributes(
tensor = data[key] if has_one_col else data[key][canonical_etype or col]
doc[key] = self.__cntrl._dgl_feature_to_adb_attribute(key, col, tensor[id])

def __fetch_adb_docs(
self, col: str, attributes: Set[str], query_options: Any
) -> Result[Cursor]:
def __fetch_adb_docs(self, col: str, query_options: Any) -> Result[Cursor]:
"""Fetches ArangoDB documents within a collection.
:param col: The ArangoDB collection.
:type col: str
:param attributes: The set of document attributes.
:type attributes: Set[str]
:param query_options: Keyword arguments to specify AQL query options
when fetching documents from the ArangoDB instance.
:type query_options: Any
Expand All @@ -463,11 +465,24 @@ def __fetch_adb_docs(
"""
aql = f"""
FOR doc IN {col}
RETURN MERGE(
KEEP(doc, {list(attributes)}),
{{"_id": doc._id}},
doc._from ? {{"_from": doc._from, "_to": doc._to}}: {{}}
)
RETURN doc
"""

return self.__db.aql.execute(aql, **query_options)

def __insert_adb_docs(
    self, col: str, docs: List[Json], import_options: Any
) -> None:
    """Bulk-import a batch of documents into a single ArangoDB collection.

    The number of documents and the import result are both written to the
    debug log.

    :param col: Name of the target ArangoDB collection.
    :type col: str
    :param docs: The documents to import into ``col``.
    :type docs: List[Json]
    :param import_options: Keyword arguments forwarded unchanged to
        ``import_bulk``. Full parameter list:
        https://docs.python-arango.com/en/main/specs.html#arango.collection.Collection.import_bulk
    :type import_options: Any
    """
    logger.debug(f"Inserting {len(docs)} documents into '{col}'")
    target_collection = self.__db.collection(col)
    logger.debug(target_collection.import_bulk(docs, **import_options))
10 changes: 7 additions & 3 deletions adbdgl_adapter/controller.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from typing import Any
from typing import Any, Union

from torch.functional import Tensor
from torch import Tensor

from adbdgl_adapter.typings import DGLCanonicalEType

from .abc import Abstract_ADBDGL_Controller

Expand All @@ -18,7 +20,9 @@ class ADBDGL_Controller(Abstract_ADBDGL_Controller):
consistency between your ArangoDB attributes & your DGL features.
"""

def _adb_attribute_to_dgl_feature(self, key: str, col: str, val: Any) -> Any:
def _adb_attribute_to_dgl_feature(
self, key: str, col: Union[str, DGLCanonicalEType], val: Any
) -> Any:
"""
Given an ArangoDB attribute key, its assigned value (for an arbitrary document),
and the collection it belongs to, convert it to a valid
Expand Down
2 changes: 1 addition & 1 deletion adbdgl_adapter/typings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import Any, Dict, Set, Tuple

from torch.functional import Tensor
from torch import Tensor

Json = Dict[str, Any]
ArangoMetagraph = Dict[str, Dict[str, Set[str]]]
Expand Down
2 changes: 1 addition & 1 deletion examples/ArangoDB_DGL_Adapter.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
"from dgl.data import KarateClubDataset\n",
"\n",
"import torch\n",
"from torch.functional import Tensor\n",
"from torch import Tensor\n",
"\n",
"from adbdgl_adapter import ADBDGL_Adapter, ADBDGL_Controller\n",
"from adbdgl_adapter.typings import Json, ArangoMetagraph, DGLCanonicalEType, DGLDataDict\n",
Expand Down
Loading

0 comments on commit 5d98756

Please sign in to comment.