Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SINr implementation to compute node embeddings from communities #156

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions examples/structral_node_embedding/sinr_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""SINr illustrative example.
Nodes in both cliques (barbell graph) will get the same embedding vectors, except for those connected to the path.
Nodes in the path are in distinct communities with a high-enough gamma, and will thus get distinct vectors.
"""

import networkx as nx
from karateclub.node_embedding.structural import SINr
import matplotlib.pyplot as plt

def embed_and_plot(g, gamma, ax):
    """Embed ``g`` with SINr and scatter its 2-D PCA projection on ``ax``.

    Args:
        g: NetworkX graph to embed.
        gamma: Resolution value forwarded to ``SINr``.
        ax: Matplotlib axis that receives the scatter plot; every point is
            annotated with its row index in the embedding matrix.
    """
    # Imported locally so that scikit-learn is only needed when plotting.
    from sklearn.decomposition import PCA

    sinr = SINr(gamma=gamma)
    sinr.fit(g)
    X = sinr.get_embedding()

    # Project the community-based vectors onto their first two principal axes.
    projector = PCA(n_components=2)
    X_2 = projector.fit_transform(X)

    ax.scatter(X_2[:, 0], X_2[:, 1])
    for idx, point in enumerate(X_2):
        coords = (point[0], point[1])
        ax.annotate(idx, coords)



# Two 4-cliques joined by an 8-node path.
g = nx.barbell_graph(4, 8)
fig, axs = plt.subplots(3)

# Top panel: the graph itself, so node labels can be matched to the points below.
nx.draw_kamada_kawai(g, with_labels=True, ax=axs[0])

# Middle and bottom panels: embeddings at a low and a high resolution.
for gamma, ax in ((0.5, axs[1]), (10, axs[2])):
    embed_and_plot(g, gamma, ax)

plt.show()
1 change: 1 addition & 0 deletions karateclub/node_embedding/structural/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .graphwave import GraphWave
from .role2vec import Role2Vec
from .sinr import SINr
74 changes: 74 additions & 0 deletions karateclub/node_embedding/structural/sinr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Dict, List, Set
import networkx as nx
from karateclub.estimator import Estimator
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import normalize
import numpy as np


class SINr(Estimator):
    r"""An implementation of `"SINr" <https://inria.hal.science/hal-03197434/>`_
    from the IDA '21 best paper "SINr: Fast Computing of Sparse Interpretable
    Node Representations is not a Sin!".

    The procedure performs community detection using the Louvain algorithm,
    and computes the distribution of edges of each node across all
    communities. The algorithm is one of the fastest, because it mostly relies
    on Louvain community detection. It thus runs in quasi-linear time.
    Regarding space complexity, the adjacency matrix and the community
    membership matrix need to be stored, so it is also quasi-linear.

    Args:
        gamma (float): Modularity multi-resolution parameter. Default is 1.
            The dimension parameter does not exist for SINr, gamma should be
            used instead: the number of dimensions of the embedding space is
            the number of communities uncovered. The higher gamma is, the
            more communities are detected, and the more dimensions the latent
            space has. For small graphs, setting gamma to 1 is usually
            sufficient. For bigger graphs, it is recommended to increase
            gamma (5 or 10 for example). For word co-occurrence graphs, to
            deal with word embedding, gamma is usually set to 50 in order to
            get many small communities.
        seed (int): Random seed value. Default is 42.
    """

    def __init__(
        self,
        gamma: float = 1,
        seed: int = 42,
    ):
        self.gamma = gamma
        self.seed = seed

    def fit(self, graph: nx.classes.graph.Graph):
        """
        Fitting a SINr model.

        Sets ``self.dimensions`` to the number of detected communities and
        stores the sparse embedding used by ``get_embedding``.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        graph = self._check_graph(graph)
        # Pin the row/column order to the integer node ids. Without an
        # explicit nodelist the adjacency matrix follows node insertion
        # order, which would not line up with the membership matrix below
        # (that one is indexed directly by node id).
        node_order = range(graph.number_of_nodes())
        adjacency = nx.adjacency_matrix(graph, nodelist=node_order)
        # Make rows of the matrix sum to 1.
        norm_adjacency = normalize(adjacency, norm="l1")
        # Detect communities with Louvain, using gamma as resolution.
        communities = nx.community.louvain_communities(
            graph, resolution=self.gamma, seed=self.seed
        )
        self.dimensions = len(communities)
        # Node-by-community indicator matrix.
        membership_matrix = self._get_matrix_membership(communities)
        # Node-recall: for each node, the distribution of its links across
        # communities.
        self._embedding = norm_adjacency.dot(membership_matrix)

    def _get_matrix_membership(
        self, list_of_communities: List[Set[int]]
    ) -> csr_matrix:
        r"""Getting the membership matrix describing for each node (rows),
        in which community (column) it belongs.

        Arg types:
            * **list_of_communities** *(list of sets of int)* - Communities
              given as sets of integer node ids; node ids are used directly
              as row indices.

        Return types:
            * **Membership matrix** *(scipy sparse matrix csr)* - Size nodes, communities
        """
        row = list()
        col = list()
        data = list()
        for idx_c, community in enumerate(list_of_communities):
            for node in community:
                row.append(node)
                col.append(idx_c)
                data.append(1)
        # One entry is appended per (node, community) pair, so len(row) is the
        # number of nodes provided each node sits in exactly one community
        # (assumed here; presumably guaranteed by the Louvain partition).
        shape = (len(row), len(list_of_communities))
        return coo_matrix((data, (row, col)), shape=shape).tocsr()

    def get_embedding(self) -> np.ndarray:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes, one row
              per node and one column per community (dense copy of the
              sparse matrix computed by ``fit``).
        """
        return self._embedding.toarray()
35 changes: 34 additions & 1 deletion test/structral_node_embedding_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
import networkx as nx
from karateclub import Role2Vec, GraphWave
from karateclub import Role2Vec, GraphWave, SINr


def test_role2vec():
Expand Down Expand Up @@ -73,3 +73,36 @@ def test_graphwave():
assert embedding.shape[0] == graph.number_of_nodes()
assert embedding.shape[1] == 2 * model.sample_number
assert type(embedding) == np.ndarray



def test_sinr():
    """
    Testing the SINr class.

    Checks the embedding shape and type with the default resolution, then
    checks that a higher resolution yields more dimensions on the same graph.
    """
    # Default resolution on a 100-node small-world graph.
    default_model = SINr()
    small_graph = nx.watts_strogatz_graph(100, 10, 0.5)
    default_model.fit(small_graph)
    small_embedding = default_model.get_embedding()

    n_rows, n_cols = small_embedding.shape[0], small_embedding.shape[1]
    assert n_rows == small_graph.number_of_nodes()
    assert n_cols == default_model.dimensions
    assert type(small_embedding) is np.ndarray

    # Higher resolution on a 200-node graph.
    coarse_model = SINr(gamma=5)
    large_graph = nx.watts_strogatz_graph(200, 10, 0.5)
    coarse_model.fit(large_graph)
    large_embedding = coarse_model.get_embedding()

    assert large_embedding.shape[0] == large_graph.number_of_nodes()
    assert large_embedding.shape[1] == coarse_model.dimensions

    # Doubling gamma on the same graph is expected to split it further.
    fine_model = SINr(gamma=10)
    fine_model.fit(large_graph)
    assert fine_model.dimensions > coarse_model.dimensions
    assert type(large_embedding) is np.ndarray
Loading