
Added clustering generative model #13

Merged: 13 commits, Dec 3, 2023
1 change: 1 addition & 0 deletions Data/clustering.json

Large diffs are not rendered by default.

131 changes: 131 additions & 0 deletions clustering.py
@@ -0,0 +1,131 @@
import json
import multiprocessing as mp
import os
import random

import numpy as np

from lcs import *

def gen_clustered_clique_number(n, clique_number):
    clique_size = n // clique_number  # the number of nodes per clique
    clique_membership = 0  # the generator adds 1 to each draw, so every node joins exactly one clique

    p_dist = delta_dist(clique_size)
    g_dist = delta_dist(clique_membership)
    A = clustered_unipartite(clique_number, n, p_dist, g_dist)
    return A

def target_ipn(n, clique_number, gamma, c, mode, rho0, tmax, realizations):
    x0 = np.zeros(n)
    x0[random.sample(range(n), int(round(rho0 * n)))] = 1
    ipn = 0
    for _ in range(realizations):
        A = gen_clustered_clique_number(n, clique_number)
        x = contagion_process(A, gamma, c, x0, tmin=0, tmax=tmax)
        ipn += infections_per_node(x, mode) / realizations
    return ipn
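
# usage sketch (parameter values mirror those set below; a smaller realization
# count is shown for speed):
#   c = 1 - (1 - 0.04) ** np.arange(100)  # simple-contagion kernel with b = 0.04
#   target_ipn(100, 5, 0.1, c, "max", 1.0, 1000, 10)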


def single_inference(
    fname, gamma, c, b, rho0, A, tmax, p_c, p_rho, nsamples, burn_in, skip
):
    n = np.size(A, axis=0)
    x0 = np.zeros(n)
    x0[random.sample(range(n), int(round(rho0 * n)))] = 1

    x = contagion_process(A, gamma, c, x0, tmin=0, tmax=tmax)
    p = beta(p_rho[0], p_rho[1]).rvs()
    A0 = erdos_renyi(n, p)
    samples = infer_adjacency_matrix(
        x, A0, p_rho, p_c, nsamples=nsamples, burn_in=burn_in, skip=skip
    )

    # assemble the results into a JSON-serializable dict
    data = {}
    data["gamma"] = gamma
    data["c"] = c.tolist()
    data["b"] = b
    data["p-rho"] = p_rho.tolist()
    data["p-c"] = p_c.tolist()
    data["x"] = x.tolist()
    data["A"] = A.tolist()
    data["samples"] = samples.tolist()

    datastring = json.dumps(data)

    with open(fname, "w") as output_file:
        output_file.write(datastring)


data_dir = "Data/clustering"
os.makedirs(data_dir, exist_ok=True)

for f in os.listdir(data_dir):
    os.remove(os.path.join(data_dir, f))

n = 100
k = 6

n_processes = len(os.sched_getaffinity(0))
print("n processes: ", n_processes)
realizations = 10
clique_numbers = np.arange(1, 20)

# MCMC parameters
burn_in = 100000
nsamples = 100
skip = 1500
p_c = np.ones((2, n))
p_rho = np.array([1, 1])
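# inferred from their use in single_inference above (not stated in this PR):
# p_c holds per-node Beta prior parameters for the contagion kernel (shape (2, n)),
# and p_rho parameterizes a Beta prior on density, with Beta(1, 1) being uniform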

# contagion functions and parameters
cf1 = lambda nu, beta: 1 - (1 - beta) ** nu # simple contagion
cf2 = lambda nu, beta: beta * (nu >= 2) # complex contagion, tau=2
cf3 = lambda nu, beta: beta * (nu >= 3) # complex contagion, tau=3

cfs = [cf1, cf2, cf3]
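
# sanity check of the kernels above, with b = 0.04 and nu = 0..3 infected neighbors:
#   cf1(np.arange(4), 0.04) -> [0, 0.04, 0.0784, ~0.1153]  (probability rises with nu)
#   cf2(np.arange(4), 0.04) -> [0, 0, 0.04, 0.04]          (threshold at nu = 2)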

rho0 = 1.0
gamma = 0.1
b = 0.04
mode = "max"

tmax = 1000

A = gen_clustered_clique_number(n, 3)  # placeholder; A is regenerated inside the loop below

arglist = []
for clique_number in clique_numbers:
    c = cfs[0](np.arange(n), b)
    ipn = target_ipn(n, clique_number, gamma, c, mode, rho0, tmax, 1000)
    for i, cf in enumerate(cfs):
        if i != 0:
            A = gen_clustered_clique_number(n, clique_number)
            bscaled = fit_ipn(0.5, ipn, cf, gamma, A, rho0, tmax, mode)
        else:
            bscaled = b
        c = cf(np.arange(n), bscaled)
        print((clique_number, i), flush=True)

        for r in range(realizations):
            A = gen_clustered_clique_number(n, clique_number)
            arglist.append(
                (
                    f"{data_dir}/{clique_number}_{i}_{r}",
                    gamma,
                    c,
                    bscaled,
                    rho0,
                    A,
                    tmax,
                    p_c,
                    p_rho,
                    nsamples,
                    burn_in,
                    skip,
                )
            )

with mp.Pool(processes=n_processes) as pool:
    pool.starmap(single_inference, arglist)
210 changes: 210 additions & 0 deletions clustering_debug.ipynb.ipynb

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions collect_clustering.py
@@ -0,0 +1,77 @@
import json
import os

import numpy as np

from lcs import *

plist = set()
clist = set()
rlist = set()


data_dir = "Data/clustering"

for f in os.listdir(data_dir):
    # filenames are formatted as "{clique_number}_{cf index}_{realization}"
    d = f.split(".json")[0].split("_")
    try:
        p = float(d[0])
        c = int(d[1])
        r = int(d[2])
    except (ValueError, IndexError):
        p = float(d[0] + "-" + d[1])
        c = int(d[2])
        r = int(d[3])

    plist.add(p)
    clist.add(c)
    rlist.add(r)

clist = sorted(clist)
plist = sorted(plist)
rlist = sorted(rlist)

c_dict = {c: i for i, c in enumerate(clist)}
p_dict = {p: i for i, p in enumerate(plist)}
r_dict = {r: i for i, r in enumerate(rlist)}


ps = np.zeros((len(clist), len(plist), len(rlist)))
sps = np.zeros((len(clist), len(plist), len(rlist)))

for f in os.listdir(data_dir):
    d = f.split(".json")[0].split("_")
    try:
        p = float(d[0])
        c = int(d[1])
        r = int(d[2])
    except (ValueError, IndexError):
        p = float(d[0] + "-" + d[1])
        c = int(d[2])
        r = int(d[3])

    i = c_dict[c]
    j = p_dict[p]
    k = r_dict[r]

    fname = os.path.join(data_dir, f)

    with open(fname, "r") as file:
        data = json.loads(file.read())

    A = np.array(data["A"])
    samples = np.array(data["samples"])

    ps[i, j, k] = posterior_similarity(samples, A)
    sps[i, j, k] = samplewise_posterior_similarity(samples, A)

data = {}
data["clique_number"] = plist  # the first filename field is the clique number
data["sps"] = sps.tolist()
data["ps"] = ps.tolist()
datastring = json.dumps(data)

with open("Data/clustering.json", "w") as output_file:
    output_file.write(datastring)
75 changes: 73 additions & 2 deletions lcs/generative.py
@@ -3,6 +3,7 @@
import networkx as nx
import numpy as np
import xgi
from scipy.stats import rv_discrete


def zkc():
@@ -42,5 +43,75 @@ def sbm(n, k, epsilon, seed=None):
    return nx.adjacency_matrix(G).todense()


def projected_bipartite(k, s, seed=None):
    return 0


def delta_dist(x_prime):
    # a point-mass (degenerate) distribution: every draw returns x_prime
    return rv_discrete(name="custom", values=([x_prime], [1.0]))
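# usage sketch (a hedge: any scipy.stats frozen discrete distribution exposing .rvs() works here):
#   delta_dist(5).rvs()        -> 5 on every draw
#   delta_dist(5).rvs(size=3)  -> array([5, 5, 5])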


def generate_hypergraph_bipartite_edge_list(N_groups, N_inds, p_dist, g_dist, seed=None):
    """
    generate_hypergraph_bipartite_edge_list(): generates a hypergraph in the style of Newman's model in "Community structure in social and biological networks"
    inputs:
        N_groups: the number of groups or cliques to create
        N_inds: the number of individuals to create (the realized number may be less than this total)
        p_dist: the distribution of clique sizes; a frozen distribution exposing .rvs(), e.g. from scipy.stats
        g_dist: the distribution of the number of cliques each individual belongs to
        seed: seed for the pseudorandom number generator
    output:
        edge_list: the edge list for a bipartite graph. Indices 1..N_inds represent individuals and indices above N_inds represent cliques
    """

    # generate rng with seed
    if seed is not None:
        rng = np.random.default_rng(seed)
    else:
        rng = np.random.default_rng()

    chairs = []
    butts = []

    # generate "butts": one entry per clique membership of each individual
    for i in range(1, N_inds + 1):
        g_m = g_dist.rvs() + 1  # the number of cliques individual i belongs to
        butts.extend([i for _ in range(g_m)])  # add g_m copies of individual i

    # generate "chairs": one entry per seat in each clique
    for i in range(1, N_groups + 1):
        p_n = p_dist.rvs()  # the number of chairs in clique i
        # on the last clique, set the size so the two lists match in length
        p_n = int(p_n if i < N_groups else len(butts) - len(chairs))
        chairs.extend([i for _ in range(int(p_n))])  # add p_n chairs belonging to clique i
    chairs = [chair + N_inds for chair in chairs]  # offset clique indices past the individuals

    # shuffle the lists
    rng.shuffle(chairs)
    rng.shuffle(butts)

    # generate edge_list
    edge_list = list(zip(chairs, butts))
    edge_list = [(int(edge[0]), int(edge[1])) for edge in edge_list]

    # create vertex metadata: individuals get a 1, cliques get a 2
    vertex_attributes = {i: 1 if i <= max(butts) else 2 for i in set(chairs + butts)}

    return edge_list, vertex_attributes
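
# usage sketch (illustrative sizes, not from this PR):
#   p_dist = delta_dist(4)   # every clique has 4 seats
#   g_dist = delta_dist(0)   # each individual joins 0 + 1 = 1 clique
#   edges, attrs = generate_hypergraph_bipartite_edge_list(3, 12, p_dist, g_dist, seed=0)
#   # edges pair individuals (nodes 1..12) with cliques (nodes 13..15);
#   # attrs maps each node to 1 (individual) or 2 (clique)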


def bipartite_graph(edge_list):
    B = nx.Graph()
    a = np.vstack(edge_list)
    node_list1, node_list2 = np.unique(a[:, 1]), np.unique(a[:, 0])
    B.add_nodes_from(node_list1, bipartite=0)
    B.add_nodes_from(node_list2, bipartite=1)
    B.add_edges_from(edge_list)
    return B


def clustered_unipartite(n_groups, n_ind, my_p_dist, my_g_dist, **kwargs):
    edge_list, vertex_attributes = generate_hypergraph_bipartite_edge_list(n_groups, n_ind, my_p_dist, my_g_dist)
    projected_nodes = [k for k, v in vertex_attributes.items() if v == 1]  # identify nodes to project the graph onto
    B = bipartite_graph(edge_list)
    U = nx.projected_graph(B, projected_nodes)  # create the unipartite projection
    return nx.adjacency_matrix(U).todense()
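
# usage sketch, mirroring gen_clustered_clique_number in clustering.py:
#   A = clustered_unipartite(5, 100, delta_dist(20), delta_dist(0))
#   # A is the 100 x 100 adjacency matrix of five disjoint 20-node cliques,
#   # since each individual draws exactly one clique membership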
112 changes: 112 additions & 0 deletions plot_graph_topology.ipynb

Large diffs are not rendered by default.