graph_utils.py
import numpy as np
import pickle
import os
import random


def graph_similarity(graph):
    """Compute pairwise node similarity as (I - alpha * P)^-1, where P is the
    row-normalized adjacency matrix of `graph`."""
    alpha = 0.99
    row_sums = graph.sum(axis=1)
    normalized_graph = graph / row_sums[:, np.newaxis]
    x = np.identity(graph.shape[0], dtype=np.float32) - alpha * normalized_graph
    node_similarity = np.linalg.inv(x)
    # halve the diagonal so a node's self-similarity does not dominate
    # the transition probabilities of the biased walk
    for i in range(len(node_similarity)):
        node_similarity[i, i] = node_similarity[i, i] / 2
    return node_similarity
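

# Intuition behind graph_similarity: with P = D^-1 * A, the geometric series
# I + (alpha * P) + (alpha * P)^2 + ... = (I - alpha * P)^-1 accumulates walk
# probabilities over all path lengths, discounted by alpha per hop -- a rooted
# PageRank / Katz-style similarity. A minimal sketch on a toy graph
# (illustrative only, not part of the original pipeline):
#
#   toy = np.ones((3, 3), dtype=np.float32)  # triangle with self-loops
#   sim = graph_similarity(toy)              # sim[i, j] grows with connectivity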


def data_process(args, interval, biased, time_windows=2, data_directory='./data/DBLP/edgelist.txt', output_directory="./data/DBLP", directed=False):
    node_dict, node_index, original_node_index, node_value, node_level = preprocess_edgelist(data_directory, interval, directed, time_windows)
    if not os.path.exists("{}/{}_emb".format(output_directory, args.data)):
        os.system("python ./deepwalk/main.py --representation-size {} --input {} --output {}/{}_emb".format(
            args.emb_size, data_directory[:-4] + '_new' + data_directory[-4:], output_directory, args.data))
    # original graph
    graph = np.zeros((len(original_node_index), len(original_node_index)), dtype=np.int8)
    with open(data_directory, 'r') as f:
        for line in f:
            nodes = list(map(int, line.split()))
            i = original_node_index[nodes[0]]
            j = original_node_index[nodes[1]]
            graph[i, j] = 1
            graph[j, i] = 1
    for i in range(len(original_node_index)):
        graph[i, i] = 1
    # newly-built temporal graph over the (node, time-slice) nodes
    new_graph = np.zeros((len(node_index), len(node_index)), dtype=np.float32)
    with open(data_directory[:-4] + '_new' + data_directory[-4:], 'r') as f:
        for line in f:
            nodes = list(map(int, line.split()))
            new_graph[nodes[0], nodes[1]] = 1.0
            new_graph[nodes[1], nodes[0]] = 1.0
    for i in range(len(node_index)):
        new_graph[i, i] = 1
    degree = np.array(np.sum(new_graph, axis=1), dtype=int)
    node_similarity = graph_similarity(new_graph)
    data = dict()
    data['original_index'] = original_node_index
    data['index'] = node_index
    data['value'] = node_value
    data['dict'] = node_dict
    data['graph'] = graph
    data['node_level'] = node_level
    with open(output_directory + '/graph.pickle', "wb") as f_pickle:
        pickle.dump(data, f_pickle)
    role_level_emb = dict()
    with open("{}/{}_emb".format(output_directory, args.data), 'r') as f_emb:
        next(f_emb)  # skip the "<num_nodes> <dim>" header line
        for line in f_emb:
            line = line.split()
            role_level_emb[line[0]] = np.array(list(map(float, line[1:])))
    with open("{}/{}_node_level_emb".format(output_directory, args.data), 'w') as f_emb:
        f_emb.write('{} {}\n'.format(len(node_level), len(role_level_emb['0'])))
        for i in range(len(node_level)):
            # average the role-level embeddings of all temporal copies of node i
            emb_sum = role_level_emb[str(node_level[i][0])]
            for j in range(1, len(node_level[i])):
                emb_sum = emb_sum + role_level_emb[str(node_level[i][j])]
            f_emb.write('{} '.format(i) + ' '.join(map(str, emb_sum / len(node_level[i]))) + '\n')
    temporal_random_walks(node_dict, node_similarity, degree, output_directory, biased, node_index, node_value)
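

# A hedged usage sketch (the `args` object comes from the caller's argument
# parser; only its `data` and `emb_size` attributes are used above):
#
#   from argparse import Namespace
#   args = Namespace(data='DBLP', emb_size=128)
#   data_process(args, interval=10, biased=True)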


def dictionary_search(dictionary, search_value):
    """Reverse lookup: return the first key whose value equals search_value,
    or None if no such key exists."""
    for key, value in dictionary.items():
        if value == search_value:
            return key
    return None
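

# dictionary_search is an O(n) linear scan per call. When many reverse lookups
# are needed, inverting the mapping once is cheaper. A sketch (assumes values
# are unique, as they are for the index dictionaries built in this module):
def invert_dictionary(dictionary):
    return {value: key for key, value in dictionary.items()}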


def temporal_random_walks(node_dict, node_similarity, degree, output_directory, biased, node_index, node_value):
    walk_length = 20
    if biased:
        temporal_random_walk = biased_temporal_neighbor_selection_with_prob
        # high-degree nodes start proportionally more walks
        repeated = list(map(int, np.array(degree / 3) + 1))
    else:
        # fewer sequences, but faster
        temporal_random_walk = unbiased_temporal_neighbor_selection_with_prob
        repeated = [3 for _ in range(len(degree))]
    count = 0
    with open('{}/sequences.txt'.format(output_directory), 'w+') as f, \
            open('{}/sequences_generation.txt'.format(output_directory), 'w+') as f_g, \
            open('{}/sequences_negative.txt'.format(output_directory), 'w+') as f_neg:
        for item, v in node_dict.items():
            for j in range(repeated[count]):
                # add the initial node to the sequence and initialize the neighbor list
                sequence = [int(node_index[item])]
                neighbor_list = v
                node = item
                for i in range(1, walk_length):
                    # select the next neighbor according to the transition probabilities
                    prob_list = temporal_random_walk(neighbor_list, node_similarity, node, node_index)
                    index = choose_action(prob_list)
                    while index is None:
                        # retry if rounding made the cumulative probability fall short
                        index = choose_action(prob_list)
                    node = '{}_{}'.format(neighbor_list[index][0], neighbor_list[index][1])
                    # add the selected node to the sequence
                    sequence.append(node_index[node])
                    # reinitialize the neighbor list
                    neighbor_list = node_dict[node]
                f.write(', '.join(map(str, sequence)) + '\n')
                # the same sequence also serves as input for sequence generation
                f_g.write(', '.join(map(str, sequence)) + '\n')
                # build a negative sample by substituting only 1-3 nodes in the sequence
                neg_seq = np.array(sequence, copy=True)
                indices = random.choices(range(len(sequence)), k=random.randint(1, 3))
                for index in indices:
                    neg_seq[index] = random.randint(0, len(node_dict) - 1)
                f_neg.write(', '.join(map(str, neg_seq)) + '\n')
            count += 1
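

# Example of the negative sampling above (illustrative values): given a walk
#   sequence = [4, 7, 7, 12, 3, ...]
# one to three positions are overwritten with random node ids, e.g.
#   neg_seq  = [4, 7, 91, 12, 3, ...]
# so the negative sequence stays mostly realistic while breaking temporal locality.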


# Pr(w) = 1/|Γ_t(v)|, where Γ_t(v) is the temporal neighborhood of v at time T,
# built from the slices within time_windows of T (requires T >= time_windows)
def unbiased_temporal_neighbor_selection_with_prob(neighbor_list, node_similarity=None, k=None, node_index=None):
    """
    input:
        neighbor_list: a list of neighbors, each stored as a tuple (node, timestamp)
        node_similarity, k, node_index: unused; kept so the signature matches
            the biased variant
    output:
        prob_list: the probability of each neighbor being chosen in the next
            step of the temporal random walk (uniform here)
    """
    prob_list = [1.0 / len(neighbor_list) for _ in range(len(neighbor_list))]
    return prob_list


def biased_temporal_neighbor_selection_with_prob(neighbor_list, node_similarity, k, node_index):
    """
    input:
        neighbor_list: a list of neighbors, each stored as a tuple (node, timestamp)
        node_similarity: the pairwise similarity matrix from graph_similarity
        k: the key of the current node, e.g. '12_3'
        node_index: mapping from node keys to matrix indices
    output:
        prob_list: the probability of each neighbor being chosen in the next
            step of the temporal random walk, proportional to its similarity
            to the current node
    """
    prob_list = [node_similarity[node_index[k], node_index['{}_{}'.format(item[0], item[1])]] for item in neighbor_list]
    prob_list = prob_list / np.sum(prob_list)
    return prob_list
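

# The biased transition probability is Pr(w | v) = S[v, w] / sum_u S[v, u],
# where S is the similarity matrix and u ranges over the temporal neighbors
# of v, so walks prefer structurally similar temporal copies.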


def choose_action(c):
    """Sample an index from the discrete distribution `c` by inverse-CDF
    sampling; returns None if rounding leaves the cumulative sum below r."""
    r = np.random.random()
    c = np.cumsum(np.array(c))
    for i in range(len(c)):
        if c[i] >= r:
            return i
    return None
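

# choose_action is inverse-CDF sampling over c; np.random.choice(len(c), p=c)
# would do the same when c is exactly normalized, while this version returns
# None (and the caller retries) when float rounding leaves the cumulative sum
# short of r.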


def preprocess_edgelist(data_directory, interval, directed, time_windows):
    node_dict = dict()
    node_index = dict()
    node_value = dict()
    original_node_index = dict()
    count = 0
    original_node_count = 0
    min_time_stamp = np.inf
    max_time_stamp = 0
    # find the minimal and maximal timestamps
    with open(data_directory, 'r') as f:
        for line in f:
            line = list(map(int, line.split()))
            if line[2] < min_time_stamp:
                min_time_stamp = line[2]
            if line[2] > max_time_stamp:
                max_time_stamp = line[2]
    time_slice = int((max_time_stamp - min_time_stamp + 1) / interval)
    assert (time_slice != 0), "Please check if interval is correctly set!"
    with open(data_directory, 'r') as f:
        with open(data_directory[:-4] + '_new' + data_directory[-4:], 'w') as f_out:
            for line in f:
                nodes = list(map(int, line.split()))
                nodes[2] = int((nodes[2] - min_time_stamp) / time_slice)
                # index dictionary by which we can map newly-created nodes back to the original graph
                if nodes[0] not in original_node_index:
                    original_node_index[nodes[0]] = original_node_count
                    original_node_count += 1
                if nodes[1] not in original_node_index:
                    original_node_index[nodes[1]] = original_node_count
                    original_node_count += 1
                # map original nodes to newly-created '<node>_<slice>' nodes to encode the time info
                if '{}_{}'.format(nodes[0], nodes[2]) not in node_index:
                    node_value[count] = '{}_{}'.format(nodes[0], nodes[2])
                    node_index['{}_{}'.format(nodes[0], nodes[2])] = count
                    node_dict['{}_{}'.format(nodes[0], nodes[2])] = [(nodes[0], nodes[2])]
                    count += 1
                # if the second node does not exist in the dictionary, add it as well
                if '{}_{}'.format(nodes[1], nodes[2]) not in node_index:
                    node_value[count] = '{}_{}'.format(nodes[1], nodes[2])
                    node_index['{}_{}'.format(nodes[1], nodes[2])] = count
                    node_dict['{}_{}'.format(nodes[1], nodes[2])] = [(nodes[1], nodes[2])]
                    count += 1
                if (nodes[1], nodes[2]) not in node_dict['{}_{}'.format(nodes[0], nodes[2])]:
                    node_dict['{}_{}'.format(nodes[0], nodes[2])].append((nodes[1], nodes[2]))
                if not directed:
                    if (nodes[0], nodes[2]) not in node_dict['{}_{}'.format(nodes[1], nodes[2])]:
                        node_dict['{}_{}'.format(nodes[1], nodes[2])].append((nodes[0], nodes[2]))
                if max_time_stamp < nodes[2]:
                    max_time_stamp = nodes[2]
                f_out.write("{} {} {}\n".format(node_index['{}_{}'.format(nodes[1], nodes[2])], node_index[
                    '{}_{}'.format(nodes[0], nodes[2])], nodes[2]))
                f_out.write("{} {} {}\n".format(node_index['{}_{}'.format(nodes[0], nodes[2])], node_index[
                    '{}_{}'.format(nodes[1], nodes[2])], nodes[2]))
    with open(data_directory[:-4] + '_new' + data_directory[-4:], 'a+') as f_out:
        for node in node_dict.keys():
            nodes = list(map(int, node.split('_')))
            # expand the neighborhood of this node with its temporal copies whose
            # time slice lies within time_windows, skipping the node itself
            for i in range(max(0, nodes[1] - time_windows), min(interval, nodes[1] + time_windows)):
                if i != nodes[1] and '{}_{}'.format(nodes[0], i) in node_dict:
                    node_dict[node].append((nodes[0], i))
                    f_out.write("{} {} {}\n".format(node_index[node], node_index['{}_{}'.format(nodes[0], i)], i))
                    f_out.write("{} {} {}\n".format(node_index['{}_{}'.format(nodes[0], i)], node_index[node], i))
    # node_level groups each temporal node with all temporal copies of the same original node
    node_level = dict()
    for i in range(len(node_index)):
        node_level[i] = [i]
    for node in node_dict.keys():
        nodes = list(map(int, node.split('_')))
        for i in range(interval):
            if i != nodes[1] and '{}_{}'.format(nodes[0], i) in node_dict:
                node_level[node_index[node]].append(node_index['{}_{}'.format(nodes[0], i)])
    print("Finished remapping nodes! Total number of nodes = {}".format(count))
    return node_dict, node_index, original_node_index, node_value, node_level
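

if __name__ == "__main__":
    # A minimal smoke test of preprocess_edgelist on a toy edge list (a sketch;
    # the file name and parameters here are illustrative, not part of the
    # original pipeline):
    import tempfile
    toy_path = os.path.join(tempfile.mkdtemp(), "toy.txt")
    with open(toy_path, "w") as f:
        f.write("0 1 2000\n1 2 2001\n0 2 2002\n")
    node_dict, node_index, original_node_index, node_value, node_level = \
        preprocess_edgelist(toy_path, interval=3, directed=False, time_windows=2)
    print("{} temporal nodes built from {} original nodes".format(
        len(node_index), len(original_node_index)))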