# datacamp.py
# Import necessary modules
# (T, A_lcc and G are graphs assumed to be preloaded by the DataCamp exercise environment)
import networkx as nx
import matplotlib.pyplot as plt
from datetime import date

# Use a list comprehension to get the nodes of interest: noi
noi = [n for n, d in T.nodes(data=True) if d['occupation'] == 'scientist']
# Use a list comprehension to get the edges of interest: eoi
eoi = [(u, v) for u, v, d in T.edges(data=True) if d['date'] < date(2010, 1, 1)]
# Set the weight of the edge
T.edges[1, 10]['weight'] = 2
# Iterate over all the edges (with metadata)
for u, v, d in T.edges(data=True):
    # Check if node 293 is involved
    if 293 in [u, v]:
        # Set the weight to 1.1
        T.edges[u, v]['weight'] = 1.1
# Define find_selfloop_nodes()
def find_selfloop_nodes(G):
    """
    Finds all nodes that have self-loops in the graph G.
    """
    nodes_in_selfloops = []
    # Iterate over all the edges of G
    for u, v in G.edges():
        # Check if node u and node v are the same
        if u == v:
            # Append node u to nodes_in_selfloops
            nodes_in_selfloops.append(u)
    return nodes_in_selfloops
# Check whether the number of self-loops equals the number of nodes in self-loops
assert nx.number_of_selfloops(T) == len(find_selfloop_nodes(T))
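# A minimal cross-check sketch on a toy graph, using NetworkX's built-in
# self-loop helpers (nx.selfloop_edges() and nx.number_of_selfloops()).
H = nx.Graph()
H.add_edges_from([(1, 1), (1, 2), (3, 3)])
assert set(find_selfloop_nodes(H)) == {n for n, _ in nx.selfloop_edges(H)} == {1, 3}
assert nx.number_of_selfloops(H) == 2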
# Import nxviz
import nxviz as nv
# Create the MatrixPlot object: m
m = nv.MatrixPlot(A_lcc)
# Draw m to the screen
m.draw()
# Display the plot
plt.show()
# Convert T to an adjacency-matrix (NumPy array) format: A
A = nx.to_numpy_array(T)
# Convert A back to the NetworkX form as a directed graph: T_conv
T_conv = nx.from_numpy_array(A, create_using=nx.DiGraph)
# Check that the `category` metadata field is lost from each node
for n, d in T_conv.nodes(data=True):
    assert 'category' not in d.keys()
# Import necessary modules
import matplotlib.pyplot as plt
from nxviz import ArcPlot
# Create the un-customized ArcPlot object: a
a = ArcPlot(A_lcc)
# Draw a to the screen
a.draw()
# Display the plot
plt.show()
# Create the customized ArcPlot object: a2
a2 = ArcPlot(A_lcc, node_order='color', node_color='color')
# Draw a2 to the screen
a2.draw()
# Display the plot
plt.show()
# Define nodes_with_m_nbrs()
def nodes_with_m_nbrs(G, m):
    """
    Returns all nodes in graph G that have m neighbors.
    """
    nodes = set()
    # Iterate over all nodes in G
    for n in G.nodes():
        # Check if the number of neighbors of n matches m
        if len(list(G.neighbors(n))) == m:
            # Add the node n to the set
            nodes.add(n)
    # Return the nodes with m neighbors
    return nodes
# Compute and print all nodes in T that have 6 neighbors
six_nbrs = nodes_with_m_nbrs(T, 6)
print(six_nbrs)
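# A minimal alternative sketch on a toy graph: in an undirected graph without
# self-loops, the neighbor count equals the degree, so the same set can be read
# off the degree view (G.degree() yields (node, degree) pairs).
H = nx.star_graph(6)  # hub node 0 connected to leaves 1..6
assert nodes_with_m_nbrs(H, 6) == {n for n, deg in H.degree() if deg == 6} == {0}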
# Compute the degree of every node: degrees
degrees = [len(list(A_lcc.neighbors(n))) for n in A_lcc.nodes()]
# Print the degrees
print(degrees)
# Import matplotlib.pyplot
import matplotlib.pyplot as plt
# Compute the degree centrality of the Twitter network: deg_cent
deg_cent = nx.degree_centrality(A_lcc)
# Plot a histogram of the degree centrality distribution of the graph.
plt.figure()
plt.hist(list(deg_cent.values()))
plt.show()
# Plot a histogram of the degree distribution of the graph
plt.figure()
plt.hist(degrees)
plt.show()
# Plot a scatter plot of the centrality distribution and the degree distribution
plt.figure()
plt.scatter(x=degrees, y=list(deg_cent.values()))
plt.show()
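# A sanity-check sketch of the relationship behind the scatter plot, assuming
# A_lcc is undirected with no self-loops: NetworkX defines degree centrality
# as degree / (n - 1), so the two distributions differ only by that scaling.
import math
for node, deg in zip(A_lcc.nodes(), degrees):
    assert math.isclose(deg_cent[node], deg / (len(A_lcc) - 1))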
# BFS Algorithm
def path_exists(G, node1, node2):
    """
    This function checks whether a path exists between two nodes (node1, node2) in graph G.
    """
    visited_nodes = set()
    queue = [node1]
    for node in queue:
        # Materialize the neighbors so the iterator is not consumed twice
        neighbors = list(G.neighbors(node))
        if node2 in neighbors:
            print('Path exists between nodes {0} and {1}'.format(node1, node2))
            return True
        else:
            visited_nodes.add(node)
            queue.extend([n for n in neighbors if n not in visited_nodes])
        # Check to see if the final element of the queue has been reached
        if node == queue[-1]:
            print('Path does not exist between nodes {0} and {1}'.format(node1, node2))
            # Place the appropriate return statement
            return False
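# A quick sketch checking path_exists() against NetworkX's built-in
# nx.has_path() on a small graph with two components.
H = nx.Graph([(1, 2), (2, 3), (4, 5)])
assert path_exists(H, 1, 3) == nx.has_path(H, 1, 3) == True
assert path_exists(H, 1, 5) == nx.has_path(H, 1, 5) == False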
# Betweenness and degree centrality
# Compute the betweenness centrality of A_lcc: bet_cen
bet_cen = nx.betweenness_centrality(A_lcc)
# Compute the degree centrality of A_lcc: deg_cen
deg_cen = nx.degree_centrality(A_lcc)
# Create a scatter plot of betweenness centrality and degree centrality
plt.scatter(x=list(bet_cen.values()), y=list(deg_cen.values()))
# Display the plot
plt.show()
# Nodes with the highest degree centrality
# Define find_nodes_with_highest_deg_cent()
def find_nodes_with_highest_deg_cent(G):
    # Compute the degree centrality of G: deg_cent
    deg_cent = nx.degree_centrality(G)
    # Compute the maximum degree centrality: max_dc
    max_dc = max(list(deg_cent.values()))
    nodes = set()
    # Iterate over the degree centrality dictionary
    for k, v in deg_cent.items():
        # Check if the current value has the maximum degree centrality
        if v == max_dc:
            # Add the current node to the set of nodes
            nodes.add(k)
    return nodes
# Find the node(s) with the highest degree centrality in A_lcc: top_dc
top_dc = find_nodes_with_highest_deg_cent(A_lcc)
print(top_dc)
# Write the assertion statement
for node in top_dc:
    assert nx.degree_centrality(A_lcc)[node] == max(nx.degree_centrality(A_lcc).values())
# Node with the highest betweenness centrality
# Define find_node_with_highest_bet_cent()
def find_node_with_highest_bet_cent(G):
    # Compute betweenness centrality: bet_cent
    bet_cent = nx.betweenness_centrality(G)
    # Compute maximum betweenness centrality: max_bc
    max_bc = max(list(bet_cent.values()))
    nodes = set()
    # Iterate over the betweenness centrality dictionary
    for k, v in bet_cent.items():
        # Check if the current value has the maximum betweenness centrality
        if v == max_bc:
            # Add the current node to the set of nodes
            nodes.add(k)
    return nodes
# Use that function to find the node(s) with the highest betweenness centrality in the network: top_bc
top_bc = find_node_with_highest_bet_cent(A_lcc)
print(top_bc)
# Write an assertion statement that checks that the node(s) is/are correctly identified.
for node in top_bc:
    assert nx.betweenness_centrality(A_lcc)[node] == max(nx.betweenness_centrality(A_lcc).values())
# Triangles
from itertools import combinations
# Define is_in_triangle()
def is_in_triangle(G, n):
    """
    Checks whether a node `n` in graph `G` is in a triangle relationship or not.
    Returns a boolean.
    """
    in_triangle = False
    # Iterate over all possible triangle relationship combinations
    for n1, n2 in combinations(G.neighbors(n), 2):
        # Check if an edge exists between n1 and n2
        if G.has_edge(n1, n2):
            in_triangle = True
            break
    return in_triangle
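# A minimal cross-check sketch: for an undirected graph, nx.triangles(G, n)
# counts the triangles through node n, so "in at least one triangle" should
# match is_in_triangle().
H = nx.Graph([(1, 2), (2, 3), (1, 3), (3, 4)])
assert is_in_triangle(H, 1) == (nx.triangles(H, 1) > 0) == True
assert is_in_triangle(H, 4) == (nx.triangles(H, 4) > 0) == False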
from itertools import combinations
# Write a function that identifies all nodes in a triangle relationship with a given node.
def nodes_in_triangle(G, n):
    """
    Returns the nodes in a graph `G` that are involved in a triangle relationship with the node `n`.
    """
    triangle_nodes = set([n])
    # Iterate over all possible triangle relationship combinations
    for n1, n2 in combinations(G.neighbors(n), 2):
        # Check if n1 and n2 have an edge between them
        if G.has_edge(n1, n2):
            # Add n1 to triangle_nodes
            triangle_nodes.add(n1)
            # Add n2 to triangle_nodes
            triangle_nodes.add(n2)
    return triangle_nodes
# Write the assertion statement
# assert len(nodes_in_triangle(T, 1)) == 35
print(len(nodes_in_triangle(A_lcc, 2512)))
from itertools import combinations
# Define node_in_open_triangle()
def node_in_open_triangle(G, n):
    """
    Checks whether pairs of neighbors of node `n` in graph `G` are in an 'open triangle' relationship with node `n`.
    """
    in_open_triangle = False
    # Iterate over all possible triangle relationship combinations
    for n1, n2 in combinations(G.neighbors(n), 2):
        # Check if n1 and n2 do NOT have an edge between them
        if not G.has_edge(n1, n2):
            in_open_triangle = True
            break
    return in_open_triangle
# Compute the number of open triangles in T
num_open_triangles = 0
# Iterate over all the nodes in T
for n in T.nodes():
    # Check if the current node is in an open triangle
    if node_in_open_triangle(T, n):
        # Increment num_open_triangles
        num_open_triangles += 1
print(num_open_triangles)
# Define maximal_cliques()
def maximal_cliques(G, size):
    """
    Finds all maximal cliques in graph `G` that are of size `size`.
    """
    mcs = []
    for clique in nx.find_cliques(G):
        if len(clique) == size:
            mcs.append(clique)
    return mcs
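# A usage sketch on a toy graph: a barbell graph of two triangles joined
# directly has exactly two maximal cliques of size 3 and one of size 2
# (the joining edge).
B = nx.barbell_graph(3, 0)
assert len(maximal_cliques(B, 3)) == 2
assert len(maximal_cliques(B, 2)) == 1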
# Subgraph of the neighbors of selected nodes of interest
nodes_of_interest = [29, 38, 42]
# Define get_nodes_and_nbrs()
def get_nodes_and_nbrs(G, nodes_of_interest):
    """
    Returns a subgraph of the graph `G` with only the `nodes_of_interest` and their neighbors.
    """
    nodes_to_draw = []
    # Iterate over the nodes of interest
    for n in nodes_of_interest:
        # Append the nodes of interest to nodes_to_draw
        nodes_to_draw.append(n)
        # Iterate over all the neighbors of node n
        for nbr in G.neighbors(n):
            # Append the neighbors of n to nodes_to_draw
            nodes_to_draw.append(nbr)
    return G.subgraph(nodes_to_draw)
# Extract the subgraph with the nodes of interest: T_draw
T_draw = get_nodes_and_nbrs(T, nodes_of_interest)
# Draw the subgraph to the screen
nx.draw(T_draw)
plt.show()
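# For a single node of interest this matches NetworkX's ego graph (the induced
# subgraph of a node plus everything within a given radius) — a small sketch,
# assuming node 29 is present in T as in the exercise above.
assert set(nx.ego_graph(T, 29, radius=1).nodes()) == set(get_nodes_and_nbrs(T, [29]).nodes())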
# Subgraph based on a metadata attribute
# Extract the nodes of interest: nodes
nodes = [n for n, d in T.nodes(data=True) if d['occupation'] == 'celebrity']
# Create the set of nodes: nodeset
nodeset = set(nodes)
# Iterate over nodes
for n in nodes:
    # Compute the neighbors of n: nbrs
    nbrs = T.neighbors(n)
    # Compute the union of nodeset and nbrs: nodeset
    nodeset = nodeset.union(nbrs)
# Compute the subgraph using nodeset: T_sub
T_sub = T.subgraph(nodeset)
# Draw T_sub to the screen
nx.draw(T_sub)
plt.show()
# Use Case
# Degree Centrality Distribution
# Import necessary modules
import matplotlib.pyplot as plt
import networkx as nx
# Plot the degree centrality distribution of the GitHub collaboration network
plt.hist(list(nx.degree_centrality(A_lcc).values()), bins=50)
plt.show()
# Plot the betweenness centrality distribution of the GitHub collaboration network
plt.hist(list(nx.betweenness_centrality(A_lcc).values()), bins=50)
plt.show()
# Visualisations
# MatrixPlot
# Import necessary modules
from nxviz import MatrixPlot
import matplotlib.pyplot as plt
# Calculate the largest connected component subgraph: largest_ccs
largest_ccs = G.subgraph(max(nx.connected_components(G), key=len))
# Create the customized MatrixPlot object: h
h = MatrixPlot(graph=largest_ccs, node_grouping='grouping')
# Draw the MatrixPlot to the screen
h.draw()
plt.show()
# Arc Plot
# Import necessary modules
from nxviz.plots import ArcPlot
import matplotlib.pyplot as plt
# Iterate over all the nodes in G, including the metadata
for n, d in G.nodes(data=True):
    # Calculate the degree of each node: G.nodes[n]['degree']
    G.nodes[n]['degree'] = nx.degree(G, n)
# Create the ArcPlot object: a
a = ArcPlot(G, node_order='degree')
# Draw the ArcPlot to the screen
a.draw()
plt.show()
# Circos Plot
# Import necessary modules
from nxviz import CircosPlot
import matplotlib.pyplot as plt
# Iterate over all the nodes, including the metadata
for n, d in G.nodes(data=True):
    # Calculate the degree of each node: G.nodes[n]['degree']
    G.nodes[n]['degree'] = nx.degree(G, n)
# Create the CircosPlot object: c
c = CircosPlot(G, node_order='degree', node_grouping='grouping', node_color='grouping')
# Draw the CircosPlot object to the screen
c.draw()
plt.show()
# Cliques
# Calculate the maximal cliques in G: cliques
cliques = nx.find_cliques(G)
# Count and print the number of maximal cliques in G
print(len(list(cliques)))
# Import necessary modules
import networkx as nx
from nxviz import CircosPlot
import matplotlib.pyplot as plt
# Find the author(s) that are part of the largest maximal clique: largest_clique
largest_clique = sorted(nx.find_cliques(G), key=lambda x: len(x))[-1]
# Create the subgraph of the largest_clique: G_lc
G_lc = G.subgraph(largest_clique)
# Create the CircosPlot object: c
c = CircosPlot(G_lc)
# Draw the CircosPlot to the screen
c.draw()
plt.show()
# Recommendation System
# Compute the degree centralities of G: deg_cent
deg_cent = nx.degree_centrality(G)
# Compute the maximum degree centrality: max_dc
max_dc = max(deg_cent.values())
# Find the user(s) that have collaborated the most: prolific_collaborators
prolific_collaborators = [n for n, dc in deg_cent.items() if dc == max_dc]
# Print the most prolific collaborator(s)
print(prolific_collaborators)
# Import necessary modules
from nxviz import ArcPlot
import matplotlib.pyplot as plt
# Identify the largest maximal clique: largest_max_clique
largest_max_clique = set(sorted(nx.find_cliques(G), key=lambda x: len(x))[-1])
# Create a subgraph from the largest_max_clique: G_lmc
G_lmc = G.subgraph(largest_max_clique).copy()
# Go out 1 degree of separation
for node in list(G_lmc.nodes()):
    G_lmc.add_nodes_from(G.neighbors(node))
    G_lmc.add_edges_from((node, nbr) for nbr in G.neighbors(node))
# Record each node's degree centrality score (computed once for the whole subgraph)
deg_cent_lmc = nx.degree_centrality(G_lmc)
for n in G_lmc.nodes():
    G_lmc.nodes[n]['degree centrality'] = deg_cent_lmc[n]
# Create the ArcPlot object: a
a = ArcPlot(G_lmc, node_order='degree centrality')
# Draw the ArcPlot to the screen
a.draw()
plt.show()
# Import necessary modules
from itertools import combinations
from collections import defaultdict
# Initialize the defaultdict: recommended
recommended = defaultdict(int)
# Iterate over all the nodes in G
for n, d in G.nodes(data=True):
    # Iterate over all possible triangle relationship combinations
    for n1, n2 in combinations(G.neighbors(n), 2):
        # Check whether n1 and n2 do not have an edge
        if not G.has_edge(n1, n2):
            # Increment recommended
            recommended[(n1, n2)] += 1
# Identify the top 10 pairs of users
all_counts = sorted(recommended.values())
top10_pairs = [pair for pair, count in recommended.items() if count > all_counts[-10]]
print(top10_pairs)
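# An alternative sketch using collections.Counter: most_common(10) returns the
# ten highest-count pairs directly (ties at the cutoff are broken differently
# than by the threshold approach above).
from collections import Counter
top10_alt = [pair for pair, count in Counter(recommended).most_common(10)]
print(top10_alt)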
# Course II
# Add the degree centrality score of each node to their metadata dictionary
dcs = nx.degree_centrality(G)
for n in G.nodes():
    G.nodes[n]['centrality'] = dcs[n]
# Create the CircosPlot object: c
c = CircosPlot(G, node_color='bipartite', node_grouping='bipartite', node_order='centrality')
# Draw c to the screen
c.draw()
# Display the plot
plt.show()
# Bipartite Graph
# Define get_nodes_from_partition(G, partition)
def get_nodes_from_partition(G, partition):
    # Initialize an empty list for nodes to be returned
    nodes = []
    # Iterate over each node in the graph G
    for n in G.nodes():
        # Check that the node belongs to the particular partition
        if G.nodes[n]['bipartite'] == partition:
            # If so, append it to the list of nodes
            nodes.append(n)
    return nodes
# Print the number of nodes in the 'projects' partition
print(len(get_nodes_from_partition(G, 'projects')))
# Print the number of nodes in the 'users' partition
print(len(get_nodes_from_partition(G, 'users')))
# Import matplotlib
import matplotlib.pyplot as plt
# Get the 'users' nodes: user_nodes
user_nodes = get_nodes_from_partition(G, 'users')
# Compute the degree centralities: dcs
dcs = nx.degree_centrality(G)
# Get the degree centralities for user_nodes: user_dcs
user_dcs = [dcs[n] for n in user_nodes]
# Plot the degree distribution of user_dcs
plt.yscale('log')
plt.hist(user_dcs, bins=20)
plt.show()
# Get the 'projects' nodes: project_nodes
project_nodes = get_nodes_from_partition(G, 'projects')
# Compute the degree centralities: dcs
dcs = nx.degree_centrality(G)
# Get the degree centralities for project_nodes: project_dcs
project_dcs = [dcs[n] for n in project_nodes]
# Plot the degree distribution of project_dcs
plt.yscale('log')
plt.hist(project_dcs, bins=20)
plt.show()
# Function to calculate the set of neighbor nodes shared between two nodes
def shared_partition_nodes(G, node1, node2):
    # Check that the nodes belong to the same partition
    assert G.nodes[node1]['bipartite'] == G.nodes[node2]['bipartite']
    # Get neighbors of node 1: nbrs1
    nbrs1 = G.neighbors(node1)
    # Get neighbors of node 2: nbrs2
    nbrs2 = G.neighbors(node2)
    # Compute the overlap using set intersections
    overlap = set(nbrs1).intersection(nbrs2)
    return overlap
# Print the number of shared repositories between users 'u7909' and 'u2148'
print(len(shared_partition_nodes(G, 'u7909', 'u2148')))
def user_similarity(G, user1, user2, proj_nodes):
    # Check that the nodes belong to the 'users' partition
    assert G.nodes[user1]['bipartite'] == 'users'
    assert G.nodes[user2]['bipartite'] == 'users'
    # Get the set of nodes shared between the two users
    shared_nodes = shared_partition_nodes(G, user1, user2)
    # Return the fraction of nodes in the projects partition
    return len(shared_nodes) / len(proj_nodes)
# Compute the similarity score between users 'u4560' and 'u1880'
project_nodes = get_nodes_from_partition(G, 'projects')
similarity_score = user_similarity(G, 'u4560', 'u1880', project_nodes)
print(similarity_score)
from collections import defaultdict
def most_similar_users(G, user, user_nodes, proj_nodes):
    # Data checks
    assert G.nodes[user]['bipartite'] == 'users'
    # Get other nodes from the user partition
    user_nodes = set(user_nodes)
    user_nodes.remove(user)
    # Create the dictionary: similarities
    similarities = defaultdict(list)
    for n in user_nodes:
        similarity = user_similarity(G, user, n, proj_nodes)
        similarities[similarity].append(n)
    # Compute the maximum similarity score: max_similarity
    max_similarity = max(similarities.keys())
    # Return the list of users that share maximal similarity
    return similarities[max_similarity]
user_nodes = get_nodes_from_partition(G, 'users')
project_nodes = get_nodes_from_partition(G, 'projects')
print(most_similar_users(G, 'u4560', user_nodes, project_nodes))
def recommend_repositories(G, from_user, to_user):
    # Get the set of repositories that from_user has contributed to
    from_repos = set(G.neighbors(from_user))
    # Get the set of repositories that to_user has contributed to
    to_repos = set(G.neighbors(to_user))
    # Identify repositories that from_user is connected to but to_user is not
    return from_repos.difference(to_repos)
# Print the repositories to be recommended
print(recommend_repositories(G, 'u7909', 'u2148'))
# Import networkx
import networkx as nx
# Read in the data: G
G = nx.read_edgelist('american-revolution.edgelist')
# Assign nodes to 'clubs' or 'people' partitions
for n, d in G.nodes(data=True):
    if '.' in n:
        G.nodes[n]['bipartite'] = 'people'
    else:
        G.nodes[n]['bipartite'] = 'clubs'
# Print the edges of the graph
print(G.edges())
# General pattern for filtering nodes on a metadata key:
# [n for n, d in G.nodes(data=True) if d['key'] == 'some_value']
# Prepare the nodelists needed for computing projections: people, clubs
people = [n for n, d in G.nodes(data=True) if d['bipartite'] == 'people']
clubs = [n for n, d in G.nodes(data=True) if d['bipartite'] == 'clubs']
# Compute the people and clubs projections: peopleG, clubsG
peopleG = nx.bipartite.projected_graph(G, people)
clubsG = nx.bipartite.projected_graph(G, clubs)
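# A related sketch: nx.bipartite.weighted_projected_graph() keeps the number of
# shared neighbors as an edge 'weight' attribute, which the plain projection
# discards.
peopleG_w = nx.bipartite.weighted_projected_graph(G, people)
print(list(peopleG_w.edges(data=True))[:5])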
import matplotlib.pyplot as plt
# Plot the degree centrality distribution of both node partitions from the original graph
plt.figure()
original_dc = nx.bipartite.degree_centrality(G, people)
plt.hist(list(original_dc.values()), alpha=0.5)
plt.yscale('log')
plt.title('Bipartite degree centrality')
plt.show()
# Plot the degree centrality distribution of the peopleG graph
plt.figure()
people_dc = nx.degree_centrality(peopleG)
plt.hist(list(people_dc.values()))
plt.yscale('log')
plt.title('Degree centrality of people partition')
plt.show()
# Plot the degree centrality distribution of the clubsG graph
plt.figure()
clubs_dc = nx.degree_centrality(clubsG)
plt.hist(list(clubs_dc.values()))
plt.yscale('log')
plt.title('Degree centrality of clubs partition')
plt.show()