-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdoctorwho_dataset.py
84 lines (74 loc) · 3.46 KB
/
doctorwho_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import networkx as nx
import pandas as pd
def load_simple_graph_with_roles(default_url = True,\
edges_url='datasets/doctorwho.csv',\
roles_url='datasets/roles/doctorwho_roles.csv',\
era='all'):
"""
Read an edgelist and list of nodes info and return a graph with edges in edgelist and nodes attributes as in the nodes info file.
The Doctor Who simple graph is an undirected weighted graph where nodes are characters\
with information about their roles in the series and exist an edge between node i and node j with weight w\
iff i and j appear in the same episode w times.
Parameters
----------
default_url = bool, optional
choose if you want to use default url or not(default is True). It's important to set this bool to False\
if you want to load particular era dataset in no default location
edges_url : str, optional
The csv edgelist location(default is the location in the github repo).
roles_url: str, optional
The csv roles location(default is the location in the github repo)
era: str, optional
The doctor who era you want in the graph(default is all). You can choose 'all','classic','new'.
Returns
-------
networkx.Graph
The doctor who simple graph
"""
if era == 'classic' and default_url:
edges_url = 'datasets/classic-who.csv'
roles_url = 'datasets/roles/classic-who_roles.csv'
if era == 'new' and default_url:
edges_url = 'datasets/new-who.csv'
roles_url = 'datasets/roles/new-who_roles.csv'
df = pd.read_csv(edges_url)
g = nx.from_pandas_edgelist(df,'Source','Target',['Weight'])
roles = pd.read_csv(roles_url).set_index('Id')['Label'].map(lambda x: eval(x)).to_dict()
nx.set_node_attributes(g,roles,'roles')
return g
def load_multigraph_with_roles(default_url = True,\
edges_url='datasets/multigraph/doctorwho_multigraph.csv',\
roles_url='datasets/roles/doctorwho_roles.csv',\
era='all'):
"""
Read an edgelist and list of nodes info and return a multigraph with edges in edgelist, edges and nodes attributes as in the csv files.
The Doctor Who multigraph is an undirected multigraph where nodes are characters\
with information about their roles in the series and exist an edge between node i and node j\
iff i and j appear in the same episode and the edge is marked with the title of the episode.
Parameters
----------
default_url = bool, optional
choose if you want to use default url or not(default is True). It's important to set this bool to False\
if you want to load particular era dataset in no default location
edges_url : str, optional
The csv edgelist location(default is the location in the github repo).
roles_url: str, optional
The csv roles location(default is the location in the github repo)
era: str, optional
The doctor who era you want in the graph(default is all). You can choose 'all','classic','new'.
Returns
-------
networkx.MultiGraph
The doctor who multigraph
"""
if era == 'classic' and default_url:
edges_url = 'datasets/multigraph/classic-who_multigraph.csv'
roles_url = 'datasets/roles/classic-who_roles.csv'
if era == 'new' and default_url:
edges_url = 'datasets/multigraph/new-who_multigraph.csv'
roles_url = 'datasets/roles/new-who_roles.csv'
df = pd.read_csv(edges_url)
g = nx.from_pandas_edgelist(df,'Source','Target',edge_attr='Story',create_using=nx.MultiGraph)
roles = pd.read_csv(roles_url).set_index('Id')['Label'].map(lambda x: eval(x)).to_dict()
nx.set_node_attributes(g,roles,'roles')
return g