-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_data.py
160 lines (124 loc) · 5.82 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import pandas as pd
import numpy as np
import os
import torch
from torch_geometric.data import Data
def load_nba_data(
    base_path: str = os.getcwd(),
    class_labels: str = 'SALARY',
    sens_attribute: str = 'country'
):
    """
    Load the NBA dataset from the .csv files and prepare it for GNNs.

    Reads ``<base_path>/Datasets/nba/nba.csv`` (one row per node) and
    ``<base_path>/Datasets/nba/nba_relationship.csv`` (one row per edge, with
    ``uid1`` / ``uid2`` columns) and assembles a torch-geometric ``Data``
    object carrying positional 60/20/20 train/validation/test masks.

    Args:
        base_path: Directory that contains the ``Datasets`` folder. ``None``
            resolves to the working directory at call time. The signature
            default, in contrast, is evaluated once when the module is
            imported.
        class_labels: Name of the column holding the prediction target. This
            column is excluded from the node features so the target cannot
            leak into the model input.
        sens_attribute: Name of the column holding the sensitive attribute.
            It stays part of the node features.

    Returns:
        A tuple ``(data, num_classes, num_node_features,
        sens_attribute_tensor, labels_values, sens_values)``:

        * ``data`` -- ``Data`` with ``x``, ``edge_index``, ``y`` and the
          boolean ``train_mask`` / ``val_mask`` / ``test_mask``.
        * ``num_classes`` -- always ``2``.
        * ``num_node_features`` -- ``data.num_node_features``.
        * ``sens_attribute_tensor`` -- ``sens_values`` as a long tensor.
        * ``labels_values`` -- numpy array of the ``class_labels`` column.
        * ``sens_values`` -- numpy array of the ``sens_attribute`` column.

    Raises:
        FileNotFoundError: If either .csv file is missing.
        KeyError: If an expected column is absent from the .csv files.
    """
    # Set the path to the data files
    if base_path is None:
        base_path = os.getcwd()
    input_data_path = os.path.join(base_path, "Datasets", "nba")

    # Load the data files into dataframes
    user_labels = pd.read_csv(os.path.join(input_data_path, "nba.csv"))
    user_edges = pd.read_csv(
        os.path.join(input_data_path, "nba_relationship.csv")
    )

    # Keep only the edges whose two endpoints both appear in the node table
    known_ids = user_labels['user_id']
    user_edges = user_edges[
        user_edges['uid1'].isin(known_ids) & user_edges['uid2'].isin(known_ids)
    ]

    # Node features: every column after the first one (presumably `user_id`
    # -- verify against the .csv layout), minus the prediction target.
    # `errors='ignore'` keeps this step from failing before the label lookup
    # below reports a missing column.
    feature_frame = user_labels.iloc[:, 1:].drop(
        columns=[class_labels], errors='ignore'
    )
    node_features = torch.tensor(feature_frame.values, dtype=torch.float)

    # Translate raw user ids into 0-based row positions. The lookup is built
    # once and applied to an explicit copy, so no assignment is made into a
    # slice of the filtered dataframe.
    id_to_index = dict(zip(user_labels['user_id'], range(len(user_labels))))
    edges = user_edges[['uid1', 'uid2']].copy()
    edges['uid1'] = edges['uid1'].map(id_to_index)
    edges['uid2'] = edges['uid2'].map(id_to_index)

    # edge_index has one column per edge (shape 2 x num_edges)
    edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()

    # Binarise SALARY: -1 becomes 0, while 0 and 1 both become 1.
    # NOTE(review): this always rewrites the 'SALARY' column, regardless of
    # `class_labels`, and merges 0 with 1 -- confirm that this is intended.
    user_labels['SALARY'] = user_labels['SALARY'].map({-1: 0, 0: 1, 1: 1})

    # Create torch-geometric data
    data = Data(x=node_features, edge_index=edge_index)
    num_nodes = node_features.size(0)
    num_classes = 2
    num_node_features = data.num_node_features

    # 60-20-20 train / validation / test split, taken in row order
    # (the rows are not shuffled).
    num_train = int(num_nodes * 0.6)
    num_val = int(num_nodes * 0.8)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[:num_train] = True
    val_mask[num_train:num_val] = True
    test_mask[num_val:] = True
    data.train_mask = train_mask
    data.test_mask = test_mask
    data.val_mask = val_mask

    # Labels from the data
    labels_values = user_labels[class_labels].values
    data.y = torch.tensor(labels_values, dtype=torch.long)

    # Sensitive attribute
    sens_values = user_labels[sens_attribute].values
    sens_attribute_tensor = torch.tensor(sens_values, dtype=torch.long)

    return (
        data,
        num_classes,
        num_node_features,
        sens_attribute_tensor,
        labels_values,
        sens_values,
    )
def load_german_data(
    base_path: str = os.getcwd(),
    class_labels: str = 'GoodCustomer',
    sens_attribute: str = 'Gender'
):
    """
    Load the German dataset from the .csv files and prepare it for GNNs.

    Reads ``<base_path>/Datasets/german/german.csv`` (one row per node) and
    ``<base_path>/Datasets/german/german_edges.csv`` (one row per edge, with
    ``uid1`` / ``uid2`` columns) and assembles a torch-geometric ``Data``
    object carrying positional 60/20/20 train/validation/test masks.

    Preprocessing applied to the node table:

    * ``Gender`` is encoded as ``Female -> 1``, ``Male -> 0``.
    * ``GoodCustomer`` is encoded as ``1 -> 1``, ``-1 -> 0``.
    * A ``user_id`` column equal to the row index is inserted first.
    * The ``PurposeOfLoan`` column is dropped.

    Args:
        base_path: Directory that contains the ``Datasets`` folder. ``None``
            resolves to the working directory at call time. The signature
            default, in contrast, is evaluated once when the module is
            imported.
        class_labels: Name of the column holding the prediction target. This
            column (and ``GoodCustomer``) is excluded from the node features.
        sens_attribute: Name of the column holding the sensitive attribute.
            It stays part of the node features.

    Returns:
        A tuple ``(data, num_classes, num_node_features,
        sens_attribute_tensor, labels_values, sens_values)``:

        * ``data`` -- ``Data`` with ``x``, ``edge_index``, ``y`` and the
          boolean ``train_mask`` / ``val_mask`` / ``test_mask``.
        * ``num_classes`` -- always ``2``.
        * ``num_node_features`` -- ``data.num_node_features``.
        * ``sens_attribute_tensor`` -- ``sens_values`` as a long tensor.
        * ``labels_values`` -- numpy array of the ``class_labels`` column.
        * ``sens_values`` -- numpy array of the ``sens_attribute`` column.

    Raises:
        FileNotFoundError: If either .csv file is missing.
        KeyError: If an expected column is absent from the .csv files.
    """
    # Set the path to the data files
    if base_path is None:
        base_path = os.getcwd()
    input_data_path = os.path.join(base_path, "Datasets", "german")

    # Load the data files into dataframes
    user_labels = pd.read_csv(os.path.join(input_data_path, "german.csv"))
    user_edges = pd.read_csv(os.path.join(input_data_path, "german_edges.csv"))

    # Encode the string/signed columns as 0/1 integers. `infer_objects()`
    # makes the string -> int conversion of `Gender` yield an integer column
    # explicitly instead of relying on pandas' implicit downcasting in
    # `replace`, which is deprecated.
    user_labels['Gender'] = (
        user_labels['Gender'].replace({'Female': 1, 'Male': 0}).infer_objects()
    )
    user_labels['GoodCustomer'] = user_labels['GoodCustomer'].replace(
        {1: 1, -1: 0}
    )

    # The row index doubles as the node id
    user_labels.insert(0, 'user_id', user_labels.index)
    user_labels = user_labels.drop('PurposeOfLoan', axis=1)

    # Keep only the edges whose two endpoints both appear in the node table
    known_ids = user_labels['user_id']
    user_edges = user_edges[
        user_edges['uid1'].isin(known_ids) & user_edges['uid2'].isin(known_ids)
    ]

    # Node features: everything after `user_id`, minus the label column(s).
    # `class_labels` is dropped as well so that a caller-supplied target
    # column never ends up in the model input.
    label_columns = list(dict.fromkeys(['GoodCustomer', class_labels]))
    feature_frame = user_labels.iloc[:, 1:].drop(
        columns=label_columns, errors='ignore'
    )
    node_features = torch.tensor(feature_frame.values, dtype=torch.float)

    # Translate user ids into 0-based row positions. The lookup is built once
    # and applied to an explicit copy, so no assignment is made into a slice
    # of the filtered dataframe.
    id_to_index = dict(zip(user_labels['user_id'], range(len(user_labels))))
    edges = user_edges[['uid1', 'uid2']].copy()
    edges['uid1'] = edges['uid1'].map(id_to_index)
    edges['uid2'] = edges['uid2'].map(id_to_index)

    # edge_index has one column per edge (shape 2 x num_edges)
    edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()

    # Create torch-geometric data
    data = Data(x=node_features, edge_index=edge_index)
    num_nodes = node_features.size(0)
    num_classes = 2
    num_node_features = data.num_node_features

    # 60-20-20 train / validation / test split, taken in row order
    # (the rows are not shuffled).
    num_train = int(num_nodes * 0.6)
    num_val = int(num_nodes * 0.8)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[:num_train] = True
    val_mask[num_train:num_val] = True
    test_mask[num_val:] = True
    data.train_mask = train_mask
    data.test_mask = test_mask
    data.val_mask = val_mask

    # Labels from the data
    labels_values = user_labels[class_labels].values
    data.y = torch.tensor(labels_values, dtype=torch.long)

    # Sensitive attribute
    sens_values = user_labels[sens_attribute].values
    sens_attribute_tensor = torch.tensor(sens_values, dtype=torch.long)

    return (
        data,
        num_classes,
        num_node_features,
        sens_attribute_tensor,
        labels_values,
        sens_values,
    )