-
Notifications
You must be signed in to change notification settings - Fork 1
/
structs.py
199 lines (156 loc) · 9.51 KB
/
structs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# Standard library
import json
import pdb
from collections import defaultdict
from random import random, shuffle
import random  # NOTE: rebinds `random` to the module (shadowing the function imported above); random.sample() below relies on this

# Third-party
import numpy as np
import pandas as pd
import scipy.sparse
import matplotlib.pyplot as plt  # hoisted: plt is used by the plot_* calls before the original import point
import networkx as nx  # hoisted alongside matplotlib
from tqdm import tqdm

# Local data readers
# NOTE(review): both modules export the same names, so the spotify reader
# silently shadows the apple reader — confirm which dataset is intended.
from apple_datareader import df_interactions_processed, df_items_processed, user_data, item_data, sorted_reindexed_interactions_by_key
from spotify_data_reader import df_interactions_processed, df_items_processed, user_data, item_data, sorted_reindexed_interactions_by_key
'''
In the InteractionGraph we initialize the instance variables for user data, item data, interaction data, and the adjacency matrix.
Keywords:
- Edges --> interactions between the user and item data
- Cold items --> items that are interacted with the least (in our case, any items with 5 interactions)
- Common items --> intersection between the training, testing and validation sets
We use the train edges to train our model, and the test and validation edges to measure the performance of the model.
We use the intersection of the training data with the testing and validation data to make sure the model is evaluated on items it has seen.
We use the bipartite graph to create the user/item nodes and prepare the adjacency matrix for training.
InteractionGraph provides the basic framework of functionality for MusicInteractionGraph.
'''
class InteractionGraph:
    """Base user-item interaction graph.

    Holds the raw user/item/interaction data, the train/validation/test edge
    splits (filled in by a subclass via ``create_data_split``), and a sparse
    bipartite adjacency matrix built from the training edges.
    """

    def __init__(self, user_data, item_data, interaction_data) -> None:
        self.user_data = user_data
        self.item_data = item_data
        self.interaction_data = interaction_data
        # Edge splits are populated later by create_data_split (subclass hook).
        self.train_edges = []
        self.validation_edges = []
        self.test_edges = []
        # Built by create_bipartite_graph; stored as CSR after construction.
        self.adj_matrix: scipy.sparse.dok_matrix = None

    def split_statistics(self):
        """Print coverage/overlap statistics for the three edge splits."""
        items_in_train = set(self.train_edges[:, 1])
        items_in_val = set(self.validation_edges[:, 1])
        items_in_test = set(self.test_edges[:, 1])
        print("Total number of items = {}".format(len(self.item_data)))
        print("Number of items present across training edges = {}".format(len(items_in_train)))
        print("Number of items present across val edges = {}".format(len(items_in_val)))
        print("Number of items present across test edges = {}".format(len(items_in_test)))
        print("Average item degree = {}".format(np.mean(self.item_degrees)))
        print("Average user degree = {}".format(np.mean(self.user_degrees)))
        # Overlap between training items and each evaluation split: items the
        # model has seen during training.
        overlap_with_val = items_in_train.intersection(items_in_val)
        overlap_with_test = items_in_train.intersection(items_in_test)
        print('Number of items common between train and validation edges = {}'.format(len(overlap_with_val)))
        print('Number of items common between train and test edges = {}'.format(len(overlap_with_test)))
        val_ids = np.array(list(items_in_val))
        test_ids = np.array(list(items_in_test))
        # Count how many evaluation items fall in the cold (low-degree) tail.
        cold_in_val = np.sum(self.is_cold[val_ids])
        cold_in_test = np.sum(self.is_cold[test_ids])
        print('Number of cold items in validation set = {}'.format(cold_in_val))
        print('Number of cold items in test set = {}'.format(cold_in_test))

    def create_bipartite_graph(self):
        """Build the symmetric user-item adjacency matrix from the train edges.

        Nodes 0..len(user_data)-1 are users; item nodes follow. Assembled as a
        DOK matrix (cheap incremental writes) and converted to CSR at the end.
        """
        node_count = len(self.user_data) + len(self.item_data)
        adjacency = scipy.sparse.dok_matrix((node_count, node_count), dtype=bool)
        for src, dst in self.train_edges:
            adjacency[src, dst] = 1  # user -> item
            adjacency[dst, src] = 1  # item -> user (undirected graph)
        self.adj_matrix = adjacency.tocsr()

    def compute_tail_distribution(self, warm_threshold):
        """Mark the coldest ``(1 - warm_threshold)`` fraction of items.

        Populates ``is_cold`` (boolean per node id), ``start_item_id``, and the
        per-user / per-item degree arrays derived from the adjacency matrix.
        """
        self.is_cold = np.zeros((self.adj_matrix.shape[0]), dtype=bool)
        self.start_item_id = len(self.user_data)
        self.user_degrees = np.array(self.adj_matrix[:self.start_item_id].sum(axis=1)).flatten()
        self.item_degrees = np.array(self.adj_matrix[self.start_item_id:].sum(axis=1)).flatten()
        cold_count = int((1 - warm_threshold) * len(self.item_degrees))
        # Lowest-degree items first; offset converts item index -> node id.
        cold_node_ids = np.argsort(self.item_degrees)[:cold_count] + self.start_item_id
        self.is_cold[cold_node_ids] = True

    def create_data_split(self):
        """Subclasses must implement the train/validation/test edge split."""
        raise NotImplementedError()
'''
MusicInteractionGraph inherits from the InteractionGraph class. It uses a "warm threshold" parameter to determine
the proportion of items treated as the cold tail of the interaction distribution.
The data split method holds out each user's most recent interaction for testing and the second most recent for validation.
It then creates the train/validation/test edges based on that specific split.
'''
class MusicInteractionGraph(InteractionGraph):
    """Interaction graph with a per-user leave-last-two-out split.

    For each user, the chronologically last interaction becomes the test edge
    and the second-to-last becomes the validation edge; everything earlier is
    training data. ``warm_threshold`` fixes the fraction of items considered
    warm when flagging the cold tail.
    """

    def __init__(self, user_data, item_data, interaction_data, warm_threshold=0.2) -> None:
        super().__init__(user_data, item_data, interaction_data)
        self.create_data_split()
        self.create_bipartite_graph()
        assert 0.0 < warm_threshold < 1.0
        self.warm_threshold = warm_threshold
        self.compute_tail_distribution()

    def create_data_split(self):
        """Split each user's interactions into train/validation/test edges.

        Assumes every user has at least two interactions — TODO confirm the
        readers guarantee this.
        """
        self.all_edges = set()
        self.train_edges_set = set()
        for uid in tqdm(self.interaction_data):
            # Interactions sorted chronologically by their second field.
            history = sorted(self.interaction_data[uid], key=lambda pair: pair[1])
            self.test_edges.append([uid, history[-1][0]])
            self.validation_edges.append([uid, history[-2][0]])
            # The validation edge is recorded in all_edges; the held-out test
            # edge is not — NOTE(review): confirm this asymmetry is intended.
            self.all_edges.add((uid, history[-2][0]))
            for interaction in history[:-2]:
                item_id = interaction[0]
                self.train_edges.append([uid, item_id])
                self.all_edges.add((uid, item_id))
                self.train_edges_set.add((uid, item_id))
        self.train_edges = np.array(self.train_edges)
        self.validation_edges = np.array(self.validation_edges)
        self.test_edges = np.array(self.test_edges)

    def compute_tail_distribution(self):
        """Tail computation using the threshold fixed at construction time."""
        return super().compute_tail_distribution(self.warm_threshold)

    def __getitem__(self, user_id):
        """Return the node ids of the items this user interacted with in training."""
        assert user_id < len(self.user_data), "User ID out of bounds"
        assert isinstance(self.adj_matrix, scipy.sparse.csr_matrix), "Bipartite graph not created: must call create_bipartite_graph first"
        user_row = np.array(self.adj_matrix[user_id, self.start_item_id:].todense()).flatten()
        return user_row.nonzero()[0] + self.start_item_id
# Build the full interaction graph from the loaded reader data; the split and
# adjacency matrix are constructed inside __init__, and the coldest 80% of
# items are flagged (warm_threshold=0.2).
graph = MusicInteractionGraph(user_data, item_data, sorted_reindexed_interactions_by_key, warm_threshold=0.2)
# Sanity-check a few rows from each split and the top of the adjacency matrix.
print("Train edges:\n", graph.train_edges[:15])
print("Validation edges:\n", graph.validation_edges[:5])
print("Test edges:\n", graph.test_edges[:5])
print("Adjacency matrix:\n", graph.adj_matrix[:5])
def plot_adjacency_matrix(graph, max_users=100, max_items=100):
    """Spy-plot a user x item corner of the interaction adjacency matrix.

    Bug fix: the adjacency matrix indexes users first (columns
    0..len(user_data)-1) and items after them, so slicing columns
    ``[:max_items]`` selected the all-zero user-user quadrant and the plot was
    blank. Item columns must be offset by ``graph.start_item_id``.

    Args:
        graph: an InteractionGraph with adj_matrix and start_item_id populated.
        max_users: number of user rows to show.
        max_items: number of item columns to show.
    """
    item_start = graph.start_item_id
    sub_matrix = graph.adj_matrix[:max_users, item_start:item_start + max_items]
    plt.figure(figsize=(12, 8))
    plt.spy(sub_matrix, markersize=1)
    plt.xlabel('Items')
    plt.ylabel('Users')
    plt.title('Adjacency Matrix (User-Item Interactions)')
    plt.show()
def plot_degree_distribution(graph):
    """Histogram of item interaction counts, highlighting the cold items.

    Bug fix: ``MusicInteractionGraph.compute_tail_distribution`` takes no
    arguments (it uses the warm_threshold fixed at construction), so the old
    call ``compute_tail_distribution(warm_threshold=0.2)`` raised TypeError.
    """
    graph.compute_tail_distribution()  # refresh degrees/cold flags with the graph's own threshold
    plt.figure(figsize=(10, 6))
    plt.hist(graph.item_degrees, bins=30, alpha=0.6, color='b', label='All items')
    # is_cold is indexed by node id; slicing off the user entries aligns the
    # boolean mask with item_degrees (one entry per item).
    plt.hist(graph.item_degrees[graph.is_cold[len(graph.user_data):]], bins=30, alpha=0.6, color='r', label='Cold items')
    plt.xlabel('Number of Interactions')
    plt.ylabel('Number of Items')
    plt.title('Distribution of Item Interactions')
    plt.legend()
    plt.show()
# Visualize the adjacency matrix and degree distribution.
# NOTE(review): these calls require matplotlib.pyplot to be bound to `plt`
# before this point — verify the import order at the top of the file.
plot_adjacency_matrix(graph)
plot_degree_distribution(graph)
# graph.split_statistics()
import networkx as nx
import matplotlib.pyplot as plt
# Sample a smaller subset of interactions for visualization (the full graph is
# too dense to draw). NOTE(review): random.sample raises ValueError if there
# are fewer than 10 users / 50 items — confirm dataset size.
sampled_users = random.sample(list(user_data.keys()), 10) # Sample 10 users
sampled_items = random.sample(list(item_data.keys()), 50) # Sample 50 items
# Keep only the sampled users' interaction lists.
sampled_interaction_data = {user: interactions for user, interactions in sorted_reindexed_interactions_by_key.items() if user in sampled_users}
# (user, item) pairs restricted to the sampled items. NOTE(review): assumes the
# item ids inside the interaction tuples share the id space of item_data's keys,
# and that each interaction is a 2-tuple — confirm against the reader module.
sampled_edges = [(user, item) for user in sampled_interaction_data for item, _ in sampled_interaction_data[user] if item in sampled_items]
# Plotting the interaction graph using NetworkX
B = nx.Graph()
# Add nodes with the node attribute "bipartite" (0 = user side, 1 = item side)
B.add_nodes_from(sampled_users, bipartite=0)
B.add_nodes_from(sampled_items, bipartite=1)
# Add edges (interactions)
B.add_edges_from(sampled_edges)
# Plot the bipartite graph with users in one column and items in the other.
pos = nx.drawing.layout.bipartite_layout(B, sampled_users)
plt.figure(figsize=(10, 8))
nx.draw(B, pos, with_labels=True, node_color=['skyblue' if node in sampled_users else 'lightgreen' for node in B.nodes()], node_size=500, font_size=10)
plt.title('Sampled Interaction Graph (Bipartite Layout)')
plt.show()