-
Notifications
You must be signed in to change notification settings - Fork 0
/
Converter_UAM.py
115 lines (93 loc) · 4.33 KB
/
Converter_UAM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Converts crawled Last.fm listening-event data into a user-artist matrix (UAM)
__author__ = 'mms'
# Load required modules
import csv
import numpy as np
# Parameters: every input/output file of this script lives in one data directory
_DATA_DIR = "./data"
LE_FILE = _DATA_DIR + "/listening_events.csv"   # input: tab-separated listening events
UAM_FILE = _DATA_DIR + "/UAM.csv"               # output: user-artist-matrix (UAM)
ARTISTS_FILE = _DATA_DIR + "/UAM_artists.csv"   # output: artist names for UAM
USERS_FILE = _DATA_DIR + "/UAM_users.csv"       # output: user names for UAM
# Main program
def read_listening_events(le_file):
    """Count listening events per (user, artist) pair in a listening-events file.

    The file is tab-separated with one header line; column 0 holds the user
    and column 1 the artist. Any further columns are ignored.

    Args:
        le_file: Path of the tab-separated listening-events file.

    Returns:
        A tuple ``(users, artists, play_counts)`` where ``users`` and
        ``artists`` map each name to its row/column index (assigned in order
        of first appearance) and ``play_counts`` maps ``(user, artist)`` to
        the number of rows seen for that pair.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    users = {}
    artists = {}
    play_counts = {}
    with open(le_file, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header. The built-in next() works on Python 2.6+ and 3,
        # and the default keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            user, artist = row[0], row[1]
            # Index = position of first appearance, so ordering does not
            # depend on the dictionary implementation.
            users.setdefault(user, len(users))
            artists.setdefault(artist, len(artists))
            pair = (user, artist)
            play_counts[pair] = play_counts.get(pair, 0) + 1
    return users, artists, play_counts


def build_uam(users, artists, play_counts, verbose=False):
    """Create the (unnormalized) user-artist matrix of play counts.

    Args:
        users: Mapping user name -> row index.
        artists: Mapping artist name -> column index.
        play_counts: Mapping ``(user, artist)`` -> number of listening events.
        verbose: If true, print every triple that is written into the matrix.

    Returns:
        A float32 numpy array of shape ``(len(users), len(artists))``; cells
        for pairs that are not in ``play_counts`` stay 0.
    """
    uam = np.zeros(shape=(len(users), len(artists)), dtype=np.float32)
    # Visit only the pairs that actually occurred (instead of every
    # user x artist combination), in row-major order so that the progress
    # output is deterministic.
    ordered = sorted(play_counts.items(),
                     key=lambda item: (users[item[0][0]], artists[item[0][1]]))
    for (user, artist), count in ordered:
        uam[users[user], artists[artist]] = count
        if verbose:
            print(" ".join(["Inserted into UAM the triple (", user, ", ",
                            artist, ", ", str(count), ")"]))
    return uam


def normalize_rows(uam):
    """Scale every row of ``uam`` so that it sums to 1.

    Rows that sum to 0 are returned unchanged (all zeros) instead of
    producing NaN through a 0/0 division.

    Args:
        uam: 2-D numpy array of play counts (users x artists).

    Returns:
        A new array of the same shape holding, per user, the fraction of
        listening events that went to each artist.
    """
    row_sums = np.sum(uam, axis=1, keepdims=True)  # shape (n_users, 1)
    row_sums[row_sums == 0] = 1
    return uam / row_sums  # broadcasts the per-user sum across the columns


def write_names(path, header, names):
    """Write ``header`` followed by one name per line to the text file ``path``."""
    with open(path, 'w') as outfile:
        outfile.write(header + "\n")
        for name in names:
            outfile.write(name + "\n")


def main():
    """Convert LE_FILE into the normalized UAM plus its artist and user lists."""
    users, artists, play_counts = read_listening_events(LE_FILE)

    # Fill the matrix with play counts, then perform sum-to-1 normalization
    UAM = normalize_rows(build_uam(users, artists, play_counts, verbose=True))

    # Inform user
    print("UAM created. Users: " + str(UAM.shape[0]) + ", Artists: " + str(UAM.shape[1]))

    # Write everything to text file (artist names, user names, UAM); names are
    # written in index order so that line i matches column/row i of the UAM
    write_names(ARTISTS_FILE, 'artist', sorted(artists, key=artists.get))
    write_names(USERS_FILE, 'user', sorted(users, key=users.get))
    np.savetxt(UAM_FILE, UAM, fmt='%0.6f', delimiter='\t', newline='\n')


# Main program
if __name__ == '__main__':
    main()