-
Notifications
You must be signed in to change notification settings - Fork 0
/
ComparingModelTrainingWithDifferentInputParameters.py
342 lines (244 loc) · 14.2 KB
/
ComparingModelTrainingWithDifferentInputParameters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pylab as graphplot
from collections import Counter
from sklearn.metrics import jaccard_similarity_score
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle
from sklearn.externals import joblib
# Follow edges; later code reads the "follower_id" and "followee_id" columns.
followerFile = pd.read_csv("follows.csv")
# User interests; later code reads the "user_id" and "category" columns.
interestFile = pd.read_csv("interests.csv")
# User pairs that are labelled Target = 0 (no follow relationship) further down.
FollowerFileWithTargetZero = pd.read_csv("TargetZeroList.csv")
def _jaccard_index(first_items, second_items):
    """Return |A & B| / |A | B| for two iterables, or 0 when both are empty."""
    first_set = set(first_items)
    second_set = set(second_items)
    union_size = len(first_set | second_set)
    if union_size == 0:
        return 0
    return float(len(first_set & second_set)) / union_size


def _pair_similarity(neighbour_map, first_user, second_user):
    """Return the Jaccard index of both users' entries in *neighbour_map*.

    The score is 0 when either user has no entry in the mapping.
    """
    # Test membership on the dict itself: on Python 2 ``in d.keys()`` scans a list.
    if first_user in neighbour_map and second_user in neighbour_map:
        return _jaccard_index(neighbour_map[first_user], neighbour_map[second_user])
    return 0


def ComputeJaccardSimilarityIndexes(listToAnalyze, followeeMap=None,
                                    followerMap=None, interestMap=None):
    """Compute three Jaccard similarity scores for every user pair.

    Args:
        listToAnalyze: sequence of rows (for example a 2-D numpy array) whose
            first two items are the ids of the two users to compare. Any
            further items in a row are ignored.
        followeeMap: dict mapping a user id to a collection of ids. Defaults
            to the module-level ``FolloweeList``.
        followerMap: dict mapping a user id to a collection of ids. Defaults
            to the module-level ``FollowerList``.
        interestMap: dict mapping a user id to a collection of interests.
            Defaults to the module-level ``InterestUserList``.

    Returns:
        A tuple ``(followeeScores, followerScores, interestScores)`` of three
        lists, each with one entry per row of ``listToAnalyze``. An entry is
        the intersection size divided by the union size of the two users'
        collections, or 0 when either user is missing from that mapping.
    """
    # Use ``is None`` so that an explicitly passed empty dict is respected.
    if followeeMap is None:
        followeeMap = FolloweeList
    if followerMap is None:
        followerMap = FollowerList
    if interestMap is None:
        interestMap = InterestUserList

    JaccardFolloweeList = []
    JaccardFollowerList = []
    JaccardInterestList = []
    for pair in listToAnalyze:
        first_user, second_user = pair[0], pair[1]
        JaccardFolloweeList.append(_pair_similarity(followeeMap, first_user, second_user))
        JaccardFollowerList.append(_pair_similarity(followerMap, first_user, second_user))
        JaccardInterestList.append(_pair_similarity(interestMap, first_user, second_user))
    return JaccardFolloweeList, JaccardFollowerList, JaccardInterestList
#Populate the training and testing datasets
def populateDatasets(trainSize=40000, randomState=None):
    """Build shuffled training and testing sets from the two labelled frames.

    The first ``trainSize`` rows of ``followerFile`` (Target = 1) and of
    ``FollowerFileWithTargetZero`` (Target = 0) form the training set; all
    remaining rows of both frames form the testing set. Each set is shuffled
    so that the two target values are interleaved.

    The split now starts the testing rows at ``trainSize``. It previously
    started at ``trainSize + 1``, which dropped one row of each frame from
    both sets.

    Args:
        trainSize: number of leading rows taken from each frame for training.
        randomState: forwarded to ``sklearn.utils.shuffle``; pass an int for a
            reproducible shuffle. ``None`` keeps the shuffle non-deterministic.

    Returns:
        ``(TrainTargetList, TestTargetList, TrainList, TestList)`` as pandas
        objects: the target columns first, then the three feature columns.
    """
    # Columns from position 2 onwards are expected to be
    # Jaccard1, Jaccard2, Jaccard3, Target.
    # NOTE(review): this assumes each CSV has exactly two leading id columns - confirm.
    trainingFrame = pd.concat([
        followerFile.iloc[:trainSize, 2:],
        FollowerFileWithTargetZero.iloc[:trainSize, 2:],
    ])
    trainingFrame = shuffle(trainingFrame, random_state=randomState)

    testingFrame = pd.concat([
        followerFile.iloc[trainSize:, 2:],
        FollowerFileWithTargetZero.iloc[trainSize:, 2:],
    ])
    testingFrame = shuffle(testingFrame, random_state=randomState)

    # The first three columns are the features, column 3 is the target.
    TrainList = trainingFrame.iloc[:, :3]
    TrainTargetList = trainingFrame.iloc[:, 3]
    TestList = testingFrame.iloc[:, :3]
    TestTargetList = testingFrame.iloc[:, 3]
    return TrainTargetList, TestTargetList, TrainList, TestList
#This function takes the false positive rate,true positive rate and the roc_area under the curve calculated before. It also takes the name of the model as an argument
def plotRocCurve(fpr, tpr,roc_auc,NameOfModel):
    """Draw a ROC curve with a dashed diagonal reference line and show it.

    Args:
        fpr: false positive rates, plotted on the x axis.
        tpr: true positive rates, plotted on the y axis.
        roc_auc: area under the curve; appended to the legend entry.
        NameOfModel: model description used in the legend and the title.
    """
    stroke = 1
    legendText = ''.join(['ROC curve for ', NameOfModel, ' ', str(roc_auc)])
    titleText = ''.join(['Receiver operating characteristic example for: ', NameOfModel])

    # The model's curve first, then the diagonal from (0, 0) to (1, 1).
    plt.plot(fpr, tpr, lw=stroke, label=legendText)
    plt.plot([0, 1], [0, 1], color='black', lw=stroke, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(titleText)
    plt.legend(loc="lower right")
    plt.show()
##############################################################################################################################################################
# Lookup tables built from the input files.
# The names read the opposite way round from what they hold, so go by the mapping:
#   FollowerList[u]     -> ids of the users that u follows (rows grouped by "follower_id")
#   FolloweeList[u]     -> ids of the users who follow u (rows grouped by "followee_id")
#   InterestUserList[u] -> categories that u is interested in
FollowerList = {userId: rows["followee_id"].tolist() for userId, rows in followerFile.groupby("follower_id")}
FolloweeList = {userId: rows["follower_id"].tolist() for userId, rows in followerFile.groupby("followee_id")}
InterestUserList = {userId: rows["category"].tolist() for userId, rows in interestFile.groupby("user_id")}

# Never populated by this script; kept only so the module namespace is unchanged.
JaccardFolloweeListTargetOne = list()
JaccardFollowerListTargetOne = list()
JaccardFolloweeListTargetZero = list()
JaccardFollowerListTargetZero = list()

# 1. Every row of follows.csv is a pair with target 1, because the follower
#    already follows the followee.
# 2. Every row of TargetZeroList.csv is a pair with target 0.
# 3. The Jaccard indexes are computed for both sets of pairs.
# Copy the frames into numpy arrays in one step rather than appending row by row.
listOfUsersTargetOne = np.array(followerFile.values)
listOfUsersTargetZero = np.array(FollowerFileWithTargetZero.values)

# NOTE(review): the lookup tables above are built from every row of follows.csv,
# so the features of the target-1 pairs are computed on a graph that already
# contains the edge being predicted - confirm this is intended.
#Call the function to find jaccard Similarity index for Target One list
JaccardFolloweeValueTargetOne,JaccardFollowerValueTargetOne,JaccardInterestListTargetOne = ComputeJaccardSimilarityIndexes(listOfUsersTargetOne)
#Compute Jaccard Index for list with no connections in between users
JaccardFolloweeValueTargetZero,JaccardFollowerValueTargetZero,JaccardInterestListTargetZero = ComputeJaccardSimilarityIndexes(listOfUsersTargetZero)

# Append the feature columns and the label to the target 1 frame.
# The column order matters: later code selects these columns by position.
followerFile['JaccardFollowerIndex'] = JaccardFollowerValueTargetOne
followerFile['JaccardFolloweeIndex'] = JaccardFolloweeValueTargetOne
followerFile['JaccardInterestIndex'] = JaccardInterestListTargetOne
targetListForPositiveCases = [1] * len(followerFile) #target = 1 since they follow the followee
followerFile['Target'] = targetListForPositiveCases #Create a target column in the dataframe

# Same columns, in the same order, for the target 0 frame.
FollowerFileWithTargetZero['JaccardFollowerIndex'] = JaccardFollowerValueTargetZero
FollowerFileWithTargetZero['JaccardFolloweeIndex'] = JaccardFolloweeValueTargetZero
FollowerFileWithTargetZero['JaccardInterestIndex'] = JaccardInterestListTargetZero
targetListForNoConnectionCases = [0] * len(FollowerFileWithTargetZero) #target = 0 since they don't follow the followee as per the dataset i.e. no info given
FollowerFileWithTargetZero['Target'] = targetListForNoConnectionCases #Create a target column in the dataframe
########################################################################################################################################################
#Creating datasets for analysis
# populateDatasets() hands back (train targets, test targets, train features,
# test features); each one is converted to a numpy array in that order so the
# model code below can slice feature columns by position.
TrainTargetList, TestTargetList, TrainList, TestList = (
    np.array(datasetPart) for datasetPart in populateDatasets()
)
########################################################################################################################################################
#Model Training below this comment
def _fitAndReportLogistic(trainFeatures, trainTargets, testFeatures, testTargets, scoreLabel, plotLabel):
    """Fit one logistic regression, print its test accuracy and plot its ROC curve.

    Args:
        trainFeatures: 2-D array of training inputs.
        trainTargets: training labels.
        testFeatures: 2-D array of testing inputs with the same columns.
        testTargets: testing labels.
        scoreLabel: text printed in front of the accuracy.
        plotLabel: model name passed on to ``plotRocCurve``.

    Returns:
        ``(model, roc_auc)``: the fitted estimator and its area under the ROC curve.
    """
    model = linear_model.LogisticRegression()
    model.fit(trainFeatures, trainTargets)
    accuracy = model.score(testFeatures, testTargets)
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('%s : %s' % (scoreLabel, accuracy))

    # Column index 1 of predict_proba is the probability of the second class,
    # which is target 1 here because the labels are 0 and 1.
    positiveScores = model.predict_proba(testFeatures)[:, 1]
    fpr, tpr, thresholds = roc_curve(testTargets, positiveScores)
    roc_auc = auc(fpr, tpr)
    plotRocCurve(fpr, tpr, roc_auc, plotLabel)
    return model, roc_auc


# Feature columns by position: 0 = follower index, 1 = followee index,
# 2 = interest index (the order in which the columns were appended above).
# NOTE(review): this assumes each CSV has exactly two leading id columns - confirm.
# Each entry is (column positions or None for every column, printed label,
# plot label or None to reuse the printed label).
LOGISTIC_EXPERIMENTS = [
    ([2], 'Logistic Regression for Interest similarity', None),
    ([0], 'Logistic Regression for Follower Similarity', None),
    ([1], 'Logistic Regression for followee similarity', None),
    ([0, 1], 'Logistic Regression for followee and follower similarity', None),
    ([1, 2], 'Logistic Regression for followee and interest similarity', None),
    ([0, 2], 'Logistic Regression for follower and interest similarity', None),
    (None, 'Logistic Regression for full dataset similarity', 'Logistic Regression for all indexes'),
]

for columnPositions, scoreLabel, plotLabel in LOGISTIC_EXPERIMENTS:
    if columnPositions is None:
        experimentTrain, experimentTest = TrainList, TestList
    else:
        # Indexing with a list keeps the result 2-D, even for a single column.
        experimentTrain = TrainList[:, columnPositions]
        experimentTest = TestList[:, columnPositions]
    if plotLabel is None:
        plotLabel = scoreLabel
    _fitAndReportLogistic(experimentTrain, TrainTargetList, experimentTest, TestTargetList, scoreLabel, plotLabel)
######################