-
Notifications
You must be signed in to change notification settings - Fork 0
/
ClusteringTitanicDataset.py
397 lines (276 loc) · 14.8 KB
/
ClusteringTitanicDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import random #to generate initial random position of cluster centroid
# Load the passenger records once. The context manager closes the file handle
# as soon as parsing is finished instead of leaving it open for the whole run.
with open('titanic.json', 'r') as f:
    d = json.load(f)  # indexed below as d[i][<field name>], one record per passenger

# passengerDetails holds every encoded input per passenger, in column order:
# fare, age, family member count, embarked code, gender code.
# It is not used for clustering; it is kept so that all inputs can be plotted if needed.
passengerDetails = np.zeros((len(d), 5))

# MeaningfulInputVector holds only the inputs considered meaningful plus the target,
# in column order: age, family member count, gender code, survived flag.
MeaningfulInputVector = np.zeros((len(d), 4))

# Survival flag per passenger (0 = died, 1 = survived).
Target = np.zeros(len(d))

# Median age used to fill in missing ages; starts at 0.
medianAge = 0.0
#Calculate the median values for missing values of age
def calculateMedianOfAge():
    """Compute the median of all known ages and store it in the global medianAge.

    Records whose 'Age' field is the empty string are treated as missing and
    skipped. The result is written to the module-level ``medianAge`` (the
    value populateDataAndTarget uses for imputation) and also returned.

    Returns:
        The median age as a float. If no record carries an age, the current
        value of ``medianAge`` is left untouched and returned.
    """
    # Without this declaration the assignment below would only create a local
    # variable and the module-level medianAge would never change.
    global medianAge

    # Builtin float() replaces the np.float alias, which newer NumPy releases removed.
    knownAges = [float(record['Age']) for record in d if record['Age'] != '']

    if knownAges:  # np.median of an empty list would yield nan
        medianAge = float(np.median(knownAges))
    return medianAge
####################################################
'''
#code to test if location has a missing value.
for i in range(0,len(d)):
if (d[i]['Embarked']) == '':
print 'Missing Location for :',d[i]['PassengerName']
#code to test if age has a missing value.
for i in range(0,len(d)):
if (d[i]['Age']) == '':
print 'Missing Age for :',d[i]['PassengerName']
'''
####################################################
def populateDataAndTarget():
    """Fill passengerDetails, MeaningfulInputVector and Target from the records in d.

    Per passenger row:
      passengerDetails      -> fare, age, family member count, embarked code, gender code
      MeaningfulInputVector -> age, family member count, gender code, survived flag
      Target                -> survived flag (0 = died, 1 = survived)

    A missing age (empty string) is replaced by the module-level medianAge.
    A missing embarked location is encoded like 'Q'. Values that match none of
    the known codes leave the corresponding cell as it was.
    """
    # Numeric encodings for the string-valued fields.
    embarkedCodes = {'C': 1, 'Q': 2, 'S': 3, '': 2}  # '' (missing) is treated as 'Q'
    genderCodes = {'male': 0, 'female': 1}

    calculateMedianOfAge()

    for row, passenger in enumerate(d):
        details = passengerDetails[row]
        meaningful = MeaningfulInputVector[row]

        details[0] = passenger['Fare']

        # Age, with missing values imputed.
        age = passenger['Age']
        details[1] = medianAge if age == '' else age
        meaningful[0] = details[1]

        # Family members travelling along.
        details[2] = passenger['ParentsAndChildren'] + passenger['SiblingsAndSpouses']
        meaningful[1] = details[2]

        # Embarked location.
        port = passenger['Embarked']
        if port in embarkedCodes:
            details[3] = embarkedCodes[port]

        # Gender.
        sex = passenger['Sex']
        if sex in genderCodes:
            details[4] = genderCodes[sex]
        meaningful[2] = details[4]

        # The target is stored next to the inputs so the plots can relate
        # gender to survival.
        Target[row] = passenger['Survived']
        meaningful[3] = Target[row]
def normaliseInputs():
    """Min-max scale every input column of MeaningfulInputVector to [0, 1] in place.

    The last column holds the target and is left untouched. A column whose
    values are all identical has no range to scale by; it is set to 0.0
    rather than dividing by zero. An empty array is left as is.
    """
    if MeaningfulInputVector.size == 0:
        return

    # The "- 1" has to be applied to the column count. The earlier
    # len(row - 1) subtracted 1 from the row values and so still counted
    # the target column.
    inputColumnCount = MeaningfulInputVector.shape[1] - 1

    for col in range(inputColumnCount):
        column = MeaningfulInputVector[:, col]  # a view: writes go to the global array
        lowest = column.min()
        spread = column.max() - lowest
        if spread == 0:
            column[:] = 0.0
        else:
            column[:] = (column - lowest) / float(spread)
#Perform hierarchical clustering
def plotDendogram():
    """Draw a Ward-linkage dendrogram of the three input columns and show it.

    Only columns 0..2 of MeaningfulInputVector are clustered; column 3 holds
    the target. A horizontal line at y=4.5 marks the chosen cut-off.
    """
    inputColumns = MeaningfulInputVector[:, 0:3]
    mergeTree = linkage(inputColumns, method='ward', metric='euclidean')

    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Titanic Data Set')
    plt.ylabel('distance')

    # Cluster threshold: the height at which the dendrogram is cut.
    plt.axhline(y=4.5)

    leafLabelStyle = {
        'leaf_rotation': 90.,   # rotates the x axis labels
        'leaf_font_size': 8.,   # font size for the x axis labels
    }
    dendrogram(mergeTree, **leafLabelStyle)

    plt.show()
def performKmeans():
    """Run 10 rounds of k-means with 3 clusters on MeaningfulInputVector.

    Each round assigns every row to its nearest centroid, plots the resulting
    clusters together with the current centroids, and then moves the
    centroids to the mean of their members. After the last round the final
    assignment is plotted again, colour-coded by gender and survival.
    """
    # Three centroids with three random coordinates each, drawn from [0, 1]:
    #   ClusterCentroid[k][0] - age
    #   ClusterCentroid[k][1] - family member count
    #   ClusterCentroid[k][2] - gender
    ClusterCentroid = [[random.uniform(0, 1) for _ in range(3)] for _ in range(3)]

    # Scratch list that computeDistance fills with one distance per centroid.
    Dist = [0.0] * 3

    for _ in range(10):
        # One member list per cluster, rebuilt from scratch every round.
        cluster = [[] for _ in range(3)]
        for point in MeaningfulInputVector:
            computeDistance(point, ClusterCentroid, Dist)
            nearest = FindClusterIndex(Dist)
            cluster[nearest].append(point)

        # One plot per round, showing the centroids before they are moved.
        plotCluster(cluster, ClusterCentroid)
        computeClusterCentroid(ClusterCentroid, cluster)

    # Final view of the last assignment, coloured by target value.
    plotClustersBasedOnTarget(cluster, ClusterCentroid)
def computeDistance(InputList, ClusterCentroid, Dist, numFeatures=3):
    """Write the squared Euclidean distance from a point to every centroid into Dist.

    Only the first ``numFeatures`` coordinates take part (by default age,
    family member count and gender). Anything after them, such as the target
    stored in position 3, is ignored. No square root is taken: the values are
    only compared with each other to find the nearest centroid, and squaring
    does not change that order.

    Args:
        InputList: the point; indexable, at least ``numFeatures`` long.
        ClusterCentroid: sequence of centroids, each indexable like InputList.
        Dist: output list, modified in place. Dist[k] receives the distance to
            ClusterCentroid[k]. It must have at least one slot per centroid.
        numFeatures: number of leading coordinates to compare.

    Returns:
        None. The result is delivered through Dist.
    """
    for k, centroid in enumerate(ClusterCentroid):
        Dist[k] = sum(
            (InputList[f] - centroid[f]) ** 2 for f in range(numFeatures)
        )
def FindClusterIndex(Dist):
    """Return the index of the smallest entry in Dist.

    When several entries tie for the minimum, the lowest index wins. An index
    is always returned for a non-empty Dist; the previous equality scan could
    fall through and return None when no entry compared equal to the minimum.

    Args:
        Dist: non-empty indexable sequence of distances, one per cluster.

    Returns:
        int index of the nearest cluster.

    Raises:
        ValueError: if Dist is empty.
    """
    return min(range(len(Dist)), key=lambda k: Dist[k])
def computeClusterCentroid(ClusterCentroid,cluster):
    """Move each centroid to the mean of the points assigned to its cluster.

    ClusterCentroid is updated in place. A cluster without members keeps its
    previous centroid, which also avoids dividing by zero.
    """
    for position, members in enumerate(cluster):
        memberCount = len(members)
        if memberCount == 0:
            continue
        # Element-wise sum of the member rows divided by how many there are.
        ClusterCentroid[position] = sum(members) / float(memberCount)
def plotCluster(cluster,ClusterCentroid):
    """Show a 3D scatter plot of every cluster's points and its centroid.

    Axes are age (x), family member count (y) and gender (z). Each cluster
    gets its own colour; its centroid is drawn as a large 'x' in that colour.

    Args:
        cluster: one list of points per cluster; each point is indexable with
            the three plotted coordinates in positions 0..2.
        ClusterCentroid: one centroid per cluster, indexable the same way.
    """
    colorIndex = ['b', 'g', 'm']

    # Create the 3D axes through the figure so they are actually attached to
    # it. Constructing Axes3D(fig) directly no longer adds the axes to the
    # figure on current matplotlib releases, which leaves the window blank.
    ax = plt.gcf().add_subplot(111, projection='3d')

    for i, members in enumerate(cluster):
        # Wrap around so more than three clusters can still be drawn.
        color = colorIndex[i % len(colorIndex)]

        if len(members):
            xs = [point[0] for point in members]
            ys = [point[1] for point in members]
            zs = [point[2] for point in members]
            ax.scatter(xs, ys, zs, c=color)

        # NOTE(review): the file's indentation was lost, so it is unclear
        # whether the centroid marker was meant to be skipped for empty
        # clusters. It is drawn unconditionally here - confirm.
        centroid = ClusterCentroid[i]
        ax.scatter(centroid[0], centroid[1], centroid[2],
                   marker='x', s=100, color=color)

    plt.title("Clustering of input vectors")
    ax.set_xlabel('X intercept - Age')
    ax.set_ylabel('Y intercept - Number of Members')
    ax.set_zlabel('Z intercept - Gender')
    plt.show()
def plotClustersBasedOnTarget(cluster,ClusterCentroid):
    """Plot all clustered points colour-coded by gender and survival, then print counts.

    Every point from every cluster is sorted into one of four groups using its
    target (position 3, 1 = survived) and gender (position 2, 1 = female):

        women survived - green      men survived - blue
        women dead     - red        men dead     - black

    The groups are drawn in one 3D scatter plot (age, family member count,
    gender). Comparing the colours shows which gender was favoured in the
    rescue. After the plot window is closed, the size of each group is printed.

    Args:
        cluster: one list of points per cluster; each point is indexable with
            age, member count, gender and target in positions 0..3.
        ClusterCentroid: accepted for symmetry with plotCluster; not used.
    """
    # Create the 3D axes through the figure so they are actually attached to
    # it. Constructing Axes3D(fig) directly no longer adds the axes to the
    # figure on current matplotlib releases, which leaves the window blank.
    ax = plt.gcf().add_subplot(111, projection='3d')

    # (survived, female) -> (x values, y values, z values)
    groups = {
        (True, True): ([], [], []),
        (True, False): ([], [], []),
        (False, True): ([], [], []),
        (False, False): ([], [], []),
    }

    for members in cluster:
        for point in members:
            survived = int(point[3]) == 1
            female = int(point[2]) == 1
            xs, ys, zs = groups[(survived, female)]
            xs.append(point[0])
            ys.append(point[1])
            zs.append(point[2])

    # Drawing order and colours: survived women, survived men, dead women, dead men.
    palette = (
        ((True, True), 'green'),
        ((True, False), 'blue'),
        ((False, True), 'red'),
        ((False, False), 'black'),
    )
    for key, color in palette:
        xs, ys, zs = groups[key]
        ax.scatter(xs, ys, zs, c=color)

    plt.title('Gender bias in survival operations')
    plt.figtext(0.80, 0.09, 'Green = women survived', color='green')
    plt.figtext(0.80, 0.07, 'blue = men survived', color='blue')
    plt.figtext(0.80, 0.05, 'red = women dead', color='red')
    # This label used to read 'blue = men dead' although the series is black.
    plt.figtext(0.80, 0.03, 'black = men dead', color='black')
    ax.set_xlabel('X intercept - Age')
    ax.set_ylabel('Y intercept - Number of Members')
    ax.set_zlabel('Z intercept - Gender')
    plt.show()

    # Single-argument print(...) gives the same output on Python 2 and 3. The
    # double space matches what the former "print 'label : ', n" statements emitted.
    print('Number of men rescued :  %d' % len(groups[(True, False)][0]))
    print('Number of men Dead :  %d' % len(groups[(False, False)][0]))
    print('Number of women rescued :  %d' % len(groups[(True, True)][0]))
    print('Number of women dead :  %d' % len(groups[(False, True)][0]))
#The code below checks whether the cluster centroids changed between iterations. It is not used now since the distances never converge.
'''
def CheckIfAnyChangeInCentroid(ClusterCentroidOld,ClusterCentroid):
cluster1Same = False
cluster2Same = False
cluster3Same = False
cluster4Same = False
cluster5Same = False
if(ClusterCentroid[0][1] == ClusterCentroidOld[0][1] and ClusterCentroid[0][2] == ClusterCentroidOld[0][2] and ClusterCentroid[0][4] == ClusterCentroidOld[0][4]):
cluster1Same = True
if (ClusterCentroid[1][1] == ClusterCentroidOld[1][1] and ClusterCentroid[1][2] == ClusterCentroidOld[1][2] and ClusterCentroid[1][4] == ClusterCentroidOld[1][4]):
cluster2Same = True
if (ClusterCentroid[2][1] == ClusterCentroidOld[2][1] and ClusterCentroid[2][2] == ClusterCentroidOld[2][2] and ClusterCentroid[2][4] == ClusterCentroidOld[2][4]):
cluster3Same = True
if(ClusterCentroid[3][1] == ClusterCentroidOld[3][1] and ClusterCentroid[3][2] == ClusterCentroidOld[3][2] and ClusterCentroid[3][4] == ClusterCentroidOld[3][4]):
cluster3Same = True
if (ClusterCentroid[4][1] == ClusterCentroidOld[4][1] and ClusterCentroid[4][2] == ClusterCentroidOld[4][2] and ClusterCentroid[4][4] == ClusterCentroidOld[4][4]):
cluster4Same = True
if(cluster1Same and cluster2Same and cluster3Same and cluster4Same and cluster5Same):
print 'yes'
return True
else:
print 'no'
return False
'''
###################################################################
#below this comment, all the functions are called
#Step1: populate data into arrays
#step2: normalise the inputs
#step3: plot dendogram using inbuilt library.select a cutoff
#step4: do k means analysis
# step4a: initiate a cluster with 2 in it...Each array will store the point assigned to that cluster
# step4b: initiate clustercentroid list which contains centroids generated randomly
# step4c: calculate distance of each point in the dataset and assign it to corresponding cluster
# step4d: plot cluster
# step4e: recalculate cluster centroid
#step5: Plot the cluster and calculate the centroid's new locations after adding the points to the cluster.
#step6: calculate distances of points from the new centroids and change the assignment based on distance
#step8: repeat this for 10 times and then plot for every iteration
#step9: after we have the cluster. check if the survival operation was more biased towards a specific gender
#step10: color code the points in the cluster based on gender and their survival to get a better idea.
#####################################################################################################
#CALLING ALL THE FUNCTIONS BELOW THIS IN THE ORDER
#call populate function for populating dataset and Target
# Run the whole analysis, one stage after the other:
#   1. populate the data arrays and the target
#   2. normalise the inputs to the range 0 to 1
#   3. plot the dendrogram
#   4. perform k-means clustering
for pipelineStage in (populateDataAndTarget, normaliseInputs, plotDendogram, performKmeans):
    pipelineStage()