-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitanic_data.py
240 lines (165 loc) · 7.46 KB
/
titanic_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# -*- coding: utf-8 -*-
"""titanic_data.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1G3lZA_zEpd6pS_SSe-vexlRFD98AZYYh
"""
#import the files required
# Colab-only upload widget: prompts the user to pick local files interactively.
# The rest of the script expects 'train.csv' to be among the uploaded files.
from google.colab import files
import io
uploaded =files.upload()
for fn in uploaded.keys():
    # `uploaded` maps filename -> raw bytes of the uploaded file
    print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))
#Load the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#read the data from the file
# Reads the Kaggle Titanic training set uploaded above (current directory).
titanic_data = pd.read_csv('train.csv')
print("Rows, columns: " + str(titanic_data.shape))
# Bare expressions like the ones below only render in a notebook cell;
# they are no-ops when run as a plain script.
titanic_data.dtypes
#Viewing the data
titanic_data.sample(10)
# Missing Values
titanic_data.isna().sum()
# Plotting the percentage of missing values
# Per-column count of nulls, most-missing first
total = titanic_data.isnull().sum().sort_values(ascending = False)
# Same counts expressed as a percentage of the row count
percent_total = (titanic_data.isnull().sum()/titanic_data.isnull().count()).sort_values(ascending=False)*100
missing = pd.concat([total, percent_total], axis=1, keys=["Total", "Percentage"])
# Keep only columns that actually have missing values
missing_data = missing[missing['Total']>0]
missing_data
"""The Age, Cabin and Embarked column have null values.
Roughly 20% of the Age data is missing. The Cabin column has too many missing values, so it will probably be dropped later.
#Data visualization
"""
# --- Target distribution: survived vs not ---
sns.set_style('whitegrid')
sns.countplot(x='Survived',data=titanic_data)
plt.show()
"""Here, 0 represents non-survived and 1 as survived.Few people have survived when compared to non-survived."""
# Survival split by passenger sex
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=titanic_data)
plt.show()
"""More than 400 males didn't survive where as around less than 100 females survived."""
# Survival split by ticket class
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=titanic_data)
plt.show()
"""Passenger class 1 has very less number of people who died but for passenger class 3 there were many people who died."""
# Distribution of siblings/spouses aboard (SibSp)
sns.set_style('whitegrid')
sns.countplot(x='SibSp',data=titanic_data)
plt.show()
"""1. Maximum people didn't have siblings and spouse along with them which is denoted with 0 value.
2. Value 1, nearly 200 people were traveling with spouse.
"""
# Survival split by parents/children aboard (Parch)
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Parch',data=titanic_data)
plt.show()
"""People have 0 parents or children on board are not likely to survive."""
# Survival split by port of embarkation (C/Q/S)
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Embarked',data=titanic_data)
plt.show()
"""People embarked from Southampton (S) are not likely to survive."""
# Age histogram with nulls dropped; sns.displot requires seaborn >= 0.11
sns.displot(titanic_data['Age'].dropna(),kde=False, color='blue', bins=40)
plt.show()
"""1. In the range of 5 to 10 age and beyond 78 age there were very few people.
2. The average age count lies between 16 to 30.
"""
# Correlate Age with the numeric columns to choose an imputation predictor.
# numeric_only=True: at this point the frame still holds object columns
# (Name, Sex, Ticket, Embarked) and DataFrame.corr() raises a TypeError on
# them under pandas >= 2.0 (older pandas silently dropped them).
correlations = titanic_data.corr(numeric_only=True)['Age'].sort_values(ascending=True)
print(correlations*100)
"""Passenger class and age are 36% related to each other.So, let's use Pclass to find the average age of the people."""
#Boxplot of the Age column
sns.set_style('whitegrid')
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=titanic_data)
plt.show()
#Summary of the Age based on Pclass
# Per-class mean/median Age; these motivate the constants in impute_age below.
titanic_data.groupby('Pclass')[['Age']].agg(['mean', 'median'])
def impute_age(cols):
    """Fill a missing Age with a typical age for the passenger's class.

    Parameters
    ----------
    cols : pandas.Series
        A row slice carrying 'Age' and 'Pclass' entries, as produced by
        ``df[['Age','Pclass']].apply(impute_age, axis=1)``.

    Returns
    -------
    float
        The original Age when present; otherwise a class-based estimate
        (38 for 1st class, 29 for 2nd, 25 for 3rd) taken from the
        Pclass/Age summary computed earlier in the script.
    """
    # Label-based access: positional indexing (cols[0] / cols[1]) on a
    # labelled Series is deprecated and raises in recent pandas versions.
    age = cols['Age']
    pclass = cols['Pclass']
    if pd.isnull(age):
        if pclass == 1:
            return 38
        elif pclass == 2:
            return 29
        else:
            return 25
    return age
# Row-wise imputation: each row's (Age, Pclass) pair goes through impute_age.
titanic_data['Age'] = titanic_data[['Age','Pclass']].apply(impute_age,axis=1)
# Cabin is mostly missing (see the missing-values table above), so drop it.
titanic_data.drop('Cabin',axis=1,inplace=True)
# Drop the few remaining rows with nulls (presumably the missing Embarked
# entries — verify against the missing-values table).
titanic_data.dropna(inplace=True)
# Sanity check: should now report zero missing values everywhere.
titanic_data.isna().sum()
"""#correlation """
correlations = titanic_data.corr()['Survived'].sort_values(ascending=False)
correlations.plot(kind='bar')
print(correlations*100)
"""#convert categorical features"""
titanic_data.info()
print(titanic_data['Embarked'].unique())
#Label Encoding for Embarked
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
# NOTE(review): label encoding imposes an arbitrary ordering (C<Q<S) on a
# nominal feature; one-hot encoding would avoid that — confirm intent.
titanic_data['embark']= label_encoder.fit_transform(titanic_data['Embarked'])
print(titanic_data['Sex'].unique())
#create dummies for Sex
# Produces one 0/1 column per category ('female' and 'male'); both are kept.
sex = pd.get_dummies(titanic_data['Sex'])
sex.head()
titanic_data = pd.concat([titanic_data,sex],axis=1)
# Create new feature FamilySize as a combination of SibSp and Parch
# (+1 counts the passenger themselves)
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1
# create another feature called IsAlone based on FamilySize
titanic_data['IsAlone'] = 0
titanic_data.loc[titanic_data['FamilySize'] == 1, 'IsAlone'] = 1
# Mean survival rate per IsAlone group (notebook display only)
titanic_data[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
#later on dropping FamilySize,Parch and SibSp
# create bin for Age features
# Right-closed bins: (0,12] Children, (12,20] Teenage, (20,40] Adult, (40,120] Elder
titanic_data['Age_bin'] = pd.cut(titanic_data['Age'], bins=[0,12,20,40,120], labels=['Children','Teenage','Adult','Elder'])
titanic_data.dtypes
# Re-fitting the shared encoder here maps the four age labels to 0..3
titanic_data['Age_bin']= label_encoder.fit_transform(titanic_data['Age_bin'])
#create new feature based on Name feature
# Extract the honorific ("Mr", "Mrs", ...) — the word that follows a space
# and ends with a literal dot. Raw string: the original non-raw ' ([A-Za-z]+)\.'
# contains the invalid escape '\.', a SyntaxWarning on modern Python.
titanic_data['Title'] = titanic_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(titanic_data['Title'], titanic_data['Sex'])
#We can replace many titles with a more common name or classify them as Rare.
titanic_data['Title'] = titanic_data['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Normalize French / alternate spellings to their common equivalents.
titanic_data['Title'] = titanic_data['Title'].replace('Mlle', 'Miss')
titanic_data['Title'] = titanic_data['Title'].replace('Ms', 'Miss')
titanic_data['Title'] = titanic_data['Title'].replace('Mme', 'Mrs')
# Mean survival rate per title (notebook display only)
titanic_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
#We can convert the categorical titles to ordinal.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
titanic_data['Title'] = titanic_data['Title'].map(title_mapping)
# Any title not in the mapping (unmatched extraction) falls back to 0.
titanic_data['Title'] = titanic_data['Title'].fillna(0)
titanic_data.head()
# create bin for fare features
sns.displot(titanic_data['Fare'],kde=False, color='blue', bins=40)
plt.show()
titanic_data['Fare'].describe()
# Bin Fare into four ranges. Two fixes over plain right-closed bins:
#  - include_lowest=True: otherwise the first bin is (0, 7.89] and the
#    zero-fare passengers become NaN, which astype(str) below would silently
#    encode as a spurious 'nan' category;
#  - np.inf upper edge: the dataset's maximum fare (512.3292) falls above
#    a hard 512.32 edge and would likewise be dropped to NaN.
titanic_data['Fare_bin'] = pd.cut(titanic_data['Fare'], bins=[0,7.89,14.45,31,np.inf], labels=['Low','median','Average','high'], include_lowest=True)
titanic_data['Fare_bin']= label_encoder.fit_transform(titanic_data['Fare_bin'].astype(str))
titanic_data.dtypes
# Drop raw/now-redundant columns; their engineered replacements remain
# (embark, female/male, Age_bin, Fare_bin, FamilySize/IsAlone, Title).
titanic_data.drop(['Sex','Age', 'Embarked', 'Name', 'Ticket', 'PassengerId','Fare','Parch', 'SibSp'],axis=1,inplace=True)
titanic_data.head()
"""#Machine learning"""
# Splitting the dataset
X = titanic_data.drop('Survived', axis=1)
y = titanic_data.Survived
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.4, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.fit_transform(X_val)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
# max_iter raised from the default 100 so lbfgs converges on scaled features
reg_lon=LogisticRegression(max_iter=1000)
reg_lon.fit(X_train,y_train)
# Training accuracy (notebook display only)
reg_lon.score(X_train,y_train)
reg_lon_predict = reg_lon.predict(X_val)
# NOTE(review): args are (y_pred, y_true) — the sklearn convention is
# (y_true, y_pred), though MSE is symmetric so the value is identical.
# On 0/1 labels this RMSE equals sqrt(misclassification rate); an odd
# metric for classification but harmless.
rmse_lon=np.sqrt(mean_squared_error(reg_lon_predict,y_val))
print('RMSE for Logistic Regression:{0:.2f}'.format(rmse_lon))
# Validation accuracy
accuracy_lon = reg_lon.score(X_val,y_val)
print('Accuracy of the Logistic Regression model:',accuracy_lon*100,'%')
"""The LogisticRegression model gave an accurate on the training data was of 76.40 %.Further feature engineering techniques are required to apply on it.
"""