-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitanic_data.py
240 lines (165 loc) · 7.46 KB
/
titanic_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# -*- coding: utf-8 -*-
"""titanic_data.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1G3lZA_zEpd6pS_SSe-vexlRFD98AZYYh
"""
#import the files required
# Colab-only upload widget: prompts the user to pick local files interactively.
# The rest of the script expects 'train.csv' to be among the uploaded files.
from google.colab import files
import io
uploaded =files.upload()
for fn in uploaded.keys():
    # `uploaded` maps filename -> raw bytes of the uploaded file
    print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))
#Load the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#read the data from the file
# Reads the Kaggle Titanic training set uploaded above (current directory).
titanic_data = pd.read_csv('train.csv')
print("Rows, columns: " + str(titanic_data.shape))
# Bare expressions like the ones below only render in a notebook cell;
# they are no-ops when run as a plain script.
titanic_data.dtypes
#Viewing the data
titanic_data.sample(10)
# Missing Values
titanic_data.isna().sum()
# Plotting the percentage of missing values
# Per-column count of nulls, most-missing first
total = titanic_data.isnull().sum().sort_values(ascending = False)
# Same counts expressed as a percentage of the row count
percent_total = (titanic_data.isnull().sum()/titanic_data.isnull().count()).sort_values(ascending=False)*100
missing = pd.concat([total, percent_total], axis=1, keys=["Total", "Percentage"])
# Keep only columns that actually have missing values
missing_data = missing[missing['Total']>0]
missing_data
"""The Age, Cabin and Embarked column have null values.
Roughly 20% of the Age data is missing. The Cabin column has too many missing values, so it will probably be dropped later.
#Data visualization
"""
# --- Target distribution: survived vs not ---
sns.set_style('whitegrid')
sns.countplot(x='Survived',data=titanic_data)
plt.show()
"""Here, 0 represents non-survived and 1 as survived.Few people have survived when compared to non-survived."""
# Survival split by passenger sex
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=titanic_data)
plt.show()
"""More than 400 males didn't survive where as around less than 100 females survived."""
# Survival split by ticket class
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=titanic_data)
plt.show()
"""Passenger class 1 has very less number of people who died but for passenger class 3 there were many people who died."""
# Distribution of siblings/spouses aboard (SibSp)
sns.set_style('whitegrid')
sns.countplot(x='SibSp',data=titanic_data)
plt.show()
"""1. Maximum people didn't have siblings and spouse along with them which is denoted with 0 value.
2. Value 1, nearly 200 people were traveling with spouse.
"""
# Survival split by parents/children aboard (Parch)
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Parch',data=titanic_data)
plt.show()
"""People have 0 parents or children on board are not likely to survive."""
# Survival split by port of embarkation (C/Q/S)
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Embarked',data=titanic_data)
plt.show()
"""People embarked from Southampton (S) are not likely to survive."""
# Age histogram with nulls dropped; sns.displot requires seaborn >= 0.11
sns.displot(titanic_data['Age'].dropna(),kde=False, color='blue', bins=40)
plt.show()
"""1. In the range of 5 to 10 age and beyond 78 age there were very few people.
2. The average age count lies between 16 to 30.
"""
# Correlate Age with the numeric columns to choose an imputation predictor.
# numeric_only=True: at this point the frame still holds object columns
# (Name, Sex, Ticket, Embarked) and DataFrame.corr() raises a TypeError on
# them under pandas >= 2.0 (older pandas silently dropped them).
correlations = titanic_data.corr(numeric_only=True)['Age'].sort_values(ascending=True)
print(correlations*100)
"""Passenger class and age are 36% related to each other.So, let's use Pclass to find the average age of the people."""
#Boxplot of the Age column
sns.set_style('whitegrid')
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=titanic_data)
plt.show()
#Summary of the Age based on Pclass
# Per-class mean/median Age; these motivate the constants in impute_age below.
titanic_data.groupby('Pclass')[['Age']].agg(['mean', 'median'])
def impute_age(cols):
    """Fill a missing Age with a typical age for the passenger's class.

    Parameters
    ----------
    cols : pandas.Series
        A row slice carrying 'Age' and 'Pclass' entries, as produced by
        ``df[['Age','Pclass']].apply(impute_age, axis=1)``.

    Returns
    -------
    float
        The original Age when present; otherwise a class-based estimate
        (38 for 1st class, 29 for 2nd, 25 for 3rd) taken from the
        Pclass/Age summary computed earlier in the script.
    """
    # Label-based access: positional indexing (cols[0] / cols[1]) on a
    # labelled Series is deprecated and raises in recent pandas versions.
    age = cols['Age']
    pclass = cols['Pclass']
    if pd.isnull(age):
        if pclass == 1:
            return 38
        elif pclass == 2:
            return 29
        else:
            return 25
    return age
# Row-wise imputation: each row's (Age, Pclass) pair goes through impute_age.
titanic_data['Age'] = titanic_data[['Age','Pclass']].apply(impute_age,axis=1)
# Cabin is mostly missing (see the missing-values table above), so drop it.
titanic_data.drop('Cabin',axis=1,inplace=True)
# Drop the few remaining rows with nulls (presumably the missing Embarked
# entries — verify against the missing-values table).
titanic_data.dropna(inplace=True)
# Sanity check: should now report zero missing values everywhere.
titanic_data.isna().sum()
"""#correlation """
correlations = titanic_data.corr()['Survived'].sort_values(ascending=False)
correlations.plot(kind='bar')
print(correlations*100)
"""#convert categorical features"""
titanic_data.info()
print(titanic_data['Embarked'].unique())
#Label Encoding for Embarked
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
# NOTE(review): label encoding imposes an arbitrary ordering (C<Q<S) on a
# nominal feature; one-hot encoding would avoid that — confirm intent.
titanic_data['embark']= label_encoder.fit_transform(titanic_data['Embarked'])
print(titanic_data['Sex'].unique())
#create dummies for Sex
# Produces one 0/1 column per category ('female' and 'male'); both are kept.
sex = pd.get_dummies(titanic_data['Sex'])
sex.head()
titanic_data = pd.concat([titanic_data,sex],axis=1)
# Create new feature FamilySize as a combination of SibSp and Parch
# (+1 counts the passenger themselves)
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1
# create another feature called IsAlone based on FamilySize
titanic_data['IsAlone'] = 0
titanic_data.loc[titanic_data['FamilySize'] == 1, 'IsAlone'] = 1
# Mean survival rate per IsAlone group (notebook display only)
titanic_data[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
#later on dropping FamilySize,Parch and SibSp
# create bin for Age features
# Right-closed bins: (0,12] Children, (12,20] Teenage, (20,40] Adult, (40,120] Elder
titanic_data['Age_bin'] = pd.cut(titanic_data['Age'], bins=[0,12,20,40,120], labels=['Children','Teenage','Adult','Elder'])
titanic_data.dtypes
# Re-fitting the shared encoder here maps the four age labels to 0..3
titanic_data['Age_bin']= label_encoder.fit_transform(titanic_data['Age_bin'])
#create new feature based on Name feature
# Extract the honorific ("Mr", "Mrs", ...) — the word that follows a space
# and ends with a literal dot. Raw string: the original non-raw ' ([A-Za-z]+)\.'
# contains the invalid escape '\.', a SyntaxWarning on modern Python.
titanic_data['Title'] = titanic_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(titanic_data['Title'], titanic_data['Sex'])
#We can replace many titles with a more common name or classify them as Rare.
titanic_data['Title'] = titanic_data['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Normalize French / alternate spellings to their common equivalents.
titanic_data['Title'] = titanic_data['Title'].replace('Mlle', 'Miss')
titanic_data['Title'] = titanic_data['Title'].replace('Ms', 'Miss')
titanic_data['Title'] = titanic_data['Title'].replace('Mme', 'Mrs')
# Mean survival rate per title (notebook display only)
titanic_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
#We can convert the categorical titles to ordinal.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
titanic_data['Title'] = titanic_data['Title'].map(title_mapping)
# Any title not in the mapping (unmatched extraction) falls back to 0.
titanic_data['Title'] = titanic_data['Title'].fillna(0)
titanic_data.head()
# create bin for fare features
sns.displot(titanic_data['Fare'],kde=False, color='blue', bins=40)
plt.show()
titanic_data['Fare'].describe()
# Bin Fare into four ranges. Two fixes over plain right-closed bins:
#  - include_lowest=True: otherwise the first bin is (0, 7.89] and the
#    zero-fare passengers become NaN, which astype(str) below would silently
#    encode as a spurious 'nan' category;
#  - np.inf upper edge: the dataset's maximum fare (512.3292) falls above
#    a hard 512.32 edge and would likewise be dropped to NaN.
titanic_data['Fare_bin'] = pd.cut(titanic_data['Fare'], bins=[0,7.89,14.45,31,np.inf], labels=['Low','median','Average','high'], include_lowest=True)
titanic_data['Fare_bin']= label_encoder.fit_transform(titanic_data['Fare_bin'].astype(str))
titanic_data.dtypes
# Drop raw/now-redundant columns; their engineered replacements remain
# (embark, female/male, Age_bin, Fare_bin, FamilySize/IsAlone, Title).
titanic_data.drop(['Sex','Age', 'Embarked', 'Name', 'Ticket', 'PassengerId','Fare','Parch', 'SibSp'],axis=1,inplace=True)
titanic_data.head()
"""#Machine learning"""
# Splitting the dataset
X = titanic_data.drop('Survived', axis=1)
y = titanic_data.Survived
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.4, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.fit_transform(X_val)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
# max_iter raised from the default 100 so lbfgs converges on scaled features
reg_lon=LogisticRegression(max_iter=1000)
reg_lon.fit(X_train,y_train)
# Training accuracy (notebook display only)
reg_lon.score(X_train,y_train)
reg_lon_predict = reg_lon.predict(X_val)
# NOTE(review): args are (y_pred, y_true) — the sklearn convention is
# (y_true, y_pred), though MSE is symmetric so the value is identical.
# On 0/1 labels this RMSE equals sqrt(misclassification rate); an odd
# metric for classification but harmless.
rmse_lon=np.sqrt(mean_squared_error(reg_lon_predict,y_val))
print('RMSE for Logistic Regression:{0:.2f}'.format(rmse_lon))
# Validation accuracy
accuracy_lon = reg_lon.score(X_val,y_val)
print('Accuracy of the Logistic Regression model:',accuracy_lon*100,'%')
"""The LogisticRegression model gave an accurate on the training data was of 76.40 %.Further feature engineering techniques are required to apply on it.
"""