-
Notifications
You must be signed in to change notification settings - Fork 0
/
titanic_base.py
46 lines (30 loc) · 1.1 KB
/
titanic_base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Baseline model. Kaggle score: 0.77511

# Load the labelled training set from the current working directory.
train_data = pd.read_csv('train.csv')

# Target column: Survived is the value the model predicts.
y = train_data.Survived

# Predictor columns; get_dummies turns the categorical ones into
# indicator columns so the classifier receives numeric input only.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])

# Needed for final. Not needed in accuracy check.
# (Kept disabled: it would bind X_test, which the split below also binds.)
# test_data = pd.read_csv('test.csv')
# X_test = pd.get_dummies(test_data[features])

# Hold out half of the labelled rows so accuracy is measured on data the
# model has not seen; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=1
)

# Build the random forest and fit it on the training half
# (fit returns the fitted estimator, so the calls can be chained).
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
).fit(X_train, y_train)

# Predict on the held-out half and score against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report: header line, then the score on its own line.
print('Base model accuracy score is...', accuracy, sep='\n')