# titanic_estimator.py
# pylint: disable=unused-import
# pylint: disable=missing-docstring
# pylint: disable=trailing-newlines
import tensorflow as tf
#import titanic_data


def feature_columns():
    # each passenger has a unique ID, so this feature should be categorical
    passenger_id = tf.feature_column.categorical_column_with_identity(
        key='PassengerId',
        num_buckets=1310
    )
    # DNNClassifier only accepts dense columns, so every categorical column
    # has to be wrapped in either an indicator column or an embedding column
    passenger_id = tf.feature_column.indicator_column(passenger_id)
    # the Family feature has 667 unique values, so we use a hashed column
    family = tf.feature_column.categorical_column_with_hash_bucket(
        key='Family',
        hash_bucket_size=667
    )
    family = tf.feature_column.indicator_column(family)
    # Pclass is a low-dimensional multiclass feature,
    # so a categorical column with a small vocabulary list is a good fit
    p_class = tf.feature_column.categorical_column_with_vocabulary_list(
        key='Pclass',
        vocabulary_list=[1, 2, 3])
    p_class = tf.feature_column.indicator_column(p_class)
    # two-class feature with a vocabulary list
    sex = tf.feature_column.categorical_column_with_vocabulary_list(
        key='Sex',
        vocabulary_list=['male', 'female'])
    sex = tf.feature_column.indicator_column(sex)
    # age is obviously a numeric value, but for this problem
    # it is more effective in training when bucketized into age ranges
    numeric_age = tf.feature_column.numeric_column(
        key='Age',
        dtype=tf.int8)
    age = tf.feature_column.bucketized_column(
        source_column=numeric_age,
        boundaries=[12, 20, 40, 60, 80]
    )
    # a simple numeric column
    family_members = tf.feature_column.numeric_column(
        key='FamilyMembers',
        dtype=tf.int64
    )
    embarked = tf.feature_column.categorical_column_with_vocabulary_list(
        key='Embarked',
        vocabulary_list=['S', 'C', 'Q']
    )
    embarked = tf.feature_column.indicator_column(embarked)
    # again a low-dimensional categorical feature
    deck = tf.feature_column.categorical_column_with_vocabulary_list(
        key='Deck',
        vocabulary_list=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'U']
    )
    deck = tf.feature_column.indicator_column(deck)
    columns = [passenger_id, family, p_class, sex, family_members, age, embarked, deck]
    return columns


def input_fn(features, labels, batch_size):
    """An input function for training."""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Shuffle, repeat, and batch the examples.
    return dataset.shuffle(1000).repeat().batch(batch_size)


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction."""
    features = dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    # Batch the examples.
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)
    # Return the dataset.
    return dataset


#pylint: disable=invalid-name
classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns(),
    hidden_units=[20, 20, 20, 20, 20, 20],
    n_classes=2,
    model_dir="/tmp/titanic_model",
    optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=0.05,
        l1_regularization_strength=0.001,
        l2_regularization_strength=0.001
    )
)
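

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original script): one way the
# classifier above could be trained and evaluated with the input functions in
# this file. The data loading below is an assumption -- the project's own
# `titanic_data` module (commented out at the top) is not available here, so
# this uses pandas with hypothetical CSV paths and assumes the frames already
# contain the engineered columns referenced in feature_columns() plus a
# 'Survived' label column.
if __name__ == '__main__':
    import pandas as pd  # assumption: pandas is available

    train_df = pd.read_csv('train.csv')  # hypothetical path
    eval_df = pd.read_csv('eval.csv')    # hypothetical path
    train_labels = train_df.pop('Survived')
    eval_labels = eval_df.pop('Survived')

    # Train on the shuffled, repeated dataset for a fixed number of steps.
    classifier.train(
        input_fn=lambda: input_fn(train_df, train_labels, batch_size=32),
        steps=2000)

    # Evaluate once over the held-out set and print the metrics dict.
    eval_result = classifier.evaluate(
        input_fn=lambda: eval_input_fn(eval_df, eval_labels, batch_size=32))
    print(eval_result)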