# Written by kmr0877 for COMP9444
import tensorflow as tf
import numpy as np
import glob  # useful when reading the review files from disk
import re
import tarfile

batch_size = 50
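

# The raw reviews are assumed to arrive as a tarball (hence the tarfile
# import above). A minimal extraction sketch; the archive name
# "reviews.tar.gz" is an assumption, not something this file confirms:
def extract_reviews(archive="reviews.tar.gz"):
    """Unpack the review archive into the working directory if the pos/
    and neg/ folders are not already present."""
    if not glob.glob("pos/*.txt"):
        with tarfile.open(archive, "r:gz") as tar:
            tar.extractall()
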
def load_data(glove_dict):
"""
Take reviews from text files, vectorize them, and load them into a
numpy array. Any preprocessing of the reviews should occur here. The first
12500 reviews in the array should be the positive reviews, the 2nd 12500
reviews should be the negative reviews.
RETURN: numpy array of data with each row being a review in vectorized
form"""
    def read_and_clean_data(path):
        """Read one review, strip punctuation, and map its first 40 words
        to GloVe indices (index 0 for unknown words), zero-padding short
        reviews out to length 40."""
        with open(path, "r") as o:
            text = o.read()
        punc_rem = re.sub(r'[^\w\s]', '', text)
        words = punc_rem.split()[:40]
        vec_words = [glove_dict.get(word.lower(), 0) for word in words]
        vec_words += [0] * (40 - len(vec_words))  # pad short reviews
        return vec_words
    data = []
    folders = ["pos", "neg"]  # positive reviews first, as required above
    for folder in folders:
        for path in glob.glob(folder + "/*.txt"):
            vectorized_sent = read_and_clean_data(path)
            data.append(vectorized_sent)
    return np.array(data)
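

# A small sanity check for the array load_data produces -- a sketch, assuming
# the standard 12500 positive + 12500 negative reviews described above:
def _check_data(data):
    assert data.shape == (25000, 40), data.shape  # one row of 40 indices per review
    assert data.dtype.kind in "iu", data.dtype    # integer word indices
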
def load_glove_embeddings():
"""
Load the glove embeddings into a array and a dictionary with words as
keys and their associated index as the value. Assumes the glove
embeddings are located in the same directory and named "glove.6B.50d.txt"
RETURN: embeddings: the array containing word vectors
word_index_dict: a dictionary matching a word in string form to
its index in the embeddings array. e.g. {"apple": 119"}
"""
with open("glove.6B.50d.txt",'r') as o:
embeddings = o.readlines()
words_list = map(lambda x: x.split(" ", 1), embeddings)
a, b = zip(*words_list)
word_index_dict = dict(zip(a, range(len(a))))
# if you are running on the CSE machines, you can load the glove data from here
# data = open("/home/cs9444/public_html/17s2/hw2/glove.6B.50d.txt",'r',encoding="utf-8")
glove_array = np.array(list(map(lambda x: x.split(), b)))
glove_array = glove_array.astype(np.float32)
return glove_array, word_index_dict
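

# For reference, a minimal sketch of parsing one GloVe line in isolation --
# the same "<word> <50 floats>" format load_glove_embeddings parses in bulk:
def _parse_glove_line(line):
    word, rest = line.split(" ", 1)
    return word, np.array(rest.split(), dtype=np.float32)  # word, (50,) vector
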
def define_graph(glove_embeddings_arr):
"""
Define the tensorflow graph that forms your model. You must use at least
one recurrent unit. The input placeholder should be of size [batch_size,
40] as we are restricting each review to it's first 40 words. The
following naming convention must be used:
Input placeholder: name="input_data"
labels placeholder: name="labels"
accuracy tensor: name="accuracy"
loss tensor: name="loss"
RETURN: input placeholder, labels placeholder, optimizer, accuracy and loss
tensors"""
    max_document_length = 40
    sequence_length = max_document_length
    rnn_data_classes = 2
    rnn_cell_size = 100
    rnn_data_embedding_size = 50  # glove.6B.50d vectors are 50-dimensional
    rnn_data_vec_size = rnn_data_embedding_size
    rnn_lstm_forget_bias = 1.0
    rnn_dropout_keep_prob = 0.5  # training-time value to feed the dropout placeholder
    rnn_learning_rate = 0.001
    vocab_size = len(glove_embeddings_arr)
    with tf.device("/cpu:0"):
        # Placeholders for the input word indices, the one-hot labels, and
        # the dropout keep probability (defaults to 1.0, i.e. no dropout,
        # so evaluation runs need not feed it).
        labels = tf.placeholder(tf.int32, [None, rnn_data_classes], name="labels")
        input_data = tf.placeholder(tf.int32, [None, max_document_length],
                                    name="input_data")
        dropout_keep_prob = tf.placeholder_with_default(1.0, [])
        # Weights and biases for the linear layer on the last LSTM output.
        weights = tf.Variable(tf.random_normal([rnn_cell_size, rnn_data_classes]))
        biases = tf.Variable(tf.random_normal([rnn_data_classes]))
        # Look up the GloVe vector for every word index in the batch:
        # (batch, 40) int32 -> (batch, 40, 50) float32.
        embedded_data = tf.nn.embedding_lookup(glove_embeddings_arr, input_data)
        # Use the placeholder rather than the 0.5 constant so dropout is only
        # applied when a keep probability is actually fed, i.e. during training.
        embedded_data_dropout = tf.nn.dropout(embedded_data, dropout_keep_prob)
        # LSTM cell with dropout applied to its outputs.
        rnn_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_cell_size, forget_bias=rnn_lstm_forget_bias)
        rnn_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(rnn_lstm_cell, output_keep_prob=dropout_keep_prob)
        rnn_data_X = embedded_data_dropout
        # static_rnn expects a list of sequence_length tensors, each of shape
        # (batch_size, rnn_data_vec_size), so rearrange the embedded batch:
        # 1. permute batch_size and sequence_length: (batch, 40, 50) -> (40, batch, 50)
        rnn_data_X1 = tf.transpose(rnn_data_X, [1, 0, 2])
        # 2. flatten to (sequence_length * batch_size, rnn_data_vec_size)
        rnn_data_X2 = tf.reshape(rnn_data_X1, [-1, rnn_data_vec_size])
        # 3. split into a list of sequence_length tensors of shape
        #    (batch_size, rnn_data_vec_size)
        rnn_data_X_split = tf.split(rnn_data_X2, sequence_length, 0)
        # Run the LSTM over the 40 time steps; keep only the last output.
        outputs, states = tf.nn.static_rnn(rnn_lstm_cell, rnn_data_X_split, dtype=tf.float32)
        logits = tf.matmul(outputs[-1], weights) + biases
        # Loss and optimizer; the spec requires the loss tensor to be named "loss".
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels),
            name="loss")
        optimizer = tf.train.AdamOptimizer(learning_rate=rnn_learning_rate).minimize(loss)
        # Evaluate the model.
        correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
    return input_data, labels, dropout_keep_prob, optimizer, accuracy, loss
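

if __name__ == "__main__":
    # A minimal end-to-end sketch of how the pieces above might be wired
    # together; this is illustrative, not part of the marked assignment
    # interface. Assumes glove.6B.50d.txt and the pos/ and neg/ folders are
    # present, and that load_data yields 12500 positive then 12500 negative
    # reviews, as its docstring requires.
    glove_array, glove_dict = load_glove_embeddings()
    data = load_data(glove_dict)
    onehot_labels = np.concatenate([
        np.tile([1, 0], (12500, 1)),  # positive reviews
        np.tile([0, 1], (12500, 1)),  # negative reviews
    ])
    input_ph, labels_ph, keep_prob_ph, train_op, accuracy, loss = define_graph(glove_array)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(100):
            batch = np.random.choice(len(data), batch_size, replace=False)
            feed = {input_ph: data[batch],
                    labels_ph: onehot_labels[batch],
                    keep_prob_ph: 0.5}  # training-time keep probability
            _, batch_loss, batch_acc = sess.run([train_op, loss, accuracy],
                                                feed_dict=feed)
            if step % 10 == 0:
                print("step %d: loss %.3f, acc %.3f" % (step, batch_loss, batch_acc))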