skipgram_ppdb_model.py
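
# Overview (my summary, not from the original file): this script trains skip-gram word
# embeddings with negative sampling over PPDB 2.0 lexical paraphrase pairs, initializing
# the word embedding table from pretrained GloVe vectors and saving the model state dict.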

import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.autograd import Variable
from random import randint
from joblib import Parallel, delayed  # used only by the commented-out parallel preprocessing in main()

BATCH_SIZE = 100

class Word2Vec(nn.Module):
    """Skip-gram model with negative sampling: separate word and context embedding tables."""
    def __init__(self, vocab_size, hid_dim, pretrained=None):
        super(Word2Vec, self).__init__()
        self.hid_dim = hid_dim
        # Two separate embedding matrices are learned on purpose; word_emb is the one we keep.
        self.word_emb = nn.Embedding(vocab_size, hid_dim)
        if pretrained is not None:
            self.word_emb.weight.data.copy_(pretrained)
        self.context_emb = nn.Embedding(vocab_size, hid_dim)
        #if pretrained is not None:
        #    self.context_emb.weight.data.copy_(pretrained)
        self.sigmoid = nn.LogSigmoid()  # despite the attribute name, this is log-sigmoid

    def forward(self, wrd, cntxt, labels):
        wrd_vec = self.word_emb(wrd)         # N x 1 x D
        cntxt_vec = self.context_emb(cntxt)  # N x (1 + neg_exmpl) x D
        cntxt_vec = torch.transpose(cntxt_vec, 1, 2)
        res = torch.bmm(wrd_vec, cntxt_vec)
        res = res.squeeze(1)
        res = res * labels                   # +1 for the true context, -1 for each negative
        res = self.sigmoid(res)
        # res holds N x (1 + neg_exmpl) log-sigmoid scores, one per context per example.
        # Return the mean negative log-likelihood over the mini-batch.
        return (torch.sum(res) * -1.0) / res.size(0)
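
# A note on the objective (my reading of forward above): for an input word w with one true
# context c and negatives n_1..n_k drawn from the sampling table, the per-example loss is
#   -( log sigma(v_w . u_c) + sum_i log sigma(-v_w . u_{n_i}) )
# where v are rows of word_emb and u are rows of context_emb; the +1/-1 `labels` tensor
# supplies the sign flip for the negative samples.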

def negative_sampling_tbl(vocab, tok_freq, vocab_size, idx2word):
    """Build the unigram^0.75 table used to draw negative samples (as in word2vec)."""
    total_cn = 0
    for wrd in vocab:
        total_cn += pow(tok_freq[wrd], 0.75)
    tbl_size, wrd_idx = int(1e6), 0
    table = torch.LongTensor(tbl_size)  # 1-D tensor; every slot is filled below
    wrd_prob = pow(tok_freq[idx2word[wrd_idx]], 0.75) / total_cn
    for i in range(0, tbl_size):
        table[i] = wrd_idx
        ind = i * 1.0
        if ind / tbl_size > wrd_prob:
            wrd_idx += 1
            if wrd_idx >= vocab_size:
                # clamp before the lookup so rounding at the tail cannot index past the vocabulary
                wrd_idx = vocab_size - 1
            wrd_prob += pow(tok_freq[idx2word[wrd_idx]], 0.75) / total_cn
    return table
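
# Illustrative use of the table (hypothetical snippet, mirroring sample_context below):
#   ind = randint(0, len(table) - 1)
#   neg_idx = int(table[ind])       # word index drawn roughly ~ unigram^0.75
#   neg_word = idx2word[neg_idx]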

# Return the sampled contexts: the first entry is the true context word index,
# the remaining neg_cn entries are negative samples drawn from the table.
def sample_context(table, neg_cn, cntxt):
    cntxts, i = [], 0
    cntxts.append(cntxt)
    while i < neg_cn:
        ind = randint(0, len(table) - 1)
        neg_ctx = int(table[ind])
        if neg_ctx != cntxt:
            cntxts.append(neg_ctx)
            i += 1
    return cntxts

def train_pair(wrd_idx, cntxts, labels, mdl, criterion, optimizer, index2word):
    """
    wrd_idx: indices of the input words predicting their contexts
    cntxts: for each input word, one positive context index followed by the negative sample indices
    The model returns the loss directly, so `criterion` is unused here.
    """
    loss = mdl(wrd_idx, cntxts, labels)
    #preds = mdl(wrd_idx, cntxts, labels)
    #loss = criterion(preds, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()  # was loss.data[0] in the pre-0.4 PyTorch API

def per_trainepoch(mdl, lines, table, criterion, optimizer, labels, word2index, index2word, neg_exmpl=20, win_size=5):
    track_loss, batch_wrd_idx, batch_cntxts = [], [], []
    batch_count = 0
    for k, l in enumerate(lines):
        l = l.strip()
        wrds = l.split(" ")
        for i, wrd in enumerate(wrds):
            wrd_idx = word2index[wrd]
            for j in range(max(0, i - win_size), min(len(wrds), i + win_size)):
                cntxt_wrd = wrds[j]
                if j != i:
                    cntxt_idx = word2index[cntxt_wrd]
                    cntxts = sample_context(table, neg_exmpl, cntxt_idx)
                    batch_wrd_idx.append(wrd_idx)
                    batch_cntxts.append(cntxts)
                    if len(batch_wrd_idx) == BATCH_SIZE:
                        batch_count += 1
                        var_wrd_idx = Variable(torch.LongTensor(batch_wrd_idx)).unsqueeze(1)
                        var_cntxts = Variable(torch.LongTensor(batch_cntxts))
                        if torch.cuda.is_available():
                            var_wrd_idx = var_wrd_idx.cuda()
                            var_cntxts = var_cntxts.cuda()
                        lossval = train_pair(var_wrd_idx, var_cntxts, labels, mdl, criterion, optimizer, index2word)
                        if k % 50000 == 0:
                            print("loss:{} line:{}".format(lossval, k))
                        track_loss.append(lossval)
                        batch_wrd_idx[:], batch_cntxts[:] = [], []
    print("tuples processed (wrd, cntxt):", batch_count * BATCH_SIZE)
    return sum(track_loss) / len(track_loss)
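
# Shape check (my note, not from the original comments): each mini-batch is
#   var_wrd_idx: BATCH_SIZE x 1 and var_cntxts: BATCH_SIZE x (1 + neg_exmpl),
# which matches the fixed `labels` tensor built once in main(). Any partial batch
# left over at the end of an epoch is simply dropped.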

def get_sim(wrd, k, mat, word2index, index2word):
    """Print the k nearest words to `wrd` by dot product against the rows of `mat`."""
    if wrd not in word2index:
        return None
    vec = mat[word2index[wrd], :].unsqueeze(1)
    othrs = torch.mm(mat, vec)
    othrs, ind = torch.sort(othrs, 0, descending=True)
    topk = ind[:k]
    for i in range(topk.size()[0]):
        print(index2word[int(topk[i][0])])
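
# Note: the ranking above uses raw dot products, so it matches cosine similarity only when
# the rows of `mat` are unit-normalized (main() normalizes the pretrained GloVe matrix that
# way, but the learned embeddings can drift away from unit norm during training).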

def get_glovedict(glove_path):
    vocab_d = set()
    with open(glove_path) as f:
        for line in f:
            word, vec = line.split(' ', 1)
            word = word.strip().lower()
            vocab_d.add(word)
    return vocab_d

def get_gloveready(glove_path, vocab_size, dim, word2index):
    # Every row is expected to be filled: main() intersects the vocabulary with the GloVe vocabulary.
    pretrained_weight = torch.FloatTensor(vocab_size, dim)
    fnd = 0
    with open(glove_path) as f:
        for line in f:
            word, vec = line.split(' ', 1)
            word = word.strip().lower()
            if word in word2index:
                ind = word2index[word]
                pretrained_weight[ind, :] = torch.from_numpy(np.array(list(map(float, vec.split()))))
                fnd += 1
    print('Found {0} words with glove vectors, total was {1}'.format(fnd, vocab_size))
    return pretrained_weight
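
# GloVe text files store one word per line followed by its vector components, e.g.
#   the 0.418 0.24968 -0.41242 ...
# which is why both loaders above split each line once on the first space.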

def process_lines(data):
    pairs, vocab = set(), {}
    for cn, l in enumerate(data):
        dt = l.split("|||")
        score = float(dt[3].split(" ")[1].split("=")[1])
        if score < 3.3:
            continue
        wrd1, wrd2 = dt[1], dt[2]
        wrd1, wrd2 = wrd1.strip(), wrd2.strip()
        if ".pdf" not in wrd1 and ".pdf" not in wrd2 and wrd1.isalpha() and wrd2.isalpha():
            sc = editdist_score(wrd1, wrd2)
            if sc > min(len(wrd1), len(wrd2)) / 2 + 2:
                if wrd1 + " " + wrd2 not in pairs and wrd2 + " " + wrd1 not in pairs:
                    pairs.add(wrd1 + " " + wrd2)
                    vocab[wrd1] = vocab.get(wrd1, 0) + 1
                    vocab[wrd2] = vocab.get(wrd2, 0) + 1
    return pairs, vocab
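
# Parsing assumption on my part: PPDB lexical-pack lines look like
#   LHS ||| phrase ||| paraphrase ||| features ||| ...
# where the features field holds space-separated key=value scores. The code only relies on
# the first key=value entry after the leading space (presumably the PPDB 2.0 score) and
# keeps pairs scoring at least 3.3.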

def get_vocab(min_freq, flName=None, lines=None):
    # min_freq is accepted for compatibility; frequency filtering actually happens in main()
    if flName is not None:
        with open(flName) as fp:
            lines = fp.readlines()
    return process_lines(lines)

def get_chunks(lines, cn):
    # Split `lines` into cn roughly equal chunks, plus a final chunk for any remainder.
    chunks = []
    chunk_size = len(lines) // cn
    for i in range(0, cn + 1):
        chunk = lines[i * chunk_size:i * chunk_size + chunk_size]
        chunks.append(chunk)
    return chunks

def editdist_score(p1, p2):
    # Standard Levenshtein edit distance via dynamic programming.
    n, m = len(p1), len(p2)
    dp = [[0 for x in range(m + 1)] for x in range(n + 1)]
    for i in range(n + 1):
        for j in range(m + 1):
            if i == 0:
                dp[0][j] = j
            elif j == 0:
                dp[i][0] = i
            elif p1[i - 1] == p2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
    return dp[n][m]
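
# Recurrence used above: dp[i][j] is the edit distance between p1[:i] and p2[:j];
#   dp[i][j] = dp[i-1][j-1]                                  if p1[i-1] == p2[j-1]
#            = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) otherwise
# e.g. editdist_score("cat", "cart") == 1 (one insertion).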

def filter_data(pairs, word2index):
    # Keep only pairs where both words made it into the vocabulary; also dump them to disk.
    new_pairs = set()
    with open("ppdb-processed.txt", "w") as fp:
        for line in pairs:
            p1, p2 = line.split(" ")
            if p1 in word2index and p2 in word2index:
                new_pairs.add(p1 + " " + p2)
                fp.write(line)
                fp.write("\n")
    return new_pairs

def main(EPOCHS):
    glove_path, dim, min_count, neg_exmpl = "glove.6B.50d.txt", 50, 1, 60
    g_vocab = get_glovedict(glove_path)
    pairs, tok_freq = get_vocab(min_count, flName="ppdb-2.0-l-lexical")
    vocab = set(tok_freq.keys())
    vocab = vocab.intersection(g_vocab)
    word2index, index2word = {}, {}
    for wrd in vocab:
        if tok_freq[wrd] >= min_count:
            index2word[len(index2word)] = wrd
            word2index[wrd] = len(index2word) - 1
        else:
            tok_freq[wrd] = 0
    pairs = filter_data(pairs, word2index)
    vocab_size = len(index2word)
    #chunks = get_chunks(lines, 7)
    #retvals = Parallel(n_jobs=7)(delayed(process_lines)([chunk, g_vocab]) for chunk in chunks)
    print("Data ready: {} {} {}".format(vocab_size, len(pairs), len(vocab)))
    pretrained_weight = get_gloveready(glove_path, vocab_size, dim, word2index)
    pretrained_weight = torch.nn.functional.normalize(pretrained_weight)
    print("Glove loaded")
    negative_tbl = negative_sampling_tbl(vocab, tok_freq, vocab_size, index2word)
    print("filtered data size:", len(pairs))
    # free memory
    del vocab, g_vocab, tok_freq
    mdl = Word2Vec(vocab_size, dim, pretrained_weight)
    #init.xavier_normal(mdl.word_emb.weight)
    #init.xavier_normal(mdl.context_emb.weight)
    # free memory
    del pretrained_weight
    criterion = nn.BCELoss()  # unused: the model returns the negative-sampling loss directly
    optimizer = torch.optim.SGD(mdl.parameters(), lr=0.1)
    print('Training..')
    # One label row per batch element: +1 for the true context (column 0), -1 for the negatives.
    labels = Variable(torch.ones(BATCH_SIZE, 1 + neg_exmpl) * -1.0)
    labels[:, 0] = labels[:, 0] * -1.0
    if torch.cuda.is_available():
        labels = labels.cuda()
        negative_tbl = negative_tbl.cuda()
        mdl.cuda()
    for u in range(EPOCHS):
        loss_epoch = per_trainepoch(mdl, pairs, negative_tbl, criterion, optimizer, labels, word2index, index2word, neg_exmpl)
        print("---completed: {} and loss: {} ---".format(u, loss_epoch))
        torch.save(mdl.state_dict(), './mdl_skipgm.pth')

if __name__ == "__main__":
    main(10)
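
# Expected inputs (paths are hard-coded in main() above): glove.6B.50d.txt (50-d GloVe vectors)
# and ppdb-2.0-l-lexical (the PPDB 2.0 lexical pack). The trained model is checkpointed after
# each epoch to ./mdl_skipgm.pth via the state dict; the word embeddings live in word_emb.weight.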