-
Notifications
You must be signed in to change notification settings - Fork 1
/
seq2sequtil.py
314 lines (237 loc) · 10 KB
/
seq2sequtil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
import numpy as np
import torch
from IPython.display import display
# symbolic symbols
from sympy import Symbol, exp, \
cos, sin, tan, \
cosh, sinh, tanh, ln, log, E
# Module-level sympy symbol. pprint() below calls eval() on expression
# strings, and eval() resolves names in this module's namespace, so "x"
# (presumably the variable used in the data expressions -- TODO confirm)
# has to be defined here alongside the imported sympy functions.
x = Symbol('x')
class Seq2SeqDataPreparer:
    '''
    Map source (input) and target (output) sequences of characters into
    sequences of integer indices, and split the coded data into train,
    validation and test sets.

    Create a data preparer using

        dd = Seq2SeqDataPreparer(X, Y, fractions)

    where,

        X, Y:      parallel sequences of source and target strings
        fractions: a 2-tuple of cumulative fractions defining the
                   three-way split of the data.
                   e.g.: (50/60, 55/60) applied to 60000 pairs splits
                   them approximately as (50000, 5000, 5000)

    The token-to-index maps are built from the training split only;
    characters that appear only in the other splits are coded with the
    index of the unknown token '?'.

    After construction, train_data, valid_data and test_data are lists
    of blocks, each block a tuple (x_data, y_data) of integer arrays
    with

        x_data.shape: (x_seq_len, batch_size)
        y_data.shape: (y_seq_len, batch_size)

    The sequence and batch sizes can vary from block to block. A split
    that contains no sequence pairs yields an empty list of blocks.
    '''

    def __init__(self, X, Y, fractions=(50/60, 55/60)):
        # An immutable default avoids sharing one list between instances.
        self.fractions = fractions

        # Maximum sequence lengths (0 if there are no sequences at all)
        self.x_max_seq_len = max((len(z) for z in X), default=0)
        self.y_max_seq_len = max((len(z) for z in Y), default=0)

        # Boundaries of the train / valid / test splits
        N = int(len(X)*fractions[0])
        M = int(len(X)*fractions[1])

        # Token <-> index maps for source and target sequences,
        # built from the training split only
        t = self.token_tofrom_index(X[:N])
        self.x_token2index, self.x_index2token = t

        t = self.token_tofrom_index(Y[:N])
        self.y_token2index, self.y_index2token = t

        # Code each split as a list of same-shape blocks
        self.train_data, self.n_train = self.code_data(X[:N], Y[:N])
        self.valid_data, self.n_valid = self.code_data(X[N:M], Y[N:M])
        self.test_data, self.n_test = self.code_data(X[M:], Y[M:])

    def __del__(self):
        # No resources to release; kept so the class interface is unchanged.
        pass

    def __len__(self):
        '''Return the total number of sequence pairs over all splits.'''
        return self.n_train + self.n_valid + self.n_test

    def __str__(self):
        '''Return a human-readable summary of split and vocabulary sizes.'''
        s = ''
        s += 'number of seq-pairs (train): %8d\n' % self.n_train
        s += 'number of seq-pairs (valid): %8d\n' % self.n_valid
        s += 'number of seq-pairs (test): %8d\n' % self.n_test
        s += '\n'
        s += 'number of source tokens: %8d\n' % \
            len(self.x_token2index)
        s += 'max source sequence length: %8d\n' % \
            self.x_max_seq_len
        # Target information is best-effort: tolerate an instance whose
        # target attributes are missing, but nothing else.
        try:
            s += '\n'
            s += 'number of target tokens: %8d\n' % \
                len(self.y_token2index)
            s += 'max target sequence length: %8d' % \
                self.y_max_seq_len
        except AttributeError:
            pass
        return s

    def num_tokens(self, which='source'):
        '''
        Return the vocabulary size. A value of which starting with 's'
        or 'i' (e.g. 'source', 'input') selects the source vocabulary;
        anything else selects the target vocabulary.
        '''
        if which[0] in ('s', 'i'):
            return len(self.x_token2index)
        return len(self.y_token2index)

    def max_seq_len(self, which='source'):
        '''
        Return the maximum sequence length. A value of which starting
        with 's' or 'i' selects the source sequences; anything else
        selects the target sequences.
        '''
        if which[0] in ('s', 'i'):
            return self.x_max_seq_len
        return self.y_max_seq_len

    def decode(self, indices):
        '''Map a sequence of target indices to the string they encode.'''
        return ''.join(self.y_index2token[i] for i in indices)

    def token_tofrom_index(self, expressions):
        '''
        Build the character vocabulary of expressions.

        Returns the tuple (char2index, index2char). Indices follow the
        sorted order of the characters. The padding character ' ' and
        the unknown character '?' are always part of the vocabulary.
        '''
        chars = {' ', '?'}  # padding and unknown characters
        for expression in expressions:
            chars.update(expression)
        chars = sorted(chars)

        char2index = {char: i for i, char in enumerate(chars)}
        index2char = dict(enumerate(chars))
        return (char2index, index2char)

    def get_block_indices(self, X, Y):
        '''
        Group the sequence pairs (X[i], Y[i]) into blocks in which all
        sources have the same length and all targets have the same
        length. In general, the sources and targets differ in length.

        X and Y are just sequences of strings. Returns a list of lists
        of ordinal values i; the list is empty if there are no pairs.
        '''
        # 1. Following Michael Andrews' suggestion double sort
        #    expressions, first with targets then sources. But, also
        #    note the ordinal values "i" of the expressions in X, Y.
        sizes = sorted((len(a), len(b), i)
                       for i, (a, b) in enumerate(zip(Y, X)))

        # 2. Collect the ordinal values of all pairs sharing the same
        #    (len(target), len(source)). Because sizes is sorted and
        #    dicts preserve insertion order, the blocks come out ordered
        #    by size and the indices within a block are ascending.
        groups = {}
        for n, m, i in sizes:
            groups.setdefault((n, m), []).append(i)
        return list(groups.values())

    def make_block(self, expressions, indices, token2index, unknown):
        '''
        Code the expressions selected by indices as one integer array
        of shape (seq_len, batch_size).

        expressions: sequence of strings
        indices:     non-empty list of ordinal values into expressions;
                     all selected expressions must have the same length
        token2index: map from character to index
        unknown:     index used for characters missing from token2index
        '''
        # batch size of current block
        batch_size = len(indices)

        # By construction, all expressions of a block have the same
        # length, so can use the length of the first expression
        seq_len = len(expressions[indices[0]])

        # Explicit 64-bit integers: the platform alias 'long' is only
        # 32 bits wide on some systems.
        data = np.zeros((seq_len, batch_size), dtype=np.int64)

        # m: ordinal value of expression in current block
        # k: ordinal value of expression in original list of expressions
        # n: ordinal value of character in a given expression
        for m, k in enumerate(indices):
            for n, char in enumerate(expressions[k]):
                data[n, m] = token2index.get(char, unknown)
        return data

    def code_data(self, X, Y):
        '''
        Implement Arvind's idea: convert the sequence pairs to blocks of
        integer arrays using the two token2index maps.

        X, Y consist of delimited strings (tab, characters, newline).

        Returns (blocks, n_data), where blocks is a list of tuples
        (x_data, y_data) and n_data is the number of coded pairs.
        '''
        x_unknown = self.x_token2index['?']
        y_unknown = self.y_token2index['?']

        # 1. Get blocks containing sequences of the same length.
        block_indices = self.get_block_indices(X, Y)

        # 2. Loop over the indices associated with each block of coded
        #    sequences. The indices are the ordinal values of the
        #    sequence pairs X and Y.
        blocks = []
        n_data = 0
        for indices in block_indices:
            x_data = self.make_block(X, indices,
                                     self.x_token2index, x_unknown)
            y_data = self.make_block(Y, indices,
                                     self.y_token2index, y_unknown)
            blocks.append((x_data, y_data))
            n_data += len(indices)

        # Sanity check: every source sequence must end up in a block.
        assert n_data == len(X)
        return blocks, n_data
class Seq2SeqDataLoader:
    '''
    Iterator over a list of blocks that returns each block as a pair of
    tensors created on the given device.

        dataloader = Seq2SeqDataLoader(dataset, device, sample=True)

    where,

        dataset: indexable sequence of blocks (x_data, y_data), e.g. the
                 train_data list built by Seq2SeqDataPreparer
        device:  device passed to torch.tensor
        sample:  if True, each step returns a randomly chosen block
                 (with replacement); otherwise blocks are returned in
                 order

    One pass over the loader yields max_count blocks (by default one per
    block in dataset). Call init() to change max_count or sample.
    '''

    def __init__(self, dataset, device, sample=True):
        self.dataset = dataset
        self.device = device
        # Forward sample explicitly: init() defaults it to True and
        # would otherwise override the caller's choice.
        self.init(sample=sample)

    def __iter__(self):
        return self

    def __next__(self):
        '''Return the next block as tensors (X, Y) on self.device.'''
        # increment iteration counter
        self.count += 1

        if self.count > self.max_count:
            # Reset so that the loader can be iterated over again.
            self.count = 0
            raise StopIteration

        # 1. randomly pick a block or return blocks in order.
        if self.sample:
            k = np.random.randint(len(self.dataset))
        else:
            k = self.count - 1  # count is 1-based at this point

        # 2. create tensors directly on the device of interest
        x_block, y_block = self.dataset[k][0], self.dataset[k][1]
        X = torch.tensor(x_block, device=self.device)
        Y = torch.tensor(y_block, device=self.device)
        return X, Y

    def init(self, max_count=0, sample=True):
        '''
        (Re)configure the loader and rewind it.

        max_count: number of blocks returned per pass; values below 1
                   mean one per block in dataset, larger values are
                   capped at the number of blocks
        sample:    whether to pick blocks at random
        '''
        n_data = len(self.dataset)
        self.max_count = n_data if max_count < 1 else min(max_count,
                                                          n_data)
        self.sample = sample
        self.count = 0
# Delimit each sequence in filtered sequences
# The start of sequence (SOS) and end of sequence (EOS)
# tokens are "\t" and "\n", respectively.
def loadData(inpfile):
    '''
    Read sequence pairs from inpfile and delimit them.

    Format of data, one pair per line:

        input expression<tab>target expression<newline>

    Each returned sequence is wrapped in "\\t" (start of sequence) and
    "\\n" (end of sequence). All whitespace is removed from the target
    expression; the source expression is kept as is. Empty lines are
    skipped. A non-empty line that does not contain exactly one tab
    raises ValueError.

    The last pair, if any, is printed and pretty-printed as an example.

    Returns the tuple (X, Y) of source and target lists.
    '''
    X, Y = [], []

    # The context manager closes the file even if parsing fails.
    with open(inpfile) as stream:
        for line in stream:
            # Tolerate empty lines, e.g. at the end of the file.
            if not line.rstrip('\r\n'):
                continue

            x, y = line.split('\t')
            X.append('\t%s\n' % x)

            # get rid of spaces in target sequence
            y = ''.join(y.split())
            Y.append('\t%s\n' % y)

    # Only show an example when there is one to show.
    if X:
        print('Example source:')
        print(X[-1])
        pprint(X[-1])
        print('Example target:')
        print(Y[-1])
        pprint(Y[-1])

    return (X, Y)
def pprint(expr):
    '''
    Evaluate the expression string expr and render the resulting object
    with IPython's display.

    NOTE(review): eval() executes arbitrary Python code. Names in expr
    are resolved in this module's namespace. Only call this with
    expressions from trusted data files.
    '''
    evaluated = eval(expr)
    display(evaluated)