utils.py
import os
import json
import numpy as np
import pickle as pkl
from tqdm import tqdm
import copy
import nltk
nltk.download('punkt')
from nltk import tokenize
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model, AutoModelForQuestionAnswering, AutoTokenizer, pipeline, RobertaModel, AutoModelForSeq2SeqLM
# prepare data -> retrieve the top-k context sentences (joined with spaces)
class CustomData(Dataset):
'''
Process raw data
'''
def __init__(self, file_dir, model, k=1):
self.file = file_dir
self.article_name = []
self.questions = []
self.answers = []
self.q_diffi = []
self.a_diffi = []
self.article_path = []
self.context_nn = {}
self.context = {} # only fill when load the dataset
self.context_embed = {}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model = model.to(device)
model.eval()
# get question answer pairs
for div in ['S08', 'S09', 'S10']:
skip = True
qa_path = os.path.join(self.file, div, "question_answer_pairs.txt")
num_lines = sum(1 for line in open(qa_path,'rb'))
with open(qa_path, 'rb') as f:
for line in tqdm(f, total=num_lines):
if skip:
skip = False # skip the first line
continue
                    try: # only keep the line if it decodes as valid UTF-8
                        row = line.decode().split('\t')
                    except UnicodeDecodeError:
                        continue
if "NULL" in row:
continue # if any feature does not exist -> skip
context_file = self.file + "/" + div + "/"+ row[5][:-1] + ".txt" # path to the context file
if not (os.path.exists(context_file) and os.path.isfile(context_file)): # otherwise context doesn't exist: invalid
continue
# only process document embedding when needed (article first found)
if row[0] not in self.context_embed.keys():
# check if context could be extracted
try:
with open(context_file, 'rb') as f:
curr_context = f.read().decode() # could be decoded, otherwise skip
except:
continue
curr_context = curr_context.split('Related Wikipedia Articles')[0] # ignore everything after Related articles
curr_context = curr_context.replace('\n',' ')
self.context[row[0]] = tokenize.sent_tokenize(curr_context)
# encode context and add to corresponding files
c_embed = []
for context in self.context[row[0]]:
output = self.model.encode(context)
c_embed.append(output)
self.context_embed[row[0]] = np.vstack(c_embed)
# get top-1 similar context
qa_input = row[0] + " " + row[1]
# qa embedding
                    encoded_qa = self.model.encode(qa_input) # embed the article name + question
c_embed = self.context_embed[row[0]] # load the context embeddings
# compute knn score: dot product
# print("c_embed size: ", c_embed.shape)
# print("qa_embed size: ", encoded_qa.shape)
scores = c_embed.dot(encoded_qa)
k_nn = scores.argsort()[-k:][::-1]
k_nn = list(k_nn)
# the text of the closest neighbor
nn_context = " ".join([top_context for top_context in np.array(self.context[row[0]])[k_nn]])
self.context_nn[row[0]] = nn_context
# other info
self.article_name.append(row[0])
self.questions.append(row[1])
self.answers.append(row[2])
self.q_diffi.append(row[3]) # difficulty
self.a_diffi.append(row[4])
                    self.article_path.append(div + "/" + row[5][:-1]) # strip the trailing '\n'
print("length of dataset: ", len(self.questions))
def __len__(self):
return len(self.questions)
def __getitem__(self, idx):
return self.questions[idx], self.answers[idx], self.context_nn[self.article_name[idx]]
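# Illustrative usage sketch (not part of the original module): CustomData expects a
# `model` object exposing `.encode(text) -> np.ndarray`, which matches the
# sentence-transformers API; the checkpoint name and data path below are assumptions.
def _example_build_customdata():
    from sentence_transformers import SentenceTransformer  # assumed sentence-encoder dependency
    encoder = SentenceTransformer("all-MiniLM-L6-v2")  # any encoder with .encode() should work
    # hypothetical path: a directory containing S08/, S09/, S10/ with question_answer_pairs.txt
    data = CustomData("data/Question_Answer_Dataset_v1.2", encoder, k=2)
    q, a, c = data[0]  # (question, answer, top-k retrieved context sentences joined with spaces)
    return q, a, c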
# dump the raw dataset to json file
def dump_set(data, output_file):
    samples = []
    for q, a, c in data:
        samples.append({"question": q, "answer": a, "context": c})
    with open(output_file, 'w') as fp:
        json.dump(samples, fp)
# load the raw dataset from json file as a dataset object
def load_json_data(json_file):
test_dataset = load_dataset('json', data_files={'all':json_file})
return test_dataset['all']
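# Illustrative round trip (sketch, not original code): dump a CustomData instance to JSON
# with dump_set, then reload it as a HuggingFace dataset with load_json_data.
# The file name below is a hypothetical example.
def _example_dump_and_reload(data):
    dump_set(data, "qa_dataset.json")  # writes a list of {"question", "answer", "context"} records
    dataset = load_json_data("qa_dataset.json")
    return dataset  # a datasets.Dataset with question/answer/context columns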
# tokenize the text features
def prepare_features_q(data, cache_path, tokenizer, max_len_inp=512, max_len_out=512):
    '''
    Tokenize the text features with the given tokenizer.
    The features are dumped to cache_path via pickle to avoid re-computation next time.
    '''
inputs = []
targets = []
for q, a, c in tqdm(data):
input_ = f"context: {c} answer: {a}" # T5 Input format for QA tasks
target = f"question: {str(q)}" # Output format we require
# tokenize inputs
tokenized_inputs = tokenizer.batch_encode_plus(
[input_], max_length=max_len_inp,padding='max_length',
return_tensors="pt" #pytorch tensors
)
# tokenize targets
tokenized_targets = tokenizer.batch_encode_plus(
[target], max_length=max_len_out,
padding='max_length',return_tensors="pt"
)
inputs.append(tokenized_inputs)
targets.append(tokenized_targets)
all_features = {}
all_features['input'] = inputs
all_features['target'] = targets
pkl.dump(all_features, open(cache_path, 'wb')) # dump the features somewhere
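# Sketch of how the question-generation features might be prepared (assumption: a T5
# tokenizer, since the prompt format above matches T5-style text-to-text inputs).
# The checkpoint name and cache path are illustrative only.
def _example_prepare_question_features(data):
    tokenizer = T5Tokenizer.from_pretrained("t5-small")  # assumed checkpoint
    prepare_features_q(data, "cache/features_q.pkl", tokenizer)  # hypothetical cache path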
# tokenize the text features
def prepare_features_a(data, cache_path, tokenizer, max_len_inp=512,max_len_out=512):
inputs = []
targets = []
cls_index = 0 # for roberta
for q, a, c in tqdm(data):
input_ = f"question: {q} context: {c}" # T5 Input format for QA tasks
target = f"answer: {str(a)}" # Output format we require
# tokenize inputs
tokenized_inputs = tokenizer(input_, max_length=max_len_inp,padding='max_length',
return_tensors="pt" #pytorch tensors
)
# tokenize targets
tokenized_targets = tokenizer(target, max_length=max_len_out,
padding='max_length',return_tensors="pt"
)
tokenized_inputs['start'] = cls_index
tokenized_inputs['end'] = cls_index
inputs.append(tokenized_inputs)
targets.append(tokenized_targets)
all_features = {}
all_features['input'] = inputs
all_features['target'] = targets
pkl.dump(all_features, open(cache_path, 'wb')) # dump the features somewhere
class FeatureData(Dataset):
'''
Dataset for the preprocessed features
'''
def __init__(self, feat_path, split, split_point):
self.feat_path = feat_path
# load features
feats = pkl.load(open(self.feat_path, 'rb' )) # load the features and extract
if split == 'train':
self.inputs = feats['input'][:-split_point]
self.questions = feats['target'][:-split_point]
elif split == 'test':
self.inputs = feats['input'][-split_point:]
self.questions = feats['target'][-split_point:]
print(f"length of feature {split} set: ", len(self.questions))
def __len__(self):
return len(self.questions)
def __getitem__(self, index):
# retrieve context here -> less mem storage overhead
input_ids = self.inputs[index]['input_ids'].squeeze()
target_ids = self.questions[index]['input_ids'].squeeze()
input_mask = self.inputs[index]['attention_mask'].squeeze()
target_mask = self.questions[index]['attention_mask'].squeeze()
labels = copy.deepcopy(target_ids)
labels[labels == 0] = -100
return {'input_ids': input_ids, 'input_mask': input_mask,
'target_ids': target_ids, 'target_mask': target_mask,
'labels': labels}
class FeatureData_A(Dataset):
    '''
    Dataset for the preprocessed answer-generation features (also returns start/end span indices)
    '''
def __init__(self, feat_path, split, split_point):
self.feat_path = feat_path
# load features
feats = pkl.load(open(self.feat_path, 'rb' )) # load the features and extract
if split == 'train':
self.inputs = feats['input'][:-split_point]
self.questions = feats['target'][:-split_point]
elif split == 'test':
self.inputs = feats['input'][-split_point:]
self.questions = feats['target'][-split_point:]
print(f"length of feature {split} set: ", len(self.questions))
def __len__(self):
return len(self.questions)
def __getitem__(self, index):
# retrieve context here -> less mem storage overhead
input_ids = self.inputs[index]['input_ids'].squeeze()
target_ids = self.questions[index]['input_ids'].squeeze()
input_mask = self.inputs[index]['attention_mask'].squeeze()
target_mask = self.questions[index]['attention_mask'].squeeze()
start = self.inputs[index]['start']
end = self.inputs[index]['end']
labels = copy.deepcopy(target_ids)
labels[labels == 0] = -100
return {'input_ids': input_ids, 'input_mask': input_mask,
'target_ids': target_ids, 'target_mask': target_mask,
'start': start, 'end': end,
'labels': labels}
def get_dataloaders(feats_train, feats_test, batch_size):
    # wrap the train/test feature datasets in DataLoaders
dataloader_train = DataLoader(feats_train, batch_size=batch_size)
dataloader_test = DataLoader(feats_test, batch_size=batch_size)
print(f"Loaded train feature data with {len(dataloader_train)} batches")
print(f"Loaded test feature data with {len(dataloader_test)} batches")
return dataloader_train, dataloader_test
def get_model(checkpoint: str, device: str, task):
if task == "answer":
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
else:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
return model.to(device)
def get_tokenizer(checkpoint: str):
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
return tokenizer
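# End-to-end sketch (assumptions: the cached feature file produced by prepare_features_q
# above, a T5-style seq2seq checkpoint, and a held-out split of 500 examples; the names
# and sizes below are illustrative, not taken from the original code).
def _example_training_setup():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    feats_train = FeatureData("cache/features_q.pkl", split="train", split_point=500)
    feats_test = FeatureData("cache/features_q.pkl", split="test", split_point=500)
    train_loader, test_loader = get_dataloaders(feats_train, feats_test, batch_size=8)
    model = get_model("t5-small", device, task="question")  # any task != "answer" loads a seq2seq model
    tokenizer = get_tokenizer("t5-small")
    return model, tokenizer, train_loader, test_loader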