offer_predictor.py
import sys
import re

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
import emoji

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
############### PREPROCESS TWEET ##########################
def preprocess(sentence):
    # Drop URLs.
    sentence = re.sub(r'http\S+', ' ', sentence)
    words = sentence.split()
    words = [word for word in words if ('.com' not in word and 'www' not in word and '\\\\' not in word and '\\u' not in word and '//' not in word)]
    sentence = " ".join(words)
    # Collapse runs of punctuation into a single, space-separated mark.
    sentence = re.sub("([!]){1,}", " ! ", sentence)
    sentence = re.sub("([.]){1,}", " . ", sentence)
    sentence = re.sub("([?]){1,}", " ? ", sentence)
    sentence = re.sub("([;]){1,}", " ; ", sentence)
    sentence = re.sub("([:]){1,}", " : ", sentence)
    sentence = re.sub("([,]){1,}", " , ", sentence)
    # Strip emoji (emoji>=2.0 renamed UNICODE_EMOJI to EMOJI_DATA; support both).
    emoji_chars = getattr(emoji, "EMOJI_DATA", None) or emoji.UNICODE_EMOJI
    for c in sentence:
        if c in emoji_chars:
            sentence = sentence.replace(c, " ")
            #sentence = sentence.replace(c, " " + emoji.demojize(c).replace("_", " ") + " ")
    # Drop hashtags, mentions, and leftover URL fragments.
    words = sentence.split()
    words = [word for word in words if (word[0] != '#' and '@' not in word and '.com' not in word and 'www' not in word and '\\\\' not in word and '\\u' not in word and '//' not in word)]
    sentence = " ".join(words)
    sentence = re.sub('_', " ", sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = re.sub('&', " and ", sentence)
    #sentence = sentence.encode('ascii', 'ignore')
    return sentence
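
# Illustrative walk-through of preprocess() on a made-up tweet (not from the
# original repo):
#   preprocess("Win a prize!!! Visit https://t.co/abc #offer @shop")
#   -> "Win a prize ! Visit"
# The URL, hashtag, and mention are dropped, and "!!!" collapses to a single "!".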
############### CREATE TOKENIZER ##########################
from transformers import AutoTokenizer, AutoModelForMaskedLM
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
#define a batch size
batch_size = 32
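
# xlm-roberta-base uses a SentencePiece vocabulary, so subword pieces carry a
# U+2581 word-boundary marker; e.g. tokenizer.tokenize("hello world") yields
# pieces like ['▁hello', '▁world'].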
############### BERT ##########################
class mBERT(nn.Module):
    def __init__(self, bert, num_labels, hidden_layers, drop=0.1):
        super(mBERT, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=drop, inplace=False)
        self.linear1 = nn.Linear(hidden_layers, 16)
        self.linear2 = nn.Linear(16, num_labels)

    # Define the forward pass.
    def forward(self, sent_id_1, mask_1):
        out = self.bert(sent_id_1, token_type_ids=None, attention_mask=mask_1)
        # Use the first-token vector of the last hidden layer as the sentence embedding.
        sent_vec = out.hidden_states[-1][:, 0, :]
        sent_vec = self.dropout(sent_vec)
        sent_vec = self.linear1(sent_vec)
        sent_vec = self.linear2(sent_vec)
        return sent_vec
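
# Tensor-shape sketch for one forward pass, assuming the values used below
# (batch_size=32, MAX_LEN=100, hidden_layers=768, num_labels=2):
#   sent_id_1, mask_1 : (32, 100)
#   hidden_states[-1] : (32, 100, 768); [:, 0, :] keeps the (32, 768)
#   sentence vector -> linear1 (32, 16) -> linear2 (32, 2) logits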
def test(model, test_dataloader):
    # Deactivate dropout layers.
    model.eval()
    # Collected predictions and confidence scores across all batches.
    all_labels = []
    confidence_scores = []
    # Iterate over batches.
    for step, batch in enumerate(test_dataloader):
        # Push the batch to the active device (GPU if available, else CPU).
        batch = [t.to(device) for t in batch]
        sent_id, mask = batch
        # Deactivate autograd.
        with torch.no_grad():
            # Model predictions: raw logits over the two classes.
            preds = model(sent_id, mask)
            sigpreds = nn.Sigmoid()(preds)
            sigpreds = sigpreds.detach().cpu().numpy()
            preds = preds.detach().cpu().numpy()
            # Predicted label = argmax over logits; confidence = max sigmoid activation.
            total_preds = np.argmax(preds, axis=1)
            all_labels.extend(total_preds)
            confidence_scores.extend(np.max(sigpreds, axis=1))
    return all_labels, confidence_scores
def create_loaders(seq, mask):
    # Wrap tensors in a dataset.
    data = TensorDataset(seq, mask)
    # Sample sequentially so predictions stay aligned with the input file.
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return dataloader
def tokenize_and_preserve_labels(sentence):
    # Tokenize word by word so each word's subword pieces stay grouped together.
    tokenized_sentence = []
    for word in sentence.split():
        tokenized_word = tokenizer.tokenize(word)
        tokenized_sentence.extend(tokenized_word)
    # Drop purely numeric tokens.
    tokenized_sentence = [x for x in tokenized_sentence if not x.isdigit()]
    return tokenized_sentence
def getData4Bert(data, tokenizer, MAX_LEN=100):
    # Tokenize each sentence into subword pieces.
    tokenized_texts = [tokenize_and_preserve_labels(sent) for sent in data]
    # Convert tokens to ids and pad/truncate every sequence to MAX_LEN.
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", value=0.0,
                              truncating="post", padding="post")
    # Attend only to non-padding positions (id 0).
    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]
    return torch.tensor(input_ids), torch.tensor(attention_masks)
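
# Illustrative call: getData4Bert(["great offer today"], tokenizer) returns a
# (1, 100) id tensor, zero-padded on the right, plus a matching (1, 100)
# attention mask that is 1.0 over real tokens and 0.0 over padding.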
def predict(data, fname):
    data = [preprocess(x) for x in data]
    hidden_layers = 768
    test_seq, test_mask = getData4Bert(data, tokenizer)
    test_dataloader = create_loaders(test_seq, test_mask)
    ############### LOAD BERT ##########################
    bert = RobertaForSequenceClassification.from_pretrained(
        "xlm-roberta-base", num_labels=2, output_attentions=False,
        output_hidden_states=True, return_dict=True)
    model = mBERT(bert, 2, hidden_layers)
    # Push the model to the active device and load the fine-tuned weights;
    # map_location lets a GPU-trained checkpoint load on CPU-only machines.
    model = model.to(device)
    model.load_state_dict(torch.load(fname, map_location=device))
    # Get predictions for the test data.
    with torch.no_grad():
        all_labels, confidence_scores = test(model, test_dataloader)
    return all_labels, confidence_scores
if __name__ == '__main__':
    # Execute only if run as the entry point into the program.
    test_file = sys.argv[1]      #"offer_samples.txt"
    fname = sys.argv[2]          #"offer.pt"
    fname_RESULT = sys.argv[3]   #"result.txt"
    with open(test_file) as f:
        data = [x.strip() for x in f.readlines()]
    labels, cscores = predict(data, fname)
    assert len(labels) == len(data)
    # Write one "label,confidence" line per input sentence, in input order.
    with open(fname_RESULT, 'w+') as f:
        for index in range(len(labels)):
            f.write(str(labels[index]) + "," + str(round(cscores[index], 4)) + '\n')
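
# Example invocation, using the illustrative file names from the comments above:
#   python offer_predictor.py offer_samples.txt offer.pt result.txt
# result.txt then holds one "label,confidence" line per input line,
# e.g. a line like "1,0.9731".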