forked from hritik25/Dynamic-CNN-for-Modelling-Sentences
-
Notifications
You must be signed in to change notification settings - Fork 1
/
generateDataset.py
63 lines (53 loc) · 2.21 KB
/
generateDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import csv
import os
import re
import sys

import pytreebank

import vocabulary
# Load the Stanford Sentiment Treebank splits ('train', 'dev', 'test') from
# the local 'trees/' directory.
dataset = pytreebank.load_sst('trees/')

# The DCNN is trained for binary classification only, not for fine-grained
# classification. In the Stanford Sentiment Treebank the label 2 is the
# neutral midpoint: labels below 2 denote negative reviews and labels above 2
# denote positive reviews. Only these polar reviews are kept; neutral ones
# are dropped.
NEUTRAL_LABEL = 2


def _keepPolar(labeledLines):
    """Return the (label, sentence) pairs whose label is not neutral.

    labeledLines: iterable of (label, sentence) pairs as produced by
        pytreebank's ``to_labeled_lines()``.
    """
    return [line for line in labeledLines if line[0] != NEUTRAL_LABEL]


# Training data: every labeled line of every tree is a candidate example.
polarTrainingReivews = []
for example in dataset['train']:
    polarTrainingReivews.extend(_keepPolar(example.to_labeled_lines()))

# Validation and test data: only the first labeled line of each tree is used
# (presumably the whole-sentence entry -- confirm against pytreebank).
polarValidationReivews = _keepPolar(
    example.to_labeled_lines()[0] for example in dataset['dev'])

polarTestReivews = _keepPolar(
    example.to_labeled_lines()[0] for example in dataset['test'])
# Token -> embedding-index lookup built from the treebank sentence list.
vocab = vocabulary.generateVocab('stanfordSentimentTreebank/datasetSentences.txt')

# The network-friendly dataset will be generated in the directory 'myDataset'
directory = 'myDataset/'
filenames = [('train.txt', 'train_label.txt'), ('dev.txt', 'dev_label.txt'), ('test.txt', 'test_label.txt')]
polarReviews = [polarTrainingReivews, polarValidationReivews, polarTestReivews]

# Index written for any token that is missing from `vocab`.
# NOTE(review): presumably the slot reserved for out-of-vocabulary words --
# confirm against vocabulary.generateVocab and the embedding table size.
UNKNOWN_TOKEN_INDEX = 15448

# Compiled once instead of once per sentence. The pattern is kept exactly as
# it was; the `\s` alternative is redundant because `\W` already matches
# whitespace.
TOKEN_SPLITTER = re.compile(r'\s|\W')


def _openForCsv(path):
    """Open `path` for writing in the mode csv.writer expects.

    Python 2's csv module needs a binary-mode file, while Python 3's needs a
    text-mode file opened with newline='' so that the writer controls the
    line endings itself.
    """
    if sys.version_info[0] < 3:
        return open(path, 'wb')
    return open(path, 'w', newline='')


# Create the output directory up front so a fresh checkout does not fail on
# the first open() call.
if not os.path.isdir(directory):
    os.makedirs(directory)

for files, data in zip(filenames, polarReviews):
    sentenceFile, labelFile = files
    labels = []

    # One CSV row of vocabulary indices per review.
    with _openForCsv(directory + sentenceFile) as txtfile:
        writer = csv.writer(txtfile)
        for rating, text in data:
            # Floor division collapses the polar ratings to a binary class:
            # ratings below 3 become 0 and ratings 3 to 5 become 1. `//`
            # keeps this an integer on Python 3 as well.
            labels.append([rating // 3])

            # Lower-case, split on every non-word character and drop the
            # empty strings left between adjacent separators.
            tokens = [tok for tok in TOKEN_SPLITTER.split(text.lower()) if tok]

            # Map the words to embedding indices; every cell is written as a
            # string, which matches what csv.writer emitted for the bare int.
            writer.writerow([
                str(vocab[tok]) if tok in vocab else str(UNKNOWN_TOKEN_INDEX)
                for tok in tokens
            ])

    # One single-column CSV row per review holding its 0/1 label, in the same
    # order as the rows of the sentence file.
    with _openForCsv(directory + labelFile) as txtfile:
        csv.writer(txtfile).writerows(labels)

    print('%s done!' % (files,))