# Sentiment analysis (4B assignment)
# Text classification on AG News: the dataset contains 127,600 online news articles from 4 categories: World, Sports, Business, and Science/Technology.
# import torchtext
# agnews_train, agnews_test = torchtext.datasets.text_classification.DATASETS["AG_NEWS"](root="./datasets")
import torchtext
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
ngrams = 1  # unigram features only; set to 2 to also include bigrams
train_csv_path = './datasets/ag_news_csv/train.csv'
test_csv_path = './datasets/ag_news_csv/test.csv'
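# Note: this script targets the old (0.4/0.5-era) torchtext API. The private
# _csv_iterator/_create_data_from_iterator helpers below replicate what the
# commented-out DATASETS["AG_NEWS"] convenience loader does internally.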
# Build the vocabulary from the training CSV, then numericalize both splits
vocab = torchtext.vocab.build_vocab_from_iterator(
    torchtext.datasets.text_classification._csv_iterator(train_csv_path, ngrams))
train_data, train_labels = torchtext.datasets.text_classification._create_data_from_iterator(
    vocab, torchtext.datasets.text_classification._csv_iterator(train_csv_path, ngrams, yield_cls=True), False)
test_data, test_labels = torchtext.datasets.text_classification._create_data_from_iterator(
    vocab, torchtext.datasets.text_classification._csv_iterator(test_csv_path, ngrams, yield_cls=True), False)
# The label collections are sets, so ^ is the symmetric difference: it is
# non-empty only if one split contains a label the other lacks
if len(train_labels ^ test_labels) > 0:
    raise ValueError("Training and test labels don't match")
agnews_train = torchtext.datasets.TextClassificationDataset(vocab, train_data, train_labels)
agnews_test = torchtext.datasets.TextClassificationDataset(vocab, test_data, test_labels)
# Each example is a (label, token-id tensor) pair
print(agnews_train[0])
print("Length of the first text example: {}".format(len(agnews_train[0][1])))
print("Length of the second text example: {}".format(len(agnews_train[1][1])))
padded_exs = pad_sequence([agnews_train[0][1], agnews_train[1][1]])
print("First sequence padded: {}".format(padded_exs[:,0]))
print("First sequence length: {}".format(len(padded_exs[:,0])))
print("Second sequence padded: {}".format(padded_exs[:,1]))
print("Second sequence length: {}".format(len(padded_exs[:,1])))
def collator(batch):
    """Pad every sentence in the batch to the length of the longest one."""
    labels = torch.tensor([example[0] for example in batch])
    sentences = [example[1] for example in batch]
    data = pad_sequence(sentences)
    return [data, labels]
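# Quick illustration (my addition, not part of the assignment): collate the
# first two training examples into one padded minibatch.
_data, _labels = collator([agnews_train[0], agnews_train[1]])
print(_data.shape, _labels.shape)  # (max_len, 2) and (2,)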
BATCH_SIZE = 128
train_loader = torch.utils.data.DataLoader(agnews_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)
test_loader = torch.utils.data.DataLoader(agnews_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collator)
print(len(train_loader))               # number of minibatches per epoch
print(len(agnews_train.get_labels()))  # number of classes
VOCAB_SIZE = len(agnews_train.get_vocab())
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_OUTPUTS = len(agnews_train.get_labels())
NUM_EPOCHS = 3
class SWEM(nn.Module):
    """Simple Word Embedding Model: average the word embeddings of a
    sequence, then classify with a small two-layer MLP."""
    def __init__(self, vocab_size, embedding_size, hidden_dim, num_outputs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_outputs)

    def forward(self, x):
        embed = self.embedding(x)              # (seq_len, batch, embedding_size)
        embed_mean = torch.mean(embed, dim=0)  # average over the sequence dimension
        h = self.fc1(embed_mean)
        h = F.relu(h)
        h = self.fc2(h)
        return h
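# Shape sanity check (illustrative addition, not part of the assignment):
# a (seq_len, batch) LongTensor of token ids should map to
# (batch, NUM_OUTPUTS) logits.
_dummy = torch.randint(0, VOCAB_SIZE, (10, 2))
print(SWEM(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_OUTPUTS)(_dummy).shape)  # torch.Size([2, 4])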
## Training
# Instantiate model
model = SWEM(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_OUTPUTS)
# Multi-class cross-entropy loss and SGD optimizer; CrossEntropyLoss takes
# the raw logits plus integer class labels. The earlier binary-cross-entropy
# (BCE) and Adam setup is kept commented out for reference.
#criterion = nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Iterate through train set minibatches
for epoch in range(NUM_EPOCHS):
    correct = 0
    num_examples = 0
    for inputs, labels in train_loader:
        # Zero out the gradients
        optimizer.zero_grad()
        # Forward pass
        y = model(inputs)
        loss = criterion(y, labels)
        # Backward pass
        loss.backward()
        optimizer.step()
        # Track training accuracy: argmax over the class logits
        predictions = torch.argmax(y, dim=1)
        correct += torch.sum((predictions == labels).float())
        num_examples += len(labels)
    print("Epoch {} train accuracy: {}".format(epoch, correct / num_examples))
## Testing
correct = 0
total = len(agnews_test)  # number of test examples, not batches
with torch.no_grad():
    # Iterate through test set minibatches
    for inputs, labels in test_loader:
        # Forward pass
        y = model(inputs)
        predictions = torch.argmax(y, dim=1)
        correct += torch.sum((predictions == labels).float())
print('Test accuracy: {}'.format(correct / total))
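# A minimal inference sketch (illustrative addition, assuming the same
# old-torchtext API used above; the label order follows the AG News class ids):
from torchtext.data.utils import get_tokenizer, ngrams_iterator

ag_news_labels = ["World", "Sports", "Business", "Sci/Tech"]
tokenizer = get_tokenizer("basic_english")

def predict(text):
    # Numericalize the text the same way _csv_iterator did for the CSVs
    tokens = ngrams_iterator(tokenizer(text), ngrams)
    ids = torch.tensor([vocab[token] for token in tokens])
    with torch.no_grad():
        logits = model(ids.unsqueeze(1))  # add a batch dimension of size 1
    return ag_news_labels[torch.argmax(logits, dim=1).item()]

print(predict("Stocks rally as tech giants report record quarterly earnings"))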