-
Notifications
You must be signed in to change notification settings - Fork 0
/
trainer.py
187 lines (152 loc) · 6.99 KB
/
trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import sys
import argparse
import json
import pprint
import time
from relational_network import RelationalNetwork
from data_pipeline import ClevrDataset
from config import cfg, cfg_from_file
from progressbar import ETA, Bar, Percentage, ProgressBar
# Define the arguments to be parsed for training.
# Defaults are interpolated into the help text with %(default)s so the
# documented value can never drift away from the real one.
parser = argparse.ArgumentParser(description="CLEVR relational network example")
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                    help='input size of a mini-batch for training (default: %(default)s)')
parser.add_argument('--epochs', type=int, default=800, metavar='N',
                    help='number of epochs for training of the network (default: %(default)s)')
# store_true flag: passing it turns CUDA off, so the help text says "disable".
parser.add_argument('--no_cuda', action='store_true', default=False,
                    help='disable CUDA training')
parser.add_argument('--seed', type=int, default=10, metavar='S',
                    help='random seed (default: %(default)s)')
parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
                    help='batches to wait for before logging the training status '
                         '(default: %(default)s)')
parser.add_argument('--cfg', dest='cfg_file',
                    help='optional configuration file to set the cfg in config')
# type=int keeps a value given on the command line comparable with the integer
# default and with the -1 sentinel that is tested after parsing.
parser.add_argument('--gpu', dest='gpu_id', type=int, default=0,
                    help='GPU device to perform training on, only relevant when --no_cuda '
                         'is not given; -1 keeps the GPU id from the configuration '
                         '(default: %(default)s)')
# Parse the arguments and see if cuda is available.
# CUDA is used only when it was not disabled on the command line AND a device
# is actually present; every later Variable/model placement checks args.cuda.
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
cfg.TRAIN.USE_CUDA = args.cuda
if args.cfg_file is not None:
    # NOTE(review): the file is loaded after USE_CUDA is set; presumably it can
    # override that value - confirm this precedence is intended.
    cfg_from_file(args.cfg_file)

# A value supplied on the command line may arrive as a string; normalise it so
# the -1 sentinel comparison works and cfg.GPU_ID always has the same type as
# the integer default.
args.gpu_id = int(args.gpu_id)
if args.gpu_id != -1:
    cfg.GPU_ID = args.gpu_id

# Initialize the state using the defined seed (CPU generator, plus the CUDA
# generator when training on GPU).
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)
# Build the dataset wrapper rooted at the configured data directory and pull
# out the training and validation splits.
data_dir = cfg.DATA_DIR
dataset = ClevrDataset(
    train_data_dir=data_dir + '/train/',
    val_data_dir=data_dir + '/val'
)
train_dataset = dataset.get_train_data()
val_dataset = dataset.get_val_data()

# JSON lookup tables stored next to the splits: question words -> index and
# answers -> index.
word_to_ix_file = data_dir + '/word_to_ix.json'
answer_to_ix_file = data_dir + '/answer_to_ix.json'
with open(word_to_ix_file, 'r') as vocab_dict:
    with open(answer_to_ix_file, 'r') as answer_dict:
        word_to_ix = json.load(vocab_dict)
        answer_to_ix = json.load(answer_dict)

# Values that depend on the data or the command line override whatever the
# configuration held. The vocabulary gets one slot more than the number of
# known words.
cfg.TRAIN.BATCH_SIZE = args.batch_size
cfg.TRAIN.ANSWER_SIZE = len(answer_to_ix)
cfg.TRAIN.VOCAB_SIZE = 1 + len(word_to_ix)

print("Current configuration being used")
pprint.pprint(cfg)
# Define the model and port it to gpu.
model = RelationalNetwork()
# Only spread the model over several GPUs when CUDA is actually in use.
# Wrapping unconditionally would leave a CPU-resident model inside a
# DataParallel that targets GPU devices whenever --no_cuda is given on a
# multi-GPU machine.
if args.cuda and torch.cuda.device_count() > 1:
    print("Using ", torch.cuda.device_count(), " GPUs")
    model = nn.DataParallel(model)
if args.cuda:
    model = model.cuda()

# Define the optimizer for training the network.
# No separate criterion object is created: the loss is computed with
# F.nll_loss inside train().
optimizer = optim.Adam(model.parameters(), lr=cfg.TRAIN.LEARNING_RATE)
# Define the train step
def train(updates_per_epoch):
    """Run one epoch of optimisation over the training set.

    Uses the module-level ``model``, ``optimizer``, ``train_dataset``,
    ``args`` and the progress bar ``pbar`` created by the training loop.

    Args:
        updates_per_epoch: number of mini-batches to draw and optimise on.

    Returns:
        A ``(mean_loss, mean_accuracy)`` tuple of Python floats, both averaged
        over the batches; accuracy is expressed in percent.
    """
    # val() puts the model in eval mode; switch back explicitly so every epoch
    # (not just the first) trains with training-mode layers.
    model.train()

    train_loss = 0.0
    train_accuracy = 0.0
    for iteration in range(updates_per_epoch):
        pbar.update(iteration)
        images, questions, answers = train_dataset.next_batch(args.batch_size)

        # Move the last image axis to position 1 and cast to float
        # (presumably NHWC -> NCHW; confirm against the data pipeline).
        images = Variable(torch.from_numpy(images).permute(0, 3, 1, 2).float())
        # Process the questions and answers separately
        questions = Variable(torch.LongTensor(questions))
        # Flatten to one label per example without assuming the batch size.
        answers = Variable(torch.LongTensor(answers)).view(-1)

        if args.cuda:
            images = images.cuda()
            questions = questions.cuda()
            answers = answers.cuda()

        model.zero_grad()
        answers_hat = model(images, questions)
        loss = F.nll_loss(answers_hat, answers)
        loss.backward()
        optimizer.step()

        # .item() extracts a Python number from a 0-dim tensor; indexing the
        # loss with [0] fails on 0-dim tensors.
        train_loss += loss.item()

        # Index of the highest-scoring answer for each example.
        pred = answers_hat.data.max(1)[1]
        correct = pred.eq(answers.data).cpu().sum().item()
        train_accuracy += correct * 100. / len(answers)

    train_accuracy = train_accuracy / updates_per_epoch
    train_loss = train_loss / updates_per_epoch
    return train_loss, train_accuracy
# Define the validation step
def val(num_batches=None):
    """Measure mean accuracy (in percent) over validation mini-batches.

    Uses the module-level ``model``, ``val_dataset`` and ``args``.

    Args:
        num_batches: how many validation batches to evaluate. When omitted,
            the module-level ``updates_per_epoch_val`` is used, so both
            ``val()`` and ``val(updates_per_epoch_val)`` are valid calls.

    Returns:
        The accuracy averaged over the evaluated batches as a Python float,
        or ``0.0`` when there are no batches to evaluate.
    """
    if num_batches is None:
        num_batches = updates_per_epoch_val
    if num_batches <= 0:
        return 0.0

    # Remember the current mode so evaluation does not leave the model stuck
    # in eval mode for the next training epoch.
    was_training = model.training
    model.eval()
    val_accuracy = 0.0
    try:
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for _ in range(num_batches):
                val_images, val_questions, val_answers = val_dataset.next_batch(args.batch_size)

                # Move the last image axis to position 1 and cast to float
                # (presumably NHWC -> NCHW; confirm against the data pipeline).
                val_images = Variable(torch.from_numpy(val_images).permute(0, 3, 1, 2).float())
                # Process the questions and the answers
                val_questions = Variable(torch.LongTensor(val_questions))
                # Flatten to one label per example without assuming the batch size.
                val_answers = Variable(torch.LongTensor(val_answers)).view(-1)

                if args.cuda:
                    val_images = val_images.cuda()
                    val_questions = val_questions.cuda()
                    val_answers = val_answers.cuda()

                val_answers_hat = model(val_images, val_questions)
                # Index of the highest-scoring answer for each example.
                val_pred = val_answers_hat.data.max(1)[1]
                val_correct = val_pred.eq(val_answers.data).cpu().sum().item()
                val_accuracy += val_correct * 100.0 / len(val_answers)
    finally:
        model.train(was_training)

    # Average over the batches actually evaluated rather than a fixed count.
    return val_accuracy / num_batches
# Example counts per split, and the number of whole mini-batches each split
# yields; these bound the per-epoch loops and size the progress bar.
number_examples = train_dataset._num_examples
number_examples_val = val_dataset._num_examples
updates_per_epoch = number_examples // cfg.TRAIN.BATCH_SIZE
updates_per_epoch_val = number_examples_val // cfg.TRAIN.BATCH_SIZE

# Scaffolding for one-hot answer encoding: the answer -> index table, an
# all-zero template with one slot per answer, and an (initially empty)
# answer -> one-hot mapping.
# NOTE(review): this re-reads answer_to_ix from a fixed path instead of the
# file under cfg.DATA_DIR loaded earlier - confirm both point at the same data.
with open('./data/clver_rn/answer_to_ix.json', 'r') as answer_file:
    answer_to_ix = json.load(answer_file)
one_hot_init_vector = len(answer_to_ix) * [0]
answer_to_one_hot = dict()
# Set up the training loop: one pass of training followed by one validation
# pass per epoch, with a single summary line printed at the end of each.
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()

    # Fresh progress bar per epoch; train() advances it once per batch.
    widgets = ['epoch #%d|' % epoch, Percentage(), Bar(), ETA()]
    pbar = ProgressBar(maxval=updates_per_epoch, widgets=widgets)
    pbar.start()

    # Call the train and the test step for the dataset
    epoch_loss, epoch_accuracy = train(updates_per_epoch)
    # Complete the bar and end its line so the summary below starts cleanly.
    pbar.finish()
    log_line_train = '%s: %0.4f; %s: %0.4f; ' % ("Training Loss", epoch_loss,
                                                 "Training Accuracy", epoch_accuracy)

    # val() reads updates_per_epoch_val itself, so no argument is passed.
    val_accuracy = val()
    log_line_val = '%s: %0.4f ' % ("Validation Accuracy", val_accuracy)

    epoch_end_time = time.time()
    time_taken = epoch_end_time - epoch_start_time
    log_time_line = '%s: %0.4f' % ("Time taken for the current epoch", time_taken)

    print("Epoch %d | " % (epoch) + log_line_train + log_line_val + log_time_line)
    # Flush after printing so the line appears immediately when stdout is
    # redirected to a file or pipe.
    sys.stdout.flush()