visual_question_answering.lua
require 'torch'
require 'nn'
require 'image'
require 'loadcaffe'

utils = require 'misc.utils'
local preprocess = utils.preprocess

local TorchModel = torch.class('VQATorchModel')

function TorchModel:__init(proto_file, model_file, input_sz, backend, layer_name, model_path, input_encoding_size, rnn_size, rnn_layers, common_embedding_size, num_output, seed, gpuid)
  self.input_sz = input_sz
  self.layer_name = layer_name
  self.model_path = model_path
  self.input_encoding_size = input_encoding_size
  self.rnn_size = rnn_size
  self.rnn_layers = rnn_layers
  self.common_embedding_size = common_embedding_size
  self.num_output = num_output
  self.seed = seed
  self.gpuid = gpuid
  self:loadModel(proto_file, model_file, backend)

  torch.manualSeed(self.seed)
  -- Float tensors for GPU runs
  torch.setdefaulttensortype('torch.FloatTensor')
  -- Double tensors for CPU runs
  -- torch.setdefaulttensortype('torch.DoubleTensor')
  if self.gpuid >= 0 then
    require 'cunn'
    require 'cutorch'
    -- Note: always selects device 1, regardless of the gpuid value
    cutorch.setDevice(1)
    cutorch.manualSeed(self.seed)
  end
  collectgarbage()
end
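
-- A hypothetical construction call, for orientation only; the paths and
-- hyperparameter values below are illustrative assumptions, not values
-- shipped with this file:
--
--   local vqa_model = VQATorchModel(
--     'models/VGG_ILSVRC_19_layers_deploy.prototxt',  -- proto_file (assumed)
--     'models/VGG_ILSVRC_19_layers.caffemodel',       -- model_file (assumed)
--     224,                     -- input_sz
--     'nn',                    -- backend
--     'relu5_4',               -- layer_name for Grad-CAM (assumed)
--     'VQA_LSTM_CNN/lstm.t7',  -- model_path to pretrained VQA weights (assumed)
--     200, 512, 2, 1024, 1000, -- encoding size, rnn size/layers, joint dim, #answers
--     123,                     -- seed
--     -1)                      -- gpuid (-1 = CPU)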
function TorchModel:loadModel(proto_file, model_file, backend)
  -- Load CNN
  self.net = loadcaffe.load(proto_file, model_file, backend)

  -- Set to evaluate mode; strip the final linear + softmax layers and
  -- L2-normalize the resulting feature vector
  self.net:evaluate()
  self.net:remove()
  self.net:remove()
  self.net:add(nn.Normalize(2))

  -- Clone & replace ReLUs for Guided Backprop
  local cnn_gb = self.net:clone()
  cnn_gb:replace(utils.guidedbackprop)
  self.cnn_gb = cnn_gb
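  -- Guided Backprop (Springenberg et al.) modifies the ReLU backward pass so
  -- that only positive gradients flow through positive activations;
  -- utils.guidedbackprop presumably swaps each nn.ReLU for such a variant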
  -- VQA-specific dependencies
  -- https://github.com/VT-vision-lab/VQA_LSTM_CNN/blob/master/eval.lua
  -- Hacky: a global `opt` table is needed because opt.gpuid is read inside
  -- VQA_LSTM_CNN/misc.RNNUtils
  opt = {}
  opt.gpuid = self.gpuid
  require 'VQA_LSTM_CNN/misc.netdef'
  require 'VQA_LSTM_CNN/misc.RNNUtils'
  LSTM = require 'VQA_LSTM_CNN/misc.LSTM'
  cjson = require 'cjson'

  -- Load vocabulary
  local file = io.open('VQA_LSTM_CNN/data_prepro.json', 'r')
  local text = file:read()
  file:close()
  local json_file = cjson.decode(text)
  local vocabulary_size_q = 0
  for i, w in pairs(json_file['ix_to_word']) do vocabulary_size_q = vocabulary_size_q + 1 end
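  -- The count above is the question vocabulary size: ix_to_word maps index
  -- strings to words, and the one-hot question encoding built in predict
  -- uses one dimension per entry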
  -- VQA model definition
  local embedding_net_q = nn.Sequential()
    :add(nn.Linear(vocabulary_size_q, self.input_encoding_size))
    :add(nn.Dropout(0.5))
    :add(nn.Tanh())
  local encoder_net_q = LSTM.lstm_conventional(self.input_encoding_size, self.rnn_size, 1, self.rnn_layers, 0.5)
  local multimodal_net = nn.Sequential()
    :add(netdef.AxB(2 * self.rnn_size * self.rnn_layers, 4096, self.common_embedding_size, 0.5))
    :add(nn.Dropout(0.5))
    :add(nn.Linear(self.common_embedding_size, self.num_output))
  local dummy_state_q = torch.Tensor(self.rnn_size * self.rnn_layers * 2):fill(0)
  local dummy_output_q = torch.Tensor(1):fill(0)
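  -- The state vector holds both cell and hidden states for every layer,
  -- hence the factor of 2 in rnn_size * rnn_layers * 2; the same factor
  -- appears as the question-side input width of netdef.AxB above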
  -- Ship model to GPU
  if self.gpuid >= 0 then
    embedding_net_q:cuda()
    encoder_net_q:cuda()
    multimodal_net:cuda()
    dummy_state_q = dummy_state_q:cuda()
    dummy_output_q = dummy_output_q:cuda()
  end

  -- Set to evaluate mode
  embedding_net_q:evaluate()
  encoder_net_q:evaluate()
  multimodal_net:evaluate()

  -- Zero gradients
  embedding_net_q:zeroGradParameters()
  encoder_net_q:zeroGradParameters()
  multimodal_net:zeroGradParameters()

  -- Load pretrained VQA model weights
  embedding_w_q, embedding_dw_q = embedding_net_q:getParameters()
  encoder_w_q, encoder_dw_q = encoder_net_q:getParameters()
  multimodal_w, multimodal_dw = multimodal_net:getParameters()
  model_param = torch.load(self.model_path)
  embedding_w_q:copy(model_param['embedding_w_q'])
  encoder_w_q:copy(model_param['encoder_w_q'])
  multimodal_w:copy(model_param['multimodal_w'])

  local encoder_net_buffer_q = dupe_rnn(encoder_net_q, 26)
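  -- dupe_rnn (from misc.RNNUtils) clones the encoder once per time step with
  -- shared parameters so the LSTM can be unrolled; 26 is presumably the
  -- maximum supported question length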
  -- All fields below are used by the predict method
  self.embedding_net_q = embedding_net_q
  self.encoder_net_buffer_q = encoder_net_buffer_q
  self.vocabulary_size_q = vocabulary_size_q
  self.multimodal_net = multimodal_net
  self.dummy_state_q = dummy_state_q
  self.json_file = json_file
end
function TorchModel:predict(input_image_path, input_sz_w, input_sz_h, input_question, input_answer, out_path)
  -- Load and preprocess the image to the requested width and height
  local img = utils.preprocess(input_image_path, input_sz_w, input_sz_h)

  -- Ship CNNs and image to GPU
  if self.gpuid >= 0 then
    self.net:cuda()
    self.cnn_gb:cuda()
    img = img:cuda()
  end

  -- Forward pass through both CNN copies
  fv_im = self.net:forward(img)
  fv_im_gb = self.cnn_gb:forward(img)
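  -- fv_im is the CNN's penultimate (presumably 4096-d fc7) feature vector,
  -- L2-normalized by the nn.Normalize(2) layer added in loadModel; its width
  -- must match the image-side input of netdef.AxB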
  -- Tokenize the question with an external script, which writes ques_feat.json
  local cmd = 'python misc/prepro_ques.py --question "' .. input_question .. '"'
  os.execute(cmd)
  cmd = nil
  local file = io.open('ques_feat.json')
  local text = file:read()
  file:close()
  q_feats = cjson.decode(text)
  question = right_align(torch.LongTensor{q_feats.ques}, torch.LongTensor{q_feats.ques_length})
  fv_sorted_q = sort_encoding_onehot_right_align(question, torch.LongTensor{q_feats.ques_length}, self.vocabulary_size_q)
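  -- right_align pads tokens on the left so every question ends at the final
  -- time step; sort_encoding_onehot_right_align (from misc.RNNUtils) then
  -- one-hot encodes it -- judging by the uses below, [1] is the encoding,
  -- [2] the per-step batch sizes, and [3]/[4] permutation index maps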
  -- Ship question features to GPU
  if self.gpuid >= 0 then
    fv_sorted_q[1] = fv_sorted_q[1]:cuda()
    fv_sorted_q[3] = fv_sorted_q[3]:cuda()
    fv_sorted_q[4] = fv_sorted_q[4]:cuda()
  else
    fv_sorted_q[1] = fv_sorted_q[1]:double()
  end
  local question_max_length = fv_sorted_q[2]:size(1)

  -- Embedding forward
  local word_embedding_q = split_vector(self.embedding_net_q:forward(fv_sorted_q[1]), fv_sorted_q[2])

  -- Encoder forward
  local states_q, _ = rnn_forward(self.encoder_net_buffer_q, torch.repeatTensor(self.dummy_state_q:fill(0), 1, 1), word_embedding_q, fv_sorted_q[2])

  -- Multimodal forward
  local tv_q = states_q[question_max_length + 1]:index(1, fv_sorted_q[4])
  local scores = self.multimodal_net:forward({tv_q, fv_im})

  -- Get the predicted answer
  _, pred = torch.max(scores:double(), 2)
  answer = self.json_file['ix_to_ans'][tostring(pred[{1, 1}])]

  -- Explain the user-supplied answer if it is in the answer vocabulary,
  -- otherwise fall back to explaining the predicted answer
  local inv_vocab = utils.table_invert(self.json_file['ix_to_ans'])
  if input_answer ~= '' and inv_vocab[input_answer] ~= nil then
    answer_idx = inv_vocab[input_answer]
  else
    input_answer = answer
    answer_idx = inv_vocab[answer]
  end
  -- Set gradInput at the output layer for the chosen answer
  local doutput = utils.create_grad_input(self.multimodal_net.modules[#self.multimodal_net.modules], answer_idx)

  -- Multimodal backward, to obtain gradients w.r.t. the CNN image feature
  local tmp = self.multimodal_net:backward({tv_q, fv_im}, doutput:view(1, -1))
  local dcnn = tmp[2]

  -- Grad-CAM
  local gcam = utils.grad_cam(self.net, self.layer_name, dcnn)
  gcam = image.scale(gcam:float(), self.input_sz, self.input_sz)
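  -- Grad-CAM (Selvaraju et al.): gradients of the answer score w.r.t. the
  -- activations at layer_name are global-average-pooled into per-channel
  -- weights, the activation maps are summed with those weights, and a ReLU
  -- keeps only regions with positive influence; utils.grad_cam presumably
  -- implements this, and the map is upscaled to the input resolution above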
  local result = {}
  local hm = utils.to_heatmap(gcam)
  image.save(out_path .. 'vqa_gcam_raw_' .. input_answer .. '.png', image.toDisplayTensor(gcam))
  result['vqa_gcam_raw'] = out_path .. 'vqa_gcam_raw_' .. input_answer .. '.png'
  image.save(out_path .. 'vqa_gcam_' .. input_answer .. '.png', image.toDisplayTensor(hm))
  result['vqa_gcam'] = out_path .. 'vqa_gcam_' .. input_answer .. '.png'
  -- Guided Backprop
  local gb_viz = self.cnn_gb:backward(img, dcnn)
  -- self.cnn_gb = nil
  -- collectgarbage()

  -- BGR to RGB (Caffe models take BGR input, so gradients come back in BGR)
  gb_viz = gb_viz:index(1, torch.LongTensor{3, 2, 1})
  image.save(out_path .. 'vqa_gb_' .. input_answer .. '.png', image.toDisplayTensor(gb_viz))
  result['vqa_gb'] = out_path .. 'vqa_gb_' .. input_answer .. '.png'
  -- Guided Grad-CAM: elementwise product of the Guided Backprop visualization
  -- with the (broadcast) Grad-CAM map, combining high-resolution detail with
  -- answer-specific localization
  local gb_gcam = gb_viz:float():cmul(gcam:expandAs(gb_viz))
  image.save(out_path .. 'vqa_gb_gcam_' .. input_answer .. '.png', image.toDisplayTensor(gb_gcam))
  result['vqa_gb_gcam'] = out_path .. 'vqa_gb_gcam_' .. input_answer .. '.png'
  result['input_answer'] = input_answer
  result['answer'] = answer
  result['input_image'] = input_image_path
  return result
end
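
-- A hypothetical end-to-end usage sketch (kept commented out so requiring
-- this file has no side effects); the file names and question/answer strings
-- are illustrative assumptions:
--
--   local result = vqa_model:predict(
--     'input/cat_dog.jpg',     -- input_image_path (assumed)
--     224, 224,                -- input width and height
--     'What animal is this?',  -- input_question
--     '',                      -- input_answer ('' = explain the prediction)
--     'output/')               -- out_path prefix for the saved visualizations
--   print(result['answer'], result['vqa_gcam'])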