-- captioning.lua
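-- Generates a caption for an input image with a pre-trained neuraltalk2
-- model (https://github.com/karpathy/neuraltalk2) and writes Grad-CAM,
-- Guided Backprop, and Guided Grad-CAM visualizations for it.
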
require 'torch'
require 'nn'
require 'image'
local utils = require 'misc.utils'
local preprocess = utils.preprocess
local TorchModel = torch.class('CaptioningTorchModel')
function TorchModel:__init(model_path, backend, input_sz, layer, seed, gpuid)
  self.model_path = model_path
  self.backend = backend
  self.input_sz = input_sz
  self.layer = layer
  self.seed = seed
  self.gpuid = gpuid

  if self.gpuid >= 0 then
    require 'cunn'
    require 'cudnn'
    require 'cutorch'
    -- gpuid is zero-indexed; cutorch device ids are one-indexed
    cutorch.setDevice(self.gpuid + 1)
    cutorch.manualSeed(self.seed)
  end

  -- Seed and default tensor type are set before loading the model so that
  -- results are reproducible
  torch.manualSeed(self.seed)
  torch.setdefaulttensortype('torch.FloatTensor')

  -- neuraltalk2-specific dependencies
  -- https://github.com/karpathy/neuraltalk2
  self:loadModel(model_path)
end

function TorchModel:loadModel(model_path)
  -- Load the checkpoint and its neuraltalk2 dependencies
  local lm_misc_utils = require 'neuraltalk2.misc.utils'
  require 'neuraltalk2.misc.LanguageModel'
  local net_utils = require 'neuraltalk2.misc.net_utils'

  self.net = torch.load(model_path)
  print(self.net)
  local cnn_lm_model = self.net
  local cnn = cnn_lm_model.protos.cnn
  local lm = cnn_lm_model.protos.lm
  local vocab = cnn_lm_model.vocab

  -- Re-create the gradient buffers that were stripped before the checkpoint
  -- was saved, so backward passes work
  net_utils.unsanitize_gradients(cnn)
  local lm_modules = lm:getModulesList()
  for k, v in pairs(lm_modules) do
    net_utils.unsanitize_gradients(v)
  end

  -- Set to evaluate mode
  lm:evaluate()
  cnn:evaluate()

  self.cnn = cnn
  self.lm = lm
  self.net_utils = net_utils
  self.vocab = vocab
end
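
-- Generate a caption for the image at input_image_path, then compute and save
-- Grad-CAM, Guided Backprop, and Guided Grad-CAM visualizations for
-- input_caption (or for the generated caption when input_caption is '').
-- Returns a table mapping output names to the saved image paths.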
function TorchModel:predict(input_image_path, input_sz, input_caption, out_path)
  print(input_image_path)
  local img = utils.preprocess(input_image_path, input_sz, input_sz)

  -- Clone the CNN and swap its ReLUs for Guided Backprop units
  local cnn_gb = self.cnn:clone()
  cnn_gb:replace(utils.guidedbackprop)

  -- Ship model and input to GPU
  if self.gpuid >= 0 then
    self.cnn:cuda()
    cnn_gb:cuda()
    img = img:cuda()
    self.lm:cuda()
  end
  -- Forward passes: the clone sees the same image so that Guided Backprop
  -- gradients can be taken through it later
  local im_feats = self.cnn:forward(img)
  local im_feat = im_feats:view(1, -1)
  local im_feat_gb = cnn_gb:forward(img)

  -- Sample a caption from the language model (default sampling options);
  -- seq has shape (seq_length, 1) for a single image
  local sample_opts = {}
  local seq, seqlogps = self.lm:sample(im_feat, sample_opts)
  local caption = self.net_utils.decode_sequence(self.vocab, seq)
  if input_caption == '' then
    print("No caption provided; using the generated caption for Grad-CAM.")
    input_caption = caption[1]
  end
  print("Generated caption: ", caption[1])
  print("Grad-CAM caption: ", input_caption)

  -- Re-encode the target caption as label indices and run a scoring forward
  -- pass, so the log-probabilities of its words can be backpropagated
  local seq_length = self.lm.seq_length or 16
  local labels = utils.sent_to_label(self.vocab, input_caption, seq_length)
  if self.gpuid >= 0 then labels = labels:cuda() end

  local logprobs = self.lm:forward({im_feat, labels})
  local doutput = utils.create_grad_input_lm(logprobs, labels)
  if self.gpuid >= 0 then doutput = doutput:cuda() end
  -- Backward through the language model to get the gradient w.r.t. the CNN
  -- image feature
  local dlm, ddummy = unpack(self.lm:backward({im_feat, labels}, doutput))
  local dcnn = dlm[1]

  -- Grad-CAM
  local gcam = utils.grad_cam(self.cnn, self.layer, dcnn)
  gcam = image.scale(gcam:float(), self.input_sz, self.input_sz)
  local result = {}
  local hm = utils.to_heatmap(gcam)
  image.save(out_path .. 'captioning_gcam_raw_' .. input_caption .. '.png', image.toDisplayTensor(gcam))
  result['captioning_gcam_raw'] = out_path .. 'captioning_gcam_raw_' .. input_caption .. '.png'
  image.save(out_path .. 'caption_gcam_' .. input_caption .. '.png', image.toDisplayTensor(hm))
  result['captioning_gcam'] = out_path .. 'caption_gcam_' .. input_caption .. '.png'
  -- Guided Backprop
  local gb_viz = cnn_gb:backward(img, dcnn)
  -- BGR to RGB
  gb_viz = gb_viz:index(1, torch.LongTensor{3, 2, 1})
  image.save(out_path .. 'caption_gb_' .. input_caption .. '.png', image.toDisplayTensor(gb_viz))
  result['captioning_gb'] = out_path .. 'caption_gb_' .. input_caption .. '.png'

  -- Guided Grad-CAM: mask the Guided Backprop image with the Grad-CAM heatmap
  local gb_gcam = gb_viz:float():cmul(gcam:expandAs(gb_viz))
  image.save(out_path .. 'caption_gb_gcam_' .. input_caption .. '.png', image.toDisplayTensor(gb_gcam))
  result['captioning_gb_gcam'] = out_path .. 'caption_gb_gcam_' .. input_caption .. '.png'

  result['input_image'] = input_image_path
  result['input_caption'] = input_caption
  result['pred_caption'] = caption[1]
  return result
end
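
--[[
Example usage (a minimal sketch: the checkpoint path, layer index, image name,
and output directory below are hypothetical placeholders, not values shipped
with this file):

  local model = CaptioningTorchModel(
      'models/captioning_checkpoint.t7',  -- hypothetical neuraltalk2 checkpoint
      'cudnn',                            -- backend
      224,                                -- input image size
      30,                                 -- CNN layer to visualize (placeholder index)
      123,                                -- random seed
      0)                                  -- GPU id; -1 runs on CPU
  local result = model:predict('cat_dog.jpg', 224, '', 'output/')
  print('Predicted caption:', result['pred_caption'])
  print('Grad-CAM heatmap saved to:', result['captioning_gcam'])
--]]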