# coding: utf-8
# # Homework: Boltzmann Machines applied to MNIST
#
# This notebook shows how to train a Boltzmann machine to sample from an observed probability distribution. It uses the MNIST digit images that ship with every Keras installation.
# Example code for the lecture series "Machine Learning for Physicists" by Florian Marquardt
#
# Lecture 9, Homework (this is discussed in session 10)
#
# See https://machine-learning-for-physicists.org and the current course website linked there!
#
# This notebook is distributed under the Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license:
#
# https://creativecommons.org/licenses/by-sa/4.0/
# This notebook shows how to:
# - use a Boltzmann machine to sample from an observed high-dimensional probability distribution (e.g. produce images that look similar to observed training images); applied to the case of MNIST
#
# It also implements some of the tricks recommended by G. Hinton in his practical guide to training RBMs:
#
# https://www.cs.toronto.edu/~hinton/absps/guideTR.pdf
# In[1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi']=300 # highres display
import tensorflow as tf
from IPython.display import clear_output
from time import sleep
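
# For orientation (added remark, not part of the original notebook): the RBM used
# below assigns a probability P(v,h) ~ exp(-E(v,h)) with energy
#    E(v,h) = - sum_i a_i v_i - sum_j b_j h_j - sum_ij v_i w_ij h_j
# The conditional P(h_j=1|v) used in BoltzmannStep below follows from this energy.
# A minimal sketch of the batched energy (illustrative helper, not used later):
def rbm_energy(v, h, a, b, w):
    """Energy E(v,h) for each sample; v: [batchsize,num_visible], h: [batchsize,num_hidden]."""
    return ( -np.dot(v,a) - np.dot(h,b)
             - np.einsum('bi,ij,bj->b', v, w, h) )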
# In[2]:
def BoltzmannStep(v,b,w,do_random_sampling=True):
    """
    Perform a single step of the Markov chain,
    going from visible units v to hidden units h,
    according to biases b and weights w.

    z_j = b_j + sum_i v_i w_ij
    and P(h_j=1|v) = 1/(exp(-z_j)+1)

    Note: you can go from h to v, by inserting
    instead of v the h, instead of b the a, and
    instead of w the transpose of w
    """
    batchsize=np.shape(v)[0]
    hidden_dim=np.shape(w)[1]
    z=b+np.dot(v,w)
    P=1/(np.exp(-z)+1)
    # now, the usual trick to obtain 0 or 1 according
    # to a given probability distribution:
    # just produce uniform (in [0,1]) random numbers and
    # check whether they are below the cutoff given by P
    if do_random_sampling:
        p=np.random.uniform(size=[batchsize,hidden_dim])
        return(np.array(p<=P,dtype='int'))
    else:
        return(P) # no binary random output, just the prob. distribution itself!
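
# Minimal usage sketch (added for illustration, not part of the original flow):
# as the docstring notes, the same routine performs the reverse step h -> v when
# given the visible biases a and the transposed weights. A tiny deterministic
# check with made-up shapes (4 visible units, 3 hidden units, batchsize 2):
_v_demo=np.array([[0.,1.,0.,1.],[1.,1.,0.,0.]])  # [batchsize,num_visible]
_w_demo=0.01*np.ones([4,3])                      # [num_visible,num_hidden]
_a_demo=np.zeros(4)                              # visible biases
_b_demo=np.zeros(3)                              # hidden biases
_h_demo=BoltzmannStep(_v_demo,_b_demo,_w_demo,do_random_sampling=False)                # v -> h (probabilities)
_v_back=BoltzmannStep(_h_demo,_a_demo,np.transpose(_w_demo),do_random_sampling=False)  # h -> v via the transpose trick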

def BoltzmannSequence(v,a,b,w,drop_h_prime=False,do_random_sampling=True,
                      do_not_sample_h_prime=False,
                      do_not_sample_v_prime=False):
    """
    Perform one sequence of steps v -> h -> v' -> h'
    of a Boltzmann machine, with the given
    weights w and biases a and b!

    All the arrays have a shape [batchsize,num_neurons]
    (where num_neurons is num_visible for v and
    num_hidden for h)

    You can set drop_h_prime to True if you want to
    use this routine to generate arbitrarily long sequences
    by calling it repeatedly (then don't use h')

    Returns: v,h,v',h'
    """
    h=BoltzmannStep(v,b,w,do_random_sampling=do_random_sampling)
    if do_not_sample_v_prime:
        v_prime=BoltzmannStep(h,a,np.transpose(w),do_random_sampling=False)
    else:
        v_prime=BoltzmannStep(h,a,np.transpose(w),do_random_sampling=do_random_sampling)
    if not drop_h_prime:
        if do_not_sample_h_prime: # G. Hinton recommends not sampling in the v'->h' step (reduces noise)
            h_prime=BoltzmannStep(v_prime,b,w,do_random_sampling=False)
        else:
            h_prime=BoltzmannStep(v_prime,b,w,do_random_sampling=do_random_sampling)
    else:
        h_prime=np.zeros(np.shape(h))
    return(v,h,v_prime,h_prime)
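
# Usage sketch (illustration only): once a, b, w are trained, an arbitrarily long
# Markov chain can be generated by calling this routine repeatedly, feeding v_prime
# back in as the next v and setting drop_h_prime=True since h' is not needed:
#
#   for step in range(num_chain_steps):   # num_chain_steps is a placeholder name
#       v,h,v_prime,h_prime = BoltzmannSequence(v,a,b,w,drop_h_prime=True)
#       v = np.copy(v_prime)
#
# (the sampling cells further below do exactly this)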

def trainStep(v,a,b,w,do_random_sampling=True,do_not_sample_h_prime=False,
              do_not_sample_v_prime=False):
    """
    Given a set of randomly selected training samples
    v (of shape [batchsize,num_neurons_visible]),
    and given biases a,b and weights w: update
    those biases and weights according to the
    contrastive-divergence update rules:

    delta w_ij = eta ( <v_i h_j> - <v'_i h'_j> )
    delta a_i  = eta ( <v_i> - <v'_i> )
    delta b_j  = eta ( <h_j> - <h'_j> )

    Returns delta_a, delta_b, delta_w, but without the eta factor!
    It is up to you to update a,b,w!
    """
    v,h,v_prime,h_prime=BoltzmannSequence(v,a,b,w,do_random_sampling=do_random_sampling,
                                          do_not_sample_h_prime=do_not_sample_h_prime,
                                          do_not_sample_v_prime=do_not_sample_v_prime)
    return( np.average(v,axis=0)-np.average(v_prime,axis=0) ,
            np.average(h,axis=0)-np.average(h_prime,axis=0) ,
            np.average(v[:,:,None]*h[:,None,:],axis=0)-
            np.average(v_prime[:,:,None]*h_prime[:,None,:],axis=0) )
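
# Usage sketch (illustration only; the training cells below do this for real):
# trainStep returns the contrastive-divergence gradients without the learning
# rate, so it is up to the caller to apply eta, e.g.
#
#   da,db,dw = trainStep(v,a,b,w)
#   a += eta*da
#   b += eta*db
#   w += eta*dw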
# In[3]:
def produce_sample_images(batchsize,num_visible,x_train,threshold=0.7,do_digitize=True):
    """
    Produce 'batchsize' samples, of length num_visible.
    Returns array v of shape [batchsize,num_visible]
    """
    j=np.random.randint(low=0,high=np.shape(x_train)[0],size=batchsize) # pick random samples
    # reshape suitably, and digitize (so output is 0/1 values)
    if do_digitize:
        return( np.array( np.reshape(x_train[j,:,:],[batchsize,num_visible])>threshold, dtype='int' ) )
    else:
        return( np.reshape(x_train[j,:,:],[batchsize,num_visible]) )
# In[4]:
# Load the MNIST data using tensorflow/keras
(x_train,y_train),(x_test,y_test)=tf.keras.datasets.mnist.load_data(path="mnist.npz")
x_train=x_train/256. # normalize grey values to the range [0,1)
# In[5]:
# Show the shape of these arrays
print(np.shape(x_train),np.shape(y_train),np.shape(x_test),np.shape(y_test))
# In[6]:
# display a few images, for fun
nimages=10
fig,ax=plt.subplots(ncols=nimages,nrows=1,figsize=(nimages,1))
for n in range(nimages):
    ax[n].imshow(x_train[n,:,:])
    ax[n].set_title(str(y_train[n])) # the label
    ax[n].axis('off')
plt.show()
# In[7]:
# pick a few random samples:
nimages=7
Npixels=28
samples=produce_sample_images(batchsize=nimages,num_visible=Npixels**2,x_train=x_train)
# now unpack them again and display them:
fig,ax=plt.subplots(ncols=nimages,nrows=1,figsize=(nimages,1))
for n in range(nimages):
    ax[n].imshow(np.reshape(samples[n,:],[Npixels,Npixels]))
    ax[n].axis('off')
plt.show()
# In[8]:
# a little trick (useful later): show them all at once, in one imshow
# some weird reshape/transpose gymnastics, found by trial and error
plt.imshow(np.transpose(np.reshape(np.transpose(np.reshape(samples,[nimages,Npixels,Npixels]),
                                                axes=[0,2,1]),[nimages*Npixels,Npixels])))
plt.axis('off')
plt.show()
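
# The same tiling can also be written as a small helper function (a readability
# sketch added here for clarity; the cells in this notebook keep using the inline
# reshape/transpose expression above):
def tile_images_horizontally(samples_flat,npixels):
    """Arrange a batch of flattened images [n,npixels**2] side by side
    into a single [npixels, n*npixels] array for one imshow call."""
    imgs=np.reshape(samples_flat,[-1,npixels,npixels])  # [n,rows,cols]
    return(np.concatenate(list(imgs),axis=1))           # stack along the column axis
# e.g. plt.imshow(tile_images_horizontally(samples,Npixels)) produces the same
# side-by-side picture as the imshow call above.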
# ## Basic training, following the simple RBM principle presented in the lecture (all units are kept binary, 0 or 1, at all times, sampled randomly according to the computed probabilities)
# In[92]:
# Now: the training
# here: purely using random binary sampling of all
# units at all times (this is not the most efficient way,
# but it directly implements the basic principle shown in the lecture)
Npixels=28
num_visible=Npixels**2
num_hidden=60
batchsize=50
eta=0.1
nsteps=10000
skipsteps=10
a=np.random.randn(num_visible)
b=np.random.randn(num_hidden)
w=0.01*np.random.randn(num_visible,num_hidden)
# test_samples=np.zeros([num_visible,nsteps])
for j in range(nsteps):
    v=produce_sample_images(batchsize,num_visible,x_train)
    da,db,dw=trainStep(v,a,b,w)
    a+=eta*da
    b+=eta*db
    w+=eta*dw
    print("{:05d} / {:05d}".format(j,nsteps),end=" \r")
# In[93]:
# Now: visualize typical samples generated (from some starting point)
# Run this cell several times to continue the chain. It is basically a random walk
# through the space of all possible configurations, hopefully distributed according
# to the probability distribution that has been trained!
nsteps=1000
num_samples=20
test_samples=np.zeros([num_samples,num_visible])
skipsteps=1
substeps=400 # how many steps to take before showing a new picture
v_prime=np.zeros(num_visible)
h=np.zeros(num_hidden)
h_prime=np.zeros(num_hidden)
# note: v is taken over from the training cell above (the last training batch),
# of shape [batchsize,num_visible]
for j in range(nsteps):
    for k in range(substeps):
        v,h,v_prime,h_prime=BoltzmannSequence(v,a,b,w,drop_h_prime=True) # step from v via h to v_prime!
        test_samples[j%num_samples,:]=v[0,:]
        v=np.copy(v_prime) # use the new v as a starting point for next step!
    if j%skipsteps==0 or j==nsteps-1:
        clear_output(wait=True)
        plt.imshow(np.transpose(np.reshape(np.transpose(np.reshape(test_samples,[num_samples,Npixels,Npixels]),
                                                        axes=[0,2,1]),[num_samples*Npixels,Npixels])),
                   interpolation='none')
        plt.axis('off')
        plt.show()
# ## More advanced training: do not randomly sample h' and v'
# In[29]:
# Now: the training
#
# Here we use the more sophisticated approach, where
# h' and v' are not binary (not randomly sampled), rather
# they are taken as the probability distribution (numbers
# between 0 and 1). This is a trick recommended by G. Hinton
# in his review of Boltzmann Machines. It effectively means
# less sampling noise.
#
# Also, we initialize the biases and weights according to the
# tricks given in that review!
Npixels=28
num_visible=Npixels**2
num_hidden=60
batchsize=10
eta=0.0001
nsteps=10*30000
skipsteps=10
# get average brightness of training images:
p_avg=np.average(np.reshape(x_train,[np.shape(x_train)[0],Npixels**2]),axis=0)
a=np.log(p_avg/(1.0+1e-6-p_avg)+1e-6) # recipe for visible biases: the logit log(p/(1-p)) of the average pixel value (small offsets avoid division by zero and log(0))
b=np.zeros(num_hidden) # recipe for hidden biases
w=0.01*np.random.randn(num_visible,num_hidden) # recipe for weights
# test_samples=np.zeros([num_visible,nsteps])
for j in range(nsteps):
    v=produce_sample_images(batchsize,num_visible,x_train,
                            do_digitize=False)
    da,db,dw=trainStep(v,a,b,w,
                       do_not_sample_h_prime=True,
                       do_not_sample_v_prime=True)
    a+=eta*da
    b+=eta*db
    w+=eta*dw
    print("{:06d} / {:06d}".format(j,nsteps),end=" \r")
# In[32]:
# Now: visualize typical samples generated (from some starting point)
# Run this cell several times to continue the chain. It is basically a random walk
# through the space of all possible configurations, hopefully distributed according
# to the probability distribution that has been trained!
nsteps=20
num_samples=20
test_samples=np.zeros([num_samples,batchsize,num_visible])
test_hidden=np.zeros([num_samples,batchsize,num_hidden])
skipsteps=1
substeps=400 # how many steps to take before showing a new picture
v_prime=np.zeros([batchsize,num_visible])
h=np.zeros([batchsize,num_hidden])
h_prime=np.zeros([batchsize,num_hidden])
v=produce_sample_images(batchsize,num_visible,x_train,
do_digitize=False)
for j in range(nsteps):
    for k in range(substeps):
        v,h,v_prime,h_prime=BoltzmannSequence(v,a,b,w,
                                              drop_h_prime=True,
                                              do_not_sample_v_prime=True) # step from v via h to v_prime!
        test_samples[j%num_samples,:,:]=v[:,:]
        test_hidden[j%num_samples,:]=h[:,:]
        v=np.copy(v_prime) # use the new v as a starting point for next step!
    if j%skipsteps==0 or j==nsteps-1:
        clear_output(wait=True)
        fig,ax=plt.subplots(ncols=1,nrows=batchsize,figsize=(num_samples,batchsize))
        for n in range(batchsize):
            ax[n].imshow(np.transpose(np.reshape(np.transpose(np.reshape(test_samples[:,n,:],[num_samples,Npixels,Npixels]),
                                                              axes=[0,2,1]),[num_samples*Npixels,Npixels])),
                         interpolation='none')
            ax[n].axis('off')
        plt.show()
        fig,ax=plt.subplots(ncols=1,nrows=batchsize,figsize=(num_samples,batchsize))
        for n in range(batchsize):
            ax[n].imshow(np.transpose(np.reshape(np.transpose(np.reshape(test_hidden[:,n,:],[num_samples,6,10]),
                                                              axes=[0,2,1]),[num_samples*10,6])),
                         interpolation='none')
            ax[n].axis('off')
        plt.show()