Variational Autoencoder
A variational autoencoder (VAE) provides a probabilistic way of describing an observation in latent space. Rather than building an encoder that outputs a single value for each latent attribute, we formulate the encoder to describe a probability distribution over each latent attribute.
We can only observe x, but we would like to infer the characteristics of the latent variable z, i.e. the posterior $p(z \mid x) = \frac{p(x \mid z)\, p(z)}{p(x)}$. However, the denominator $p(x) = \int p(x \mid z)\, p(z)\, dz$ is intractable, since it requires integrating over all configurations of z.
To approximate this we can use variational inference, where we are interested in maximizing the evidence lower bound (ELBO). We approximate p(z | x) with another distribution q(z | x), which we define so that it has a tractable form. If we can choose the parameters of q(z | x) so that it is very similar to p(z | x), we can use it to perform approximate inference of the intractable posterior.
Recall that the KL divergence is a measure of the difference between two probability distributions. Thus, if we want to ensure that q(z | x) is similar to p(z | x), we can minimize the KL divergence between the two distributions.
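Writing out this KL divergence and rearranging yields the standard decomposition of the log evidence:
$$\log p(x) = \underbrace{\mathbb{E}_{q(z \mid x)}\big[\log p(x \mid z)\big] - \mathrm{KL}\big(q(z \mid x) \,\|\, p(z)\big)}_{\text{ELBO}} + \mathrm{KL}\big(q(z \mid x) \,\|\, p(z \mid x)\big)$$
Since the last term is non-negative, the ELBO is a lower bound on $\log p(x)$, and maximizing the ELBO with respect to q is equivalent to minimizing KL(q(z | x) || p(z | x)). The ELBO, expected log likelihood minus the KL to the prior, is exactly the objective built in the code below.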
Since we cannot backpropagate directly through the sampling step, we use the reparameterization trick. First, we sample a noise variable ϵ from a simple distribution such as the standard normal.
Then, we apply a deterministic transformation $g_{\phi}(\epsilon, x)$ that maps the random noise into a sample from the more complex distribution.
Instead of writing $z \sim q_{\mu, \sigma}(z) = \mathcal{N}(\mu, \sigma)$ we can write $z = g_{\phi}(\epsilon, x) = \mu + \sigma \odot \epsilon$ with $\epsilon \sim \mathcal{N}(0, I)$, so that the randomness is isolated in $\epsilon$ and gradients can flow through $\mu$ and $\sigma$.
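A minimal NumPy sketch of this transformation (the values below are illustrative, not taken from the model that follows):

import numpy as np

def reparameterize(mu, sigma):
    # eps carries all the randomness; the mapping itself is deterministic,
    # so gradients can flow through mu and sigma
    eps = np.random.randn(*mu.shape)
    return mu + sigma * eps

mu = np.array([0.5, -1.0])     # illustrative 2-dimensional mean
sigma = np.array([0.1, 0.3])   # illustrative standard deviations
z = reparameterize(mu, sigma)  # a sample from N(mu, diag(sigma**2))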
The architecture of the variational autoencoder is implemented below.
from __future__ import print_function, division
from builtins import range, input
import util
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
st = None
try:
st = tf.contrib.bayesflow.stochastic_tensor
except:
# doesn't exist in later versions of TF
# we will use the reparameterization trick instead
# watch the later lecture on the reparameterization trick
# to learn about it.
pass
Normal = tf.contrib.distributions.Normal
Bernoulli = tf.contrib.distributions.Bernoulli
class DenseLayer(object):
def __init__(self, M1, M2, f=tf.nn.relu):
# self.M1 = M1
# self.M2 = M2
self.W = tf.Variable(tf.random_normal(shape=(M1, M2)) * 2 / np.sqrt(M1))
self.b = tf.Variable(np.zeros(M2).astype(np.float32))
self.f = f
def forward(self, X):
return self.f(tf.matmul(X, self.W) + self.b)
class VariationalAutoencoder:
def __init__(self, D, hidden_layer_sizes):
# hidden_layer_sizes specifies the size of every layer
# in the encoder
# up to the final hidden layer Z
# the decoder will have the reverse shape
# represents a batch of training data
self.X = tf.placeholder(tf.float32, shape=(None, D))
# encoder
self.encoder_layers = []
M_in = D
for M_out in hidden_layer_sizes[:-1]:
h = DenseLayer(M_in, M_out)
self.encoder_layers.append(h)
M_in = M_out
# for convenience, we'll refer to the final encoder size as M
# also the input to the decoder size
M = hidden_layer_sizes[-1]
# the encoder's final layer output is unbounded
# so there is no activation function
# we also need 2 times as many units as the latent size M
# since there need to be M means + M standard deviations
h = DenseLayer(M_in, 2 * M, f=lambda x: x)
self.encoder_layers.append(h)
# get the mean and variance / std dev of Z.
# note that the variance must be > 0
# we can get a sigma (standard dev) > 0 from an unbounded variable by
# passing it through the softplus function.
# add a small amount for smoothing.
current_layer_value = self.X
for layer in self.encoder_layers:
current_layer_value = layer.forward(current_layer_value)
self.means = current_layer_value[:, :M]
self.stddev = tf.nn.softplus(current_layer_value[:, M:]) + 1e-6
# get a sample of Z
# we need to use a stochastic tensor
# in order for the errors to be backpropagated past this point
if st is None:
# doesn't exist in later versions of Tensorflow
# we'll use the same trick we use in Theano
standard_normal = Normal(
loc=np.zeros(M, dtype=np.float32),
scale=np.ones(M, dtype=np.float32)
)
e = standard_normal.sample(tf.shape(self.means)[0])
# reparameterization trick: z = mu + sigma * eps
self.Z = e * self.stddev + self.means
# note: this also works because Tensorflow
# now does the "magic" for you
# n = Normal(
# loc=self.means,
# scale=self.stddev,
# )
# self.Z = n.sample()
else:
with st.value_type(st.SampleValue()):
self.Z = st.StochasticTensor(Normal(loc=self.means, scale=self.stddev))
# to get back Q(Z), the distribution of Z
# we will later use self.Z.distribution
# decoder
self.decoder_layers = []
M_in = M
for M_out in reversed(hidden_layer_sizes[:-1]):
h = DenseLayer(M_in, M_out)
self.decoder_layers.append(h)
M_in = M_out
# the decoder's final layer should technically go through a sigmoid
# so that the final output is a binary probability (e.g. Bernoulli)
# but Bernoulli accepts logits (pre-sigmoid) so we will take those
# so no activation function is needed at the final layer
h = DenseLayer(M_in, D, f=lambda x: x)
self.decoder_layers.append(h)
# get the logits
current_layer_value = self.Z
for layer in self.decoder_layers:
current_layer_value = layer.forward(current_layer_value)
logits = current_layer_value
posterior_predictive_logits = logits # save for later
# get the output
self.X_hat_distribution = Bernoulli(logits=logits)
# take samples from X_hat
# we will call this the posterior predictive sample
self.posterior_predictive = self.X_hat_distribution.sample()
self.posterior_predictive_probs = tf.nn.sigmoid(logits)
# take sample from a Z ~ N(0, 1)
# and put it through the decoder
# we will call this the prior predictive sample
standard_normal = Normal(
loc=np.zeros(M, dtype=np.float32),
scale=np.ones(M, dtype=np.float32)
)
Z_std = standard_normal.sample(1)
current_layer_value = Z_std
for layer in self.decoder_layers:
current_layer_value = layer.forward(current_layer_value)
logits = current_layer_value
prior_predictive_dist = Bernoulli(logits=logits)
self.prior_predictive = prior_predictive_dist.sample()
self.prior_predictive_probs = tf.nn.sigmoid(logits)
# prior predictive from input
# only used for generating visualization
self.Z_input = tf.placeholder(tf.float32, shape=(None, M))
current_layer_value = self.Z_input
for layer in self.decoder_layers:
current_layer_value = layer.forward(current_layer_value)
logits = current_layer_value
self.prior_predictive_from_input_probs = tf.nn.sigmoid(logits)
# now build the cost
if st is None:
# closed-form KL( N(mu, sigma^2) || N(0, 1) ) = -log(sigma) + (sigma^2 + mu^2)/2 - 1/2
kl = -tf.log(self.stddev) + 0.5*(self.stddev**2 + self.means**2) - 0.5
kl = tf.reduce_sum(kl, axis=1)
else:
kl = tf.reduce_sum(
tf.contrib.distributions.kl_divergence(
self.Z.distribution, standard_normal
),
1
)
expected_log_likelihood = tf.reduce_sum(
self.X_hat_distribution.log_prob(self.X),
1
)
# equivalent
# expected_log_likelihood = -tf.nn.sigmoid_cross_entropy_with_logits(
# labels=self.X,
# logits=posterior_predictive_logits
# )
# expected_log_likelihood = tf.reduce_sum(expected_log_likelihood, 1)
self.elbo = tf.reduce_sum(expected_log_likelihood - kl)
self.train_op = tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(-self.elbo)
# set up session and variables for later
self.init_op = tf.global_variables_initializer()
self.sess = tf.InteractiveSession()
self.sess.run(self.init_op)
def fit(self, X, epochs=30, batch_sz=64):
costs = []
n_batches = len(X) // batch_sz
print("n_batches:", n_batches)
for i in range(epochs):
print("epoch:", i)
np.random.shuffle(X)
for j in range(n_batches):
batch = X[j*batch_sz:(j+1)*batch_sz]
_, c = self.sess.run((self.train_op, self.elbo), feed_dict={self.X: batch})
c /= batch_sz # just debugging
costs.append(c)
if j % 100 == 0:
print("iter: %d, cost: %.3f" % (j, c))
plt.plot(costs)
plt.show()
def transform(self, X):
return self.sess.run(
self.means,
feed_dict={self.X: X}
)
def prior_predictive_with_input(self, Z):
return self.sess.run(
self.prior_predictive_from_input_probs,
feed_dict={self.Z_input: Z}
)
def posterior_predictive_sample(self, X):
# returns a sample from p(x_new | X)
return self.sess.run(self.posterior_predictive, feed_dict={self.X: X})
def prior_predictive_sample_with_probs(self):
# returns a sample from p(x_new | z), z ~ N(0, 1)
return self.sess.run((self.prior_predictive, self.prior_predictive_probs))
def main():
X, Y = util.get_mnist()
# convert X to binary variable
X = (X > 0.5).astype(np.float32)
vae = VariationalAutoencoder(784, [200, 100])
vae.fit(X)
# plot reconstruction
done = False
while not done:
i = np.random.choice(len(X))
x = X[i]
im = vae.posterior_predictive_sample([x]).reshape(28, 28)
plt.subplot(1,2,1)
plt.imshow(x.reshape(28, 28), cmap='gray')
plt.title("Original")
plt.subplot(1,2,2)
plt.imshow(im, cmap='gray')
plt.title("Sampled")
plt.show()
ans = input("Generate another?")
if ans and ans[0] in ('n', 'N'):
done = True
# plot output from random samples in latent space
done = False
while not done:
im, probs = vae.prior_predictive_sample_with_probs()
im = im.reshape(28, 28)
probs = probs.reshape(28, 28)
plt.subplot(1,2,1)
plt.imshow(im, cmap='gray')
plt.title("Prior predictive sample")
plt.subplot(1,2,2)
plt.imshow(probs, cmap='gray')
plt.title("Prior predictive probs")
plt.show()
ans = input("Generate another?")
if ans and ans[0] in ('n', 'N'):
done = True
if __name__ == '__main__':
main()
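The transform and prior_predictive_with_input methods are defined above but never exercised in main(). As a rough sketch of how they could be used (assuming vae, X and Y from main are in scope, and the latent size is 100 as in the [200, 100] configuration):

# encode the data to latent means and scatter the first two latent dimensions
Z = vae.transform(X)
plt.scatter(Z[:, 0], Z[:, 1], c=Y, s=5, alpha=0.5)
plt.show()

# decode an arbitrary latent vector (here the 100-dimensional zero vector)
probs = vae.prior_predictive_with_input(np.zeros((1, 100), dtype=np.float32))
plt.imshow(probs.reshape(28, 28), cmap='gray')
plt.show()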