Learning Day 37: Implementing a Variational Autoencoder in PyTorch

De Jun Huang · May 22, 2021

Building on top of the vanilla autoencoder from Day 36

  1. Modify the model script
  • Modify the forward function between the encoder and decoder to compute the additional loss term, the KL divergence.
  • __init__ needs a small modification for the decoder part

  2. Modify the main script
  • Include the additional loss term in the total loss
  • Make minor changes to unpack the new values returned by the model script

1. Model script changes

forward function

def forward(self, x):
    batch_size = x.shape[0]

    # flatten
    x = x.reshape(batch_size, 784)

    # encoder
    # [b, 20], consisting of 10 mu and 10 sigma
    h_ = self.encoder(x)

    # [b, 20] => [b, 10] (mu) and [b, 10] (sigma)
    mu, sigma = h_.chunk(2, dim=1)

    # reparameterization trick
    h = mu + sigma * torch.randn_like(sigma)

    # kl divergence
    kld = 0.5 * torch.sum(mu ** 2 + sigma ** 2 - torch.log(1e-8 + sigma ** 2) - 1) / np.prod(x.shape)

    # decoder
    x = self.decoder(h)

    # reshape
    x = x.reshape(batch_size, 1, 28, 28)

    return x, kld

Explanations:

  • The encoder output, saved as h_, has shape [b, 20]. It is no longer the latent code itself; it consists of a [b, 10] mean and a [b, 10] standard deviation that parameterize the distribution of the latent code.
h_ = self.encoder(x)
mu, sigma = h_.chunk(2, dim=1)
  • The new latent code h is sampled with the reparameterization trick, which keeps the sampling step differentiable with respect to mu and sigma (h now has size 10 instead of 20)
h = mu + sigma * torch.randn_like(sigma)
  • The additional loss term is the KL divergence between the latent distribution N(mu, sigma²) produced by the encoder and the standard normal prior N(0, I)
  • Both distributions are Gaussian, so the KL divergence has a closed form (written out after this list)
  • The objective is to keep the learned latent distribution close to the prior, so that the latent space stays well behaved
  • The final value is divided by np.prod(x.shape). At this point x has already been flattened to [b, 784], so this divides the summed KLD by the total number of pixel values in the batch. Without this scaling, the KLD would be much larger than the reconstruction loss and the model could not learn properly.
kld = 0.5 * torch.sum(mu ** 2 + sigma ** 2 - torch.log(1e-8 + sigma ** 2) - 1) / np.prod(x.shape)
  • Besides the reconstructed x, return kld at the end of the forward function
return x, kld
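For reference, since both distributions are diagonal Gaussians, the expression inside torch.sum is the standard closed-form KL divergence against the N(0, I) prior (the 1e-8 inside the log is only there for numerical stability, and the extra division by np.prod(x.shape) in the code just rescales this sum, as noted above):

D_{\mathrm{KL}}\big(\mathcal{N}(\mu, \sigma^{2})\,\|\,\mathcal{N}(0, I)\big) = \frac{1}{2}\sum_{j=1}^{10}\left(\mu_j^{2} + \sigma_j^{2} - \log \sigma_j^{2} - 1\right)

As a quick sanity check (not from the original post; the mu and sigma values below are made up), the hand-written expression matches PyTorch's built-in KL:

import torch
from torch.distributions import Normal, kl_divergence

# made-up mu/sigma with the same shapes as in the forward pass
mu = torch.randn(32, 10)
sigma = torch.rand(32, 10) + 0.1  # keep sigma strictly positive

manual = 0.5 * torch.sum(mu ** 2 + sigma ** 2 - torch.log(1e-8 + sigma ** 2) - 1)
builtin = kl_divergence(Normal(mu, sigma), Normal(torch.zeros_like(mu), torch.ones_like(sigma))).sum()

print(manual.item(), builtin.item())  # should agree up to the 1e-8 stabiliser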

__init__ function: decoder

  • Since the new latent code has size 10 instead of 20 (the [b, 20] encoder output is split into a [b, 10] mean and a [b, 10] sigma), the decoder's first input dimension changes from 20 to 10
  • No other changes are needed (a small sampling sketch follows the code block below)
self.decoder = nn.Sequential(
    nn.Linear(10, 64),
    nn.ReLU(),
    nn.Linear(64, 256),
    nn.ReLU(),
    nn.Linear(256, 784),
    nn.Sigmoid()
)
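As a side note (not part of the original post), this 10-dimensional decoder input is also what makes generation straightforward once training is done: draw z from the standard normal prior and run it through the decoder alone. A minimal sketch, assuming model is a trained instance of the VAE class defined in this post:

import torch

with torch.no_grad():
    z = torch.randn(8, 10)                   # 8 latent codes drawn from N(0, I)
    samples = model.decoder(z)               # [8, 784]
    samples = samples.reshape(8, 1, 28, 28)  # back to image shape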

2. Main script changes

Inside the epoch loop

for epoch in range(1000):

    for batch_idx, (x, _) in enumerate(mnist_train):

        # [b, 1, 28, 28]
        x = x.to(device)

        x_hat, kld = model(x)
        loss = criterion(x_hat, x)

        if kld is not None:
            elbo = -loss - 1.0 * kld
            loss = -elbo

        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(epoch, 'loss', loss.item(), kld.item())

    x, _ = next(iter(mnist_test))
    x = x.to(device)

    with torch.no_grad():
        x_hat, _ = model(x)

    viz.images(x, nrow=8, win='x', opts=dict(title='x'))
    viz.images(x_hat, nrow=8, win='x_hat', opts=dict(title='x_hat'))

Explanations:

  • Since kld is now returned from the model, unpack it together with the reconstruction
x_hat, kld = model(x)
  • Include kld in the loss term (an equivalent formulation is sketched after this list)
if kld is not None:
    elbo = -loss - 1.0 * kld
    loss = -elbo
  • In the test step the model output must also be unpacked into two values, even though kld is not used there; otherwise the step fails
with torch.no_grad():
    x_hat, _ = model(x)
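The two lines above simply spell out the negative ELBO. An equivalent way to build the same total loss (same behaviour, just rewritten), which may read more directly:

# equivalent to: elbo = -loss - 1.0 * kld; loss = -elbo
recon_loss = criterion(x_hat, x)
loss = recon_loss + 1.0 * kld  # negative ELBO: reconstruction loss + weighted KL term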

Complete scripts

Model script (vae.py)

import torch
from torch import nn
import numpy as np


class VAE(nn.Module):

    def __init__(self):
        super(VAE, self).__init__()

        # [b, 784] => [b, 20]
        # mu: [b, 10]
        # sigma: [b, 10]
        self.encoder = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 20),
            nn.ReLU()
        )

        # [b, 10] => [b, 784]
        self.decoder = nn.Sequential(
            nn.Linear(10, 64),
            nn.ReLU(),
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, 784),
            nn.Sigmoid()
        )

    def forward(self, x):
        batch_size = x.shape[0]

        # flatten
        x = x.reshape(batch_size, 784)

        # encoder
        # [b, 20], consisting of 10 mu and 10 sigma
        h_ = self.encoder(x)

        # [b, 20] => [b, 10] (mu) and [b, 10] (sigma)
        mu, sigma = h_.chunk(2, dim=1)

        # reparameterization trick
        h = mu + sigma * torch.randn_like(sigma)

        # kl divergence against the N(0, I) prior, scaled by the number of pixel values
        kld = 0.5 * torch.sum(mu ** 2 + sigma ** 2 - torch.log(1e-8 + sigma ** 2) - 1) / np.prod(x.shape)

        # decoder
        x = self.decoder(h)

        # reshape
        x = x.reshape(batch_size, 1, 28, 28)

        return x, kld
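A quick way to exercise the model on its own (not in the original post) before wiring it into the training loop; the random tensor below just stands in for an MNIST batch:

import torch
from vae import VAE

model = VAE()
x = torch.rand(4, 1, 28, 28)    # stand-in for a batch of MNIST images
x_hat, kld = model(x)
print(x_hat.shape, kld.item())  # torch.Size([4, 1, 28, 28]) and a scalar KLD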

Main script (main.py)

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from vae import VAE
import visdom


def main():
    mnist_train = datasets.MNIST('mnist', train=True, transform=transforms.Compose([
        transforms.ToTensor(),
    ]), download=True)
    mnist_train = DataLoader(mnist_train, batch_size=32, shuffle=True)

    mnist_test = datasets.MNIST('mnist', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
    ]), download=True)
    mnist_test = DataLoader(mnist_test, batch_size=32, shuffle=True)

    x, _ = next(iter(mnist_train))
    print(x.shape)

    device = torch.device('cuda')
    model = VAE().to(device)
    criterion = nn.MSELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    print(model)

    viz = visdom.Visdom()

    for epoch in range(1000):

        for batch_idx, (x, _) in enumerate(mnist_train):

            # [b, 1, 28, 28]
            x = x.to(device)

            x_hat, kld = model(x)
            loss = criterion(x_hat, x)

            if kld is not None:
                elbo = -loss - 1.0 * kld
                loss = -elbo

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(epoch, 'loss', loss.item(), kld.item())

        x, _ = next(iter(mnist_test))
        x = x.to(device)

        with torch.no_grad():
            x_hat, _ = model(x)

        viz.images(x, nrow=8, win='x', opts=dict(title='x'))
        viz.images(x_hat, nrow=8, win='x_hat', opts=dict(title='x_hat'))


if __name__ == '__main__':
    main()
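Two practical notes on running main.py (not mentioned in the original post): the script hard-codes device = torch.device('cuda'), so it expects a GPU to be available, and the Visdom windows only appear if a Visdom server is already running in another terminal (started with python -m visdom.server) before the script is launched.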

Results after 176 epochs

The left panel shows the original images and the right shows the reconstructions. The model does well on the more distinct digits but underperforms on complicated ones such as 8.

Output: 176 loss 0.047659844160079956 0.007469547912478447

  • The kld (the second number printed) stays a small fraction of the total loss; here it is roughly 0.0075 out of 0.0477, about 16%

