16. Bayesian inference, Pyro, PyStan and VAEs

In this section, we give some examples of how to work with variational autoencoders and Bayesian inference using Pyro and PyStan.

Take a look at the VAE presentation for some theoretical background on these methods.

This tutorial is meant to run on an NVIDIA CUDA-capable GPU. If you don’t have a GPU installed in your computer, you can download this Jupyter notebook and upload it to Google Colab.
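
Before running the GPU-dependent cells, you can quickly check whether PyTorch actually sees a CUDA device:

import torch
print(torch.cuda.is_available())  # True if a CUDA-capable GPU and driver are visible to PyTorch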

[1]:
# Uncomment the line below to install pystan and pyro
#!pip install pyro-ppl pystan
[2]:
import pyro
import pyro.distributions as dist
from pyro.infer.mcmc import MCMC, HMC, NUTS
from pyro.optim import Adam
from pyro.infer import SVI, Trace_ELBO
import torch.distributions.constraints as constraints

import pystan

from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV


import numpy as np
import scipy.stats as stats
import pandas as pd

import matplotlib.pyplot as plt

import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
[3]:
# Setup some data: n Bernoulli draws with true success probability theta = 0.6
theta = 0.6
n = 1000
y = stats.bernoulli.rvs(theta, size=n)
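
Since the Beta(0.5, 0.5) prior used in the models below is conjugate to the Bernoulli likelihood, the exact posterior is available in closed form; it is a handy sanity check for the MCMC and variational approximations that follow:

# Exact posterior under a Beta(0.5, 0.5) prior: Beta(0.5 + sum(y), 0.5 + n - sum(y))
exact_posterior = stats.beta(0.5 + y.sum(), 0.5 + n - y.sum())
print("Exact posterior mean:", exact_posterior.mean())
print("Exact posterior standard deviation:", exact_posterior.std())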

16.1. Get MCMC samples for this model using Stan

[4]:
#Compile model

model_code = """
data {
    int<lower=0> n;
    int y[n];
}
parameters {
    real<lower=0, upper=1> theta;
}
model {
    // likelihood:
    y ~ bernoulli(theta);

    // prior:
    theta ~ beta(0.5, 0.5);
}
"""

sm = pystan.StanModel(model_code=model_code)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_cbff015f4398d041dc3dc90aad40531c NOW.
[5]:
# Sample model
data_dict = {'y': y, 'n': n}
fit = sm.sampling(data=data_dict, iter=1000, chains=4)
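
Printing the fit object gives a per-parameter summary (posterior mean, quantiles, effective sample size and the Rhat statistic), which is a quick way to check that the chains mixed well:

# Quick convergence check: Rhat close to 1 and a large n_eff indicate good mixing
print(fit)
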
[6]:
# Extract samples
theta = fit.extract(permuted=True)['theta']

# Print some statistics
print("Some samples:", theta[:10])
print("Mean:", np.mean(theta, axis=0))
print("Standard deviation:", np.std(theta, axis=0))

# Prepare plots
_, ax = plt.subplots(2, 2)

# histograms
# warning: for a caveat about using histogram see
# https://stats.stackexchange.com/a/51753
ax[0, 0].hist(theta, 15)
ax[0, 1].hist(theta, 30)

# Empirical cumulative distribution
ecdf = ECDF(theta)
ax[1, 0].plot(ecdf.x, ecdf.y)

# Density estimation using KDE (with tuning parameter chosen by 3 fold CV)
params_for_kde_cv = {'bandwidth': np.logspace(-2, 3, 10)}
grid = GridSearchCV(KernelDensity(), params_for_kde_cv, cv=3)
grid.fit(theta.reshape(-1, 1))
x_kde = np.linspace(0, 1, 10000).reshape(-1, 1)
y_kde = np.exp(grid.best_estimator_.score_samples(x_kde))
ax[1, 1].plot(x_kde, y_kde)
Some samples: [0.59673709 0.65125153 0.59704444 0.59649746 0.60513172 0.61008555
 0.59284969 0.61128971 0.60243952 0.59258868]
Mean: 0.6080634100472536
Standard deviation: 0.015205897937997375
[6]:
[<matplotlib.lines.Line2D at 0x7f33b195d4e0>]
[Figure: histograms (15 and 30 bins), empirical CDF, and KDE density estimate of the Stan posterior samples of theta]

16.2. Get MCMC samples for this model using Pyro

[7]:
y_tensor = torch.as_tensor(y, dtype=torch.float32)

def model(y_tensor):
    prior_dist = dist.Beta(torch.Tensor([.5]), torch.Tensor([.5]))
    theta = pyro.sample('theta', prior_dist)
    with pyro.plate('observe_data'):
        pyro.sample('obs', dist.Bernoulli(theta), obs=y_tensor)

nuts_kernel = NUTS(model, adapt_step_size=True)
mcmc_run = MCMC(nuts_kernel, num_samples=500, warmup_steps=300).run(y_tensor)
posterior = mcmc_run.marginal('theta').empirical['theta']
Sample: 100%|██████████| 800/800 [00:04<00:00, 162.75it/s, step size=9.10e-01, acc. rate=0.892]
[8]:
print("Some samples:", posterior.sample((10,)).reshape(-1))
print("Mean:", posterior.mean)
print("Standard deviation:", posterior.stddev)
Some samples: tensor([0.6201, 0.5971, 0.5829, 0.6080, 0.5949, 0.5982, 0.5949, 0.6106, 0.5870,
        0.6037])
Mean: tensor([0.6059])
Standard deviation: tensor([0.0159])
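
The MCMC estimates agree closely with the closed-form posterior; a quick comparison (assuming exact_posterior from the conjugacy sketch above):

# Compare the Pyro MCMC posterior with the exact conjugate posterior
print("Exact mean / std:", exact_posterior.mean(), exact_posterior.std())
print("MCMC mean / std:", posterior.mean.item(), posterior.stddev.item())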

16.3. Get replications (new instances similar to the data) from the MCMC samples

[9]:
no_replications = 10000

posterior_samples = posterior.sample((no_replications,))
posterior_samples = posterior_samples.numpy().reshape(-1)

replications = stats.bernoulli.rvs(posterior_samples)
bins = np.arange(0, replications.max() + 1.5) - 0.5
_, ax = plt.subplots()
ax.hist(replications, bins)
ax.set_xticks(bins + 0.5)
[9]:
[<matplotlib.axis.XTick at 0x7f33ac49aba8>,
 <matplotlib.axis.XTick at 0x7f33ac49a4e0>,
 <matplotlib.axis.XTick at 0x7f33ac49a390>]
[Figure: histogram of the Bernoulli replications]
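
Since each replication is a Bernoulli draw with success probability sampled from the posterior, the fraction of ones should be close to the posterior mean of theta (about 0.6):

# Posterior predictive check: P(y_rep = 1) = E[theta | y]
print("Fraction of ones in the replications:", replications.mean())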

16.4. Get approximate Bayesian inference with Pyro and stochastic variational inference

[10]:
def model(y_tensor):
    prior_dist = dist.Beta(torch.Tensor([.5]), torch.Tensor([.5]))
    theta = pyro.sample('theta', prior_dist)
    with pyro.plate('observe_data'):
        pyro.sample('obs', dist.Bernoulli(theta), obs=y_tensor)

def guide(y_tensor):
    alpha = pyro.param("alpha", torch.Tensor([1.0]),
                       constraint=constraints.positive)
    beta = pyro.param("beta", torch.Tensor([1.0]),
                       constraint=constraints.positive)
    theta = pyro.sample('theta', dist.Beta(alpha, beta))
[11]:
# set up the optimizer
pyro.clear_param_store()
adam_params = {"lr": 0.2, "betas": (0.90, 0.999)}
optimizer = Adam(adam_params)

# setup the inference algorithm
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

n_steps = 100
# do gradient steps
for step in range(n_steps):
    svi.step(y_tensor)
[12]:
alpha = pyro.param("alpha")
beta = pyro.param("beta")

inf_distribution = stats.beta(alpha.data.numpy(), beta.data.numpy())
print("Some samples:", inf_distribution.rvs(10))
print("Mean:", inf_distribution.mean())
print("Standard deviation:", inf_distribution.std())

_, axes = plt.subplots(2)

# Plot the posterior
x_svi = np.linspace(0, 1, 10000)
y_svi = inf_distribution.pdf(x_svi)
axes[0].plot(x_svi, y_svi)

# Plot replications (sampling theta from the fitted variational Beta
# posterior rather than from the MCMC samples of the previous section)
posterior_samples = inf_distribution.rvs(no_replications)

replications = stats.bernoulli.rvs(posterior_samples)
bins = np.arange(0, replications.max() + 1.5) - 0.5
axes[1].hist(replications, bins)
axes[1].set_xticks(bins + 0.5)
Some samples: [0.45259923 0.50321583 0.58682453 0.58075843 0.44931109 0.46521427
 0.43809253 0.741608   0.61864718 0.59422454]
Mean: [0.56469017]
Standard deviation: [0.11815722]
[12]:
[<matplotlib.axis.XTick at 0x7f33ac428438>,
 <matplotlib.axis.XTick at 0x7f33ac421d30>,
 <matplotlib.axis.XTick at 0x7f33ac421a58>]
[Figure: density of the fitted variational Beta posterior (top) and histogram of the Bernoulli replications (bottom)]
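
With only 100 steps the variational fit above is still quite loose (its standard deviation is much larger than the MCMC estimate). One way to judge convergence is to track the loss returned by svi.step; a minimal sketch:

# Track the loss (an estimate of the negative ELBO) over additional SVI steps;
# a flattening curve suggests the variational parameters have converged
elbo_trace = []
for step in range(1000):
    elbo_trace.append(svi.step(y_tensor))
plt.plot(elbo_trace)
plt.xlabel("step")
plt.ylabel("loss (negative ELBO)")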

16.5. Using GPU and data subsampling with Pyro

[13]:
# Setup some data for another model
mu = -0.6
sigma = 1.8

n2 = 10000
y2 = stats.norm.rvs(mu, sigma, size=n2)
y2_tensor = torch.as_tensor(y2, dtype=torch.float32).cuda()
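
If you want the same code to run on a machine without a GPU, you can select the device dynamically instead of calling .cuda() unconditionally (a sketch, assuming the remaining cells are adapted to use this device):

# Fall back to the CPU when CUDA is not available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
y2_tensor = torch.as_tensor(y2, dtype=torch.float32).to(device)
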
[14]:
def model(y2_tensor):
    # Priors:
    prior_dist_mu = dist.Normal(torch.Tensor([0.]).cuda(),
                                torch.Tensor([1.]).cuda())
    mu = pyro.sample('mu', prior_dist_mu)

    prior_dist_sigma = dist.Gamma(torch.Tensor([1.]).cuda(),
                                  torch.Tensor([1.]).cuda())
    sigma = pyro.sample('sigma', prior_dist_sigma)

    # Likelihood:
    with pyro.plate('observe_data', size=len(y2_tensor),
        subsample_size=5000, use_cuda=True) as ind:
        pyro.sample('obs', dist.Normal(mu, sigma),
            obs=y2_tensor.index_select(0, ind))


def guide(y2_tensor):
    alpha_mu = pyro.param("alpha_mu", torch.Tensor([0.0]).cuda())
    beta_mu = pyro.param("beta_mu", torch.Tensor([3.0]).cuda(),
        constraint=constraints.positive)
    mu = pyro.sample('mu', dist.Normal(alpha_mu, beta_mu))

    alpha_sigma = pyro.param("alpha_sigma", torch.Tensor([1.0]).cuda(),
        constraint=constraints.positive)
    beta_sigma = pyro.param("beta_sigma", torch.Tensor([1.0]).cuda(),
        constraint=constraints.positive)
    sigma = pyro.sample('sigma', dist.Gamma(alpha_sigma, beta_sigma))
[15]:
# set up the optimizer
pyro.clear_param_store()
adam_params = {"lr": 0.2, "betas": (0.90, 0.999)}
optimizer = Adam(adam_params)

# setup the inference algorithm
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

n_steps = 10
# do gradient steps
for step in range(n_steps):
    svi.step(y2_tensor)
[16]:
# Generate replications

alpha_mu = pyro.param("alpha_mu")
beta_mu = pyro.param("beta_mu")
alpha_sigma = pyro.param("alpha_sigma")
beta_sigma = pyro.param("beta_sigma")

# Note: scipy's gamma is parameterized by shape and scale, where scale is the
# inverse of the rate used by Pyro's Gamma distribution
mu_distribution = stats.norm(alpha_mu.item(), beta_mu.item())
sigma_distribution = stats.gamma(alpha_sigma.item(), scale=1/beta_sigma.item())

mu_samples = mu_distribution.rvs(no_replications)
sigma_samples = sigma_distribution.rvs(no_replications)

data_replications = stats.norm(mu_samples, sigma_samples).rvs()

# Density estimation using KDE (with tuning parameter chosen by 3 fold CV)
params_for_kde_cv = {'bandwidth': np.logspace(-2, 3, 10)}
grid = GridSearchCV(KernelDensity(), params_for_kde_cv, cv=3)
grid.fit(data_replications.reshape(-1, 1))
x_kde = np.linspace(-20, 20, 10000).reshape(-1, 1)
y_kde = np.exp(grid.best_estimator_.score_samples(x_kde))
plt.plot(x_kde, y_kde)
[16]:
[<matplotlib.lines.Line2D at 0x7f33ac1bb630>]
[Figure: KDE density estimate of the replicated data]
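
A quick comparison of the fitted variational parameters with the true values used to generate the data (mu = -0.6, sigma = 1.8):

# The guide's posterior mean of mu is alpha_mu; the posterior mean of sigma is
# alpha_sigma / beta_sigma (mean of a Gamma(alpha, rate=beta) distribution)
print("Posterior mean of mu:", alpha_mu.item(), "(true value: -0.6)")
print("Posterior mean of sigma:", (alpha_sigma / beta_sigma).item(), "(true value: 1.8)")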

16.6. Variational autoencoders

[17]:
# define the PyTorch module that parameterizes the
# diagonal gaussian distribution q(z|x)
class Encoder(nn.Module):
    def __init__(self, z_dim, hidden_dim, input_dim):
        super(Encoder, self).__init__()
        # setup the three linear transformations used
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, z_dim)
        self.fc22 = nn.Linear(hidden_dim, z_dim)
        # setup the non-linearities
        self.softplus = nn.Softplus()

    def forward(self, x):
        # compute the hidden units
        hidden = self.softplus(self.fc1(x))
        # return a mean vector and a (positive) scale vector,
        # each of size batch_size x z_dim
        z_loc = self.fc21(hidden)
        z_scale = torch.exp(self.fc22(hidden))
        return z_loc, z_scale


# define the PyTorch module that parameterizes the
# observation likelihood p(x|z)
class Decoder(nn.Module):
    def __init__(self, z_dim, hidden_dim, input_dim):
        super(Decoder, self).__init__()
        # setup the three linear transformations used
        self.fc1 = nn.Linear(z_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, input_dim)
        self.fc22 = nn.Linear(hidden_dim, input_dim)
        # setup the non-linearities
        self.softplus = nn.Softplus()

    def forward(self, z):
        # define the forward computation on the latent z
        # first compute the hidden units
        hidden = self.softplus(self.fc1(z))

        mu = self.fc21(hidden)
        sigma = torch.exp(self.fc22(hidden))
        return mu, sigma


# define a PyTorch module for the VAE
class VAE(nn.Module):
    # by default our latent space is 50-dimensional
    # and we use 400 hidden units
    def __init__(self, input_dim,
        z_dim=50, hidden_dim=400, use_cuda=False):
        super(VAE, self).__init__()
        # create the encoder and decoder networks
        self.encoder = Encoder(z_dim, hidden_dim, input_dim=input_dim)
        self.decoder = Decoder(z_dim, hidden_dim, input_dim=input_dim)

        if use_cuda:
            # calling cuda() here will put all the parameters of
            # the encoder and decoder networks into gpu memory
            self.cuda()
        self.use_cuda = use_cuda
        self.z_dim = z_dim

    # define the model p(x|z)p(z)
    def model(self, x):
        # register PyTorch module `decoder` with Pyro
        pyro.module("decoder", self.decoder)
        with pyro.plate("data", x.shape[0]):
            # setup hyperparameters for prior p(z)
            z_loc = torch.zeros(x.shape[0], self.z_dim, dtype=x.dtype, device=x.device)
            z_scale = torch.ones(x.shape[0], self.z_dim, dtype=x.dtype, device=x.device)
            # sample from prior (value will be sampled by guide when computing the ELBO)
            z = pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))
            # decode the latent code z
            mu, sigma = self.decoder.forward(z)
            # score the Normal likelihood against the observed data
            pyro.sample("obs", dist.Normal(mu, sigma).to_event(1), obs=x)

    # define the guide (i.e. variational distribution) q(z|x)
    def guide(self, x):
        # register PyTorch module `encoder` with Pyro
        pyro.module("encoder", self.encoder)
        with pyro.plate("data", x.shape[0]):
            # use the encoder to get the parameters used to define q(z|x)
            z_loc, z_scale = self.encoder.forward(x)
            # sample the latent code z
            pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))

    # define a helper function for reconstructing data points
    def reconstruct_img(self, x):
        # encode the input x
        z_loc, z_scale = self.encoder(x)
        # sample in latent space
        z = dist.Normal(z_loc, z_scale).sample()
        # decode (note we don't sample in data space); this returns the
        # decoder's mean and scale
        return self.decoder(z)

    # generate brand new instances: sample z from the standard normal prior
    # and then sample x from the decoder's Normal observation model
    def new_instances(self, size=1):
        z = stats.norm.rvs(size=(size, self.z_dim))
        z = torch.as_tensor(z, dtype=torch.float32)
        if self.use_cuda:
            z = z.cuda()
        mu, sigma = self.decoder.forward(z)
        return stats.norm.rvs(mu.data.cpu().numpy(), sigma.data.cpu().numpy())
[18]:
# clear param store
pyro.clear_param_store()

no_instances = 20000
input_dim = 3
mu = stats.norm.rvs(size=input_dim)

# Generate a random positive definite covariance matrix: build a
# lower-triangular matrix L with positive diagonal and set sigma = L^T L
sigma = stats.norm.rvs(size=(input_dim, input_dim))
sigma[np.triu_indices(input_dim)] = 0
sigma += np.diag(np.abs(stats.norm.rvs(size=input_dim)))
sigma = np.matmul(sigma.transpose(), sigma)

dataset = stats.multivariate_normal.rvs(mu, sigma, size=no_instances)
dataset = torch.as_tensor(dataset, dtype=torch.float32)
dataset = TensorDataset(dataset)
train_loader = DataLoader(dataset, batch_size=1000, shuffle=True,
     num_workers=1, pin_memory=True, drop_last=False)

# setup the VAE
vae = VAE(use_cuda=True, input_dim=input_dim)

adam_args = {"lr": 0.001}
optimizer = Adam(adam_args)

# setup the inference algorithm
svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

train_elbo = []
for epoch in range(100):
    # initialize loss accumulator
    epoch_loss = 0.
    # do a training epoch over each mini-batch x returned
    # by the data loader
    for x, in train_loader:
        x = x.cuda()
        epoch_loss += svi.step(x)

    # report training diagnostics
    if not epoch % 10:
        normalizer_train = len(train_loader.dataset)
        total_epoch_loss_train = epoch_loss / normalizer_train
        train_elbo.append(total_epoch_loss_train)
        print("[epoch %03d]  average training loss: %.4f" %
             (epoch, total_epoch_loss_train))
[epoch 000]  average training loss: 15.6065
[epoch 010]  average training loss: 5.2432
[epoch 020]  average training loss: 4.3725
[epoch 030]  average training loss: 4.2777
[epoch 040]  average training loss: 4.3108
[epoch 050]  average training loss: 4.2928
[epoch 060]  average training loss: 4.2694
[epoch 070]  average training loss: 4.2631
[epoch 080]  average training loss: 4.2529
[epoch 090]  average training loss: 4.1238
[19]:
# Generating new instances (replications) from the trained VAE
new_instances = vae.new_instances(100000)

print("True means")
print(mu)
print("Empirical means of replications:")
print(new_instances.mean(0))

print("----------------------------------------")

print("True covariance matrix")
print(sigma)
print("Empirical covariance matrix of replications:")
print(np.cov(new_instances, rowvar=False))
True means
[-0.64582724  0.14164411 -2.05161795]
Empirical means of replications:
[-0.6678121   0.15514899 -1.99562418]
----------------------------------------
True covariance matrix
[[ 3.07215953 -0.42461851  0.84076918]
 [-0.42461851  0.29007804 -1.34326707]
 [ 0.84076918 -1.34326707  7.24112768]]
Empirical covariance matrix of replications:
[[ 3.02623676 -0.39646564  0.8401477 ]
 [-0.39646564  0.27476945 -1.27772535]
 [ 0.8401477  -1.27772535  6.96393829]]
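
As a final check, the trained VAE can also reconstruct individual data points: encode, sample a latent code, and decode back to the parameters of the observation model (a small sketch, assuming the vae and dataset objects trained above):

# Reconstruct a few training points with the trained VAE
x_batch = dataset.tensors[0][:5].cuda()
mu_rec, sigma_rec = vae.reconstruct_img(x_batch)
print("Original points:")
print(x_batch.cpu().numpy())
print("Reconstructed means:")
print(mu_rec.data.cpu().numpy())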