2. Boostrapping and permutation tests

2.1. Boostrapping

[1]:
import numpy as np
import pandas as pd
from scipy import stats
import scipy
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from tqdm import tqdm
from multiprocessing import Pool
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
import itertools
%matplotlib inline
[2]:
try:
    from torch import get_num_threads
    cpu_count = get_num_threads()
except ImportError:
    import os
    cpu_count = os.cpu_count()
[3]:
true_dist = stats.norm(.3, 1.3)
true_mean = true_dist.mean()
[4]:
# Get a 90% confidence interval for the mean using the central limit theorem

sample = true_dist.rvs(900)
mu_hat = sample.mean()
sigma_hat = sample.std() / np.sqrt(len(sample))
ci_interval_lower = stats.norm(mu_hat, sigma_hat).ppf(0.05)
ci_interval_upper = stats.norm(mu_hat, sigma_hat).ppf(0.95)
ci_interval_lower, ci_interval_upper
[4]:
(0.2994930692394472, 0.44398437591947654)
[5]:
# Get a 90% confidence interval for the mean using boostrap

sample = true_dist.rvs(900)
bootstrap_means = []
for _ in range(200):
    b_sample = np.random.choice(sample, len(sample))
    bootstrap_means.append(b_sample.mean())
bootstrap_means = np.sort(bootstrap_means)
ci_interval_lower = bootstrap_means[9]
ci_interval_upper = bootstrap_means[-10]
ci_interval_lower, ci_interval_upper
[5]:
(0.2612161649512032, 0.40027678090733715)
[6]:
# Check whether a 90% CI using CLT contains the true value 90% of the time (i.e. on 90% of experiments)

interval_contained_the_true_value = []
for _ in range(1000):
    sample = true_dist.rvs(900)

    mu_hat = sample.mean()
    sigma_hat = sample.std() / np.sqrt(len(sample))

    ci_interval_lower = stats.norm(mu_hat, sigma_hat).ppf(0.05)
    ci_interval_upper = stats.norm(mu_hat, sigma_hat).ppf(0.95)

    contained = ci_interval_lower <= true_mean and true_mean <= ci_interval_upper
    interval_contained_the_true_value.append(contained)

np.mean(interval_contained_the_true_value)
[6]:
0.904
[7]:
# Check whether a 90% CI using boostrap contains the true value 90% of the time (i.e. on 90% of experiments)

interval_contained_the_true_value = []
for _ in range(1000):
    sample = true_dist.rvs(900)

    bootstrap_means = []
    for _ in range(200):
        b_sample = np.random.choice(sample, len(sample))
        bootstrap_means.append(b_sample.mean())
    bootstrap_means = np.sort(bootstrap_means)

    ci_interval_lower = bootstrap_means[9]
    ci_interval_upper = bootstrap_means[-10]

    contained = ci_interval_lower <= true_mean and true_mean <= ci_interval_upper
    interval_contained_the_true_value.append(contained)

np.mean(interval_contained_the_true_value)
[7]:
0.892

2.1.1. Boostrapping on a logistic regression

[8]:
def data_generator():
    x = np.random.random((1000, 4))
    beta = stats.norm().rvs((4,1))
    beta[[0,-1]] = 0
    y = np.matmul(x, beta) + stats.norm().rvs((1000, 1))
    y = scipy.special.expit(y)
    y = np.random.random(y.shape) < y
    return x,y,beta
x,y,_ = data_generator()
[9]:
# Obtain bootstrap samples of the estimated coefficients

def bootstrap_coefs_samples_generator(x,y):
    bootstrap_coefs = []
    for _ in range(200):
        resampling_idx = np.random.choice(len(y), len(y))
        bts_y = y[resampling_idx]
        bts_x = x[resampling_idx]
        bts_est = LogisticRegression(solver='saga').fit(bts_x, bts_y.ravel())
        bootstrap_coefs.append(bts_est.coef_)
    return np.vstack(bootstrap_coefs)

bootstrap_coefs = bootstrap_coefs_samples_generator(x,y)
bootstrap_coefs.shape # (n_boostrap_sample, n_features)
[9]:
(200, 4)
[10]:
# Just as before, obtain bootstrap samples of the estimated coefficients
# but now using multiprocessing

def func(i):
    np.random.seed(i)
    resampling_idx = np.random.choice(len(y), len(y))
    bts_y = y[resampling_idx].copy()
    bts_x = x[resampling_idx].copy()
    bts_est = LogisticRegression(solver='saga').fit(bts_x, bts_y.ravel())
    return bts_est.coef_

def bootstrap_coefs_samples_generator(x,y):
    bootstrap_coefs = []

    bootstrap_coefs = Pool(cpu_count).map(func, range(200))
    return np.vstack(bootstrap_coefs)

bootstrap_coefs = bootstrap_coefs_samples_generator(x,y)
bootstrap_coefs.shape # (n_boostrap_sample, n_features)
[10]:
(200, 4)
[11]:
# Obtain CI 90% interval of the estimated coefficients using bootstrap

def obtain_ci_90(bootstrap_coefs):
    def extract_quantiles_5_and_95(x):
        p1 = 5*len(x)//100-1
        p2 = 95*len(x)//100
        return np.sort(x)[[p1,p2]]

    return np.apply_along_axis(extract_quantiles_5_and_95, 1, bootstrap_coefs.T)

obtain_ci_90(bootstrap_coefs)
[11]:
array([[-0.45130088,  0.19511093],
       [ 0.80341797,  1.54924732],
       [-0.83550332, -0.1421124 ],
       [-0.15388632,  0.56060763]])
[12]:
# Obtain the individual p-values of the coeficients (i.e.: testing H0: beta_i = 0)

def obtain_pvalues(bootstrap_coefs):
    def obtain_the_quantile_of_point(x, h0_point=0):
        pvalue1 = np.sum(np.sort(x)>=h0_point) / len(x)
        pvalue2 = np.sum(np.sort(x)<=h0_point) / len(x)
        return min(pvalue1, pvalue2) * 2

    return np.apply_along_axis(obtain_the_quantile_of_point, 1, bootstrap_coefs.T)

obtain_pvalues(bootstrap_coefs)
[12]:
array([0.58, 0.  , 0.05, 0.26])
[13]:
# Checking wheather our 90% CI contains the true value 90% of the time (i.e. on 90% of experiments)

ci_contains_that_coef = []
for _ in tqdm(range(1000)):
    x, y, betas = data_generator()
    bootstrap_coefs = bootstrap_coefs_samples_generator(x,y)
    intervs = obtain_ci_90(bootstrap_coefs)
    is_contained = [(beta <= intervs[i,1] and beta >= intervs[i,0]).item() for i,beta in enumerate(betas)]
    ci_contains_that_coef.append(is_contained)

np.array(ci_contains_that_coef).mean(0)
100%|██████████| 1000/1000 [02:55<00:00,  5.70it/s]
[13]:
array([0.908, 0.763, 0.771, 0.892])
[14]:
# Obtaining the a sample of the distribution of our pvalues

pvalues = []
for _ in tqdm(range(1000)):
    x, y, betas = data_generator()
    bootstrap_coefs = bootstrap_coefs_samples_generator(x,y)
    pvalues.append(obtain_pvalues(bootstrap_coefs))

pvalues = np.vstack(pvalues)
100%|██████████| 1000/1000 [03:07<00:00,  5.33it/s]
[15]:
def ecdf_plot(x, ax, *args, **kwargs):
    xc = np.concatenate(([0], np.sort(x), [1]))
    y = np.linspace(0, 1, len(x) + 1)
    yc = np.concatenate(([0], y))
    ax.step(xc, yc, *args, **kwargs)

_, axes = plt.subplots(2, 2, figsize=(14,13))

for i, ax in enumerate(axes.flatten()):
    ecdf_plot(pvalues[:,i], ax, color='green')
    ax.plot(np.linspace(0,1), np.linspace(0,1), color='red')
../../_images/sections_bootstrapping_and_permutation_tests_17_0.png

2.2. Permutation tests

Here we check compare the performance of the estimator y given x with the performance y given x_permuted, with following procedure:

  • Split the dataset (y, x) into (y_train, x_train), (y_val, x_val)

  • Fit classifier to (y_train, x_train) and get its score on (y_val, x_val)

  • Store the score on score_unpermutted

  • For feature f in features:

    • For i in range(n_permutations):

      • Set x_train_p as x_train with feature f permutted, do the same for x_val

      • Fit classifier to (y_train, x_train_p) and get its score on (y_val, x_val_p)

      • Store the score on scores_permutted[i, f]

The pvalue for feature f is given by:

pvalue = score_unpermuted <= scores_permuted[:, f]
pvalue = (pvalue.sum() + 1) / (len(scores_permuted[:, f])+1)

See https://arxiv.org/pdf/1906.09735 for details

[16]:
def nl_data_generator(size=1000):
    x = np.random.random((size, 4))
    beta = stats.norm().rvs((4,1))
    beta[[0,-1]] = 0
    y = np.matmul(np.cos(x), beta) + stats.norm(0,.1).rvs((size, 1))
    y = y > y.mean()
    return x,y

x, y = nl_data_generator()
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)
del x, y
[17]:
def func(args):
    i, x_to_permute = args
    np.random.seed(i + x_to_permute*1000)

    # permute x_train for feature x_to_permute
    perm_idx = np.random.choice(len(x_train), len(x_train), False)
    x_train_p = x_train.copy()
    x_train_p[:, x_to_permute] = x_train_p[perm_idx, x_to_permute]

    bts_est = ExtraTreesClassifier()
    bts_est.fit(x_train_p, y_train.ravel())

    # permute x_val for feature x_to_permute
    perm_idx = np.random.choice(len(x_val), len(x_val), False)
    x_val_p = x_val.copy()
    x_val_p[:, x_to_permute] = x_val_p[perm_idx, x_to_permute]

    score = bts_est.score(x_val_p, y_val)
    return score

bts_est = ExtraTreesClassifier()
bts_est.fit(x_train, y_train.ravel())
score_unpermuted = bts_est.score(x_val, y_val)

def get_scores_of_permuted_features():
    scores_permuted = []
    for x_to_permute in range(x_train.shape[1]):
        score = Pool(cpu_count).map(func, itertools.product(range(99), [x_to_permute]))
        scores_permuted.append(score)
    return np.array(scores_permuted).T

scores_permuted = get_scores_of_permuted_features()
scores_permuted.shape # (n_boostrap_sample, n_features)
[17]:
(99, 4)
[18]:
def get_pvalues_from_scores(score_unpermuted, scores_permuted):
    pvalues = score_unpermuted <= scores_permuted
    pvalues = (pvalues.sum(0) + 1) / (len(scores_permuted)+1)
    return pvalues

get_pvalues_from_scores(score_unpermuted, scores_permuted)
[18]:
array([0.14, 0.01, 0.01, 0.2 ])
[19]:
# Checking the distribution of our pvalues

pvalues = []
for _ in tqdm(range(100)):
    x, y = nl_data_generator()
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)
    del x, y

    bts_est = ExtraTreesClassifier()
    bts_est.fit(x_train, y_train.ravel())
    score_unpermuted = bts_est.score(x_val, y_val)

    scores_permuted = get_scores_of_permuted_features()
    pvalues_of_this_experiment = get_pvalues_from_scores(score_unpermuted, scores_permuted)

    pvalues.append(pvalues_of_this_experiment)

pvalues = np.vstack(pvalues)
100%|██████████| 100/100 [25:52<00:00, 15.52s/it]
[20]:
pvalues = np.vstack(pvalues)
[21]:
_, axes = plt.subplots(2, 2, figsize=(14,13))

for i, ax in enumerate(axes.flatten()):
    ecdf_plot(pvalues[:,i], ax, color='green')
    ax.plot(np.linspace(0,1), np.linspace(0,1), color='red')
../../_images/sections_bootstrapping_and_permutation_tests_24_0.png
[22]:
# Proportional of experiments where the pvalue is lower or equal to than 10%
# i.e.: how often each feature had pvalue lower or equal to 10%
np.apply_along_axis(lambda x: x in x<=0.1, 0, pvalues).mean(0)
[22]:
array([0.1 , 0.7 , 0.63, 0.05])
[23]:
# How often each feature had the lowest pvalue
np.apply_along_axis(lambda x: x in x==0, 1, np.argsort(pvalues)).mean(0)
[23]:
array([0.02, 0.19, 0.45, 0.34])