Practical Computer Simulations for Product Analysts

21 min read · Apr 30, 2024


Image by DALL-E 3

What is bootstrap?
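In a nutshell, bootstrap is resampling with replacement: we repeatedly draw samples of the same size from the observed dataset, compute the statistic of interest on each resample, and use the spread of those values as an approximation of the statistic's sampling distribution. It gives us confidence intervals for almost any metric without assuming a particular distribution. Here is a minimal sketch of the idea on synthetic data (the numbers below are illustrative, not from the article's dataset):

import numpy as np

rng = np.random.default_rng(42)
data = rng.exponential(scale=100, size=30)  # a small, skewed synthetic sample

# resample with replacement and recompute the mean many times
boot_means = [
    rng.choice(data, size=len(data), replace=True).mean()
    for _ in range(10_000)
]

# the 2.5th and 97.5th percentiles form a 95% confidence interval
ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])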

Working with observational data

Estimating the average

import numpy as np
from scipy.stats import norm, t

def get_normal_confidence_interval(data, confidence=0.95):
    # Calculate sample mean and standard deviation
    sample_mean = np.mean(data)
    sample_std = np.std(data, ddof=1)
    n = len(data)

    # Calculate the critical value (z) based on the confidence level
    z = norm.ppf((1 + confidence) / 2)

    # Calculate the margin of error using the standard error
    margin_of_error = z * sample_std / np.sqrt(n)

    # Calculate the confidence interval
    lower_bound = sample_mean - margin_of_error
    upper_bound = sample_mean + margin_of_error

    return lower_bound, upper_bound

get_normal_confidence_interval(df.kms_during_program.values)
# (111.86, 260.55)
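This interval relies on the normal approximation, which is optimistic for small samples. With few observations, the t-distribution with n - 1 degrees of freedom is more appropriate and yields a wider, more conservative interval: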
def get_ttest_confidence_interval(data, confidence=0.95):
    # Calculate sample mean and standard deviation
    sample_mean = np.mean(data)
    sample_std = np.std(data, ddof=1)
    n = len(data)

    # Calculate the critical value (t) based on the confidence level
    # and n - 1 degrees of freedom
    t_crit = t.ppf((1 + confidence) / 2, df=n - 1)

    # Calculate the margin of error using the standard error
    margin_of_error = t_crit * sample_std / np.sqrt(n)

    # Calculate the confidence interval
    lower_bound = sample_mean - margin_of_error
    upper_bound = sample_mean + margin_of_error

    return lower_bound, upper_bound

get_ttest_confidence_interval(df.kms_during_program.values)
# (102.72, 269.69)
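Both formulas lean on distributional assumptions that may not hold for small or skewed data. The bootstrap sidesteps them entirely: we resample the dataset with replacement num_batches times, compute the average kms for each resample, and take the quantiles of the resulting distribution as the bounds of the confidence interval.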
import tqdm
import pandas as pd
import matplotlib.pyplot as plt

def get_kms_confidence_interval(num_batches, confidence=0.95):
    # Running simulations
    tmp = []
    for i in tqdm.tqdm(range(num_batches)):
        tmp_df = df.sample(df.shape[0], replace=True)
        tmp.append(
            {
                'iteration': i,
                'mean_kms': tmp_df.kms_during_program.mean()
            }
        )

    # Saving data
    bootstrap_df = pd.DataFrame(tmp)

    # Calculating the confidence interval
    lower_bound = bootstrap_df.mean_kms.quantile((1 - confidence)/2)
    upper_bound = bootstrap_df.mean_kms.quantile(1 - (1 - confidence)/2)

    # Creating a chart
    ax = bootstrap_df.mean_kms.hist(bins=50, alpha=0.6, color='purple')
    ax.set_title('Average kms during the program, iterations = %d' % num_batches)

    plt.axvline(x=lower_bound, color='navy', linestyle='--',
                label='lower bound = %.2f' % lower_bound)
    plt.axvline(x=upper_bound, color='navy', linestyle='--',
                label='upper bound = %.2f' % upper_bound)

    ax.annotate('CI lower bound: %.2f' % lower_bound,
                xy=(lower_bound, ax.get_ylim()[1]),
                xytext=(-10, -20),
                textcoords='offset points',
                ha='center', va='top',
                color='navy', rotation=90)
    ax.annotate('CI upper bound: %.2f' % upper_bound,
                xy=(upper_bound, ax.get_ylim()[1]),
                xytext=(-10, -20),
                textcoords='offset points',
                ha='center', va='top',
                color='navy', rotation=90)
    plt.xlim(ax.get_xlim()[0] - 20, ax.get_xlim()[1] + 20)
    plt.show()

get_kms_confidence_interval(100)

Estimating custom metrics
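The real advantage of bootstrap is that it works for arbitrary metrics with no closed-form confidence interval. In the example below, we estimate the share of customers eligible for a refund, defined in the code as those who ran at least 150 km during the program but still did not finish the marathon: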

import tqdm
import matplotlib.pyplot as plt

def get_refund_share_confidence_interval(num_batches, confidence=0.95):
    # Running simulations
    tmp = []
    for i in tqdm.tqdm(range(num_batches)):
        tmp_df = df.sample(df.shape[0], replace=True)
        tmp_df['refund'] = list(map(
            lambda kms, passed: 1 if (kms >= 150) and (passed == 0) else 0,
            tmp_df.kms_during_program,
            tmp_df.finished_marathon
        ))

        tmp.append(
            {
                'iteration': i,
                'refund_share': tmp_df.refund.mean()
            }
        )

    # Saving data
    bootstrap_df = pd.DataFrame(tmp)

    # Calculating the confidence interval
    lower_bound = bootstrap_df.refund_share.quantile((1 - confidence)/2)
    upper_bound = bootstrap_df.refund_share.quantile(1 - (1 - confidence)/2)

    # Creating a chart
    ax = bootstrap_df.refund_share.hist(bins=50, alpha=0.6, color='purple')
    ax.set_title('Share of refunds, iterations = %d' % num_batches)
    plt.axvline(x=lower_bound, color='navy', linestyle='--',
                label='lower bound = %.2f' % lower_bound)
    plt.axvline(x=upper_bound, color='navy', linestyle='--',
                label='upper bound = %.2f' % upper_bound)
    ax.annotate('CI lower bound: %.2f' % lower_bound,
                xy=(lower_bound, ax.get_ylim()[1]),
                xytext=(-10, -20),
                textcoords='offset points',
                ha='center', va='top',
                color='navy', rotation=90)
    ax.annotate('CI upper bound: %.2f' % upper_bound,
                xy=(upper_bound, ax.get_ylim()[1]),
                xytext=(-10, -20),
                textcoords='offset points',
                ha='center', va='top',
                color='navy', rotation=90)
    plt.xlim(-0.1, 1)
    plt.show()

Estimation of effects
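Bootstrap can also estimate model parameters. To understand the relationship between the kilometres run during the program and the number of previous races, we refit a linear regression on each resample and look at the distribution of the races_before coefficient: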

import statsmodels.formula.api as smf

def get_races_coef_confidence_interval(num_batches, confidence=0.95):
    # Running simulations
    tmp = []
    for i in tqdm.tqdm(range(num_batches)):
        tmp_df = df.sample(df.shape[0], replace=True)
        # Linear regression model
        model = smf.ols('kms_during_program ~ races_before', data=tmp_df).fit()

        tmp.append(
            {
                'iteration': i,
                'races_coef': model.params['races_before']
            }
        )

    # Saving data
    bootstrap_df = pd.DataFrame(tmp)

    # Calculating the confidence interval
    lower_bound = bootstrap_df.races_coef.quantile((1 - confidence)/2)
    upper_bound = bootstrap_df.races_coef.quantile(1 - (1 - confidence)/2)

    # Creating a chart
    ax = bootstrap_df.races_coef.hist(bins=50, alpha=0.6, color='purple')
    ax.set_title('Coefficient between kms during the program and previous races, iterations = %d' % num_batches)
    plt.axvline(x=lower_bound, color='navy', linestyle='--',
                label='lower bound = %.2f' % lower_bound)
    plt.axvline(x=upper_bound, color='navy', linestyle='--',
                label='upper bound = %.2f' % upper_bound)
    ax.annotate('CI lower bound: %.2f' % lower_bound,
                xy=(lower_bound, ax.get_ylim()[1]),
                xytext=(-10, -20),
                textcoords='offset points',
                ha='center', va='top',
                color='navy', rotation=90)
    ax.annotate('CI upper bound: %.2f' % upper_bound,
                xy=(upper_bound, ax.get_ylim()[1]),
                xytext=(10, -20),
                textcoords='offset points',
                ha='center', va='top',
                color='navy', rotation=90)
    # plt.legend()
    plt.xlim(ax.get_xlim()[0] - 5, ax.get_xlim()[1] + 5)
    plt.show()

    return bootstrap_df

Simulations for A/B testing

Power analysis
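Before launching an experiment, we need to estimate how many customers are required to detect the expected effect with acceptable error rates. The closed-form approach from statsmodels gives us a baseline that we can later compare against simulations: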

import statsmodels.stats.power as stat_power
import statsmodels.stats.proportion as stat_prop

base_retention = before_df.retention.mean()
ret_effect_size = stat_prop.proportion_effectsize(base_retention + 0.03,
                                                  base_retention)

sample_size = 2 * stat_power.tt_ind_solve_power(
    effect_size=ret_effect_size,
    alpha=0.05, power=0.9,
    nobs1=None,  # nobs1 is None, so the function solves for it
    alternative='larger'
)
# ret_effect_size = 0.0632, sample_size = 8573.86
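Note that tt_ind_solve_power returns the size of one group (nobs1), which is why we multiply by two: detecting a 3 percentage point uplift in retention with alpha = 0.05 and 90% power requires roughly 8.6K customers in total. We can repeat the calculation for an absolute uplift of 20 in customer value: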
val_effect_size = 20 / before_df.customer_value.std()

sample_size = 2 * stat_power.tt_ind_solve_power(
    effect_size=val_effect_size,
    alpha=0.05, power=0.9,
    nobs1=None,
    alternative='larger'
)
# val_effect_size = 0.0527, sample_size = 12324.13
stat_power.tt_ind_solve_power(
    effect_size=ret_effect_size,
    alpha=0.05, power=None,
    nobs1=2500,
    alternative='larger'
)
# 0.7223

stat_power.tt_ind_solve_power(
    effect_size=val_effect_size,
    alpha=0.05, power=None,
    nobs1=2500,
    alternative='larger'
)
# 0.5867
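The same function can be inverted to estimate power for a fixed sample size: with only 2500 customers per group, we would reach about 72% power for the retention effect and just 59% for the value effect, well short of the 90% we aimed for. To sanity-check these closed-form estimates, let's reproduce the experiment in simulation, starting with a helper that generates synthetic experiment data: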
def get_sample_for_value(pop_df, sample_size, effect_size):
    # getting a sample of the needed size
    sample_df = pop_df.sample(sample_size)

    # randomly assigning treatment
    sample_df['treatment'] = sample_df.index.map(
        lambda x: 1 if np.random.uniform() > 0.5 else 0)

    # adding the effect for the treatment group
    sample_df['predicted_value'] = sample_df['customer_value'] \
        + effect_size * sample_df.treatment

    return sample_df
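This helper assumes a constant additive effect for every treated customer, and treatment is assigned by a fair coin flip per user, so the group sizes are only approximately equal.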
import statsmodels.formula.api as smf

val_model = smf.ols(
    'customer_value ~ num_family_members + country_avg_annual_earning',
    data=before_df).fit()
val_model.summary().tables[1]
def get_ci_for_value(df, boot_iters, confidence_level):
    tmp_data = []

    for iter in range(boot_iters):
        sample_df = df.sample(df.shape[0], replace=True)
        val_model = smf.ols(
            'predicted_value ~ treatment + num_family_members + country_avg_annual_earning',
            data=sample_df).fit()
        tmp_data.append(
            {
                'iteration': iter,
                'coef': val_model.params['treatment']
            }
        )

    coef_df = pd.DataFrame(tmp_data)
    return (coef_df.coef.quantile((1 - confidence_level)/2),
            coef_df.coef.quantile(1 - (1 - confidence_level)/2))
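Rather than comparing raw group means, we bootstrap the treatment coefficient from a regression that also includes num_family_members and country_avg_annual_earning; controlling for covariates that explain part of the variance in customer value tightens the resulting confidence interval. Now we can wire everything together: run many synthetic experiments and record the bootstrap CI for each: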
def run_simulations_for_value(pop_df, sample_size, effect_size,
                              boot_iters, confidence_level, num_simulations):
    tmp_data = []

    for sim in tqdm.tqdm(range(num_simulations)):
        sample_df = get_sample_for_value(pop_df, sample_size, effect_size)
        num_users_treatment = sample_df[sample_df.treatment == 1].shape[0]
        value_treatment = sample_df[sample_df.treatment == 1].predicted_value.mean()
        num_users_control = sample_df[sample_df.treatment == 0].shape[0]
        value_control = sample_df[sample_df.treatment == 0].predicted_value.mean()

        ci_lower, ci_upper = get_ci_for_value(sample_df, boot_iters, confidence_level)

        tmp_data.append(
            {
                'experiment_id': sim,
                'num_users_treatment': num_users_treatment,
                'value_treatment': value_treatment,
                'num_users_control': num_users_control,
                'value_control': value_control,
                'sample_size': sample_size,
                'effect_size': effect_size,
                'boot_iters': boot_iters,
                'confidence_level': confidence_level,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper
            }
        )

    return pd.DataFrame(tmp_data)

val_sim_df = run_simulations_for_value(before_df, sample_size=100,
    effect_size=20, boot_iters=1000, confidence_level=0.95,
    num_simulations=20)
val_sim_df.set_index('experiment_id')[['sample_size', 'ci_lower', 'ci_upper']].head()
val_sim_df['successful_experiment'] = val_sim_df.ci_lower.map(
    lambda x: 1 if x > 0 else 0)

val_sim_df.groupby(['sample_size', 'effect_size']).aggregate(
    {
        'successful_experiment': 'mean',
        'experiment_id': 'count'
    }
)
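The share of experiments whose confidence interval lies entirely above zero is an empirical estimate of statistical power for this sample size and effect size. Running the simulations across a grid of sample sizes (and later effect sizes) lets us compare the empirical power to the closed-form estimates: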
tmp_dfs = []
for sample_size in [100, 250, 500, 1000, 2500, 5000, 10000, 25000]:
    print('Simulation for sample size = %d' % sample_size)
    tmp_dfs.append(
        run_simulations_for_value(before_df, sample_size=sample_size,
            effect_size=20, boot_iters=1000, confidence_level=0.95,
            num_simulations=20)
    )

val_lowres_sim_df = pd.concat(tmp_dfs)
tmp_dfs = []
for effect_size in [1, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]:
    print('Simulation for effect size = %d' % effect_size)
    tmp_dfs.append(
        run_simulations_for_value(before_df, sample_size=12500,
            effect_size=effect_size, boot_iters=1000,
            confidence_level=0.95, num_simulations=100)
    )

val_effect_size_sim_df = pd.concat(tmp_dfs)
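The value metric was straightforward to simulate; retention is binary, so it needs a different data-generation approach, implemented in get_sample_for_retention below and discussed after the code.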
import tqdm

def get_sample_for_retention(pop_df, sample_size, effect_size):
    base_ret_model = smf.logit('retention ~ num_family_members',
                               data=pop_df).fit(disp=0)
    tmp_pop_df = pop_df.copy()
    tmp_pop_df['predicted_retention_proba'] = base_ret_model.predict()
    sample_df = tmp_pop_df.sample(sample_size)
    sample_df['treatment'] = sample_df.index.map(
        lambda x: 1 if np.random.uniform() > 0.5 else 0)
    sample_df['predicted_retention_proba'] = sample_df['predicted_retention_proba'] \
        + effect_size * sample_df.treatment
    sample_df['retention'] = sample_df.predicted_retention_proba.map(
        lambda x: 1 if x >= np.random.uniform() else 0)
    return sample_df

def get_ci_for_retention(df, boot_iters, confidence_level):
    tmp_data = []

    for iter in range(boot_iters):
        sample_df = df.sample(df.shape[0], replace=True)
        ret_model = smf.logit('retention ~ treatment + num_family_members',
                              data=sample_df).fit(disp=0)
        tmp_data.append(
            {
                'iteration': iter,
                'coef': ret_model.params['treatment']
            }
        )

    coef_df = pd.DataFrame(tmp_data)
    return (coef_df.coef.quantile((1 - confidence_level)/2),
            coef_df.coef.quantile(1 - (1 - confidence_level)/2))

def run_simulations_for_retention(pop_df, sample_size, effect_size,
                                  boot_iters, confidence_level, num_simulations):
    tmp_data = []

    for sim in tqdm.tqdm(range(num_simulations)):
        sample_df = get_sample_for_retention(pop_df, sample_size, effect_size)
        num_users_treatment = sample_df[sample_df.treatment == 1].shape[0]
        retention_treatment = sample_df[sample_df.treatment == 1].retention.mean()
        num_users_control = sample_df[sample_df.treatment == 0].shape[0]
        retention_control = sample_df[sample_df.treatment == 0].retention.mean()

        ci_lower, ci_upper = get_ci_for_retention(sample_df, boot_iters, confidence_level)

        tmp_data.append(
            {
                'experiment_id': sim,
                'num_users_treatment': num_users_treatment,
                'retention_treatment': retention_treatment,
                'num_users_control': num_users_control,
                'retention_control': retention_control,
                'sample_size': sample_size,
                'effect_size': effect_size,
                'boot_iters': boot_iters,
                'confidence_level': confidence_level,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper
            }
        )

    return pd.DataFrame(tmp_data)
base_ret_model = smf.logit('retention ~ num_family_members',
                           data=before_df).fit(disp=0)
base_ret_model.summary().tables[1]

# the same steps as inside get_sample_for_retention, one by one:
base_ret_model = smf.logit('retention ~ num_family_members', data=pop_df)\
    .fit(disp=0)
tmp_pop_df = pop_df.copy()
tmp_pop_df['predicted_retention_proba'] = base_ret_model.predict()
sample_df = tmp_pop_df.sample(sample_size)
sample_df['treatment'] = sample_df.index.map(
    lambda x: 1 if np.random.uniform() > 0.5 else 0)
sample_df['predicted_retention_proba'] = sample_df['predicted_retention_proba'] \
    + effect_size * sample_df.treatment
sample_df['retention'] = sample_df.predicted_retention_proba.map(
    lambda x: 1 if x >= np.random.uniform() else 0)

get_sample_for_retention(before_df, 10000, 0.3)\
    .groupby('treatment', as_index=False).retention.mean()

# |   | treatment | retention |
# |--:|----------:|----------:|
# | 0 |         0 |  0.640057 |
# | 1 |         1 |  0.937648 |
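Retention is binary, so we cannot simply add a constant effect to it. Instead, get_sample_for_retention fits a logistic regression of retention on num_family_members, shifts each sampled customer's predicted retention probability by the effect size in the treatment group, and draws the binary outcome from that probability. The sanity check above confirms the mechanics: with an effect of 0.3, we see roughly 64% retention in control versus 94% in treatment.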

Analysing results

value_model = smf.ols(
    'customer_value ~ treatment + num_family_members + country_avg_annual_earning',
    data=experiment_df).fit()
value_model.summary().tables[1]

get_ci_for_value(experiment_df.rename(
    columns={'customer_value': 'predicted_value'}), 1000, 0.95)
# (16.28, 34.63)

retention_model = smf.logit('retention ~ treatment + num_family_members',
                            data=experiment_df).fit(disp=0)
retention_model.summary().tables[1]

get_ci_for_retention(experiment_df, 1000, 0.95)
# (0.072, 0.187)
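The treatment coefficient from the logistic regression is on the log-odds scale and hard to interpret directly. To express the effect in terms of retention probability, we predict retention for every customer twice, as if treated and as if not, and average the difference: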
experiment_df['treatment_eq_1'] = 1
experiment_df['treatment_eq_0'] = 0

experiment_df['retention_proba_treatment'] = retention_model.predict(
    experiment_df[['retention', 'treatment_eq_1', 'num_family_members']]\
        .rename(columns={'treatment_eq_1': 'treatment'}))

experiment_df['retention_proba_control'] = retention_model.predict(
    experiment_df[['retention', 'treatment_eq_0', 'num_family_members']]\
        .rename(columns={'treatment_eq_0': 'treatment'}))

experiment_df['proba_diff'] = experiment_df.retention_proba_treatment \
    - experiment_df.retention_proba_control

experiment_df.proba_diff.mean()
# 0.0281
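So the estimated average effect of the treatment on retention probability is about 2.8 percentage points.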

Summary
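In this article, we used bootstrap to build confidence intervals for averages, custom metrics, and regression coefficients on observational data, and then applied the same simulation mindset to A/B testing: estimating the required sample size, validating power with synthetic experiments for both continuous and binary metrics, and analysing the results of an experiment with regression-based confidence intervals.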


Written by Mariya Mansurova
Data & Product Analytics Lead at Wise | ClickHouse Evangelist