In this lecture, I'm going to introduce you to the A/B testing framework in Bayesian analysis. A/B testing is used everywhere. It's useful in marketing, retail, online advertising, user experience, and many other domains. If you are a data scientist working in online retail, you might want to tell your supervisor that "website design A is better than website design B". The underlying motivation is that you want people to actually buy stuff through your website. So your A/B test might compare two different versions of an online store design, and you might be interested in the average revenue per customer, for example, because you want to go with the version that earns you more money. Although it's human nature to make fast decisions according to what we observe, we shouldn't take actions without evidence derived from data and statistical testing, such as hypothesis testing.
First, what is A/B testing? A/B testing is a kind of randomized experiment that compares two variants of the same thing. The goal of A/B testing is to see which variant is better. We have a lot of flexibility in defining the metric we test in order to pick the better variant based on a numerical comparison. Generally, it's helpful to think of A/B testing as comparing a new idea against an established one. For instance, if we're comparing two website designs, we might be interested in 1) checking whether changing text and background colors on a webpage leads to more sign-ups, or 2) testing whether a different design layout leads to more purchases in an online store. Bayesian A/B testing is simply the term for an A/B testing process on two variants carried out with Bayesian methods. Cool!
Bayesian methodology is appealing because each time we conclude an experiment feeling confident that a new design will provide a visible improvement, we can plan the next experiment almost right away. Remember that a Bayesian model is analogous to a lifelong learner who accumulates all previous data and prior distributions as knowledge. Because of this, we can iterate quickly and run more experiments to test new designs effectively.
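To make this "lifelong learner" idea concrete, here is a minimal sketch of a conjugate Beta-Binomial update, where the posterior from one experiment becomes the prior for the next. The numbers are hypothetical and purely for illustration; they are not part of this course's dataset.

from scipy import stats
# Hypothetical experiment 1: start from a flat Beta(1, 1) prior, then observe 45 purchases out of 100 visitors.
alpha, beta = 1, 1
alpha, beta = alpha + 45, beta + (100 - 45)
# Hypothetical experiment 2: reuse that posterior as the new prior, then observe 55 purchases out of 100 visitors.
alpha, beta = alpha + 55, beta + (100 - 55)
print('Posterior mean conversion rate after both experiments:', stats.beta(alpha, beta).mean())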
For instance, when we use Bayesian A/B testing to compare different designs, we can test them many times, or test them with a large pool of clients. We can create metrics such as 1) user engagement - time spent on the website, 2) conversions - the number of people who sign up with the website or make purchase(s), and 3) rating - the score users give for the clarity and readability of the online content, etc. If we clearly know which design we should pick after these comparisons, we become more confident that that design will better acquire users or generate more revenue. We are more convinced that the design we pick can better achieve our business goals. Remember that the goal of Bayesian A/B testing can be keeping visitors on the site longer and converting website visits into sign-ups, purchases, or subscriptions, which helps generate revenue.
Cool! That's a snapshot of Bayesian A/B testing. Now let's start coding in the next video!
In this tutorial, we'll demonstrate how to use Bayesian A/B testing in an online store.
# Let's first import pandas, scipy, numpy, seaborn, pymc3 and arviz packages. Those are just standard packages
# useful for data manipulation and Bayesian analysis.
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import pymc3 as pm
import arviz as az
# Now let's read in the item orders and views data, which is stored in the assets folder.
orders_and_views = pd.read_csv('assets/a_b_testing_for_ordering_and_viewing_items.csv')
# And take a look.
orders_and_views
| | item_id | test_assignment | test_number | test_start_date |
|---|---|---|---|---|
| 0 | 3542.0 | 0.0 | test_a | 2013-01-05 00:00:00 |
| 1 | 1981.0 | 1.0 | test_a | 2013-01-05 00:00:00 |
| 2 | 3114.0 | 0.0 | test_a | 2013-01-05 00:00:00 |
| 3 | 1295.0 | 1.0 | test_a | 2013-01-05 00:00:00 |
| 4 | 2624.0 | 0.0 | test_a | 2013-01-05 00:00:00 |
| ... | ... | ... | ... | ... |
| 13183 | 2646.0 | 0.0 | test_f | 2013-01-05 00:00:00 |
| 13184 | 1602.0 | 0.0 | test_f | 2013-01-05 00:00:00 |
| 13185 | 463.0 | 1.0 | test_f | 2013-01-05 00:00:00 |
| 13186 | 512.0 | 0.0 | test_f | 2013-01-05 00:00:00 |
| 13187 | 2944.0 | 1.0 | test_f | 2013-01-05 00:00:00 |

13188 rows × 4 columns
As you can see, the data is rich. There are more than 13,000 records of item views and orders across 6 item tests, which we name test_a, test_b, test_c, test_d, test_e, and test_f. The dataset has 4 columns: for each record we see the item_id, the test_assignment (the binary purchase outcome we will analyze, 1 for a purchase and 0 otherwise), the test_number indicating which test the record belongs to, and the test_start_date.
Look at the test_assignment column. Can you imagine some purposes for launching 6 tests in total? I don't have the correct answer at hand. It's possible that the online retailer wants to know which of the 6 alternative online store designs leads to the best product sales in quantity. It's also possible that the online retailer wants to optimize product conversion, meaning the proportion of visitors who purchase a product.
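Before grouping the data, it's worth a quick sanity check that all six tests are present and similar in size. This is just an optional exploration sketch, not a required step of the analysis.

# Count how many records belong to each test group.
orders_and_views['test_number'].value_counts()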
# So the next step of our analysis is to group the records by test and compute the number of purchases,
# as well as the proportion of customers who purchased, for each test group.
# We can use a very convenient pandas function called pivot_table to group by test
# and compute statistics using specific aggregation functions. It's a typical pandas workhorse for
# aggregating the number and proportion of successful item purchases.
# We specify test_number as the index so that the resulting table is grouped by test. We use the
# test_assignment column (the purchase outcome for each record) as the values, so we aggregate the
# successful purchases within each test.
# First of all, we sum the purchases and save the result as a pandas DataFrame.
order_view_summary = orders_and_views.pivot_table(
index='test_number', values='test_assignment', aggfunc=np.sum)
# We add the next column, called total, which counts how many customers were assigned to each test group.
# So this time we switch the aggregation function to a lambda that returns the number of rows
# for each test group.
order_view_summary['total'] = orders_and_views.pivot_table(
index='test_number', values='test_assignment', aggfunc=lambda x: len(x))
# If you're not familiar with lambda functions, a lambda works just like defining a function.
# (Read only) The whole meaning of lambda x: len(x) is that:
# it returns the number of rows in each test_number group.
# The main difference is that a lambda is anonymous and written inline, so we don't have to define and name
# a separate one-off function elsewhere in the notebook just for this aggregation.
# Finally, we add a new column called rate. This time we don't pass an aggregation function, so pivot_table
# falls back to its default, the mean. Because test_assignment is binary, the mean is exactly the proportion
# of test customers who actually bought the product.
# I think this is a very nice trick of data manipulation using the pivot_table function.
order_view_summary['rate'] = orders_and_views.pivot_table(
values='test_assignment', index='test_number')
# Now we can look at the summary table.
order_view_summary
| test_number | test_assignment | total | rate |
|---|---|---|---|
| test_a | 1086.0 | 2198.0 | 0.494086 |
| test_b | 1068.0 | 2198.0 | 0.485896 |
| test_c | 1123.0 | 2198.0 | 0.510919 |
| test_d | 1105.0 | 2198.0 | 0.502730 |
| test_e | 1078.0 | 2198.0 | 0.490446 |
| test_f | 1081.0 | 2198.0 | 0.491811 |
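# (Optional sanity check, a sketch) The rate column should equal the number of purchases divided by the
# group size, so np.allclose should return True.
np.allclose(order_view_summary['rate'],
            order_view_summary['test_assignment'] / order_view_summary['total'])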
(Concept review: the Bayesian approach models each parameter as a random variable with some probability distribution. So we can calculate probability distributions (and thus expected values) for the parameters of interest directly.)
Before conducting Bayesian A/B testing in the online store example, we need to formulate a research question that specifically states what problem we want to solve and, especially, what parameter we want to infer. You should also know your baseline, which means you should define the key metrics you are working with. Here's an example.
# Without much background knowledge about how the company crafted the 6 online store designs, we can
# extract the purchase outcome of each record separately for each test design group, and name the variables
# design_a, design_b, and so on.
# These will be used as the observed data when we set up the Bayesian A/B testing model in PyMC3.
design_a = orders_and_views[orders_and_views['test_number'] == 'test_a']['test_assignment']
design_b = orders_and_views[orders_and_views['test_number'] == 'test_b']['test_assignment']
design_c = orders_and_views[orders_and_views['test_number'] == 'test_c']['test_assignment']
design_d = orders_and_views[orders_and_views['test_number'] == 'test_d']['test_assignment']
design_e = orders_and_views[orders_and_views['test_number'] == 'test_e']['test_assignment']
design_f = orders_and_views[orders_and_views['test_number'] == 'test_f']['test_assignment']
# Cool! So we have six variables storing the purchase outcome for 6 designs.
# Traditionally, we could do A/B testing directly at this point and compute a p-value for each design comparison.
# One method is to use the ttest_ind function in the scipy stats module. We'll jump right to the result, since
# the code below is just a demo of the traditional method, not the Bayesian method.
# Let's take a look at all five comparisons.
from scipy.stats import ttest_ind
print(f'p-value design a versus b: {ttest_ind(design_a, design_b, equal_var=False, alternative="less").pvalue:.1%}')
print(f'p-value design a versus c: {ttest_ind(design_a, design_c, equal_var=False, alternative="less").pvalue:.1%}')
print(f'p-value design a versus d: {ttest_ind(design_a, design_d, equal_var=False, alternative="less").pvalue:.1%}')
print(f'p-value design a versus e: {ttest_ind(design_a, design_e, equal_var=False, alternative="less").pvalue:.1%}')
print(f'p-value design a versus f: {ttest_ind(design_a, design_f, equal_var=False, alternative="less").pvalue:.1%}')
# We see that online store designs c and d are more likely to be better than the current design,
# while designs b, e, and f are more likely to be worse than the current design.
# (Read) The definition of a p-value: the p-value of 13.2% for design c means that, if there were actually
# no difference between design c and the current design, there would be a 0.132 probability of observing
# an improvement at least as large as the one we saw purely by chance. The point is about randomness: if it
# would be hard to observe such an improvement by chance alone, we are more confident that the new design
# leads to a significant improvement. In that case, the p-value is low (typically falling below the 0.05
# threshold).
p-value design a versus b: 70.6%
p-value design a versus c: 13.2%
p-value design a versus d: 28.3%
p-value design a versus e: 59.5%
p-value design a versus f: 56.0%
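# (Optional aside, a sketch) To make the p-value idea more tangible, here is a simple permutation test for
# design c versus the current design a, using only numpy. Under the null hypothesis that the design label
# doesn't matter, shuffling the labels should produce rate differences as large as the observed one fairly
# often; the fraction of shuffles where that happens approximates the one-sided p-value.
rng = np.random.default_rng(42)
observed_diff = design_c.mean() - design_a.mean()
pooled = np.concatenate([design_a.values, design_c.values])
n_a = len(design_a)
perm_diffs = []
for _ in range(5000):
    shuffled = rng.permutation(pooled)
    perm_diffs.append(shuffled[n_a:].mean() - shuffled[:n_a].mean())
print(f'Approximate permutation p-value, a versus c: {np.mean(np.array(perm_diffs) >= observed_diff):.1%}')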
# p-values are not relied on as heavily as they were 20 years ago, partly because Bayesian methods have
# advanced so much and become far more accessible, thanks in part to the PyMC3 community.
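# (Optional aside, a sketch) In the model below we'll assign each design a Beta(45, 55) prior for its
# purchase rate. Before building the model, here is a quick check of how strong that prior is: its mean is
# 0.45 and most of its mass sits in a fairly narrow band around that value.
prior = stats.beta(45, 55)
print('Prior mean:', prior.mean())
print('95% prior interval:', prior.interval(0.95))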
# Now let's start creating a PyMC3 model to figure out how likely it is that a particular online store design
# actually boosts product sales significantly.
# We create a context manager called pm.Model() and name this model as ab_testing.
with pm.Model() as ab_testing:
# In the first step, we need to decide on a prior distribution for the proportion of purchases, which is
# a kind of conversion rate. Recall that in course 1 we showed a conversion rate example for online ads,
# and conversion rates lie between 0 and 1. So for simplicity, we can choose the Beta distribution, which
# makes sense for modeling the "probability of a probability".
# Remember, our purpose is to determine whether a certain design improves the proportion of customer purchases.
# Therefore, for each of the 6 designs, we assign the same prior belief about the purchase proportion.
# Some of you may have already noticed that assigning the same prior to each group before conducting the test
# is analogous to specifying the fair prior we used for hypothesis testing. And to be clear, Bayesian A/B
# testing is exactly one form of hypothesis testing.
# Because we already know from the historical data that, under the current design_a, the success rate for
# product purchases is slightly less than a half, let's center the Beta prior at 0.45 by setting
# alpha to 45 and beta to 55. I personally think it's a fairly strong prior distribution.
test_a = pm.Beta("test_a", alpha = 45, beta = 55)
test_b = pm.Beta("test_b", alpha = 45, beta = 55)
test_c = pm.Beta("test_c", alpha = 45, beta = 55)
test_d = pm.Beta("test_d", alpha = 45, beta = 55)
test_e = pm.Beta("test_e", alpha = 45, beta = 55)
test_f = pm.Beta("test_f", alpha = 45, beta = 55)
# After setting the same prior for each test design, we can be confident that any differences the model
# shows between tests reflect real differences in customer conversion coming from the data.
# The next step is to define the improvement in purchase proportion as a deterministic variable,
# because the difference test_b - test_a (and likewise for any other pair of designs) is completely
# determined once the values of test_a through test_f are known.
b_improvement = pm.Deterministic("b_improvement", test_b - test_a)
c_improvement = pm.Deterministic("c_improvement", test_c - test_a)
d_improvement = pm.Deterministic("d_improvement", test_d - test_a)
e_improvement = pm.Deterministic("e_improvement", test_e - test_a)
f_improvement = pm.Deterministic("f_improvement", test_f - test_a)
# We now need to define the likelihood. Conversion is binary, i.e. success or failure - there is nothing
# in between. One straightforward way is to specify a Bernoulli variable for each set of binary observations,
# because Bernoulli variables can take only the values 0 and 1. We use pm.Bernoulli for the likelihood of
# each design, set the variable names 'a_obs', 'b_obs' and so on, and use a single probability parameter from
# test_a through test_f above. In other words, we just reuse the probability parameter from the prior for
# each test group, and pass the arrays of binary data through the observed option.
# Awesome!
a_obs = pm.Bernoulli('a_obs', p = test_a, observed = design_a)
b_obs = pm.Bernoulli('b_obs', p = test_b, observed = design_b)
c_obs = pm.Bernoulli('c_obs', p = test_c, observed = design_c)
d_obs = pm.Bernoulli('d_obs', p = test_d, observed = design_d)
e_obs = pm.Bernoulli('e_obs', p = test_e, observed = design_e)
f_obs = pm.Bernoulli('f_obs', p = test_f, observed = design_f)
# We finally reached the sampling step. This time let's draw more samples in each chain.
# We'll return 3,000 samples per chain, each with 2,000 initial tuning samples discarded
# before the model starts to return sample values. We still use three chains of samples.
# (What does return_inferencedata = True mean? It returns the samples as an ArviZ InferenceData object,
# which is why we can index trace.posterior later on.)
trace = pm.sample(3000, chains = 3, tune = 2000, return_inferencedata = True)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [test_f, test_e, test_d, test_c, test_b, test_a]
Sampling 3 chains for 2_000 tune and 3_000 draw iterations (6_000 + 9_000 draws total) took 18 seconds.
# Cool! It took less than a minute to sample from the posterior distributions of the designs and their
# differences.
# With the trace object containing the samples, we can start with the plot_posterior function in arviz to
# plot the 95% highest posterior density interval.
# Note that if we don't need to plot everything, we can specify var_names. This time we only focus
# on the improvement of each design over design a. So we list 'b_improvement', 'c_improvement'
# and so on, and set hdi_prob to 0.95 to plot the 95% highest posterior density interval for each design
# test.
az.plot_posterior(trace, hdi_prob = 0.95,
var_names = ['b_improvement', 'c_improvement', 'd_improvement',
'e_improvement', 'f_improvement'])
# You can see that none of the 95% highest posterior density intervals lies entirely on the negative or the
# positive side of zero.
# It means that in this experiment, we can't see a visible improvement from using any of the alternative
# designs (test_b, test_c, test_d, test_e, test_f) to boost the proportion of customers purchasing products
# online, assuming that the products are the same under each condition.
# In other words, as data scientists we should report to the decision makers that, based on this outcome,
# the company should continue using the current design. But this by no means indicates that the new features
# can't help the company gain profits.
# Therefore, despite the disappointment about this unexciting result, it's actually good practice as data
# scientists to look deeper into any good sign of a better design in order to create designs for the next
# iteration. We can combine several new features in the next Bayesian A/B test and see whether a substantial
# improvement becomes visible compared to the current design.
# In other words, we're unlikely to conclude that the current design is good enough for the online store and
# shut down the research group right away.
# Looking closely at the posterior means, we find that some new designs are likely to perform slightly better
# than the current design. Among the test groups, the posterior means of the
# design c and design d improvements are positive. We can describe those improvements as anecdotal
# (it's good to make such an interpretation), so we might tell the company to pay more attention to these
# new features when crafting new designs. But still, because none of the posterior distributions lies
# solely on the positive side, it's important to emphasize that we do not have sufficient
# evidence to establish a conclusive decision to switch to a different online store design immediately.
[Posterior density plots with 95% HDI for b_improvement, c_improvement, d_improvement, e_improvement, and f_improvement]
# Let's also compute how often the new designs outperform the control group, which is
# the current design, in the posterior samples.
# We can extract the posterior samples from the trace object via trace.posterior, with brackets indicating
# the variable, and take their values. We then compute the proportion of positive values with np.mean,
# comparing the posterior sample values against 0.
print('Probability that design b is better:', np.mean(trace.posterior['b_improvement'].values > 0))
print('Probability that design c is better:', np.mean(trace.posterior['c_improvement'].values > 0))
print('Probability that design d is better:', np.mean(trace.posterior['d_improvement'].values > 0))
print('Probability that design e is better:', np.mean(trace.posterior['e_improvement'].values > 0))
print('Probability that design f is better:', np.mean(trace.posterior['f_improvement'].values > 0))
# The probability that design c and design d attract a higher proportion of purchasing customers is 0.863
# and 0.714, respectively (every time we run the model, these proportions will change slightly due to the
# randomness in sampling). This indicates that the features of designs c and d might be worth refactoring,
# or embedding into new designs for the next iteration. The probability drops below 0.5 for the other designs.
Probability that design b is better: 0.29588888888888887
Probability that design c is better: 0.8626666666666667
Probability that design d is better: 0.7101111111111111
Probability that design e is better: 0.4032222222222222
Probability that design f is better: 0.4358888888888889
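# (Optional extension, a sketch - not part of the original analysis) With the same posterior samples, we can
# also estimate the probability that each design is the single best of the six, by checking which design has
# the highest purchase rate in each posterior draw.
names = ['test_a', 'test_b', 'test_c', 'test_d', 'test_e', 'test_f']
samples = np.column_stack([trace.posterior[name].values.ravel() for name in names])
best_counts = np.bincount(samples.argmax(axis=1), minlength=len(names))
for name, count in zip(names, best_counts):
    print(f'Probability that {name} has the highest purchase rate: {count / samples.shape[0]:.3f}')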
# At this late stage of posterior diagnostics, we're going to validate whether we can actually report our
# study to an audience. We'll check three diagnostics: 1) effective sample size, 2) convergence (r_hat), and
# 3) autocorrelation.
# Let's start with effective sample size and convergence.
# We can call the pm.summary() function to generate a tabular summary of posterior statistics.
pm.summary(trace)
# It's immediately clear that we pass all effective sample size and convergence checks. Let's pause a bit
# and explain why I'm confident in saying that.
# (In-class question: what do the effective sample size and convergence statistics in this table indicate?)
# Answer: the r_hat statistic is 1.0 for all variables, indicating good convergence.
# Answer: the ess_bulk is much higher than the rule-of-thumb threshold of 200, indicating that our
# posterior samples are reliable.
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| test_a | 0.492 | 0.011 | 0.473 | 0.512 | 0.0 | 0.0 | 18430.0 | 7806.0 | 1.0 |
| test_b | 0.484 | 0.011 | 0.465 | 0.505 | 0.0 | 0.0 | 17592.0 | 7609.0 | 1.0 |
| test_c | 0.508 | 0.010 | 0.490 | 0.529 | 0.0 | 0.0 | 15869.0 | 7119.0 | 1.0 |
| test_d | 0.500 | 0.011 | 0.481 | 0.521 | 0.0 | 0.0 | 16952.0 | 6659.0 | 1.0 |
| test_e | 0.489 | 0.010 | 0.469 | 0.508 | 0.0 | 0.0 | 17520.0 | 6856.0 | 1.0 |
| test_f | 0.490 | 0.010 | 0.471 | 0.510 | 0.0 | 0.0 | 14205.0 | 6693.0 | 1.0 |
| b_improvement | -0.008 | 0.015 | -0.036 | 0.021 | 0.0 | 0.0 | 17258.0 | 7375.0 | 1.0 |
| c_improvement | 0.016 | 0.015 | -0.012 | 0.044 | 0.0 | 0.0 | 16679.0 | 7642.0 | 1.0 |
| d_improvement | 0.008 | 0.015 | -0.019 | 0.036 | 0.0 | 0.0 | 17641.0 | 7611.0 | 1.0 |
| e_improvement | -0.004 | 0.015 | -0.031 | 0.025 | 0.0 | 0.0 | 18436.0 | 6825.0 | 1.0 |
| f_improvement | -0.002 | 0.015 | -0.029 | 0.026 | 0.0 | 0.0 | 17001.0 | 7158.0 | 1.0 |
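# (Optional, a sketch) If you prefer programmatic checks over reading the table, az.rhat and az.ess compute
# the same diagnostics per variable; here we just pull out the worst-case values, which should match the
# r_hat and ess_bulk columns above.
rhat = az.rhat(trace)
ess = az.ess(trace)
print('Largest r_hat:', float(rhat.to_array().max()))
print('Smallest bulk ESS:', float(ess.to_array().min()))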
# We also use the az.plot_autocorr function on the trace object and set the combined option to True
# in order to show the autocorrelation for each variable with all chains of samples combined.
az.plot_autocorr(trace, combined=True)
# You can see that, after the initial lags, the autocorrelation in every panel quickly dwindles toward 0 and
# stays within a small band as the number of lags grows. So we can say that the posterior estimates
# we collect from the NUTS algorithm tend to be reliable, because the samples are largely independent.
[Autocorrelation plots (all chains combined) for test_a through test_f and b_improvement through f_improvement]
Congratulations! In summary, one way to choose between two or more versions is to run a Bayesian A/B test. We have now finished a full demonstration of Bayesian A/B testing to evaluate the improvement of new designs for an online store. I hope you now have a clear understanding of the general concerns when you conduct Bayesian A/B testing on many different problems.
One last suggestion for doing this kind of analysis: don't feel frustrated if your experiment does not yield a significant difference between the variants you're testing. Also, it's good practice to focus on one key metric for each Bayesian A/B test. Keeping this in mind ensures that we're always very clear about the specific metric we want to improve in every single test we design.
Despite the increasing popularity of, and advocacy for, Bayesian estimation in both research and industry, I have seen many oral and written Bayesian statistical reports that lack effective communicative language to allow non-statisticians to understand the results and their implications. One possible barrier to communicating Bayesian methods effectively is the difference in opinion among experts on how best to conduct and interpret Bayesian analysis.
Earlier in this course, we drafted three stages of a Bayesian statistical research setting according to the guidelines for Bayesian analysis compiled by the JASP research group.
After conducting a complete Bayesian A/B testing process (see the last video), we first update our beliefs about which design we will pick for the online store - in this case we're likely to keep the current online store design and prepare for future studies. We then draw our conclusions and communicate them - we do not adopt a new online store design, even though many new designs were tested.
Now let's come back to the general question of how to report the results from Bayesian estimation models clearly. In order to boost transparency and critical assessment of the statistical analysis, I'll recommend 10 strategies for writing an elaborate statistical analysis report for Bayesian estimation that includes all of the 1) relevant background information, 2) figures, 3) tables, and 4) contextual text (cite).
List of 10 strategies:
I hope this roughly 10-minute non-coding lecture encourages you to write your first Bayesian estimation project. The final project is a student-initiated project, so you can choose to write about the topic that interests you the most or is most relevant to the city where you live. While you're writing, you can revisit the recommendations and strategies for communicating results to other people again and again, and try to take actionable thoughts away from this lecture. And when executing and interpreting the study, don't give up when you feel frustrated during the modeling process or while interpreting complex or hierarchical results. Instead, try to start discussions with your peers - the real value of education, in my view, is having a community of people to look up to, get support from, and grow with. Being around people who also struggle with learning to code and with writing the final report, and sharing those feelings, can actually change the entire game and make learning much more doable. Good luck with your final report to finish course 2! Congratulations!