# From the ESPN.com, we extracted the records for at bats of two famous MLB players - Yuli Gurriel from Houston Astros
# and Vladimir Guerrero Jr. from Toronto Blue Jays. We're interested to see which player had a better at bat performance. 

# So let's import the pandas package
import pandas as pd

# and use the pd.read_csv function to bring in the real data from espn for the two top ranked MLB players in 2021.
batting_average = pd.read_csv('assets/mlb_batting_average.csv')

# Let's look at the first few rows of the data
batting_average.head()

# Cool! The data indicates that in year 2016, Yuli Gurriel failed to hit the first 5 bats.


# Okay, now let's see the last few rows of data using the tail() function.
batting_average.tail()

# You can see that in 2021, Vladimir Guerrero Jr. hit 3 out of the last 5 bats, which is awesome!

print(batting_average['Bat Outcome'].mean())

0.2888239116309292


# For the convenience of modeling, we can import the numpy library
import numpy as np

# and save two numpy arrays, one with all records of Yuli 
yuli = np.array(batting_average[batting_average['player'] == 'Yuli Gurriel']['Bat Outcome'])
# and another with Vlad.
vlad = np.array(batting_average[batting_average['player'] == "Vladimir Guerrero Jr."]['Bat Outcome'])


prob = len(batting_average[batting_average['Bat Outcome'] == 1]) / len(batting_average['Bat Outcome']) 
np.sqrt(prob * (1 - prob))

0.4532159084819711


%%time
# We need the pymc3 package since it is the toolkit to conduct probabilistic modeling.
# So we import pymc3 as pm, it's shorthand.
import pymc3 as pm


# Prior for group mean. Null hypothesis that the posterior probability distribution of the two MLB players
# are exactly the same, versus the alternative hypothesis that the psoterior prob of the two MLB players
# are different from each other.
# mu_m = batting_average['Bat Outcome'].mean()
# mu_s = batting_average['Bat Outcome'].std() * 2

alpha = len(batting_average[batting_average['Bat Outcome'] == 1])
beta = len(batting_average['Bat Outcome']) - alpha

# # Prior for group standard deviation
# sigma_low = 0.05
# sigma_high = 0.4

# The PyMC3 model is wrapped by the with statement, and let's call the model best_model.
with pm.Model() as best_model:
    # The first step of model specification is creating Beta priors for both players.
    # group 1 - Yuli: We create group1_prob variable with alpha, and beta being the parameters
    group1_prob = pm.Beta('group1_prob', alpha = alpha, beta = beta)
    # group 2 - Vlad: We create group2_prob as exactly the same prior distribution since the purpose of this
    # model is quantifying the difference between the two players, so we should start with the same prior
    # for both players.
    group2_prob = pm.Beta('group2_prob', alpha = alpha, beta = beta)
    
    # Uniform prior for group standard deviation - assuming we're not expert analyst for MLB
    group1_std = pm.Deterministic('group1_std', np.sqrt(group1_prob * (1 - group1_prob) / len(yuli)))
    group2_std = pm.Deterministic('group2_std', np.sqrt(group2_prob * (1 - group2_prob) / len(vlad)))
    
    # Exponential prior for degree of freedom - large sample
    # This will be used to calculate the effect size
    v = pm.Exponential('v_minus_one', 1/len(batting_average)) + 1
    
    # Transform the standard deviation to precision
    lambda_1 = group1_std ** -2
    lambda_2 = group2_std ** -2
    
    # Assign likelihood using T-distribution - which is when we use data, there should be little bit
    # longer tails than the normal distribution
    # You can see that we use the same prior for both drug and placebo population
    group1 = pm.StudentT("yuli", nu = v, mu = group1_mean, lam = lambda_1, observed = yuli)
    group2 = pm.StudentT("vlad", nu = v, mu = group2_mean, lam = lambda_2, observed = vlad)
    
    # Deterministic nodes for difference of means and standard deviations between two groups
    # These variables are not stochastic, so it just represents the same value of group mean and standard deviation
    # difference, and finally the effect size.
    diff_means = pm.Deterministic("difference of means", group1_mean - group2_mean)
    diff_stds = pm.Deterministic("difference of stds", group1_std - group2_std)
    # Formula for effect size
    effect_size = pm.Deterministic("effect size", diff_means / np.sqrt((group1_std ** 2 + group2_std **2) / 2))
    
    # Fit the model and evaluate the output as trace
    # The sample function runs 3000 iterations and returns a Trace object containing the samples 
    # collected in all chains, in the order they were collected.
    trace = pm.sample(3000, chains = 4, tune = 500)

---------------------------------------------------------------------------
MissingInputError                         Traceback (most recent call last)
<timed exec> in <module>

~/opt/anaconda3/lib/python3.8/site-packages/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, callback, jitter_max_retries, return_inferencedata, idata_kwargs, mp_ctx, pickle_backend, **kwargs)
    426     start = deepcopy(start)
    427     if start is None:
--> 428         check_start_vals(model.test_point, model)
    429     else:
    430         if isinstance(start, dict):

~/opt/anaconda3/lib/python3.8/site-packages/pymc3/util.py in check_start_vals(start, model)
    232             )
    233 
--> 234         initial_eval = model.check_test_point(test_point=elem)
    235 
    236         if not np.all(np.isfinite(initial_eval)):

~/opt/anaconda3/lib/python3.8/site-packages/pymc3/model.py in check_test_point(self, test_point, round_vals)
   1382 
   1383         return Series(
-> 1384             {RV.name: np.round(RV.logp(test_point), round_vals) for RV in self.basic_RVs},
   1385             name="Log-probability of test_point",
   1386         )

~/opt/anaconda3/lib/python3.8/site-packages/pymc3/model.py in <dictcomp>(.0)
   1382 
   1383         return Series(
-> 1384             {RV.name: np.round(RV.logp(test_point), round_vals) for RV in self.basic_RVs},
   1385             name="Log-probability of test_point",
   1386         )

~/opt/anaconda3/lib/python3.8/site-packages/pymc3/model.py in logp(self)
    415     def logp(self):
    416         """Compiled log probability density function"""
--> 417         return self.model.fn(self.logpt)
    418 
    419     @property

~/opt/anaconda3/lib/python3.8/site-packages/pymc3/model.py in fn(self, outs, mode, *args, **kwargs)
   1276         Compiled Theano function
   1277         """
-> 1278         return LoosePointFunc(self.makefn(outs, mode, *args, **kwargs), self)
   1279 
   1280     def fastfn(self, outs, mode=None, *args, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/pymc3/model.py in makefn(self, outs, mode, *args, **kwargs)
   1252         """
   1253         with self:
-> 1254             return theano.function(
   1255                 self.vars,
   1256                 outs,

~/opt/anaconda3/lib/python3.8/site-packages/theano/compile/function/__init__.py in function(inputs, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input)
    335         # note: pfunc will also call orig_function -- orig_function is
    336         #      a choke point that all compilation must pass through
--> 337         fn = pfunc(
    338             params=inputs,
    339             outputs=outputs,

~/opt/anaconda3/lib/python3.8/site-packages/theano/compile/function/pfunc.py in pfunc(params, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input, output_keys)
    522         inputs.append(si)
    523 
--> 524     return orig_function(
    525         inputs,
    526         cloned_outputs,

~/opt/anaconda3/lib/python3.8/site-packages/theano/compile/function/types.py in orig_function(inputs, outputs, mode, accept_inplace, name, profile, on_unused_input, output_keys)
   1968     try:
   1969         Maker = getattr(mode, "function_maker", FunctionMaker)
-> 1970         m = Maker(
   1971             inputs,
   1972             outputs,

~/opt/anaconda3/lib/python3.8/site-packages/theano/compile/function/types.py in __init__(self, inputs, outputs, mode, accept_inplace, function_builder, profile, on_unused_input, fgraph, output_keys, name)
   1582             # make the fgraph (copies the graph, creates NEW INPUT AND
   1583             # OUTPUT VARIABLES)
-> 1584             fgraph, additional_outputs = std_fgraph(inputs, outputs, accept_inplace)
   1585             fgraph.profile = profile
   1586         else:

~/opt/anaconda3/lib/python3.8/site-packages/theano/compile/function/types.py in std_fgraph(input_specs, output_specs, accept_inplace)
    186     orig_outputs = [spec.variable for spec in output_specs] + updates
    187 
--> 188     fgraph = FunctionGraph(orig_inputs, orig_outputs, update_mapping=update_mapping)
    189 
    190     for node in fgraph.apply_nodes:

~/opt/anaconda3/lib/python3.8/site-packages/theano/graph/fg.py in __init__(self, inputs, outputs, features, clone, update_mapping)
    160 
    161         for output in outputs:
--> 162             self.import_var(output, reason="init")
    163         for i, output in enumerate(outputs):
    164             self.clients[output].append(("output", i))

~/opt/anaconda3/lib/python3.8/site-packages/theano/graph/fg.py in import_var(self, var, reason)
    328         # Imports the owners of the variables
    329         if var.owner and var.owner not in self.apply_nodes:
--> 330             self.import_node(var.owner, reason=reason)
    331         elif (
    332             var.owner is None

~/opt/anaconda3/lib/python3.8/site-packages/theano/graph/fg.py in import_node(self, apply_node, check, reason)
    381                             "for more information on this error."
    382                         )
--> 383                         raise MissingInputError(error_msg, variable=var)
    384 
    385         for node in new_nodes:

MissingInputError: Input 0 of the graph (indices start from 0), used to compute InplaceDimShuffle{x}(group1_mean), was not provided and not given a value. Use the Theano flag exception_verbosity='high', for more information on this error.


%%time
# We need the pymc3 package since it is the toolkit to conduct probabilistic modeling.
# So we import pymc3 as pm, just the shorthand
import pymc3 as pm


# Prior for group mean. Null hypothesis that the posterior probability distribution of the two MLB players
# are exactly the same, versus the alternative hypothesis that the psoterior prob of the two MLB players
# are different from each other.
mu_m = batting_average['Bat Outcome'].mean()
mu_s = batting_average['Bat Outcome'].std() * 2

# Prior for group standard deviation
sigma_low = 0.05
sigma_high = 0.4

# Set up prior for 2 groups
with pm.Model() as model:
    # Normal prior for group mean k = 1,2
    # group 1 - yuli
    group1_mean = pm.Normal('group1_mean', mu = mu_m, sd = mu_s)
    # group 2 - vlad
    group2_mean = pm.Normal('group2_mean', mu = mu_m, sd = mu_s)
    
    # Uniform prior for group standard deviation - assuming we're not expert analyst for MLB
    group1_std = pm.Uniform('group1_std', lower = sigma_low, upper = sigma_high)
    group2_std = pm.Uniform('group2_std', lower = sigma_low, upper = sigma_high)
    
    # Exponential prior for degree of freedom - large sample
    # This will be used to calculate the effect size
    v = pm.Exponential('v_minus_one', 1/len(batting_average)) + 1
    
    # Transform the standard deviation to precision
    lambda_1 = group1_std ** -2
    lambda_2 = group2_std ** -2
    
    # Assign likelihood using T-distribution - which is when we use data, there should be little bit
    # longer tails than the normal distribution
    # You can see that we use the same prior for both drug and placebo population
    group1 = pm.StudentT("yuli", nu = v, mu = group1_mean, lam = lambda_1, observed = yuli)
    group2 = pm.StudentT("vlad", nu = v, mu = group2_mean, lam = lambda_2, observed = vlad)
    
    # Deterministic nodes for difference of means and standard deviations between two groups
    # These variables are not stochastic, so it just represents the same value of group mean and standard deviation
    # difference, and finally the effect size.
    diff_means = pm.Deterministic("difference of means", group1_mean - group2_mean)
    diff_stds = pm.Deterministic("difference of stds", group1_std - group2_std)
    # Formula for effect size
    effect_size = pm.Deterministic("effect size", diff_means / np.sqrt((group1_std ** 2 + group2_std **2) / 2))
    
    # Fit the model and evaluate the output as trace
    # The sample function runs 3000 iterations and returns a Trace object containing the samples 
    # collected in all chains, in the order they were collected.
    trace = pm.sample(3000, chains = 4, tune = 500)

<timed exec>:53: FutureWarning: In v4.0, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [v_minus_one, group2_std, group1_std, group2_mean, group1_mean]

Sampling 4 chains for 500 tune and 3_000 draw iterations (2_000 + 12_000 draws total) took 29 seconds.
There were 7 divergences after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.

CPU times: user 9.58 s, sys: 1.33 s, total: 10.9 s
Wall time: 40.9 s


# We can double-check for example, the number of samples for the mean batting average for Yuli.
len(trace['group1_mean'])

12000


# The first step of visualization comes with using the plot_posterior function. 
# It takes the trace object and show the posterior mean, standard deviation and degree of freedom of the model
pm.plot_posterior(
    trace,
    var_names=["group1_mean", "group2_mean", "group1_std", "group2_std", "v_minus_one"],
    color="#87ceeb",
    hdi_prob = 0.95
)

/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/arviz/data/io_pymc3.py:96: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(

array([[<AxesSubplot:title={'center':'group1_mean'}>,
        <AxesSubplot:title={'center':'group2_mean'}>,
        <AxesSubplot:title={'center':'group1_std'}>],
       [<AxesSubplot:title={'center':'group2_std'}>,
        <AxesSubplot:title={'center':'v_minus_one'}>, <AxesSubplot:>]],
      dtype=object)


# We move on to plot the posterior distribution of the difference in means, standard deviation and effect size of the two groups
pm.plot_posterior(trace,
                  var_names = ["difference of means", "difference of stds", "effect size"],
                  ref_val = 0,
                  color = "#87ceeb",
                 hdi_prob = 0.95)

# very small effect size - 2 groups do not have provide substantial evidence that their batting averages are different
# from each other.
# In general, if the percent goes down to 2.5% and go up to 97.5% (indicates 95% HDI)

/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/arviz/data/io_pymc3.py:96: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(

array([<AxesSubplot:title={'center':'difference of means'}>,
       <AxesSubplot:title={'center':'difference of stds'}>,
       <AxesSubplot:title={'center':'effect size'}>], dtype=object)


# Now we make a forest plot to see how much mean does the two groups differ
pm.forestplot(trace, 
          var_names = ["group1_mean", "group2_mean"],
             hdi_prob = 0.95)

<ipython-input-32-5cfd4cf075ec>:2: DeprecationWarning: The function `forestplot` from PyMC3 is just an alias for `plot_forest` from ArviZ. Please switch to `pymc3.plot_forest` or `arviz.plot_forest`.
  pm.forestplot(trace,
/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/arviz/data/io_pymc3.py:96: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(

array([<AxesSubplot:title={'center':'95.0% HDI'}>], dtype=object)


# We also want to see how much spread difference occurred between the two groups by comparing standard deviation
pm.forestplot(trace, 
          var_names = ["group1_std", "group2_std"],
             hdi_prob = 0.95)

<ipython-input-33-6f1012633019>:2: DeprecationWarning: The function `forestplot` from PyMC3 is just an alias for `plot_forest` from ArviZ. Please switch to `pymc3.plot_forest` or `arviz.plot_forest`.
  pm.forestplot(trace,
/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/arviz/data/io_pymc3.py:96: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(

array([<AxesSubplot:title={'center':'95.0% HDI'}>], dtype=object)


# Finally we look at the parameters in table-like view using az.summary
az.summary(trace)

/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/arviz/data/io_pymc3.py:96: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  warnings.warn(


# Appendix A: Draw 95% Posterior Credible Interval in seaborn

ax = sns.kdeplot(mid_batting_average)
# Let's add the title, the x label
ax.set(title = "95% credible interval of player's batting average", xlabel = "Batting average")
# and colors the entire vertical axis according to the 95% credible interval using the axvspan function.

upper_bound = stats.beta(90 + 130, 240 + 245).ppf(1)
lower_bound = stats.beta(90 + 130, 240 + 245).ppf(0.05)

ax.axvspan(lower_bound, upper_bound, alpha = 0.2)

ax.arrow(x = 0.42,                #x start point
         y = 16,                  #y start point
         dx = -0.08,              #change in x 
         dy = 0,                  #change in y
         head_width=0.7,          #arrow head width
         head_length=0.03,        #arrow head length
         width=0.005,             #arrow stem width
         fc='black',              #arrow fill color
         ec='black')              #arrow edge color
# Cool! We can finally add "95% plausible" as the text at somewhere right below the arrow using the 
# annotation function.
ax.annotate('95% plausible', xy=(0.3, 14))

Text(0.3, 14, '95% plausible')


# Appendix B: Another 95% Posterior Credible Interval

ax = sns.kdeplot(mid_batting_average)
# Let's add the title, the x label
ax.set(title = "95% credible interval of player's batting average", xlabel = "Batting average")
# and colors the entire vertical axis according to the 95% credible interval using the axvspan function.

upper_bound = stats.beta(90 + 130, 240 + 245).ppf(0.95)
lower_bound = stats.beta(90 + 130, 240 + 245).ppf(0)

ax.axvspan(lower_bound, upper_bound, alpha = 0.2)

ax.arrow(x = 0.37,                #x start point
         y = 16,                  #y start point
         dx = -0.04,              #change in x 
         dy = 0,                  #change in y
         head_width=0.7,          #arrow head width
         head_length=0.02,        #arrow head length
         width=0.005,             #arrow stem width
         fc='black',              #arrow fill color
         ec='black')              #arrow edge color
# Cool! We can finally add "95% plausible" as the text at somewhere right below the arrow using the 
# annotation function.
ax.annotate('95% plausible', xy=(0.3, 14))

Text(0.3, 14, '95% plausible')


# Appendix C: Another 95% Posterior Credible Interval

# You can actually plot the posterior distribution of batting average again in an ax object.

ax = sns.kdeplot(mid_batting_average)
# Let's add the title, the x label
ax.set(title = "95% credible interval of player's batting average", xlabel = "Batting average")
# and indicate the 95% credible interval, which is a range of values that represent 95% plausibility 
# based on the posterior distribution using the axvspan function. We can feed the lower bound and upper
# bound to the axvspan function so that it colors the entire vertical axis according to the 95% credible interval.
ax.axvspan(lower_bound, upper_bound, alpha = 0.2)
# I've been working with educational technology researchers for years, many of which loves to make
# annotations in graph to highlight significant content in tutorials. Just for those who loves using
# arrows for clarity in presentation, there is an arrow function that adds an arrow in a plot.
# I'm not going to detail each of the optional parameters in this lecture, but the general idea is
# customizing the starting point, the end point and the size of the arrow.
ax.arrow(x = 0.36,                #x start point
         y = 16,                  #y start point
         dx = -0.04,              #change in x 
         dy = 0,                  #change in y
         head_width=0.7,          #arrow head width
         head_length=0.01,        #arrow head length
         width=0.005,             #arrow stem width
         fc='black',              #arrow fill color
         ec='black')              #arrow edge color
# Cool! We can finally add "95% plausible" as the text at somewhere right below the arrow using the 
# annotation function.
ax.annotate('95% plausible', xy=(0.3, 14))
# Excellent! I hope you like this example which shows that if we add up the plausibility of all the values
# between 27.8% and 34.7% based on the posterior distribution, we've arrived at 95% plausibility. 
# In other words, given the player has made 130 base hits out of 375 attempts, we believe there's a 
# 95% chance that the player's true batting average rate is between 27.8% and 34.7%.

Text(0.3, 14, '95% plausible')


# Import the arviz package as az 
import arviz as az
# I won’t go into the detail of the arviz package at this point as that’s not the point of this tutorial. 
# Suffice to say, Arviz is a visualization package particularly focusing on offering a handful of visuals
# which supports Bayesian EDA.

# The narrowest credible interval possible to contain a given portion of the posterior density is of 
# particular significance. We call that interval the highest density interval (HDI). We can use the 
# hdi function in the arviz package, passing through the posterior distribution of batting average and
# 0.95 to indicate we want the 95% highest density interval.
az.hdi(mid_batting_average, hdi_prob=.95)

# Excellent! The shortest interval that captures 95% plausibility about the player's batting average
# is from 27.8% and 34.5%. Indeed, compared to a symmetric 95% credible interval (from 27.8% to 34.7%),
# the HDI is even shorter. It's hence true that the overall posterior density within the HDI is the 
# highest, which is exactly the reason why statisticians call that interval HDI.

array([0.28105725, 0.34846632])


# In Arviz, we can provide an array to the plot_posterior function to tell the program to plot a 
# 95% HDI with the specification of the hdi_prob option and set it as 0.95. Let's round the statistics
# to 3-digits and take a look!
az.plot_posterior(mid_batting_average, hdi_prob=0.95, round_to = 3).set(title = "95% HDI for player's batting average", xlabel = "Batting average")

# Cool! This graph shows the player is 95% plausible to hit 27.7% up to 34.5% of base hits and
# this interval is the shortest interval possible to cumulate 95% plausibility for the posterior distribution 
# of the player's batting average.

[Text(0.5, 1.0, "95% HDI for player's batting average"),
 Text(0.5, 0, 'Batting average')]

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
group1_mean	0.291	0.008	0.275	0.307	0.000	0.000	10305.0	8354.0	1.0
group2_mean	0.283	0.013	0.258	0.307	0.000	0.000	10232.0	8259.0	1.0
group1_std	0.399	0.001	0.398	0.400	0.000	0.000	8704.0	5718.0	1.0
group2_std	0.398	0.001	0.396	0.400	0.000	0.000	7482.0	4518.0	1.0
v_minus_one	3685.731	3134.258	100.158	9383.717	31.059	22.546	8426.0	6636.0	1.0
difference of means	0.008	0.016	-0.021	0.038	0.000	0.000	10142.0	8037.0	1.0
difference of stds	0.001	0.002	-0.002	0.004	0.000	0.000	10521.0	9373.0	1.0
effect size	0.021	0.040	-0.054	0.095	0.000	0.000	10136.0	7981.0	1.0

Bayesian Estimation Supersedes the T-test (BEST)¶

Question 1: What kind of question does BEST solve?¶

Question 2: What are the procedure of implementing BEST and diagnostics that BEST offer?¶

Question 3: Research Question¶

Refined research question:¶

Posterior Diagnosis¶

	player	year
0	Yuli Gurriel	2016
1	Yuli Gurriel	2016
2	Yuli Gurriel	2016
3	Yuli Gurriel	2016
4	Yuli Gurriel	2016