In this video, we're going to learn about Bayesian logistic regression. Bayesian models are attracting growing attention as a way to tackle complex modeling problems, and modeling data is at the heart of data science.
Logistic regression is a generalized linear model: it uses the same linear predictor as linear regression, but it regresses on the probability of a categorical outcome.
This time we're going to use Bayesian logistic regression to model binary win/lose outcomes in college basketball games organized by the National Collegiate Athletic Association (NCAA), a non-profit organization dedicated to offering high-quality, nationwide college-level sports competitions. So our research topic belongs to sports analytics, a rising field in data science. Other examples of Bayesian logistic regression include predicting the probability that an international student scores over 105 points on the TOEFL exam or 7.0 on the IELTS exam, given how many books the candidate has read, how many hours of sleep they get, and how many hours they spend on test preparation. So, just as in the last regression example, we need features, called independent variables, to provide useful information for predicting the outcome, called the dependent variable. The only big difference is that the outcome is no longer continuous but binary. Now, let's get started.
# In programming, we can import numpy and seaborn
import numpy as np
import seaborn as sns
# First of all, we can use np.linspace to create an equally spaced sequence from -5 to 5.
# I want the points to be close together, so I set the number of elements to 500
x = np.linspace(-5,5,500)
# The scatterplot function in seaborn allows us to specify the points on the x-axis and y-axis and plot
# their relationship.
sns.scatterplot(x = x, y = 1/(1+np.exp(-x))).set(title = "Logistic Function", xlabel = "Log odds", ylabel = "probability")
[Figure: "Logistic Function" — log odds (x-axis) vs probability (y-axis)]
Before coding, we need to learn a term called log odds. We define the odds as the ratio of success to failure: we divide the success probability by the failure probability (which is 1 minus the success probability) to get the odds. After that, we take the log of the odds to get the log odds. Assume I can get up before 8:00 AM 60% of the time, so the success probability is 0.6.
# and then the odds are 0.6 / (1 - 0.6)
odds = 0.6 / (1 - 0.6)
print("The odds of getting up before 8:00 AM:", odds)
# = 1.5, a value greater than 1.
The odds of getting up before 8:00 AM: 1.4999999999999998
# And if we take the log of it using the np.log function,
import numpy as np
print("The log odds of getting up before 8:00 AM:", np.log(odds)) # Log odds
# We get log(1.5) = 0.405, a positive number. It is good to know that whenever the probability
# of something happening is greater than 0.5, the log odds are positive, which indicates the
# odds are in favor of the event. Otherwise, if the log odds are negative, we say the
# odds are against the event.
# To reiterate, log odds = log(odds) = log(p / (1 - p)). Keep this quantity in mind because it is
# exactly the value that logistic regression predicts, for any regression coefficients and any
# data put into the model.
The log odds of getting up before 8:00 AM: 0.4054651081081642
# From the log odds, we can convert back to probability with the following computation.
# We exponentiate the log odds to recover the odds. Since odds = p / (1 - p), we have
# odds * (1 - p) = p, so odds = p * (1 + odds), and thus probability = odds / (1 + odds),
# or equivalently 1 / (1 + 1/odds).
print("Probability of getting up before 8:00 AM:", np.exp(np.log(odds)) / (1 + np.exp(np.log(odds))))
# Cool! We get back to the 0.6 probability. Now you can see how log odds and probability convert
# to each other.
Probability of getting up before 8:00 AM: 0.5999999999999999
# Bayesian logistic regression can generate prediction rules that
# allow you to predict the winners of March Madness tournament games based on a decision boundary.
decision_boundary = sns.scatterplot(x = x, y = 1/(1+np.exp(-x)))
decision_boundary.set(title = "Logistic Function", xlabel = "Log odds", ylabel = "probability")
decision_boundary.axvline(0)
# The decision boundary in Bayesian logistic regression is set to 0, because a log odds of 0 corresponds to
# a probability of 0.5, which is where the success and failure probabilities are equal.
# Left of the decision boundary, the model predicts losing the game: a failure.
# Right of the decision boundary, the model predicts winning the game: a success.
[Figure: logistic function with a vertical decision boundary line at log odds = 0]
So far we've seen how to convert a probability to odds and log odds. Let's take a break here and continue the code journey in the next video!
In the last video we discussed the concept of Bayesian logistic regression. This time we'll dive right into the dataset.
The NCAA is a collegiate sports organization that hosts amazing sporting events and competitions for college students each year. Some might not have heard of the NCAA, so I picked two readings that are useful for background knowledge.
Read: https://www.ncaa.org/student-athletes/current
Read: https://en.wikipedia.org/wiki/NCAA_Division_I
Every March, the U.S. becomes captivated by the NCAA Men's Basketball Tournament, also known as March Madness, in which 68 strong teams play a single-elimination tournament to fight for the national championship of NCAA men's basketball. Except for the 4 play-in games used to fill the final slots in the field of 64, the 64 teams are distributed across 4 regions. Each region has 16 teams seeded 1 to 16, depending on their AP rankings. To predict a match's result, we might want to look at features such as a team's AP ranking and its average points lost and gained throughout the season. On espn.com we can find many past cases where a lower-seeded underdog beat a higher-seeded team. We want to predict the winners of these NCAA tournament games using such predictors, and find out which factors matter when filling out a bracket (Bracketology).
So our research focuses on predicting the winning probability and deciding who will win each tournament game based on seed, average points lost, and average points gained. Because the result is binary (win or loss, nothing in between), we can set up a Bayesian logistic regression model to measure that probability.
# Now let's look at the data.
# The NCAA March Madness data is collected from https://data.world/sports/ncaa-mens-march-madness/workspace/file?filename=NCAA+Mens+March+Madness+Historical+Results.csv
# which records March Madness matches since 1985. It's an interesting sports-analytics dataset:
# I found that at least three competing research groups have been trying to dig out the best techniques
# for predicting outcomes from the several numerical features in the dataset.
# We can import the necessary packages for our data science work
import pymc3 as pm
import numpy as np
import theano.tensor as t
# Now, let's import pandas for data manipulation and import a web dataset that stores NCAA Men's March
# Madness historical results since the 1980s.
import pandas as pd
march_madness = pd.read_csv('https://query.data.world/s/3cmo5s4axblcl2bh3sjtivo3xrhbox')
# and let's take a look at the complete data
march_madness
|      | Date    | Round                 | Region  | Winning Seed | Winner             | Winning Score | Losing Seed | Loser              | Losing Score | Overtime |
|------|---------|-----------------------|---------|--------------|--------------------|---------------|-------------|--------------------|--------------|----------|
| 0    | 3/14/85 | Round of 64           | East    | 1            | Georgetown         | 68            | 16          | Lehigh             | 43           | NaN      |
| 1    | 3/14/85 | Round of 64           | East    | 4            | Loyola, Illinois   | 59            | 13          | Iona               | 58           | NaN      |
| 2    | 3/14/85 | Round of 64           | East    | 5            | Southern Methodist | 85            | 12          | Old Dominion       | 68           | NaN      |
| 3    | 3/14/85 | Round of 64           | East    | 8            | Temple             | 60            | 9           | Virginia Tech      | 57           | NaN      |
| 4    | 3/14/85 | Round of 64           | Midwest | 1            | Oklahoma           | 96            | 16          | North Carolina A&T | 83           | NaN      |
| ...  | ...     | ...                   | ...     | ...          | ...                | ...           | ...         | ...                | ...          | ...      |
| 2045 | 3/27/16 | Elite Eight           | East    | 1            | North Carolina     | 88            | 6           | Notre Dame         | 74           | NaN      |
| 2046 | 3/27/16 | Elite Eight           | Midwest | 10           | Syracuse           | 68            | 1           | Virginia           | 62           | NaN      |
| 2047 | 4/2/16  | National Semifinals   | NaN     | 1            | North Carolina     | 83            | 10          | Syracuse           | 66           | NaN      |
| 2048 | 4/2/16  | National Semifinals   | NaN     | 2            | Villanova          | 95            | 2           | Oklahoma           | 51           | NaN      |
| 2049 | 4/4/16  | National Championship | NaN     | 2            | Villanova          | 77            | 1           | North Carolina     | 74           | NaN      |

2050 rows × 10 columns
# Let's also look at its shape
march_madness.shape
(2050, 10)
We have a rich dataset recording every March Madness game since 1985, including 1024 first-round (Round of 64) games spanning more than 30 years. Although the data is far from comprehensive (it only contains the match outcomes, team seeds, and scores), we can try Bayesian logistic regression, take the outcome as the binary result, and see how accurate our predictions can get.
There are various interesting questions we could answer from our prior knowledge and from the data to gain insight into the NCAA tournament; more concretely, many people would like to make better Bracketology predictions and clinch the leaderboard. Okay, I listed a few frequently asked questions.
# We are going to encode the win and loss as categorical variables with the numbers 0 and 1.
# (Using .copy() on the column slices avoids pandas' SettingWithCopyWarning when we add columns below.)
round_of_64 = march_madness[march_madness['Round'] == 'Round of 64']
win = round_of_64[['Date', 'Round', 'Region', 'Winning Seed', 'Winning Score']].copy()
win['Seed'] = win['Winning Seed']
win['Score'] = win['Winning Score']
lose = round_of_64[['Date', 'Round', 'Region', 'Losing Seed', 'Losing Score']].copy()
lose['Seed'] = lose['Losing Seed']
lose['Score'] = lose['Losing Score']
win['Outcome'] = 1
lose['Outcome'] = 0
# We then concatenate the dataset
combined = pd.concat([win,lose])
# and save this pandas DataFrame to a csv file.
combined.to_csv('assets/march_madness.csv')
# Now let's import the data that specifically contains the Round of 64 games from past years.
round_of_64 = pd.read_csv('assets/march_madness.csv')
# and look at the first few rows.
round_of_64.head()
|   | Unnamed: 0 | Date    | Round       | Region  | Winning Seed | Winning Score | Seed | Score | Outcome | Losing Seed | Losing Score |
|---|------------|---------|-------------|---------|--------------|---------------|------|-------|---------|-------------|--------------|
| 0 | 0          | 3/14/85 | Round of 64 | East    | 1.0          | 68.0          | 1    | 68    | 1       | NaN         | NaN          |
| 1 | 1          | 3/14/85 | Round of 64 | East    | 4.0          | 59.0          | 4    | 59    | 1       | NaN         | NaN          |
| 2 | 2          | 3/14/85 | Round of 64 | East    | 5.0          | 85.0          | 5    | 85    | 1       | NaN         | NaN          |
| 3 | 3          | 3/14/85 | Round of 64 | East    | 8.0          | 60.0          | 8    | 60    | 1       | NaN         | NaN          |
| 4 | 4          | 3/14/85 | Round of 64 | Midwest | 1.0          | 96.0          | 1    | 96    | 1       | NaN         | NaN          |
# 1) Winning Seed in the Round of 64
# Seaborn is a very convenient visualization tool. Let's import seaborn.
import seaborn as sns
# Since all teams eligible for the tournament are seeded 1 to 16, we treat the team seed as a
# categorical, or discrete, variable. We can therefore use a histogram to visualize the frequency
# of winning by team seed.
sns.histplot(x = round_of_64['Winning Seed'], kde=True)
# Cool! The frequency decreases almost steadily from seed 1 to seed 16: first seeds advance to the
# second round most often, while 15th and 16th seeds advance the least.
[Figure: histogram of Winning Seed (count) with KDE overlay]
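As a quick numeric companion to the histogram (my own addition, not from the lecture), we can tabulate the same column:

# Count how many Round of 64 games each winning seed has won since 1985.
round_of_64['Winning Seed'].value_counts().sort_index()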
# 2) Losing Seed in the Round of 64
# It's always nice to check different features and look for other interesting patterns.
sns.histplot(x = round_of_64['Losing Seed'], kde = True)
# We can tell right away that first seeds lose least often and 16th seeds lose most often,
# because the losing teams are simply the opponents of the winning teams. Cool!
[Figure: histogram of Losing Seed (count) with KDE overlay]
# 3) Winning and Losing Score
# Let's now compare the winning and losing scores across the 1000+ Round of 64 games.
# We can draw two kdeplots using the kdeplot function to compare the distribution of winning scores
sns.kdeplot(x = round_of_64['Winning Score'])
# with the distribution of losing scores.
sns.kdeplot(x = round_of_64['Losing Score'])
# Looking at the graph, both distributions are quite symmetric, with the highest density at
# around 80 and 60, respectively. Could we then say there is a 20-point difference
# per game on average? If that were true, many people would be surprised!
[Figure: overlaid KDE plots of Winning Score and Losing Score (density)]
# 4) Winning Margin Distribution = Winning Score - Losing Score
# We can't plot the difference of the two columns directly. We first need to drop all non-applicable
# values (NAs) and convert each pd.Series into a one-dimensional np.array; only then do we take
# their difference. Let's see the plot.
# Since the winning margin is a whole number of points, a histogram with a KDE overlay is a natural choice.
sns.histplot(x = np.array(round_of_64['Winning Score'].dropna()) - np.array(round_of_64['Losing Score'].dropna()), kde = True)
# Great! By observation, I think the winning margin of an average game is better summarized by its
# median than by its mean, because a few dozen extremely lopsided games pull the average winning
# margin upward.
# The median winning margin should be around 10.
print(np.median(np.array(round_of_64['Winning Score'].dropna()) - np.array(round_of_64['Losing Score'].dropna())))
11.0
So we should expect the winning margin in a typical game to be approximately 11 points.
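To double-check the skew claim, here is a small sketch (my own addition, reusing the arrays above) comparing the mean and the median margins:

margins = np.array(round_of_64['Winning Score'].dropna()) - np.array(round_of_64['Losing Score'].dropna())
print("Mean margin:", margins.mean())        # pulled upward by the lopsided games
print("Median margin:", np.median(margins))  # 11.0, the robust summary we quoted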
# In a similar fashion, we start by defining a context manager and this time we call it NCAA_prediction_model.
with pm.Model() as NCAA_prediction_model:
    # To build the regression model, we treat everything as a random variable, since ultimately we want
    # the posterior probability distribution of the parameters, in this case the regression
    # coefficients.
    # Getting back to logistic regression, we need to specify a prior and a likelihood in order to
    # draw samples from the posterior distribution.
    # First, we need to understand what the initial effect (the intercept) means. Here it is the log odds
    # of a hypothetical team (with seed = 0 and score = 0) winning a first-round NCAA tournament game.
    # Although no team can score 0 points in a game or be seeded 0, the intercept still anchors the model.
    # From the data visualizations, we know that a larger seed number indicates a lower chance of
    # winning, while scoring more points leads to a higher probability of winning. Since the seed and
    # score effects pull in opposite directions, we tell the model we're highly unsure about the initial
    # effect by setting the mu parameter to 0 and sd to 3, a large value on the log odds scale.
    intercept = pm.Normal("intercept", mu=0, sd = 3)
    # Specify the seed effect: recall that the higher the team's seed number, the higher the probability
    # of losing the game. We can therefore quantify the diminishing probability as the seed number
    # increases by setting a negative effect on the log odds. I initialize the mean at -0.2 because I
    # think moving from the top seed to the 16th seed should move the winning probability from roughly
    # 90% down to roughly 10%. Some back-of-the-envelope math is implied here; just take it as a value
    # that encodes decreasing probability as the team's seed increases.
    seed = pm.Normal("seed", mu=-0.2, sd = 3)
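    # (A rough check of that 90%-to-10% claim, not spelled out in the lecture: the log odds of
    # 0.9 is log(0.9/0.1) ≈ 2.2 and of 0.1 is ≈ -2.2, so going from seed 1 to seed 16 spans
    # about -4.4 log odds over 15 seed steps, i.e. roughly -0.3 per seed. A prior mean of -0.2
    # is the same order of magnitude.)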
    # Specify the score effect: recall that the more points scored, the higher the probability of
    # winning the game. Therefore, we can quantify the increasing probability as the team's score rises
    # by setting a positive effect on the log odds in the mu option. I choose to initialize at 0.03
    # because I think the change in log odds per point is small but not negligible. Since one point is
    # expected to have only a minor effect on the winning probability, I also set a smaller value for
    # the sd option, the standard deviation of the score effect, at 0.5. Cool!
    score = pm.Normal('score', mu= 0.03, sd = 0.5)
    # Likelihood: in logistic regression, we define the regression effect on the log odds scale.
    # We denote by z the log odds of winning the Round of 64 game: the intercept, plus the seed
    # number times the seed effect, plus the points scored times the score effect.
    z = intercept + seed * round_of_64['Seed'] + score * round_of_64['Score']
    # Finally, we convert the regression effect to a probability with the (sigmoid) transformation:
    # divide 1 by 1 plus the exponential of the negative log odds. This is the predicted probability
    # of the team winning the game, so this quantity is the probability parameter of the pm.Bernoulli
    # likelihood. We then pass the observed data, the win/lose Outcome column from the Round of 64
    # competition data.
    yhat = pm.Bernoulli("yhat", 1 / (1 + t.exp(-z)), observed=round_of_64['Outcome'])
    # Since we're modeling success/failure outcomes, a kind of binary data, we set a higher number
    # of posterior samples to ensure a good chance of obtaining reliable, minimally autocorrelated
    # posterior samples for meaningful posterior diagnostics and interpretation. We run three chains,
    # and each chain discards its first 2000 samples as tuning.
    trace = pm.sample(10000, chains = 3, tune = 2000)
# Cool!
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (3 chains in 4 jobs) NUTS: [score, seed, intercept]
Sampling 3 chains for 2_000 tune and 10_000 draw iterations (6_000 + 30_000 draws total) took 72 seconds.
The sampling took over a minute to complete, noticeably longer than our linear regression examples. That's because logistic regression is more complex to compute: a link function underlies the model, which distinguishes it from the linear model, where no link function is needed.
# Traceplot
# We can import the arviz package for posterior diagnostics.
import arviz as az
# Let's make a trace plot to get a glimpse of the distribution and variability of the posterior samples.
az.plot_trace(trace)
# Cool! We see that the initial effect of a team winning a game is around -3 in log odds, based on
# the highest posterior density. That is well below a 0.5 probability, so a team that scored no
# points at all tends to lose. That makes sense, right?!
# In the seed row, we see that each additional seed number reduces the log odds of winning the
# game by around 0.3 units, based on the highest posterior density.
# In the score row, we see that each additional point scored boosts the log odds of winning
# the game by around 0.08, based on the highest posterior density.
# In the right panel, the chains show similar fluctuation behaviour. We don't see any
# special patterns indicating that one chain is off from the others.
[Figure: trace plots (posterior density and sample traces) for intercept, seed, and score]
# Cool! Now let's dive deeper into the posterior statistics.
# We can build a tabular summary of posterior statistics using pm.summary(), and since we're more
# accustomed to the 95% level, let's also set the hdi_prob option to 0.95.
pm.summary(trace, hdi_prob = 0.95)
# Now we see the posterior statistics: the posterior mean, the posterior standard deviation, and the
# 95% posterior credible intervals for the initial effect and for the effects of the team's seed and
# total score, built from the hdi_2.5% and hdi_97.5% values.
# Based on the table, the 95% credible intervals are:
# 1. [-3.670, -2.064] for the initial effect: we are 95% confident that the baseline log odds of
# winning a Round of 64 game (at seed = 0 and score = 0) lie between -3.670 and -2.064.
# 2. [-0.327, -0.267] for the seed effect: we are 95% confident that each additional seed number
# decreases the log odds of winning by between 0.267 and 0.327 units. In other words, we're more than
# 95% sure that a lower-seeded (higher seed number) team is more likely to lose the game.
# 3. [0.067, 0.089] for the score effect: we are 95% confident that each additional point scored
# increases the log odds of winning by between 0.067 and 0.089 units. In other words, we're more than
# 95% sure that each extra point scored in a game is associated with a higher winning probability.
# Although converting these effects into probabilities takes a bit more work than just reading
# pm.summary, the regression effects make good sense. A lower-seeded team having a lower probability
# of winning corroborates the histogram of winning frequency against team seed, and scoring more
# points increasing the winning probability corroborates the common sense that the team with more
# points wins the game.
# From the table we also see that r_hat is 1.0, which means all posterior chains converged well.
# We can also look at the effective sample size in the ess_bulk column. The effective samples for
# each variable are very high, far above the rule-of-thumb threshold of 200, so the posterior
# estimates are reliable.
|           | mean   | sd    | hdi_2.5% | hdi_97.5% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|-----------|--------|-------|----------|-----------|-----------|---------|----------|----------|-------|
| intercept | -2.867 | 0.409 | -3.670   | -2.064    | 0.004     | 0.003   | 8733.0   | 10679.0  | 1.0   |
| seed      | -0.297 | 0.016 | -0.327   | -0.267    | 0.000     | 0.000   | 14153.0  | 13681.0  | 1.0   |
| score     | 0.077  | 0.006 | 0.067    | 0.089     | 0.000     | 0.000   | 9012.0   | 11694.0  | 1.0   |
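As an optional extra (my own addition, not from the lecture), we can exponentiate the posterior mean effects into odds ratios, which some people find easier to read than log odds:

# Odds ratios from the posterior means in the table above.
print("Odds ratio per extra seed:", np.exp(-0.297))   # ≈ 0.74: each extra seed number multiplies the winning odds by ~0.74
print("Odds ratio per extra point:", np.exp(0.077))   # ≈ 1.08: each extra point multiplies the winning odds by ~1.08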
# Next, we can draw 95% highest posterior density plots with a reference value of 0 to test whether
# the 95% posterior credible intervals of the seed and score effects clearly indicate a change in
# winning probability. Let's use the az.plot_posterior function, with ref_val set to 0 as the
# reference point.
az.plot_posterior(trace, hdi_prob = 0.95, ref_val= 0)
# It's nice to see that all three 95% highest posterior density plots show a 100% label in orange.
# What does that mean? 100% of the posterior mass lies on one side of the reference value 0, so the
# team's seed and game score each clearly affect the winning probability. Both predictors have good
# predictive power and help explain which team will win a Round of 64 game.
[Figure: 95% HDI posterior plots for intercept, seed, and score, each with ref_val = 0]
# Now let's check the autocorrelation of the posterior samples to see if there are special patterns.
az.plot_autocorr(trace)
# The autocorrelation in each plot shrinks steadily to 0 very quickly.
# This is a good sign that the posterior estimates from our simulation are reliable, confirming the
# earlier diagnostics.
[Figure: autocorrelation plots for intercept, seed, and score across the three chains]
# Let's create an array of scores from a very low 25 points up to a maximum of 150.
score = np.arange(25,151)
# We take the case of a first-seed team and hold the seed constant. We substitute the posterior means
# of the intercept, seed effect, and score effect to compute the log odds of a first-seed team winning
# the game when scoring 25 to 150 points.
log_odd = -2.867 + 1 * -0.297 + score * 0.077
# Now plot a curve showing how the winning probability changes with the points scored.
sns.scatterplot(x = score, y = 1/(1+np.exp(-log_odd))).set(title = "Winning Probability vs Score",
                                                           xlabel = "Score", ylabel = "probability")
# Holding the first seed fixed, scoring around 41 points already gives a predicted winning probability
# of 0.5, so score ≈ 41 is the decision boundary for top-seeded teams.
# The boundary moves higher for lower-seeded teams because of the negative slope on the
# seed value.
# With the same logic, you can compute the logistic curve for second-seed teams, third, fourth, etc.
# Do you know how to change the function?
[Figure: "Winning Probability vs Score" scatterplot for a first-seed team]
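We can also solve for this boundary directly instead of reading it off the plot. Setting the log odds to zero and rearranging gives boundary_score = (2.867 + 0.297 × seed) / 0.077. Here is a quick sketch (my own addition, using the posterior means from the summary table):

# Solve log_odd = 0 for the score at a few different seeds.
for s in [1, 2, 3, 4]:
    boundary = (2.867 + 0.297 * s) / 0.077
    print(f"Seed {s}: predicted 50/50 score is about {boundary:.0f} points")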
Logistic regression sets the decision boundary at probability 0.5. A probability above this critical value leads the model to predict success; below it, failure. Now let's create a plot from the trace statistics to predict winning probability.
# First let's plot the seed effect. We create an integer sequence from 1 to 16.
seed = np.arange(1,17)
# Let's assume a team ends up scoring 80 points in a tournament game. We hold the 80 points constant
# and substitute all the effects into the formula to compute the log odds of winning the game.
log_odd = -2.867 + seed * -0.297 + 80 * 0.077
# Now plot a curve showing how the winning probability changes with the team's seed.
sns.scatterplot(x = seed, y = 1/(1+np.exp(-log_odd))).set(title = "Winning Probability vs Seed", xlabel = "Seed", ylabel = "probability")
# Holding the score fixed at 80 points, we see that even the eleventh seed has a predicted winning
# probability just above 0.5, and is therefore predicted to win.
# The decision boundary is therefore at seed = 11: teams seeded worse than 11 (seeds 12 to 16) are
# predicted to lose the majority of the time, given that they score 80 points.
[Figure: "Winning Probability vs Seed" scatterplot at a fixed score of 80]
# We can take the posterior means of the intercept, seed, and score variables from the trace and
# combine them with the dataset values. This gives the predicted log odds for each data point.
round_of_64['log_odds'] = trace['intercept'].mean() + round_of_64['Seed'] * trace['seed'].mean() + trace['score'].mean() * round_of_64['Score']
round_of_64.shape
(2048, 12)
round_of_64.head()
|   | Unnamed: 0 | Date    | Round       | Region  | Winning Seed | Winning Score | Seed | Score | Outcome | Losing Seed | Losing Score | log_odds  |
|---|------------|---------|-------------|---------|--------------|---------------|------|-------|---------|-------------|--------------|-----------|
| 0 | 0          | 3/14/85 | Round of 64 | East    | 1.0          | 68.0          | 1    | 68    | 1       | NaN         | NaN          | 2.092105  |
| 1 | 1          | 3/14/85 | Round of 64 | East    | 4.0          | 59.0          | 4    | 59    | 1       | NaN         | NaN          | 0.504170  |
| 2 | 2          | 3/14/85 | Round of 64 | East    | 5.0          | 85.0          | 5    | 85    | 1       | NaN         | NaN          | 2.216790  |
| 3 | 3          | 3/14/85 | Round of 64 | East    | 8.0          | 60.0          | 8    | 60    | 1       | NaN         | NaN          | -0.608074 |
| 4 | 4          | 3/14/85 | Round of 64 | Midwest | 1.0          | 96.0          | 1    | 96    | 1       | NaN         | NaN          | 4.256729  |
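Optionally (my own addition, with a hypothetical column name win_prob), we can convert these log odds into predicted winning probabilities using the sigmoid from video 1:

# Convert predicted log odds to winning probabilities.
round_of_64['win_prob'] = 1 / (1 + np.exp(-round_of_64['log_odds']))
round_of_64[['Seed', 'Score', 'Outcome', 'win_prob']].head()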
Awesome! In this video, we've built a Bayesian logistic regression model. We've run diagnostics with the trace plot and the autocorrelation plot, and plotted how the winning probability changes as we increase or decrease certain variables. One good takeaway is that after obtaining the trace object, you should check the convergence diagnostics (r_hat, effective sample size, and autocorrelation) before interpreting the posterior.
In the next video, we'll continue exploring the predictive power of Bayesian logistic regression.
In this video, we'll explore the predictive ability of Bayesian logistic regression on NCAA games. Starting from the dataset to which we've added the predicted log odds, we can find out how accurately our Bayesian logistic regression model predicts the outcome of NCAA games.
In logistic regression, we make decisions with a very straightforward rule: predict a win when the log odds are greater than 0, and a loss when they are less than 0.
Note that a log odds of 0 means exactly probability = 0.5, which you can verify simply by looking back at video 1.
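A one-line check of that claim (my own addition):

import numpy as np
print(1 / (1 + np.exp(-0.0)))   # a log odds of 0 maps to probability 0.5
print(np.log(0.5 / (1 - 0.5)))  # and probability 0.5 maps back to log odds 0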
In terms of metrics, let's define True Positive, True Negative, False Positive, and False Negative based on the decision boundary log_odds = 0. If you haven't heard of these four terms before:
There is a wikipedia reading if you are interested: https://en.wikipedia.org/wiki/Precision_and_recall
# So we can use these concepts to compute the true positives: games where the log odds are positive and the outcome is 1
true_positive = round_of_64[(round_of_64['Outcome'] == 1) & (round_of_64['log_odds'] > 0)].shape[0]
# and print the number of true positive outcomes
print("True Positive:", true_positive)
# Same for true negatives, false positives, and false negatives.
true_negative = round_of_64[(round_of_64['Outcome'] == 0) & (round_of_64['log_odds'] < 0)].shape[0]
print("True Negative:", true_negative)
false_positive = round_of_64[(round_of_64['Outcome'] == 0) & (round_of_64['log_odds'] > 0)].shape[0]
print("False Positive:", false_positive)
false_negative = round_of_64[(round_of_64['Outcome'] == 1) & (round_of_64['log_odds'] < 0)].shape[0]
print("False Negative:", false_negative)
# So there are 803 true positives, 813 true negatives, 211 false positives, and 221 false negatives overall.
# That means the model's predictions correctly identify around 80% of winning and losing competitors.
True Positive: 803 True Negative: 813 False Positive: 211 False Negative: 221
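The lecture doesn't compute the overall accuracy explicitly, but a quick sketch (my own addition) backs up the 80% figure:

# Accuracy = correct predictions over all predictions.
accuracy = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)
print("Accuracy:", accuracy)  # (803 + 813) / 2048 ≈ 0.789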
Based on the true positives, true negatives, false positives, and false negatives, we have enough information to judge the quality of the model's predictions. If we care more about predicting a winning team accurately, and want to avoid cases where we misleadingly bet on a team winning a game that it ends up losing, then we should compute the precision: the number of true positives compared against all the positive predictions we've made.
$\text{precision} = \frac{\text{True Positive}}{\text{True Positive} + \text{False Positive}}$
But if we care more about minimizing the cases where we predict a loss when it's in fact a win, then we should compute the recall: the number of true positives compared against all the predictions that should have been positive.
$\text{recall} = \frac{\text{True Positive}}{\text{True Positive} + \text{False Negative}}$
Finally, for classification, the F-1 score balances precision and recall so that both concerns are treated as equally important. The F-1 score is 2 times the product of precision and recall, divided by the sum of precision and recall.
$F_1 = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$
# Now we've learned the concept and the formula. Let's compute precision, recall, and F1 score
precision = true_positive / (true_positive + false_positive)
# and print it out
print("Precision:", precision)
recall = true_positive / (true_positive + false_negative)
print("Recall:", recall)
# The F-1 score is the harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall)
print("F-1 Score:", f1_score)
# Cool! If we knew the team seed and the final score beforehand, we could get about 80% of the game
# predictions correct. We can't use the model for 2021 since that tournament is already over, but a
# natural idea is to use it to predict next year's NCAA March Madness in 2022, assuming the scoring
# patterns of winning and losing teams stay near the averages from 30-plus years of historical data.
# Question: what if we don't know anything except past statistics about the teams?
# How well could we predict the game results before watching the game?
Precision: 0.7919132149901381 Recall: 0.7841796875 F-1 Score: 0.7880274779195289
There are still many aspects of Bayesian regression research to explore on these wonderful topics. So far we've discussed linear regression for TED talk prediction and logistic regression for NCAA winning probabilities and results, interpreting the posterior distribution in each case. On the prediction side, we've used metrics such as precision, recall, and the F-1 score; no single metric is absolutely more useful than the others. There are still many Bayesian methods to explore in the future. Alright, let's move on to hypothesis testing in the next video! See you!
There are many ways to select an appropriate decision boundary, a few of which were covered earlier in this course. Now let's take a look at the concept of an inverse link function.
The inverse link function is applied to the output of the linear regression equation. Think back to our linear regression equation, y equals alpha plus beta times x.
Here x is our input variable. All we're doing now is applying a function f to that output y and fitting the result to the observed dataset. Here f is called the inverse link function; the term "inverse" refers to the fact that the function is applied to the right-hand side of the equation. Seen this way, in a plain linear equation the inverse link function is simply the identity function.
In the case of the linear regression model, the value y at any point x is modeled as the mean of a normal distribution centered at that point. The error, computed as the difference between the true value and the estimated value, is modeled with a normal distribution, which we usually parameterize by its standard deviation. Now think about scenarios where a Gaussian is not an appropriate model.
Classification problems are a perfect example of a scenario where the error in predicting output classes or categories is not modeled well by a Gaussian or normal distribution. As a result, we would like to convert the output $y = \alpha + \beta x$, which is what we get in a linear regression problem, to some other range of values more appropriate to the problem being modeled, which is exactly what the link function typically does.
Now let's go ahead and define the logistic function.
Think back to our first course, where you visualized the logistic function using the scipy.special library. As you can see, the logistic function takes the x-axis value and maps it through one over one plus the exponential of negative x. This is also called the sigmoid function, and the plot shows its defining property: it takes any value of x and maps it to a value between 0 and 1. Essentially, the output is restricted to the range 0 to 1.
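As a refresher, here is a minimal sketch of that visualization (my own addition, assuming scipy is installed; expit is scipy's name for the logistic function):

import numpy as np
import seaborn as sns
from scipy.special import expit  # expit(x) = 1 / (1 + exp(-x))

x = np.linspace(-6, 6, 200)
sns.scatterplot(x = x, y = expit(x)).set(title = "Logistic (sigmoid) function", xlabel = "x (log odds)", ylabel = "probability")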
Logistic regression problem:
Predictors: time spent studying + time spent sleeping + number of books studied
Outcome: Prob(IELTS Test Overall Score >= 7)
Sample: 1500 candidates who took a survey answering the 3 questions + their British Council score reports
Research method: call an overall IELTS score >= 7 a high score, and model the probability of achieving it.
Now theta is not generated from a beta distribution; instead, it is defined by a linear model with the logistic function as the inverse link:
log(high-score / (1 - high-score)) = initial-effect + effect-study × study + effect-sleep × sleep + effect-book × book (unlike linear regression, there is no separate error term here: the randomness comes from the Bernoulli outcome)
Logarithm is the inverse of exponential function. Exponential function is a function with e as base. e is a number approximately equal to 2.71. log(exp(1)) = 1; exp(log(1)) = 1
Recap of linear regression: y = initial-effect + effect-study × study + effect-sleep × sleep + effect-book × book + error term (homoscedastic error)
Interpretation 1: the larger the log(odds) of achieving a high score, the higher the probability that the candidate achieves a high score.
At this point, we can't directly tell how large the probability is, but we can transform the log odds into probability.
If log(odds) = 0 -> log(p / (1 - p)) = 0 -> exp(log(p / (1 - p))) = exp(0) -> p / (1 - p) = 1. What's the corresponding probability?
p / (1 - p) = 1 -> p = 1 - p -> 2p = 1 -> p = 1/2 -> log(odds) = 0 is the decision boundary.
If our model result has log(odds) > 0, we predict a success. If our model result has log(odds) < 0, we predict the outcome as a failure.
Transformation:
We can obtain log odds directly from regression.
log(high-score / (1 - high-score)) = initial-effect + effect-study × study + effect-sleep × sleep + effect-book × book
exp(log(high-score / (1 - high-score))) = exp(initial-effect + effect-study × study + effect-sleep × sleep + effect-book × book)
odds = high-score / (1 - high-score) = exp(initial-effect + effect-study × study + effect-sleep × sleep + effect-book × book)
Let exp(...) = a; then high-score / (1 - high-score) = a (the odds)
high-score = a × (1 - high-score) = a - a × high-score
high-score × (1 + a) = a -> high-score = a / (1 + a) = exp(...) / (1 + exp(...))
Probability = exp(something) / (1 + exp(something)), which must be some value between 0 and 1. Dividing numerator and denominator by exp(something): Probability = 1 / (1 + exp(-something))
Probability = exp(log(odds)) / (1 + exp(log(odds))) = odds / (1 + odds)
From the regression result, we can transform the log odds into a probability by applying this logistic (sigmoid) function: probability = 1 / (1 + exp(-log(odds))).
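As a minimal sketch (the helper names are my own), here are the two conversions side by side:

import numpy as np

def log_odds_to_prob(z):
    # inverse link: p = 1 / (1 + exp(-z))
    return 1 / (1 + np.exp(-z))

def prob_to_log_odds(p):
    # the logit: log(p / (1 - p))
    return np.log(p / (1 - p))

print(log_odds_to_prob(0.0))                     # 0.5, the decision boundary
print(prob_to_log_odds(0.6))                     # ≈ 0.405, the get-up-before-8:00-AM example
print(log_odds_to_prob(prob_to_log_odds(0.9)))   # 0.9, a clean round trip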