Logistic regression is a type of regression analysis used to estimate the probability of a certain event occurring. It is best suited to cases where the dependent variable is categorical and can take only discrete values.
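Under the hood, a logistic model passes a linear combination of the predictors through the sigmoid (logistic) function, which squashes any real number into the interval (0, 1) so the output can be read as a probability. Here is a minimal sketch of that mapping; the intercept and slope values below are made up purely for illustration.

import numpy as np

def sigmoid(z):
    # The logistic function: maps any real number into (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

# Hypothetical intercept and slopes for two predictors (illustrative values only)
intercept = -0.3
beta = np.array([0.5, 1.4])
x = np.array([1.0, 0.0])            # one observation's predictor values

# Predicted probability of the event for this observation
probability = sigmoid(intercept + x @ beta)
print(probability)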
# Step 1: Load necessary packages and import dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
placement = pd.read_csv('/Users/michiganboy/Documents/course2/assets/placement_data_full_class.csv')
placement.head()
  | sl_no | gender | ssc_p | ssc_b | hsc_p | hsc_b | hsc_s | degree_p | degree_t | workex | etest_p | specialisation | mba_p | status | salary
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | M | 67.00 | Others | 91.00 | Others | Commerce | 58.00 | Sci&Tech | No | 55.0 | Mkt&HR | 58.80 | Placed | 270000.0 |
1 | 2 | M | 79.33 | Central | 78.33 | Others | Science | 77.48 | Sci&Tech | Yes | 86.5 | Mkt&Fin | 66.28 | Placed | 200000.0 |
2 | 3 | M | 65.00 | Central | 68.00 | Central | Arts | 64.00 | Comm&Mgmt | No | 75.0 | Mkt&Fin | 57.80 | Placed | 250000.0 |
3 | 4 | M | 56.00 | Central | 52.00 | Central | Science | 52.00 | Sci&Tech | No | 66.0 | Mkt&HR | 59.43 | Not Placed | NaN |
4 | 5 | M | 85.80 | Central | 73.60 | Central | Commerce | 73.30 | Comm&Mgmt | No | 96.8 | Mkt&Fin | 55.50 | Placed | 425000.0 |
Here our data has 215 records and 15 columns in total. Let's now explore the data visually.
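A quick way to confirm the size of the dataset and spot missing values is to inspect its shape and null counts; a minimal sketch:

# Confirm the number of records/columns and check for missing values
print(placement.shape)               # expected (215, 15)
print(placement.isnull().sum())      # salary is NaN for candidates who were not placed (see row 3 above)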
# Step 2: Data Visualization
# We can draw a pairplot for the placement data with two facets - status Placed and Not Placed
sns.pairplot(placement, hue = 'status')
UserWarning: Dataset has 0 variance; skipping density estimate.
# And draw a histogram indicating Placed vs Not Placed
sns.histplot(placement['status'])
# We can visually inspect whether gender affects salary by drawing a kernel density plot
sns.kdeplot(data = placement, x = 'salary', hue = 'gender')
# The graph shows that:
# 1. Most of the candidates (who scored around 60 percent) got a decent package of around 3 lakhs per annum
# 2. Not many candidates received a salary of more than 4 lakhs per annum
# We also want to understand the correlations among the numerical variables.
# We can draw a heatmap using the sns.heatmap function.
# Set the figure size so that the characters are clear
plt.figure(figsize = (10,6))
sns.heatmap(placement.corr(), annot = True, cmap="YlGnBu")
# We also want to draw box plots to see how each score column relates to placement status and gender
numerical_col = placement[['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']]
plt.figure(figsize = (16,4))
for i in range(len(numerical_col.columns)):
    plt.subplot(1, len(numerical_col.columns), i+1)
    sns.boxplot(x = 'status', y = numerical_col.columns[i], hue = 'gender', data = placement)
# Compute the Variance Inflation Factor (VIF) - observe whether multicollinearity exists in the dataset
from statsmodels.stats.outliers_influence import variance_inflation_factor
VIF = pd.DataFrame()
VIF['Features'] = numerical_col.columns
VIF['VIF_value'] = [variance_inflation_factor(numerical_col.values, i) for i in range(len(numerical_col.columns))]
# Wow! 112 is an extremely high value in terms of VIF! Does that indicate a high possibility of having
# multicollinearity among the 5 features? Yes!
# Visualize VIF
# Learn something: the kind of plot to produce:
#   'line'    : line plot (default)      'bar'    : vertical bar plot     'barh' : horizontal bar plot
#   'hist'    : histogram                'box'    : boxplot               'kde'  : kernel density estimation plot
#   'density' : same as 'kde'            'area'   : area plot             'pie'  : pie plot
#   'scatter' : scatter plot             'hexbin' : hexbin plot
VIF.plot(kind = 'bar', y = 'VIF_value', x = 'Features')
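One way to probe where the multicollinearity comes from is to drop one of the correlated score columns and recompute the VIFs. The sketch below drops mba_p, but that choice is purely illustrative.

# Sketch: recompute VIF after dropping one score column (mba_p is an illustrative choice only)
reduced = numerical_col.drop(columns = ['mba_p'])
VIF_reduced = pd.DataFrame()
VIF_reduced['Features'] = reduced.columns
VIF_reduced['VIF_value'] = [variance_inflation_factor(reduced.values, i) for i in range(len(reduced.columns))]
print(VIF_reduced)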
# Step 3 - Data Preprocessing
# First, we can obtain all categorical data columns for encoding
cat_cols = []
# Write a for loop to extract all categorical columns - let them become dummies
for i in placement.columns:
    if type(placement[i][1]) == str:
        cat_cols.append(i)
# Show the categorical columns
cat_cols
# Those are gender, ssc_b, hsc_b, hsc_s, degree_t, workex, specialisation, and status:
['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']
# The get_dummies function in pandas returns one column per category, with 1 if the row belongs to that category and 0 if not.
# Create new columns that record the dummy variables - update the dataset each time using concat
pd.get_dummies(placement['gender'], drop_first = True)
# for i in cat_cols:
#     placement = pd.concat([placement, pd.get_dummies(placement[i])], axis = 1)
  | M
---|---
0 | 1 |
1 | 1 |
2 | 1 |
3 | 1 |
4 | 1 |
... | ... |
210 | 1 |
211 | 1 |
212 | 1 |
213 | 0 |
214 | 1 |
215 rows × 1 columns
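If we wanted to dummy-encode every categorical column at once rather than one by one, a sketch of the loop hinted at above could look like the following. It works on a copy so the original placement frame is untouched, and note that it would also encode status, which in practice we keep separate as the target.

# Sketch: dummy-encode all categorical columns on a copy of the data
encoded = placement.copy()
for col in cat_cols:
    dummies = pd.get_dummies(encoded[col], prefix = col, drop_first = True)
    encoded = pd.concat([encoded.drop(columns = [col]), dummies], axis = 1)
encoded.head()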
# Although we can generate graphs for each factor we are interested in, status, gender and workex
# are still categorical, so we can't use them directly in a model unless we assign them numeric values,
# such as 0 for female and 1 for male candidates. This can be achieved by label encoding.
# Since this is our first time using a label encoder, we import sklearn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
# It's a one-line encoder that transforms binary categorical variables into 0 and 1.
# We can apply the label encoder to each category that has only two classes.
# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
# We create a list of categorical variables that we're going to transform
object_cols = ['gender', 'workex', 'status']
# and write a for loop for col in all object columns,
for col in object_cols:
    # We call the label_encoder to transform the object column into 0-1 binary form
    placement[col] = label_encoder.fit_transform(placement[col])
# and now we see the data again.
placement.head()
# Now the gender variable, the work experience variable, and the status variable
# are labeled zero and one; they have all become binary.
  | sl_no | gender | ssc_p | ssc_b | hsc_p | hsc_b | hsc_s | degree_p | degree_t | workex | etest_p | specialisation | mba_p | status | salary
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 1 | 67.00 | Others | 91.00 | Others | Commerce | 58.00 | Sci&Tech | 0 | 55.0 | Mkt&HR | 58.80 | 1 | 270000.0 |
1 | 2 | 1 | 79.33 | Central | 78.33 | Others | Science | 77.48 | Sci&Tech | 1 | 86.5 | Mkt&Fin | 66.28 | 1 | 200000.0 |
2 | 3 | 1 | 65.00 | Central | 68.00 | Central | Arts | 64.00 | Comm&Mgmt | 0 | 75.0 | Mkt&Fin | 57.80 | 1 | 250000.0 |
3 | 4 | 1 | 56.00 | Central | 52.00 | Central | Science | 52.00 | Sci&Tech | 0 | 66.0 | Mkt&HR | 59.43 | 0 | NaN |
4 | 5 | 1 | 85.80 | Central | 73.60 | Central | Commerce | 73.30 | Comm&Mgmt | 0 | 96.8 | Mkt&Fin | 55.50 | 1 | 425000.0 |
We will predict whether a candidate is placed in a job based on their gender, employability test score, and prior work experience. The dependent variable here is the binary variable status, which is expected to take strictly one of two values: placed or not placed.
# Step 4: Create Logistic Regression Model
# Defining the independent variables as X and dependent variable as y
X = placement[['gender', 'etest_p', 'workex']]
y = placement[['status']]
# Build a logistic regression model and fit the data
log_reg = sm.Logit(y, X).fit()
# In the output, 'Iterations' = 6 means the optimizer took 6 iterations to converge
# while fitting the model.
Optimization terminated successfully.
Current function value: 0.574354
Iterations 6
# We can print the summary table of the logistic regression, which gives us a descriptive
# summary about the regression results.
print(log_reg.summary())
Logit Regression Results
==============================================================================
Dep. Variable: status            No. Observations: 215
Model: Logit                     Df Residuals: 212
Method: MLE                      Df Model: 2
Date: Tue, 07 Sep 2021           Pseudo R-squ.: 0.07421
Time: 18:53:33                   Log-Likelihood: -123.49
converged: True                  LL-Null: -133.39
Covariance Type: nonrobust       LLR p-value: 5.022e-05
==============================================================================
            coef     std err     z        P>|z|     [0.025     0.975]
------------------------------------------------------------------------------
gender      0.2151   0.318       0.677    0.498     -0.407     0.838
etest_p     0.0042   0.004       1.190    0.234     -0.003     0.011
workex      1.4006   0.382       3.666    0.000      0.652     2.149
==============================================================================
Explanation of some of the terms in the summary table: the pseudo R-squared is 0.07421, the log-likelihood of the fitted model is -123.49 compared with -133.39 for the null (intercept-only) model, and the small LLR p-value (5.022e-05) tells us the model fits significantly better than a model with no predictors.
We see that the coefficient of workex (1.4006) is positive and statistically significant (p < 0.001), while the coefficients of gender and etest_p are not significant at the 5% level (p = 0.498 and p = 0.234, respectively).
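Because the coefficients are on the log-odds scale, it can also help to exponentiate them and read them as odds ratios; a short sketch using the fitted model above:

# Exponentiate the coefficients to interpret them as odds ratios
odds_ratios = np.exp(log_reg.params)
print(odds_ratios)
# exp(1.4006) is roughly 4.06, so candidates with prior work experience have about
# four times the odds of being placed, holding gender and the test score fixed.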
# Step 5: Make prediction
# Now we shall test our model. The predictions obtained are fractional values (between 0 and 1)
# which denote the probability of getting placed in a job.
# These values are then rounded to obtain the discrete values 1 or 0.
# Performing predictions on the dataset -- in this simple case I did not split the data into training and test sets
yhat = log_reg.predict(X)
prediction = list(map(round, yhat))
# We can print out the actual and predicted values to compare the two sets,
print('Actual values', list(y['status']))
print('Predictions :', prediction)
# We observe that most of the predicted values agree with the actual values.
Actual values [1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0] Predictions : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
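As noted above, the data was not split here; if we wanted an honest estimate of out-of-sample accuracy, one option would be to hold out a test set before fitting. A quick sketch follows (the 80/20 split and the random_state are arbitrary illustrative choices; the evaluation below still uses the in-sample predictions from the full data).

# Sketch: hold out a test set before fitting, then predict on the held-out rows
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
log_reg_holdout = sm.Logit(y_train, X_train).fit()
yhat_test = log_reg_holdout.predict(X_test)
test_predictions = list(map(round, yhat_test))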
# So we finally create a simple test to understand the accuracy of the logistic regression model
# We can import the confusion matrix and accuracy score, precision, recall, f1 score, etc from sklearn.
from sklearn.metrics import confusion_matrix, accuracy_score
# We can draw the confusion matrix by using the confusion_matrix function over the test data and prediction.
cm = confusion_matrix(y['status'], prediction)
print ("Confusion Matrix : \n", cm)
# We can also print out the accuracy score of the model.
print('Test accuracy = ', accuracy_score(y['status'], prediction))
# The confusion matrix shows that there are 0 true negatives (predict 0 as 0), 0 false negatives (predict 1 as 0),
# 67 false positives (predict 0 as 1), and 148 true positives (predict 1 as 1).
# The accuracy is given by all true predictions divided by all predictions. The test accuracy is 0.6884.
Confusion Matrix :
 [[  0  67]
 [  0 148]]
Test accuracy =  0.6883720930232559
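The confusion matrix also reveals that this model never predicts the 'Not Placed' class, so accuracy alone (which here simply equals the share of placed candidates, 148/215) is not very informative. Precision, recall and the F1 score per class give a fuller picture; a quick sketch using sklearn's classification_report:

# Sketch: per-class precision, recall and F1 for the same in-sample predictions
from sklearn.metrics import classification_report
print(classification_report(y['status'], prediction, target_names = ['Not Placed', 'Placed']))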
There are still many aspects of logistic regression left to explore. So far we have fitted a simple logistic regression for the probability of job placement and interpreted the results through the regression coefficients and the associated predictions. On the prediction side we used accuracy, but metrics such as precision, recall and the F1 score are useful to explore as well; no single metric is absolutely more useful than the others. There are still many regression methods and evaluation metrics to explore in the future.