# Step 1: Load necessary packages

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt


# Step 2: Import dataset

# Read the TED talk CSV export into a DataFrame with pd.read_csv.
ted_csv_path = '/Users/michiganboy/Documents/course2/assets/ted_main.csv'
ted_talk = pd.read_csv(ted_csv_path)
# Preview the first rows to check the column names and raw values.
ted_talk.head()


# Step 3: Data Pre-processing

# Pre-processing is optional

# Choose columns: Only keeping the variables to be used in the analysis
# Keep only the variables used in the analysis. The explicit .copy() makes
# ted_talk an independent frame, so the column assignments below modify it
# directly instead of raising pandas' SettingWithCopyWarning.
#
# NOTE: this cell overwrites ted_talk and rescales its columns in place, so it
# must run exactly once per load of the CSV. Re-running it divides views and
# duration again; reload the data (Step 2) before re-running.
ted_talk = ted_talk[[
    "comments", "duration", "languages", "main_speaker", "num_speaker",
    "published_date", "ratings", "speaker_occupation", "tags", "views", "title",
]].copy()

# published_date is stored as a Unix timestamp in seconds; passing unit='s' to
# pd.to_datetime converts it into a readable datetime column.
ted_talk['published_date'] = pd.to_datetime(ted_talk['published_date'], unit='s')

# Express views in millions: dividing the column by a scalar is broadcast
# across every row.
ted_talk['views'] = ted_talk['views'] / 1000000

# Express duration in minutes instead of seconds.
ted_talk['duration'] = ted_talk['duration'] / 60

# Convenience handles on the response and predictor columns used for plotting.
views = ted_talk['views']
languages = ted_talk['languages']

# Take another look at the data set after the transformations.
ted_talk.head()

# The duration and views columns are now easier to interpret. This rescaling is optional,
# but it is worth remembering that broadcasting is a convenient way to clean a dataset.


# Step 4: Perform quantile regression

# Next, we’ll fit a quantile regression model using the number of languages available as the
# predictor variable and TED Talk views (in millions) as the response variable.

# We’ll use the model to estimate the conditional 90th percentile of views given the number of languages available.
# Specify views as a linear function of languages, then fit the 0.9 quantile.
quantile_spec = smf.quantreg(formula='views ~ languages', data=ted_talk)
model = quantile_spec.fit(q=0.9)

# Report the fitted coefficients and fit statistics.
print(model.summary())

                         QuantReg Regression Results                          
==============================================================================
Dep. Variable:                  views   Pseudo R-squared:               0.1411
Model:                       QuantReg   Bandwidth:                      0.2549
Method:                 Least Squares   Sparsity:                        8.189
Date:                Tue, 07 Sep 2021   No. Observations:                 2550
Time:                        08:18:03   Df Residuals:                     2548
                                        Df Model:                            1
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3665      0.239      1.536      0.125      -0.101       0.835
languages      0.0973      0.009     11.381      0.000       0.081       0.114
==============================================================================


# Step 5: Visualize the result

fig, ax = plt.subplots(figsize=(8, 6))


def get_y(a, b):
    """Return the fitted line a + b * languages, one value per talk."""
    return a + b * languages


# Fitted 0.9-quantile of views for each talk, from the model's coefficients.
y = get_y(model.params['Intercept'], model.params['languages'])

# Draw the fitted line through the points in increasing-x order so it is a
# single left-to-right stroke; plotting in raw row order retraces the same
# line back and forth once per observation.
order = np.argsort(languages.to_numpy(), kind='stable')
ax.plot(languages.iloc[order], y.iloc[order])

# Overlay the raw observations; the low alpha keeps dense regions readable.
ax.scatter(languages, views, alpha=.2)
ax.set(xlabel="Languages Available", ylabel="Views (in million)", title="TED Talk View Quantile Regression")

[Text(0.5, 0, 'Languages Available'),
 Text(0, 0.5, 'Views (in million)'),
 Text(0.5, 1.0, 'TED Talk View Quantile Regression')]

	comments	description	duration	event	film_date	languages	main_speaker	name	num_speaker	published_date	ratings	related_talks	speaker_occupation	tags	title	url	views
0	4553	Sir Ken Robinson makes an entertaining and pro...	1164	TED2006	1140825600	60	Ken Robinson	Ken Robinson: Do schools kill creativity?	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 19645}, {...	[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...	Author/educator	['children', 'creativity', 'culture', 'dance',...	Do schools kill creativity?	https://www.ted.com/talks/ken_robinson_says_sc...	47227110
1	265	With the same humor and humanity he exuded in ...	977	TED2006	1140825600	43	Al Gore	Al Gore: Averting the climate crisis	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...	[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...	Climate advocate	['alternative energy', 'cars', 'climate change...	Averting the climate crisis	https://www.ted.com/talks/al_gore_on_averting_...	3200520
2	124	New York Times columnist David Pogue takes aim...	1286	TED2006	1140739200	26	David Pogue	David Pogue: Simplicity sells	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...	[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...	Technology columnist	['computers', 'entertainment', 'interface desi...	Simplicity sells	https://www.ted.com/talks/david_pogue_says_sim...	1636292
3	200	In an emotionally charged talk, MacArthur-winn...	1116	TED2006	1140912000	35	Majora Carter	Majora Carter: Greening the ghetto	1	1151367060	[{'id': 3, 'name': 'Courageous', 'count': 760}...	[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...	Activist for environmental justice	['MacArthur grant', 'activism', 'business', 'c...	Greening the ghetto	https://www.ted.com/talks/majora_carter_s_tale...	1697550
4	593	You've never seen data presented like this. Wi...	1190	TED2006	1140566400	48	Hans Rosling	Hans Rosling: The best stats you've ever seen	1	1151440680	[{'id': 9, 'name': 'Ingenious', 'count': 3202}...	[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...	Global health expert; data visionary	['Africa', 'Asia', 'Google', 'demo', 'economic...	The best stats you've ever seen	https://www.ted.com/talks/hans_rosling_shows_t...	12005869

	comments	duration	languages	main_speaker	num_speaker	published_date	ratings	speaker_occupation	tags	views	title
0	4553	0.005389	60	Ken Robinson	1	2006-06-27 00:11:00	[{'id': 7, 'name': 'Funny', 'count': 19645}, {...	Author/educator	['children', 'creativity', 'culture', 'dance',...	4.722711e-11	Do schools kill creativity?
1	265	0.004523	43	Al Gore	1	2006-06-27 00:11:00	[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...	Climate advocate	['alternative energy', 'cars', 'climate change...	3.200520e-12	Averting the climate crisis
2	124	0.005954	26	David Pogue	1	2006-06-27 00:11:00	[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...	Technology columnist	['computers', 'entertainment', 'interface desi...	1.636292e-12	Simplicity sells
3	200	0.005167	35	Majora Carter	1	2006-06-27 00:11:00	[{'id': 3, 'name': 'Courageous', 'count': 760}...	Activist for environmental justice	['MacArthur grant', 'activism', 'business', 'c...	1.697550e-12	Greening the ghetto
4	593	0.005509	48	Hans Rosling	1	2006-06-27 20:38:00	[{'id': 9, 'name': 'Ingenious', 'count': 3202}...	Global health expert; data visionary	['Africa', 'Asia', 'Google', 'demo', 'economic...	1.200587e-11	The best stats you've ever seen

Quantile Regression