# Step 1: Load necessary packages
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
# Step 2: Import dataset
# Read the TED talk CSV file into a pandas DataFrame with pd.read_csv.
ted_talk = pd.read_csv(
    filepath_or_buffer='/Users/michiganboy/Documents/course2/assets/ted_main.csv'
)
# Display the first five rows to get a quick look at the columns and their values.
ted_talk.head()
comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ratings | related_talks | speaker_occupation | tags | title | url | views | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4553 | Sir Ken Robinson makes an entertaining and pro... | 1164 | TED2006 | 1140825600 | 60 | Ken Robinson | Ken Robinson: Do schools kill creativity? | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 19645}, {... | [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... | Author/educator | ['children', 'creativity', 'culture', 'dance',... | Do schools kill creativity? | https://www.ted.com/talks/ken_robinson_says_sc... | 47227110 |
1 | 265 | With the same humor and humanity he exuded in ... | 977 | TED2006 | 1140825600 | 43 | Al Gore | Al Gore: Averting the climate crisis | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... | [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... | Climate advocate | ['alternative energy', 'cars', 'climate change... | Averting the climate crisis | https://www.ted.com/talks/al_gore_on_averting_... | 3200520 |
2 | 124 | New York Times columnist David Pogue takes aim... | 1286 | TED2006 | 1140739200 | 26 | David Pogue | David Pogue: Simplicity sells | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... | [{'id': 1725, 'hero': 'https://pe.tedcdn.com/i... | Technology columnist | ['computers', 'entertainment', 'interface desi... | Simplicity sells | https://www.ted.com/talks/david_pogue_says_sim... | 1636292 |
3 | 200 | In an emotionally charged talk, MacArthur-winn... | 1116 | TED2006 | 1140912000 | 35 | Majora Carter | Majora Carter: Greening the ghetto | 1 | 1151367060 | [{'id': 3, 'name': 'Courageous', 'count': 760}... | [{'id': 1041, 'hero': 'https://pe.tedcdn.com/i... | Activist for environmental justice | ['MacArthur grant', 'activism', 'business', 'c... | Greening the ghetto | https://www.ted.com/talks/majora_carter_s_tale... | 1697550 |
4 | 593 | You've never seen data presented like this. Wi... | 1190 | TED2006 | 1140566400 | 48 | Hans Rosling | Hans Rosling: The best stats you've ever seen | 1 | 1151440680 | [{'id': 9, 'name': 'Ingenious', 'count': 3202}... | [{'id': 2056, 'hero': 'https://pe.tedcdn.com/i... | Global health expert; data visionary | ['Africa', 'Asia', 'Google', 'demo', 'economic... | The best stats you've ever seen | https://www.ted.com/talks/hans_rosling_shows_t... | 12005869 |
# Step 3: Data Pre-processing
# Pre-processing is optional
# Choose columns: only keep the variables to be used in the analysis.
# The .copy() gives us an independent DataFrame, so the column assignments below
# modify our own data rather than a slice of the original frame (this avoids
# pandas' SettingWithCopyWarning).
ted_talk = ted_talk[["comments", "duration", "languages", "main_speaker", "num_speaker",
                     "published_date", "ratings", "speaker_occupation", "tags", "views",
                     "title"]].copy()
# The conversions below overwrite columns in place, so running this cell a second time
# would divide the values again. We use the dtype of published_date as a marker: once it
# is a datetime column, the conversions have already been applied and are skipped.
if not pd.api.types.is_datetime64_any_dtype(ted_talk['published_date']):
    # The published dates are coded as Unix timestamps measured in seconds. We turn them
    # into readable dates by passing the timestamps to pd.to_datetime and specifying the
    # unit as seconds. This is often useful for converting between time formats.
    ted_talk['published_date'] = pd.to_datetime(ted_talk['published_date'], unit='s')
    # To simplify the interpretation of video views, broadcast a division by 1 million
    # over the entire column, so the views are now expressed in millions.
    ted_talk['views'] = ted_talk['views'] / 1000000
    # Likewise, convert the video duration from seconds to minutes by dividing by 60.
    ted_talk['duration'] = ted_talk['duration'] / 60
# Keep the response and the predictor handy as separate Series for the plot in Step 5.
views = ted_talk['views']
languages = ted_talk['languages']
# Now let's take a glimpse at the data set again.
ted_talk.head()
# The duration and the views columns now look easier to interpret! In the future it's not
# necessary to do that, but it's good to keep in mind that broadcasting is a convenient way
# to clean a dataset.
comments | duration | languages | main_speaker | num_speaker | published_date | ratings | speaker_occupation | tags | views | title | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4553 | 19.400000 | 60 | Ken Robinson | 1 | 2006-06-27 00:11:00 | [{'id': 7, 'name': 'Funny', 'count': 19645}, {... | Author/educator | ['children', 'creativity', 'culture', 'dance',... | 47.227110 | Do schools kill creativity? |
1 | 265 | 16.283333 | 43 | Al Gore | 1 | 2006-06-27 00:11:00 | [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... | Climate advocate | ['alternative energy', 'cars', 'climate change... | 3.200520 | Averting the climate crisis |
2 | 124 | 21.433333 | 26 | David Pogue | 1 | 2006-06-27 00:11:00 | [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... | Technology columnist | ['computers', 'entertainment', 'interface desi... | 1.636292 | Simplicity sells |
3 | 200 | 18.600000 | 35 | Majora Carter | 1 | 2006-06-27 00:11:00 | [{'id': 3, 'name': 'Courageous', 'count': 760}... | Activist for environmental justice | ['MacArthur grant', 'activism', 'business', 'c... | 1.697550 | Greening the ghetto |
4 | 593 | 19.833333 | 48 | Hans Rosling | 1 | 2006-06-27 20:38:00 | [{'id': 9, 'name': 'Ingenious', 'count': 3202}... | Global health expert; data visionary | ['Africa', 'Asia', 'Google', 'demo', 'economic... | 12.005869 | The best stats you've ever seen |
# Step 4: Perform quantile regression
# Fit a quantile regression model with the number of available languages as the
# predictor variable and the TED talk views as the response variable.
# Passing q=0.9 to fit() makes the model estimate the 90th percentile of views
# for a given number of available languages.
model = smf.quantreg(
    formula='views ~ languages',
    data=ted_talk,
).fit(q=0.9)
# Print the summary table of the fitted model.
print(model.summary())
                         QuantReg Regression Results
==============================================================================
Dep. Variable:                  views   Pseudo R-squared:               0.1411
Model:                       QuantReg   Bandwidth:                      0.2549
Method:                 Least Squares   Sparsity:                        8.189
Date:                Tue, 07 Sep 2021   No. Observations:                 2550
Time:                        08:18:03   Df Residuals:                     2548
                                        Df Model:                            1
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3665      0.239      1.536      0.125      -0.101       0.835
languages      0.0973      0.009     11.381      0.000       0.081       0.114
==============================================================================
From the output, we can read off the estimated regression equation:
90th percentile of TED Talk views (in millions) = 0.3665 + 0.0973 * (languages)
For example, the 90th percentile of views among all talks available in 10 languages is expected to be 0.3665 + 10 * 0.0973 = 1.3395 million views.
# Step 5: Visualize the result
fig, ax = plt.subplots(figsize=(8,6))


# get y values
def get_y(a, b):
    """Return the fitted values a + b * languages, one per talk."""
    return a + b * languages


# Evaluate the fitted 90th-percentile line using the estimated intercept and slope.
y = get_y(model.params['Intercept'], model.params['languages'])
# Plot the data with quantile regression overlaid.
# By default the line and the scatter points would both be drawn in the same colour,
# so we give the line a contrasting colour and a legend entry to make it stand out.
ax.plot(languages, y, color = "tab:red", label = "90th percentile fit")
ax.scatter(languages, views, alpha = .2, label = "TED talks")
ax.legend()
ax.set(xlabel = "Languages Available", ylabel = "Views (in million)", title = "TED Talk View Quantile Regression")
[Text(0.5, 0, 'Languages Available'), Text(0, 0.5, 'Views (in million)'), Text(0.5, 1.0, 'TED Talk View Quantile Regression')]
Unlike a simple linear regression line, notice that this fitted line doesn’t represent the “line of best fit” for the data. Instead, it goes through the estimated 90th percentile at each level of the predictor variable. Hopefully now you've learnt how to fit quantile regression models in Python!