import pandas as pd
import numpy as np
import chart_studio.plotly as py
import seaborn as sns
import cufflinks as cf
import plotly.express as ax
#!pip install chart_studio
#!pip install plotly
#!pip install cufflinks
# Offline Plotly helpers used to render figures inside the notebook.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook (needs internet access) - confirm in the Plotly docs.
init_notebook_mode(connected = True)
# Put cufflinks in offline mode - presumably so DataFrame.iplot() renders locally;
# verify against the cufflinks docs.
cf.go_offline()
Which countries or regions rank the highest in overall happiness and in each of the six factors contributing to happiness? How did country ranks or scores change between the 2018 and 2019 reports, or between the 2016 and 2017 reports? Did any country experience a significant increase or decrease in happiness?
Use the read_csv() function to import the data.
# Exercise template - fill in the right-hand sides with pd.read_csv(...) calls:
#   happiness_report_2018 = ...
#   happiness_report_2019 = ...
# (Kept as comments: a bare `name =` is a syntax error and stops the script.)

# Load the 2018 and 2019 World Happiness Report tables from local CSV files.
# The paths are relative to the notebook's working directory.
happiness_report_2018 = pd.read_csv('Data/2018.csv')
happiness_report_2019 = pd.read_csv('Data/2019.csv')
# To practice: download any CSV file from a website and pass its file path
# to pd.read_csv() to import the data into the Jupyter Notebook.

# View the first 15 rows
happiness_report_2018.head(15)
Overall rank | Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Finland | 7.632 | 1.305 | 1.592 | 0.874 | 0.681 | 0.202 | 0.393 |
1 | 2 | Norway | 7.594 | 1.456 | 1.582 | 0.861 | 0.686 | 0.286 | 0.340 |
2 | 3 | Denmark | 7.555 | 1.351 | 1.590 | 0.868 | 0.683 | 0.284 | 0.408 |
3 | 4 | Iceland | 7.495 | 1.343 | 1.644 | 0.914 | 0.677 | 0.353 | 0.138 |
4 | 5 | Switzerland | 7.487 | 1.420 | 1.549 | 0.927 | 0.660 | 0.256 | 0.357 |
5 | 6 | Netherlands | 7.441 | 1.361 | 1.488 | 0.878 | 0.638 | 0.333 | 0.295 |
6 | 7 | Canada | 7.328 | 1.330 | 1.532 | 0.896 | 0.653 | 0.321 | 0.291 |
7 | 8 | New Zealand | 7.324 | 1.268 | 1.601 | 0.876 | 0.669 | 0.365 | 0.389 |
8 | 9 | Sweden | 7.314 | 1.355 | 1.501 | 0.913 | 0.659 | 0.285 | 0.383 |
9 | 10 | Australia | 7.272 | 1.340 | 1.573 | 0.910 | 0.647 | 0.361 | 0.302 |
10 | 11 | United Kingdom | 7.190 | 1.244 | 1.433 | 0.888 | 0.464 | 0.262 | 0.082 |
11 | 12 | Austria | 7.139 | 1.341 | 1.504 | 0.891 | 0.617 | 0.242 | 0.224 |
12 | 13 | Costa Rica | 7.072 | 1.010 | 1.459 | 0.817 | 0.632 | 0.143 | 0.101 |
13 | 14 | Ireland | 6.977 | 1.448 | 1.583 | 0.876 | 0.614 | 0.307 | 0.306 |
14 | 15 | Germany | 6.965 | 1.340 | 1.474 | 0.861 | 0.586 | 0.273 | 0.280 |
# Show the final ten rows of the 2018 report.
happiness_report_2018.tail(n=10)
Overall rank | Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|---|
146 | 147 | Malawi | 3.587 | 0.186 | 0.541 | 0.306 | 0.531 | 0.210 | 0.080 |
147 | 148 | Haiti | 3.582 | 0.315 | 0.714 | 0.289 | 0.025 | 0.392 | 0.104 |
148 | 149 | Liberia | 3.495 | 0.076 | 0.858 | 0.267 | 0.419 | 0.206 | 0.030 |
149 | 150 | Syria | 3.462 | 0.689 | 0.382 | 0.539 | 0.088 | 0.376 | 0.144 |
150 | 151 | Rwanda | 3.408 | 0.332 | 0.896 | 0.400 | 0.636 | 0.200 | 0.444 |
151 | 152 | Yemen | 3.355 | 0.442 | 1.073 | 0.343 | 0.244 | 0.083 | 0.064 |
152 | 153 | Tanzania | 3.303 | 0.455 | 0.991 | 0.381 | 0.481 | 0.270 | 0.097 |
153 | 154 | South Sudan | 3.254 | 0.337 | 0.608 | 0.177 | 0.112 | 0.224 | 0.106 |
154 | 155 | Central African Republic | 3.083 | 0.024 | 0.000 | 0.010 | 0.305 | 0.218 | 0.038 |
155 | 156 | Burundi | 2.905 | 0.091 | 0.627 | 0.145 | 0.065 | 0.149 | 0.076 |
# happiness_report_2018.iplot
# Numeric columns of the report: the overall score plus the six factors.
columns = [
    'Score',
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
# Heatmap of three selected columns; each row of the heatmap is one row of the report.
heatmap_columns = ['GDP per capita', 'Social support', 'Score']
sns.heatmap(happiness_report_2018.loc[:, heatmap_columns])
<AxesSubplot:>
# Reshape the report into a grid: rows are GDP-per-capita values, columns are
# social-support values, and each cell holds the matching happiness score.
# The arguments are passed by keyword because DataFrame.pivot no longer accepts
# them positionally in pandas 2.0+ (positional use raises a TypeError there).
# pivot raises ValueError if an (index, columns) pair occurs more than once.
happiness_report_2018_pivot = happiness_report_2018.pivot(
    index='GDP per capita',
    columns='Social support',
    values='Score',
)
sns.heatmap(happiness_report_2018_pivot)
<AxesSubplot:xlabel='Social support', ylabel='GDP per capita'>
# Kernel density estimate of the overall happiness score.
# `fill=True` shades the area under the curve; it replaces the deprecated `shade` flag.
sns.kdeplot(happiness_report_2018['Score'], fill=True)
<AxesSubplot:xlabel='Score', ylabel='Density'>
# `fill=True` replaces the deprecated `shade` flag; bw_adjust scales the kernel bandwidth.
sns.kdeplot(happiness_report_2018['Healthy life expectancy'], fill=True, bw_adjust=0.5)
# You'll notice that most countries have a healthy life expectancy between 0.6 and 0.9,
# because the density is highest in that range.
# The density tapers off quickly on both sides, which means there are far fewer countries
# where the healthy life expectancy fell below 0.6 or went above 0.9.
# If we adjust the bandwidth to a smaller number, the curve becomes more jagged (less smooth).
# That gives a clearer view of where many, or virtually no, data points are located.
# If we increase the bandwidth, however, you'll see the opposite - the curve becomes smoother.
<AxesSubplot:xlabel='Healthy life expectancy', ylabel='Density'>
# Histogram of the same column, for comparison with the density plot.
sns.histplot(data=happiness_report_2018, x='Healthy life expectancy')
# Kernel density estimation is an alternative way to represent a frequency distribution,
# and it is better suited to continuous data. (A histogram discretizes the continuous
# variable into bins, which may compromise some information inherent in the data.)
<AxesSubplot:xlabel='Healthy life expectancy', ylabel='Count'>
gdp_generosity = ['GDP per capita', 'Generosity']
# Two-dimensional (joint) density of GDP per capita and generosity.
# `fill=True` replaces the deprecated `shade` flag and draws filled contours.
sns.kdeplot(
    x=happiness_report_2018['GDP per capita'],
    y=happiness_report_2018['Generosity'],
    fill=True,
)
# This 2-dimensional kdeplot shows that we're most likely to see countries in which
# the GDP per capita is around 1.0 and the generosity index is around 0.1.
# The joint distribution tapers off from a narrow circle in all four directions.
<AxesSubplot:xlabel='GDP per capita', ylabel='Generosity'>
# Jointplot = scatterplot of the two variables + a histogram of each on the margins.
sns.jointplot(data=happiness_report_2018, x='GDP per capita', y='Generosity')
# It shows that most countries have a generosity index between 0.15 and 0.2,
# with GDP per capita as high as 1.0.
# The GDP per capita for the 150+ countries is skewed to the right, which means there are
# a few countries that have a very high GDP while many countries have a GDP per capita
# below the average.
<seaborn.axisgrid.JointGrid at 0x7fbed17352e0>
# To find out which columns a DataFrame contains, read its .columns attribute.
happiness_report_2018.columns
Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'], dtype='object')
Plotly is an open-source library for data visualization that makes data easier to understand. Plotly supports many types of plots, all of them interactive. For example, when your mouse hovers over a data point in the graph, the plot shows that point's x and y coordinates.
import plotly.express as px
# What are the happiest countries in the world? Happiness is a subjective metric,
# but some countries perform consistently well in the happiness index rankings.
# Line plot: x-axis is the country rank, y-axis is the happiness score in 2018.
overall_rank = happiness_report_2018['Overall rank']
happiness_score = happiness_report_2018['Score']
px.line(x=overall_rank, y=happiness_score)
# For example, the country ranked 20th shows 6.774 points on the 10-point scale.
# Overall, the happiness score drops rapidly at the beginning (say from rank 1 to 25),
# then the rate of decrease slows until around x = 140, after which the score
# drops sharply towards the end.
# What if we're only interested in the 25 countries with the highest happiness score?
# How do we plot that line chart again?
# In data science this is called slicing: taking a subset of the rows
# out of the entire dataset or population.
# A slice names a start and an end position separated by a colon, which reads as "to".
# .iloc[:25] selects positions 0 to 25, with 25 itself excluded.
reduced_report = happiness_report_2018.iloc[:25]
# Same px.line call as before, but fed with the sliced data (reduced_report)
# instead of the entire data (happiness_report_2018).
top_rank = reduced_report['Overall rank']
top_score = reduced_report['Score']
px.line(x=top_rank, y=top_score)
A bar chart is a pictorial representation of data that presents categorical data with rectangular bars. The heights or lengths of the bars are proportional to the values in the dataset. It requires a dataset that contains a numerical value (a quantity or frequency) for each category.
Using a bar chart, we can compare the values of each category and decide, for example, which ranges are worth communicating to the audience. Some bar charts are more complicated: a stacked bar chart also shows the proportion of each subcategory within a larger category.
# Build a small dataset describing how many medals each country was awarded in a Game.
# The four nations are listed once per medal type (gold, silver, bronze).
nation = ['South Korea', 'Japan', 'United States', 'Australia'] * 3
# Each medal type is repeated four times, once for every nation.
medal = np.repeat(['gold', 'silver', 'bronze'], 4)
# Medal counts, in the same order as the nation/medal pairs above.
count = [
    24, 10, 55, 31,  # gold
    30, 40, 45, 33,  # silver
    18, 20, 35, 20,  # bronze
]
# pd.DataFrame accepts a dictionary ({} with key: value pairs) and turns each key into
# a column, which gives us a table shaped like the world happiness report.
award = pd.DataFrame({'nation': nation, 'medal': medal, 'count': count})
award
# Different nations are awarded different numbers of gold, silver and bronze medals.
nation | medal | count | |
---|---|---|---|
0 | South Korea | gold | 24 |
1 | Japan | gold | 10 |
2 | United States | gold | 55 |
3 | Australia | gold | 31 |
4 | South Korea | silver | 30 |
5 | Japan | silver | 40 |
6 | United States | silver | 45 |
7 | Australia | silver | 33 |
8 | South Korea | bronze | 18 |
9 | Japan | bronze | 20 |
10 | United States | bronze | 35 |
11 | Australia | bronze | 20 |
# Bar chart with px.bar: one bar per nation, segmented by medal type.
px.bar(
    award,
    x="nation",
    y="count",
    color="medal",
    title="Number of medals",
)
# You can see that:
# 1. The United States was awarded the highest number of medals overall.
# 2. The United States was awarded the highest number of gold, silver and bronze medals.
# 3. Of the three medal types, silver is the one where the 4 countries have the most similar counts.
# 4. Japan obtained the lowest number of gold medals.
# 5. The total numbers of medals obtained by Japan and South Korea were very similar.
A violin plot shows the distribution of numerical data across several levels of a categorical variable, so that the distributions can be compared.
# px.violin draws an interactive violin plot. Hover the mouse over a violin:
# what do you see?
px.violin(
    award,
    x="nation",
    y="count",
    title="Number of medals",
)
# On each side of the center line of a violin there is a kernel density estimate showing
# the shape of the distribution for that category. The wider a section of the violin,
# the higher the probability that members of the population take on that value.
# The narrower a section, the lower the probability that a data point falls in that area.
# The violin plot shows that the variance for Japan is the highest, which means that
# predicting the number of medals obtained by Japan carries the highest uncertainty,
# so predictions for Japan will be the least precise most of the time.
# The plotly violin plot above does not mark the central tendency with an inner bar,
# so I recommend the seaborn package, which specializes in statistical plotting.
# sns.violinplot makes the graph look nicer.
sns.violinplot(data=award, x="nation", y="count")
# The inner bar represents the central tendency of the dataset. The thicker bar covers
# Q1, the median and Q3, and the thinner bar includes all the points that should be
# considered normal.
<AxesSubplot:xlabel='nation', ylabel='count'>