import pandas as pd
import numpy as np
import chart_studio.plotly as py
import seaborn as sns
import cufflinks as cf
import plotly.express as ax


#!pip install chart_studio
#!pip install plotly
#!pip install cufflinks


# Offline plotting helpers from plotly; of these, only init_notebook_mode is
# called in the visible code.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Prepare plotly for rendering inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook - confirm against the plotly docs.
init_notebook_mode(connected = True)
# Put cufflinks into offline mode (presumably so DataFrame.iplot works without
# a Chart Studio account - confirm).
cf.go_offline()


# Load the 2018 and 2019 happiness report tables from local CSV files.
# The paths are relative to the notebook's working directory.
# (The original empty placeholder assignments were removed: an assignment with
# no right-hand side is a SyntaxError.)
happiness_report_2018 = pd.read_csv('Data/2018.csv')

happiness_report_2019 = pd.read_csv('Data/2019.csv')

# To work with other data, download any CSV file and pass its file path to
# pd.read_csv(...) to import it into the Jupyter Notebook.


# Preview the top of the 2018 table: the first 15 rows.
happiness_report_2018.head(n=15)


# Preview the bottom of the 2018 table: the last 10 rows.
happiness_report_2018.tail(n=10)


# happiness_report_2018.iplot


# Indicator columns of the happiness report (this list is not referenced again
# in the visible code).
columns = [
    'Score',
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

# Heatmap of the raw values of three selected columns: each table row becomes
# one heatmap row, each selected column one heatmap column.
sns.heatmap(data=happiness_report_2018[['GDP per capita', 'Social support', 'Score']])

<AxesSubplot:>


# Reshape the 2018 table into a grid: one row per distinct 'GDP per capita'
# value, one column per distinct 'Social support' value, cells holding 'Score'.
# The arguments are passed by keyword because DataFrame.pivot no longer accepts
# them positionally in pandas >= 2.0 (positional use raises a TypeError there).
happiness_report_2018_pivot = happiness_report_2018.pivot(
    index='GDP per capita', columns='Social support', values='Score'
)
# Draw the pivoted grid as a heatmap.
sns.heatmap(happiness_report_2018_pivot)

<AxesSubplot:xlabel='Social support', ylabel='GDP per capita'>


# Kernel density estimate of the happiness score, with the area under the
# curve filled. `fill` replaces the deprecated `shade` argument of kdeplot.
sns.kdeplot(happiness_report_2018['Score'], fill = True)

<AxesSubplot:xlabel='Score', ylabel='Density'>


# Kernel density estimate of healthy life expectancy with a narrower bandwidth
# (bw_adjust = 0.5). `fill` replaces the deprecated `shade` argument of kdeplot.
sns.kdeplot(happiness_report_2018['Healthy life expectancy'], fill = True, bw_adjust = 0.5)

# You'll notice that most countries have a healthy life expectancy between 0.6 and 0.9, because
# the density reaches its highest points in that range.
# The density tapers off quickly on both sides, which means there are far fewer countries where the healthy life
# expectancy fell below 0.6 or went above 0.9.

# If we adjust the bandwidth to a smaller number, you'll see that the curve becomes rougher (less smooth).
# We gain clearer insight into the locations where many, or virtually no, data points lie.
# If we increase the bandwidth, however, you'll see the opposite - the curve becomes smoother.

<AxesSubplot:xlabel='Healthy life expectancy', ylabel='Density'>


# Histogram of healthy life expectancy, for comparison with the density plot.
sns.histplot(happiness_report_2018['Healthy life expectancy'])

# Kernel density estimation is just an alternative way to represent a "frequency distribution", but
# it is better suited to continuous data. (A histogram discretizes the continuous variable into bins,
# which may compromise some of the information inherent in the data.)

<AxesSubplot:xlabel='Healthy life expectancy', ylabel='Count'>


# Names of the two columns compared below (kept for reference; the plot call
# selects the columns directly).
gdp_generosity = ['GDP per capita', 'Generosity']

# Two-dimensional kernel density estimate of GDP per capita against generosity.
# `fill` replaces the deprecated `shade` argument of kdeplot.
sns.kdeplot(x = happiness_report_2018['GDP per capita'], y = happiness_report_2018['Generosity'], fill = True)
# This 2-dimensional kdeplot shows that we're most likely to see countries in which the GDP per capita is around 1.0
# and the generosity index is around 0.1.
# The joint distribution tapers off from a narrow circle in all four directions.

<AxesSubplot:xlabel='GDP per capita', ylabel='Generosity'>


# Jointplot = scatterplot of the two variables + a histogram of each on the margins.
sns.jointplot(data = happiness_report_2018, x = 'GDP per capita', y = 'Generosity')
# Reading the plot: most countries have a generosity index between 0.15 and 0.2, with GDP per capita as high as 1.0.
# The GDP per capita of the 150+ countries is skewed to the right, which means a few countries have a very high GDP
# while many countries have a GDP per capita below the average.

<seaborn.axisgrid.JointGrid at 0x7fbed17352e0>


# To find out which columns exist in a DataFrame, use its .columns attribute.
happiness_report_2018.columns

Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')


import plotly.express as px

# Create the figure instance


# What are the happiest countries in the world? Happiness is a subjective metric. However, some countries perform
# consistently well in the happiness index rankings.
# We can create a line plot with the country rank on the x-axis and the 2018 happiness score on the y-axis.
px.line(x = happiness_report_2018['Overall rank'], y = happiness_report_2018['Score'])

# For example, the country ranked 20th on the happiness index scores 6.774 points on a 10-point scale.
# Overall, the happiness score drops rapidly at the beginning (say, ranks 1-25), then the rate of decrease slows
# until around x = 140, after which the happiness score drops sharply towards the end.


# Suppose we only care about the 25 countries with the highest happiness score.
# How do we draw the same line chart for just those rows?

# The answer is slicing: taking a subset (a portion of the rows) out of the
# entire dataset or population.

# A slice is written start:stop, where the stop position is exclusive, so the
# slice below keeps positions 0 through 24 - the first 25 rows.
reduced_report = happiness_report_2018.iloc[:25]


# Same px.line call as for the full table, but fed with the sliced data
# (reduced_report) instead of the entire data (happiness_report_2018).
px.line(x = reduced_report['Overall rank'], y = reduced_report['Score'])


# Build a small dataset describing how many medals each country was awarded in a Game.

# The same four nations, repeated once per medal type (3 x 4 = 12 entries).
nation = ['South Korea', "Japan", "United States", "Australia"] * 3
# Medal type for each entry: every medal name repeated four times in a row.
medal = np.repeat(['gold', 'silver', 'bronze'], 4)
# Medal counts, aligned position by position with `nation` and `medal`.
count = [
    24, 10, 55, 31,  # gold
    30, 40, 45, 33,  # silver
    18, 20, 35, 20,  # bronze
]


# pd.DataFrame accepts a dictionary - key: value pairs written inside {} - and
# turns each key into a column, giving a table just like the happiness report.
award = pd.DataFrame({'nation': nation, 'medal': medal, 'count': count})

award
# Different nations were awarded different numbers of gold, silver and bronze medals.


# Bar chart built with px.bar: one bar per nation, segments coloured by medal type.

px.bar(data_frame = award, x = "nation", y = "count", color = "medal", title = "Number of medals")
# What the chart shows:
# 1. The United States was awarded the highest number of medals overall.
# 2. The United States was awarded the highest number of bronze and gold medals.
# 3. Among gold, silver and bronze medals, the 4 countries obtained similar numbers of silver medals.
# 4. Japan obtained the lowest number of gold medals.
# 5. The total numbers of medals obtained by Japan and South Korea were pretty similar.


# px.violin draws an interactive violin plot. Try hovering the mouse over the
# violin surface - what do you see?
px.violin(data_frame = award, x = "nation", y = "count", title = "Number of medals")

# On each side of the gray line of the violin plot there is a kernel density estimate showing the shape of the
# distribution of the data points in each category. The wider a section of the violin, the higher the probability
# that members of the population take on that value. Conversely, the skinnier a section, the lower the probability
# that any data point lies around that area.

# The violin plot shows that Japan has the highest variance, which means that if we were to predict the number of
# medals obtained by Japan, we would face the highest uncertainty. In other words, predictions for Japan would be
# the most difficult / least precise most of the time.


# plotly's violin plot does not mark the central tendency with an inner bar, so for statistical plots
# the seaborn package is recommended instead; sns.violinplot also makes the graph look nicer.
sns.violinplot(data = award, x = "nation", y = "count")

# The inner bar represents the central tendency of the dataset: the thicker bar covers Q1, the median and Q3,
# and the thinner bar covers all the points that should be considered normal.

<AxesSubplot:xlabel='nation', ylabel='count'>

	Overall rank	Country or region	Score	GDP per capita	Social support	Healthy life expectancy	Freedom to make life choices	Generosity	Perceptions of corruption
0	1	Finland	7.632	1.305	1.592	0.874	0.681	0.202	0.393
1	2	Norway	7.594	1.456	1.582	0.861	0.686	0.286	0.340
2	3	Denmark	7.555	1.351	1.590	0.868	0.683	0.284	0.408
3	4	Iceland	7.495	1.343	1.644	0.914	0.677	0.353	0.138
4	5	Switzerland	7.487	1.420	1.549	0.927	0.660	0.256	0.357
5	6	Netherlands	7.441	1.361	1.488	0.878	0.638	0.333	0.295
6	7	Canada	7.328	1.330	1.532	0.896	0.653	0.321	0.291
7	8	New Zealand	7.324	1.268	1.601	0.876	0.669	0.365	0.389
8	9	Sweden	7.314	1.355	1.501	0.913	0.659	0.285	0.383
9	10	Australia	7.272	1.340	1.573	0.910	0.647	0.361	0.302
10	11	United Kingdom	7.190	1.244	1.433	0.888	0.464	0.262	0.082
11	12	Austria	7.139	1.341	1.504	0.891	0.617	0.242	0.224
12	13	Costa Rica	7.072	1.010	1.459	0.817	0.632	0.143	0.101
13	14	Ireland	6.977	1.448	1.583	0.876	0.614	0.307	0.306
14	15	Germany	6.965	1.340	1.474	0.861	0.586	0.273	0.280

	Overall rank	Country or region	Score	GDP per capita	Social support	Healthy life expectancy	Freedom to make life choices	Generosity	Perceptions of corruption
146	147	Malawi	3.587	0.186	0.541	0.306	0.531	0.210	0.080
147	148	Haiti	3.582	0.315	0.714	0.289	0.025	0.392	0.104
148	149	Liberia	3.495	0.076	0.858	0.267	0.419	0.206	0.030
149	150	Syria	3.462	0.689	0.382	0.539	0.088	0.376	0.144
150	151	Rwanda	3.408	0.332	0.896	0.400	0.636	0.200	0.444
151	152	Yemen	3.355	0.442	1.073	0.343	0.244	0.083	0.064
152	153	Tanzania	3.303	0.455	0.991	0.381	0.481	0.270	0.097
153	154	South Sudan	3.254	0.337	0.608	0.177	0.112	0.224	0.106
154	155	Central African Republic	3.083	0.024	0.000	0.010	0.305	0.218	0.038
155	156	Burundi	2.905	0.091	0.627	0.145	0.065	0.149	0.076

Import Package

Import Data

Heatmap

Kernel Density Estimation

Pairplot, Jointplot

Plotly

Bar Chart

Violinplot

	nation	medal	count
0	South Korea	gold	24
1	Japan	gold	10
2	United States	gold	55
3	Australia	gold	31
4	South Korea	silver	30
5	Japan	silver	40
6	United States	silver	45
7	Australia	silver	33
8	South Korea	bronze	18
9	Japan	bronze	20
10	United States	bronze	35
11	Australia	bronze	20