# Generate fake data
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
# Simulate the ages of two groups in a city, 500 people each: a younger group
# drawn from N(25, 5) and an older group drawn from N(55, 5).
young = stats.norm.rvs(loc=25, scale=5, size=500)
senior = stats.norm.rvs(loc=55, scale=5, size=500)
# Evenly spaced grid from 0 to 100 inclusive (101 points).
x = np.linspace(0, 100, num=101)
# Pool both groups into a single sample; the younger group comes first.
final = np.concatenate((young, senior))
# Kernel density estimation of the pooled age sample.
sns.kdeplot(final)
# The probability density peaks at two locations - around 25 and 55. The distribution is
# bimodal, which suggests describing the data as a mixture of two normal distributions
# (one per age group) rather than as a single normal distribution.
<AxesSubplot:ylabel='Density'>
# Simulate IQ scores for the two groups, 500 people each: the younger group
# drawn from N(105, 10) and the older group drawn from N(107, 13).
iq_young = stats.norm.rvs(loc=105, scale=10, size=500)
iq_senior = stats.norm.rvs(loc=107, scale=13, size=500)
# Pool both groups into a single sample; the younger group comes first.
iq_final = np.concatenate((iq_young, iq_senior))
# Kernel density estimation of the pooled IQ sample (both age groups together).
sns.kdeplot(iq_final)
# The IQ distribution peaks at around 105, and the probability density tapers off
# on both sides quite symmetrically.
<AxesSubplot:ylabel='Density'>
# Inspect the type of the pooled IQ sample.
type(iq_final)
numpy.ndarray
# Combine the pooled ages and IQ scores into a two-column table ("Age", "IQ"),
# then display it.
data = pd.DataFrame(dict(Age=final, IQ=iq_final))
data
Age | IQ | |
---|---|---|
0 | 21.399652 | 100.936664 |
1 | 25.858700 | 103.811431 |
2 | 23.789812 | 100.802383 |
3 | 24.131691 | 87.388801 |
4 | 20.056220 | 101.292560 |
... | ... | ... |
995 | 59.563507 | 112.911657 |
996 | 58.604462 | 96.105767 |
997 | 54.309280 | 94.484930 |
998 | 52.706558 | 121.984776 |
999 | 56.970346 | 108.810030 |
1000 rows × 2 columns
# Rule: people younger than 40 are coded as young = 0; everyone else is coded as old = 1.
# Start with everyone coded as young.
data['Senior'] = 0
# Assign 1 to the 'Senior' column for everyone aged 40 or above. A single .loc
# call writes to `data` itself; the chained form data['Senior'][mask] = 1 assigns
# through an intermediate object, which raises SettingWithCopyWarning and is not
# guaranteed to update `data`.
data.loc[data['Age'] >= 40, 'Senior'] = 1
# Now people aged 40 or above are coded as 1.
<ipython-input-33-cf4132d92236>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Senior'][data['Age'] > 40] = 1
# Display the table to check the new 'Senior' column.
data
Age | IQ | Senior | |
---|---|---|---|
0 | 21.399652 | 100.936664 | 0 |
1 | 25.858700 | 103.811431 | 0 |
2 | 23.789812 | 100.802383 | 0 |
3 | 24.131691 | 87.388801 | 0 |
4 | 20.056220 | 101.292560 | 0 |
... | ... | ... | ... |
995 | 59.563507 | 112.911657 | 1 |
996 | 58.604462 | 96.105767 | 1 |
997 | 54.309280 | 94.484930 | 1 |
998 | 52.706558 | 121.984776 | 1 |
999 | 56.970346 | 108.810030 | 1 |
1000 rows × 3 columns
# Compare the IQ density of the two age groups: one KDE curve per 'Senior' value.
sns.kdeplot(data=data, x='IQ', hue='Senior')
<AxesSubplot:xlabel='IQ', ylabel='Density'>