In [1]:
# Generate fake data

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
In [11]:
# You might be interested in age groups in a city.
young = stats.norm.rvs(25, 5, 500)

senior = stats.norm.rvs(55, 5, 500)

x = np.linspace(0, 100, 101)
final = np.concatenate([young,senior])
# Kernel density estimation
sns.kdeplot(final)

# The probability density peaks on two locations - around 25 and 55. This is an indication that we might
# prefer using bivariate distributions to describe the problem. (Bivariate normal)
Out[11]:
<AxesSubplot:ylabel='Density'>
In [15]:
iq_young = stats.norm.rvs(105, 10, 500)
iq_senior = stats.norm.rvs(107, 13, 500)

iq_final = np.concatenate([iq_young, iq_senior])

sns.kdeplot(iq_senior)

# The peak of the IQ distribution occurs at around 105, with the probability density tapers off on both
# sides quite symmetrically.
Out[15]:
<AxesSubplot:ylabel='Density'>
In [16]:
type(iq_final)
Out[16]:
numpy.ndarray
In [19]:
data = pd.DataFrame({"Age": final, "IQ": iq_final})
data
Out[19]:
Age IQ
0 21.399652 100.936664
1 25.858700 103.811431
2 23.789812 100.802383
3 24.131691 87.388801
4 20.056220 101.292560
... ... ...
995 59.563507 112.911657
996 58.604462 96.105767
997 54.309280 94.484930
998 52.706558 121.984776
999 56.970346 108.810030

1000 rows × 2 columns

In [31]:
# Rule: People who are younger than 40 years old are considered young = 0, otherwise old = 1.
data['Senior'] = 0
In [33]:
# For people whose age are younger than 40, you assign 1 to the 'Senior' column.
data['Senior'][data['Age'] > 40] = 1

# Now people who are above 40 will be coded as 1.
<ipython-input-33-cf4132d92236>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Senior'][data['Age'] > 40] = 1
In [35]:
data
Out[35]:
Age IQ Senior
0 21.399652 100.936664 0
1 25.858700 103.811431 0
2 23.789812 100.802383 0
3 24.131691 87.388801 0
4 20.056220 101.292560 0
... ... ... ...
995 59.563507 112.911657 1
996 58.604462 96.105767 1
997 54.309280 94.484930 1
998 52.706558 121.984776 1
999 56.970346 108.810030 1

1000 rows × 3 columns

In [36]:
sns.kdeplot(data['IQ'], hue = data['Senior'])
Out[36]:
<AxesSubplot:xlabel='IQ', ylabel='Density'>