# We start by importing all necessary packages for exploratory data analysis. They are numpy
import numpy as np
# pandas for data processing
import pandas as pd
# seaborn and matplotlib to tackle main data visualization tasks.
import seaborn as sns  
import matplotlib.pyplot as plt

# Now, let's import the Breast Tumor Dataset
data = pd.read_csv(filepath_or_buffer='assets/breast_tumor.csv')
# Take a glimpse of the first five rows (the default for head()) as a quick sanity check.
data.head(n=5)


# Separate tumor targets from tumor data
# Columns that are not predictors: the 'Unnamed: 32' column (NaN in the
# previewed rows), the record id, and the diagnosis label itself.
drop_cols = ['Unnamed: 32', 'id', 'diagnosis']
# The diagnosis labels become the target ...
target = data.loc[:, 'diagnosis']
# ... and every remaining column is kept as an independent variable.
indep_variable = data.drop(columns=drop_cols)
indep_variable.head()


# Plot Diagnosis Distributions
# The target contains the diagnosis with binary class labels, M or B, for malignant and benign tumors respectively.
# The sns.countplot function shows the counts of observations in each categorical bin using bars

# Pass the series through the `x` keyword: from seaborn 0.12 onward `data` is
# the only argument that may be given positionally.
ax = sns.countplot(x=target, label="Count")
# Look the class counts up by label instead of relying on the frequency
# ordering of value_counts(), so B and M can never be swapped.
B, M = target.value_counts().loc[['B', 'M']]

# Looks like a class imbalanced problem

/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


# Visualizing Standardized Data with Seaborn

# The feature columns take on values of very different ranges, so z-score each
# one (subtract the column mean, divide by the column standard deviation)
# before any further analysis or visualization.
data_std = (indep_variable - indep_variable.mean()) / indep_variable.std()

# To minimize clutter the features are plotted in batches of ten; this is the
# first batch (columns 0-9). melt() unpivots the wide frame into long format:
# one row per (diagnosis, feature, value) triple.
data = pd.concat([target, data_std.iloc[:, 0:10]], axis='columns').melt(
    id_vars='diagnosis', var_name='features', value_name='value')
plt.figure(figsize=(10, 10))
# Violin plots of the standardized values expose the differences between benign
# and malignant tumors. hue selects the diagnosis column for colour encoding,
# split=True draws one half of each violin per diagnosis, and inner='quart'
# marks the quartiles.
sns.violinplot(data=data, x='features', y='value', hue='diagnosis',
               split=True, inner='quart')
plt.xticks(rotation=45)

# Violin plots are similar to box plots, except that they also show the probability density of the data
# at different values.
# Check whether the median or the variability of a feature differs greatly between malignant and benign tumors.
# Features whose two distributions are well separated are beneficial for classification.

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, 'radius_mean'),
  Text(1, 0, 'texture_mean'),
  Text(2, 0, 'perimeter_mean'),
  Text(3, 0, 'area_mean'),
  Text(4, 0, 'smoothness_mean'),
  Text(5, 0, 'compactness_mean'),
  Text(6, 0, 'concavity_mean'),
  Text(7, 0, 'concave points_mean'),
  Text(8, 0, 'symmetry_mean'),
  Text(9, 0, 'fractal_dimension_mean')])


# Violin Plots and Box Plots

# 11-20 features
# Second batch: standardized columns 10-19, joined with the diagnosis labels
# and unpivoted into long format (one row per diagnosis/feature/value).
data = pd.concat([target, data_std.iloc[:, 10:20]], axis='columns').melt(
    id_vars='diagnosis', var_name='features', value_name='value')
plt.figure(figsize=(10, 10))
# hue colours the two halves of each split violin by diagnosis.
sns.violinplot(data=data, x='features', y='value', hue='diagnosis',
               split=True, inner='quart')
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, 'radius_se'),
  Text(1, 0, 'texture_se'),
  Text(2, 0, 'perimeter_se'),
  Text(3, 0, 'area_se'),
  Text(4, 0, 'smoothness_se'),
  Text(5, 0, 'compactness_se'),
  Text(6, 0, 'concavity_se'),
  Text(7, 0, 'concave points_se'),
  Text(8, 0, 'symmetry_se'),
  Text(9, 0, 'fractal_dimension_se')])


# Boxplots
# 11-20 features
# Rebuild the long-format frame for standardized columns 10-19; the box plot
# further down reads this same `data` frame.
data = pd.concat([target, data_std.iloc[:, 10:20]], axis='columns').melt(
    id_vars='diagnosis', var_name='features', value_name='value')
plt.figure(figsize=(10, 10))
# NOTE(review): despite the "Boxplots" heading this draws the same split violin
# plot as the previous cell - confirm whether the repetition is intended.
sns.violinplot(data=data, x='features', y='value', hue='diagnosis',
               split=True, inner='quart')
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, 'radius_se'),
  Text(1, 0, 'texture_se'),
  Text(2, 0, 'perimeter_se'),
  Text(3, 0, 'area_se'),
  Text(4, 0, 'smoothness_se'),
  Text(5, 0, 'compactness_se'),
  Text(6, 0, 'concavity_se'),
  Text(7, 0, 'concave points_se'),
  Text(8, 0, 'symmetry_se'),
  Text(9, 0, 'fractal_dimension_se')])


# Box plots are especially useful in identifying outliers in the data.

# Start a fresh 10x10-inch figure for the box plot.
plt.figure(figsize=(10, 10))
# One pair of boxes per feature, coloured by diagnosis via hue; `data` is the
# long-format frame built for the preceding violin plot.
sns.boxplot(data=data, x='features', y='value', hue='diagnosis')
# Rotate the feature names so the tick labels do not overlap.
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, 'radius_se'),
  Text(1, 0, 'texture_se'),
  Text(2, 0, 'perimeter_se'),
  Text(3, 0, 'area_se'),
  Text(4, 0, 'smoothness_se'),
  Text(5, 0, 'compactness_se'),
  Text(6, 0, 'concavity_se'),
  Text(7, 0, 'concave points_se'),
  Text(8, 0, 'symmetry_se'),
  Text(9, 0, 'fractal_dimension_se')])


# Using Joint Plots for Feature Comparison
# Joint plots come in handy to illustrate the relationship between two features.

# Pass both series through the `x` / `y` keywords: from seaborn 0.12 onward
# `data` is the only argument that may be given positionally.
# kind='reg' adds a regression fit; scatter_kws shrinks the scatter markers.
sns.jointplot(x=indep_variable.loc[:, 'concavity_worst'],
              y=indep_variable.loc[:, 'concave points_worst'],
              kind='reg', scatter_kws={"s": 5})
# This draws a scatter plot with a regression fit, together with marginal histograms and kernel density fits.
# The strength of the linear relationship between any two features can be summarised by the Pearson
# correlation coefficient of the regression through our scatter plot.

/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<seaborn.axisgrid.JointGrid at 0x7ff869386610>


# Observing the Distribution of Values and their Variance with Swarm Plots

# We have learned that violin plots are a great tool for visualizing sparse distributions. As our data set contains close to 600 rows,
# we might want to simply display each point in the same visualization.

# Aesthetically pleasing
# Switch to a white-grid theme with the muted palette for nicer-looking plots.
sns.set(style='whitegrid', palette='muted')

# Reuse the standardized data: first batch (columns 0-9) joined with the
# diagnosis labels and unpivoted into long format.
data = pd.concat([target, data_std.iloc[:, 0:10]], axis='columns').melt(
    id_vars='diagnosis', var_name='features', value_name='value')
plt.figure(figsize=(10, 10))

# Draw every observation as its own small marker (s=1.8), coloured by diagnosis.
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis', s=1.8)
plt.xticks(rotation=45)
# The swarm plot gives a clear indication of how well each feature separates the two classes,
# while still giving some representation of the underlying distribution.

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, 'radius_mean'),
  Text(1, 0, 'texture_mean'),
  Text(2, 0, 'perimeter_mean'),
  Text(3, 0, 'area_mean'),
  Text(4, 0, 'smoothness_mean'),
  Text(5, 0, 'compactness_mean'),
  Text(6, 0, 'concavity_mean'),
  Text(7, 0, 'concave points_mean'),
  Text(8, 0, 'symmetry_mean'),
  Text(9, 0, 'fractal_dimension_mean')])


# Second batch: standardized columns 10-19 in long format.
data = pd.concat([target, data_std.iloc[:, 10:20]], axis='columns').melt(
    id_vars='diagnosis', var_name='features', value_name='value')
plt.figure(figsize=(10, 10))
# hue colours each point by diagnosis.
# NOTE(review): the recorded run warned that 21.3% of the points could not be
# placed at this marker size - consider smaller markers or a stripplot.
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis', s=1.8)
plt.xticks(rotation=45)

/Users/michiganboy/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 21.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, 'radius_se'),
  Text(1, 0, 'texture_se'),
  Text(2, 0, 'perimeter_se'),
  Text(3, 0, 'area_se'),
  Text(4, 0, 'smoothness_se'),
  Text(5, 0, 'compactness_se'),
  Text(6, 0, 'concavity_se'),
  Text(7, 0, 'concave points_se'),
  Text(8, 0, 'symmetry_se'),
  Text(9, 0, 'fractal_dimension_se')])


# NOTE(review): this selects columns 0-9 again, repeating the first swarm plot;
# the remaining batch (columns 20-29) is never plotted - confirm the intent.
data = pd.concat([target, data_std.iloc[:, 0:10]], axis='columns').melt(
    id_vars='diagnosis', var_name='features', value_name='value')
plt.figure(figsize=(10, 10))
# hue colours each point by diagnosis.
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis', s=1.8)
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, 'radius_mean'),
  Text(1, 0, 'texture_mean'),
  Text(2, 0, 'perimeter_mean'),
  Text(3, 0, 'area_mean'),
  Text(4, 0, 'smoothness_mean'),
  Text(5, 0, 'compactness_mean'),
  Text(6, 0, 'concavity_mean'),
  Text(7, 0, 'concave points_mean'),
  Text(8, 0, 'symmetry_mean'),
  Text(9, 0, 'fractal_dimension_mean')])


# Observing all Pair-wise Correlations
# A good way to identify correlations between features is to visualize the correlation matrix as a heatmap. 
# We use the sns.heatmap function to visualize all pair-wise correlations as a color-encoded grid
# Such a grid is typically named heatmap since the color scale can distinguish strong and weak correlations.
# Use a large 15x15-inch canvas so the annotated cells stay legible.
f, ax = plt.subplots(figsize=(15, 15))
# annot=True writes each pair-wise correlation into its cell, formatted to one
# decimal by fmt. `linewidths` is the documented heatmap parameter for the
# width of the lines that divide the cells.
sns.heatmap(indep_variable.corr(), annot=True,
            linewidths=.6, fmt='.1f', ax=ax)
# Diagonal cells are 1. Pairwise correlation for the rest.
# Should we combine several features? Shall we drop one of the two correlated features?

<AxesSubplot:>

	id	diagnosis	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	...	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst	Unnamed: 32
0	842302	M	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	...	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890	NaN
1	842517	M	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	...	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902	NaN
2	84300903	M	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	...	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758	NaN
3	84348301	M	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	...	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300	NaN
4	84358402	M	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	...	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678	NaN

	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	symmetry_mean	fractal_dimension_mean	...	radius_worst	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

Advanced Seaborn (Optional)

Tumor Diagnosis Exploratory Data Analysis in Seaborn