Skip to content
Snippets Groups Projects
Commit 26474c36 authored by François  LAURENT's avatar François LAURENT
Browse files

sklearn material update

parent 877264cf
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:b955dcc3 tags:
<img alt="https://allisonhorst.github.io/palmerpenguins/" src="https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/man/figures/lter_penguins.png" width=60% />
%% Cell type:code id:416c5291 tags:
``` python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
```
%% Cell type:code id:9d9ba356 tags:
``` python
# Load the Palmer penguins table from a URL pinned to a specific commit,
# so the downloaded data cannot change under the notebook.
penguins = pd.read_csv(
    "https://github.com/allisonhorst/palmerpenguins/raw/5b5891f01b52ae26ad8cb9755ec93672f49328a8/data/penguins_size.csv"
)
# Display the first rows as the cell output.
penguins.head()
```
%% Output
species_short island culmen_length_mm culmen_depth_mm \
0 Adelie Torgersen 39.1 18.7
1 Adelie Torgersen 39.5 17.4
2 Adelie Torgersen 40.3 18.0
3 Adelie Torgersen NaN NaN
4 Adelie Torgersen 36.7 19.3
flipper_length_mm body_mass_g sex
0 181.0 3750.0 MALE
1 186.0 3800.0 FEMALE
2 195.0 3250.0 FEMALE
3 NaN NaN NaN
4 193.0 3450.0 FEMALE
%% Cell type:markdown id:e1a82ffe tags:
# PCA
## Q
Remove the undefined values and get the sample size for each species.
%% Cell type:markdown id:623d14fd tags:
## A
%% Cell type:code id:b2e86638 tags:
``` python
```
%% Cell type:markdown id:fc6e86fd tags:
## Q
Plot a `pairplot` so that each species can be distinguished by a different color.
%% Cell type:markdown id:b0b71cf0 tags:
## A
%% Cell type:code id:82fef377 tags:
``` python
```
%% Cell type:markdown id:3ac3648c tags:
## Q
We will focus on the continuous variables only, and scale them.
Pick the continuous variables, standardize their values and perform a PCA on the scaled data, with as many components as possible.
%% Cell type:code id:1722dc49 tags:
``` python
from sklearn.preprocessing import StandardScaler

# Keep only the four continuous measurements, as a plain NumPy array.
penguin_data = penguins.loc[:, [
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]].to_numpy()

# Standardize each column before running PCA on it.
scaled_penguin_data = StandardScaler().fit_transform(penguin_data)
```
%% Cell type:markdown id:59d834f0 tags:
Perform a PCA on the scaled data, with all 4 components, and draw a scree plot to choose a number of principal components.
Draw a scree plot to choose a number of principal components.
%% Cell type:markdown id:6aa45291 tags:
## A
%% Cell type:code id:6c71cd2f tags:
%% Cell type:code id:1722dc49 tags:
``` python
```
%% Cell type:markdown id:6fbc37aa tags:
## Q
Perform a new PCA with the number of principal components you chose, project the data onto the principal axes, and plot the projected data, representing each species with a different color (hint: look at [this plotting function](https://plotly.github.io/plotly.py-docs/generated/plotly.express.scatter_3d.html)).
%% Cell type:markdown id:4fe3999e tags:
## A
%% Cell type:code id:3782d970 tags:
``` python
```
%% Cell type:markdown id:10f75594 tags:
# UMAP
%% Cell type:markdown id:398dd715 tags:
## Q
Play around with UMAP and the scaled penguin data.
Plot a 2D UMAP projection of the scaled penguin data (continuous variables, one penguin = one dot), and color the dots by species.
%% Cell type:markdown id:9f71db23 tags:
## A
%% Cell type:code id:fd18eba0 tags:
``` python
```
%% Cell type:markdown id:4f02b5f3-7dc6-4d1f-99ee-903acd622445 tags:
## Q
Adelie and Chinstrap penguins look more difficult to separate. To quantify how well separated the two corresponding clouds of points are, we can use the silhouette score applied to the projected data for Adelie and Chinstrap penguins only.
%% Cell type:code id:6b062178-f700-46db-bd23-9cd592fb75f9 tags:
``` python
from sklearn.metrics import silhouette_score
```
%% Cell type:markdown id:2741e1a7-dcfd-4753-b544-82a0068af462 tags:
Find the combination of UMAP parameter values (number of neighbors and minimum distance) that maximizes the above score, using a grid search with neighbor counts 2, 3, 5, 10, 20, 30, 50, 100 and minimum distances 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0.
%% Cell type:markdown id:32ce12af-d202-4104-b51b-3b1be08e0e73 tags:
## A
%% Cell type:code id:a2193b0b-5009-4269-bf7d-c25af101c4a3 tags:
``` python
```
......
Source diff could not be displayed: it is too large. Options to address this: view the blob.
Source diff could not be displayed: it is too large. Options to address this: view the blob.
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import decomposition
from scipy import stats
def illustration_pca_transform(X, origin, unit_axes, axes_norm):
    """Display a 3D point cloud with its principal axes and the x/y/z axes.

    The observations are drawn as small markers. For each of the first three
    entries of ``unit_axes``, an arrow in the default color (red) starts at
    ``origin`` and follows that entry; next to it, a green arrow starts at
    the zero vector and follows the matching canonical axis. Every arrow is
    scaled by a factor of 5. The figure is shown with ``fig.show()`` and
    nothing is returned.

    Parameters
    ----------
    X : array
        Observations; columns 0, 1 and 2 are used as x, y and z.
    origin : array
        Start point of the principal-axis arrows.
    unit_axes : array
        ``unit_axes[i]`` is the direction of the i-th principal axis.
    axes_norm
        Accepted for interface compatibility; not used by this function.
    """

    def arrow3d(name, origin, vec, color='red', relative_cone_height=.2):
        """Return the [line, cone] traces of an arrow from origin to origin + vec."""
        # The shaft stops short of the tip to leave room for the cone.
        shaft_end = origin + (1 - relative_cone_height) * vec
        shaft = np.stack((origin, shaft_end), axis=0)
        shaft_trace = go.Scatter3d(
            name=name,
            x=shaft[:, 0],
            y=shaft[:, 1],
            z=shaft[:, 2],
            mode='lines',
            line={'color': color},
            showlegend=False,
        )

        tip = origin + vec
        head = relative_cone_height * vec
        head_trace = go.Cone(
            name=name,
            x=[tip[0]],
            y=[tip[1]],
            z=[tip[2]],
            u=[head[0]],
            v=[head[1]],
            w=[head[2]],
            anchor='tip',
            sizeref=1,
            # A two-stop scale with the same color yields a uniformly colored cone.
            colorscale=[(0, color), (1, color)],
            showscale=False,
        )
        return [shaft_trace, head_trace]

    fig = go.Figure()
    observations = go.Scatter3d(
        name='observation',
        x=X[:, 0],
        y=X[:, 1],
        z=X[:, 2],
        mode='markers',
        marker={'size': 1.5, 'line': {'width': 1}},
        showlegend=False,
    )
    fig.add_trace(observations)

    for axis_index, axis_name in enumerate(['first', 'second', 'third']):
        principal_arrow = arrow3d(
            f'{axis_name} principal axis',
            origin,
            unit_axes[axis_index] * 5,
        )
        canonical_arrow = arrow3d(
            'xyz'[axis_index],
            np.zeros_like(origin),
            np.eye(3)[axis_index] * 5,
            'green',
        )
        for trace in principal_arrow + canonical_arrow:
            fig.add_trace(trace)

    fig.show()
def illustration_double_scatter3d(
    X,
    y,
    axis_names=('first PC', 'second PC', 'third PC'),
    labels=(('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)),
):
    """Show the same 3D point cloud twice: unlabelled (left) and per group (right).

    The left panel holds a single trace named 'All' with every row of ``X``.
    The right panel holds one trace per ``(name, value)`` pair in ``labels``,
    containing the rows of ``X`` where ``y == value``. Both panels share the
    same axis titles. The figure is shown with ``fig.show()`` and nothing is
    returned.

    Parameters
    ----------
    X : array
        Points to plot; columns 0, 1 and 2 are used as x, y and z.
    y : array
        Group value of each row of ``X``; compared element-wise with ``==``.
    axis_names : sequence of str
        Titles of the x, y and z axes, in that order.
    labels : iterable of (str, value)
        Legend name and matching value of ``y`` for each group. The default
        is an immutable tuple so it cannot be altered between calls.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'is_3d': True}, {'is_3d': True}]])

    # with group labels
    for name, species_y in labels:
        species_data = X[y == species_y, :]
        fig.add_trace(
            go.Scatter3d(
                name=name,
                x=species_data[:, 0],
                y=species_data[:, 1],
                z=species_data[:, 2],
                mode='markers',
                marker=dict(size=2, line=dict(width=1)),
            ),
            row=1,
            col=2,
        )

    # without group labels
    fig.add_trace(
        go.Scatter3d(
            name='All',
            x=X[:, 0],
            y=X[:, 1],
            z=X[:, 2],
            mode='markers',
            marker=dict(size=2, line=dict(width=1)),
        ),
        row=1,
        col=1,
    )

    # The two 3D subplots are laid out as `scene` and `scene2`; title both in
    # one update rather than setting the first scene twice.
    scene_titles = {
        f'{axis}axis_title': axis_name
        for axis, axis_name in zip('xyz', axis_names)
    }
    fig.update_layout(scene=scene_titles, scene2=dict(scene_titles))

    fig.show()
def illustration_4D_pca_weights(pca, Pdf):
    """Draw, side by side, the feature weights of the first four principal axes.

    Panel ``j`` is a horizontal strip plot of ``pca.components_[j, :]``
    against ``Pdf.columns``, with a dotted vertical line at 0 and
    ``Pdf.index[j]`` as x-axis label. Only the first panel keeps its y tick
    labels.

    Parameters
    ----------
    pca
        Fitted object exposing ``components_`` with at least four rows.
    Pdf
        Object exposing ``columns`` and ``index`` -- presumably a pandas
        DataFrame; confirm against the caller.
    """
    _, axes = plt.subplots(1, 4)
    for j, ax in enumerate(axes):
        sns.stripplot(
            x=pca.components_[j, :],
            y=Pdf.columns,
            ax=ax,
            size=10,
            orient="h",
            jitter=False,
            linewidth=1,
            edgecolor="w",
        )
        if j:
            # The y labels are identical across panels; show them once.
            ax.set_yticklabels([])
        ax.yaxis.grid(True)
        ax.axvline(0, color='k', linestyle=':', linewidth=1)
        ax.set_xlabel(Pdf.index[j])
def scree_plot(X):
    """Draw a scree plot of a full PCA of ``X`` on the current matplotlib axes.

    The left y axis (blue) shows the explained variance of each component,
    numbered from 1. The right y axis (green) shows the cumulated explained
    variance ratio, starting from 0 at abscissa 0.

    Parameters
    ----------
    X : array-like
        Data passed as is to ``decomposition.PCA().fit``.
    """
    pca = decomposition.PCA()
    pca.fit(X)
    eigenvalues = pca.explained_variance_

    # Count the components actually fitted instead of the columns of X: PCA
    # keeps min(n_samples, n_features) components, so the two differ when
    # there are fewer samples than features.
    component_ranks = np.arange(1, len(eigenvalues) + 1)

    ax_left = plt.gca()
    ax_left.plot(component_ranks, eigenvalues, 'b-')
    # Raw string: a backslash followed by "l" is not a valid escape sequence.
    ax_left.set_ylabel(r'$\lambda$', color='b')
    ax_left.tick_params(axis='y', colors='b')

    cumulated_ratio = np.cumsum(eigenvalues) / np.sum(eigenvalues)
    ax_right = ax_left.twinx()
    # Prepend 0 so the curve starts at "no component, no variance explained".
    ax_right.plot(np.r_[0, cumulated_ratio], 'g-')
    ax_right.set_ylabel('cumulated explained variance ratio', color='g')
    ax_right.tick_params(axis='y', colors='g')

    ax_left.set_xlabel('component')
def illustration_probabilistic_pca(Xdf):
    """Plot two feature pairs with a Gaussian fitted to each.

    A 2x2 grid is created. Row 1 plots 'sepal length (cm)' against
    'petal width (cm)', row 2 plots 'petal length (cm)' against
    'petal width (cm)'. Every panel of a row gets the scatter plot; the last
    panel of the row additionally gets the sample mean (red cross) and four
    contour levels of the bivariate normal density built from the sample
    mean and covariance.

    NOTE(review): the nesting was reconstructed from a listing whose
    indentation was lost; confirm that the overlay belongs to the last panel
    of each row only.

    Parameters
    ----------
    Xdf
        Table holding the three columns named above; it is passed to seaborn
        as ``data`` and indexed by a list of column names.
    """
    target_name = 'petal width (cm)'
    _, axes = plt.subplots(2, 2, figsize=(13.3, 8.2))

    for axes_row, feature_name in zip(axes, ('sepal length (cm)', 'petal length (cm)')):
        for ax in axes_row:
            sns.scatterplot(x=feature_name, y=target_name, data=Xdf, ax=ax)

        # From here on, `ax` is the last panel of the row.
        X_ = Xdf[[feature_name, target_name]].values
        center = X_.mean(axis=0)
        ax.plot(*center, 'r+')

        # Sample the density on a regular grid covering the visible area,
        # with about 100 steps along the shorter side.
        (x_low, x_high), (y_low, y_high) = ax.get_xlim(), ax.get_ylim()
        step = min(x_high - x_low, y_high - y_low) / 100
        x_grid, y_grid = np.meshgrid(
            np.arange(x_low, x_high, step),
            np.arange(y_low, y_high, step),
        )
        grid = np.column_stack((x_grid.ravel(), y_grid.ravel()))

        gaussian = stats.multivariate_normal(center, np.cov(X_.T))
        z_grid = gaussian.pdf(grid).reshape(x_grid.shape)
        ax.contour(x_grid, y_grid, z_grid, 4)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment