機械学習(次元圧縮, 行列分解)を利用した可視化
Hint
必要ならば以下をインストール:
pip install palmerpenguins
pip install umap-learn
import numpy as np
import pandas as pd
import sklearn
import plotly.express as px
import palmerpenguins
from sklearn import decomposition
# Load the Palmer Penguins dataset and drop the rows that contain missing values.
palmerpenguins_df = palmerpenguins.load_penguins().dropna()
display(palmerpenguins_df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only renders an extra "None".
palmerpenguins_df.info()
display(palmerpenguins_df.describe())
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | male | 2007 |
<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 species 333 non-null object
1 island 333 non-null object
2 bill_length_mm 333 non-null float64
3 bill_depth_mm 333 non-null float64
4 flipper_length_mm 333 non-null float64
5 body_mass_g 333 non-null float64
6 sex 333 non-null object
7 year 333 non-null int64
dtypes: float64(4), int64(1), object(3)
memory usage: 23.4+ KB
None
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | year | |
---|---|---|---|---|---|
count | 333.000000 | 333.000000 | 333.000000 | 333.000000 | 333.000000 |
mean | 43.992793 | 17.164865 | 200.966967 | 4207.057057 | 2008.042042 |
std | 5.468668 | 1.969235 | 14.015765 | 805.215802 | 0.812944 |
min | 32.100000 | 13.100000 | 172.000000 | 2700.000000 | 2007.000000 |
25% | 39.500000 | 15.600000 | 190.000000 | 3550.000000 | 2007.000000 |
50% | 44.500000 | 17.300000 | 197.000000 | 4050.000000 | 2008.000000 |
75% | 48.600000 | 18.700000 | 213.000000 | 4775.000000 | 2009.000000 |
max | 59.600000 | 21.500000 | 231.000000 | 6300.000000 | 2009.000000 |
class Cate2id:
    """Two-way mapping between the distinct values of a column and integer ids.

    Ids 0, 1, 2, ... are assigned to the distinct values in sorted order, so the
    encoding is the same on every run (iterating a set of strings is not stable
    between interpreter sessions). Values that cannot be ordered against each
    other keep their order of first appearance instead.

    Args:
        column: Iterable of hashable values, e.g. a pandas Series.
    """

    def __init__(self, column):
        # dict.fromkeys removes duplicates while keeping first-appearance order.
        categories = list(dict.fromkeys(column))
        try:
            categories = sorted(categories)
        except TypeError:
            # Mixed, non-comparable values: first-appearance order is still
            # deterministic, so it is kept as the fallback.
            pass
        # id2cate: integer id -> original value.
        self.id2cate = dict(enumerate(categories))
        # cate2id: original value -> integer id.
        self.cate2id = {category: index for index, category in enumerate(categories)}
# One encoder per categorical column, built from the cleaned dataset.
island = Cate2id(palmerpenguins_df.island)
sex = Cate2id(palmerpenguins_df.sex)
year = Cate2id(palmerpenguins_df.year)

# Encode into a copy so palmerpenguins_df keeps its original labels.
df = palmerpenguins_df.copy()
df["island"] = palmerpenguins_df["island"].map(island.cate2id)
df["sex"] = palmerpenguins_df["sex"].map(sex.cate2id)
df["year"] = palmerpenguins_df["year"].map(year.cate2id)
df
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
---|---|---|---|---|---|---|---|---|
0 | Adelie | 0 | 39.1 | 18.7 | 181.0 | 3750.0 | 0 | 0 |
1 | Adelie | 0 | 39.5 | 17.4 | 186.0 | 3800.0 | 1 | 0 |
2 | Adelie | 0 | 40.3 | 18.0 | 195.0 | 3250.0 | 1 | 0 |
4 | Adelie | 0 | 36.7 | 19.3 | 193.0 | 3450.0 | 1 | 0 |
5 | Adelie | 0 | 39.3 | 20.6 | 190.0 | 3650.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
339 | Chinstrap | 2 | 55.8 | 19.8 | 207.0 | 4000.0 | 0 | 2 |
340 | Chinstrap | 2 | 43.5 | 18.1 | 202.0 | 3400.0 | 1 | 2 |
341 | Chinstrap | 2 | 49.6 | 18.2 | 193.0 | 3775.0 | 0 | 2 |
342 | Chinstrap | 2 | 50.8 | 19.0 | 210.0 | 4100.0 | 0 | 2 |
343 | Chinstrap | 2 | 50.2 | 18.7 | 198.0 | 3775.0 | 1 | 2 |
333 rows × 8 columns
from sklearn.preprocessing import MinMaxScaler as Scaler
# Rescale the four body-measurement columns in place with the scaler imported
# above as `Scaler`; the encoded categorical columns are left as they are.
measurement_columns = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
scaler = Scaler()
df[measurement_columns] = scaler.fit_transform(df[measurement_columns])
df.describe()
island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
---|---|---|---|---|---|---|---|
count | 333.000000 | 333.000000 | 333.000000 | 333.000000 | 333.000000 | 333.000000 | 333.000000 |
mean | 1.228228 | 0.432465 | 0.483912 | 0.490966 | 0.418627 | 0.495495 | 1.042042 |
std | 0.678088 | 0.198861 | 0.234433 | 0.237555 | 0.223671 | 0.500732 | 0.812944 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.000000 | 0.269091 | 0.297619 | 0.305085 | 0.236111 | 0.000000 | 0.000000 |
50% | 1.000000 | 0.450909 | 0.500000 | 0.423729 | 0.375000 | 0.000000 | 1.000000 |
75% | 2.000000 | 0.600000 | 0.666667 | 0.694915 | 0.576389 | 1.000000 | 2.000000 |
max | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 |
from tqdm.auto import tqdm
from umap import UMAP
from sklearn.random_projection import GaussianRandomProjection,SparseRandomProjection
def results_of_decomposers(methods, df):
    """Embed ``df`` into two components with every method and stack the results.

    Args:
        methods: Iterable of estimator classes. Each one is instantiated as
            ``method(n_components=2)`` and must provide ``fit_transform``.
        df: DataFrame with a ``species`` column. Every column after the first
            one is passed to ``fit_transform`` as the feature matrix.

    Returns:
        The per-method frames concatenated into one DataFrame with the columns
        ``PC1``, ``PC2``, ``species`` and ``decomposion_method`` (the class
        name of the estimator).
    """
    features = df.iloc[:, 1:]
    # Take the labels by position. Assigning the Series itself would align on
    # index labels, and df's index can have gaps (e.g. after dropna()) while
    # the embedding rows are numbered 0..n-1, which shifts or blanks species.
    species = df["species"].to_numpy()

    frames = []
    for method in tqdm(methods):
        decomposer = method(n_components=2)
        embeddings = decomposer.fit_transform(features)
        frames.append(
            pd.DataFrame(
                {
                    "PC1": embeddings[:, 0],
                    "PC2": embeddings[:, 1],
                    "species": species,
                    "decomposion_method": type(decomposer).__name__,
                }
            )
        )
    return pd.concat(frames)
# Every name exported by sklearn.decomposition that starts with an uppercase
# letter. getattr does the attribute lookup directly, so no code string has to
# be built and evaluated.
methods = [getattr(decomposition, key) for key in decomposition.__all__ if key[0].isupper()]
# NOTE(review): SparseCoder is excluded, presumably because it cannot be
# constructed as method(n_components=2) - confirm against the sklearn docs.
methods.remove(decomposition.SparseCoder)
methods += [UMAP, GaussianRandomProjection, SparseRandomProjection]
embeddings_df = results_of_decomposers(methods, df)
/Users/mriki/workspace/prpy/.venv/lib/python3.11/site-packages/sklearn/decomposition/_nmf.py:2353: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
warnings.warn(
ipywidgetsを使った場合のコード
from ipywidgets import interact, Select, widgets
def show_plot(decomposion_method):
    """Show the 2-D scatter plot of one method's embedding, colored by species.

    Args:
        decomposion_method: Value of the ``decomposion_method`` column of
            ``embeddings_df`` whose rows are plotted.
    """
    is_selected = embeddings_df.decomposion_method == decomposion_method
    selected_df = embeddings_df[is_selected]
    figure = px.scatter(
        selected_df,
        x="PC1",
        y="PC2",
        color=selected_df.species,
        title=f"palmerpenguins: 2D Scatter Plot using {decomposion_method}",
        width=800,
        height=800,
    )
    figure.show()
#def show_plot(decomposion_method):
# figs[decomposion_method].show()
# Sorted so the methods are listed in the same order on every run; the
# iteration order of a set of strings changes between interpreter sessions.
w1 = Select(
    description='method:',
    options=sorted(set(embeddings_df.decomposion_method)),
    rows=10,
)
interact(show_plot, decomposion_method=w1)
from IPython.display import display, Markdown
def show_plot(decomposion_method):
    """Render a Markdown heading, then the 2-D scatter plot for one method.

    Args:
        decomposion_method: Value of the ``decomposion_method`` column of
            ``embeddings_df`` whose rows are plotted.
    """
    heading = Markdown(f"### {decomposion_method}")
    display(heading)

    is_selected = embeddings_df.decomposion_method == decomposion_method
    selected_df = embeddings_df[is_selected]
    figure = px.scatter(
        selected_df,
        x="PC1",
        y="PC2",
        color=selected_df.species,
        title=f"palmerpenguins: 2D Scatter Plot using {decomposion_method}",
        width=800,
        height=800,
    )
    figure.show()
# Sorted so the figures come out in the same order on every run; the
# iteration order of a set of strings changes between interpreter sessions.
for method in sorted(set(embeddings_df.decomposion_method)):
    show_plot(method)
GaussianRandomProjection
FastICA
PCA
MiniBatchNMF
LatentDirichletAllocation
FactorAnalysis
UMAP
SparsePCA
NMF
IncrementalPCA
DictionaryLearning
SparseRandomProjection
MiniBatchDictionaryLearning
TruncatedSVD
KernelPCA
MiniBatchSparsePCA
# Plot every method without going through show_plot.
# Sorted so the figures come out in the same order on every run.
for method_name in sorted(set(embeddings_df.decomposion_method)):
    # Use a dedicated name for the per-method rows: this loop runs at module
    # level, so binding `df` here would overwrite the preprocessed DataFrame
    # that the earlier cells pass to results_of_decomposers.
    method_df = embeddings_df[embeddings_df.decomposion_method == method_name]
    fig = px.scatter(
        method_df,
        x="PC1",
        y="PC2",
        color=method_df.species,
        title=f"palmerpenguins: 2D Scatter Plot using {method_name}",
        width=800,
        height=800,
    )
    fig.show()