機械学習(次元圧縮, 行列分解)を利用した可視化

機械学習(次元圧縮, 行列分解)を利用した可視化

import numpy as np
import pandas as pd
import sklearn
import plotly.express as px
import palmerpenguins
from sklearn import decomposition
# Load the Palmer penguins data and drop every row that has a missing value.
palmerpenguins_df = palmerpenguins.load_penguins().dropna()
display(palmerpenguins_df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
palmerpenguins_df.info()
display(palmerpenguins_df.describe())
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 male 2007
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 female 2007
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 female 2007
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 female 2007
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 male 2007
<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
 7   year               333 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 23.4+ KB
None
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
count 333.000000 333.000000 333.000000 333.000000 333.000000
mean 43.992793 17.164865 200.966967 4207.057057 2008.042042
std 5.468668 1.969235 14.015765 805.215802 0.812944
min 32.100000 13.100000 172.000000 2700.000000 2007.000000
25% 39.500000 15.600000 190.000000 3550.000000 2007.000000
50% 44.500000 17.300000 197.000000 4050.000000 2008.000000
75% 48.600000 18.700000 213.000000 4775.000000 2009.000000
max 59.600000 21.500000 231.000000 6300.000000 2009.000000
class Cate2id:
    """Two-way lookup between the distinct values of a column and integer ids.

    Ids are assigned in sorted order of the distinct values, so the same
    input always produces the same mapping regardless of set iteration
    order (which, for strings, can differ between interpreter runs).
    """

    def __init__(self, column):
        """Build both lookup tables from *column*.

        Args:
            column: Any iterable of hashable values (for example a pandas
                Series). Duplicates are collapsed.
        """
        distinct = set(column)
        try:
            ordered = sorted(distinct)
        except TypeError:
            # Values that cannot be compared with each other (mixed types):
            # fall back to a textual key so the order is still reproducible.
            ordered = sorted(
                distinct,
                key=lambda value: (type(value).__name__, repr(value)),
            )
        # id2cate: integer id -> original category value
        self.id2cate = dict(enumerate(ordered))
        # cate2id: original category value -> integer id
        self.cate2id = {category: index for index, category in enumerate(ordered)}

# One encoder per categorical column of the raw data.
island = Cate2id(palmerpenguins_df["island"])
sex = Cate2id(palmerpenguins_df["sex"])
year = Cate2id(palmerpenguins_df["year"])

# Encode on a copy so the raw frame keeps its original category values.
df = palmerpenguins_df.copy()
df["island"] = palmerpenguins_df["island"].map(island.cate2id)
df["sex"] = palmerpenguins_df["sex"].map(sex.cate2id)
df["year"] = palmerpenguins_df["year"].map(year.cate2id)
df
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
0 Adelie 0 39.1 18.7 181.0 3750.0 0 0
1 Adelie 0 39.5 17.4 186.0 3800.0 1 0
2 Adelie 0 40.3 18.0 195.0 3250.0 1 0
4 Adelie 0 36.7 19.3 193.0 3450.0 1 0
5 Adelie 0 39.3 20.6 190.0 3650.0 0 0
... ... ... ... ... ... ... ... ...
339 Chinstrap 2 55.8 19.8 207.0 4000.0 0 2
340 Chinstrap 2 43.5 18.1 202.0 3400.0 1 2
341 Chinstrap 2 49.6 18.2 193.0 3775.0 0 2
342 Chinstrap 2 50.8 19.0 210.0 4100.0 0 2
343 Chinstrap 2 50.2 18.7 198.0 3775.0 1 2

333 rows × 8 columns

from sklearn.preprocessing import MinMaxScaler as Scaler

# Continuous measurements that get rescaled; the integer-encoded categorical
# columns are left untouched.
measurement_columns = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
scaler = Scaler()
scaled_values = scaler.fit_transform(df[measurement_columns])
df[measurement_columns] = scaled_values
df.describe()
island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
count 333.000000 333.000000 333.000000 333.000000 333.000000 333.000000 333.000000
mean 1.228228 0.432465 0.483912 0.490966 0.418627 0.495495 1.042042
std 0.678088 0.198861 0.234433 0.237555 0.223671 0.500732 0.812944
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.269091 0.297619 0.305085 0.236111 0.000000 0.000000
50% 1.000000 0.450909 0.500000 0.423729 0.375000 0.000000 1.000000
75% 2.000000 0.600000 0.666667 0.694915 0.576389 1.000000 2.000000
max 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000 2.000000
from tqdm.auto import tqdm
from umap import UMAP
from sklearn.random_projection import GaussianRandomProjection,SparseRandomProjection
def results_of_decomposers(methods, df):
    """Embed *df* into two dimensions with every estimator class in *methods*.

    Args:
        methods: Iterable of estimator classes. Each one is instantiated as
            ``method(n_components=2)`` and must provide ``fit_transform``.
        df: DataFrame with a ``species`` column. Every column except the
            first is passed to the estimator as a feature (assumes the first
            column is the label column -- confirm against the caller).

    Returns:
        The per-method results concatenated into one DataFrame with the
        columns ``PC1``, ``PC2``, ``species`` and ``decomposion_method``
        (the estimator's class name).
    """
    features = df.iloc[:, 1:]
    # Take the labels as a plain array: the embedding rows follow the
    # positional order of df, while df's index labels may have gaps (e.g.
    # after dropna()). Assigning the Series itself would align on those
    # labels and attach wrong or missing species to the rows.
    species = df.species.to_numpy()

    frames = []
    for method in tqdm(methods):
        decomposer = method(n_components=2)
        embeddings = decomposer.fit_transform(features)
        frames.append(
            pd.DataFrame(
                {
                    "PC1": embeddings[:, 0],
                    "PC2": embeddings[:, 1],
                    "species": species,
                    "decomposion_method": type(decomposer).__name__,
                }
            )
        )
    return pd.concat(frames)

# Every capitalised name exported by sklearn.decomposition, looked up with
# getattr rather than eval so no string is ever executed as code.
methods = [
    getattr(decomposition, name)
    for name in decomposition.__all__
    if name[0].isupper()
]
# SparseCoder is excluded from the comparison
# NOTE(review): presumably because it needs a precomputed dictionary instead
# of accepting n_components alone -- confirm against the scikit-learn docs.
methods.remove(decomposition.SparseCoder)
methods += [UMAP, GaussianRandomProjection, SparseRandomProjection]

embeddings_df = results_of_decomposers(methods, df)
/Users/mriki/workspace/prpy/.venv/lib/python3.11/site-packages/sklearn/decomposition/_nmf.py:2353: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn(
from IPython.display import display, Markdown
def show_plot(decomposion_method):
    """Show a Markdown heading and the 2D scatter plot for one method.

    Args:
        decomposion_method: Value of the ``decomposion_method`` column of the
            module-level ``embeddings_df`` whose rows should be plotted.
    """
    display(Markdown(f"### {decomposion_method}"))

    # Keep only the rows produced by the requested method.
    is_selected = embeddings_df["decomposion_method"] == decomposion_method
    subset = embeddings_df[is_selected]

    plot_options = dict(
        x="PC1",
        y="PC2",
        color=subset["species"],
        title=f"palmerpenguins: 2D Scatter Plot using {decomposion_method}",
        width=800,
        height=800,
    )
    figure = px.scatter(subset, **plot_options)
    figure.show()

# unique() keeps the order in which the methods first appear in
# embeddings_df, so the sections are rendered in the same order on every
# run (iterating a set of strings gives no such guarantee).
for method in embeddings_df.decomposion_method.unique():
    show_plot(method)

GaussianRandomProjection

FastICA

PCA

MiniBatchNMF

LatentDirichletAllocation

FactorAnalysis

UMAP

SparsePCA

NMF

IncrementalPCA

DictionaryLearning

SparseRandomProjection

MiniBatchDictionaryLearning

TruncatedSVD

KernelPCA

MiniBatchSparsePCA

# Plot each method's embedding without a heading. The per-method subset gets
# its own name so the module-level `df` (the preprocessed penguin data) is
# not overwritten, and unique() keeps a reproducible order of appearance.
for method_name in embeddings_df.decomposion_method.unique():
    method_df = embeddings_df[embeddings_df.decomposion_method == method_name]
    fig = px.scatter(
        method_df,
        x="PC1",
        y="PC2",
        color="species",
        title=f"palmerpenguins: 2D Scatter Plot using {method_name}",
        width=800,
        height=800,
    )
    fig.show()