Scikit-Learn overview#

import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import seaborn as sns

Classification#

The task of assigning large amounts of given data to categories (classes) that humans have defined in advance.

Here we demonstrate an SVM (support vector machine).

Set the pseudo-random number seed

SEED = 42

np.random.seed(SEED)

Load the data

df1 = load_iris(as_frame=True)["frame"]

print("---"*10,"raw data","---"*30)
display(df1.head(10))
print("---"*10,"info","---"*30)
print(df1.info())
print("---"*10,"describe","---"*30)
print(df1.describe())
------------------------------ raw data ------------------------------------------------------------------------------------------
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  target
0                5.1               3.5                1.4               0.2       0
1                4.9               3.0                1.4               0.2       0
2                4.7               3.2                1.3               0.2       0
3                4.6               3.1                1.5               0.2       0
4                5.0               3.6                1.4               0.2       0
5                5.4               3.9                1.7               0.4       0
6                4.6               3.4                1.4               0.3       0
7                5.0               3.4                1.5               0.2       0
8                4.4               2.9                1.4               0.2       0
9                4.9               3.1                1.5               0.1       0
------------------------------ info ------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
None
------------------------------ describe ------------------------------------------------------------------------------------------
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  

Split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(
    df1.iloc[:,:4], df1["target"],
    stratify=df1["target"],
    test_size=0.3,
    shuffle=True,
    )
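
Note that because random_state is not passed here, the split depends on NumPy's global random state (seeded above). A variant that fixes the split explicitly, which is the more common idiom:

# Passing random_state makes the split reproducible on its own,
# independent of the global NumPy seed
X_train, X_test, y_train, y_test = train_test_split(
    df1.iloc[:, :4], df1["target"],
    stratify=df1["target"],
    test_size=0.3,
    shuffle=True,
    random_state=SEED,
    )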

Initialize and train the support vector machine

classifier = SVC()
classifier.fit(X_train,y_train)
SVC()
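SVC() here runs with scikit-learn's default hyperparameters. Writing them out makes the model explicit; a sketch equivalent to the cell above (kernel, C, and gamma shown at their defaults in recent scikit-learn versions):

# Explicit defaults: RBF kernel, regularization C=1.0, gamma="scale"
classifier = SVC(kernel="rbf", C=1.0, gamma="scale")
classifier.fit(X_train, y_train)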

Show the predictions for the test data

predicted_class_label = classifier.predict(X_test)
predicted_class_label
array([2, 1, 2, 1, 2, 2, 1, 1, 0, 2, 0, 0, 2, 2, 0, 2, 1, 0, 0, 0, 1, 0,
       1, 2, 2, 1, 1, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, 2, 1, 0, 1, 2,
       1])

Visualize the results

results1 = X_test.copy()
results1["predicted_class_label"] = predicted_class_label
sns.pairplot(results1, hue="predicted_class_label")
<seaborn.axisgrid.PairGrid at 0x16ff28b90>
[Figure: pairplot of the test-set features, colored by predicted_class_label]
results1["true_class_label"] = y_test
results1.head(10)
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  predicted_class_label  true_class_label
107                7.3               2.9                6.3               1.8                      2                 2
 63                6.1               2.9                4.7               1.4                      1                 1
133                6.3               2.8                5.1               1.5                      2                 2
 56                6.3               3.3                4.7               1.6                      1                 1
127                6.1               3.0                4.9               1.8                      2                 2
140                6.7               3.1                5.6               2.4                      2                 2
 53                5.5               2.3                4.0               1.3                      1                 1
 69                5.6               2.5                3.9               1.1                      1                 1
 20                5.4               3.4                1.7               0.2                      0                 0
141                6.9               3.1                5.1               2.3                      2                 2
classifier.score(X_test,y_test)
0.9555555555555556
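
For classifiers, score returns the mean accuracy on the given data. The same number, plus a per-class breakdown, can be obtained from sklearn.metrics; a quick sketch reusing the predictions above:

from sklearn.metrics import accuracy_score, confusion_matrix

# Identical to classifier.score(X_test, y_test)
print(accuracy_score(y_test, predicted_class_label))
# Rows are true classes, columns are predicted classes
print(confusion_matrix(y_test, predicted_class_label))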

Regression#

The task of predicting the numerical values that correspond to given data.

Here we demonstrate a random forest.

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_diabetes
df2 = load_diabetes(as_frame=True)["frame"]
print("---"*10,"raw data","---"*30)
display(df2.head(10))
print("---"*10,"info","---"*30)
print(df2.info())
print("---"*10,"describe","---"*30)
print(df2.describe())
------------------------------ raw data ------------------------------------------------------------------------------------------
        age       sex       bmi        bp        s1        s2        s3        s4        s5        s6  target
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401 -0.002592  0.019907 -0.017646   151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412 -0.039493 -0.068332 -0.092204    75.0
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592  0.002861 -0.025930   141.0
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038  0.034309  0.022688 -0.009362   206.0
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142 -0.002592 -0.031988 -0.046641   135.0
5 -0.092695 -0.044642 -0.040696 -0.019442 -0.068991 -0.079288  0.041277 -0.076395 -0.041176 -0.096346    97.0
6 -0.045472  0.050680 -0.047163 -0.015999 -0.040096 -0.024800  0.000779 -0.039493 -0.062917 -0.038357   138.0
7  0.063504  0.050680 -0.001895  0.066629  0.090620  0.108914  0.022869  0.017703 -0.035816  0.003064    63.0
8  0.041708  0.050680  0.061696 -0.040099 -0.013953  0.006202 -0.028674 -0.002592 -0.014960  0.011349   110.0
9 -0.070900 -0.044642  0.039062 -0.033213 -0.012577 -0.034508 -0.024993 -0.002592  0.067737 -0.013504   310.0
------------------------------ info ------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB
None
------------------------------ describe ------------------------------------------------------------------------------------------
                age           sex           bmi            bp            s1  \
count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   
mean  -2.511817e-19  1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17   
std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   
min   -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123988e-01 -1.267807e-01   
25%   -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665608e-02 -3.424784e-02   
50%    5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670422e-03 -4.320866e-03   
75%    3.807591e-02  5.068012e-02  3.124802e-02  3.564379e-02  2.835801e-02   
max    1.107267e-01  5.068012e-02  1.705552e-01  1.320436e-01  1.539137e-01   

                 s2            s3            s4            s5            s6  \
count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   
mean   3.918434e-17 -5.777179e-18 -9.042540e-18  9.293722e-17  1.130318e-17   
std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   
min   -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260971e-01 -1.377672e-01   
25%   -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324559e-02 -3.317903e-02   
50%   -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947171e-03 -1.077698e-03   
75%    2.984439e-02  2.931150e-02  3.430886e-02  3.243232e-02  2.791705e-02   
max    1.987880e-01  1.811791e-01  1.852344e-01  1.335973e-01  1.356118e-01   

           target  
count  442.000000  
mean   152.133484  
std     77.093005  
min     25.000000  
25%     87.000000  
50%    140.500000  
75%    211.500000  
max    346.000000  
X_train2,X_test2,y_train2,y_test2 = train_test_split(
    df2.iloc[:,:-1], df2["target"],
    test_size=0.3,
    shuffle=True,
    )
regressor = RandomForestRegressor()
regressor.fit(X_train2,y_train2)
RandomForestRegressor()
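RandomForestRegressor() also runs with its defaults (100 trees, unlimited tree depth in recent scikit-learn versions). The main hyperparameters can be set explicitly; a sketch that additionally fixes the bootstrap randomness:

# n_estimators and max_depth shown at their default values;
# random_state makes the ensemble reproducible
regressor = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=SEED)
regressor.fit(X_train2, y_train2)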

Predictions

predicted_value = regressor.predict(X_test2)
predicted_value
array([130.75,  89.33, 148.52, 168.82,  90.81, 173.92, 115.08,  97.47,
       249.04, 253.01, 171.58, 104.28, 184.22,  93.06, 112.24, 279.37,
       109.33, 121.46,  94.48, 188.71, 106.02, 219.32,  98.57, 128.64,
       253.92, 188.17, 148.01, 226.34,  87.36,  98.86, 165.6 , 199.5 ,
       111.83, 138.82, 167.41,  98.5 , 103.2 , 107.83, 149.75, 283.  ,
       247.3 , 225.36, 186.03, 198.68, 188.52, 175.25, 144.57,  85.44,
       107.83,  89.09, 215.06, 175.73, 194.03, 190.17,  90.92,  73.63,
       148.88,  86.92,  84.84, 108.3 ,  92.55, 108.25, 156.03, 165.02,
       240.1 , 224.75, 123.68, 225.14, 103.08, 107.86, 189.89, 269.44,
       127.77, 100.59, 240.83, 151.35,  93.61, 175.42, 225.9 ,  98.64,
       169.71, 195.16, 112.2 , 102.33,  73.17, 198.85, 109.89, 202.56,
       187.47, 197.38, 150.23,  69.78, 155.9 , 227.56, 131.47, 168.91,
       183.24, 177.13, 132.77, 161.82,  93.27, 246.47, 182.01,  92.69,
       153.69, 148.82, 157.84, 144.17, 154.28,  96.33, 215.61, 226.44,
        68.31,  82.83, 149.05, 257.22, 109.39,  97.14, 110.33, 149.31,
        98.12, 151.62, 215.8 , 207.76, 225.17,  74.94, 190.76,  99.67,
       173.43,  76.58, 222.36, 103.33, 204.19])

Visualize the results

fig = plt.figure()
fig.suptitle("Results of this regression task")
ax = fig.add_subplot(111)
ax.plot(np.arange(len(y_test2)),y_test2,label="true values")
ax.plot(np.arange(len(y_test2)),predicted_value, label="predicted values")
ax.legend()
plt.show()
[Figure: "Results of this regression task" — true values and predicted values plotted over the test samples]
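
A predicted-vs-true scatter (a parity plot) is another common way to read off regression quality: points on the diagonal are perfect predictions. A minimal sketch:

fig, ax = plt.subplots()
ax.scatter(y_test2, predicted_value)
# The y = x line marks perfect agreement
lims = [y_test2.min(), y_test2.max()]
ax.plot(lims, lims, linestyle="--", color="gray")
ax.set_xlabel("true values")
ax.set_ylabel("predicted values")
plt.show()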
results2 = X_test2.copy()
results2["predicted_value"] = predicted_value
results2["true_value"] = y_test2
results2.head(10)
          age       sex       bmi        bp        s1        s2        s3        s4        s5        s6  predicted_value  true_value
183  0.045341  0.050680 -0.035307  0.063187 -0.004321 -0.001627 -0.010266 -0.002592  0.015568  0.056912           130.75       185.0
288  0.070769  0.050680 -0.016984  0.021872  0.043837  0.056305  0.037595 -0.002592 -0.070209 -0.017646            89.33        80.0
 54 -0.049105 -0.044642  0.025051  0.008101  0.020446  0.017788  0.052322 -0.039493 -0.041176  0.007207           148.52       182.0
365  0.034443 -0.044642 -0.038540 -0.012556  0.009439  0.005262 -0.006584 -0.002592  0.031193  0.098333           168.82       206.0
136 -0.092695 -0.044642 -0.081653 -0.057313 -0.060735 -0.068014  0.048640 -0.076395 -0.066490 -0.021788            90.81        85.0
 65 -0.045472  0.050680 -0.024529  0.059744  0.005311  0.014970 -0.054446  0.071210  0.042341  0.015491           173.92       163.0
 63 -0.034575 -0.044642 -0.037463 -0.060756  0.020446  0.043466 -0.013948 -0.002592 -0.030748 -0.071494           115.08       128.0
306  0.009016  0.050680 -0.001895  0.021872 -0.038720 -0.024800 -0.006584 -0.039493 -0.039809 -0.013504            97.47        44.0
290  0.059871  0.050680  0.076786  0.025315  0.001183  0.016849 -0.054446  0.034309  0.029935  0.044485           249.04       332.0
254  0.030811  0.050680  0.056307  0.076958  0.049341 -0.012274 -0.036038  0.071210  0.120051  0.090049           253.01       310.0
regressor.score(X_test2,y_test2)
0.47975011322424
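
For regressors, score returns the coefficient of determination R² (1.0 is a perfect fit; 0.0 is no better than always predicting the mean), so the model explains roughly half of the target variance here. The same value and related error metrics are available in sklearn.metrics; a quick sketch:

from sklearn.metrics import r2_score, mean_squared_error

# Identical to regressor.score(X_test2, y_test2)
print(r2_score(y_test2, predicted_value))
# Mean squared error, in squared units of the target
print(mean_squared_error(y_test2, predicted_value))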

Clustering#

The task of dividing given data into several groups. The groups found this way are called clusters.

Here we demonstrate k-means.

from sklearn.cluster import KMeans
clustering = KMeans(n_clusters=2, n_init="auto")
clustering.fit(X_train)
KMeans(n_clusters=2)
clustering.transform(X_train)[:10]
array([[1.92048332, 2.42265007],
       [3.64172282, 0.8523378 ],
       [0.41690734, 4.03925862],
       [5.15481766, 1.25135927],
       [2.92431009, 1.1437106 ],
       [4.55075458, 0.75495562],
       [0.35501261, 4.07961902],
       [0.42548085, 3.9903091 ],
       [0.78770451, 4.32085791],
       [5.2050649 , 1.28253091]])
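
transform returns each sample's Euclidean distance to the two learned cluster centers (one column per center); predict, used next, simply picks the index of the nearest center. This can be verified against cluster_centers_; a quick check:

# Distances from the first training sample to both centers;
# should reproduce the first row of clustering.transform(X_train)
np.linalg.norm(X_train.iloc[0].to_numpy() - clustering.cluster_centers_, axis=1)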
clustering.predict(X_train)
array([0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0], dtype=int32)
clustering.predict(X_test)
array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1], dtype=int32)
results1.keys()
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'predicted_class_label', 'true_class_label'],
      dtype='object')
results1["cluster"] = clustering.predict(X_test)
sns.pairplot(
    results1[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)','petal width (cm)', "cluster"]],
    hue="cluster",
    )
<seaborn.axisgrid.PairGrid at 0x3720589d0>
[Figure: pairplot of the test-set features, colored by cluster]
results1.head(10)[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'true_class_label',
       'cluster']]
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  true_class_label  cluster
107                7.3               2.9                6.3               1.8                 2        1
 63                6.1               2.9                4.7               1.4                 1        1
133                6.3               2.8                5.1               1.5                 2        1
 56                6.3               3.3                4.7               1.6                 1        1
127                6.1               3.0                4.9               1.8                 2        1
140                6.7               3.1                5.6               2.4                 2        1
 53                5.5               2.3                4.0               1.3                 1        1
 69                5.6               2.5                3.9               1.1                 1        1
 20                5.4               3.4                1.7               0.2                 0        0
141                6.9               3.1                5.1               2.3                 2        1
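
Because n_clusters=2 was chosen by hand even though iris has three classes, classes 1 and 2 mostly collapse into a single cluster above. When labels are unavailable, a common heuristic for picking k is the elbow method on KMeans' inertia_ (the within-cluster sum of squared distances); a minimal sketch:

# Fit k-means for several k and look for the "elbow" in the inertia curve
inertias = []
ks = range(1, 8)
for k in ks:
    km = KMeans(n_clusters=k, n_init="auto", random_state=SEED)
    km.fit(X_train)
    inertias.append(km.inertia_)
plt.plot(list(ks), inertias, marker="o")
plt.xlabel("number of clusters k")
plt.ylabel("inertia")
plt.show()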

Dimensionality reduction#

The task of representing high-dimensional data in a lower-dimensional space while losing as little information as possible.

Here we demonstrate LSI (Latent Semantic Indexing).

from sklearn.decomposition import TruncatedSVD as LSI
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
news = fetch_20newsgroups()
vectorizer = TfidfVectorizer(max_features=2000, stop_words="english")
vectorizer.fit(news.data)
TfidfVectorizer(max_features=2000, stop_words='english')
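As an aside, fitting on and transforming the same corpus can be collapsed into one call; a one-line equivalent of the fit above plus the transform below:

# Learn the vocabulary and IDF weights, then vectorize, in a single pass
X = vectorizer.fit_transform(news.data)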
X = vectorizer.transform(news.data)
decomposer = LSI(n_components=2)
decomposer.fit(X)
TruncatedSVD()
embed = decomposer.transform(X)
news_df = pd.DataFrame(embed)
news_df["class"] = news.target
news_df.head()
          0         1  class
0  0.187086 -0.045149      7
1  0.130343 -0.079269      4
2  0.262456 -0.022060      4
3  0.254814 -0.056732      1
4  0.210019 -0.006634     14
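
How much of the TF-IDF signal survives in just two LSI components can be read from explained_variance_ratio_; a quick check:

# Fraction of the variance captured by each of the two components
print(decomposer.explained_variance_ratio_)
print(decomposer.explained_variance_ratio_.sum())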
# Plot the first 20 documents of each of the 20 newsgroups in the 2-D LSI space
for i in range(20):
    x = embed[:, 0][news.target == i]
    y = embed[:, 1][news.target == i]
    plt.scatter(x[:20], y[:20])
[Figure: 2-D LSI embedding — up to 20 documents from each of the 20 newsgroups, one color per class]
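
The scatter above has no legend, so the 20 colors are hard to tell apart. A variant that labels each group with its newsgroup name (available as news.target_names); a sketch:

fig, ax = plt.subplots(figsize=(8, 6))
for i, name in enumerate(news.target_names):
    mask = news.target == i
    # Plot up to 20 documents per newsgroup, labeled for the legend
    ax.scatter(embed[mask, 0][:20], embed[mask, 1][:20], label=name, s=10)
ax.legend(fontsize=6, ncol=2)
plt.show()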