728x90

k-means

In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# %matplotlib inline
from sklearn.preprocessing import scale
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# 실루엣 분석 평가 지표 값을 구하기 위한 API 추가
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:

# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

# Wrap iris.data in a DataFrame with explicit column names,
# then preview the first three rows.
iris_df = pd.DataFrame(
    data=iris.data,
    columns=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ],
)
iris_df.head(3)

Out[2]:

	sepal_length	sepal_width	petal_length	petal_width
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2

k-means clustering

In [3]:

# Fit k-means with 3 clusters.
# - n_init is pinned to 10 so the result does not depend on the installed
#   scikit-learn version's default for that parameter.
# - The model is fitted on iris.data (the raw feature array) instead of iris_df,
#   because later cells add columns to iris_df; re-running this cell afterwards
#   would otherwise cluster on those extra columns as well.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=0)
kmeans.fit(iris.data)

Out[3]:

KMeans(n_clusters=3, random_state=0)

In [4]:

print(kmeans.labels_) # cluster label (index of the cluster centroid) that each data point was assigned to

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]

군집화 확인

군집화에서는 보통 비교할 만한 타깃 레이블을 가지고 있지 않습니다.

예시 데이터이기 때문에 타깃 레이블과 비교해봅니다.

In [5]:

# Attach the dataset's target labels and the k-means cluster assignments,
# then count how many rows fall into each (target, cluster) combination.
iris_df = iris_df.assign(target=iris.target, cluster=kmeans.labels_)
iris_result = (
    iris_df
    .groupby(['target', 'cluster'])['sepal_length']
    .count()
)
print(iris_result)

target  cluster
0       1          50
1       0          48
        2           2
2       0          14
        2          36
Name: sepal_length, dtype: int64

군집화 시각화

In [6]:

# The data has four features, so reduce it to two components
# in order to visualize it on a 2-D plane.
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(iris.data)

In [7]:

# Store the first and second PCA components as plotting coordinates,
# then preview the first three rows.
iris_df = iris_df.assign(
    pca_x=pca_transformed[:, 0],
    pca_y=pca_transformed[:, 1],
)
iris_df.head(3)

Out[7]:

	sepal_length	sepal_width	petal_length	petal_width	cluster	pca_x	pca_y
0	5.1	3.5	1.4	0.2	1	-2.684126	0.319397
1	4.9	3.0	1.4	0.2	1	-2.714142	-0.177001
2	4.7	3.2	1.3	0.2	1	-2.888991	-0.144949

In [8]:

# Row index of the points assigned to each cluster value (0, 1, 2).
marker_ind_list = [
    iris_df[iris_df['cluster'] == cluster_id].index
    for cluster_id in range(3)
]
# Keep the original per-cluster names bound for any cell that refers to them.
marker0_ind, marker1_ind, marker2_ind = marker_ind_list
markers = ['o', 's', '^']

# Draw one scatter series per cluster on the (pca_x, pca_y) plane, using the
# markers o, s, ^ and a legend entry so the clusters can be told apart.
for cluster_id, (cluster_index, marker) in enumerate(zip(marker_ind_list, markers)):
    plt.scatter(
        x=iris_df.loc[cluster_index, 'pca_x'],
        y=iris_df.loc[cluster_index, 'pca_y'],
        marker=marker,
        label='Cluster {0}'.format(cluster_id),
    )

# Axis labels and title are identical for every series, so set them once
# after the loop instead of on every iteration.
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('3 Clusters Visualization by 2 PCA Components')
plt.legend()
plt.show()

군집 평가

비지도 학습의 특성상 어떠한 지표도 정확한 성능을 평가하기 어렵습니다.

다만, 대표적인 평가 방법인 실루엣 분석을 알아보았습니다.

실루엣 분석

In [9]:

# Silhouette coefficient of every individual sample in iris,
# computed from the feature array and each sample's cluster label.
score_samples = silhouette_samples(iris.data, iris_df['cluster'])
# Prints the shape of the per-sample coefficient array (the label text is Korean).
print('실루엣 계수 값의 shape', score_samples.shape)

실루엣 계수 값의 shape (150,)

In [10]:

# Add the per-sample silhouette coefficients to iris_df as a new column
iris_df['silhouette_coeff'] = score_samples

In [11]:

# Mean silhouette coefficient over all samples
average_score = silhouette_score(iris.data, iris_df['cluster']) # inputs: the feature data set and the cluster label each row belongs to
# Prints the score with three decimals (the label text is Korean).
print('붓꽃 데이터 세트 평가:{0:.3f}'.format(average_score))
# Preview the first three rows, now including the silhouette_coeff column.
iris_df.head(3)

붓꽃 데이터 세트 평가:0.553

Out[11]:

	sepal_length	sepal_width	petal_length	petal_width	cluster	pca_x	pca_y	silhouette_coeff
0	5.1	3.5	1.4	0.2	1	-2.684126	0.319397	0.852955
1	4.9	3.0	1.4	0.2	1	-2.714142	-0.177001	0.815495
2	4.7	3.2	1.3	0.2	1	-2.888991	-0.144949	0.829315

In [12]:

# Mean silhouette coefficient within each cluster.
iris_df['silhouette_coeff'].groupby(iris_df['cluster']).mean()

Out[12]:

cluster
0    0.417320
1    0.798140
2    0.451105
Name: silhouette_coeff, dtype: float64

클러스터 개수 최적화 방법

클러스터별 평균 실루엣 계수의 시각화를 이용합니다.

references

https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

In [13]:

import os

# Working directory that the following cells run from (the next cell imports
# visualize_silhouette after this chdir).
# The original absolute path only exists on the author's machine, so it can be
# overridden with the PYML_DATA_DIR environment variable; when the variable is
# not set, the original path is used unchanged.
PYML_DATA_DIR = os.environ.get("PYML_DATA_DIR", "C:/Users/KANG/Desktop/pyml/pyml_data/")
os.chdir(PYML_DATA_DIR)

In [14]:

# See the scikit-learn link under "references" in this notebook.
# Import only the name that is used, instead of a wildcard import, so it is
# clear where visualize_silhouette comes from and no other names leak into
# the notebook namespace.
from visualize_silhouette import visualize_silhouette

In [15]:

# Call the imported visualize_silhouette helper with the list [2, 3, 4, 5]
# and the iris feature array.
# NOTE(review): the list is presumably the candidate numbers of clusters to
# compare — confirm against visualize_silhouette.py.
visualize_silhouette([ 2, 3, 4, 5 ], iris.data)

728x90

'Data Analytics with python > [Machine Learning ]' 카테고리의 다른 글

[Clustering] GMM (0)	2023.02.15
[Clustering] MeanShift (0)	2023.02.14
[Dimension Reduction] NMF 변환 (0)	2023.02.13
[Dimension Reduction] SVD 변환 (0)	2023.02.13
[Dimension Reduction] LDA 변환 (0)	2023.02.13

Kang's Note

[Clustering] K-means

'Data Analytics with python > [Machine Learning ]' 카테고리의 다른 글

댓글

티스토리툴바

[Clustering] K-means

'Data Analytics with python > [Machine Learning ]' 카테고리의 다른 글

관련글

댓글

티스토리툴바