Contents
DAY 17 Common Clustering Algorithms
1. Clustering evaluation metrics
2. Common clustering algorithms: k-means, DBSCAN, hierarchical clustering
3. Workflow for each of the three algorithms
Homework: cluster the heart disease dataset.
DAY 17 Common Clustering Algorithms
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render CJK glyphs in plots
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly with that font

# Load the credit dataset
data = pd.read_csv(r'data.csv')
list_discrete = data.select_dtypes(include=['object']).columns.tolist()

# Ordinal-encode home ownership
home_ownership_mapping = {'Own Home': 1, 'Rent': 2, 'Have Mortgage': 3, 'Home Mortgage': 4}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Ordinal-encode years in current job
years_in_job_mapping = {'< 1 year': 1, '1 year': 2, '2 years': 3, '3 years': 4,
                        '4 years': 5, '5 years': 6, '6 years': 7, '7 years': 8,
                        '8 years': 9, '9 years': 10, '10+ years': 11}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# One-hot encode Purpose, then cast the newly created dummy columns to int
data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv(r'data.csv')
list_new = []
for i in data.columns:
    if i not in data2.columns:
        list_new.append(i)
for i in list_new:
    data[i] = data[i].astype(int)

# Binary-encode the loan term and rename the column
term_mapping = {'Short Term': 0, 'Long Term': 1}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True)

# Fill missing values in continuous columns with the median
list_continuous = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for i in list_continuous:
    median_value = data[i].median()
    data[i] = data[i].fillna(median_value)

# Split features from the label and standardize the features
X = data.drop(['Credit Default'], axis=1)
Y = data['Credit Default']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
1. Clustering evaluation metrics

Clustering has no ground-truth labels, so internal metrics judge a partition: inertia (within-cluster sum of squared distances, lower is better, used in the elbow method), the silhouette coefficient (ranges from -1 to 1, higher is better), the Calinski-Harabasz index (higher is better), and the Davies-Bouldin index (lower is better).

2. Common clustering algorithms: k-means, DBSCAN, hierarchical (agglomerative) clustering

3. Workflow for each of the three algorithms: score a range of hyperparameter candidates with the metrics above, choose a setting, fit the final model, and visualize the clusters in 2D via PCA.
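Before applying these metrics to real data, here is a minimal sketch (not part of the original notebook) of how they behave on synthetic data; the make_blobs settings are illustrative assumptions. With three well-separated blobs, the three label-based metrics should clearly prefer k = 3:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Three well-separated blobs: all three metrics should favor k = 3
X_toy, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.8, random_state=42)
for k in (2, 3, 4):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_toy)
    print(f'k = {k}, '
          f'silhouette: {silhouette_score(X_toy, labels):.3f}, '
          f'CH: {calinski_harabasz_score(X_toy, labels):.2f}, '
          f'DB: {davies_bouldin_score(X_toy, labels):.3f}')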
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt

# Evaluate KMeans for k = 2..10 with four internal metrics
k_range = range(2, 11)
inertia_values = []
silhouette_scores = []
ch_scores = []
db_scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_labels = kmeans.fit_predict(X_scaled)
    inertia_values.append(kmeans.inertia_)                  # within-cluster sum of squares
    silhouette = silhouette_score(X_scaled, kmeans_labels)  # higher is better
    silhouette_scores.append(silhouette)
    ch = calinski_harabasz_score(X_scaled, kmeans_labels)   # higher is better
    ch_scores.append(ch)
    db = davies_bouldin_score(X_scaled, kmeans_labels)      # lower is better
    db_scores.append(db)
    print(f'k = {k}, inertia: {kmeans.inertia_:.2f}, silhouette: {silhouette:.3f}, '
          f'CH index: {ch:.2f}, DB index: {db:.3f}')

plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
plt.plot(k_range, inertia_values, marker='o')
plt.title('Elbow method for optimal k (inertia, lower is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.subplot(2, 2, 2)
plt.plot(k_range, silhouette_scores, marker='o', color='orange')
plt.title('Silhouette score vs. k (higher is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')
plt.grid(True)
plt.subplot(2, 2, 3)
plt.plot(k_range, ch_scores, marker='o', color='green')
plt.title('Calinski-Harabasz index vs. k (higher is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('CH index')
plt.grid(True)
plt.subplot(2, 2, 4)
plt.plot(k_range, db_scores, marker='o', color='red')
plt.title('Davies-Bouldin index vs. k (lower is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('DB index')
plt.grid(True)
plt.tight_layout()
plt.show()
k = 2, inertia: 224921.38, silhouette: 0.723, CH index: 252.64, DB index: 0.355
k = 3, inertia: 210919.39, silhouette: 0.320, CH index: 383.53, DB index: 2.446
k = 4, inertia: 204637.65, silhouette: 0.087, CH index: 340.21, DB index: 2.315
k = 5, inertia: 198854.98, silhouette: 0.106, CH index: 317.03, DB index: 2.232
k = 6, inertia: 191274.31, silhouette: 0.112, CH index: 323.04, DB index: 1.921
k = 7, inertia: 183472.98, silhouette: 0.121, CH index: 333.71, DB index: 1.750
k = 8, inertia: 174533.93, silhouette: 0.131, CH index: 355.46, DB index: 2.089
k = 9, inertia: 167022.49, silhouette: 0.133, CH index: 367.09, DB index: 1.862
k = 10, inertia: 163353.82, silhouette: 0.097, CH index: 352.27, DB index: 1.838
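Rather than reading the four curves by eye, the recorded score lists can also be queried programmatically. The sketch below is an illustrative heuristic, not the rule the notebook used; note that its verdicts need not agree with the k = 9 chosen next (inertia is excluded because it always decreases as k grows):

# Each metric nominates its own favorite k (illustrative heuristic only)
k_values = list(k_range)
best_by_silhouette = k_values[int(np.argmax(silhouette_scores))]
best_by_ch = k_values[int(np.argmax(ch_scores))]
best_by_db = k_values[int(np.argmin(db_scores))]
print(f'silhouette favors k = {best_by_silhouette}, '
      f'CH favors k = {best_by_ch}, DB favors k = {best_by_db}')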
# Fit the final model with the chosen k and visualize the clusters via PCA
selected_k = 9
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
X['KMeans_Cluster'] = kmeans_labels

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=kmeans_labels, palette='viridis')
plt.title(f'KMeans Clustering with k = {selected_k} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

print(f'KMeans Cluster labels (k = {selected_k}) added to X:')
print(X[['KMeans_Cluster']].value_counts())
KMeans Cluster labels (k = 9) added to X:
KMeans_Cluster
6 3049
3 1343
7 899
5 742
0 582
1 401
2 373
4 96
8 15
Name: count, dtype: int64
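The raw label counts say little about what each cluster represents. One hedged way to profile them is to compare per-cluster feature means with the global means; the column names below are assumptions about this dataset, so substitute your own:

# Per-cluster means for a few columns (hypothetical names - adjust to your data)
profile_cols = ['Annual Income', 'Current Loan Amount', 'Credit Score']
print(X.groupby('KMeans_Cluster')[profile_cols].mean().round(2))
print(X[profile_cols].mean().round(2))  # global means for comparison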
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Grid-search eps and min_samples for DBSCAN
eps_range = np.arange(0.3, 0.8, 0.1)
min_samples_range = range(3, 8)
results = []

for eps in eps_range:
    for min_samples in min_samples_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(X_scaled)
        # Number of clusters, excluding the noise label (-1)
        n_clusters = len(np.unique(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
        n_noise = list(dbscan_labels).count(-1)
        if n_clusters > 1:
            # Score only the non-noise points
            mask = dbscan_labels != -1
            if mask.sum() > 0:
                silhouette = silhouette_score(X_scaled[mask], dbscan_labels[mask])
                ch = calinski_harabasz_score(X_scaled[mask], dbscan_labels[mask])
                db = davies_bouldin_score(X_scaled[mask], dbscan_labels[mask])
                results.append({'eps': eps, 'min_samples': min_samples,
                                'n_clusters': n_clusters, 'n_noise': n_noise,
                                'silhouette': silhouette, 'ch_score': ch, 'db_score': db})
                print(f'eps = {eps:.1f}, min_samples = {min_samples}, clusters: {n_clusters}, '
                      f'noise points: {n_noise}, silhouette: {silhouette:.3f}, '
                      f'CH index: {ch:.2f}, DB index: {db:.3f}')
        else:
            print(f'eps = {eps:.1f}, min_samples = {min_samples}, clusters: {n_clusters}, '
                  f'noise points: {n_noise}, cannot compute evaluation metrics')

results_df = pd.DataFrame(results)
eps = 0.3, min_samples = 3, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.3, min_samples = 4, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.3, min_samples = 5, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.3, min_samples = 6, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.3, min_samples = 7, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.4, min_samples = 3, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.4, min_samples = 4, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.4, min_samples = 5, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.4, min_samples = 6, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.4, min_samples = 7, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.5, min_samples = 3, clusters: 13, noise points: 7447, silhouette: 0.480, CH index: 87.71, DB index: 0.642
eps = 0.5, min_samples = 4, clusters: 5, noise points: 7474, silhouette: 0.494, CH index: 76.98, DB index: 0.707
eps = 0.5, min_samples = 5, clusters: 1, noise points: 7495, cannot compute evaluation metrics
eps = 0.5, min_samples = 6, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.5, min_samples = 7, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.6, min_samples = 3, clusters: 52, noise points: 7256, silhouette: 0.307, CH index: 58.60, DB index: 0.895
eps = 0.6, min_samples = 4, clusters: 21, noise points: 7385, silhouette: 0.396, CH index: 68.38, DB index: 0.803
eps = 0.6, min_samples = 5, clusters: 6, noise points: 7459, silhouette: 0.405, CH index: 40.74, DB index: 0.931
eps = 0.6, min_samples = 6, clusters: 3, noise points: 7479, silhouette: 0.484, CH index: 25.09, DB index: 0.706
eps = 0.6, min_samples = 7, clusters: 0, noise points: 7500, cannot compute evaluation metrics
eps = 0.7, min_samples = 3, clusters: 78, noise points: 6900, silhouette: 0.077, CH index: 28.85, DB index: 0.929
eps = 0.7, min_samples = 4, clusters: 29, noise points: 7105, silhouette: 0.039, CH index: 30.05, DB index: 0.948
eps = 0.7, min_samples = 5, clusters: 18, noise points: 7235, silhouette: 0.074, CH index: 23.26, DB index: 0.977
eps = 0.7, min_samples = 6, clusters: 9, noise points: 7347, silhouette: 0.060, CH index: 22.12, DB index: 1.323
eps = 0.7, min_samples = 7, clusters: 11, noise points: 7378, silhouette: 0.181, CH index: 29.45, DB index: 1.265
results_df.head()
   eps  min_samples  n_clusters  n_noise  silhouette   ch_score  db_score
0  0.5            3          13     7447    0.480120  87.712607  0.641676
1  0.5            4           5     7474    0.494245  76.981585  0.707069
2  0.6            3          52     7256    0.307484  58.603362  0.895483
3  0.6            4          21     7385    0.396210  68.375255  0.803354
4  0.6            5           6     7459    0.405183  40.742722  0.931054
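Since every scored configuration is collected in results_df, a shortlist can be produced with ordinary pandas operations. Ranking by silhouette with the noise count as a tiebreaker is one simple convention, not the notebook's actual selection rule:

# Rank configurations: higher silhouette first, fewer noise points breaking ties
ranked = results_df.sort_values(['silhouette', 'n_noise'], ascending=[False, True])
print(ranked.head())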
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['silhouette'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Silhouette score across parameters (higher is better)')
plt.xlabel('eps')
plt.ylabel('Silhouette score')
plt.legend()
plt.grid(True)
plt.subplot(2, 2, 2)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['ch_score'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Calinski-Harabasz index across parameters (higher is better)')
plt.xlabel('eps')
plt.ylabel('CH index')
plt.legend()
plt.grid(True)
plt.subplot(2, 2, 3)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['db_score'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Davies-Bouldin index across parameters (lower is better)')
plt.xlabel('eps')
plt.ylabel('DB index')
plt.legend()
plt.grid(True)
plt.subplot(2, 2, 4)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['n_clusters'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Number of clusters across parameters')
plt.xlabel('eps')
plt.ylabel('Number of clusters')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Fit the final DBSCAN model with the chosen parameters and visualize via PCA
selected_eps = 0.6
selected_min_samples = 5
dbscan = DBSCAN(eps=selected_eps, min_samples=selected_min_samples)
dbscan_labels = dbscan.fit_predict(X_scaled)
X['DBSCAN_Cluster'] = dbscan_labels

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=dbscan_labels, palette='viridis')
plt.title(f'DBSCAN Clustering with eps = {selected_eps}, min_samples = {selected_min_samples} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

print(f'DBSCAN Cluster labels (eps = {selected_eps}, min_samples = {selected_min_samples}) added to X:')
print(X[['DBSCAN_Cluster']].value_counts())
DBSCAN Cluster labels (eps = 0.6, min_samples = 5) added to X:
DBSCAN_Cluster
-1    7459
 3      12
 4       7
 2       6
 1       6
 0       5
 5       5
Name: count, dtype: int64
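With eps = 0.6 almost every point (7459 of 7500) ends up labeled noise, which suggests eps is small relative to the data's density. A common heuristic for choosing eps is the k-distance plot: sort each point's distance to its k-th nearest neighbor and look for the "knee" of the curve. A sketch, assuming k matches min_samples = 5:

from sklearn.neighbors import NearestNeighbors

# Distance to the k-th nearest neighbor for every point, sorted ascending;
# the 'knee' of this curve is a common starting value for eps (k = min_samples)
k = 5
nn = NearestNeighbors(n_neighbors=k + 1)  # +1 because each point is its own nearest neighbor
distances, _ = nn.fit(X_scaled).kneighbors(X_scaled)
k_distances = np.sort(distances[:, -1])

plt.figure(figsize=(6, 4))
plt.plot(k_distances)
plt.xlabel('Points sorted by k-distance')
plt.ylabel(f'Distance to {k}-th nearest neighbor')
plt.title('k-distance plot for choosing eps')
plt.grid(True)
plt.show()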
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt

# NOTE: X now also carries the KMeans_Cluster and DBSCAN_Cluster columns added above,
# so they get scaled and clustered along with the original features here; drop them
# first if you want the three algorithms compared on identical inputs.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

n_clusters_range = range(2, 11)
silhouette_scores = []
ch_scores = []
db_scores = []

for n_clusters in n_clusters_range:
    agglo = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    agglo_labels = agglo.fit_predict(X_scaled)
    silhouette = silhouette_score(X_scaled, agglo_labels)
    ch = calinski_harabasz_score(X_scaled, agglo_labels)
    db = davies_bouldin_score(X_scaled, agglo_labels)
    silhouette_scores.append(silhouette)
    ch_scores.append(ch)
    db_scores.append(db)
    print(f'n_clusters = {n_clusters}, silhouette: {silhouette:.3f}, '
          f'CH index: {ch:.2f}, DB index: {db:.3f}')

plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(n_clusters_range, silhouette_scores, marker='o')
plt.title('Silhouette score vs. number of clusters (higher is better)')
plt.xlabel('Number of clusters (n_clusters)')
plt.ylabel('Silhouette score')
plt.grid(True)
plt.subplot(1, 3, 2)
plt.plot(n_clusters_range, ch_scores, marker='o')
plt.title('Calinski-Harabasz index vs. number of clusters (higher is better)')
plt.xlabel('Number of clusters (n_clusters)')
plt.ylabel('CH index')
plt.grid(True)
plt.subplot(1, 3, 3)
plt.plot(n_clusters_range, db_scores, marker='o')
plt.title('Davies-Bouldin index vs. number of clusters (lower is better)')
plt.xlabel('Number of clusters (n_clusters)')
plt.ylabel('DB index')
plt.grid(True)
plt.tight_layout()
plt.show()
n_clusters = 2, silhouette: 0.084, CH index: 401.74, DB index: 3.261
n_clusters = 3, silhouette: 0.097, CH index: 400.37, DB index: 2.961
n_clusters = 4, silhouette: 0.093, CH index: 386.08, DB index: 2.730
n_clusters = 5, silhouette: 0.109, CH index: 384.40, DB index: 2.538
n_clusters = 6, silhouette: 0.119, CH index: 378.20, DB index: 2.218
n_clusters = 7, silhouette: 0.133, CH index: 377.54, DB index: 2.158
n_clusters = 8, silhouette: 0.140, CH index: 381.30, DB index: 1.920
n_clusters = 9, silhouette: 0.143, CH index: 388.37, DB index: 1.740
n_clusters = 10, silhouette: 0.148, CH index: 398.21, DB index: 1.625
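Refitting AgglomerativeClustering for every candidate repeats the expensive merge computation each time. An alternative sketch: build the Ward linkage matrix once with scipy (the same routine the dendrogram below uses) and cut it at each cluster count; scores may differ slightly from the sklearn loop above because the two implementations can break ties differently.

from scipy.cluster import hierarchy

# Build the merge tree once, then cut it at each candidate cluster count
Z = hierarchy.linkage(X_scaled, method='ward')
for n_clusters in n_clusters_range:
    labels = hierarchy.fcluster(Z, t=n_clusters, criterion='maxclust')
    print(f'n_clusters = {n_clusters}, silhouette: {silhouette_score(X_scaled, labels):.3f}')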
# Fit the final model with the chosen number of clusters and visualize via PCA
selected_n_clusters = 10
agglo = AgglomerativeClustering(n_clusters=selected_n_clusters, linkage='ward')
agglo_labels = agglo.fit_predict(X_scaled)
X['Agglo_Cluster'] = agglo_labels

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=agglo_labels, palette='viridis')
plt.title(f'Agglomerative Clustering with n_clusters = {selected_n_clusters} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

print(f'Agglomerative Cluster labels (n_clusters = {selected_n_clusters}) added to X:')
print(X[['Agglo_Cluster']].value_counts())
Agglomerative Cluster labels (n_clusters = 10) added to X:
Agglo_Cluster
7 3123
2 2132
6 775
0 766
4 392
1 153
5 96
8 34
3 21
9 8
Name: count, dtype: int64
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt

# Build the Ward linkage matrix and plot a truncated dendrogram
Z = hierarchy.linkage(X_scaled, method='ward')

plt.figure(figsize=(10, 6))
hierarchy.dendrogram(Z, truncate_mode='level', p=3)  # show only the top 3 merge levels
plt.title('Dendrogram for Agglomerative Clustering')
plt.xlabel('Cluster Size')
plt.ylabel('Distance')
plt.show()
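The dendrogram's vertical axis is merge distance, so cutting the tree with a horizontal line at some height yields a flat clustering. A sketch reusing the linkage matrix Z; the threshold of 80 is purely illustrative and should be read off your own plot:

# Cut at a fixed merge distance instead of a fixed cluster count
height = 80  # illustrative threshold - pick it from the dendrogram's y-axis
labels_at_height = hierarchy.fcluster(Z, t=height, criterion='distance')
print(f'Cutting at distance {height} yields {len(set(labels_at_height))} clusters')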
Homework: cluster the heart disease dataset.
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render CJK glyphs in plots
plt.rcParams['axes.unicode_minus'] = False

# Load the heart disease dataset; all features are already numeric
data = pd.read_csv(r'heart.csv')
X = data.drop(['target'], axis=1)
Y = data['target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt

# Evaluate KMeans for k = 2..10 with four internal metrics
k_range = range(2, 11)
inertia_values = []
silhouette_scores = []
ch_scores = []
db_scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_labels = kmeans.fit_predict(X_scaled)
    inertia_values.append(kmeans.inertia_)
    silhouette = silhouette_score(X_scaled, kmeans_labels)
    silhouette_scores.append(silhouette)
    ch = calinski_harabasz_score(X_scaled, kmeans_labels)
    ch_scores.append(ch)
    db = davies_bouldin_score(X_scaled, kmeans_labels)
    db_scores.append(db)
    print(f'k = {k}, inertia: {kmeans.inertia_:.2f}, silhouette: {silhouette:.3f}, '
          f'CH index: {ch:.2f}, DB index: {db:.3f}')

plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
plt.plot(k_range, inertia_values, marker='o')
plt.title('Elbow method for optimal k (inertia, lower is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.subplot(2, 2, 2)
plt.plot(k_range, silhouette_scores, marker='o', color='orange')
plt.title('Silhouette score vs. k (higher is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')
plt.grid(True)
plt.subplot(2, 2, 3)
plt.plot(k_range, ch_scores, marker='o', color='green')
plt.title('Calinski-Harabasz index vs. k (higher is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('CH index')
plt.grid(True)
plt.subplot(2, 2, 4)
plt.plot(k_range, db_scores, marker='o', color='red')
plt.title('Davies-Bouldin index vs. k (lower is better)')
plt.xlabel('Number of clusters (k)')
plt.ylabel('DB index')
plt.grid(True)
plt.tight_layout()
plt.show()
k = 2, inertia: 3331.64, silhouette: 0.166, CH index: 54.87, DB index: 2.209
k = 3, inertia: 3087.69, silhouette: 0.112, CH index: 41.36, DB index: 2.544
k = 4, inertia: 2892.52, silhouette: 0.118, CH index: 36.06, DB index: 2.175
k = 5, inertia: 2814.65, silhouette: 0.094, CH index: 29.76, DB index: 2.386
k = 6, inertia: 2673.22, silhouette: 0.095, CH index: 28.13, DB index: 2.377
k = 7, inertia: 2596.68, silhouette: 0.088, CH index: 25.50, DB index: 2.290
k = 8, inertia: 2464.39, silhouette: 0.115, CH index: 25.22, DB index: 2.136
k = 9, inertia: 2415.63, silhouette: 0.105, CH index: 23.18, DB index: 2.133
k = 10, inertia: 2337.41, silhouette: 0.111, CH index: 22.31, DB index: 2.056
# Fit the final model with the chosen k and visualize via PCA
selected_k = 4
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
X['KMeans_Cluster'] = kmeans_labels

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=kmeans_labels, palette='viridis')
plt.title(f'KMeans Clustering with k = {selected_k} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

print(f'KMeans Cluster labels (k = {selected_k}) added to X:')
print(X[['KMeans_Cluster']].value_counts())
KMeans Cluster labels (k = 4) added to X:
KMeans_Cluster
1 95
0 94
2 69
3 45
Name: count, dtype: int64
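Unlike the credit data, the heart dataset ships with a ground-truth target column, so the clusters can be checked against real labels. A sketch using a cross-tabulation and the adjusted Rand index (an added check, not part of the original notebook):

from sklearn.metrics import adjusted_rand_score

# How do the four KMeans clusters line up with the disease label?
print(pd.crosstab(X['KMeans_Cluster'], Y, rownames=['cluster'], colnames=['target']))
print(f'Adjusted Rand index vs. target: {adjusted_rand_score(Y, kmeans_labels):.3f}')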
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Grid-search eps and min_samples for DBSCAN
# (min_samples=1 makes every point a core point, so no point can be noise)
eps_range = np.arange(1.0, 1.5, 0.1)
min_samples_range = range(1, 8)
results = []

for eps in eps_range:
    for min_samples in min_samples_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(X_scaled)
        # Number of clusters, excluding the noise label (-1)
        n_clusters = len(np.unique(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
        n_noise = list(dbscan_labels).count(-1)
        if n_clusters > 1:
            # Score only the non-noise points
            mask = dbscan_labels != -1
            if mask.sum() > 0:
                silhouette = silhouette_score(X_scaled[mask], dbscan_labels[mask])
                ch = calinski_harabasz_score(X_scaled[mask], dbscan_labels[mask])
                db = davies_bouldin_score(X_scaled[mask], dbscan_labels[mask])
                results.append({'eps': eps, 'min_samples': min_samples,
                                'n_clusters': n_clusters, 'n_noise': n_noise,
                                'silhouette': silhouette, 'ch_score': ch, 'db_score': db})
                print(f'eps = {eps:.1f}, min_samples = {min_samples}, clusters: {n_clusters}, '
                      f'noise points: {n_noise}, silhouette: {silhouette:.3f}, '
                      f'CH index: {ch:.2f}, DB index: {db:.3f}')
        else:
            print(f'eps = {eps:.1f}, min_samples = {min_samples}, clusters: {n_clusters}, '
                  f'noise points: {n_noise}, cannot compute evaluation metrics')

results_df = pd.DataFrame(results)
eps = 1.0, min_samples = 1, clusters: 293, noise points: 0, silhouette: 0.030, CH index: 37.73, DB index: 0.182
eps = 1.0, min_samples = 2, clusters: 8, noise points: 285, silhouette: 0.727, CH index: 58.43, DB index: 0.300
eps = 1.0, min_samples = 3, clusters: 1, noise points: 299, cannot compute evaluation metrics
eps = 1.0, min_samples = 4, clusters: 1, noise points: 299, cannot compute evaluation metrics
eps = 1.0, min_samples = 5, clusters: 0, noise points: 303, cannot compute evaluation metrics
eps = 1.0, min_samples = 6, clusters: 0, noise points: 303, cannot compute evaluation metrics
eps = 1.0, min_samples = 7, clusters: 0, noise points: 303, cannot compute evaluation metrics
eps = 1.1, min_samples = 1, clusters: 284, noise points: 0, silhouette: 0.037, CH index: 21.82, DB index: 0.261
eps = 1.1, min_samples = 2, clusters: 13, noise points: 271, silhouette: 0.536, CH index: 36.52, DB index: 0.533
eps = 1.1, min_samples = 3, clusters: 2, noise points: 293, silhouette: 0.762, CH index: 65.20, DB index: 0.289
eps = 1.1, min_samples = 4, clusters: 1, noise points: 298, cannot compute evaluation metrics
eps = 1.1, min_samples = 5, clusters: 1, noise points: 298, cannot compute evaluation metrics
eps = 1.1, min_samples = 6, clusters: 0, noise points: 303, cannot compute evaluation metrics
eps = 1.1, min_samples = 7, clusters: 0, noise points: 303, cannot compute evaluation metrics
eps = 1.2, min_samples = 1, clusters: 274, noise points: 0, silhouette: 0.044, CH index: 16.45, DB index: 0.296
eps = 1.2, min_samples = 2, clusters: 14, noise points: 260, silhouette: 0.487, CH index: 30.86, DB index: 0.607
eps = 1.2, min_samples = 3, clusters: 6, noise points: 276, silhouette: 0.449, CH index: 32.43, DB index: 0.745
eps = 1.2, min_samples = 4, clusters: 2, noise points: 291, silhouette: 0.758, CH index: 81.07, DB index: 0.306
eps = 1.2, min_samples = 5, clusters: 1, noise points: 297, cannot compute evaluation metrics
eps = 1.2, min_samples = 6, clusters: 1, noise points: 297, cannot compute evaluation metrics
eps = 1.2, min_samples = 7, clusters: 0, noise points: 303, cannot compute evaluation metrics
eps = 1.3, min_samples = 1, clusters: 267, noise points: 0, silhouette: 0.043, CH index: 12.42, DB index: 0.344
eps = 1.3, min_samples = 2, clusters: 15, noise points: 252, silhouette: 0.507, CH index: 24.63, DB index: 0.607
eps = 1.3, min_samples = 3, clusters: 5, noise points: 272, silhouette: 0.470, CH index: 32.40, DB index: 0.800
eps = 1.3, min_samples = 4, clusters: 2, noise points: 285, silhouette: 0.710, CH index: 83.59, DB index: 0.390
eps = 1.3, min_samples = 5, clusters: 3, noise points: 288, silhouette: 0.356, CH index: 46.21, DB index: 1.119
eps = 1.3, min_samples = 6, clusters: 1, noise points: 297, cannot compute evaluation metrics
eps = 1.3, min_samples = 7, clusters: 0, noise points: 303, cannot compute evaluation metrics
eps = 1.4, min_samples = 1, clusters: 261, noise points: 0, silhouette: 0.048, CH index: 11.61, DB index: 0.362
eps = 1.4, min_samples = 2, clusters: 18, noise points: 243, silhouette: 0.472, CH index: 22.88, DB index: 0.626
eps = 1.4, min_samples = 3, clusters: 5, noise points: 269, silhouette: 0.444, CH index: 30.43, DB index: 0.840
eps = 1.4, min_samples = 4, clusters: 3, noise points: 277, silhouette: 0.519, CH index: 47.44, DB index: 0.628
eps = 1.4, min_samples = 5, clusters: 2, noise points: 282, silhouette: 0.678, CH index: 61.68, DB index: 0.413
eps = 1.4, min_samples = 6, clusters: 1, noise points: 287, cannot compute evaluation metrics
eps = 1.4, min_samples = 7, clusters: 1, noise points: 288, cannot compute evaluation metrics
results_df.head()
   eps  min_samples  n_clusters  n_noise  silhouette   ch_score  db_score
0  1.0            1         293        0    0.029961  37.733300  0.182318
1  1.0            2           8      285    0.727042  58.430329  0.300174
2  1.1            1         284        0    0.037428  21.824925  0.261067
3  1.1            2          13      271    0.535602  36.524859  0.533153
4  1.1            3           2      293    0.761552  65.199077  0.288790
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['silhouette'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Silhouette score across parameters (higher is better)')
plt.xlabel('eps')
plt.ylabel('Silhouette score')
plt.legend()
plt.grid(True)
plt.subplot(2, 2, 2)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['ch_score'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Calinski-Harabasz index across parameters (higher is better)')
plt.xlabel('eps')
plt.ylabel('CH index')
plt.legend()
plt.grid(True)
plt.subplot(2, 2, 3)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['db_score'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Davies-Bouldin index across parameters (lower is better)')
plt.xlabel('eps')
plt.ylabel('DB index')
plt.legend()
plt.grid(True)
plt.subplot(2, 2, 4)
for min_samples in min_samples_range:
    subset = results_df[results_df['min_samples'] == min_samples]
    plt.plot(subset['eps'], subset['n_clusters'], marker='o', label=f'min_samples = {min_samples}')
plt.title('Number of clusters across parameters')
plt.xlabel('eps')
plt.ylabel('Number of clusters')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Fit the final DBSCAN model with the chosen parameters and visualize via PCA
selected_eps = 1.2
selected_min_samples = 4
dbscan = DBSCAN(eps=selected_eps, min_samples=selected_min_samples)
dbscan_labels = dbscan.fit_predict(X_scaled)
X['DBSCAN_Cluster'] = dbscan_labels

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=dbscan_labels, palette='viridis')
plt.title(f'DBSCAN Clustering with eps = {selected_eps}, min_samples = {selected_min_samples} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

print(f'DBSCAN Cluster labels (eps = {selected_eps}, min_samples = {selected_min_samples}) added to X:')
print(X[['DBSCAN_Cluster']].value_counts())
DBSCAN Cluster labels (eps = 1.2, min_samples = 4) added to X:
DBSCAN_Cluster
-1    291
 0      8
 1      4
Name: count, dtype: int64
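The configurations with the best-looking silhouette scores here discard more than 90% of the 303 points as noise, so the raw metrics are misleading on their own. One way to make that trade-off explicit is to constrain the grid before ranking; the 20% noise cap and 2-10 cluster range below are illustrative assumptions:

# Filter out configurations that call most of the data noise, then rank the rest
results_df['noise_ratio'] = results_df['n_noise'] / len(X_scaled)
usable = results_df[(results_df['noise_ratio'] <= 0.2) &
                    (results_df['n_clusters'].between(2, 10))]
print(usable.sort_values('silhouette', ascending=False)
      if not usable.empty else 'No configuration satisfies the constraints')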
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt

# NOTE: X now also carries the KMeans_Cluster and DBSCAN_Cluster columns added above,
# so they get scaled and clustered along with the original features here; drop them
# first if you want the three algorithms compared on identical inputs.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

n_clusters_range = range(2, 11)
silhouette_scores = []
ch_scores = []
db_scores = []

for n_clusters in n_clusters_range:
    agglo = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    agglo_labels = agglo.fit_predict(X_scaled)
    silhouette = silhouette_score(X_scaled, agglo_labels)
    ch = calinski_harabasz_score(X_scaled, agglo_labels)
    db = davies_bouldin_score(X_scaled, agglo_labels)
    silhouette_scores.append(silhouette)
    ch_scores.append(ch)
    db_scores.append(db)
    print(f'n_clusters = {n_clusters}, silhouette: {silhouette:.3f}, '
          f'CH index: {ch:.2f}, DB index: {db:.3f}')

plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(n_clusters_range, silhouette_scores, marker='o')
plt.title('Silhouette score vs. number of clusters (higher is better)')
plt.xlabel('Number of clusters (n_clusters)')
plt.ylabel('Silhouette score')
plt.grid(True)
plt.subplot(1, 3, 2)
plt.plot(n_clusters_range, ch_scores, marker='o')
plt.title('Calinski-Harabasz index vs. number of clusters (higher is better)')
plt.xlabel('Number of clusters (n_clusters)')
plt.ylabel('CH index')
plt.grid(True)
plt.subplot(1, 3, 3)
plt.plot(n_clusters_range, db_scores, marker='o')
plt.title('Davies-Bouldin index vs. number of clusters (lower is better)')
plt.xlabel('Number of clusters (n_clusters)')
plt.ylabel('DB index')
plt.grid(True)
plt.tight_layout()
plt.show()
n_clusters = 2, silhouette: 0.094, CH index: 38.49, DB index: 2.304
n_clusters = 3, silhouette: 0.132, CH index: 41.78, DB index: 2.226
n_clusters = 4, silhouette: 0.133, CH index: 36.54, DB index: 2.108
n_clusters = 5, silhouette: 0.149, CH index: 34.38, DB index: 1.788
n_clusters = 6, silhouette: 0.143, CH index: 31.16, DB index: 2.093
n_clusters = 7, silhouette: 0.147, CH index: 29.12, DB index: 2.018
n_clusters = 8, silhouette: 0.157, CH index: 27.65, DB index: 1.915
n_clusters = 9, silhouette: 0.162, CH index: 26.69, DB index: 1.806
n_clusters = 10, silhouette: 0.161, CH index: 25.45, DB index: 1.811
# Fit the final model with the chosen number of clusters and visualize via PCA
selected_n_clusters = 5
agglo = AgglomerativeClustering(n_clusters=selected_n_clusters, linkage='ward')
agglo_labels = agglo.fit_predict(X_scaled)
X['Agglo_Cluster'] = agglo_labels

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=agglo_labels, palette='viridis')
plt.title(f'Agglomerative Clustering with n_clusters = {selected_n_clusters} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

print(f'Agglomerative Cluster labels (n_clusters = {selected_n_clusters}) added to X:')
print(X[['Agglo_Cluster']].value_counts())
Agglomerative Cluster labels (n_clusters = 5) added to X:
Agglo_Cluster
0 91
1 89
3 67
2 52
4 4
Name: count, dtype: int64
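With all three label columns now attached to X, it is natural to ask how much the methods agree with one another. A sketch using pairwise adjusted Rand indices (1.0 means identical partitions); this comparison is an addition, not part of the original notebook:

from sklearn.metrics import adjusted_rand_score

# Pairwise agreement between the three clusterings
pairs = [('KMeans_Cluster', 'Agglo_Cluster'),
         ('KMeans_Cluster', 'DBSCAN_Cluster'),
         ('Agglo_Cluster', 'DBSCAN_Cluster')]
for a, b in pairs:
    print(f'ARI({a}, {b}) = {adjusted_rand_score(X[a], X[b]):.3f}')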
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt

# Build the Ward linkage matrix and plot a truncated dendrogram
Z = hierarchy.linkage(X_scaled, method='ward')

plt.figure(figsize=(10, 6))
hierarchy.dendrogram(Z, truncate_mode='level', p=3)  # show only the top 3 merge levels
plt.title('Dendrogram for Agglomerative Clustering')
plt.xlabel('Cluster Size')
plt.ylabel('Distance')
plt.show()
@浙大疏锦行