聚类算法实践
- Kmeans与Dbscan算法
- 半监督问题解决方案
- 聚类评估方法
# -*- coding: utf-8 -*-
""" Created on Sat Apr 20 20:42:57 2024 @author: Tom """
import numpy as np
import os
# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Larger axis/tick label fonts for every figure in this demo.
plt.rcParams["axes.labelsize"] = 14
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12

import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

# Kmeans
from sklearn.datasets import make_blobs

# Five cluster centres for the synthetic data set.
blob_centers = np.array(
    [[0.2, 2.3],
     [-1.5, 2.3],
     [-2.8, 1.8],
     [-2.8, 2.8],
     [-2.8, 1.3]])
# Per-cluster spread: one loose blob, one medium, three tight ones.
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])

# 2000 samples drawn around the five centres (fixed seed => reproducible).
X, y = make_blobs(n_samples=2000, centers=blob_centers,
                  cluster_std=blob_std, random_state=7)


def plot_clusters(X, y=None):
    """Scatter-plot the samples, optionally coloured by labels y."""
    plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", fontsize=14, rotation=0)


plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()

from sklearn.cluster import KMeans

k = 5  # number of clusters (matches the five generating centres)
kmeans = KMeans(n_clusters=k, random_state=45)

# fit_predict(X) and kmeans.labels_ yield identical cluster assignments.
y_pred = kmeans.fit_predict(X)
print(y_pred)
print(kmeans.labels_)

# The fitted cluster centres.
print(kmeans.cluster_centers_)

X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])
print(kmeans.predict(X_new))
# Distance from each new sample to every cluster centre.
# (Fixed: the original printed this twice by accident.)
print(kmeans.transform(X_new))
def plot_data(X):
    """Plot every sample as a small black dot."""
    plt.plot(X[:, 0], X[:, 1], "k.", markersize=2)


def plot_centroids(centroids, weights=None, circle_color="w", cross_color="k"):
    """Overlay cluster centres as a circle-plus-cross marker.

    When weights are given, centres whose weight is below one tenth of the
    maximum weight are hidden.
    """
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="o", s=30, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="x", s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)


def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    """Draw the clusterer's decision regions over the data.

    A resolution x resolution grid spanning the data (plus a small margin)
    is labelled with clusterer.predict and rendered as filled + outlined
    contours.
    """
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors="k")
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        # Fixed: tick_params expects a boolean, not the string "off".
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        # Fixed: math text for consistency with the "$x_1$" x-label.
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)


plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans, X)
plt.show()
运行结果:
建模流程解读
# -*- coding: utf-8 -*-
""" Created on Sat Apr 20 20:42:57 2024 @author: Tom """
import numpy as np
import os
# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Larger axis/tick label fonts for every figure in this demo.
plt.rcParams["axes.labelsize"] = 14
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12

import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

# Kmeans
from sklearn.datasets import make_blobs

# Five cluster centres for the synthetic data set.
blob_centers = np.array(
    [[0.2, 2.3],
     [-1.5, 2.3],
     [-2.8, 1.8],
     [-2.8, 2.8],
     [-2.8, 1.3]])
# Per-cluster spread: one loose blob, one medium, three tight ones.
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])

# 2000 samples drawn around the five centres (fixed seed => reproducible).
X, y = make_blobs(n_samples=2000, centers=blob_centers,
                  cluster_std=blob_std, random_state=7)


def plot_clusters(X, y=None):
    """Scatter-plot the samples, optionally coloured by labels y."""
    plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", fontsize=14, rotation=0)


plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()

from sklearn.cluster import KMeans

k = 5  # number of clusters (matches the five generating centres)
kmeans = KMeans(n_clusters=k, random_state=45)

# fit_predict(X) and kmeans.labels_ yield identical cluster assignments.
y_pred = kmeans.fit_predict(X)
print(y_pred)
print(kmeans.labels_)

# The fitted cluster centres.
print(kmeans.cluster_centers_)

X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])
print(kmeans.predict(X_new))
# Distance from each new sample to every cluster centre.
# (Fixed: the original printed this twice by accident.)
print(kmeans.transform(X_new))
def plot_data(X):
    """Plot every sample as a small black dot."""
    plt.plot(X[:, 0], X[:, 1], "k.", markersize=2)


def plot_centroids(centroids, weights=None, circle_color="w", cross_color="k"):
    """Overlay cluster centres as a circle-plus-cross marker.

    When weights are given, centres whose weight is below one tenth of the
    maximum weight are hidden.
    """
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="o", s=30, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="x", s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)


def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    """Draw the clusterer's decision regions over the data.

    A resolution x resolution grid spanning the data (plus a small margin)
    is labelled with clusterer.predict and rendered as filled + outlined
    contours.
    """
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors="k")
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        # Fixed: tick_params expects a boolean, not the string "off".
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        # Fixed: math text for consistency with the "$x_1$" x-label.
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)


plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans, X)
plt.show()
# Three runs sharing the same random initialisation but capped at 1, 2 and 3
# iterations, to visualise how K-Means alternates the "label" (assignment)
# and "update centres" steps.
kmeans_iter1 = KMeans(n_clusters=5, init="random", n_init=1, max_iter=1, random_state=1)
kmeans_iter2 = KMeans(n_clusters=5, init="random", n_init=1, max_iter=2, random_state=1)
kmeans_iter3 = KMeans(n_clusters=5, init="random", n_init=1, max_iter=3, random_state=1)
for model in (kmeans_iter1, kmeans_iter2, kmeans_iter3):
    model.fit(X)

plt.figure(figsize=(12, 8))

# Row 1: initial centres, then the labels they induce.
plt.subplot(321)
plot_data(X)
plot_centroids(kmeans_iter1.cluster_centers_, circle_color="r", cross_color="k")
plt.title("Update cluster_centers")
plt.subplot(322)
plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False, show_ylabels=False)
plt.title("Label")

# Row 2: centres moved after iteration 2, then the new labelling.
plt.subplot(323)
plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False, show_ylabels=False)
plot_centroids(kmeans_iter2.cluster_centers_)
plt.title("Update cluster_centers")
plt.subplot(324)
plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False, show_ylabels=False)

# Row 3: centres after iteration 3 and the resulting regions.
plt.subplot(325)
plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False, show_ylabels=False)
plot_centroids(kmeans_iter3.cluster_centers_)
plt.subplot(326)
plot_decision_boundaries(kmeans_iter3, X, show_xlabels=False, show_ylabels=False)

plt.show()
运行结果:
不稳定结果
# -*- coding: utf-8 -*-
""" Created on Sat Apr 20 20:42:57 2024 @author: Tom """
import numpy as np
import os
# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Larger axis/tick label fonts for every figure in this demo.
plt.rcParams["axes.labelsize"] = 14
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12

import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

# Kmeans
from sklearn.datasets import make_blobs

# Five cluster centres for the synthetic data set.
blob_centers = np.array(
    [[0.2, 2.3],
     [-1.5, 2.3],
     [-2.8, 1.8],
     [-2.8, 2.8],
     [-2.8, 1.3]])
# Per-cluster spread: one loose blob, one medium, three tight ones.
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])

# 2000 samples drawn around the five centres (fixed seed => reproducible).
X, y = make_blobs(n_samples=2000, centers=blob_centers,
                  cluster_std=blob_std, random_state=7)


def plot_clusters(X, y=None):
    """Scatter-plot the samples, optionally coloured by labels y."""
    plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", fontsize=14, rotation=0)


plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()

from sklearn.cluster import KMeans

k = 5  # number of clusters (matches the five generating centres)
kmeans = KMeans(n_clusters=k, random_state=45)

# fit_predict(X) and kmeans.labels_ yield identical cluster assignments.
y_pred = kmeans.fit_predict(X)
print(y_pred)
print(kmeans.labels_)

# The fitted cluster centres.
print(kmeans.cluster_centers_)

X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])
print(kmeans.predict(X_new))
# Distance from each new sample to every cluster centre.
# (Fixed: the original printed this twice by accident.)
print(kmeans.transform(X_new))
def plot_data(X):
    """Plot every sample as a small black dot."""
    plt.plot(X[:, 0], X[:, 1], "k.", markersize=2)


def plot_centroids(centroids, weights=None, circle_color="w", cross_color="k"):
    """Overlay cluster centres as a circle-plus-cross marker.

    When weights are given, centres whose weight is below one tenth of the
    maximum weight are hidden.
    """
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="o", s=30, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="x", s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)


def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    """Draw the clusterer's decision regions over the data.

    A resolution x resolution grid spanning the data (plus a small margin)
    is labelled with clusterer.predict and rendered as filled + outlined
    contours.
    """
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors="k")
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        # Fixed: tick_params expects a boolean, not the string "off".
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        # Fixed: math text for consistency with the "$x_1$" x-label.
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)


plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans, X)
plt.show()
# Three runs sharing the same random initialisation but capped at 1, 2 and 3
# iterations, to visualise how K-Means alternates the "label" (assignment)
# and "update centres" steps.
kmeans_iter1 = KMeans(n_clusters=5, init="random", n_init=1, max_iter=1, random_state=1)
kmeans_iter2 = KMeans(n_clusters=5, init="random", n_init=1, max_iter=2, random_state=1)
kmeans_iter3 = KMeans(n_clusters=5, init="random", n_init=1, max_iter=3, random_state=1)
for model in (kmeans_iter1, kmeans_iter2, kmeans_iter3):
    model.fit(X)

plt.figure(figsize=(12, 8))

# Row 1: initial centres, then the labels they induce.
plt.subplot(321)
plot_data(X)
plot_centroids(kmeans_iter1.cluster_centers_, circle_color="r", cross_color="k")
plt.title("Update cluster_centers")
plt.subplot(322)
plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False, show_ylabels=False)
plt.title("Label")

# Row 2: centres moved after iteration 2, then the new labelling.
plt.subplot(323)
plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False, show_ylabels=False)
plot_centroids(kmeans_iter2.cluster_centers_)
plt.title("Update cluster_centers")
plt.subplot(324)
plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False, show_ylabels=False)

# Row 3: centres after iteration 3 and the resulting regions.
plt.subplot(325)
plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False, show_ylabels=False)
plot_centroids(kmeans_iter3.cluster_centers_)
plt.subplot(326)
plot_decision_boundaries(kmeans_iter3, X, show_xlabels=False, show_ylabels=False)

plt.show()
def plot_clusterer_comparison(c1, c2, X):
    """Fit both clusterers on X and draw their decision regions side by side."""
    c1.fit(X)
    c2.fit(X)
    plt.figure(figsize=(12, 4))
    for position, clusterer in ((121, c1), (122, c2)):
        plt.subplot(position)
        plot_decision_boundaries(clusterer, X)


# A single random initialisation per run: different seeds can converge to
# different (locally optimal) partitions, i.e. K-Means results are unstable.
c1 = KMeans(n_clusters=5, init="random", n_init=1, random_state=15)
c2 = KMeans(n_clusters=5, init="random", n_init=1, random_state=19)
plot_clusterer_comparison(c1, c2, X)
运行结果:
评估指标Inertia
- Inertia指标:每个样本到其所属簇质心距离的平方和(越小表示簇内越紧凑)
# -*- coding: utf-8 -*-
""" Created on Sat Apr 20 20:42:57 2024 @author: Tom """
import numpy as np
import os
# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Larger axis/tick label fonts for every figure in this demo.
plt.rcParams["axes.labelsize"] = 14
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12

import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

# Kmeans
from sklearn.datasets import make_blobs

# Five cluster centres for the synthetic data set.
blob_centers = np.array(
    [[0.2, 2.3],
     [-1.5, 2.3],
     [-2.8, 1.8],
     [-2.8, 2.8],
     [-2.8, 1.3]])
# Per-cluster spread: one loose blob, one medium, three tight ones.
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])

# 2000 samples drawn around the five centres (fixed seed => reproducible).
X, y = make_blobs(n_samples=2000, centers=blob_centers,
                  cluster_std=blob_std, random_state=7)


def plot_clusters(X, y=None):
    """Scatter-plot the samples, optionally coloured by labels y."""
    plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", fontsize=14, rotation=0)


plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()

from sklearn.cluster import KMeans

k = 5  # number of clusters (matches the five generating centres)
kmeans = KMeans(n_clusters=k, random_state=45)

# fit_predict(X) and kmeans.labels_ yield identical cluster assignments.
y_pred = kmeans.fit_predict(X)
print(y_pred)
print(kmeans.labels_)

# The fitted cluster centres.
print(kmeans.cluster_centers_)

X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])
print(kmeans.predict(X_new))
# Distance from each new sample to every cluster centre.
# (Fixed: the original printed this twice by accident.)
print(kmeans.transform(X_new))
def plot_data(X):
    """Plot every sample as a small black dot."""
    plt.plot(X[:, 0], X[:, 1], "k.", markersize=2)


def plot_centroids(centroids, weights=None, circle_color="w", cross_color="k"):
    """Overlay cluster centres as a circle-plus-cross marker.

    When weights are given, centres whose weight is below one tenth of the
    maximum weight are hidden.
    """
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="o", s=30, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker="x", s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)


def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    """Draw the clusterer's decision regions over the data.

    A resolution x resolution grid spanning the data (plus a small margin)
    is labelled with clusterer.predict and rendered as filled + outlined
    contours.
    """
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors="k")
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        # Fixed: tick_params expects a boolean, not the string "off".
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        # Fixed: math text for consistency with the "$x_1$" x-label.
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)
""" plt.figure(figsize = (8,4)) plot_decision_boundaries(kmeans, X) plt.show() kmeans_iter1 = KMeans(n_clusters = 5,init = "random",n_init= 1,max_iter = 1,random_state= 1) kmeans_iter2 = KMeans(n_clusters = 5,init = "random",n_init= 1,max_iter = 2,random_state= 1) kmeans_iter3 = KMeans(n_clusters = 5,init = "random",n_init= 1,max_iter = 3,random_state= 1) kmeans_iter1.fit(X) kmeans_iter2.fit(X) kmeans_iter3.fit(X) plt.figure(figsize=(12,8)) plt.subplot(321) plot_data(X) plot_centroids(kmeans_iter1.cluster_centers_,circle_color = "r",cross_color = "k") plt.title("Update cluster_centers") plt.subplot(322) plot_decision_boundaries(kmeans_iter1, X,show_xlabels=False,show_ylabels=False) plt.title("Label") plt.subplot(323) plot_decision_boundaries(kmeans_iter1, X,show_xlabels=False,show_ylabels=False) plot_centroids(kmeans_iter2.cluster_centers_) plt.title("Update cluster_centers") plt.subplot(324) plot_decision_boundaries(kmeans_iter2, X,show_xlabels=False,show_ylabels=False) plt.subplot(325) plot_decision_boundaries(kmeans_iter2, X,show_xlabels=False,show_ylabels=False) plot_centroids(kmeans_iter3.cluster_centers_) plt.subplot(326) plot_decision_boundaries(kmeans_iter3, X,show_xlabels=False,show_ylabels=False) plt.show() """
def plot_clusterer_comparison(c1, c2, X):
    """Fit both clusterers on X and draw their decision regions side by side."""
    c1.fit(X)
    c2.fit(X)
    plt.figure(figsize=(12, 4))
    for position, clusterer in ((121, c1), (122, c2)):
        plt.subplot(position)
        plot_decision_boundaries(clusterer, X)


# A single random initialisation per run: different seeds can converge to
# different (locally optimal) partitions, i.e. K-Means results are unstable.
c1 = KMeans(n_clusters=5, init="random", n_init=1, random_state=15)
c2 = KMeans(n_clusters=5, init="random", n_init=1, random_state=19)
plot_clusterer_comparison(c1, c2, X)
print("-----------------------------------------------")
# Inertia指标:每个样本与其质心的距离
print(kmeans.inertia_)
print(kmeans.transform(X))
print(kmeans.labels_)
X_dist = kmeans.transform(X)
print(X_dist[np.arange(len(X_dist)),kmeans.labels_])
# 等于Inertia
print(np.sum(X_dist[np.arange(len(X_dist)),kmeans.labels_] ** 2))
print(kmeans.score(X))
print(c1.inertia_)
print(c2.inertia_)
运行结果:
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/bian-cheng-ji-chu/85671.html