导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
# Use seaborn's plain "white" style for the plots drawn below.
sns.set(style="white")
# Let pandas show up to 1000 rows when a DataFrame or Series is displayed.
pd.set_option("display.max_rows", 1000)
sklearn自带iris数据集(nrow=150)
4个预测变量
3分类结局
# Load the iris dataset that ships with scikit-learn and split it into the
# feature matrix X and the class labels Y.
iris = load_iris()
X, Y = iris["data"], iris["target"]
# Peek at the first five feature rows and at the number of samples per class.
display(X[:5])
display(pd.Series(Y).value_counts())
# Turn Y into a column vector of shape [150, 1] so that it can be placed next
# to the feature columns later on.
Y = Y[:, np.newaxis]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2]])
2 50
1 50
0 50
dtype: int64
# Put the four feature columns and the label column side by side in one table.
features_and_label = np.hstack((X, Y))
data = pd.DataFrame(features_and_label, columns=["x1", "x2", "x3", "x4", "y"])
# The label column is cast back to integers after being stacked with the
# feature values.
data = data.astype({"y": "int64"})
data.head()
x1
x2
x3
x4
y
0
5.1
3.5
1.4
0.2
0
1
4.9
3.0
1.4
0.2
0
2
4.7
3.2
1.3
0.2
0
3
4.6
3.1
1.5
0.2
0
4
5.0
3.6
1.4
0.2
0
观察数据分布
4个预测变量两两散点图
# Pairwise scatter plots of the columns, with points colored by the true class y.
sns.pairplot(data, hue="y")
数据标准化
Kmeans基于样本间的距离进行聚类,量纲较大的变量会主导距离计算,因此聚类前应先把各变量缩放到同一尺度;此处使用MinMaxScaler做min-max归一化,将每个变量缩放到[0, 1]区间
# Min-max scale the four feature columns (x1..x4) in place; the label column
# y is left untouched.
scaler = MinMaxScaler()
scaled_features = scaler.fit(data.iloc[:, :4]).transform(data.iloc[:, :4])
data.iloc[:, :4] = scaled_features
data.head()
x1
x2
x3
x4
y
0
0.222222
0.625000
0.067797
0.041667
0
1
0.166667
0.416667
0.067797
0.041667
0
2
0.111111
0.500000
0.050847
0.041667
0
3
0.083333
0.458333
0.084746
0.041667
0
4
0.194444
0.666667
0.067797
0.041667
0
设置类别数为3,进行Kmeans聚类
# Fit K-means with 3 clusters.
# random_state fixes the centroid initialisation so the cluster numbering and
# the centers are the same on every run; n_init is spelled out because its
# default differs between scikit-learn versions.
# NOTE(review): iloc[:, 1:4] selects x2..x4 only, so x1 is left out of the
# clustering even though the dataset has four predictors -- confirm this is
# intended (iloc[:, :4] would use all four).
clus = KMeans(n_clusters=3, n_init=10, random_state=0)
clus = clus.fit(data.iloc[:, 1:4])
聚类完成后150个样本的聚类标签
# Cluster index assigned to each of the samples the model was fitted on.
clus.labels_
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)
聚类完成后三个聚类中心
# Coordinates of the three cluster centers, one row per cluster and one
# column per feature used in the fit.
clus.cluster_centers_
array([[0.595 , 0.07830508, 0.06083333],
[0.2975 , 0.55661017, 0.50583333],
[0.42916667, 0.76745763, 0.8075 ]])
聚类的评估指标
# Inertia of the fitted model: sum of squared distances from each sample to
# its closest cluster center.
clus.inertia_
4.481991774793322
确定最佳聚类数目
尝试不同的类别数,查看criterion值(即inertia,簇内误差平方和)。该值总体上会随类别数增加而持续下降,因此不能单纯取最小值,而应选择下降速度明显变缓的“拐点”;画出“肘线图”
# Fit K-means for k = 1..8 and record the inertia of every fit; the resulting
# table feeds the elbow plot.
# The fits use the same feature columns (iloc[:, 1:4]) as the k=3 models in
# this notebook, so that the elbow describes the data that is actually
# clustered. random_state makes the table reproducible, and n_init is spelled
# out because its default differs between scikit-learn versions.
L = []
for i in range(1, 9):
    # A separate name keeps the previously fitted `clus` model intact.
    model = KMeans(n_clusters=i, n_init=10, random_state=0)
    model.fit(data.iloc[:, 1:4])
    L.append([i, model.inertia_])
L = pd.DataFrame(L, columns=["k", "criterion"])
L
k
criterion
0
1
18.253249
1
2
5.106290
2
3
3.312646
3
4
2.585065
4
5
1.946648
5
6
1.637264
6
7
1.387541
7
8
1.175937
# Elbow plot: the recorded criterion (inertia) against the number of clusters k.
sns.pointplot(x="k", y="criterion", data=L)
# Strip the surrounding axes spines for a cleaner figure.
sns.despine()
根据选定的聚类模型,对样本进行预测
从“肘线图”可看出,类别数取3或4较为合适,此处使用3
# Fit the final 3-cluster model and store the raw cluster index of every
# sample in the "pred" column.
# random_state makes the run reproducible; n_init is spelled out because its
# default differs between scikit-learn versions.
# NOTE(review): iloc[:, 1:4] selects x2..x4 only, so x1 is left out of the
# clustering -- confirm this is intended (iloc[:, :4] would use all four).
clus = KMeans(n_clusters=3, n_init=10, random_state=0)
clus = clus.fit(data.iloc[:, 1:4])
data["pred"] = clus.predict(data.iloc[:, 1:4])
# K-means numbers its clusters arbitrarily, so a hard-coded cluster -> class
# table is only valid for one particular run. Derive the mapping from the
# data instead: every cluster gets the true class that occurs most often
# among its members.
# NOTE(review): this assumes each cluster is dominated by a different class,
# which holds for iris with k=3; verify before reusing on other data.
cluster_to_class = pd.crosstab(data["pred"], data["y"]).idxmax(axis=1)
data["Pred"] = data["pred"].map(cluster_to_class).astype("int64")
data.head()
x1
x2
x3
x4
y
pred
Pred
0
0.222222
0.625000
0.067797
0.041667
0
1
0
1
0.166667
0.416667
0.067797
0.041667
0
1
0
2
0.111111
0.500000
0.050847
0.041667
0
1
0
3
0.083333
0.458333
0.084746
0.041667
0
1
0
4
0.194444
0.666667
0.067797
0.041667
0
1
0
画出预测混淆矩阵,计算准确率
# Confusion matrix: rows are the true classes (y), columns are the classes
# assigned from the clustering (Pred).
df = pd.crosstab(data["y"], data["Pred"])
df
Pred
0
1
2
y
0
50
0
0
1
0
46
4
2
0
4
46
# Accuracy = correctly classified samples / all samples.
# The correct predictions are the diagonal of the confusion matrix (this
# assumes every class appears both as a row and as a column, so the table is
# square). The total is taken from the table itself instead of hard-coding
# the sample count.
confusion = df.to_numpy()
accuracy = np.trace(confusion) / confusion.sum()
print("预测准确率为:", round(accuracy * 100, 1), "%")
预测准确率为: 94.7 %
本文地址:/weixin_40575651/article/details/107334269
希望与广大网友互动??
点此进行留言吧!