1 导入必要的库
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
2 加载数据
# Load the Iris dataset and assemble a single DataFrame: one column per
# entry in iris.feature_names, plus a 'target' column holding iris.target.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)
3 数据探索
# Exploratory view: pairwise plots of the columns in df, coloured by the
# value of the 'target' column.
sns.pairplot(data=df, hue='target')
plt.show()
图3-1 鸢尾花各特征两两关系图(按类别着色)
4 聚类分析
# Cluster the samples into 3 groups with K-Means.
# The feature columns are selected by name rather than by dropping 'target':
# once the 'cluster' column below exists, drop('target') would also pick it
# up, so re-running this step would feed the previous labels back into
# KMeans as if they were a measurement.
cluster_features = df[iris.feature_names]
# n_init is pinned explicitly because its default differs between
# scikit-learn releases; fixing it keeps the result reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(cluster_features)
df['cluster'] = kmeans.labels_

# Visualise the clustering on the two sepal measurements, coloured by the
# assigned cluster label.
plt.scatter(df['sepal length (cm)'], df['sepal width (cm)'], c=df['cluster'], cmap='viridis')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('K-Means Clustering of Iris Dataset')
plt.show()
图4-1 聚类结果
# Silhouette analysis of the K-Means result.
# Score against the feature columns only. df.drop('target', axis=1) would
# also include the 'cluster' column added after fitting, which puts the
# labels being evaluated into the distance computation and skews the scores.
silhouette_features = df[iris.feature_names]
cluster_labels = kmeans.labels_
n_clusters = kmeans.n_clusters

# Mean silhouette coefficient over all samples.
score = silhouette_score(silhouette_features, cluster_labels)
print(f"Silhouette Coefficient: {score}")

# Per-sample silhouette coefficients, used for the silhouette plot below.
sample_silhouette_values = silhouette_samples(silhouette_features, cluster_labels)

# Silhouette plot: one horizontal band per cluster, stacked vertically.
plt.figure(figsize=(10, 5))
y_lower = 10
for i in range(n_clusters):
    # Collect the silhouette values of the samples in cluster i, sorted
    # ascending so each band has a smooth outline.
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = plt.cm.nipy_spectral(float(i) / n_clusters)
    plt.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        alpha=0.7,
    )

    # Label each band with its cluster number at the band's vertical middle.
    plt.text(
        -0.05,
        y_lower + 0.5 * size_cluster_i,
        str(i),
        color=color,
        fontweight='bold',
        verticalalignment='center',
    )

    # Leave a 10-row gap before the next cluster's band.
    y_lower = y_upper + 10

plt.xlabel('Silhouette Coefficient')
plt.ylabel('Cluster Label')
plt.title('Silhouette Plot')
plt.show()
图4-2 轮廓图
5 决策树分类
# Decision-tree classification of the Iris classes.
# Select the feature columns by name. df.drop('target', axis=1) would also
# pass the derived 'cluster' column to the classifier, and the tree would
# then be trained on one more feature than the names given to plot_tree
# below, so node labels would not line up with the trained features.
tree_features = df[iris.feature_names]
tree_labels = df['target']

# Hold out 30% of the samples for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    tree_features, tree_labels, test_size=0.3, random_state=42
)

# Fit the classifier on the training split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = clf.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class) as a heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Per-class classification report.
print(classification_report(y_test, y_pred))

# Draw the fitted tree. plot_tree and plt are already imported at the top of
# the script, so they are not re-imported here.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    filled=True,
    feature_names=iris.feature_names,
    class_names=list(iris.target_names),
    rounded=True,
    fontsize=9,
)
plt.show()
图5-1 混淆矩阵
图5-2 决策树模型结构