基于Python的鸢尾花聚类与分类

news2025/7/6 12:16:16

1 导入必要的库

from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

2 加载数据


# 加载数据
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

3 数据探索

sns.pairplot(df, hue='target')
plt.show()

图3-1

4 聚类分析


# 设定聚类数为3
kmeans = KMeans(n_clusters=3, random_state=0).fit(df.drop('target', axis=1))
df['cluster'] = kmeans.labels_
# 可视化聚类结果
plt.scatter(df['sepal length (cm)'], df['sepal width (cm)'], c=df['cluster'], cmap='viridis')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('K-Means Clustering of Iris Dataset')
plt.show()

图4-1 聚类结果


# 计算轮廓系数
score = silhouette_score(df.drop('target', axis=1), kmeans.labels_)
print(f"Silhouette Coefficient: {score}")

# 计算每个样本的轮廓系数
sample_silhouette_values = silhouette_samples(df.drop('target', axis=1), kmeans.labels_)

# 可视化轮廓图
plt.figure(figsize=(10, 5))
y_lower = 10
for i in range(3):
    # Aggregate the silhouette scores for samples belonging to
    # cluster i, and sort them
    ith_cluster_silhouette_values = sample_silhouette_values[kmeans.labels_ == i]

    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = plt.cm.nipy_spectral(float(i) / 3)
    plt.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i),
             color=color, fontweight='bold', verticalalignment='center')

    y_lower = y_upper + 10  # 10 for the 0 samples

plt.xlabel('Silhouette Coefficient')
plt.ylabel('Cluster Label')
plt.title('Silhouette Plot')
plt.show()

图4-2 轮廓图

5 决策树分类


# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.3, random_state=42)

# 初始化决策树分类器
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# 预测测试集
y_pred = clf.predict(X_test)
# 显示混淆矩阵
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# 显示分类报告
print(classification_report(y_test, y_pred))
# 可视化决策树
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20,10))
plot_tree(clf, filled=True, feature_names=iris.feature_names, class_names=list(iris.target_names), rounded=True, fontsize=9)
plt.show()