Python数据分析-植物生长数据分析（机器学习模型和神经网络模型）

一、研究背景

植物生长受多种环境因素的影响，包括土壤类型、日照时间、浇水频率、肥料类型、温度和湿度等。这些因素不仅影响植物的生长速度和健康状况，还对植物在不同生长阶段的表现有显著影响。随着气候变化和环境污染问题的加剧，研究如何优化植物生长条件以提高农作物产量和质量变得尤为重要。本研究旨在通过分析不同环境变量对植物生长里程碑的影响，找出最佳的种植条件，为农民和园艺师提供科学依据，帮助他们在不同环境下进行有效的种植管理。

二、研究意义

提高农作物产量和质量：通过了解不同环境因素对植物生长的影响，可以优化种植条件，从而提高农作物的产量和质量。
促进可持续农业发展：通过科学的种植管理，减少对化学肥料和过量水资源的依赖，推动农业的可持续发展。
应对气候变化挑战：为应对气候变化带来的农业挑战提供数据支持，帮助制定应对极端天气和环境变化的种植策略。
增强农民和园艺师的决策能力：提供具体的种植指导，帮助农民和园艺师在实际生产中做出更明智的决策，提高生产效率和经济效益。

三、实证分析

该数据集包含 Growth_Milestone（目标变量）以及一些影响它的因素，本问题被视为二元分类问题。第 1 部分：探索性数据分析（Exploratory Data Analysis），除了准备和清理数据之外，还包含对数据的一些分析；第 2 部分：我将使用一些传统的机器学习技术：

目标：根据提供的环境和管理因素对植物的生长里程碑进行预测和分类。我们的目标：预测植物的生长阶段或里程碑。特征： Soil_Type：植物生长的土壤类型或成分。 Sunlight_Hours：植物接受阳光照射的持续时间或强度。 Water_Frequency：植物浇水的频率，表示浇水时间表。 Fertilizer_Type：用于滋养植物的肥料类型。 Temperature：植物生长的环境温度条件。 Humidity：植物周围环境中的水分或湿度水平。 Growth_Milestone：指示植物生长过程中的阶段或重要事件的描述或标记。

数据和完整代码

导入包：

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['KaiTi']  #中文
plt.rcParams['axes.unicode_minus'] = False   #负号
import IPython.display
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.feature_selection import VarianceThreshold ,f_classif ,SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import classification_report,confusion_matrix ,ConfusionMatrixDisplay
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

读取数据和查看其基本信息：

# Load the plant-growth dataset and preview its first five rows.
data = pd.read_csv("data.csv")
data.head(n=5)

获取其他信息

随后进行描述性统计分析

# Descriptive statistics, transposed so that each described column is a row.
data.describe().transpose()

接下来查看数据类型和查看缺失值类型：

# Data type of every column.
data.dtypes

# Fraction of missing values in every column (mean of the boolean NA mask).
data.isnull().mean()

接下来可视化查看 GrowthMilestone的占比


# Shared title/label font; later plots in this script reuse `f`.
f = {"family": "serif", "size": 25, "color": "r"}

# Class counts of the target. value_counts() orders the classes by frequency,
# so the slice labels are derived from the index instead of being hard-coded
# in an assumed order.
# NOTE(review): assumes the target is coded 0 / 1; any other value is shown
# with its own string form - confirm against the dataset.
explicit = data["GrowthMilestone"].value_counts()
milestone_names = {0: "Not GrowthMilestone", 1: "GrowthMilestone"}
milestone_labels = [milestone_names.get(value, str(value)) for value in explicit.index]

# The title previously read "Animia Distribution", which does not describe
# this chart.
plt.title("GrowthMilestone Distribution", fontdict=f)
plt.pie(
    x=explicit,
    labels=milestone_labels,
    colors=["g", "r"],
    explode=[0.1, 0],
    autopct='%1.1f%%',
)
plt.legend()
plt.show()

观察有多少个 SoilType

# Share of each soil type. The labels come from the value_counts() index so
# that every slice is named after the category it actually counts; a
# hard-coded list only matches when the frequency order happens to agree.
explicit = data["SoilType"].value_counts()

plt.title("SoilType Distribution", fontdict=f)
plt.pie(
    x=explicit,
    labels=explicit.index,
    colors=["g", "b", "r"],
    # One offset per slice: this list assumes exactly three soil types.
    explode=[0.2, 0.1, 0.1],
    autopct='%1.1f%%',
)
plt.legend()
plt.show()

# Overview figure: one pie chart per categorical column, drawn on a single
# 2x2 grid and shown once. The earlier version called plt.show() halfway
# through the grid (which split the charts over two figures) and adjusted the
# subplot spacing only after the figure had already been shown.

# NOTE(review): `f1` is not defined anywhere in the visible part of this
# script; a smaller variant of the shared title font `f` is used for the
# subplot titles.
f1 = {**f, "size": 14}


def _draw_pie(ax, counts, title, colors, explode, labels=None):
    """Draw one pie chart of ``counts`` on ``ax``.

    Slice labels default to ``counts.index`` so that every label is tied to
    the category it counts, whatever order ``value_counts()`` returns.
    """
    ax.pie(
        x=counts,
        labels=counts.index if labels is None else labels,
        colors=colors,
        explode=explode,
        autopct='%1.1f%%',
    )
    ax.set_title(title, fontdict=f1)
    ax.legend()


fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Plot 1: watering frequency.
Water_Frequency = data["WaterFrequency"].value_counts()
_draw_pie(
    axes[0, 0],
    Water_Frequency,
    "Water_Frequency_observation",
    colors=["g", "r", "b"],
    explode=[0.1, 0, 0.1],
)

# Plot 2: fertilizer type.
Fertilizer_Type = data["FertilizerType"].value_counts()
_draw_pie(
    axes[0, 1],
    Fertilizer_Type,
    "Fertilizer_Type_observation",
    colors=["orange", "g", "y"],
    explode=[0.1, 0, 0.1],
)

# Plot 3: soil type.
explicit = data["SoilType"].value_counts()
_draw_pie(
    axes[1, 0],
    explicit,
    "SoilType Distribution",
    colors=["g", "b", "r"],
    explode=[0.2, 0.1, 0.1],
)

# Plot 4: target classes.
# NOTE(review): assumes the target is coded 0 / 1; any other value is shown
# with its own string form - confirm against the dataset.
explicit = data["GrowthMilestone"].value_counts()
milestone_names = {0: "Not GrowthMilestone", 1: "GrowthMilestone"}
_draw_pie(
    axes[1, 1],
    explicit,
    "GrowthMilestone_Distribution",
    colors=["g", "r"],
    explode=[0.1, 0],
    labels=[milestone_names.get(value, str(value)) for value in explicit.index],
)

# Space the four charts before the single show() call.
fig.tight_layout()
plt.show()

观察每种土壤类型对应的日照时长（Sunlight_Hours）：

观察 Soil_Type vs. GrowthMilestone：要知道每种土壤中的植物数量：

# Bar chart of the summed GrowthMilestone value per soil type, highest first.
plt.figure(figsize=(7, 8))
plt.title("Distribution of GrowthMilestone vs. Soil_Type", fontdict=f)
top = data.groupby("SoilType")["GrowthMilestone"].sum().sort_values(ascending=False)
# `color="r"` was dropped: the bars are coloured by `palette`, so the extra
# argument had no visible effect.
sns.barplot(x=top.index, y=top.values, palette='coolwarm')
plt.xticks(rotation=45, color="b")
plt.xlabel("Soil_Type", fontdict=f)
plt.ylabel("GrowthMilestone", fontdict=f)
# The legend is used as a caption box; its title previously read "Grough".
plt.legend(title="Number of Growth in Every soil", prop={'size': 12})
plt.show()

观察 Fertilizer_Type vs.温度分布：

连续型数据（continuous data）的分布

# One distribution plot per numeric column, each titled with the column name.
numeric_columns = data.select_dtypes("number").columns
for column_name in numeric_columns:
    sns.displot(data[column_name], color="r")
    plt.title("presention of " + column_name, fontdict=f)

创建连续特征的对图：

# Pairwise scatter plots of the float64 columns.
# sns.pairplot is a figure-level function that creates its own figure, so the
# former plt.figure(figsize=(25, 15), dpi=300) call only produced an extra,
# empty figure and has been removed.
sns.set(style="whitegrid")
sns.set_palette("coolwarm")
sns.pairplot(
    data.select_dtypes(include='float64'),
    plot_kws={'alpha': 0.6, 's': 80},
)

接下来查看相关系数和热力图

# Correlation matrices of the numeric columns: the default method first, then
# Kendall, Spearman and Pearson, each printed followed by a dashed separator.
numeric_data = data.select_dtypes("number")
separator = "-" * 50

corr = numeric_data.corr()
kendall = numeric_data.corr(method="kendall")
spearman = numeric_data.corr(method="spearman")
pearson = numeric_data.corr(method="pearson")

for matrix in (corr, kendall, spearman, pearson):
    print(matrix)
    print()
    print(separator)
    print()

# Annotated heatmap of the correlation matrix of the numeric columns.
correlation_matrix = data.select_dtypes("number").corr()

plt.figure(figsize=(25, 15), dpi=200)
sns.heatmap(correlation_matrix, annot=True, fmt="0.3f", cmap='Blues')
plt.title("Heatmap-Correlation-Matrix", fontdict=f)
plt.xlabel("Features", fontdict=f)
plt.ylabel("Features", fontdict=f)
# Tilt the tick labels on both axes and colour them blue.
plt.xticks(rotation=45, color="b")
plt.yticks(rotation=-45, color="b")
plt.show()

接下来划分特征和响应变量进行模型建立和预测

# Split the data into the feature matrix x and the target vector y.
# NOTE(review): x keeps every remaining column of `data` as-is. The
# categorical columns plotted earlier (SoilType, WaterFrequency,
# FertilizerType) need a numeric encoding before a scikit-learn estimator can
# be fitted on them; no encoding step is visible in this script - confirm it
# happens elsewhere.
x = data.drop(columns="GrowthMilestone")
y = data["GrowthMilestone"]
print(np.shape(x), np.shape(y))
print(len(x), len(y))

# Stratified 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(np.shape(x_train), np.shape(x_test))
# The second shape printed here used to be x_test again instead of y_test.
print(np.shape(y_train), np.shape(y_test))

# Random forest model: 100 trees, depth capped at 5, fixed seed, all cores.
RC1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
)
RC1.fit(x_train, y_train)

# Predict the test rows and preview the first ten predictions.
y_predict = RC1.predict(x_test)
y_predict[:10]

# Try to improve accuracy: sweep n_estimators from 10 to 100 in steps of 10
# and print the test-set accuracy of each forest.
# Each forest now gets a fixed random_state so the printed accuracies are the
# same on every run. The former argument-less np.random.seed() call re-seeded
# NumPy from fresh entropy and therefore could not make the sweep repeatable.
for i in range(10, 110, 10):
    print(f"at n_estimators= {i} n_estimators")
    RC2 = RandomForestClassifier(n_estimators=i, random_state=42).fit(x_train, y_train)
    print(f"accuracy={RC2.score(x_test,y_test)*100:0.2f}%")

决策树

决策树可视化

# Draw the decision tree estimator `DT` (defined outside the visible code)
# on a 15x15-inch canvas.
fig, ax = plt.subplots(figsize=(15, 15), dpi=100)
tree.plot_tree(DT, ax=ax)
ax.set_title("--<<< Decision_Tree >>>--", fontdict=f)
plt.show()

支持向量机

使用 Tensorflow 构建神经网络模型

# `tf` is referenced below for the optimizer but was never imported in the
# visible code, so the package itself is imported here as well.
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.regularizers import l2


# NOTE(review): `model_r` is not defined in the visible part of this script;
# it is presumably the Sequential model built in an omitted cell.
model_r.compile(
    # from_logits=True tells the loss to treat the model output as raw logits.
    # NOTE(review): that is only appropriate if the output layer applies no
    # sigmoid activation - confirm against the model definition.
    loss=BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

绘制训练历史（准确率和损失）

# Training curves side by side: accuracy on the left, loss on the right.
# Each panel draws the training series first, then the matching "val_" series
# recorded in `history`.
plt.figure(figsize=(12, 6))

panels = (
    ("accuracy", "Model Accuracy", "Accuracy"),
    ("loss", "Model Loss", "Loss"),
)
for position, (metric, heading, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric])
    plt.plot(history.history["val_" + metric])
    plt.title(heading)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train', 'Test'], loc='upper left')

plt.tight_layout()
plt.show()

关于机器学习模型的一些结果：

1- RandomForestClassifier：准确率 -->> 58.97%；改进模型后，我们发现当 n_estimators=20 时，准确率 -->> 64.1%，提高了 5.13 个百分点
2- DecisionTreeClassifier：准确率 -->> 51.28%
3- 支持向量机：准确率 -->> 48.72%。我们不能说最好的模型是 RandomForestClassifier，因为每个模型都有优点和缺点，但是这个模型为我们提供了较高的准确率。这是我们从传统机器学习模型中得到的结果。关于使用 Tensorflow 构建的神经网络模型，我们发现：准确率 --->> 67.66%，改进后相同。