- 深入探索Python机器学习算法:模型调优
- 模型调优
- 1. 超参数搜索方法
- 1.1 网格搜索(Grid Search)
- 1.2 随机搜索(Random Search)
- 1.3 贝叶斯优化(Bayesian Optimization)
- 1.4 不同超参数搜索方法的优缺点和适用场景
- 2. 模型复杂度分析
- 2.1 学习曲线和验证曲线
- 2.2 正则化方法
- 2.3 特征选择和特征工程
- 3. 模型融合与集成
- 3.1 模型融合的方法
- Bagging(自助聚合)
- Boosting
- Stacking(堆叠泛化)
- 3.2 常见的集成学习算法
- 随机森林(Random Forest)
- AdaBoost
- Gradient Boosting
- 3.3 模型融合的效果评估和参数调优
- 以 Stacking 模型为例进行参数调优(使用网格搜索)
1. 超参数搜索方法
1.1 网格搜索(Grid Search)
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
# Generate a toy binary-classification dataset: 100 samples, 5 features.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

# Hyperparameter grid: GridSearchCV evaluates every (C, kernel) combination.
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
model = SVC()
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Report the mean cross-validated score of every parameter combination.
results = grid_search.cv_results_
for mean_score, params in zip(results["mean_test_score"], results["params"]):
    print(f"Score: {mean_score:.3f}, Params: {params}")
1.2 随机搜索(Random Search)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np
from scipy.stats import uniform
# Generate a toy binary-classification dataset: 100 samples, 5 features.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

# Hyperparameter distributions. scipy's uniform(loc, scale) samples from
# [loc, loc + scale], so scale=9.9 keeps C inside the intended [0.1, 10]
# range (uniform(0.1, 10) would sample from [0.1, 10.1]).
param_dist = {'C': uniform(loc=0.1, scale=9.9), 'kernel': ['linear', 'rbf']}
model = SVC()
random_search = RandomizedSearchCV(model, param_dist, n_iter=5, cv=5)
random_search.fit(X, y)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")

# Report each sampled parameter combination with its mean CV score.
results = random_search.cv_results_
for score, params in zip(results['mean_test_score'], results['params']):
    print(f"Score: {score:.3f}, Params: {params}")
1.3 贝叶斯优化(Bayesian Optimization)
from skopt import BayesSearchCV
from sklearn.svm import SVC
import numpy as np
# Generate a toy binary-classification dataset: 100 samples, 5 features.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

# Search space: C is a real-valued dimension sampled log-uniformly in
# [0.1, 10.0]; kernel is a categorical dimension.
param_space = {'C': (0.1, 10.0, 'log-uniform'), 'kernel': ['linear', 'rbf']}
model = SVC()
bayes_search = BayesSearchCV(model, param_space, n_iter=5, cv=5)
bayes_search.fit(X, y)

print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

# Walk through the optimisation history in evaluation order. Iterating over
# the recorded results (instead of range(n_iter)) cannot index past the
# number of points that were actually evaluated.
results = bayes_search.cv_results_
history = zip(results['mean_test_score'], results['params'])
for iteration, (score, params) in enumerate(history, start=1):
    print(f"Iteration {iteration}: Score: {score:.3f}, Params: {params}")
1.4 不同超参数搜索方法的优缺点和适用场景
- 网格搜索:优点是会穷举给定网格中的所有参数组合,保证找到网格范围内的最优组合(但不保证是整个参数空间的全局最优);缺点是计算量随超参数个数呈指数增长,适用于超参数空间较小的情况。
- 随机搜索:优点是计算复杂度较低,缺点是不一定能找到全局最优解,适用于超参数空间较大的情况。
- 贝叶斯优化:优点是可以利用之前的评估结果,更快地找到最优解,缺点是实现复杂,适用于超参数空间较大且评估代价较高的情况。
2. 模型复杂度分析
2.1 学习曲线和验证曲线
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
# Generate a toy binary-classification dataset: 100 samples, 5 features.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

# --- Learning curve: score as a function of the training-set size ---
model = LogisticRegression()
train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=5)
# Aggregate the per-fold scores (axis 1) for each training size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Use a dedicated figure so the validation curve below does not draw on top.
plt.figure()
plt.title("Learning Curve")
plt.xlabel("Training examples")
plt.ylabel("Score")
# Shaded bands show +/- one standard deviation across the CV folds.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
         label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
         label="Cross - validation score")
plt.legend(loc="best")

# --- Validation curve: score as a function of the hyperparameter C ---
param_range = np.logspace(-3, 3, 7)
train_scores, test_scores = validation_curve(
    LogisticRegression(), X, y, param_name="C", param_range=param_range,
    cv=5, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("Validation Curve with Logistic Regression")
plt.xlabel("C")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
# param_range is log-spaced, hence the logarithmic x-axis.
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")

# Render both figures when run as a script (outside a notebook).
plt.show()
2.2 正则化方法
正则化方法可以通过对模型参数进行约束,防止模型过拟合。常见的正则化方法有 L1 正则化和 L2 正则化。
from sklearn.linear_model import Lasso, Ridge
import numpy as np
import matplotlib.pyplot as plt
# Generate toy data: 100 samples, 5 features, binary (0/1) target that the
# linear models below treat as a numeric regression target.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

# L1 regularisation
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
print("Lasso coefficients:", lasso.coef_)

# L2 regularisation
ridge = Ridge(alpha=0.1)
ridge.fit(X, y)
print("Ridge coefficients:", ridge.coef_)

# Track how the coefficients change with the regularisation strength.
alphas = np.logspace(-4, 2, 20)
lasso_coefs = []
ridge_coefs = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X, y)
    # Record the fitted coefficients; without this the arrays built below
    # are empty and indexing their second dimension fails.
    lasso_coefs.append(lasso.coef_)
    ridge = Ridge(alpha=alpha)
    ridge.fit(X, y)
    ridge_coefs.append(ridge.coef_)
# Stack into arrays with one row per alpha and one column per feature.
lasso_coefs = np.array(lasso_coefs)
ridge_coefs = np.array(ridge_coefs)

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
for i in range(lasso_coefs.shape[1]):
    plt.plot(alphas, lasso_coefs[:, i], label=f'Feature {i+1}')
plt.xscale('log')  # alphas are log-spaced
plt.xlabel('Alpha (Regularization strength)')
plt.ylabel('Coefficient value')
plt.title('Lasso Regularization')
plt.legend()

plt.subplot(1, 2, 2)
for i in range(ridge_coefs.shape[1]):
    plt.plot(alphas, ridge_coefs[:, i], label=f'Feature {i+1}')
plt.xscale('log')
plt.xlabel('Alpha (Regularization strength)')
plt.ylabel('Coefficient value')
plt.title('Ridge Regularization')
plt.legend()

plt.tight_layout()
# Render the figure when run as a script (outside a notebook).
plt.show()
L1 正则化会使模型的部分参数变为零,从而实现特征选择的效果。在一些特征数量较多,但其中部分特征对模型贡献不大的场景下,L1 正则化可以帮助我们筛选出重要的特征。L2 正则化则会使模型的参数值变小,但不会使其变为零。它可以使模型的参数分布更加平滑,减少模型对个别特征的过度依赖,从而提高模型的泛化能力。
2.3 特征选择和特征工程
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Generate toy data: five random feature columns plus a random 'target'.
data = pd.DataFrame(
    np.random.rand(100, 6),
    columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'target'],
)

# Correlation analysis: keep features whose absolute correlation with the
# target exceeds 0.2.
correlation_matrix = data.corr()
target_correlation = correlation_matrix['target'].drop('target')
selected_features = target_correlation[target_correlation.abs() > 0.2].index
print("Selected features by correlation analysis:", selected_features)

# Correlation-matrix heatmap. Correlations lie in [-1, 1]; pinning the colour
# limits to that range centres the diverging colormap on zero, and the
# colorbar lets the reader map colours back to values.
plt.figure(figsize=(8, 6))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest',
           vmin=-1, vmax=1)
plt.colorbar()
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=45)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
plt.title('Correlation Matrix Heatmap')

# Principal component analysis (PCA): project the features onto 2 components.
X = data.drop('target', axis=1).values
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print("Shape of original data:", X.shape)
print("Shape of data after PCA:", X_pca.shape)

# Scatter plot of the projected data, coloured by the target value.
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=data['target'], cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Data')

# Render both figures when run as a script (outside a notebook).
plt.show()
3. 模型融合与集成
3.1 模型融合的方法
Bagging 的核心思想是通过自助采样(有放回抽样)从原始数据集中生成多个子集,然后在每个子集上训练一个基模型,最后将这些基模型的预测结果进行综合(如分类任务中的投票,回归任务中的平均)。Bagging 可以降低模型的方差,提高模型的稳定性。
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Toy binary-classification data (100 samples, 5 features), with 20% of the
# samples held out for evaluation.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Bagging ensemble of 10 decision trees; fit() returns the fitted estimator,
# so construction and training are chained.
base_model = DecisionTreeClassifier()
bagging_model = BaggingClassifier(base_model, n_estimators=10).fit(
    X_train, y_train
)

# Accuracy on the held-out split.
accuracy = accuracy_score(y_test, bagging_model.predict(X_test))
print(f"Bagging model accuracy: {accuracy}")
Boosting 是一种迭代的方法,它在每一轮训练中都会关注前一轮训练中被错误分类的样本,通过调整样本的权重,使得后续的基模型更加关注这些难分类的样本。最后将所有基模型的预测结果进行加权组合。常见的 Boosting 算法有 AdaBoost、Gradient Boosting 等。Boosting 可以降低模型的偏差,提高模型的准确性。
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Toy binary-classification data (100 samples, 5 features), with 20% of the
# samples held out for evaluation.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Weak learner: a depth-1 decision tree (a "stump").
base_model = DecisionTreeClassifier(max_depth=1)

# AdaBoost ensemble of 10 stumps; fit() returns the fitted estimator.
adaboost_model = AdaBoostClassifier(base_model, n_estimators=10).fit(
    X_train, y_train
)

# Accuracy on the held-out split.
accuracy = accuracy_score(y_test, adaboost_model.predict(X_test))
print(f"AdaBoost model accuracy: {accuracy}")
Stacking 是一种更复杂的模型融合方法,它将多个不同的基模型的预测结果作为新的特征,然后在这些新特征上训练一个元模型,最终由元模型进行预测。Stacking 可以充分利用不同基模型的优势,提高模型的性能。
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Generate a toy binary-classification dataset and hold out 20% for testing.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base (level-0) models
base_models = [RandomForestClassifier(), LogisticRegression()]
kf = KFold(n_splits=5)

# Out-of-fold predictions: each training row's meta-feature is produced by a
# model that did not see that row during fitting.
blend_train = np.zeros((X_train.shape[0], len(base_models)))
for i, model in enumerate(base_models):
    for train_index, test_index in kf.split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        # Column 1 of predict_proba is the probability of class 1.
        blend_train[test_index, i] = model.predict_proba(X_train[test_index])[:, 1]
    # Refit on the full training set. Otherwise the model used for the test
    # predictions below would only have been trained on the last fold's split.
    model.fit(X_train, y_train)

# Meta (level-1) model trained on the out-of-fold predictions
meta_model = LogisticRegression()
meta_model.fit(blend_train, y_train)

# Build the meta-features for the test set from the refitted base models.
blend_test = np.zeros((X_test.shape[0], len(base_models)))
for i, model in enumerate(base_models):
    blend_test[:, i] = model.predict_proba(X_test)[:, 1]

# Final prediction by the meta model
y_pred = meta_model.predict(blend_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Stacking model accuracy: {accuracy}")
3.2 常见的集成学习算法
随机森林(Random Forest)
随机森林是基于 Bagging 思想的集成学习算法,它以决策树为基模型。在构建每棵决策树时,不仅会进行自助采样,还会随机选择部分特征进行分裂,从而增加了模型的多样性。随机森林具有较好的泛化能力,对异常值和噪声不敏感,且可以处理高维数据。
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Toy binary-classification data (100 samples, 5 features), with 20% of the
# samples held out for evaluation.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with 10 trees; fit() returns the fitted estimator.
rf_model = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

# Accuracy on the held-out split.
accuracy = accuracy_score(y_test, rf_model.predict(X_test))
print(f"Random Forest model accuracy: {accuracy}")
AdaBoost 通过调整样本的权重,使得后续的基模型更加关注前一轮被错误分类的样本。它会为每个基模型分配一个权重,最终的预测结果是所有基模型预测结果的加权组合。AdaBoost 可以自适应地调整模型的复杂度,提高模型的准确性。
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Sample data: 100 random rows with 5 features and random 0/1 labels.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

# Keep 20% of the rows aside as a test split (fixed split seed).
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Depth-1 tree used as the weak learner.
base_model = DecisionTreeClassifier(max_depth=1)

# Boost 10 weak learners.
adaboost_model = AdaBoostClassifier(base_model, n_estimators=10)
adaboost_model.fit(X_train, y_train)

# Score the ensemble on the test split.
y_pred = adaboost_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"AdaBoost model accuracy: {accuracy}")
Gradient Boosting
Gradient Boosting 是一种迭代的决策树集成方法,每一轮都让新的基模型去拟合当前集成模型的残差(更一般地说,是损失函数关于当前预测值的负梯度)。每一轮训练都会使模型朝着减少损失函数的方向前进,最终将所有基模型的预测结果(按学习率缩放后)相加得到最终的预测值。Gradient Boosting 可以处理各种类型的数据,并且在很多机器学习竞赛中都取得了优异的成绩。
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Sample data: 100 random rows with 5 features and random 0/1 labels.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

# Keep 20% of the rows aside as a test split (fixed split seed).
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Gradient-boosting classifier with 10 boosting stages.
gb_model = GradientBoostingClassifier(n_estimators=10)
gb_model.fit(X_train, y_train)

# Score the ensemble on the test split.
accuracy = accuracy_score(y_test, gb_model.predict(X_test))
print(f"Gradient Boosting model accuracy: {accuracy}")
3.3 模型融合的效果评估和参数调优
模型融合的效果评估可以使用前面提到的各种评估指标,如分类任务中的准确率、召回率、F1 值等,回归任务中的均方误差、决定系数等。在进行参数调优时,可以使用网格搜索、随机搜索或贝叶斯优化等方法,对基模型和元模型的超参数进行调整,以找到最优的参数组合,提高模型融合的性能。
以 Stacking 模型为例进行参数调优(使用网格搜索)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Generate a toy binary-classification dataset and hold out 20% for testing.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base (level-0) models
base_models = [RandomForestClassifier(), LogisticRegression()]
kf = KFold(n_splits=5)

# Out-of-fold predictions: each training row's meta-feature is produced by a
# model that did not see that row during fitting.
blend_train = np.zeros((X_train.shape[0], len(base_models)))
for i, model in enumerate(base_models):
    for train_index, test_index in kf.split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        # Column 1 of predict_proba is the probability of class 1.
        blend_train[test_index, i] = model.predict_proba(X_train[test_index])[:, 1]
    # Refit on the full training set. Otherwise the model used for the test
    # predictions below would only have been trained on the last fold's split.
    model.fit(X_train, y_train)

# Meta (level-1) model
meta_model = LogisticRegression()

# Parameter grid for the meta model
param_grid = {'C': [0.1, 1, 10]}

# Grid search over the meta model, cross-validated on the out-of-fold features
grid_search = GridSearchCV(meta_model, param_grid, cv=5)
grid_search.fit(blend_train, y_train)

# Report the best parameters and the best cross-validated score
print(f"Best parameters for meta - model: {grid_search.best_params_}")
print(f"Best score for meta - model: {grid_search.best_score_}")

# Meta model with the best parameters found by the search
best_meta_model = grid_search.best_estimator_

# Build the meta-features for the test set from the refitted base models.
blend_test = np.zeros((X_test.shape[0], len(base_models)))
for i, model in enumerate(base_models):
    blend_test[:, i] = model.predict_proba(X_test)[:, 1]

y_pred = best_meta_model.predict(blend_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Stacking model accuracy after tuning: {accuracy}")