基于Python 3.9版本演示
一、写在前面
最近看了一篇在Lancet子刊《eClinicalMedicine》上发表的机器学习分类的文章:《Development of a novel dementia risk prediction model in the general population: A large, longitudinal, population-based machine-learning study》。
学到一种叫做“概率校准”的骚操作,顺手利用GPT系统学习学习。
文章中用的技术是:保序回归(Isotonic regression)。
为了体现举一反三,顺便问了GPT还有哪些方法也可以实现概率校准。它给我列举了很多,那么就一个一个学习吧。
这一期,介绍一个叫做 Beta Calibration 的方法。
二、Beta Calibration
Beta Calibration 是一种用于概率校准的方法,旨在调整机器学习分类模型输出的概率,使其更接近真实的后验概率。这种校准方法是通过对输出概率进行二项分布参数的转换来实现的。
(1)作用和优势
1)提高概率预测的可靠性:经过校准后的概率输出更能反映真实情况,从而提高模型在实际应用中的可靠性。例如,在医疗诊断中,准确的概率预测对于决策非常重要。
2)优化决策阈值:经过校准的概率输出可以帮助优化决策阈值,使得分类模型在不同的应用场景中更加灵活和有效。
3)改进不确定性评估:通过校准后的概率输出,模型能够更准确地评估其预测的不确定性,从而在风险管理和资源分配等方面提供更有价值的信息。
三、Beta Calibration代码实现
下面,我编一个1比3的不太平衡的数据进行测试,对照组使用不进行校准的SVM模型,实验组就是加入校准的SVM模型,看看性能能够提高多少?
(1)不进行校准的SVM模型(默认参数)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
# 加载数据
dataset = pd.read_csv('8PSMjianmo.csv')
X = dataset.iloc[:, 1:20].values
Y = dataset.iloc[:, 0].values
# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=666)
# 标准化数据
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# 使用SVM分类器
classifier = SVC(kernel='linear', probability=True)
classifier.fit(X_train, y_train)
# 预测结果
y_pred = classifier.predict(X_test)
y_testprba = classifier.decision_function(X_test)
y_trainpred = classifier.predict(X_train)
y_trainprba = classifier.decision_function(X_train)
# 混淆矩阵
cm_test = confusion_matrix(y_test, y_pred)
cm_train = confusion_matrix(y_train, y_trainpred)
print(cm_train)
print(cm_test)
# 绘制测试集混淆矩阵
classes = list(set(y_test))
classes.sort()
plt.imshow(cm_test, cmap=plt.cm.Blues)
indices = range(len(cm_test))
plt.xticks(indices, classes)
plt.yticks(indices, classes)
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
for first_index in range(len(cm_test)):
for second_index in range(len(cm_test[first_index])):
plt.text(first_index, second_index, cm_test[first_index][second_index])
plt.show()
# 绘制训练集混淆矩阵
classes = list(set(y_train))
classes.sort()
plt.imshow(cm_train, cmap=plt.cm.Blues)
indices = range(len(cm_train))
plt.xticks(indices, classes)
plt.yticks(indices, classes)
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
for first_index in range(len(cm_train)):
for second_index in range(len(cm_train[first_index])):
plt.text(first_index, second_index, cm_train[first_index][second_index])
plt.show()
# 计算并打印性能参数
def calculate_metrics(cm, y_true, y_pred_prob):
a = cm[0, 0]
b = cm[0, 1]
c = cm[1, 0]
d = cm[1, 1]
acc = (a + d) / (a + b + c + d)
error_rate = 1 - acc
sen = d / (d + c)
sep = a / (a + b)
precision = d / (b + d)
F1 = (2 * precision * sen) / (precision + sen)
MCC = (d * a - b * c) / (np.sqrt((d + b) * (d + c) * (a + b) * (a + c)))
auc_score = roc_auc_score(y_true, y_pred_prob)
metrics = {
"Accuracy": acc,
"Error Rate": error_rate,
"Sensitivity": sen,
"Specificity": sep,
"Precision": precision,
"F1 Score": F1,
"MCC": MCC,
"AUC": auc_score
}
return metrics
metrics_test = calculate_metrics(cm_test, y_test, y_testprba)
metrics_train = calculate_metrics(cm_train, y_train, y_trainprba)
print("Performance Metrics (Test):")
for key, value in metrics_test.items():
print(f"{key}: {value:.4f}")
print("\nPerformance Metrics (Train):")
for key, value in metrics_train.items():
print(f"{key}: {value:.4f}")
结果输出:
记住这些个数字。
这个参数的SVM还没有LR好。
(2)进行校准的SVM模型(默认参数)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from scipy.stats import beta
from scipy.optimize import minimize
# 加载数据
dataset = pd.read_csv('8PSMjianmo.csv')
X = dataset.iloc[:, 1:20].values
Y = dataset.iloc[:, 0].values
# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=666)
# 标准化数据
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# 使用SVM分类器
classifier = SVC(kernel='rbf', C=0.1, probability=True)
classifier.fit(X_train, y_train)
# 获取未校准的概率预测
y_train_probs = classifier.predict_proba(X_train)[:, 1]
y_test_probs = classifier.predict_proba(X_test)[:, 1]
# Beta Calibration 函数
def beta_calibration(y_probs, y_true):
def neg_log_likelihood(params):
a, b = params
return -np.sum(beta.logpdf(y_probs, a, b) * y_true + beta.logpdf(1 - y_probs, a, b) * (1 - y_true))
result = minimize(neg_log_likelihood, [1, 1], bounds=[(1e-3, None), (1e-3, None)])
a, b = result.x
return beta.cdf(y_probs, a, b)
# 进行Beta校准
calibrated_train_probs = beta_calibration(y_train_probs, y_train)
calibrated_test_probs = beta_calibration(y_test_probs, y_test)
# 预测结果
y_train_pred = (calibrated_train_probs >= 0.5).astype(int)
y_test_pred = (calibrated_test_probs >= 0.5).astype(int)
# 混淆矩阵
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)
print(cm_test)
# 绘制混淆矩阵函数
def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
plt.imshow(cm, cmap=plt.cm.Blues)
indices = range(len(cm))
plt.xticks(indices, classes)
plt.yticks(indices, classes)
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
for first_index in range(len(cm)):
for second_index in range(len(cm[first_index])):
plt.text(first_index, second_index, cm[first_index][second_index])
plt.title(title)
plt.show()
# 绘制测试集混淆矩阵
plot_confusion_matrix(cm_test, list(set(y_test)), 'Confusion Matrix (Test)')
# 绘制训练集混淆矩阵
plot_confusion_matrix(cm_train, list(set(y_train)), 'Confusion Matrix (Train)')
# 计算并打印性能参数
def calculate_metrics(cm, y_true, y_pred_prob):
a = cm[0, 0]
b = cm[0, 1]
c = cm[1, 0]
d = cm[1, 1]
acc = (a + d) / (a + b + c + d)
error_rate = 1 - acc
sen = d / (d + c)
sep = a / (a + b)
precision = d / (b + d)
F1 = (2 * precision * sen) / (precision + sen)
MCC = (d * a - b * c) / (np.sqrt((d + b) * (d + c) * (a + b) * (a + c)))
auc_score = roc_auc_score(y_true, y_pred_prob)
metrics = {
"Accuracy": acc,
"Error Rate": error_rate,
"Sensitivity": sen,
"Specificity": sep,
"Precision": precision,
"F1 Score": F1,
"MCC": MCC,
"AUC": auc_score
}
return metrics
metrics_test = calculate_metrics(cm_test, y_test, calibrated_test_probs)
metrics_train = calculate_metrics(cm_train, y_train, calibrated_train_probs)
print("Performance Metrics (Test):")
for key, value in metrics_test.items():
print(f"{key}: {value:.4f}")
print("\nPerformance Metrics (Train):")
for key, value in metrics_train.items():
print(f"{key}: {value:.4f}")
看看结果:
同样的,仅仅训练集起作用了,验证集差强人意。
四、换个策略
参考那篇文章的策略:采用五折交叉验证来建立和评估模型,其中四折用于训练,一折用于评估,在训练集中,其中三折用于建立SVM模型,另一折采用Beta Calibration概率校正,在训练集内部采用交叉验证对超参数进行调参。
代码:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from scipy.stats import beta
from scipy.optimize import minimize
# 加载数据
dataset = pd.read_csv('8PSMjianmo.csv')
X = dataset.iloc[:, 1:20].values
Y = dataset.iloc[:, 0].values
# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=666)
# 标准化数据
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# 超参数调优
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf']}
svm = SVC(probability=True)
clf = GridSearchCV(svm, param_grid, cv=3, scoring='roc_auc')
clf.fit(X_train, y_train)
best_params = clf.best_params_
# 五折交叉验证
kf = KFold(n_splits=5)
calibrated_probs = []
true_labels = []
for train_index, test_index in kf.split(X_train):
X_train_cv, X_val_cv = X_train[train_index], X_train[test_index]
y_train_cv, y_val_cv = y_train[train_index], y_train[test_index]
# 使用最佳参数训练SVM
classifier = SVC(kernel=best_params['kernel'], C=best_params['C'], probability=True)
classifier.fit(X_train_cv, y_train_cv)
# 获取未校准的概率预测
y_val_cv_probs = classifier.predict_proba(X_val_cv)[:, 1]
# Beta Calibration
def beta_calibration(y_probs, y_true):
def neg_log_likelihood(params):
a, b = params
return -np.sum(beta.logpdf(y_probs, a, b) * y_true + beta.logpdf(1 - y_probs, a, b) * (1 - y_true))
result = minimize(neg_log_likelihood, [1, 1], bounds=[(1e-3, None), (1e-3, None)])
a, b = result.x
return beta.cdf(y_probs, a, b)
# 进行Beta校准
calibrated_val_cv_probs = beta_calibration(y_val_cv_probs, y_val_cv)
calibrated_probs.extend(calibrated_val_cv_probs)
true_labels.extend(y_val_cv)
# 用于测试集的SVM模型训练和校准
classifier_final = SVC(kernel=best_params['kernel'], C=best_params['C'], probability=True)
classifier_final.fit(X_train, y_train)
y_test_probs = classifier_final.predict_proba(X_test)[:, 1]
calibrated_test_probs = beta_calibration(y_test_probs, y_test)
# 预测结果
y_train_pred = (np.array(calibrated_probs) >= 0.5).astype(int)
y_test_pred = (calibrated_test_probs >= 0.5).astype(int)
# 混淆矩阵
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)
print("Training Confusion Matrix:\n", cm_train)
print("Testing Confusion Matrix:\n", cm_test)
# 绘制混淆矩阵函数
def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
plt.imshow(cm, cmap=plt.cm.Blues)
indices = range(len(cm))
plt.xticks(indices, classes)
plt.yticks(indices, classes)
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
for first_index in range(len(cm)):
for second_index in range(len(cm[first_index])):
plt.text(first_index, second_index, cm[first_index][second_index])
plt.title(title)
plt.show()
# 绘制测试集混淆矩阵
plot_confusion_matrix(cm_test, list(set(y_test)), 'Confusion Matrix (Test)')
# 绘制训练集混淆矩阵
plot_confusion_matrix(cm_train, list(set(y_train)), 'Confusion Matrix (Train)')
# 计算并打印性能参数
def calculate_metrics(cm, y_true, y_pred_prob):
a = cm[0, 0]
b = cm[0, 1]
c = cm[1, 0]
d = cm[1, 1]
acc = (a + d) / (a + b + c + d)
error_rate = 1 - acc
sen = d / (d + c)
sep = a / (a + b)
precision = d / (b + d)
F1 = (2 * precision * sen) / (precision + sen)
MCC = (d * a - b * c) / (np.sqrt((d + b) * (d + c) * (a + b) * (a + c)))
auc_score = roc_auc_score(y_true, y_pred_prob)
metrics = {
"Accuracy": acc,
"Error Rate": error_rate,
"Sensitivity": sen,
"Specificity": sep,
"Precision": precision,
"F1 Score": F1,
"MCC": MCC,
"AUC": auc_score
}
return metrics
metrics_test = calculate_metrics(cm_test, y_test, calibrated_test_probs)
metrics_train = calculate_metrics(cm_train, y_train, np.array(calibrated_probs))
print("Performance Metrics (Test):")
for key, value in metrics_test.items():
print(f"{key}: {value:.4f}")
print("\nPerformance Metrics (Train):")
for key, value in metrics_train.items():
print(f"{key}: {value:.4f}")
输出:
哈哈哈,性能参数都飞回去了。
五、最后
各位可以去试一试在其他数据或者在其他机器学习分类模型中使用的效果。
数据不分享啦。