第1关:什么是集成学习
任务描述
本关任务:根据本节课所学知识完成本关所设置的选择题。
第2关: Boosting
任务描述
本关任务:根据本节课所学知识完成本关所设置的选择题。
第3关:Adaboost算法流程
任务描述
本关任务:用Python实现Adaboost,并通过鸢尾花数据集中鸢尾花的2种属性与种类对Adaboost模型进行训练。我们会调用你训练好的Adaboost模型,来对未知的鸢尾花进行分类。
#encoding=utf8
import numpy as np
#adaboost算法
class AdaBoost:
'''
input:n_estimators(int):迭代轮数
learning_rate(float):弱分类器权重缩减系数
'''
def __init__(self, n_estimators=50, learning_rate=1.0):
self.clf_num = n_estimators
self.learning_rate = learning_rate
def init_args(self, datasets, labels):
self.X = datasets
self.Y = labels
self.M, self.N = datasets.shape
# 弱分类器数目和集合
self.clf_sets = []
# 初始化weights
self.weights = [1.0/self.M]*self.M
# G(x)系数 alpha
self.alpha = []
def _G(self, features, labels, weights):
'''
input:features(ndarray):数据特征
labels(ndarray):数据标签
weights(ndarray):样本权重系数
'''
#********* Begin *********#
m = len(features)
error = 100000.0 # 无穷大
beat_v = 0.0
#单维features
features_min = min(features)
features_max = max(features)
n_step = (features_max-features_min+self.learning_rate) // self.learning_rate
direct,compare_array = None,None
for i in range(1,int(n_step)):
v = features_min + self.learning_rate * i
if v not in features:
# 误分类计算
compare_array_positive = np.array(
[1 if features[k] > v else -1 for k in range(m)])
weight_error_positive = sum([
weights[k] for k in range(m)
if compare_array_positive[k] != labels[k]
])
compare_array_nagetive = np.array(
[-1 if features[k] > v else 1 for k in range(m)])
weight_error_nagetive = sum([
weights[k] for k in range(m)
if compare_array_nagetive[k] != labels[k]
])
if weight_error_positive < weight_error_nagetive:
weight_error = weight_error_positive
_compare_array = compare_array_positive
direct = 'positive'
else:
weight_error = weight_error_nagetive
_compare_array = compare_array_nagetive
direct = 'nagetive'
# print('v:{} error:{}'.format(v, weight_error))
if weight_error < error:
error = weight_error
compare_array = _compare_array
best_v = v
return best_v, direct, error, compare_array
# 计算alpha
def _alpha(self, error):
return 0.5 * np.log((1 - error) / error)
# 规范化因子
def _Z(self, weights, a, clf):
return sum([
weights[i] * np.exp(-1 * a * self.Y[i] * clf[i])
for i in range(self.M)
])
# 权值更新
def _w(self, a, clf, Z):
for i in range(self.M):
self.weights[i] = self.weights[i] * np.exp(
-1 * a * self.Y[i] * clf[i]) / Z
# G(x)的线性组合
def _f(self, alpha, clf_sets):
pass
def G(self, x, v, direct):
if direct == 'positive':
return 1 if x > v else -1
else:
return -1 if x > v else 1
def fit(self, X, y):
self.init_args(X, y)
for epoch in range(self.clf_num):
axis = 0
final_direct = 'null'
best_clf_error, best_v, clf_result = 100000, None, None
# 根据特征维度, 选择误差最小的
for j in range(self.N):
features = self.X[:, j]
# 分类阈值,分类误差,分类结果
v, direct, error, compare_array = self._G(
features, self.Y, self.weights)
if error < best_clf_error:
best_clf_error = error
best_v = v
final_direct = direct
clf_result = compare_array
axis = j # axis数字代表第几个属性列
# print('epoch:{}/{} feature:{} error:{} v:{}'.format(epoch, self.clf_num, j, error, best_v))
if best_clf_error == 0:
break
# 计算G(x)系数a
a = self._alpha(best_clf_error)
self.alpha.append(a)
# 记录分类器
self.clf_sets.append((axis, best_v, final_direct))
# 规范化因子
Z = self._Z(self.weights, a, clf_result)
# 权值更新
self._w(a, clf_result, Z)
#********* End *********#
def predict(self, feature):
result = 0.0
for i in range(len(self.clf_sets)):
axis, clf_v, direct = self.clf_sets[i]
f_input = feature[axis]
result += self.alpha[i] * self.G(f_input, clf_v, direct)
# sign
return 1 if result > 0 else -1
def score(self, X_test, y_test):
right_count = 0
for i in range(len(X_test)):
feature = X_test[i]
if self.predict(feature) == y_test[i]:
right_count += 1
return right_count / len(X_test)
第4关:sklearn中的Adaboost
任务描述
本关任务:你需要调用sklearn中的Adaboost模型,并通过鸢尾花数据集中鸢尾花的2种属性与种类对Adaboost模型进行训练。我们会调用你训练好的Adaboost模型,来对未知的鸢尾花进行分类。
#encoding=utf8
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
def ada_classifier(train_data,train_label,test_data):
'''
input:train_data(ndarray):训练数据
train_label(ndarray):训练标签
test_data(ndarray):测试标签
output:predict(ndarray):预测结果
'''
#********* Begin *********#
ada=AdaBoostClassifier(base_estimator=DecisionTreeClassifier
(max_depth=2,min_samples_split=10,min_samples_leaf=5),
n_estimators=50,learning_rate=0.2)
ada.fit(train_data,train_label)
predict=ada.predict(test_data)
#********* End *********#
return predict