### 10.2.1 ID3算法基本原理 ###
mtcars2 <- within(mtcars[,c('cyl','vs','am','gear')], {
am <- factor(am, labels = c("automatic", "manual"))
vs <- factor(vs, labels = c("V", "S"))
cyl <- ordered(cyl)
gear <- ordered(gear)
})
table(mtcars2$am) # 查看因变量的类别数量
I_am <- -19/32*log2(19/32)-13/32*log2(13/32) # 计算因变量的信息熵
I_am
# 自定义函数计算信息熵、信息增益
information_gain <- function(x,y){
m1 <- matrix(table(y))
entropy_y <- sum(-(m1/sum(m1))*log2(m1/sum(m1)))
t <- table(x,y)
m <- matrix(t,length(unique(x)),length(unique(y)),
dimnames = list(levels(x),levels(y)))
freq <- -rowSums((m/rowSums(m))*log2(m/rowSums(m)))
entropy <- sum(rowSums(m)*freq/dim(mtcars2)[1],na.rm = T)
gain <- entropy_y - entropy
return(c('因变量熵'=entropy_y ,
'条件熵'=entropy,
'信息增益' = gain))
}
cat('计算条件变量为cyl的熵及信息增益为:\n')
information_gain(mtcars2$cyl,mtcars2$am)
cat('计算条件变量为vs的熵及信息增益为: \n')
information_gain(mtcars2$vs,mtcars2$am)
cat('计算条件变量为gear的熵及信息增益为: \n')
information_gain(mtcars2$gear,mtcars2$am)
### 10.2.2 C4.5算法 ###
# 自定义函数计算信息熵、信息增益、信息增益率
gain_rate <- function(x,y){
m0 <- matrix(table(x))
entropy_x <- sum(-(m0/sum(m0))*log2(m0/sum(m0)))
m1 <- matrix(table(y))
entropy_y <- sum(-(m1/sum(m1))*log2(m1/sum(m1)))
t <- table(x,y)
m <- matrix(t,length(unique(x)),length(unique(y)),
dimnames = list(levels(x),levels(y)))
freq <- -rowSums((m/rowSums(m))*log2(m/rowSums(m)))
entropy <- sum(rowSums(m)*freq/dim(mtcars2)[1],na.rm = T)
gain <- entropy_y - entropy
return(c('自变量熵'=entropy_x ,
'因变量熵'=entropy_y ,
'条件熵'=entropy,
'信息增益' = gain,
'信息增益率' = gain/entropy_x))
}
cat('计算条件变量为cyl的信息熵及信息增益率为:\n')
round(gain_rate(mtcars2$cyl,mtcars2$am),3)
cat('计算条件变量为vs的信息熵及信息增益率为:\n')
round(gain_rate(mtcars2$vs,mtcars2$am),3)
cat('计算条件变量为gear的信息熵及信息增益率为:\n')
round(gain_rate(mtcars2$gear,mtcars2$am),3)
#### 10.3 R语言实现及案例 ####
#10.3.2 C5.0案例
# 利用iris数据集
# install.packages("C50")
install.packages("C50")
library(C50)
tree_mod <- C5.0(x = iris[,c('Petal.Length','Petal.Width')],
y = iris$Species)
tree_mod
summary(tree_mod) # 查看详细信息
plot(tree_mod) # 树模型可视化
# 对新样本进行预测
pred_class <- predict(tree_mod,newdata = data.frame('Petal.Length' = 2,
'Petal.Width' = 1))
pred_class
pred_prob <- predict(tree_mod,type = 'prob',
newdata = data.frame('Petal.Length' = 2,
'Petal.Width' = 1))
round(pred_prob,3)
# 10.3.2.4 提高模型的性能
# library(modeldata)
# data(mlc_churn)
# data(churn)
# 导入数据集
library(modeldata)
library(C50)
#install.packages("modeldata")
data(mlc_churn)
data(churn)
churnTrain <- read.csv('data/churnTrain.csv')
churnTest <- read.csv('data/churnTest.csv')
churnTrain$churn <- as.factor(churnTrain$churn)
churnTest$churn <- as.factor(churnTest$churn)
# 构建模型
treeModel <- C5.0(x = churnTrain[, -20],
y = churnTrain$churn)
treeModel1 <- C5.0(x = churnTrain[, -20],
y = churnTrain$churn,trials = 10) # 使用10次boosting迭代
# 查看模型对训练数据集的混淆矩阵
(t0 <- table(churnTrain$churn,predict(treeModel,newdata = churnTrain)))
(t1 <- table(churnTrain$churn,predict(treeModel1,newdata = churnTrain)))
cat('普通模型对训练集的预测准确率:',
paste0(round(sum(diag(t0))*100/sum(t0),2),"%"))
cat('增加boosting的模模型对训练集的预测准确率:',
paste0(round(sum(diag(t1))*100/sum(t1),2),"%"))
# 查看模型对测试数据集的混淆矩阵
(c0 <- table(churnTest$churn,predict(treeModel,newdata = churnTest)))
(c1 <- table(churnTest$churn,predict(treeModel1,newdata = churnTest)))
cat('普通模型对测试集的预测准确率:',
paste0(round(sum(diag(c0))*100/sum(c0),2),"%"))
cat('增加boosting的模模型对测试集的预测准确率:',
paste0(round(sum(diag(c1))*100/sum(c1),2),"%"))
# 定义代价矩阵
cost_mat <- matrix(c(0,1,2,0),nrow = 2)
rownames(cost_mat) <- colnames(cost_mat) <- c("no", "yes")
cost_mat
# 增加代价矩阵的决策树模型
treeModel2 <- C5.0(x = churnTrain[, -20],
y = churnTrain$churn,costs = cost_mat)
# 普通模型的预测结果
pred <- predict(treeModel,newdata = churnTrain)
# 增加代价矩阵模型的预测结果
pred2 <- predict(treeModel2,newdata = churnTrain)
# 普通模型预测结果的混淆矩阵
table('Actual' = churnTrain$churn,
'Prediction' = pred)
# 普通模型的查全率
paste0(round(sum(pred=='yes')*100/sum(churnTrain$churn=='yes'),2),'%')
# 增加代价矩阵模型预测结果的混淆矩阵
table('Actual' = churnTrain$churn,
'Prediction' = pred2)
# 增加代价矩阵模型的查全率
paste0(round(sum(pred2=='yes')*100/sum(churnTrain$churn=='yes'),2),'%')
### 10.3.3 CART案例 ###
# 10.3.3.1 分类树案例
# 分类树构建与预测
library(rpart)
library(rpart.plot)
tree_clf <- rpart(Species ~ Petal.Length + Petal.Width,data = iris)
tree_clf
rpart.plot(tree_clf,extra = 3,digits = 4)
# 对新数据进行预测
predict(tree_clf,newdata = data.frame("Petal.Length" = 5,
"Petal.Width" = 1.5),
type = 'class')
predict(tree_clf,newdata = data.frame("Petal.Length" = 5,
"Petal.Width" = 1.5))
# 回归树构建与预测
# 构建决策树
insurance <- read.csv('data/insurance.csv')
insurance$children <- insurance$children
train <- insurance[1:1000,]
test <- insurance[1001:1338,]
tree_reg <- rpart(charges ~ .,data = train)
tree_reg
rpart.plot(tree_reg,type = 4,extra = 1,digits = 4)
# 查看变量重量性,并进行可视化
tree_reg$variable.importance
barplot(tree_reg$variable.importance,
col='violetred',border = NA,yaxt='n',
main = '回归树的变量重要性')
# 对测试集进行预测
pred <- predict(tree_reg,newdata = test)
# 查看前六行结果
data.frame(head(test),
prediction = head(pred))
# 计算R方
tree_r2 <- cor(test$charges,pred)^2 # 回归树的R2
fit <- lm(charges ~ .,data = train)
pred1 <- predict(fit,newdata = test)
lm_r2 <- cor(test$charges,pred1)^2 # 线性回归的R2
data.frame('模型' = c('回归树','线性回归'),
'判定系数' = round(c(tree_r2,lm_r2),3)) # 查看结果
# 10.3.3.3 决策树的剪枝
library(rpart)
library(rpart.plot)
weather <- read.csv('data/weather.csv') # 导入weather数据集
input <- c("MinTemp", "MaxTemp", "Rainfall",
"Evaporation", "Sunshine", "WindGustDir",
"WindGustSpeed", "WindDir9am", "WindDir3pm",
"WindSpeed9am", "WindSpeed3pm", "Humidity9am",
"Humidity3pm", "Pressure9am", "Pressure3pm",
"Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
"RainToday") # 自变量
output <- 'RainTomorrow' # 因变量
# 预剪枝
tree_pre <- rpart(RainTomorrow ~ ., data = weather[,c(input,output)],
control = rpart.control(maxdepth = 3)) # 构建决策树
tree_pre # 查看结果
# 后剪枝
tree_clf1 <- rpart(RainTomorrow ~ ., data = weather[,c(input,output)]) # 构建决策树
printcp(tree_clf1) # 查看复杂性信息
plotcp(tree_clf1) # 绘制CP表的信息图
# 对决策树进行剪枝
tree_clf1_pru <- prune(tree_clf1,cp = 0.059)
tree_clf1_pru
# 10.3.4 条件推理决策树案例
if(!require(party)) install.packages("party") # 加载party包
library(party)
weather_sub <- weather[,c(input,output)]
weather_sub$WindGustDir <- as.factor(weather_sub$WindGustDir)
weather_sub$WindDir9am <- as.factor(weather_sub$WindDir9am)
weather_sub$WindDir3pm <- as.factor(weather_sub$WindDir3pm)
weather_sub$RainToday<- as.factor(weather_sub$RainToday)
weather_sub$RainTomorrow <- as.factor(weather_sub$RainTomorrow)
tree_ctree <- ctree(RainTomorrow ~ ., data = weather_sub,
controls = ctree_control(mincriterion = 0.99))
tree_ctree # 查看模型树
plot(tree_ctree) # 绘制决策树
# 提取数据子集,请查看样本个数及因变量类别占比
weather_sub1 <- weather_sub[weather_sub$Cloud3pm<=6 & weather_sub$Pressure3pm<=1011.8,]
nrow(weather_sub1)
round(prop.table(table(weather_sub1$RainTomorrow)),2)
# 对数据进行预测
pred <- predict(tree_ctree,newdata = weather_sub)
head(pred)
pred_prob <- predict(tree_ctree,type = 'prob',
newdata = weather_sub)
head(pred_prob,3)
# 10.3.5 绘制决策边界
library(rpart)
library(rpart.plot)
# 数据处理
iris1 <- iris[,c('Petal.Length','Petal.Width','Species')]
iris1$Species <- as.factor(as.numeric(iris1$Species)) # 将类别变成1、2、3
# 生成深度为1的决策树
tree_clf <- rpart(Species ~ Petal.Length + Petal.Width,data = iris1,
control = rpart.control(maxdepth = 1))
tree_clf
# 编写绘制决策边界函数
visualize_classifier <- function(model,X,y,xlim,ylim,type = c('n','n')){
x1s <- seq(xlim[1],xlim[2],length.out=200)
x2s <- seq(ylim[1],ylim[2],length.out=200)
Z <- expand.grid(x1s,x2s)
colnames(Z) <- colnames(X)
y_pred <- predict(model,Z,type = 'class')
y_pred <- matrix(y_pred,length(x1s))
filled.contour(x1s,x2s,y_pred,
levels = 1:(length(unique(y))+1),
col = RColorBrewer::brewer.pal(length(unique(y)),'Pastel1'),
key.axes = FALSE,
plot.axes = {axis(1);axis(2);
points(X[,1],X[,2],pch=as.numeric(y)+15,col=as.numeric(y)+1,cex=1.5);
points(c(2.45,2.45),c(0,3),type = type[1],lwd=2)
points(c(2.45,7.5),c(1.75,1.75),type = type[2],lwd=2,lty=2)
},
xlab = colnames(X)[1],ylab = colnames(X)[2]
)
}
# 绘制决策边界
visualize_classifier(tree_clf,xlim = c(0,7.5),ylim = c(0,3),
X = iris1[,1:2],
iris1$Species,
type=c('l','n'))
# 生成深度为2的决策树
tree_clf1 <- rpart(Species ~ Petal.Length + Petal.Width,data = iris1,
control = rpart.control(maxdepth = 2))
tree_clf1
# 绘制决策边界
visualize_classifier(tree_clf1,xlim = c(0,7.5),ylim = c(0,3),
X = iris1[,1:2],
iris1$Species,type=c('l','l'))
# 10.4 集成学习及随机森林
# 导入car数据集
car <- read.table("data/car.data",sep = ",")
# 对变量重命名
colnames(car) <- c("buy","main","doors","capacity",
"lug_boot","safety","accept")
# 随机选取75%的数据作为训练集建立模型,25%的数据作为测试集用来验证模型
library(caret)
library(ggplot2)
library(lattice)
# 构建训练集的下标集
ind <- createDataPartition(car$accept,times=1,p=0.75,list=FALSE)
# 构建测试集数据好训练集数据
carTR <- car[ind,]
carTE <- car[-ind,]
carTR<- within(carTR,accept <- factor(accept,levels=c("unacc","acc","good","vgood")))
carTE<- within(carTE,accept <- factor(accept,levels=c("unacc","acc","good","vgood")))
# 使用adabag包中的bagging函数实现bagging算法
#install.packages("adabag")
library(adabag)
bagging.model <- bagging(accept~.,data=carTR)
# 使用adabag包中的boosting函数实现boosting算法
boosting.model <- boosting(accept~.,data=carTR)
# 使用randomForest包中的randomForest函数实现随机森林算法
#install.packages("randomForest")
library(randomForest)
randomForest.model <- randomForest(accept~.,data=carTR,ntree=500,mtry=3)
# 预测结果,并构建混淆矩阵,查看准确率
# 构建result,存放预测结果
result <- data.frame(arithmetic=c("bagging","boosting","随机森林"),
errTR=rep(0,3),errTE=rep(0,3))
for(i in 1:3){
# 预测结果
carTR_predict <- predict(switch(i,bagging.model,boosting.model,randomForest.model),
newdata=carTR) # 训练集数据
carTE_predict <- predict(switch(i,bagging.model,boosting.model,randomForest.model),
newdata=carTE) # 测试集数据
# 构建混淆矩阵
tableTR <- table(actual=carTR$accept,
predict=switch(i,carTR_predict$class,carTR_predict$class,carTR_predict))
tableTE <- table(actual=carTE$accept,
predict=switch(i,carTE_predict$class,carTE_predict$class,carTE_predict))
# 计算误差率
result[i,2] <- paste0(round((sum(tableTR)-sum(diag(tableTR)))*100/sum(tableTR),
2),"%")
result[i,3] <- paste0(round((sum(tableTE)-sum(diag(tableTE)))*100/sum(tableTE),
2),"%")
}
# 查看结果
result