"""
1) 开始:构建根节点,将所有训练数据都放在根节点,选择一个最优特征,按照这一特征将训练数据集分割成子集,使得各个子集有一个在当前条件下最好的分类。
2) 如果这些子集已经能够被基本正确分类,那么构建叶节点,并将这些子集分到所对应的叶节点去。
"""
import json
import numpy as np
import cv2
import math
# 首先读入一遍数据,找出y有几个,算出信息熵。(暂定使用列表保存,用in来判断是否出现过,从而计数)
# 由y有几个,创建map,来计算信息增益
# 有四个属性,从6-15行,再次遍历一遍,统计每个属性下的正反例的比例
"""
NO. Outlook temperature humidity windy play
1 sunny hot high FALSE no
2 sunny hot high TRUE no
3 overcast hot high FALSE yes
4 rainy mild high FALSE yes
5 rainy cool normal FALSE yes
6 rainy cool normal TRUE no
7 overcast cool normal TRUE yes
8 sunny mild high FALSE no
9 sunny cool normal FALSE yes
10 rainy mild normal FALSE yes
"""
# Data-loading function: returns the positive tallies, the negative tallies,
# the overall tallies and the raw rows.
def readFile(path='D:\\PythonProject_Class\\test_Data\\PlayTennis.txt', skip_lines=4, max_rows=6):
    """Load the PlayTennis table and tally how often each feature value occurs.

    The file is expected to hold ``skip_lines`` lines of free text, then a
    whitespace-separated header (``NO. <feature>... play``), then one sample
    per line (``<number> <value>... <yes|no>``).

    Args:
        path: Location of the data file. Defaults to the original hard-coded
            path.
        skip_lines: Number of leading description lines to discard before the
            header.
        max_rows: Maximum number of samples to read (the original read exactly
            6). Pass ``None`` to read every sample up to the end of the file.

    Returns:
        A 4-tuple ``(dict_PlusFeatures, dict_NegativeFeatures, sum_Features,
        preData)``. The three dicts map each feature name to a
        ``{value: count}`` dict for the 'yes' samples, the 'no' samples and
        all samples respectively, plus a ``'play'`` key holding the matching
        sample count. Every value seen in the data appears in both the 'yes'
        and the 'no' dict (with 0 if it never occurred for that class).
        ``preData`` is ``[feature_names, row, row, ...]`` where each row is
        the feature values followed by the label.

    Raises:
        ValueError: If a non-blank data line has a label other than
            'yes'/'no' or the wrong number of columns.
    """
    # The context manager closes the file even when parsing fails.
    with open(path, 'r') as f:
        for _ in range(skip_lines):
            f.readline()

        # split() without arguments collapses runs of blanks, so both
        # single-space and column-aligned files parse the same way.
        header = f.readline().split()
        feature_names = [name for name in header[1:] if name != 'play']  # header[0] is "NO."

        dict_PlusFeatures = {name: {} for name in feature_names}
        dict_NegativeFeatures = {name: {} for name in feature_names}
        sum_Features = {name: {} for name in feature_names}
        dict_PlusFeatures['play'] = 0
        dict_NegativeFeatures['play'] = 0
        sum_Features['play'] = 0

        rows = []
        for line in f:
            if max_rows is not None and len(rows) >= max_rows:
                break
            tokens = line.split()
            if not tokens:
                continue  # blank line: nothing to count

            label = tokens[-1]
            values = tokens[1:-1]  # drop the sample number and the label
            if label not in ('yes', 'no') or len(values) != len(feature_names):
                raise ValueError('malformed data line: {!r}'.format(line))

            class_counts = dict_PlusFeatures if label == 'yes' else dict_NegativeFeatures
            class_counts['play'] += 1
            sum_Features['play'] += 1
            for name, value in zip(feature_names, values):
                # Register the value for both classes so later lookups never
                # miss, then count it exactly once.
                dict_PlusFeatures[name].setdefault(value, 0)
                dict_NegativeFeatures[name].setdefault(value, 0)
                class_counts[name][value] += 1
                sum_Features[name][value] = sum_Features[name].get(value, 0) + 1
            rows.append(values + [label])

    # The splitting code needs the header as the first row.
    preData = [feature_names] + rows
    return dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData
# When only one attribute is left in the samples, report the most frequent outcome.
def Maxmajority(classList):
    """Return how many times the majority class occurs in ``classList``.

    Every element equal to 'yes' counts as a positive sample; every other
    element counts as a negative one. Note that the result is the *count* of
    the larger class, not its label (``getmax`` in this file returns the
    label).

    Args:
        classList: Iterable of class labels.

    Returns:
        The size of the larger class, or 0 for an empty input.
    """
    Cntyes = 0
    Cntno = 0
    for label in classList:
        if label == 'yes':
            Cntyes += 1
        else:
            # Only non-'yes' samples belong to the negative tally.
            Cntno += 1
    return max(Cntyes, Cntno)
def getmax(dict_PlusFeatures, dict_NegativeFeatures):
    """Return the majority class label for a set of samples.

    Args:
        dict_PlusFeatures: Tally dict whose ``'play'`` entry is the number of
            'yes' samples.
        dict_NegativeFeatures: Tally dict whose ``'play'`` entry is the number
            of 'no' samples.

    Returns:
        'yes' when the positive samples are at least as numerous as the
        negative ones (ties go to 'yes'), otherwise 'no'.
    """
    if dict_PlusFeatures['play'] >= dict_NegativeFeatures['play']:
        return 'yes'
    return 'no'
def getEnt(dict_PlusFeatures, dict_NegativeFeatures, sum_Features):
    """Return the base-2 entropy of the yes/no label distribution.

    Args:
        dict_PlusFeatures: Tally dict; ``['play']`` is the number of 'yes' samples.
        dict_NegativeFeatures: Tally dict; ``['play']`` is the number of 'no' samples.
        sum_Features: Tally dict; ``['play']`` is the total number of samples.

    Returns:
        The entropy in bits. A class with no samples contributes 0 (instead of
        triggering ``math.log(0)``), and an empty sample set yields 0.0.
    """
    return _label_entropy(dict_PlusFeatures['play'],
                          dict_NegativeFeatures['play'],
                          sum_Features['play'])


def _label_entropy(plus_count, negative_count, total):
    """Entropy in bits of a two-class tally, using the convention 0*log(0) = 0."""
    if total == 0:
        return 0.0
    entropy = 0.0
    for count in (plus_count, negative_count):
        if count:
            p = count / total
            entropy -= p * math.log(p, 2)
    return entropy


def getGain(Ent, sum_Features, dict_PlusFeatures=None, dict_NegativeFeatures=None):
    """Find the feature with the largest information gain.

    Args:
        Ent: Entropy of the current sample set (see ``getEnt``).
        sum_Features: ``{feature: {value: count}}`` for all current samples,
            plus ``'play'`` holding the sample count.
        dict_PlusFeatures: Same layout, counting only 'yes' samples.
        dict_NegativeFeatures: Same layout, counting only 'no' samples.

    Returns:
        ``(gain, feature_name)`` for the best feature. Ties keep the feature
        that comes first in ``sum_Features``. Features whose counts are all
        zero (no samples left for them) are not candidates. When there is no
        candidate at all the result is ``(0, "")``.
    """
    # Legacy two-argument calls relied on module-level tallies; keep that
    # working, but prefer the explicitly passed dicts.
    if dict_PlusFeatures is None:
        dict_PlusFeatures = globals()['dict_PlusFeatures']
    if dict_NegativeFeatures is None:
        dict_NegativeFeatures = globals()['dict_NegativeFeatures']

    best_gain = 0
    best_feature = ""
    total = sum_Features['play']
    if total == 0:
        return best_gain, best_feature

    for feature, value_counts in sum_Features.items():
        if feature == 'play':
            continue
        if not any(value_counts.values()):
            # Nothing to partition on; otherwise such a feature would score a
            # conditional entropy of 0 and win with a bogus gain equal to Ent.
            continue

        conditional_entropy = 0.0
        for value, count in value_counts.items():
            if count == 0:
                continue
            conditional_entropy += (count / total) * _label_entropy(
                dict_PlusFeatures[feature].get(value, 0),
                dict_NegativeFeatures[feature].get(value, 0),
                count,
            )

        gain = Ent - conditional_entropy
        # Accept the first candidate even when its gain is 0 so the caller
        # always receives a usable feature name.
        if best_feature == "" or gain > best_gain:
            best_gain = gain
            best_feature = feature
    return best_gain, best_feature


def chooseBestFeature(dict_PlusFeatures, dict_NegativeFeatures, sum_Features):
    """Return the name of the feature with the highest information gain.

    Returns "" when no feature has any samples to split on.
    """
    Ent = getEnt(dict_PlusFeatures, dict_NegativeFeatures, sum_Features)
    _, Name = getGain(Ent, sum_Features, dict_PlusFeatures, dict_NegativeFeatures)
    return Name


def _zeroed_counts(counts):
    """Return a fresh tally with the same keys as ``counts`` and every count reset to 0."""
    zeroed = {
        feature: 0 if feature == 'play' else dict.fromkeys(values, 0)
        for feature, values in counts.items()
    }
    zeroed['play'] = 0
    return zeroed


# Split function
def splitValues(dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData, BestFeatureName, Value):
    """Re-tally the samples whose ``BestFeatureName`` column equals ``Value``.

    Args:
        dict_PlusFeatures, dict_NegativeFeatures, sum_Features: Current
            tallies; only their keys are used and they are left unmodified.
        preData: ``[header, row, ...]``; each row is the feature values
            followed by the 'yes'/'no' label.
        BestFeatureName: Feature to filter on; must be present in the header.
        Value: Value of that feature selecting the subset.

    Returns:
        ``(Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features)``
        with the same keys as the inputs. The counts of ``BestFeatureName``
        itself stay at 0.

    Raises:
        ValueError: If ``BestFeatureName`` is not in ``preData[0]``.
    """
    # Fresh nested dicts: a shallow dict(...) copy would share the inner
    # tallies with the caller and wipe them.
    Newdict_PlusFeatures = _zeroed_counts(dict_PlusFeatures)
    Newdict_NegativeFeatures = _zeroed_counts(dict_NegativeFeatures)
    Newsum_Features = _zeroed_counts(sum_Features)

    header = preData[0]
    BestIndex = header.index(BestFeatureName)

    for row in preData[1:]:
        if row[BestIndex] != Value:
            continue
        label = row[-1]
        if label == 'yes':
            class_counts = Newdict_PlusFeatures
        elif label == 'no':
            class_counts = Newdict_NegativeFeatures
        else:
            continue  # rows with any other label are not counted
        class_counts['play'] += 1
        Newsum_Features['play'] += 1
        for column, feature in enumerate(header):
            if column == BestIndex:
                continue
            class_counts[feature][row[column]] += 1
            Newsum_Features[feature][row[column]] += 1
    return Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features


# Example of a tree produced by an earlier run:
# {'Outlook': {'rainy': {'windy': {'FALSE': {'humidity': {'high': 'no', 'normal': 'yes'}}, 'TRUE': {'temperature': {'cool': 'yes', 'hot': 'no', 'mild': 'yes'}}}},
# 'overcast': 'yes',
# 'sunny': {'temperature': {'cool': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}}, 'hot': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}}, 'mild': {'humidity': {'high': 'no', 'normal': 'yes'}}}}}}
def creatTree(dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData):
    """Recursively build an ID3 decision tree.

    Args:
        dict_PlusFeatures, dict_NegativeFeatures, sum_Features: Tallies of the
            current sample set (see ``splitValues`` for the layout). They are
            not modified.
        preData: ``[header, row, ...]`` for the current sample set.

    Returns:
        Either a leaf label ('yes' / 'no') or a nested dict
        ``{feature: {value: subtree_or_label}}``.
    """
    if dict_PlusFeatures['play'] == 0:  # only negative samples: nothing left to split
        return 'no'
    if dict_NegativeFeatures['play'] == 0:  # only positive samples: nothing left to split
        return 'yes'
    if len(sum_Features) - 1 == 1:  # a single attribute is left: answer with the majority
        return getmax(dict_PlusFeatures, dict_NegativeFeatures)

    # Pick the attribute with the highest information gain.
    BestFeatureName = chooseBestFeature(dict_PlusFeatures, dict_NegativeFeatures, sum_Features)
    if BestFeatureName == "":
        return getmax(dict_PlusFeatures, dict_NegativeFeatures)

    Tree = {BestFeatureName: {}}
    index = preData[0].index(BestFeatureName)
    reduced_header = preData[0][:index] + preData[0][index + 1:]

    # One branch per known value of the chosen attribute. Iterating the dict
    # (rather than a set) keeps the branch order deterministic.
    for value in sum_Features[BestFeatureName]:
        Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features = splitValues(
            dict_PlusFeatures, dict_NegativeFeatures, sum_Features,
            preData, BestFeatureName, value)

        if Newsum_Features['play'] == 0:
            # No sample reaches this branch: fall back to the parent's majority.
            Tree[BestFeatureName][value] = getmax(dict_PlusFeatures, dict_NegativeFeatures)
            continue

        # The consumed attribute disappears from both the tallies and the rows,
        # which is what makes the "single attribute left" stop reachable.
        for counts in (Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features):
            counts.pop(BestFeatureName, None)
        NewpreData = [reduced_header]
        NewpreData.extend(
            row[:index] + row[index + 1:]
            for row in preData[1:]
            if row[index] == value
        )

        # Recurse into the subset.
        Tree[BestFeatureName][value] = creatTree(
            Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features, NewpreData)
    return Tree
"""
dict_PlusFeatures:{'Outlook': {'overcast': 2, 'rainy': 3, 'sunny': 1}, 'temperature': {'hot': 1, 'mild': 2, 'cool': 3}, 'humidity': {'high': 2, 'normal': 4}, 'windy': {'FALSE': 5, 'TRUE': 1}, 'play': 6}
dict_NegativeFeatures:{'Outlook': {'sunny': 3, 'rainy': 1, 'overcast': 0}, 'temperature': {'hot': 2, 'cool': 1, 'mild': 1}, 'humidity': {'high': 3, 'normal': 1}, 'windy': {'FALSE': 2, 'TRUE': 2}, 'play': 4}
sum_Features:{'Outlook': {'sunny': 4, 'overcast': 2, 'rainy': 4}, 'temperature': {'hot': 3, 'mild': 3, 'cool': 4}, 'humidity': {'high': 5, 'normal': 5}, 'windy': {'FALSE': 7, 'TRUE': 3}, 'play': 10}
{'Outlook': {'rainy': {'windy': {'TRUE': 'no', 'FALSE': 'yes'}}, 'sunny': {'temperature': {'cool': 'yes', 'hot': 'no', 'mild': 'no'}}, 'overcast': 'yes'}}
"""
def test(preData, Tree, nub, book):
    """Print the tree's prediction for one sample.

    Args:
        preData: ``[header, row, ...]`` table; the sample is ``preData[nub]``.
        Tree: Either a leaf label ('yes' / 'no') or a nested dict
            ``{feature: {value: subtree_or_label}}``.
        nub: Index into ``preData`` of the sample to classify.
        book: Maps each feature name to its column index within a row.

    Raises:
        KeyError: If the sample holds a feature value that has no branch in
            the tree, or ``book`` lacks a feature used by the tree.
    """
    if not isinstance(Tree, dict):
        # The whole tree is a single leaf (e.g. every training sample shares one label).
        if Tree == 'yes':
            print('predict is yes')
        elif Tree == 'no':
            print('predict is no')
        return

    for feature in Tree.keys():
        sample_value = preData[nub][book[feature]]
        branch = Tree[feature][sample_value]
        if branch == 'yes':
            print('predict is yes')
        elif branch == 'no':
            print('predict is no')
        else:
            # Only descend when the branch is a subtree; a leaf has no keys to walk.
            test(preData, branch, nub, book)
if __name__ == '__main__':
    # Script entry point: load the data, build the tree, print it as JSON and
    # classify one stored sample.
    # These tallies are bound at module level on purpose: getGain may look up
    # dict_PlusFeatures / dict_NegativeFeatures as module globals.
    dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData = readFile()
    Tree = creatTree(dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData)
    print(json.dumps(Tree, indent=5))

    # Map every feature name in the header row to its column index.
    book = {
        name: column
        for column, name in enumerate(preData[0])
        if name != 'play'
    }
    test(preData, Tree, 5, book)