机器学习期末复习决策树ID3的计算与构建

ID3构建的流程就是参考书上的那个伪代码。

1）开始：构建根节点，将所有训练数据都放在根节点，选择一个最优特征，按照这一特征将训练数据集分割成子集，使得各个子集有一个在当前条件下最好的分类。

2）如果这些子集已经能够被基本正确分类，那么构建叶节点，并将这些子集分到所对应的叶节点去。

3）如果还有子集不能够被正确地分类，那么就对这些子集选择新的最优特征，继续对其进行分割，构建相应的节点，如此递归进行，直至所有训练数据子集被基本正确地分类，或者没有合适的特征为止。

4）每个子集都被分到叶节点上，即都有了明确的类，这样就生成了一棵决策树。

过程就是建立一棵树（不是二叉树，因为一个属性可能有多个取值），建树的过程我选择用递归来实现。

首先创建一个空结点，然后通过计算信息增益，选取所有属性中信息增益最大的作为这个结点的值；

然后根据这个属性的不同取值来对这个结点进行“分杈”，如此循环往复；下一次递归在计算信息熵时，要从训练集中去掉这次选中的（信息增益最大的）属性，并且只保留该分支取值所对应的那部分样本；

结束的条件是：

（1）当剩下的所有训练样本都属于同一类，比如都是好瓜，那么就没必要再划分了，该结点直接作为叶节点，标记为好瓜。

（2）只剩下一个属性了，比如色泽，分出来青绿和浅白，青绿不能继续划分了，那么统计青绿这一部分的训练集的类别，取数量最多的。比如青绿中有6个样本，4个为好瓜，2个为坏瓜，那么最后是好瓜的概率大，选好瓜作为该叶节点的类别。

信息熵与信息增益的计算可以参考这个：

【机器学习实战】3、决策树

信息熵：$Ent(D) = -\sum_{i=1}^{n} p(x_i)\log_2 p(x_i)$

i的取值范围取决于训练集中给出的类别的数量，比如西瓜训练集，就只有好坏瓜之分，那么i的取值范围就是1-2。

p（x）就是取好瓜（坏瓜）的概率

代码：

import json
import numpy as np
import cv2
import math

# 首先读入一遍数据，找出y有几个，算出信息熵。(暂定使用列表保存，用in来判断是否出现过，从而计数)
# 由y有几个，创建map，来计算信息增益
# 有四个属性，从6-15行，再次遍历一遍，统计每个属性下的正反例的比例
"""
    利用打网球数据集PlayTennis构建决策树，该数据集的特性如下：
    属性包括天气（outlook）、温度（temperature）、湿度（humidity）、是否有风（windy），完整数据集样本个数为14（下面只列出前10条）。
    标签为今天是否去打网球（play）。
    具体数据如下：
    NO. Outlook temperature humidity   windy  play
    1   sunny      hot       high      FALSE  no
    2   sunny      hot       high      TRUE   no
    3   overcast   hot       high      FALSE  yes
    4   rainy      mild      high      FALSE  yes
    5   rainy      cool      normal    FALSE  yes
    6   rainy      cool      normal    TRUE   no
    7   overcast   cool      normal    TRUE   yes
    8   sunny      mild      high      FALSE  no
    9   sunny      cool      normal    FALSE  yes
    10  rainy      mild      normal    FALSE  yes
"""


# Data loader: returns the positive counts, negative counts, overall counts and the raw rows.
def readFile(path='D:\\PythonProject_Class\\test_Data\\PlayTennis.txt', lk=6):
    """Read the training table and tally class counts per attribute value.

    The file is expected to start with four descriptive lines, followed by a
    whitespace-separated header row (``NO. <attr> ... <label>``) and then one
    sample per line (``<no> <value> ... yes|no``).

    Args:
        path: Location of the data file. Defaults to the path the script
            originally hard-coded.
        lk: Maximum number of sample rows to read. Reading stops early at end
            of file; blank lines are skipped and do not count as samples.

    Returns:
        A 4-tuple ``(dict_PlusFeatures, dict_NegativeFeatures, sum_Features,
        preData)``. The three dicts map every attribute name to
        ``{value: count}`` for the 'yes' rows, the 'no' rows and all rows
        respectively, and map ``'play'`` to the matching number of rows. Every
        value seen for an attribute is present in all three inner dicts (with
        a 0 count when needed). ``preData`` holds the header row (attribute
        names followed by ``'play'``) and then one list per sample made of its
        attribute values followed by its label.

    Raises:
        ValueError: If the header row is missing, a sample's label is neither
            'yes' nor 'no', or a sample's number of attribute values does not
            match the header.
    """
    # The context manager guarantees the handle is closed even on a parse error.
    with open(path, 'r') as f:
        for _ in range(4):  # skip the descriptive lines above the header
            f.readline()

        # split() with no argument collapses any run of whitespace, so parsing
        # no longer depends on how many spaces separate the columns.
        header = f.readline().split()
        if len(header) < 2:
            raise ValueError('header row with an id column and a label column is required')
        ls = header[1:-1]  # attribute names: drop the id column and the label column

        dict_PlusFeatures = {name: {} for name in ls}
        dict_NegativeFeatures = {name: {} for name in ls}
        sum_Features = {name: {} for name in ls}
        # The rest of the script looks the class totals up under the key 'play'.
        for counters in (dict_PlusFeatures, dict_NegativeFeatures, sum_Features):
            counters['play'] = 0

        preData = []
        for line in f:
            if len(preData) >= lk:
                break
            tokens = line.split()
            if not tokens:
                continue
            label = tokens[-1]
            values = tokens[1:-1]
            # Pick the per-class counters from this row's own label, so a bad
            # label can never be silently booked under the previous row's class.
            if label == 'yes':
                class_counters = dict_PlusFeatures
            elif label == 'no':
                class_counters = dict_NegativeFeatures
            else:
                raise ValueError('unknown label {!r} in row: {!r}'.format(label, line))
            if len(values) != len(ls):
                raise ValueError('expected {} attribute values in row: {!r}'.format(len(ls), line))

            class_counters['play'] += 1
            sum_Features['play'] += 1
            for name, value in zip(ls, values):
                class_counters[name][value] = class_counters[name].get(value, 0) + 1
                sum_Features[name][value] = sum_Features[name].get(value, 0) + 1
            preData.append(values + [label])

    # Give both class dicts an entry for every value that was seen at all, so
    # later lookups never miss a key.
    for name in ls:
        for value in sum_Features[name]:
            dict_PlusFeatures[name].setdefault(value, 0)
            dict_NegativeFeatures[name].setdefault(value, 0)

    preData.insert(0, ls + ['play'])  # the split step needs the column names
    return dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData


# Vote counter intended for the case where only one attribute is left.
def Maxmajority(classList):
    """Return the size of the larger of the two label groups in *classList*.

    Labels equal to 'yes' form one group; every other label goes into the
    second group. The value returned is the larger group's count, not the
    label itself.
    """
    # NOTE(review): the original comment says this returns the most frequent
    # result, but the code returns its count — confirm which one is wanted.
    tallies = [0, 0]  # index 0: labels other than 'yes', index 1: 'yes'
    for label in classList:
        tallies[1 if label == 'yes' else 0] += 1
    return max(tallies)


def getmax(dict_PlusFeatures, dict_NegativeFeatures):
    """Return the majority class label, 'yes' or 'no'.

    Compares the ``'play'`` totals of the positive and negative counters;
    a tie is resolved in favour of 'yes'.
    """
    positives = dict_PlusFeatures['play']
    negatives = dict_NegativeFeatures['play']
    return 'yes' if positives >= negatives else 'no'


def getEnt(dict_PlusFeatures, dict_NegativeFeatures, sum_Features):
    """Return the base-2 information entropy of the yes/no class distribution.

    Args:
        dict_PlusFeatures: Counters whose ``'play'`` entry is the number of
            'yes' samples.
        dict_NegativeFeatures: Counters whose ``'play'`` entry is the number
            of 'no' samples.
        sum_Features: Counters whose ``'play'`` entry is the total number of
            samples.

    Returns:
        ``-sum(p * log2(p))`` over the two classes. A class with no samples
        contributes 0 (the usual ``0 * log 0 = 0`` convention), and an empty
        sample set has entropy 0.
    """
    cnt_samples = sum_Features['play']
    if cnt_samples == 0:
        # Nothing to measure; also avoids dividing by zero.
        return 0.0
    Ent = 0.0
    for class_count in (dict_PlusFeatures['play'], dict_NegativeFeatures['play']):
        if class_count:  # math.log(0) would raise ValueError
            probability = class_count / cnt_samples
            Ent -= probability * math.log(probability, 2)
    return Ent


def getGain(Ent, sum_Features, dict_PlusFeatures=None, dict_NegativeFeatures=None):
    """Find the attribute with the largest information gain.

    Args:
        Ent: Entropy of the current sample set.
        sum_Features: ``{attribute: {value: count}}`` for all samples, plus
            ``'play'`` mapped to the total number of samples.
        dict_PlusFeatures: Same layout, counting only the 'yes' samples.
        dict_NegativeFeatures: Same layout, counting only the 'no' samples.

    Returns:
        ``(gain, attribute_name)`` for the best attribute. When several
        attributes tie, the first one in ``sum_Features`` wins. An attribute is
        always returned when at least one exists, even if its gain is zero;
        ``(0, "")`` is returned only when ``sum_Features`` holds no attribute.
    """
    # Backward compatibility with the two-argument call: fall back to the
    # module-level counters of the same name, which is what this function
    # used to read implicitly.
    if dict_PlusFeatures is None:
        dict_PlusFeatures = globals()['dict_PlusFeatures']
    if dict_NegativeFeatures is None:
        dict_NegativeFeatures = globals()['dict_NegativeFeatures']

    total = sum_Features['play']
    best_gain = 0
    best_feature = None
    for feature, value_counts in sum_Features.items():
        if feature == 'play':
            continue
        conditional_entropy = 0
        for value, count in value_counts.items():
            if count == 0:
                # A value absent from this subset carries no weight.
                continue
            weighted_logs = 0
            class_counts = (dict_PlusFeatures[feature].get(value, 0),
                            dict_NegativeFeatures[feature].get(value, 0))
            for class_count in class_counts:
                if class_count:  # 0 * log(0) is taken as 0
                    ratio = class_count / count
                    weighted_logs += ratio * math.log(ratio, 2)
            conditional_entropy += -(count / total) * weighted_logs
        gain = Ent - conditional_entropy
        # Accept the first attribute unconditionally so that a set in which no
        # attribute has a strictly positive gain still yields a usable name.
        if best_feature is None or gain > best_gain:
            best_gain = gain
            best_feature = feature
    if best_feature is None:
        return 0, ""
    return best_gain, best_feature


def chooseBestFeature(dict_PlusFeatures, dict_NegativeFeatures, sum_Features):
    """Return the name of the attribute with the highest information gain.

    Prints the winning gain as a side effect.
    """
    Ent = getEnt(dict_PlusFeatures, dict_NegativeFeatures, sum_Features)
    # Pass the counters explicitly so the gain is computed from this subset's
    # own data rather than from whatever the module globals hold.
    Gain, Name = getGain(Ent, sum_Features, dict_PlusFeatures, dict_NegativeFeatures)
    print("Gain:{}".format(Gain))
    return Name


def _blankCounters(counters, dropped):
    """Return a zero-filled deep copy of *counters* without the *dropped* attribute."""
    blank = {
        name: 0 if name == 'play' else dict.fromkeys(values, 0)
        for name, values in counters.items()
    }
    blank.pop(dropped)  # KeyError for an unknown attribute
    blank['play'] = 0
    return blank


# Split step: recount the samples that take a given value of the chosen attribute.
def splitValues(dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData, BestFeatureName, Value):
    """Build the counters for the samples where ``BestFeatureName == Value``.

    The input dicts are not modified: the returned counters are independent
    copies. They keep every attribute value known to the inputs (with a 0
    count when the subset does not contain it) and no longer contain
    ``BestFeatureName``.

    Args:
        dict_PlusFeatures, dict_NegativeFeatures, sum_Features: Counters of
            the parent sample set ('yes' rows, 'no' rows, all rows).
        preData: Header row followed by the parent's sample rows; the last
            column of each row is the label.
        BestFeatureName: Attribute being split on.
        Value: Attribute value selecting the subset.

    Returns:
        ``(Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features)``
        for the subset.

    Raises:
        KeyError: If ``BestFeatureName`` is missing from a counter dict.
        ValueError: If ``BestFeatureName`` is missing from the header row.
    """
    Newdict_PlusFeatures = _blankCounters(dict_PlusFeatures, BestFeatureName)
    Newdict_NegativeFeatures = _blankCounters(dict_NegativeFeatures, BestFeatureName)
    Newsum_Features = _blankCounters(sum_Features, BestFeatureName)

    header = preData[0]
    BestIndex = header.index(BestFeatureName)
    counters_by_label = {'yes': Newdict_PlusFeatures, 'no': Newdict_NegativeFeatures}

    for row in preData[1:]:
        if row[BestIndex] != Value:
            continue
        class_counters = counters_by_label.get(row[-1])
        if class_counters is None:
            continue  # rows whose label is neither 'yes' nor 'no' are ignored
        class_counters['play'] += 1
        Newsum_Features['play'] += 1
        for column in range(len(row) - 1):  # the last column is the label
            if column == BestIndex:
                continue
            name, value = header[column], row[column]
            class_counters[name][value] = class_counters[name].get(value, 0) + 1
            Newsum_Features[name][value] = Newsum_Features[name].get(value, 0) + 1

    return Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features


# {'Outlook': {'rainy': {'windy': {'FALSE': {'humidity': {'high': 'no', 'normal': 'yes'}}, 'TRUE': {'temperature': {'cool': 'yes', 'hot': 'no', 'mild': 'yes'}}}},
#              'overcast': 'yes',
#              'sunny': {'temperature': {'cool': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}}, 'hot': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}}, 'mild': {'humidity': {'high': 'no', 'normal': 'yes'}}}}}}
def creatTree(dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dict_PlusFeatures, dict_NegativeFeatures, sum_Features: Counters for
            the current sample set ('yes' rows, 'no' rows, all rows); each
            maps attribute names to ``{value: count}`` and ``'play'`` to the
            number of rows.
        preData: Header row followed by the current sample rows; the last
            column of each row is the label.

    Returns:
        Either a leaf label (``'yes'`` or ``'no'``) or a dict of the form
        ``{attribute: {value: subtree_or_label}}``.

    Leaves are produced when the samples are all of one class, when no
    attribute is left to split on, or when no attribute can be chosen; in the
    last two cases the majority class is used. A branch whose value matches no
    sample in the current set is labelled with the current set's majority
    class.
    """
    if dict_PlusFeatures['play'] == 0:  # only negative samples: nothing to split
        return 'no'
    if dict_NegativeFeatures['play'] == 0:  # only positive samples: nothing to split
        return 'yes'

    # Computed before any split so it reflects this node's own samples.
    majority = getmax(dict_PlusFeatures, dict_NegativeFeatures)

    # sum_Features holds one entry per remaining attribute plus 'play'.
    # Stop only when no attribute remains, so the last one is still used.
    if len(sum_Features) - 1 == 0:
        return majority

    BestFeatureName = chooseBestFeature(dict_PlusFeatures, dict_NegativeFeatures, sum_Features)
    if not BestFeatureName:
        # No attribute could be selected; fall back to the majority class
        # instead of failing on a lookup of the empty name.
        return majority

    Tree = {BestFeatureName: {}}

    header = preData[0]
    index = header.index(BestFeatureName)
    reduced_header = header[:index] + header[index + 1:]

    # Iterate the values in dict order (not through a set) so the resulting
    # tree has a reproducible branch order.
    for value in list(sum_Features[BestFeatureName]):
        Newdict_PlusFeatures, Newdict_NegativeFeatures, Newsum_Features = splitValues(
            dict_PlusFeatures, dict_NegativeFeatures, sum_Features,
            preData, BestFeatureName, value)

        if Newsum_Features['play'] == 0:
            # No sample takes this value here: label the branch with the
            # parent's majority class rather than an arbitrary one.
            Tree[BestFeatureName][value] = majority
            continue

        # Keep only the matching rows and drop the column that was split on.
        NewpreData = [reduced_header]
        for row in preData[1:]:
            if row[index] == value:
                NewpreData.append(row[:index] + row[index + 1:])

        Tree[BestFeatureName][value] = creatTree(Newdict_PlusFeatures, Newdict_NegativeFeatures,
                                                 Newsum_Features, NewpreData)

    return Tree


"""
dict_PlusFeatures:{'Outlook': {'overcast': 2, 'rainy': 3, 'sunny': 1}, 'temperature': {'hot': 1, 'mild': 2, 'cool': 3}, 'humidity': {'high': 2, 'normal': 4}, 'windy': {'FALSE': 5, 'TRUE': 1}, 'play': 6}
dict_NegativeFeatures:{'Outlook': {'sunny': 3, 'rainy': 1, 'overcast': 0}, 'temperature': {'hot': 2, 'cool': 1, 'mild': 1}, 'humidity': {'high': 3, 'normal': 1}, 'windy': {'FALSE': 2, 'TRUE': 2}, 'play': 4}
sum_Features:{'Outlook': {'sunny': 4, 'overcast': 2, 'rainy': 4}, 'temperature': {'hot': 3, 'mild': 3, 'cool': 4}, 'humidity': {'high': 5, 'normal': 5}, 'windy': {'FALSE': 7, 'TRUE': 3}, 'play': 10}
{'Outlook': {'rainy': {'windy': {'TRUE': 'no', 'FALSE': 'yes'}}, 'sunny': {'temperature': {'cool': 'yes', 'hot': 'no', 'mild': 'no'}}, 'overcast': 'yes'}}
"""


def test(preData, Tree, nub, book):
    """Walk the decision tree for one sample and print the predicted label.

    Args:
        preData: Table of rows; ``preData[nub]`` is the sample to classify.
        Tree: Nested dict of the form ``{attribute: {value: subtree_or_label}}``.
        nub: Index of the sample row inside ``preData``.
        book: Maps each attribute name to its column index in a row.

    Prints ``predict is yes`` or ``predict is no`` and returns ``None``.
    """
    # A tree node has a single key: the attribute it splits on.
    for name in Tree.keys():
        feature = name
    sample = preData[nub]
    branch = Tree[feature][sample[book[feature]]]
    if branch == 'yes':
        print('predict is yes')
    elif branch == 'no':
        print('predict is no')
    else:
        # Not a leaf yet: descend into the subtree for this value.
        test(preData, branch, nub, book)


if __name__ == '__main__':
    # Load the counters and the raw table, then grow and display the tree.
    # The counter names are kept as-is at module level because getGain may
    # read dict_PlusFeatures / dict_NegativeFeatures from module scope.
    dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData = readFile()
    Tree = creatTree(dict_PlusFeatures, dict_NegativeFeatures, sum_Features, preData)
    print(json.dumps(Tree, indent=5))
    # Column lookup for the attributes (the label column is excluded).
    book = {name: column for column, name in enumerate(preData[0]) if name != 'play'}
    # Classify row 5 of the loaded table as a smoke test.
    test(preData, Tree, 5, book)