决策树构建

文章发布时间:

2023-05-12

最后更新时间:

2023-05-31

import numpy as np

import copy

# You can modify the code however you want

  

class decision_tree:
    """Decision tree over discrete features, split by mutual information.

    Training records are dicts that map feature names (and the output name)
    to discrete values. The tree is built lazily on the first ``predict``
    call and is stored on the instance, so independent instances never
    share state.

    A trained node is a dict with three keys:
        ``"feature"``  - feature the node splits on, or ``None`` for a leaf
        ``"label"``    - majority output value of the records at the node
        ``"children"`` - mapping from feature value to child node
    """

    def __init__(self, features, output, dataset):
        """Store the training configuration.

        Args:
            features: names of the candidate input features.
            output: name of the key that holds the value to predict.
            dataset: list of dict records used for training.
        """
        self.features = features
        self.output = output
        self.dataset = dataset
        self.depth = 0      # number of split levels; set by train()
        self._root = None   # root node; None until train() has run

    def log(self, x):
        """Return the base-2 logarithm of ``x``."""
        return np.log2(x)

    def get_prob(self, array):
        """Return the empirical probability of each distinct entry of ``array``.

        Entries are compared along axis 0, so ``array`` may also be a list
        of equally sized rows (a joint distribution). An empty input yields
        an empty array.
        """
        if len(array) == 0:
            return np.array([])
        _, counts = np.unique(array, return_counts=True, axis=0)
        return counts / len(array)

    def entropy(self, array):
        """Return the empirical entropy (in bits) of the values in ``array``."""
        p = self.get_prob(array)
        # Every probability is strictly positive because it comes from an
        # observed count, so log2 is always defined here.
        return -np.sum(p * self.log(p))

    def output_entropy(self):
        """Return the empirical entropy of the output over ``self.dataset``."""
        return self.entropy([row.get(self.output) for row in self.dataset])

    def conditional_entropy(self, feature):
        """Return the empirical entropy of the output conditioned on ``feature``.

        The records are grouped by their value of ``feature``; the result is
        the sum of each group's output entropy weighted by the group's share
        of the dataset.
        """
        outputs_by_value = {}
        for row in self.dataset:
            outputs_by_value.setdefault(row.get(feature), []).append(
                row.get(self.output))
        groups = list(outputs_by_value.values())
        weights = np.array([len(g) for g in groups], dtype=float)
        weights = weights / len(self.dataset)
        entropies = np.array([self.entropy(g) for g in groups], dtype=float)
        return np.sum(weights * entropies)

    def feature_selection(self):
        """Return the feature with the largest mutual information.

        Mutual information is ``output_entropy() - conditional_entropy(f)``.
        Ties go to the feature listed first. Returns ``None`` when
        ``self.features`` is empty.
        """
        base_entropy = self.output_entropy()  # identical for every feature
        best_feature = None
        best_gain = -np.inf
        for feature in self.features:
            gain = base_entropy - self.conditional_entropy(feature)
            if gain > best_gain:
                best_feature, best_gain = feature, gain
        return best_feature

    def tof(self, classify_dataset):
        """Return the most frequent output value among ``classify_dataset``.

        Ties go to the value that sorts last, which for a two-valued output
        means a tie resolves to the larger value. Returns ``None`` for an
        empty subset.
        """
        labels = [row.get(self.output) for row in classify_dataset]
        unique, counts = np.unique(labels, return_counts=True)
        if len(unique) == 0:
            return None
        # argmax reports the first maximum; scanning the reversed counts
        # turns that into the last maximum of the original order.
        last_max = len(counts) - 1 - int(np.argmax(counts[::-1]))
        return unique[last_max]

    def classify(self, selected_feature, u):
        """Return the records whose ``selected_feature`` equals ``u``."""
        return [row for row in self.dataset
                if row.get(selected_feature) == u]

    def _build(self, rows, features):
        """Recursively build the subtree for ``rows``; return ``(node, depth)``."""
        view = decision_tree(features, self.output, rows)
        node = {"feature": None, "label": view.tof(rows), "children": {}}
        labels = [row.get(self.output) for row in rows]
        # Stop when nothing is left to split on or the node is already pure.
        if not features or len(np.unique(labels)) <= 1:
            return node, 0

        feature = view.feature_selection()
        remaining = [f for f in features if f != feature]
        rows_by_value = {}
        for row in rows:
            rows_by_value.setdefault(row.get(feature), []).append(row)

        deepest_child = 0
        for value, subset in rows_by_value.items():
            child, child_depth = self._build(subset, remaining)
            node["children"][value] = child
            deepest_child = max(deepest_child, child_depth)
        node["feature"] = feature
        return node, deepest_child + 1

    def train(self, data=None):
        """Build the tree from ``self.dataset``.

        Args:
            data: iterable of feature names to split on (a dict contributes
                its keys). Defaults to ``self.features``. The output name is
                ignored if it is present.
        """
        candidates = self.features if data is None else data
        features = [f for f in candidates if f != self.output]
        self._root, self.depth = self._build(self.dataset, features)
        # Reports the maximum depth of the trained tree.
        print("最大深度为-------------", self.depth)

    def predict(self, data):
        """Return the predicted output value for one record.

        The tree is trained on first use. ``data`` is not modified, and an
        output key in it is ignored. If the record holds a value that was
        never seen at some node, the majority label of that node is returned.
        """
        if self._root is None:
            self.train(self.features)
        node = self._root
        while node["feature"] is not None:
            child = node["children"].get(data.get(node["feature"]))
            if child is None:
                break
            node = child
        return node["label"]

  
  

# Module-level training state.
decision_order, branch = [], []  # feature split order; branch values on the current path
depth, max_depth = 0, 100        # current depth; depth limit
tree = dict()                    # tuple of branch values -> predicted label

  

# Training records: one dict per person, built from compact
# (age, male, single, visit_library_in_Sunday) rows.
dataset = [
    dict(zip(("age", "male", "single", "visit_library_in_Sunday"), row))
    for row in (
        (19, False, False, False),
        (19, False, False, False),
        (19, True, False, True),
        (19, True, True, True),
        (19, False, False, False),
        (20, False, False, False),
        (20, False, False, False),
        (20, True, True, True),
        (20, False, True, True),
        (20, False, True, True),
        (21, False, True, True),
        (21, False, True, True),
        (21, True, False, True),
        (21, True, False, True),
        (21, False, False, False),
        (21, False, False, False),
        (21, False, False, True),
    )
]

# Tree that predicts the Sunday library visit from the three input features.
my_tree = decision_tree(
    features=["age", "male", "single"],
    output="visit_library_in_Sunday",
    dataset=dataset,
)

  

# Test 1: predict a single record.
first_prediction = my_tree.predict({'age': 19, 'male': False, 'single': False})
print(first_prediction, "should be 0 or False")

  

# Test 2

# use the feature "single" to classify as last time

def tree_example(data):
    """Baseline classifier: return True exactly when ``data["single"]`` is truthy."""
    return bool(data["single"])

# Count the records that the single-feature baseline classifies correctly.
true_classification = 0
for data in dataset:
    matches = tree_example(data) == data["visit_library_in_Sunday"]
    true_classification += 1 if matches else 0

baseline_accuracy = true_classification / len(dataset)
print("One feature classification accuracy:", baseline_accuracy)

  

# Classify every training record with the decision tree and report accuracy.
true_classification = 0
for data in dataset:
    predicted = my_tree.predict(data)
    if predicted == data["visit_library_in_Sunday"]:
        true_classification += 1

tree_accuracy = round(true_classification / len(dataset), 4)
print("Decision tree classification accuracy:", tree_accuracy, "should be around 0.9412")

最大深度为------------- 3
False should be 0 or False
One feature classification accuracy: 0.7647058823529411
Decision tree classification accuracy: 0.9412 should be around 0.9412

≡