1 Project Overview
GBDT stands for Gradient Boosting Decision Tree. Among traditional machine-learning algorithms, it comfortably ranks in the top three.
This open-source project gives a complete walkthrough of the gradient boosting tree algorithm, dissecting GBDT's key components, decision trees and gradient boosting, including the derivation of the GBDT formulas. The author originally tried to publish the full write-up on a WeChat public account, but the platform's poor support for formulas left the equations garbled despite more than an hour of editing, so Freemanzxp posted it on his blog instead:
https://blog.csdn.net/zpalyq110/article/details/79527653
This article therefore focuses on the author's complete implementation and worked examples. The project implements GBDT regression, binary classification, and multi-class classification in Python, with complete code, detailed comments, examples, and visualizations that help readers take GBDT apart piece by piece. It is open-sourced on GitHub; stars are welcome:
https://github.com/Freemanzxp/GBDT_Simple_Tutorial
2 Data
As shown in the table below, the data set has two features, age and weight, with height as the label. There are five records in total: the first four are training samples and the last is the sample to be predicted.

| ID | Age (years) | Weight (kg) | Height (m, label) |
|---|---|---|---|
| 0 | 5 | 20 | 1.1 |
| 1 | 7 | 30 | 1.3 |
| 2 | 21 | 70 | 1.7 |
| 3 | 30 | 60 | 1.8 |
| 4 | 25 | 65 | ? |
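For reference, here is the table as a pandas DataFrame, in the column layout the code in Section 3.4 expects: the first column is an id, the last column is the label, and everything in between is a feature. The column names age and weight are illustrative choices, not fixed by the project:

import pandas as pd

# the first four rows are the training samples; fit() takes columns [1:-1] as features
train = pd.DataFrame({'id': [0, 1, 2, 3],
                      'age': [5, 7, 21, 30],
                      'weight': [20, 30, 70, 60],
                      'label': [1.1, 1.3, 1.7, 1.8]})
# the fifth record is the sample whose height we want to predict
test = pd.DataFrame({'id': [4], 'age': [25], 'weight': [65]})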
3 Complete Code
3.1 Dependencies
- OS: Windows/Linux
- Language: Python 3
- Python libraries: pandas, PIL, pydotplus

pydotplus invokes Graphviz under the hood, so download graphviz-2.38.msi from the Graphviz website and install it, then add the bin directory under the install path to the system PATH. If errors persist after that, reboot the machine. The details are not repeated here; plenty of walkthroughs exist online.
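Before running the project, you can sanity-check that pydotplus can actually reach Graphviz with a minimal test (the file name check.png is arbitrary):

import pydotplus

# render a trivial graph; this raises an exception if the Graphviz
# binaries are missing or not on the system PATH
graph = pydotplus.graph_from_dot_data('digraph G { a -> b; }')
graph.write_png('check.png')
print('Graphviz is reachable')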
3.2 File Structure
- example.py: test driver for regression / binary classification / multi-class classification
- GBDT: main module directory
  - gbdt.py: the main gradient boosting framework
  - decision_tree.py: builds a single tree, including node splitting and leaf generation
  - loss_function.py: loss functions
  - tree_plot.py: tree visualization
(Figure: project module diagram)
3.3 Usage
- Regression test:
  python example.py --model=regression
- Binary classification test:
  python example.py --model=binary_cf
- Multi-class classification test:
  python example.py --model=multi_cf
- Other configurable parameters: lr (learning rate), trees (number of trees to build, i.e. the number of boosting iterations), depth (depth of each decision tree), count (minimum number of samples required to split a node), is_log (whether to print the tree-building process), is_plot (whether to visualize the tree structure).
- Result files: a run creates a results folder containing each tree's internal structure and the build logs. (A sketch of driving the same models directly from Python follows this list.)
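example.py drives the three tests from the command line, but the models can also be used directly from Python. A minimal sketch using the classes listed in Section 3.4 (the hyperparameter values here are illustrative, and the results directory must exist up front because fit() writes per-tree logs into it):

import os
import pandas as pd
from GBDT.gbdt import GradientBoostingRegressor

os.makedirs('results', exist_ok=True)  # fit() writes NO.<m>_tree.log files here

train = pd.DataFrame({'id': [0, 1, 2, 3],
                      'age': [5, 7, 21, 30],
                      'weight': [20, 30, 70, 60],
                      'label': [1.1, 1.3, 1.7, 1.8]})
model = GradientBoostingRegressor(learning_rate=0.1, n_trees=5, max_depth=3)
model.fit(train)

test = pd.DataFrame({'id': [4], 'age': [25], 'weight': [65]})
model.predict(test)            # adds f_0 ... f_5 and predict_value columns in place
print(test['predict_value'])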
3.4 Full Source
The complete code of two of the project's core modules, gbdt.py and decision_tree.py, is listed below. First, gbdt.py:
"""
Created on :2019/03/28
@author: Freeman, feverfc1994
"""
import abc
import math
import logging
import pandas as pd
from GBDT.decision_tree import Tree
from GBDT.loss_function import SquaresError, BinomialDeviance, MultinomialDeviance
from GBDT.tree_plot import plot_tree, plot_all_trees, plot_multi
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
class AbstractBaseGradientBoosting(metaclass=abc.ABCMeta):
def __init__(self):
pass
def fit(self, data):
pass
def predict(self, data):
pass
class BaseGradientBoosting(AbstractBaseGradientBoosting):
def __init__(self, loss, learning_rate, n_trees, max_depth,
min_samples_split=2, is_log=False, is_plot=False):
super().__init__()
self.loss = loss
self.learning_rate = learning_rate
self.n_trees = n_trees
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.features = None
self.trees = {}
self.f_0 = {}
self.is_log = is_log
self.is_plot = is_plot
def fit(self, data):
"""
        :param data: pandas.DataFrame, the training data
"""
        # drop the first (id) and last (label) columns to keep the feature names
self.features = list(data.columns)[1: -1]
        # initialize f_0(x)
        # for squared loss, f_0(x) is simply the mean of y
self.f_0 = self.loss.initialize_f_0(data)
        # for m = 1, 2, ..., M
logger.handlers[0].setLevel(logging.INFO if self.is_log else logging.CRITICAL)
        for iter in range(1, self.n_trees + 1):
if len(logger.handlers) > 1:
logger.removeHandler(logger.handlers[-1])
fh = logging.FileHandler('results/NO.{}_tree.log'.format(iter), mode='w', encoding='utf-8')
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
            # compute the negative gradient; for squared loss this is just the residual
            logger.info(('-----------------------------Building tree %d-----------------------------' % iter))
            self.loss.calculate_residual(data, iter)
            target_name = 'res_' + str(iter)
self.trees[iter] = Tree(data, self.max_depth, self.min_samples_split,
self.features, self.loss, target_name, logger)
self.loss.update_f_m(data, self.trees, iter, self.learning_rate, logger)
if self.is_plot:
plot_tree(self.trees[iter], max_depth=self.max_depth, iter=iter)
# print(self.trees)
if self.is_plot:
plot_all_trees(self.n_trees)
class GradientBoostingRegressor(BaseGradientBoosting):
def __init__(self, learning_rate, n_trees, max_depth,
min_samples_split=2, is_log=False, is_plot=False):
super().__init__(SquaresError(), learning_rate, n_trees, max_depth,
min_samples_split, is_log, is_plot)
def predict(self, data):
data['f_0'] = self.f_0
        for iter in range(1, self.n_trees + 1):
            f_prev_name = 'f_' + str(iter - 1)
            f_m_name = 'f_' + str(iter)
            # F_m(x) = F_{m-1}(x) + lr * h_m(x)
            data[f_m_name] = data[f_prev_name] + \
                self.learning_rate * \
                data.apply(lambda x: self.trees[iter].root_node.get_predict_value(x), axis=1)
        data['predict_value'] = data[f_m_name]
class GradientBoostingBinaryClassifier(BaseGradientBoosting):
def __init__(self, learning_rate, n_trees, max_depth,
min_samples_split=2, is_log=False, is_plot=False):
super().__init__(BinomialDeviance(), learning_rate, n_trees, max_depth,
min_samples_split, is_log, is_plot)
def predict(self, data):
data['f_0'] = self.f_0
        for iter in range(1, self.n_trees + 1):
            f_prev_name = 'f_' + str(iter - 1)
            f_m_name = 'f_' + str(iter)
            data[f_m_name] = data[f_prev_name] + \
                self.learning_rate * \
                data.apply(lambda x: self.trees[iter].root_node.get_predict_value(x), axis=1)
        # squash the raw score into a probability with the sigmoid function
        data['predict_proba'] = data[f_m_name].apply(lambda x: 1 / (1 + math.exp(-x)))
        data['predict_label'] = data['predict_proba'].apply(lambda x: 1 if x >= 0.5 else 0)
class GradientBoostingMultiClassifier(BaseGradientBoosting):
def __init__(self, learning_rate, n_trees, max_depth,
min_samples_split=2, is_log=False, is_plot=False):
super().__init__(MultinomialDeviance(), learning_rate, n_trees, max_depth,
min_samples_split, is_log, is_plot)
def fit(self, data):
        # drop the first (id) and last (label) columns to keep the feature names
        self.features = list(data.columns)[1: -1]
        # collect all class labels
        self.classes = data['label'].unique().astype(str)
        # initialize K, the number of classes, for the multinomial loss
        self.loss.init_classes(self.classes)
        # one-hot encode the 'label' column by class
        for class_name in self.classes:
            label_name = 'label_' + class_name
            data[label_name] = data['label'].apply(lambda x: 1 if str(x) == class_name else 0)
            # initialize f_0(x) for this class
            self.f_0[class_name] = self.loss.initialize_f_0(data, class_name)
# print(data)
        # for m = 1, 2, ..., M
logger.handlers[0].setLevel(logging.INFO if self.is_log else logging.CRITICAL)
        for iter in range(1, self.n_trees + 1):
if len(logger.handlers) > 1:
logger.removeHandler(logger.handlers[-1])
fh = logging.FileHandler('results/NO.{}_tree.log'.format(iter), mode='w', encoding='utf-8')
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
            logger.info(('-----------------------------Building tree %d-----------------------------' % iter))
            # the negative gradients of all classes are computed together so that p_sum stays consistent
            self.loss.calculate_residual(data, iter)
            self.trees[iter] = {}
            for class_name in self.classes:
                target_name = 'res_' + class_name + '_' + str(iter)
self.trees[iter][class_name] = Tree(data, self.max_depth, self.min_samples_split,
self.features, self.loss, target_name, logger)
self.loss.update_f_m(data, self.trees, iter, class_name, self.learning_rate, logger)
if self.is_plot:
plot_multi(self.trees[iter], max_depth=self.max_depth, iter=iter)
if self.is_plot:
plot_all_trees(self.n_trees)
def predict(self, data):
"""
        Prediction is organized differently from tree building:
        during training, each iteration must grow the trees of every class together
        (outer loop over iter, inner loop over class), but at prediction time the trees
        already exist, so the class loop can become the outer loop to save computation.
"""
        for class_name in self.classes:
            f_0_name = 'f_' + class_name + '_0'
            data[f_0_name] = self.f_0[class_name]
            for iter in range(1, self.n_trees + 1):
                f_prev_name = 'f_' + class_name + '_' + str(iter - 1)
                f_m_name = 'f_' + class_name + '_' + str(iter)
                data[f_m_name] = data[f_prev_name] + \
                    self.learning_rate * data.apply(
                        lambda x: self.trees[iter][class_name].root_node.get_predict_value(x), axis=1)
        # softmax: normalize the final per-class scores into probabilities
        data['sum_exp'] = data.apply(lambda x:
                                     sum([math.exp(x['f_' + i + '_' + str(iter)]) for i in self.classes]), axis=1)
        for class_name in self.classes:
            proba_name = 'predict_proba_' + class_name
            f_m_name = 'f_' + class_name + '_' + str(iter)
            data[proba_name] = data.apply(lambda x: math.exp(x[f_m_name]) / x['sum_exp'], axis=1)
        # TODO: log the probability of each class
        data['predict_label'] = data.apply(lambda x: self._get_multi_label(x), axis=1)
def _get_multi_label(self, x):
label = None
max_proba = -1
for class_name in self.classes:
            if x['predict_proba_' + class_name] > max_proba:
                max_proba = x['predict_proba_' + class_name]
label = class_name
return label
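To make the squared-loss pieces above concrete with the Section 2 data: initialize_f_0 returns the mean of y, and calculate_residual produces the negative gradients, which for squared loss are plain residuals. A quick check by hand:

# training heights from Section 2
y = [1.1, 1.3, 1.7, 1.8]
f_0 = sum(y) / len(y)              # 1.475, the initial prediction for every sample
res_1 = [yi - f_0 for yi in y]     # the targets the first tree is fit to
print(f_0)                         # 1.475
print(res_1)                       # ~[-0.375, -0.175, 0.225, 0.325] up to float rounding

Each subsequent tree is then fit to the residuals left over after the previous update, which is shrunk by the learning rate.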
decision_tree.py
"""
Created on :2019/03/30
@author: Freeman, feverfc1994
"""
class Node:
def __init__(self, data_index, logger=None, split_feature=None, split_value=None, is_leaf=False, loss=None,
deep=None):
self.loss = loss
self.split_feature = split_feature
self.split_value = split_value
self.data_index = data_index
self.is_leaf = is_leaf
self.predict_value = None
self.left_child = None
self.right_child = None
self.logger = logger
self.deep = deep
def update_predict_value(self, targets, y):
self.predict_value = self.loss.update_leaf_values(targets, y)
        self.logger.info(('Leaf node predicted value:', self.predict_value))
def get_predict_value(self, instance):
if self.is_leaf:
self.logger.info(('predict:', self.predict_value))
return self.predict_value
if instance[self.split_feature] < self.split_value:
return self.left_child.get_predict_value(instance)
else:
return self.right_child.get_predict_value(instance)
class Tree:
def __init__(self, data, max_depth, min_samples_split, features, loss, target_name, logger):
self.loss = loss
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.features = features
self.logger = logger
self.target_name = target_name
self.remain_index = [True] * len(data)
self.leaf_nodes = []
self.root_node = self.build_tree(data, self.remain_index, depth=0)
def build_tree(self, data, remain_index, depth=0):
"""
        Three conditions must all hold for the tree to keep growing:
        1. depth has not reached the maximum: if max_depth is 3, i.e. the tree should
           grow to 3 levels, depth here can only be 0 or 1, hence depth < self.max_depth - 1
        2. the node holds at least min_samples_split samples
        3. the target_name values at this node are not all identical
           (if they are, the node is already pure and needs no further split)
"""
now_data = data[remain_index]
        if depth < self.max_depth - 1 \
                and len(now_data) >= self.min_samples_split \
                and len(now_data[self.target_name].unique()) > 1:
se = None
split_feature = None
split_value = None
left_index_of_now_data = None
right_index_of_now_data = None
            self.logger.info(('--tree depth: %d' % depth))
for feature in self.features:
                self.logger.info(('----splitting feature:', feature))
feature_values = now_data[feature].unique()
for fea_val in feature_values:
                    # try this split
                    left_index = list(now_data[feature] < fea_val)
                    right_index = list(now_data[feature] >= fea_val)
                    left_se = calculate_se(now_data[left_index][self.target_name])
                    right_se = calculate_se(now_data[right_index][self.target_name])
                    sum_se = left_se + right_se
                    self.logger.info(('------split value: %.3f, left-node loss: %.3f, right-node loss: %.3f, total loss: %.3f' %
                                      (fea_val, left_se, right_se, sum_se)))
if se is None or sum_se < se:
split_feature = feature
split_value = fea_val
se = sum_se
left_index_of_now_data = left_index
right_index_of_now_data = right_index
            self.logger.info(('--best split feature:', split_feature))
            self.logger.info(('--best split value:', split_value))
node = Node(remain_index, self.logger, split_feature, split_value, deep=depth)
"""
trick for DataFrame, index revert
            The code below records, for the samples on each side of the split, their
            indices in the original data. A DataFrame can be indexed with True/False,
            so each array built here is a boolean mask over all rows, which is then
            used to select the data of the child node.
"""
left_index_of_all_data = []
for i in remain_index:
if i:
if left_index_of_now_data[0]:
left_index_of_all_data.append(True)
del left_index_of_now_data[0]
else:
left_index_of_all_data.append(False)
del left_index_of_now_data[0]
else:
left_index_of_all_data.append(False)
right_index_of_all_data = []
for i in remain_index:
if i:
if right_index_of_now_data[0]:
right_index_of_all_data.append(True)
del right_index_of_now_data[0]
else:
right_index_of_all_data.append(False)
del right_index_of_now_data[0]
else:
right_index_of_all_data.append(False)
            node.left_child = self.build_tree(data, left_index_of_all_data, depth + 1)
            node.right_child = self.build_tree(data, right_index_of_all_data, depth + 1)
return node
else:
node = Node(remain_index, self.logger, is_leaf=True, loss=self.loss, deep=depth)
            # multi-class residual columns look like 'res_<class>_<iter>'
            if len(self.target_name.split('_')) == 3:
                label_name = 'label_' + self.target_name.split('_')[1]
else:
label_name = 'label'
node.update_predict_value(now_data[self.target_name], now_data[label_name])
self.leaf_nodes.append(node)
return node
def calculate_se(label):
mean = label.mean()
se = 0
for y in label:
        se += (y - mean) * (y - mean)
return se
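To see build_tree's exhaustive split scan in action, here is the same search run by hand on the age feature, using the first-tree residuals computed earlier (calculate_se is re-implemented here for plain lists, and a guard is added for the minimal split value, where one side is empty):

def se_of(values):
    # same quantity as calculate_se, for a plain list
    mean = sum(values) / len(values)
    return sum((v - mean) ** 2 for v in values)

ages = [5, 7, 21, 30]
res_1 = [-0.375, -0.175, 0.225, 0.325]   # residuals after f_0 = 1.475

best = None
for split in sorted(set(ages)):
    left = [r for a, r in zip(ages, res_1) if a < split]
    right = [r for a, r in zip(ages, res_1) if a >= split]
    if not left or not right:
        continue                          # splitting at the minimum leaves one side empty
    score = se_of(left) + se_of(right)
    if best is None or score < best[0]:
        best = (score, split)
print(best)   # ~(0.025, 21): split at age < 21

With the Section 2 column order, weight < 60 yields the same total loss of 0.025; because build_tree only replaces the incumbent on a strictly smaller sum_se, it keeps the first best candidate it finds in feature order.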
Output: