Published: 2024-11-09 18:01
XGBoost model interpretability relies on the pred_leaf and pred_contribs arguments of xgb.predict().
I first tried training with XGBRegressor + fit(), but its predict() does not expose the pred_leaf / pred_contribs options, so I used xgb.train instead (a possible alternative via the sklearn wrapper is sketched just below).
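Side note, purely a sketch under my own assumptions (the names reg and X are hypothetical, not from this post): the sklearn wrapper exposes its underlying Booster via get_booster(), so the same low-level predict options should also be reachable without switching APIs.
# booster = reg.get_booster()                                     # reg: a fitted XGBRegressor (hypothetical)
# leaves   = booster.predict(xgb.DMatrix(X), pred_leaf=True)      # X: a feature DataFrame (hypothetical)
# contribs = booster.predict(xgb.DMatrix(X), pred_contribs=True)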
Model training process:
#----------------------------------------------------------------------XGBoost 模型-------------------------------------------------------------#
#XGBoost hyperparameter tuning strategy
import xgboost as xgb
import pandas as pd
import numpy as np
import pickle
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error,make_scorer
from sklearn.preprocessing import StandardScaler
# from sklearn.grid_search import GridSearchCV   # old sklearn versions
from scipy.sparse import csr_matrix,hstack
from sklearn.model_selection import KFold,train_test_split,GridSearchCV
from xgboost import XGBRFRegressor,XGBRegressor
import warnings
warnings.filterwarnings('ignore')
#Data preprocessing
train =pd.read_csv('D:/机器学习算法/allstate-claims-severity/train.csv')
#Log-transform the target
train['log_loss']=np.log(train['loss'])
#Split features into categorical and numerical
features=[x for x in train.columns if x not in ['id','loss','log_loss']]
cat_features=[x for x in train.select_dtypes(include=['object']).columns
              if x not in ['id','loss','log_loss']]   # 72 binary features, the others multi-valued
num_features=[x for x in train.select_dtypes(exclude=['object']).columns
              if x not in ['id','loss','log_loss']]
print("Categorical features:",len(cat_features))
print("Numerical features:",len(num_features))
ntrain=train.shape[0]
train_x=train[features]
train_y=train['log_loss']
#Encode the categorical features as integer codes
for c in range(len(cat_features)):
    train_x[cat_features[c]]=train_x[cat_features[c]].astype('category').cat.codes
#Dict: per-column mapping from category string to integer code
category_dict={col: {cat: n for n, cat in enumerate(train[col].astype('category').cat.categories )}
for col in cat_features}
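# Illustration (values hypothetical, not taken from the actual data): an entry of category_dict
# looks like  category_dict['cat1'] -> {'A': 0, 'B': 1}, i.e. the same integer codes that
# .cat.codes assigned above, so the mapping can be re-applied to new data at prediction time
# (this is exactly what the model-loading section at the end does).
# print(list(category_dict[cat_features[0]].items())[:3])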
#Simple XGBoost Model
#Model: xgboost defines its own data-matrix class DMatrix, which does a one-off preprocessing pass when training starts so that every later boosting iteration runs faster
dtrain=xgb.DMatrix(train_x,train['log_loss'])
'''
XGBoost parameters
'booster':'gbtree'           gradient-boosted regression trees
'objective':'multi:softmax'  objective / loss function; multi:softmax is for multi-class problems, other objectives cover classification and regression
'num_class':10               number of classes, used together with multi:softmax
'gamma'                      minimum loss reduction required before a node is split
'max_depth':12               depth of each tree; the larger, the easier it is to overfit
'lambda':2                   weight of the L2 regularisation term controlling model complexity, i.e. 1/2*lambda*w_i^2;
                             the larger lambda is, the smaller the leaf weights w_i must be, so the model overfits less
'subsample':0.7              row subsampling ratio for the training instances
'colsample_bytree':0.7       column subsampling ratio when building each tree
'min_child_weight':3         minimum sum of instance weights in a child node; a leaf whose weight sum falls below it is not split further
'silent':0                   set to 1 to suppress training output; better kept at 0
'eta':0.007                  acts like a learning rate, shrinking the contribution of each tree
'seed':1000                  random seed
'nthread':7                  number of CPU threads
'''
'''
XGBoost parameter tuning
step1: pick a reasonable set of initial parameters
step2: tune max_depth and min_child_weight
step3: tune gamma to reduce the risk of overfitting
step4: tune subsample and colsample_bytree to change the data-sampling strategy
step5: tune the learning rate eta
(a hedged GridSearchCV sketch of step2 follows right after this block)
'''
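# Hedged sketch of step2 with the GridSearchCV / XGBRegressor already imported above;
# the grid values, scorer and learning_rate here are illustrative assumptions, not the
# settings behind the results reported further down.
mae_scorer = make_scorer(lambda y, yhat: mean_absolute_error(np.exp(y), np.exp(yhat)),
                         greater_is_better=False)
param_grid = {'max_depth': [6, 8, 10], 'min_child_weight': [3, 6, 9]}
# search = GridSearchCV(XGBRegressor(n_estimators=200, learning_rate=0.075, n_jobs=-1),
#                       param_grid, scoring=mae_scorer, cv=3)
# search.fit(train_x, train_y)      # uncomment to run; slow on the full training set
# print(search.best_params_)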
def xg_eval_mae(yhat,dtrain):
    y=dtrain.get_label()
    return 'mae',mean_absolute_error(np.exp(y),np.exp(yhat))
#Model: 200 trees
params = {'booster': 'gbtree',
          'objective': 'reg:linear',
          'eval_metric': 'mae',
          'max_depth': 8,
          'gamma': 0.2,
          # 'lambda': 10,
          'subsample': 0.9,
          'colsample_bytree': 0.6,
          'min_child_weight': 6,
          'eta': 0.075,
          'seed': 0,
          'nthread': -1,
          'silent': 1}
watchlist = [(dtrain, 'train')]
bst=xgb.train(params,dtrain,num_boost_round=200,evals=watchlist)
train_x_D=xgb.DMatrix(train_x)
# ('mae', 1047.6263)
xg_eval_mae(bst.predict(train_x_D),dtrain)
loss=np.mean(np.abs(pd.DataFrame(np.exp(bst.predict(train_x_D)))-pd.DataFrame(np.exp(train_y.values))))
#('mae', 1047.6263). Using xgb.train gives a smaller MAE than my XGBRegressor run below; the most likely reason is that some default parameters differ between the two interfaces (see the comparison sketch after the XGBRegressor result), still to be verified.
The XGBRegressor result, for comparison:
#XGBRegressor
grid=XGBRegressor(booster='gbtree',objective='reg:linear',silent=1,seed=0,
                  n_estimators=200,gamma=0.2,max_depth=8,min_child_weight=6,
                  colsample_bytree=0.6,subsample=0.9,eta=0.075)  # fixed the colsample_bytree typo; n_estimators already plays the role of num_boost_round
grid.fit(train_x,train_y.values)
# ('mae', 1092.912)
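# Hedged way to chase the MAE gap noted above: compare the parameters each interface actually
# hands to the booster; differing defaults (e.g. eta/learning_rate, subsample) are the most
# plausible explanation.
print(grid.get_xgb_params())   # effective parameters of the sklearn wrapper
print(params)                  # parameters passed to xgb.train earlier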
#Explanation of the pred_leaf | pred_contribs arguments:
#The prediction output can be produced in three different ways
?bst.predict
Signature: bst.predict(data, output_margin=False, ntree_limit=0, pred_leaf=False, pred_contribs=False, approx_contribs=False)
pred_leaf : bool
When this option is on, the output will be a matrix of (nsample, ntrees)
with each record indicating the predicted leaf index of each sample in each tree.
Note that the leaf index of a tree is unique per tree, so you may find leaf 1
in both tree 1 and tree 0.
pred_contribs : bool
When this option is on, the output will be a matrix of (nsample, nfeats+1)
with each record indicating the feature contributions (SHAP values) for that
prediction. The sum of all feature contributions is equal to the prediction.
Note that the bias is added as the final column, on top of the regular features.
#pred_leaf: which leaf each sample falls into in every tree
>>> bst.predict(train_x_D,pred_leaf=True)
array([[ 82, 89, 83, ..., 300, 209, 113],
[ 82, 89, 83, ..., 300, 209, 113],
[ 35, 91, 33, ..., 300, 209, 113],
...,
[ 53, 36, 75, ..., 300, 210, 113],
[ 68, 55, 28, ..., 300, 165, 113],
[ 87, 66, 58, ..., 300, 207, 113]])
For the first sample, the leaf assignments are: leaf 82 in tree 1, leaf 89 in tree 2, ..., leaf 113 in tree 200 (one column per boosting round).
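A quick shape sanity check (a sketch added here, not part of the original run):
leaves = bst.predict(train_x_D, pred_leaf=True)
print(leaves.shape)    # expected (nsample, ntrees), i.e. (len(train_x), 200) here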
#pred_contribs: the contribution (SHAP value) of every feature to each prediction
>>> bst.predict(train_x_D,pred_contribs=True)
array([[ 2.8121941e-02, 5.2060433e-02, -3.4024785e-03, ...,
-8.2149508e-04, 4.9418624e-02, 7.6852226e+00],
[ 3.6447022e-02, 5.8015808e-02, -4.4615464e-03, ...,
7.1160109e-03, -1.0666179e-01, 7.6852226e+00],
[ 2.3991451e-02, 5.5216342e-02, -3.0620436e-03, ...,
-1.2132139e-02, 6.0120925e-02, 7.6852226e+00],
...,
[ 2.1242619e-02, 5.0967444e-02, -2.6617078e-03, ...,
2.4747679e-03, -2.9158747e-02, 7.6852226e+00],
[ 2.7094083e-02, 4.0860437e-02, -4.0874789e-03, ...,
-4.5163785e-03, 3.9668504e-02, 7.6852226e+00],
[-5.5207822e-02, -2.8394390e-02, -2.1846974e-03, ...,
-3.7365366e-04, 2.2743054e-02, 7.6852226e+00]], dtype=float32)
>>>
Model predictions:
>>> bst.predict(train_x_D)
array([7.5808964, 7.4751277, 8.305426 , ..., 8.342698 , 7.9838195,
8.1374 ], dtype=float32)
Sum of the feature contributions for the first sample:
>>> temp=bst.predict(train_x_D,pred_contribs=True)
>>> score_a=sum(temp[0])
>>> score_a
7.5808893905342245
>>>
For the first sample the model prediction is 7.5808964 and the sum of its feature contributions is 7.5808893905342245; the two essentially agree, and the tiny difference is most likely float32 precision.
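This can be checked for all samples at once (a sketch added here, not part of the original run); the contributions should reproduce the predictions up to float32 rounding.
preds = bst.predict(train_x_D)
contribs = bst.predict(train_x_D, pred_contribs=True)       # shape (nsample, nfeats + 1), bias in the last column
print(np.allclose(preds, contribs.sum(axis=1), atol=1e-3))  # expected: True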
Saving the model:
#1. Save the booster
bst.save_model('D:/机器学习算法/xgboost_model/xgb.model')
# 2. Write the category dict to JSON
import json
with open('D:/机器学习算法/jsonFile.json', 'w') as fileObject:
    fileObject.write(json.dumps(category_dict))
# 3. Save the feature list
with open('D:/机器学习算法/features.json', 'w') as fileObject:
    fileObject.write(json.dumps(features))
# 4. Save the categorical feature list
with open('D:/机器学习算法/cat_features.json', 'w') as fileObject:
    fileObject.write(json.dumps(cat_features))
==================================================
Using the saved model:
#1. Load the model
bst = xgb.Booster()
bst.load_model('D:/机器学习算法/xgboost_model/xgb.model')
#Load the category dict
import json
with open('D:/机器学习算法/jsonFile.json', 'r') as file:
    dic = json.loads(file.read())
#Load the feature list
with open('D:/机器学习算法/features.json', 'r') as file:
    features = json.loads(file.read())
#Load the categorical feature list
with open('D:/机器学习算法/cat_features.json', 'r') as file:
    list_features = json.loads(file.read())
#End;
outmodel_test=pd.read_csv('D:/机器学习算法/allstate-claims-severity/test.csv')
for col in list_features:
    outmodel_test[col]=outmodel_test[col].map(dic.get(col))   # re-apply the training-time category codes
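# Hedged caution (my addition, not in the original post): Series.map returns NaN for category
# strings never seen in training; mapping them to -1 (the code .cat.codes uses for missing
# values) keeps the test-time encoding explicit instead of leaving NaNs for XGBoost to treat as missing.
outmodel_test[list_features] = outmodel_test[list_features].fillna(-1).astype(int)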
temp=bst.predict(xgb.DMatrix(outmodel_test[features]))
#Invert the log transform: np.exp() undoes the earlier np.log()
predict_outmodel=np.round(np.exp(temp),4)
#pred_leaf:
>>> temp=bst.predict(xgb.DMatrix(outmodel_test[features]))
>>> temp
array([7.3588824, 7.6385818, 9.200299 , ..., 7.8160214, 6.907146 ,
7.925376 ], dtype=float32)
>>> predict_outmodel=np.round(np.exp(temp),4)
>>> predict_outmodel
array([1570.0809, 2076.7964, 9900.092 , ..., 2480.0188, 999.3909,
2766.6042], dtype=float32)
>>> temp=bst.predict(xgb.DMatrix(outmodel_test[features]),pred_leaf=True)
>>> temp
array([[ 82, 89, 83, ..., 300, 209, 113],
[ 85, 79, 83, ..., 225, 214, 113],
[ 10, 73, 22, ..., 300, 220, 113],
...,
[ 26, 89, 74, ..., 300, 209, 113],
[ 86, 81, 81, ..., 300, 208, 113],
[ 58, 59, 77, ..., 225, 209, 113]])
>>>
==================================================
I had also wanted to use shap to inspect the model, but pip install shap failed with an error under Python 3.7, so that part was skipped; for reference, the intended usage is sketched below.
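A hedged sketch only (not run here, since the install failed); shap.TreeExplainer accepts a trained xgboost Booster:
# import shap
# explainer = shap.TreeExplainer(bst)
# shap_values = explainer.shap_values(train_x)   # same quantities as pred_contribs, minus the bias column
# shap.summary_plot(shap_values, train_x)        # global view of feature impact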
Reference material:
1. Interpreting the outputs of an xgboost model
2. No longer a black box: SHAP as a machine-learning interpretability tool, principles and practice
3. Python machine learning notes: the XGBoost algorithm