LightGBM K-Fold Training Template
Basic template: built-in metrics
import pandas as pd
from sklearn.model_selection import KFold
import lightgbm as lgb

# Training data
train = pd.read_csv(...)
X_train = train[...]
y_train = train[...]
# Test data
X_test = pd.read_csv(...)
# LightGBM parameters
lgb_params = {
    "objective": "xxx",
    "metric": "xxx",
    "boosting_type": "gbdt",
    "early_stopping_rounds": 100,
    "learning_rate": 0.01,
    "colsample_bytree": 0.9,
}
# K-fold split
kf = KFold(n_splits=5)
# Store every model trained across the folds
lgb_models = []
# Accumulator for the averaged test-set predictions
y_pred = 0
for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    # Split into training and validation folds
    X_trn, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    # Build LightGBM datasets
    lgb_train = lgb.Dataset(X_trn, y_trn)
    lgb_valid = lgb.Dataset(X_val, y_val)
    # Train
    clf = lgb.train(params=lgb_params,                  # training parameters
                    train_set=lgb_train,                # training fold
                    valid_sets=[lgb_train, lgb_valid],  # track metrics on both folds
                    num_boost_round=10000,              # upper bound on rounds
                    verbose_eval=100,                   # log every 100 rounds (LightGBM >= 4 uses callbacks=[lgb.log_evaluation(100)] instead)
                    )
    # Accumulate the averaged test-set predictions, using the early-stopped best iteration
    y_pred += clf.predict(X_test, num_iteration=clf.best_iteration) / kf.n_splits
    lgb_models.append(clf)
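The "xxx" placeholders depend on the task. As a hypothetical illustration (the "binary"/"auc" values and X_new below are my assumptions, not part of the template), a binary-classification setup and a later reuse of the stored fold models could look like this:

import numpy as np

# Assumed task: binary classification (illustrative values only)
lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "early_stopping_rounds": 100,
    "learning_rate": 0.01,
    "colsample_bytree": 0.9,
}

# Reusing the stored fold models later: average their predictions on new data
X_new = ...  # a DataFrame with the same columns as X_train
y_new_pred = np.mean([m.predict(X_new, num_iteration=m.best_iteration)
                      for m in lgb_models], axis=0)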
Advanced template: custom metrics
Pass the custom metric function to lgb.train via feval=your_metric_function.
Input format:
- preds: a numpy array of predictions
- lgb_train: the Dataset being evaluated; use .get_label() on it to fetch the labels
Output format (a 3-tuple):
- the metric's name
- the computed metric value
- whether a higher value is better
# Custom metric template
def my_metrics(preds, lgb_train):
    labels = lgb_train.get_label()
    error_value = ...  # compute your error from labels and preds
    return 'error_name', error_value, False  # last element: True if higher is better
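For a concrete instance, here is a hypothetical mean-absolute-error metric in this format (the name mae_metric and the numpy formula are my choices, not from the template); passing feval=mae_metric to lgb.train then reports it on every entry of valid_sets:

import numpy as np

def mae_metric(preds, lgb_train):
    labels = lgb_train.get_label()
    mae = np.mean(np.abs(labels - preds))
    # MAE is lower-is-better, so the third element is False
    return 'mae', mae, False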
# The corresponding LightGBM training code
import pandas as pd
from sklearn.model_selection import KFold
import lightgbm as lgb

# Training data
train = pd.read_csv(...)
X_train = train[...]
y_train = train[...]
# Test data
X_test = pd.read_csv(...)
# LightGBM parameters
lgb_params = {
    "objective": "xxx",
    # no "metric" here: the custom feval supplies it
    # (set "metric": "None" to disable the objective's default metric entirely)
    "boosting_type": "gbdt",
    "early_stopping_rounds": 100,
    "learning_rate": 0.01,
    "colsample_bytree": 0.9,
}
# K-fold split
kf = KFold(n_splits=5)
# Store every model trained across the folds
lgb_models = []
# Accumulator for the averaged test-set predictions
y_pred = 0
for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    # Split into training and validation folds
    X_trn, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    # Build LightGBM datasets
    lgb_train = lgb.Dataset(X_trn, y_trn)
    lgb_valid = lgb.Dataset(X_val, y_val)
    # Train
    clf = lgb.train(params=lgb_params,                  # training parameters
                    train_set=lgb_train,                # training fold
                    valid_sets=[lgb_train, lgb_valid],  # track metrics on both folds
                    num_boost_round=10000,              # upper bound on rounds
                    verbose_eval=100,                   # log every 100 rounds
                    feval=my_metrics,                   # custom metric
                    )
    # Accumulate the averaged test-set predictions, using the early-stopped best iteration
    y_pred += clf.predict(X_test, num_iteration=clf.best_iteration) / kf.n_splits
    lgb_models.append(clf)
XGBoost Training Template
If you need a custom metric, pass it to xgb.train via feval.
Custom metric function format:
Input format:
- preds: the predictions
- dtrain: a DMatrix; use .get_label() on it to fetch the labels
Output format (a 2-tuple; unlike LightGBM there is no higher-is-better flag, set the maximize argument of xgb.train instead):
- the metric's name
- the computed metric value
# Custom metric template. The argument order is the same as LightGBM's
# (preds first, then the data); only the return value differs.
def my_metrics(preds, dtrain):
    # dtrain is normally a DMatrix; fall back to a raw label array so the
    # function can also be called directly (as in the hyperopt code below)
    labels = dtrain.get_label() if hasattr(dtrain, 'get_label') else dtrain
    error_value = ...  # compute your error from labels and preds
    return 'error_name', error_value
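As a concrete instance, a hypothetical RMSE metric in this format (name and formula are my choices, not from the template) would be:

import numpy as np

def rmse_metric(preds, dtrain):
    labels = dtrain.get_label()
    rmse = float(np.sqrt(np.mean((labels - preds) ** 2)))
    return 'rmse', rmse

It would then be passed as xgb.train(..., feval=rmse_metric, maximize=False), since a lower RMSE is better.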
# The corresponding K-fold XGBoost training code
import pandas as pd
from sklearn.model_selection import KFold
import xgboost as xgb

# Training data
train = pd.read_csv(...)
X_train = train[...]
y_train = train[...]
# Test data
X_test = pd.read_csv(...)
# XGBoost parameters (set 'objective' for your task; the native API takes the
# round count from num_boost_round below, so sklearn-style n_estimators is dropped)
xgb_params = {
    'colsample_bytree': 0.5,
    'gamma': 8,
    'max_depth': 4,
    'min_child_weight': 8,
    'reg_alpha': 10.0,
    'reg_lambda': 0.5,
}
# K-fold split
kf = KFold(n_splits=5)
# Store every model trained across the folds
xgb_models = []
# Accumulator for the averaged test-set predictions
y_pred = 0
for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print('--------Fold', f)
    # Split into training and validation folds
    X_trn, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    # Build XGBoost DMatrix datasets
    xgb_train = xgb.DMatrix(X_trn, label=y_trn)
    xgb_valid = xgb.DMatrix(X_val, label=y_val)
    xgb_test = xgb.DMatrix(X_test)
    clf = xgb.train(params=xgb_params,
                    dtrain=xgb_train,
                    evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                    num_boost_round=100000,
                    feval=my_metrics,   # custom metric
                    maximize=False,     # True if the custom metric is higher-is-better
                    verbose_eval=100,
                    early_stopping_rounds=10
                    )
    # Accumulate the averaged test-set predictions
    y_pred += clf.predict(xgb_test) / kf.n_splits
    xgb_models.append(clf)
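One caveat: with early stopping, xgb.train returns the model from the last round, not the best one. A small sketch of predicting with only the best rounds (iteration_range requires a reasonably recent xgboost, which is an assumption about your install):

# Predict with only the trees up to the early-stopped best iteration
y_pred_best = clf.predict(xgb_test, iteration_range=(0, clf.best_iteration + 1))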
Automatic parameter tuning with hyperopt
Building on the code above, a hyperopt-based automatic hyperparameter search is easy to bolt on.
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Search space (hp.quniform samples floats, so max_depth is cast to int later)
space = {'max_depth': hp.quniform('max_depth', 3, 15, 1),
         'gamma': hp.uniform('gamma', 0.5, 20),
         'reg_alpha': hp.uniform('reg_alpha', 0, 20),
         'reg_lambda': hp.uniform('reg_lambda', 0, 20),
         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
         'min_child_weight': hp.uniform('min_child_weight', 0, 20),
         'seed': 0,
         }
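# Optional sanity check (an addition, not part of the original template):
# preview one random draw from the search space before running the search;
# hyperopt.pyll.stochastic.sample is hyperopt's built-in sampler.
from hyperopt.pyll.stochastic import sample
print(sample(space))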
def objective(space, n_folds=5):
    kf = KFold(n_splits=n_folds)
    score = 0
    space['max_depth'] = int(space['max_depth'])  # hp.quniform samples floats
    print(space)
    for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
        X_trn, X_val = ...  # split as in the K-fold template above
        y_trn, y_val = ...
        xgb_train = xgb.DMatrix(X_trn, label=y_trn)
        xgb_valid = xgb.DMatrix(X_val, label=y_val)
        clf = xgb.train(params=space,
                        dtrain=xgb_train,
                        evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                        num_boost_round=100000,
                        feval=my_metrics,    # the custom metric defined above
                        verbose_eval=False,  # stay quiet during the search
                        early_stopping_rounds=100
                        )
        # Score each fold on its own validation data and average across folds
        y_val_pred = clf.predict(xgb_valid)
        score += my_metrics(y_val_pred, y_val)[1] / n_folds
    print("SCORE:", score)
    # hyperopt minimizes the loss, so negate a higher-is-better metric
    return {'loss': -score, 'status': STATUS_OK}
trials = Trials()
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=100,
                        trials=trials)
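fmin returns the best raw samples, so integer-valued dimensions such as max_depth come back as floats. A minimal follow-up sketch (the final num_boost_round is an assumed value, not from the original):

# Cast integer-valued hyperparameters back to int
best_hyperparams['max_depth'] = int(best_hyperparams['max_depth'])
print(best_hyperparams)
# Retrain one final model on the full training set with the tuned parameters
final_model = xgb.train(params=best_hyperparams,
                        dtrain=xgb.DMatrix(X_train, label=y_train),
                        num_boost_round=1000)  # assumed; pick via the CV above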