LightGBM K-Fold Training Template

Basic template: built-in metrics

import pandas as pd
from sklearn.model_selection import KFold 
import lightgbm as lgb

from sklearn.metrics import accuracy_score

# training data
train = pd.read_csv(...)
X_train = train[...]
y_train = train[...]

# test data
X_test = pd.read_csv(...)


# LightGBM parameters
lgb_params = {
    "objective": "xxx",
    "metric": "xxx",
    "boosting_type": "gbdt",
    "early_stopping_rounds": 100,
    "learning_rate": 0.01,
    "colsample_bytree": 0.9,
}
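For reference, a hypothetical filled-in version of these parameters for a binary-classification task (the "xxx" placeholders above are for your own task):

# hedged example: concrete values for a binary-classification task
lgb_params_binary = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "early_stopping_rounds": 100,
    "learning_rate": 0.01,
    "colsample_bytree": 0.9,
}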

# K-fold
kf = KFold(n_splits=5)

# store every fold's model
lgb_models = []

# accumulate the averaged test predictions
y_pred = 0

for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    
    # split into training and validation folds
    X_trn, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    
    # build the LightGBM datasets
    lgb_train = lgb.Dataset(X_trn, y_trn)
    lgb_valid = lgb.Dataset(X_val, y_val)
    
    # train
    clf = lgb.train(params=lgb_params,                  # training parameters
                    train_set=lgb_train,                # training set
                    valid_sets=[lgb_train, lgb_valid],  # evaluate on train + valid
                    num_boost_round=10000,              # maximum number of rounds
                    verbose_eval=100,                   # print every 100 rounds
                    )
    
    y_pred += clf.predict(X_test) / 5.0  # average the 5 fold models' predictions
    
    lgb_models.append(clf) 
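Because every fold's model is kept in lgb_models, the ensemble can be reused on new data later. A minimal sketch (predict_with_folds and X_new are illustrative names, not part of the template):

import numpy as np

# average the stored fold models' predictions, mirroring the X_test averaging above
def predict_with_folds(models, X_new):
    return np.mean([m.predict(X_new) for m in models], axis=0)

# usage: y_pred_new = predict_with_folds(lgb_models, X_new)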

Advanced template: custom metrics

Pass the custom metric function to lgb.train via feval=your_metric_function.

Input format:

  • preds: a numpy array of predictions
  • lgb_train: the Dataset; call .get_label() on it to get the labels

Output format:

  • the name of the error
  • the computed error value
  • whether a higher error value is better (is_higher_better)
# custom metric
def my_metrics(preds, lgb_train):
    labels = lgb_train.get_label()
    error = ...  # compute the error from labels and preds
    return 'error_name', error, True  # is_higher_better: True or False
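A concrete instance of the skeleton above, assuming a binary objective whose raw predictions are probabilities (lgb_accuracy is an illustrative name):

from sklearn.metrics import accuracy_score

def lgb_accuracy(preds, lgb_train):
    labels = lgb_train.get_label()
    preds_binary = (preds > 0.5).astype(int)  # threshold the probabilities
    return 'accuracy', accuracy_score(labels, preds_binary), True  # higher is better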

# the corresponding LightGBM training code

import pandas as pd
from sklearn.model_selection import KFold 
import lightgbm as lgb

from sklearn.metrics import accuracy_score

# training data
train = pd.read_csv(...)
X_train = train[...]
y_train = train[...]

# test data
X_test = pd.read_csv(...)


# LightGBM parameters
lgb_params = {
    "objective": "xxx",
    # no "metric" entry here; setting "metric": "None" silences the default
    # metric so that only feval is reported
    "boosting_type": "gbdt",
    "early_stopping_rounds": 100,
    "learning_rate": 0.01,
    "colsample_bytree": 0.9,
}

# K-fold
kf = KFold(n_splits=5)

# store every fold's model
lgb_models = []

# accumulate the averaged test predictions
y_pred = 0

for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    
    # split into training and validation folds
    X_trn, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    
    # build the LightGBM datasets
    lgb_train = lgb.Dataset(X_trn, y_trn)
    lgb_valid = lgb.Dataset(X_val, y_val)
    
    # train
    clf = lgb.train(params=lgb_params,                  # training parameters
                    train_set=lgb_train,                # training set
                    valid_sets=[lgb_train, lgb_valid],  # evaluate on train + valid
                    num_boost_round=10000,              # maximum number of rounds
                    verbose_eval=100,                   # print every 100 rounds
                    feval=my_metrics,                   # the custom metric
                    )
    
    y_pred += clf.predict(X_test) / 5.0
    lgb_models.append(clf) 

XGBoost Training Template

For a custom metric, pass feval to xgb.train.

Format of the custom metric function:

Input format:

  • preds: the predictions
  • dtrain: a DMatrix; call .get_label() on it to get the labels

Output format:

  • the name of the error
  • the computed error value

# custom metric; the argument order (preds, dtrain) matches LightGBM's feval,
# but the returned tuple has no is_higher_better flag; pass maximize=True to
# xgb.train if your metric is higher-is-better
def my_metrics(preds, dtrain):
    labels = dtrain.get_label()
    error = ...  # compute the error from labels and preds
    return 'feval_xgb_age', error
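As a concrete instance, an RMSE metric under the same signature (xgb_rmse is an illustrative name; RMSE is lower-is-better, so no maximize flag is needed):

import numpy as np

def xgb_rmse(preds, dtrain):
    labels = dtrain.get_label()
    return 'rmse_custom', float(np.sqrt(np.mean((preds - labels) ** 2)))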

# the corresponding K-fold XGBoost training code
import pandas as pd
from sklearn.model_selection import KFold 
import xgboost as xgb

from sklearn.metrics import accuracy_score

# training data
train = pd.read_csv(...)
X_train = train[...]
y_train = train[...]

# test data
X_test = pd.read_csv(...)


# XGBoost parameters
xgb_params = {
    'objective': 'xxx',
    'colsample_bytree': 0.5,
    'gamma': 8,
    'max_depth': 4,
    'min_child_weight': 8,
    'reg_alpha': 10.0,
    'reg_lambda': 0.5,
    # note: 'n_estimators' belongs to the sklearn wrapper; with xgb.train the
    # number of rounds is set by num_boost_round instead
}

# K-fold
kf = KFold(n_splits=5)

# store every fold's model
xgb_models = []

# accumulate the averaged test predictions
y_pred = 0

for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print('-------- Fold', f)
    
    # split into training and validation folds
    X_trn, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    
    # build the XGBoost DMatrix datasets
    xgb_train = xgb.DMatrix(X_trn, y_trn)
    xgb_valid = xgb.DMatrix(X_val, y_val)
    xgb_test = xgb.DMatrix(X_test)
    
    clf = xgb.train(params=xgb_params,
                    dtrain=xgb_train,
                    evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                    num_boost_round=100000,
                    feval=my_metrics,
                    verbose_eval=100,
                    early_stopping_rounds=10,
                    )
    
    y_pred += clf.predict(xgb_test) / 5.0
    xgb_models.append(clf) 
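With early stopping you may prefer predictions from only the best iteration. A sketch, assuming xgboost >= 1.4, where Booster.predict accepts iteration_range:

# restrict prediction to the trees grown up to the best iteration
y_pred_fold = clf.predict(xgb_test, iteration_range=(0, clf.best_iteration + 1))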

Hyperopt Automatic Tuning

Building on the code above, hyperopt automatic hyperparameter tuning is easy to add.

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

space = {'max_depth': hp.quniform('max_depth', 3, 15, 1),
         'gamma': hp.uniform('gamma', 0.5, 20),
         'reg_alpha': hp.uniform('reg_alpha', 0, 20),
         'reg_lambda': hp.uniform('reg_lambda', 0, 20),
         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
         'min_child_weight': hp.uniform('min_child_weight', 0, 20),
         'seed': 0
         }

def objective(space, kfold=5):

    kf = KFold(n_splits=kfold)
    scores = []
    space['max_depth'] = int(space['max_depth'])  # hp.quniform yields floats
    print(space)
    for f, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
        X_trn, X_val = ...
        y_trn, y_val = ...

        xgb_train = xgb.DMatrix(X_trn, label=y_trn)
        xgb_valid = xgb.DMatrix(X_val, label=y_val)

        clf = xgb.train(params=space,
                        dtrain=xgb_train,
                        evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                        num_boost_round=100000,
                        feval=my_metrics,
                        verbose_eval=False,
                        early_stopping_rounds=100
                        )

        # score this fold with the custom metric, then average across folds
        scores.append(my_metrics(clf.predict(xgb_valid), xgb_valid)[1])
    res_metric = sum(scores) / len(scores)
    print("SCORE:", res_metric)
    # hyperopt minimizes the loss; negate a higher-is-better metric
    return {'loss': -res_metric, 'status': STATUS_OK}


trials = Trials()

best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=100,
                        trials=trials)
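fmin returns a dict of the best values found; hp.quniform dimensions come back as floats, so cast them before reuse:

best_hyperparams['max_depth'] = int(best_hyperparams['max_depth'])  # quniform yields floats
print(best_hyperparams)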