Reusable Code

import pandas as pd
import numpy as np
import gc

from sklearn.model_selection import KFold

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

Wrappers around the various base classifiers, giving them a unified calling convention:

  • __init__: takes a params dict
  • train: takes X_train, y_train, X_val, y_val

class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        # Copy so the caller's dict is not mutated; fall back to an empty dict
        self.params = dict(params) if params else {}
        self.params['seed'] = seed
        
    def train(self, X_train, y_train, X_val, y_val, num_boost_round=100000, early_stopping_rounds=100, verbose_eval=100):
        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_valid = xgb.DMatrix(X_val, label=y_val)

        self.clf = xgb.train(params=self.params,
                             dtrain=xgb_train,
                             evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                             num_boost_round=num_boost_round,
                             verbose_eval=verbose_eval,
                             early_stopping_rounds=early_stopping_rounds)

    def predict(self, x):
        # xgb.train returns the last iteration's model; with early stopping,
        # restrict prediction to the best iteration (iteration_range needs xgboost >= 1.4)
        return self.clf.predict(xgb.DMatrix(x), iteration_range=(0, self.clf.best_iteration + 1))

class LightGBMWrapper(object):
    def __init__(self, seed=0, params=None):
        self.params = dict(params) if params else {}
        self.params['feature_fraction_seed'] = seed
        self.params['bagging_seed'] = seed
        
    def train(self, X_train, y_train, X_val, y_val, num_boost_round=100000, early_stopping_rounds=100, verbose_eval=100):
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_val, y_val)

        # Note: in LightGBM >= 4.0, verbose_eval and early_stopping_rounds were removed
        # from lgb.train in favor of the lgb.log_evaluation / lgb.early_stopping callbacks
        self.clf = lgb.train(params=self.params,
                             train_set=lgb_train,
                             valid_sets=[lgb_train, lgb_valid],
                             num_boost_round=num_boost_round,
                             verbose_eval=verbose_eval,
                             early_stopping_rounds=early_stopping_rounds)
        
    def predict(self, x):
        return self.clf.predict(x)

class CatboostWrapper(object):
    def __init__(self, seed=0, params=None):
        self.params = dict(params) if params else {}
        self.params['random_seed'] = seed

    def train(self, X_train, y_train, X_val, y_val, num_boost_round=100000, early_stopping_rounds=100, verbose=100):
        cb_train = cb.Pool(X_train, y_train)
        cb_valid = cb.Pool(X_val, y_val)

        self.clf = cb.train(dtrain=cb_train,
                            params=self.params,
                            evals=[cb_train, cb_valid],
                            num_boost_round=num_boost_round,
                            early_stopping_rounds=early_stopping_rounds,
                            verbose=verbose)
    
    def predict(self, x):
        # CatBoost's default prediction_type is raw scores; request probabilities
        # (shape (n, 2) for binary) and keep the positive class to match the other wrappers
        return self.clf.predict(x, prediction_type='Probability')[:, 1]

class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, X_train, y_train, X_val, y_val):
        # X_val / y_val are accepted only to match the other wrappers' interface
        self.clf.fit(X_train, y_train)
    
    def predict(self, x, proba=True):
        if proba:
            return self.clf.predict_proba(x)[:,1]
        else:
            return self.clf.predict(x)

Getting out-of-fold predictions with K-Fold

Each base model is trained NFOLDS times: predictions on the held-out fold are stitched together into one new training-set feature, and the NFOLDS test-set predictions are averaged into the matching test feature.

def get_oof(clf, X_train, y_train, X_test, SEED=1995, NFOLDS=5):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    oof_train = np.zeros((len(X_train),))
    oof_test = np.zeros((len(X_test),))
    oof_test_skf = np.empty((NFOLDS, len(X_test)))

    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print('------Fold:', i)
        # kf.split returns positional indices, so use iloc rather than loc
        # (loc only happens to work when the index is a clean 0..n-1 RangeIndex)
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        clf.train(X_tr, y_tr, X_val, y_val)

        # Out-of-fold predictions for the training set; per-fold predictions for the test set
        oof_train[val_index] = clf.predict(X_val)
        oof_test_skf[i, :] = clf.predict(X_test)

    # Average the NFOLDS test-set predictions into a single column
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
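
As a quick sanity check, here is a minimal, self-contained sketch of how get_oof is meant to be called. The synthetic data and the small parameter dict are illustrative only, not from the original post:

from sklearn.datasets import make_classification

# Hypothetical toy data, just to exercise get_oof end to end
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_demo, X_te_demo = pd.DataFrame(X[:400]), pd.DataFrame(X[400:])
y_demo = pd.Series(y[:400])

clf_demo = SklearnWrapper(clf=RandomForestClassifier, params={'n_estimators': 50})
oof_tr, oof_te = get_oof(clf_demo, X_demo, y_demo, X_te_demo)
print(oof_tr.shape, oof_te.shape)  # (400, 1) (100, 1): one stacked feature column each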

Example

Input parameters

xgb_params = {
    'objective': 'binary:logistic',  # added: xgboost defaults to regression; needed for probability output
    'colsample_bytree': 0.8498307591268144,
    'gamma': 1.402103937415227,
    'max_depth': 9,
    'min_child_weight': 13.372782085406131,
    'eval_metric': 'error'
}


rf_params = {
    'n_jobs': 4,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
}

lightgbm_params = {
    'bagging_fraction': 0.6538598486733266,
    'bagging_freq': 1,
    'feature_fraction': 0.5230248560467201,
    'max_depth': 11,
    'min_data_in_leaf': 25,
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.01
}

catboost_params = {
    'loss_function': 'Logloss',  # added: cb.train defaults to RMSE regression; Logloss makes this a binary classifier
    'learning_rate': 0.01,
    'depth': 3,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

Creating the base classifiers

clf_xgb = XgbWrapper(params=xgb_params)
clf_lgb = LightGBMWrapper(params=lightgbm_params)
clf_cb = CatboostWrapper(params=catboost_params)
clf_rf = SklearnWrapper(clf=RandomForestClassifier, params=rf_params)
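
ExtraTreesClassifier is imported at the top but never used in the original example; it drops into the same wrapper. The et_params dict below is a hypothetical placeholder, not a tuned setting from the post:

# Hypothetical parameters for an extra-trees base model
et_params = {
    'n_jobs': 4,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 6,
}
clf_et = SklearnWrapper(clf=ExtraTreesClassifier, params=et_params)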

K-Fold

X_tr_val, tr_features, label_gender, and X_te below are the author's own dataset variables, defined outside this snippet.

xgb_oof_train, xgb_oof_test = get_oof(clf_xgb,
                                      X_tr_val[tr_features],
                                      X_tr_val[label_gender],
                                      X_te)

rf_oof_train, rf_oof_test = get_oof(clf_rf,
                                    X_tr_val[tr_features],
                                    X_tr_val[label_gender],
                                    X_te)

lgb_oof_train, lgb_oof_test = get_oof(clf_lgb,
                                      X_tr_val[tr_features],
                                      X_tr_val[label_gender],
                                      X_te)

cb_oof_train, cb_oof_test = get_oof(clf_cb,
                                    X_tr_val[tr_features],
                                    X_tr_val[label_gender],
                                    X_te)

Creating the final dataset

Each base model contributes one out-of-fold prediction column, so the stacked training set has shape (n_samples, 4).

final_X_train = np.concatenate((xgb_oof_train, lgb_oof_train, cb_oof_train, rf_oof_train), axis=1)
final_X_test = np.concatenate((xgb_oof_test, lgb_oof_test, cb_oof_test, rf_oof_test), axis=1)

The final classifier

final_clf = LogisticRegression()
final_clf.fit(final_X_train, X_tr_val[label_gender])
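
The post stops after fitting the level-2 model. To actually score the test set, pass the stacked test features through it; a minimal sketch:

# Predicted positive-class probabilities for the test set,
# using the stacked out-of-fold features built above
final_pred = final_clf.predict_proba(final_X_test)[:, 1]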