# Reusable code
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import KFold
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Wrappers for the various base classifiers, providing a unified calling convention:
#   __init__: takes a `params` dict (and a random seed)
#   train:    takes X_train, y_train, X_val, y_val
class XgbWrapper(object):
    """Thin wrapper around XGBoost exposing the unified train/predict interface.

    Holds a copy of the hyper-parameter dict with the seed injected, trains
    with early stopping on a validation set, and predicts from raw arrays.
    """

    def __init__(self, seed=0, params=None):
        # Copy so the caller's dict is never mutated, and tolerate the
        # default params=None (the original crashed on it).
        self.params = dict(params) if params is not None else {}
        self.params['seed'] = seed

    def train(self, X_train, y_train, X_val, y_val, num_boost_round=100000, early_stopping_rounds=100, verbose_eval=100):
        """Fit a booster, early-stopping on the validation set.

        The fitted booster is stored on ``self.clf``.
        """
        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_valid = xgb.DMatrix(X_val, label=y_val)
        self.clf = xgb.train(params=self.params,
                             dtrain=xgb_train,
                             evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                             num_boost_round=num_boost_round,
                             verbose_eval=verbose_eval,
                             early_stopping_rounds=early_stopping_rounds,
                             )

    def predict(self, x):
        """Return booster predictions for feature matrix ``x``."""
        return self.clf.predict(xgb.DMatrix(x))
class LightGBMWrapper(object):
    """Thin wrapper around LightGBM exposing the unified train/predict interface."""

    def __init__(self, seed=0, params=None):
        # Copy so the caller's dict is never mutated, and tolerate the
        # default params=None (the original crashed on it).
        self.params = dict(params) if params is not None else {}
        self.params['feature_fraction_seed'] = seed
        self.params['bagging_seed'] = seed

    def train(self, X_train, y_train, X_val, y_val, num_boost_round=100000, early_stopping_rounds=100, verbose_eval=100):
        """Fit a booster, early-stopping on the validation set.

        NOTE(review): `verbose_eval`/`early_stopping_rounds` keyword args were
        removed from `lgb.train` in LightGBM 4.x (replaced by callbacks) —
        confirm the pinned LightGBM version supports this call signature.
        """
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_val, y_val)
        self.clf = lgb.train(params=self.params,
                             train_set=lgb_train,
                             valid_sets=[lgb_train, lgb_valid],
                             num_boost_round=num_boost_round,
                             verbose_eval=verbose_eval,
                             early_stopping_rounds=early_stopping_rounds,
                             )

    def predict(self, x):
        """Return booster predictions for feature matrix ``x``."""
        return self.clf.predict(x)
class CatboostWrapper(object):
    """Thin wrapper around CatBoost exposing the unified train/predict interface."""

    def __init__(self, seed=0, params=None):
        # Copy so the caller's dict is never mutated, and tolerate the
        # default params=None (the original crashed on it).
        self.params = dict(params) if params is not None else {}
        self.params['random_seed'] = seed

    def train(self, X_train, y_train, X_val, y_val, num_boost_round=100000, early_stopping_rounds=100, verbose=100):
        """Fit a model, early-stopping on the validation pool.

        The fitted model is stored on ``self.clf``.
        """
        cb_train = cb.Pool(X_train, y_train)
        cb_valid = cb.Pool(X_val, y_val)
        self.clf = cb.train(dtrain=cb_train,
                            params=self.params,
                            evals=[cb_train, cb_valid],
                            num_boost_round=num_boost_round,
                            early_stopping_rounds=early_stopping_rounds,
                            verbose=verbose
                            )

    def predict(self, x):
        """Return model predictions for feature matrix ``x``."""
        return self.clf.predict(x)
class SklearnWrapper(object):
    """Wrapper giving sklearn-style estimators the unified train/predict interface.

    ``clf`` is the estimator *class* (e.g. RandomForestClassifier); it is
    instantiated with ``params`` plus ``random_state=seed``.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so the caller's dict is never mutated, and tolerate the
        # default params=None (the original crashed on it).
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, X_train, y_train, X_val, y_val):
        # X_val/y_val are accepted only for interface parity with the boosting
        # wrappers; sklearn estimators do not use a validation set in fit().
        self.clf.fit(X_train, y_train)

    def predict(self, x, proba=True):
        """Return P(class 1) by default; raw class labels when proba=False."""
        if proba:
            return self.clf.predict_proba(x)[:, 1]
        else:
            return self.clf.predict(x)
# Get out-of-fold predictions via K-Fold cross-validation
def get_oof(clf, X_train, y_train, X_test, SEED=1995, NFOLDS=5):
    """Compute out-of-fold (OOF) train predictions and averaged test predictions.

    Parameters
    ----------
    clf : wrapper exposing ``train(X_tr, y_tr, X_val, y_val)`` and ``predict(x)``
    X_train, y_train : pandas DataFrame / Series of training data
    X_test : pandas DataFrame of test features
    SEED : random_state for the KFold shuffle
    NFOLDS : number of folds

    Returns
    -------
    (oof_train, oof_test) : two column vectors of shape (n_train, 1) and
    (n_test, 1), suitable for stacking as meta-features.
    """
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    oof_train = np.zeros((len(X_train),))
    oof_test = np.zeros((len(X_test),))
    oof_test_skf = np.empty((NFOLDS, len(X_test)))
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print('------Fold:', i)
        # BUG FIX: KFold yields *positional* indices, so use .iloc, not .loc.
        # .loc only worked by accident with a default RangeIndex and breaks
        # (or silently mis-selects) on any other index.
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        clf.train(X_tr, y_tr, X_val, y_val)
        oof_train[val_index] = clf.predict(X_val)
        oof_test_skf[i, :] = clf.predict(X_test)
    # Average the per-fold test predictions.
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
# --- Example ---
# Input hyper-parameters
# Hyper-parameters for each base learner (values presumably came from a
# prior tuning run — TODO confirm).
xgb_params = {
'colsample_bytree': 0.8498307591268144,
'gamma': 1.402103937415227,
'max_depth': 9,
'min_child_weight': 13.372782085406131,
'eval_metric': 'error'
}
rf_params = {
'n_jobs': 4,
'n_estimators': 100,
'max_features': 0.2,
'max_depth': 6,
'min_samples_leaf': 2,
}
lightgbm_params = {
'bagging_fraction': 0.6538598486733266,
'bagging_freq': 1,
'feature_fraction': 0.5230248560467201,
'max_depth': 11,
'min_data_in_leaf': 25,
'objective': 'binary',
'metric': 'binary_error',
'learning_rate': 0.01
}
catboost_params = {
'learning_rate': 0.01,
'depth': 3,
'bootstrap_type': 'Bernoulli',
'subsample': 0.7,
'eval_metric': 'AUC',
'od_type': 'Iter',
'allow_writing_files': False
}
# Create the base classifiers
# One wrapper instance per base learner (each uses the default seed=0).
clf_xgb = XgbWrapper(params = xgb_params)
clf_lgb = LightGBMWrapper(params = lightgbm_params)
clf_cb = CatboostWrapper(params = catboost_params)
clf_rf = SklearnWrapper(clf=RandomForestClassifier, params=rf_params)
# K-Fold out-of-fold predictions for each base learner
# Run K-Fold OOF prediction once per base learner.
# NOTE(review): X_tr_val, tr_features, label_gender and X_te are defined
# elsewhere — presumably the train/validation frame, its feature columns,
# the target column name, and the test frame; verify against that code.
xgb_oof_train, xgb_oof_test = get_oof(clf_xgb,
X_tr_val[tr_features],
X_tr_val[label_gender],
X_te
)
rf_oof_train, rf_oof_test = get_oof(clf_rf,
X_tr_val[tr_features],
X_tr_val[label_gender],
X_te
)
lgb_oof_train, lgb_oof_test = get_oof(clf_lgb,
X_tr_val[tr_features],
X_tr_val[label_gender],
X_te
)
cb_oof_train, cb_oof_test = get_oof(clf_cb,
X_tr_val[tr_features],
X_tr_val[label_gender],
X_te
)
# Build the final (meta-level) dataset
# Stack the four OOF column vectors side by side: one meta-feature per learner.
final_X_train = np.concatenate((xgb_oof_train, lgb_oof_train, cb_oof_train, rf_oof_train), axis=1)
final_X_test = np.concatenate((xgb_oof_test, lgb_oof_test, cb_oof_test, rf_oof_test), axis=1)
# Final (meta-level) classifier
# Fit the level-2 model on the OOF meta-features (classic stacking).
final_clf = LogisticRegression()
final_clf.fit(final_X_train,X_tr_val[label_gender])