Optuna is a hyperparameter optimization framework for machine learning.

Basic concepts:

  • study: an optimization session over an objective function, made up of a series of trials
  • trial: a single evaluation of the objective function

Search methods (per the Optuna docs, the float-based suggest calls below sample from the half-open range [low, high); suggest_int is the exception and includes both endpoints). A runnable toy example follows this list.

  • Categorical search (pick one of the given choices)
    • trial.suggest_categorical('objective', ['rank:pairwise', 'binary:hinge'])
  • Integer search (an integer in [1, 10], both endpoints included)
    • trial.suggest_int('max_depth', 1, 10)
  • Continuous uniform sampling (uniform sampling over [1e-3, 1))
    • trial.suggest_uniform('lambda', 1e-3, 1.0)
  • Log-uniform sampling (log-uniform sampling over [1e-3, 1))
    • trial.suggest_loguniform('lambda', 1e-3, 1)
  • Discrete uniform sampling (discrete uniform sampling over [1e-3, 1] with a step of 0.05)
    • trial.suggest_discrete_uniform('drop_path_rate', 1e-3, 1, 0.05)
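
To see the pieces working together, here is a minimal, self-contained sketch; the toy_objective function and its parameters x and k are illustrative only and not part of the Titanic example below:

import optuna

def toy_objective(trial):
    x = trial.suggest_uniform('x', -10.0, 10.0)  # continuous uniform sample
    k = trial.suggest_int('k', 1, 5)             # integer, both endpoints included
    return (x - k) ** 2                          # minimized when x is close to k

toy_study = optuna.create_study(direction='minimize')
toy_study.optimize(toy_objective, n_trials=20)
print(toy_study.best_params)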

A worked example

Next, we take the Titanic dataset as an example and use Optuna to tune XGBoost.

First, import the required packages

import xgboost as xgb
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import time

Load the dataset

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./gender_submission.csv")

Data preprocessing (not the focus here)

data = pd.concat([train, test], sort=False)

data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)  # encode sex as 0/1
data['Embarked'].fillna('S', inplace=True)  # fill missing ports with the most common value
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)  # fill missing fares with the mean
data['fare_value'] = data['Fare'] / 50  # crude rescaling
age_avg = data['Age'].mean()
age_std = data['Age'].std()
# note: this fills every missing age with the same single draw from [mean - std, mean + std)
data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)
data['age_value'] = data['Age'] / 50  # crude rescaling
data['family'] = (data['SibSp'] + data['Parch']) / 5  # family size, rescaled
data['isAlone'] = 0
data.loc[data['family'] > 0, 'isAlone'] = 1  # note: despite the name, 1 means the passenger has family aboard
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Age', 'Fare']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

target = train['Survived'].astype(int)
data = train.drop('Survived',axis=1)
test = test.drop('Survived',axis=1)

columns=data.columns.to_list()

Define the objective function

def objective(trial, data=data, target=target):

    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=42)
    param = {
        'objective': trial.suggest_categorical('objective', ['rank:pairwise', 'binary:hinge', 'reg:logistic']),
        'tree_method': trial.suggest_categorical('tree_method', ['gpu_hist']),  # use the GPU
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_loguniform('subsample', 0.1, 1),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': trial.suggest_categorical('n_estimators', [1000, 2000, 4000, 8000]),
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2021]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'use_label_encoder': trial.suggest_categorical('use_label_encoder', [False])
    }
    model = xgb.XGBClassifier(**param)
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
    preds = model.predict(test_x)
    # predict returns class labels here, so this RMSE is essentially the square root of the error rate
    rmse = mean_squared_error(test_y, preds, squared=False)

    return rmse

Launch the optimization

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)  # run 200 trials
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
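
The best objective value itself is also available on the study object:

print('Best value:', study.best_value)  # the lowest RMSE found across all trials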

When the optimization finishes, take a look at the results table

study.trials_dataframe()
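
Since trials_dataframe() returns an ordinary pandas DataFrame (with the objective stored in the value column), the usual pandas operations apply; for example, to list the best trials:

df = study.trials_dataframe()
print(df.sort_values('value').head(10))  # the ten lowest-RMSE trials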

Show how each trial's score evolves, along with the best score so far

optuna.visualization.plot_optimization_history(study)
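
These optuna.visualization helpers return plotly figures, which render inline in a notebook; in a plain script, call .show() on the returned figure:

fig = optuna.visualization.plot_optimization_history(study)
fig.show()  # opens the interactive plotly figure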

A parallel-coordinate plot of how the parameters vary across trials

optuna.visualization.plot_parallel_coordinate(study)

Inspect the search history of each individual parameter

optuna.visualization.plot_slice(study)

Inspect how two parameters interact during optimization

optuna.visualization.plot_contour(study, params=['lambda','learning_rate'])

Inspect how important each parameter was to the optimization

optuna.visualization.plot_param_importances(study)

Visualize the empirical distribution function (EDF) of the objective values

optuna.visualization.plot_edf(study)

The best set of parameters

Best_trial = study.best_trial.params
print(Best_trial)
{'objective': 'binary:hinge', 'tree_method': 'gpu_hist', 'lambda': 0.6094037979615238, 'alpha': 0.04482620630937828, 'colsample_bytree': 0.7804319534229214, 'subsample': 0.3954426222022122, 'learning_rate': 0.014, 'n_estimators': 4000, 'max_depth': 7, 'random_state': 48, 'min_child_weight': 1, 'use_label_encoder': False}

Finally, we can make predictions with the best parameters

preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=48, shuffle=True)

for trn_idx, test_idx in kf.split(train[columns], train['Survived']):
    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[test_idx]
    y_tr, y_val = train['Survived'].iloc[trn_idx], train['Survived'].iloc[test_idx]
    model = xgb.XGBClassifier(**Best_trial)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    preds += model.predict(test[columns]) / kf.n_splits  # accumulate the average of the fold predictions
    rmse = mean_squared_error(y_val, model.predict(X_val), squared=False)
    print(rmse)  # per-fold validation RMSE
0.46074993520704494
0.3821876708246056
0.41053541362798
0.39661488976905146
0.423999152002544
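
Since preds holds the fold-averaged 0/1 predictions (five votes per passenger), thresholding at 0.5 gives a majority vote. A minimal sketch for writing a submission, assuming the standard Kaggle format of the gender_submission.csv loaded earlier (PassengerId and Survived columns):

submission = sample.copy()
submission['Survived'] = (preds > 0.5).astype(int)  # majority vote over the five folds
submission.to_csv('submission.csv', index=False)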