Optuna是机器学习调参的工具。
基本概念:
study:根据目标函数的优化 session,由一系列的 trial 组成
trial:根据目标函数作出的一次执行
搜索方式(含有low和high值的都是左闭右开[low, high))
- 选择型搜索(选择其中一个)
trial.suggest_categorical('objective',['rank:pairwise','binary:hinge'])
- 整型搜索([1,10)之间的整数) trial.suggest_int('max_depth',1,10)
- 连续均匀采样搜索([1e-3,1)之间的均匀采样) trial.suggest_uniform('lambda',1e-3, 1.0)
- 对数均匀采样([1e-3,1)之间的对数均匀采样) trial.suggest_loguniform('lambda',1e-3, 1)
- 离散均匀采样([1e-3,1)之间的离散均匀采样,步长为0.05) trial.suggest_discrete_uniform('drop_path_rate',1e-3, 1, 0.05)
实战例子
接下来以Titanic数据集为例,使用optuna对xgboost调参。
首先,导入需要的包
import xgboost as xgb
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import time
加载数据集
# Load the Titanic competition files; `sample` is the example submission format.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./gender_submission.csv")
数据预处理(这里不是重点)
# --- Feature engineering -------------------------------------------------
# Concatenate train and test so encoding / imputation is applied uniformly.
data = pd.concat([train, test], sort=False)

# Binary-encode Sex (male -> 0, female -> 1).
data['Sex'] = data['Sex'].replace(['male', 'female'], [0, 1])

# Impute the most frequent embarkation port ('S'), then map ports to ints.
# (Assignment form instead of chained `inplace=True`, which is unreliable
# under pandas copy-on-write.)
data['Embarked'] = data['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Impute missing fares with the mean fare, then scale down by 50.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['fare_value'] = data['Fare'] / 50

# Impute missing ages with a random draw from [mean - std, mean + std).
# NOTE(review): a single shared random value fills every missing Age —
# a per-row draw may have been intended; confirm.
age_avg = data['Age'].mean()
age_std = data['Age'].std()
data['Age'] = data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std))
data['age_value'] = data['Age'] / 50

# Scaled family size and a flag for passengers travelling alone.
data['family'] = (data['SibSp'] + data['Parch']) / 5
data['isAlone'] = 0
# BUGFIX: previously `isAlone` was set to 1 when family > 0 — i.e. exactly
# when the passenger was NOT alone. A passenger is alone when they have no
# siblings/spouse/parents/children aboard (family == 0).
data.loc[data['family'] == 0, 'isAlone'] = 1

# Drop raw columns that have been replaced by engineered features.
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Age', 'Fare']
data.drop(delete_columns, axis=1, inplace=True)

# Split back into train / test rows and separate the target label.
train = data[:len(train)]
test = data[len(train):]
target = train['Survived'].astype(int)
data = train.drop('Survived', axis=1)
test = test.drop('Survived', axis=1)
columns = data.columns.to_list()
创建objective
函数
def objective(trial, data=data, target=target):
    """Optuna objective: train an XGBClassifier with sampled hyperparameters
    and return the validation RMSE (the study minimizes this value).

    Args:
        trial: optuna.trial.Trial used to sample hyperparameters.
        data: feature DataFrame (defaults to the module-level features).
        target: label Series (defaults to the module-level target).

    Returns:
        float: RMSE of the 0/1 predictions on a held-out 20% split.
    """
    # Fixed split so every trial is evaluated on the same validation data.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42)
    param = {
        'objective': trial.suggest_categorical(
            'objective', ['rank:pairwise', 'binary:hinge', 'reg:logistic']),
        'tree_method': trial.suggest_categorical('tree_method', ['gpu_hist']),  # train on GPU
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the sampled distribution is identical.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1, log=True),
        'learning_rate': trial.suggest_categorical(
            'learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': trial.suggest_categorical('n_estimators', [1000, 2000, 4000, 8000]),
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2021]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'use_label_encoder': trial.suggest_categorical('use_label_encoder', [False]),
    }
    model = xgb.XGBClassifier(**param)
    # NOTE(review): `early_stopping_rounds` as a fit() kwarg is deprecated in
    # xgboost >= 1.7 (moved to the estimator constructor); kept for
    # compatibility with the version this tutorial targets.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)],
              early_stopping_rounds=100, verbose=False)
    preds = model.predict(test_x)
    rmse = mean_squared_error(test_y, preds, squared=False)
    return rmse
启动优化任务
# Create a study that minimizes the objective (validation RMSE) and
# search for 200 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200) # run 200 trials
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
完成后,看一下性能表格
# Summarize all trials (parameters, objective values, states) as a DataFrame.
study.trials_dataframe()
显示每个trial的分数变化,以及最优分数
# Objective value of each trial plus the running best value.
optuna.visualization.plot_optimization_history(study)
平行坐标图查看参数的变化
# Parallel-coordinate view of parameter combinations vs. objective value.
optuna.visualization.plot_parallel_coordinate(study)
查看每个参数的搜索过程
# Per-parameter slice plots of the search process.
optuna.visualization.plot_slice(study)
查看两个参数的交互优化变化
# Contour plot of the interaction between two selected parameters.
optuna.visualization.plot_contour(study, params=['lambda','learning_rate'])
查看参数在优化中的重要程度
# Relative importance of each hyperparameter in the optimization.
optuna.visualization.plot_param_importances(study)
可视化empirical distribution function
# Empirical distribution function (EDF) of the objective values.
optuna.visualization.plot_edf(study)
最优的一组参数
# Best parameter set found by the study (used for the final model below).
Best_trial= study.best_trial.params
print(Best_trial)
{'objective': 'binary:hinge', 'tree_method': 'gpu_hist', 'lambda': 0.6094037979615238, 'alpha': 0.04482620630937828, 'colsample_bytree': 0.7804319534229214, 'subsample': 0.3954426222022122, 'learning_rate': 0.014, 'n_estimators': 4000, 'max_depth': 7, 'random_state': 48, 'min_child_weight': 1, 'use_label_encoder': False}
最后,可以拿最优参数做出预测
# --- Final model: 5-fold CV with the best parameters ----------------------
# (Indentation restored: the pasted snippet had lost the loop body's indent,
# and the per-fold RMSE print belongs inside the loop — five values are
# reported, one per fold.)
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=48, shuffle=True)
for trn_idx, val_idx in kf.split(train[columns], train['Survived']):
    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[val_idx]
    y_tr, y_val = train['Survived'].iloc[trn_idx], train['Survived'].iloc[val_idx]
    model = xgb.XGBClassifier(**Best_trial)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
              early_stopping_rounds=100, verbose=False)
    # Average each fold's test-set predictions into the final ensemble.
    preds += model.predict(test[columns]) / kf.n_splits
    # Report this fold's validation RMSE.
    rmse = mean_squared_error(y_val, model.predict(X_val), squared=False)
    print(rmse)
0.46074993520704494
0.3821876708246056
0.41053541362798
0.39661488976905146
0.423999152002544