0 介绍
这篇文章记录特征工程代码,详细的特征工程列表可以查看Feature Engineering 特征工程 操作汇总
1 数据预处理
1.1 导入需要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
1.2 读入数据集
# Load the raw training set and the A-stage test set from local CSV files.
data_train =pd.read_csv('./train.csv')
data_test_a = pd.read_csv('./testA.csv')
1.3 删除不需要的数据
# Remove the row identifier and the constant policyCode column from both sets.
for df in (data_train, data_test_a):
    df.drop(columns=['id', 'policyCode'], inplace=True)
1.4 取出数值特征和分类特征
# Partition columns into numeric features and object-dtype (categorical)
# features, then keep the label out of the numeric list.
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
category_fea = [c for c in data_train.columns if c not in numerical_fea]
label = 'isDefault'
numerical_fea.remove(label)
1.5 缺失值填充
# Fill numeric features with the TRAINING-set median (train statistics are
# reused for the test set so no test information leaks into preprocessing).
# (The original comment said "mean" but the code uses the median.)
median_vals = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(median_vals)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(median_vals)
# Fill categorical features with the training-set mode.
# BUG FIX: DataFrame.mode() returns a DataFrame; fillna() with a DataFrame
# aligns on the row index and effectively only fills row 0. Take .iloc[0]
# to obtain a Series of per-column modes, which fillna broadcasts correctly.
mode_vals = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(mode_vals)
data_test_a[category_fea] = data_test_a[category_fea].fillna(mode_vals)
1.6 日期特征处理
查看一下 issueDate,对日期特征做处理:将 data_train['issueDate'] 转换为距离最早日期的天数。
#转化成时间格式
# Convert issueDate to a count of days since a fixed reference date.
# Hoisted out of the loop: the reference date is loop-invariant.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    # Parse the raw 'YYYY-MM-DD' strings into datetimes.
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Vectorized subtraction instead of a per-row apply — same values, faster.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
1.7 工作年限 employmentLength 处理为数值
# Inspect the raw employmentLength distribution, including NaN counts.
data_train['employmentLength'].value_counts(dropna=False).sort_index()
1 year 52489
10+ years 262753
2 years 72358
3 years 64152
4 years 47985
5 years 50102
6 years 37254
7 years 35407
8 years 36192
9 years 30272
< 1 year 64237
NaN 46799
Name: employmentLength, dtype: int64
def employmentLength_to_int(s):
    """Convert an employmentLength string such as '5 years' to np.int8(5).

    NaN inputs are passed through unchanged.
    """
    return s if pd.isnull(s) else np.int8(s.split()[0])
# Normalize the employmentLength text, then convert it to numeric years:
# '10+ years' is capped at 10, '< 1 year' becomes 0.
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(
        {'10+ years': '10 years', '< 1 year': '0 years'}, inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
# Inspect the converted distribution (note: `data` here is data_test_a,
# the final value of the loop variable above).
data['employmentLength'].value_counts(dropna=False).sort_index()
0.0 15989
1.0 13182
2.0 18207
3.0 16011
4.0 11833
5.0 12543
6.0 9328
7.0 8823
8.0 8976
9.0 7594
10.0 65772
NaN 11742
Name: employmentLength, dtype: int64
1.8 日期特征:earliesCreditLine 处理为年份
# Peek at raw earliesCreditLine values (format looks like 'Mon-YYYY').
data_train['earliesCreditLine'].sample(5)
244911 Mar-1984
269166 Aug-2003
740556 Sep-1998
182701 Jan-2012
704652 Dec-2004
Name: earliesCreditLine, dtype: object
# Keep only the 4-digit year from earliesCreditLine ('Mar-1984' -> 1984).
for df in (data_train, data_test_a):
    df['earliesCreditLine'] = df['earliesCreditLine'].str[-4:].astype(int)
1.9 分类特征处理
# A subset of the categorical features, for cardinality inspection.
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership',
                 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'applicationType', 'initialListStatus', 'title']
for f in cate_features:
    # BUG FIX: the original read `data`, a stale variable left pointing at the
    # last DataFrame of an earlier loop; the training set is what should be
    # inspected here, so reference it explicitly.
    print(f, '类型数:', data_train[f].nunique())
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 79282
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 889
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 12058
1.9.1 处理 grade 特征
# Ordinal-encode grade: A -> 1 ... G -> 7 (grades are naturally ordered).
grade_map = {g: i for i, g in enumerate('ABCDEFG', start=1)}
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map(grade_map)
1.9.2 处理分类数>2,但不太高维的分类特征
做one-hot化
# One-hot encode the moderate-cardinality categorical columns.
# NOTE(review): `data = pd.get_dummies(data, ...)` only rebinds the loop
# variable — the returned DataFrame is discarded, so data_train/data_test_a
# are NOT actually one-hot encoded here. Fixing this requires reassigning the
# globals, but later steps (e.g. label-encoding 'subGrade' below) depend on
# these columns still existing, so the pipeline order must be reworked first.
for data in [data_train, data_test_a]:
data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
2 数值特征处理
2.1 处理异常值
def find_outliers_by_3sigma(data, fea):
    """Flag values of column `fea` lying outside mean ± 3*std.

    Adds a column `<fea>_outliers` holding 'abnormal'/'normal' labels and
    returns the (mutated) DataFrame.
    """
    # Population standard deviation (np.std default ddof=0) and mean.
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    # 3-sigma cut-off and the resulting lower/upper bounds.
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # Vectorized labelling instead of a per-row apply — identical results
    # (NaN compares False on both sides, so NaN rows are labelled 'normal').
    data[fea + '_outliers'] = np.where(
        (data[fea] > upper_rule) | (data[fea] < lower_rule), 'abnormal', 'normal')
    return data
查看异常值数量
# For each numeric feature: flag outliers, then report the flag counts and
# the number of defaults within the normal/abnormal groups.
for fea in numerical_fea:
    data_train = find_outliers_by_3sigma(data_train, fea)
    flag_col = fea + '_outliers'
    print(data_train[flag_col].value_counts())
    print(data_train.groupby(flag_col)['isDefault'].sum())
    print('*' * 10)
删除异常值,假设异常值比例小于10%
# Drop training rows flagged abnormal, but only for features whose abnormal
# share is below 10% (the assumption stated in the write-up).
for col in numerical_fea:
    percent_of_abnormal = sum(data_train[col + '_outliers'] == 'abnormal') / len(data_train)
    if percent_of_abnormal < 0.1:
        print('Dealing with', col)
        # BUG FIX: the original filtered on `fea + '_outliers'` — a stale
        # variable left over from the previous loop — so every iteration
        # re-filtered the same single column. Filter on the current column.
        data_train = data_train[data_train[col + '_outliers'] == 'normal']
        data_train = data_train.reset_index(drop=True)
    else:
        print('Not dealing with', col)
3 数值类型分箱
# Binning of loanAmnt. Alternatives kept for reference:
# Fixed-width bins (each bin spans loanAmnt/1000):
#   data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
# Exponential-width bins via a log transform:
#   data['loanAmnt_bin2'] = np.floor(np.sign(data['loanAmnt']) * np.log10(np.abs(data['loanAmnt']) + 1))
# Quantile bins (used here): 10 equal-frequency buckets.
# BUG FIX: the original wrote through `data`, a stale loop variable (pointing
# only at the last DataFrame iterated earlier); bin both datasets explicitly.
for data in [data_train, data_test_a]:
    data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
4 特征生成
4.1 分类特征根据欺诈比例,生成概率特征
# Target-mean encode grade/subGrade: map each category to its training-set
# default rate, and apply the same (train-derived) mapping to the test set.
for col in ['grade', 'subGrade']:
    target_mean = data_train.groupby(col)['isDefault'].mean().to_dict()
    data_train[col + '_target_mean'] = data_train[col].map(target_mean)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(target_mean)
4.2 交叉特征:根据 grade 与数值特征生成交叉的均值和标准差
# Cross features: ratio of grade to the group mean/std of grade within each
# bucket of the anonymous n0..n14 features.
n_cols = [f'n{i}' for i in range(15)]
for df in [data_train, data_test_a]:
    for item in n_cols:
        grp = df.groupby(item)['grade']
        df[f'grade_to_mean_{item}'] = df['grade'] / grp.transform('mean')
        df[f'grade_to_std_{item}'] = df['grade'] / grp.transform('std')
5 特征转换
# Label-encode the high-cardinality categorical columns
# (employmentTitle, postCode, title, subGrade). The encoder is fit on the
# union of train and test values so both sets share one consistent mapping.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    combined = data_train[col].astype(str).tolist() + data_test_a[col].astype(str).tolist()
    le.fit(combined)
    data_train[col] = le.transform(data_train[col].astype(str).tolist())
    data_test_a[col] = le.transform(data_test_a[col].astype(str).tolist())
print('Label Encoding 完成')
6 其他处理
6.1 删除不需要的数据
# Drop low-value columns identified earlier (applicationType, n11, n12).
for df in (data_train, data_test_a):
    df.drop(columns=['applicationType', 'n11', 'n12'], inplace=True)
6.2 重置一下数值数据的列
# Re-declare the numeric feature list after the transformations above
# (dropped columns removed; engineered date/length columns handled elsewhere).
numerical_fea = ['loanAmnt', 'term', 'interestRate', 'installment',
'employmentTitle', 'homeOwnership', 'annualIncome',
'verificationStatus', 'purpose', 'postCode', 'regionCode',
'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh',
'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil',
'totalAcc', 'initialListStatus', 'title', 'n0', 'n1', 'n2', 'n3',
'n4','n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n13', 'n14'
]
6.3 删除不需要的特征
# issueDate has been converted to issueDateDT; drop the raw datetime column.
for df in (data_train, data_test_a):
    df.drop(columns=['issueDate'], inplace=True)
6.4 纵向填补缺失值
"纵向用缺失值上面的值替换缺失值"
data_train = data_train.fillna(axis=0,method='ffill')
7 分割出feature和target
# Assemble the modelling matrices: exclude identifiers, the label, and the
# helper *_outliers flag columns from the feature set.
excluded = {'id', 'issueDate', 'isDefault'}
features = [c for c in data_train.columns
            if c not in excluded and '_outliers' not in c]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']
8 处理好的数据集保存起来
# Persist the fully processed datasets for the modelling stage.
data_train.to_csv('./ok_train.csv',index=None)
data_test_a.to_csv('./ok_testA.csv',index=None)