0 Introduction

This article records the feature engineering code; for a detailed list of feature engineering operations, see Feature Engineering 特征工程 操作汇总.

1 Data preprocessing

1.1 Import the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

1.2 Read in the datasets

data_train = pd.read_csv('./train.csv')
data_test_a = pd.read_csv('./testA.csv')

1.3 Drop unneeded columns

for data in [data_train, data_test_a]:
    data.drop(['id', 'policyCode'], axis=1, inplace=True)

1.4 Separate numerical and categorical features

numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

1.5 Fill in missing values

# Fill numerical features with the median
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
# Fill categorical features with the mode; .mode() returns a DataFrame, so take its first row
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode().iloc[0])
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode().iloc[0])
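A quick check of which columns still contain missing values helps confirm the fill worked (a minimal sketch):

# Columns with remaining missing values after the fill
remaining_na = data_train.isnull().sum()
print(remaining_na[remaining_na > 0])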

1.6 Date feature processing

Take a look at issueDate to decide how to handle the date feature:

data_train['issueDate']

Convert it to the number of days since the earliest date:

# Convert to datetime format
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    # Construct the time feature: days elapsed since the start date
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x - startdate).dt.days

1.7 Convert employmentLength (employment length) to numeric

data_train['employmentLength'].value_counts(dropna=False).sort_index()
1 year        52489
10+ years    262753
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

data['employmentLength'].value_counts(dropna=False).sort_index()
0.0     15989
1.0     13182
2.0     18207
3.0     16011
4.0     11833
5.0     12543
6.0      9328
7.0      8823
8.0      8976
9.0      7594
10.0    65772
NaN     11742
Name: employmentLength, dtype: int64

1.8 Date feature: extract the year from earliesCreditLine

data_train['earliesCreditLine'].sample(5)
244911    Mar-1984
269166    Aug-2003
740556    Sep-1998
182701    Jan-2012
704652    Dec-2004
Name: earliesCreditLine, dtype: object
# Extract the year as an int
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

1.9 Categorical feature processing

# Some of the categorical features
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title']
for f in cate_features:
    print(f, 'number of categories:', data[f].nunique())
grade number of categories: 7
subGrade number of categories: 35
employmentTitle number of categories: 79282
homeOwnership number of categories: 6
verificationStatus number of categories: 3
purpose number of categories: 14
postCode number of categories: 889
regionCode number of categories: 51
applicationType number of categories: 2
initialListStatus number of categories: 2
title number of categories: 12058

1.9.1 Process the grade feature

for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})
    

1.9.2 Handle categorical features with more than 2 categories but moderate cardinality

Apply one-hot encoding. Note that pd.get_dummies returns a new DataFrame and, as written below, the result is only bound to the loop variable, so data_train and data_test_a themselves are left unchanged; the later steps in this article still rely on the original subGrade, homeOwnership, verificationStatus, purpose and regionCode columns being present.

for data in [data_train, data_test_a]:
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
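If the dummy columns are actually wanted, a minimal sketch of the explicit assignment (the later label-encoding and target-mean steps would then need adjusting, since they still reference subGrade and the other original columns):

one_hot_cols = ['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode']
# Assign the result back so the dummy columns actually replace the originals
data_train = pd.get_dummies(data_train, columns=one_hot_cols, drop_first=True)
data_test_a = pd.get_dummies(data_test_a, columns=one_hot_cols, drop_first=True)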

2 Numerical feature processing

2.1 Handle outliers

def find_outliers_by_3sigma(data, fea):
    # Standard deviation (sigma)
    data_std = np.std(data[fea])
    # Mean
    data_mean = np.mean(data[fea])
    # 3 sigma
    outliers_cut_off = data_std * 3
    # Lower bound
    lower_rule = data_mean - outliers_cut_off
    # Upper bound
    upper_rule = data_mean + outliers_cut_off
    # Flag values outside the bounds as abnormal
    data[fea+'_outliers'] = data[fea].apply(lambda x: 'abnormal' if x > upper_rule or x < lower_rule else 'normal')
    return data

Check the number of outliers for each numerical feature:

for fea in numerical_fea:
    data_train = find_outliers_by_3sigma(data_train,fea)
    print(data_train[fea+'_outliers'].value_counts())
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)

Drop the outlier rows, provided the outlier proportion for that feature is below 10%:

for col in numerical_fea:
    percent_of_abnormal = sum(data_train[col+'_outliers'] == 'abnormal') / len(data_train)
    if percent_of_abnormal < 0.1:
        print('Dealing with', col)
        data_train = data_train[data_train[col+'_outliers'] == 'normal']
        data_train = data_train.reset_index(drop=True)
    else:
        print('Not dealing with', col)

3 Binning numerical features

# Map to equal-width bins by integer division; each bin covers a loanAmnt range of 1000
# data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)

# Map to exponentially growing bin widths via a log transform
# data['loanAmnt_bin2'] = np.floor(np.sign(data['loanAmnt']) * np.log10(np.abs(data['loanAmnt']) + 1))

# Quantile binning (note: data here is still bound to the last DataFrame from the earlier loops, i.e. data_test_a)
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
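To apply the same bins to both the training and test sets, a common approach is to fit the bin edges on the training set and reuse them on the test set; a minimal sketch using pandas' retbins option:

# Fit 10 quantile bin edges on the training set, then apply the same edges to the test set
data_train['loanAmnt_bin3'], bin_edges = pd.qcut(data_train['loanAmnt'], 10, labels=False, retbins=True, duplicates='drop')
# Test values falling outside the training range become NaN here
data_test_a['loanAmnt_bin3'] = pd.cut(data_test_a['loanAmnt'], bins=bin_edges, labels=False, include_lowest=True)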

4 Feature generation

4.1 Generate probability features from categorical features based on the default rate (target mean)

for col in ['grade', 'subGrade']: 
    temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict()

    data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)
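Mapping each training row with a mean computed over the full training set lets a row see its own label, which risks target leakage. An out-of-fold variant is a common safeguard; this is my own addition rather than part of the original pipeline, sketched under the same column names:

from sklearn.model_selection import KFold

# Out-of-fold target-mean encoding: each fold is encoded with means computed on the other folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for col in ['grade', 'subGrade']:
    data_train[col + '_target_mean_oof'] = np.nan
    for trn_idx, val_idx in kf.split(data_train):
        fold_means = data_train.iloc[trn_idx].groupby(col)['isDefault'].mean()
        data_train.loc[data_train.index[val_idx], col + '_target_mean_oof'] = \
            data_train[col].iloc[val_idx].map(fold_means).values
    # The test set uses the mean computed on the full training set
    data_test_a[col + '_target_mean_oof'] = data_test_a[col].map(data_train.groupby(col)['isDefault'].mean())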

4.2 Interaction features: the ratio of grade to its group-wise mean and standard deviation, grouped by each of the n0–n14 features

for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
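If every grade value inside a group is identical, the group standard deviation is 0 and the division above yields inf; replacing such values afterwards is a common safeguard (a sketch):

# Replace inf values produced by dividing by a zero standard deviation
for df in [data_train, data_test_a]:
    df.replace([np.inf, -np.inf], np.nan, inplace=True)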

5 Feature transformation

# Label encoding: employmentTitle, postCode, title, subGrade
# High-cardinality categorical features need to be converted to integers
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding done')

6 Other processing

6.1 Drop unneeded columns

for data in [data_train, data_test_a]:
    data.drop(['applicationType', 'n11', 'n12'], axis=1, inplace=True)

6.2 Reset the list of numerical feature columns

numerical_fea = ['loanAmnt', 'term', 'interestRate', 'installment', 
                 'employmentTitle', 'homeOwnership', 'annualIncome', 
                 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh',
                 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil',
                 'totalAcc', 'initialListStatus', 'title', 'n0', 'n1', 'n2', 'n3',
                 'n4','n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n13', 'n14'
                ]

6.3 Drop features that are no longer needed

for data in [data_train, data_test_a]:
    data.drop(['issueDate'], axis=1, inplace=True)

6.4 Fill remaining missing values column-wise

# Forward-fill down each column: replace a missing value with the value above it
data_train = data_train.fillna(axis=0, method='ffill')
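Forward fill cannot fill a column whose very first row is missing; an optional back-fill handles that edge case (a sketch):

# Optional back-fill for columns whose first row was missing
data_train = data_train.fillna(axis=0, method='bfill')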

7 Split out the features and the target

features = [f for f in data_train.columns if f not in ['id','issueDate','isDefault'] and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']

8 Save the processed datasets

data_train.to_csv('./ok_train.csv',index=None)
data_test_a.to_csv('./ok_testA.csv',index=None)