特征工程
多项式
from sklearn import preprocessing
def polynomial_features(df, feats, poly=2, test_df=None):
    """Expand `feats` into polynomial/interaction features of degree `poly`.

    The transformer is fitted on df[feats] only; the constant bias column
    ('1') is dropped from the output. If test_df is given it is transformed
    with the same fitted transformer.

    Returns:
        new_df, or (new_df, new_test_df) when test_df is not None.
    """
    # BUG FIX: the original immediately shadowed the `poly` degree argument
    # with the transformer object, so the degree was never passed through.
    transformer = preprocessing.PolynomialFeatures(degree=poly)
    transformer.fit(df[feats])
    # get_feature_names was renamed to get_feature_names_out in newer
    # scikit-learn; fall back gracefully so either version works.
    name_fn = getattr(transformer, 'get_feature_names_out',
                      getattr(transformer, 'get_feature_names', None))

    def _expand(frame):
        arr = transformer.transform(frame[feats])
        return pd.DataFrame(arr, columns=name_fn()).drop('1', axis=1)

    new_df = _expand(df)
    if test_df is not None:  # idiomatic None check
        return new_df, _expand(test_df)
    return new_df
# Demo: expand 'a' and 'b' into degree-2 polynomial features.
train_df = pd.DataFrame({'a': [1, 2, 3, 4, 5],
                         'b': [2, 2, 2, 2, 2]})
test_df = pd.DataFrame({'a': [2, 3, 4],
                        'b': [2, 2, 2]})
# BUG FIX: test_df must be passed by keyword — passed positionally (as in
# the original) it lands in the `poly` (degree) parameter.
polynomial_features(train_df, ['a', 'b'], test_df=test_df)
One-Hot(Dummy Variable)
from sklearn import preprocessing
def dummy_variable(df, feats, test_df=None):
    """One-hot (dummy-variable) encode `feats`, fitting categories on df.

    NOTE: df must contain every category that appears in test_df, otherwise
    the transform of test_df raises on the unseen label.

    Returns:
        new_df, or (new_df, new_test_df) when test_df is not None.
    """
    encoder = preprocessing.OneHotEncoder(categories='auto')
    encoder.fit(df[feats])

    def _encode(frame):
        # OneHotEncoder returns a sparse matrix; densify for the DataFrame.
        dense = encoder.transform(frame[feats]).toarray()
        return pd.DataFrame(dense, columns=encoder.get_feature_names())

    new_df = _encode(df)
    if test_df is not None:  # was the unidiomatic type(test_df) != type(None)
        return new_df, _encode(test_df)
    return new_df
# Demo: one-hot encode a single categorical column.
train_df = pd.DataFrame({'x1': ['A', 'A', 'B', 'B', 'C']})
test_df = pd.DataFrame({'x1': ['A', 'B', 'B']})
dummy_variable(train_df, ['x1'], test_df)
缺失值填充 ——Simple Impute
from sklearn.impute import SimpleImputer
def simple_impute(df, feats, test_df=None):
    """Impute missing values in `feats` with sklearn's SimpleImputer
    (default strategy: column mean).

    The imputer is fitted on df only, so test_df is filled using statistics
    learned from df. Note the outputs are numpy arrays, not DataFrames.

    Returns:
        new_df, or (new_df, new_test_df) when test_df is not None.
    """
    imputer = SimpleImputer().fit(df[feats])
    new_df = imputer.transform(df[feats])
    if test_df is not None:  # was the unidiomatic type(test_df) != type(None)
        return new_df, imputer.transform(test_df[feats])
    return new_df
# Demo: NaNs in both frames are filled with the column means learned on train.
train_df = pd.DataFrame({'X1': [1, 2, 3, np.nan, 4, 5],
                         'X2': [10, 11, 12, np.nan, 13, 14]})
test_df = pd.DataFrame({'X1': [2, 4, 6, np.nan],
                        'X2': [3, 4, 5, np.nan]})
simple_impute(train_df, ['X1', 'X2'], test_df)
缺失值填充 ——kNN Impute
from sklearn.impute import KNNImputer
def knn_impute(df, feats, test_df=None, k=5):
    """Impute missing values in `feats` with a k-nearest-neighbours imputer.

    The imputer is fitted on df only; test_df is imputed from df's
    neighbourhood structure. Outputs are numpy arrays.

    Returns:
        new_df, or (new_df, new_test_df) when test_df is not None.
    """
    imputer = KNNImputer(n_neighbors=k).fit(df[feats])
    new_df = imputer.transform(df[feats])
    if test_df is not None:  # was the unidiomatic type(test_df) != type(None)
        return new_df, imputer.transform(test_df[feats])
    return new_df
# Demo: the NaNs are filled from the 3 nearest rows of the training frame.
train_df = pd.DataFrame({'X1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'X2': [10, 11, 12, np.nan, 14, 16, 18, 20, 22, 24]})
test_df = pd.DataFrame({'X1': [2, 4, 6, 8],
                        'X2': [11, 13, 15, np.nan]})
knn_impute(train_df, ['X1', 'X2'], test_df=test_df, k=3)
无穷值填充
def fill_inf(df, feats, test_df=None, method='nan', threshold=None):
    """Replace +/-inf values in `feats`, in place, and return df.

    method:
        'nan'                   -> replace inf with NaN
        'mean'/'median'/'mode'  -> replace inf with that statistic of the
                                   finite (non-inf) values of the column
        'threshold'             -> -inf -> threshold[0], +inf -> threshold[1];
                                   when threshold is None, use the finite
                                   min/max of the column instead

    Any other method leaves the column untouched (original behaviour).

    NOTE(review): test_df is accepted but never used — kept only for
    signature consistency with the other helpers; confirm intent.
    """
    inf_vals = [np.inf, -np.inf]
    for feat in feats:
        # Finite view of the column (inf mapped to NaN) — the source of the
        # fill statistics, computed once per feature instead of per branch.
        finite = df[feat].replace(inf_vals, np.nan)
        if method == 'nan':
            df[feat] = finite
        elif method == 'mean':
            df[feat] = df[feat].replace(inf_vals, finite.mean())
        elif method == 'median':
            df[feat] = df[feat].replace(inf_vals, finite.median())
        elif method == 'mode':
            df[feat] = df[feat].replace(inf_vals, finite.mode()[0])
        elif method == 'threshold':
            if threshold is not None:  # was type(threshold) != type(None)
                df[feat] = df[feat].replace(-np.inf, threshold[0])
                df[feat] = df[feat].replace(np.inf, threshold[1])
            else:
                df[feat] = df[feat].replace(-np.inf, finite.min())
                df[feat] = df[feat].replace(np.inf, finite.max())
    return df
# Demo data containing +/-inf values.
train_df = pd.DataFrame({
    'X1': [1, 2, 3, -np.inf, 4, 5],
    'X2': [10, 11, 12, np.inf, 13, 14]
})
test_df = pd.DataFrame({
    'X1': [2, 4, 6, np.inf],
    'X2': [3, 4, 5, np.inf]
})
# Default: replace inf with NaN.
# NOTE(review): fill_inf mutates train_df in place, so this first call
# already removes every inf and the later demo calls are no-ops on this data.
fill_inf(train_df, ['X1', 'X2'])
# Use the mode of the finite values.
fill_inf(train_df, ['X1', 'X2'], method='mode')
# Explicit threshold: -inf -> 1, +inf -> 10.
fill_inf(train_df, ['X1', 'X2'], method='threshold', threshold=[1, 10])
# BUG FIX: this call is meant to demonstrate the no-threshold case
# (auto min/max), but the original still passed threshold=[1, 10].
fill_inf(train_df, ['X1', 'X2'], method='threshold')
特征选择
方差阈值法 Variance Threshold
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_iris
def variance_threshold(df, feats, k=5, threshold=0.5):
    """Return the subset of `feats` whose variance exceeds `threshold`.

    `threshold` generalizes the previously hard-coded 0.5.
    NOTE(review): `k` was never used by the original; it is kept only for
    signature compatibility with the select_k_best_* helpers.
    """
    selector = VarianceThreshold(threshold=threshold).fit(df[feats])
    return pd.Series(feats)[selector.get_support()].values
# Demo on the four iris measurement columns.
dataset = pd.DataFrame(load_iris().data, columns=['A', 'B', 'C', 'D'])
variance_threshold(dataset, ['A', 'B', 'C', 'D'])
皮尔森系数——K Best
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
from sklearn.datasets import load_iris
def select_k_best_pearsonr(df, feats, target, k=5):
    """Select the k features most correlated (Pearson r) with `target`.

    SelectKBest expects a score function returning per-feature scores; here
    each column is mapped to its (r, p-value) pair and the r row is used.
    """
    score_func = lambda X, Y: np.array(list(map(lambda x: pearsonr(x, Y), X.T))).T[0]
    selector = SelectKBest(score_func, k=k)
    selector.fit(df[feats].values, df[target].values)
    # BUG FIX: the original returned `kb.get_support()` — `kb` was never
    # defined (NameError); the fitted selector is used instead.
    return pd.Series(feats)[selector.get_support()].values
# Demo: keep the 2 iris columns most correlated with the class label.
dataset = pd.DataFrame(load_iris().data, columns=['A', 'B', 'C', 'D'])
dataset['target'] = load_iris().target
select_k_best_pearsonr(dataset, ['A', 'B', 'C', 'D'], 'target', k=2)
卡方检验(Chi-Square Test)——K-Best
# 特征选择,chi square,选择前K个
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
def select_k_best_chi2(df, feats, target, k=5):
    """Keep the k features with the highest chi-squared score w.r.t. `target`."""
    selector = SelectKBest(chi2, k=k).fit(df[feats].values, df[target].values)
    return pd.Series(feats)[selector.get_support()].values
# Demo: chi-squared selection of the top 2 iris columns.
dataset = pd.DataFrame(load_iris().data, columns=['A', 'B', 'C', 'D'])
dataset['target'] = load_iris().target
select_k_best_chi2(dataset, ['A', 'B', 'C', 'D'], 'target', k=2)
互信息——K-Best
# ! pip install minepy
from sklearn.feature_selection import SelectKBest
from minepy import MINE
def select_k_best_mine(df, feats, target, k=5, p_value=0.5):
    """Select the k features with the highest MIC (maximal information
    coefficient) w.r.t. `target`, computed with minepy's MINE estimator.

    `p_value` is only a dummy second element required by SelectKBest's
    (score, p-value) convention; it does not influence the selection.
    """
    def mic(x, y):
        estimator = MINE()
        estimator.compute_score(x, y)
        return estimator.mic(), p_value

    def score_func(X, Y):
        # Row 0 of the transposed pair array = the MIC scores.
        return np.array([mic(col, Y) for col in X.T]).T[0]

    selector = SelectKBest(score_func, k=k)
    selector.fit(df[feats].values, df[target].values)
    return pd.Series(feats)[selector.get_support()].values
# Demo: mutual-information-style selection of the top 2 iris columns.
dataset = pd.DataFrame(load_iris().data, columns=['A', 'B', 'C', 'D'])
dataset['target'] = load_iris().target
select_k_best_mine(dataset, ['A', 'B', 'C', 'D'], 'target', k=2)
RFE——K-Best
# RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
def select_k_best_rfe(df, feats, target, k=5, base_model=LogisticRegression):
    """Recursive feature elimination down to k features using `base_model`."""
    estimator = base_model(multi_class='auto', solver='lbfgs', max_iter=500)
    selector = RFE(estimator=estimator, n_features_to_select=k)
    selector.fit(df[feats], df[target])
    return pd.Series(feats)[selector.get_support()].values
# Demo: RFE down to 2 iris columns with the default logistic regression.
dataset = pd.DataFrame(load_iris().data, columns=['A', 'B', 'C', 'D'])
dataset['target'] = load_iris().target
select_k_best_rfe(dataset, ['A', 'B', 'C', 'D'], 'target', k=2)
基于模型的特征选择——GBDT为例
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
def select_from_model(df, feats, target, base_model=GradientBoostingClassifier):
    """Model-based feature selection: keep the features whose importance in
    a fitted `base_model` exceeds SelectFromModel's default threshold.

    BUG FIX: the original ignored `base_model` entirely (it always used a
    GradientBoostingClassifier despite declaring a LogisticRegression
    default) and shadowed this function's own name with a local variable.
    The default is now GradientBoostingClassifier, so the no-argument call
    behaves exactly as before, while a caller-supplied base_model is
    finally honoured.
    """
    selector = SelectFromModel(base_model())
    selector.fit(df[feats], df[target])
    return pd.Series(feats)[selector.get_support()].values
select_from_model(dataset,['A','B','C','D'], 'target')
PCA降维——特征选择
def get_pca(df, feats, target, test_df=None, n_components=1):
    """Project `feats` onto n_components principal components.

    An n_components in [0, 1) keeps enough components to explain that
    fraction of variance; an int keeps exactly that many.
    NOTE(review): `target` is unused — kept for signature consistency with
    the selectors above.

    Returns:
        new_df, or (new_df, transformed test_df) when test_df is given.
    """
    # Local import keeps this cell self-contained — PCA was never imported
    # at the top of the file.
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components)
    # BUG FIX: fit on the selected feature columns only; the original fit
    # on the whole frame, silently including the target column in the PCA.
    new_df = pca.fit_transform(df[feats])
    if test_df is None:
        return new_df
    return new_df, pca.transform(test_df[feats])
# Demo: keep enough components to explain 90% of the iris variance.
dataset = pd.DataFrame(load_iris().data, columns=['A', 'B', 'C', 'D'])
dataset['target'] = load_iris().target
get_pca(dataset, ['A', 'B', 'C', 'D'], 'target', n_components=0.9)
特征生成
import pandas as pd
# Small constant guarding the ratio features against division by zero.
epsilon = 1e-5

# Pairwise feature-combination functions: each maps two columns (or
# scalars/arrays) to a new derived feature.
sim_func_dict = {
    'add': lambda x, y: x + y,
    'mins': lambda x, y: x - y,
    'div': lambda x, y: x / (y + epsilon),
    'multi': lambda x, y: x * y,
    # BUG FIX: these two originally read `np,log(...)` — a comma instead of
    # a dot — which evaluates as a tuple expression and raises NameError on
    # the bare name `log` when called.
    'logx/y': lambda x, y: np.log(x) / y,
    'logy/x': lambda x, y: np.log(y) / x,
}
def sim_numerical_feature_make(df, feats, func_dict, test_df=None):
    """Generate pairwise combination features for every unordered pair of
    columns in `feats`.

    For each pair (col_i, col_j) and each (name, func) in func_dict a new
    column 'col_i-name-col_j' holding func(col_i, col_j) is added. Both df
    and test_df are mutated in place.

    Returns:
        (df, test_df) — test_df is None when not supplied.
    """
    for i, col_i in enumerate(feats):
        for col_j in feats[i + 1:]:
            for func_name, func in func_dict.items():
                new_col = '-'.join([col_i, func_name, col_j])
                df[new_col] = func(df[col_i], df[col_j])
                if test_df is not None:
                    # BUG FIX: the original combined test_df[col_i] with
                    # df[col_j], mixing columns from different frames.
                    test_df[new_col] = func(test_df[col_i], test_df[col_j])
    # BUG FIX: the original returned the undefined names
    # `train_data, test_data` (NameError).
    return df, test_df
new_df, new_test_df = sim_numerical_feature_make(df, ['x1', 'x2', 'x3'], sim_func_dict, test_df = test_df)