import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
def basic_statistics(df):
    """Print a quick textual overview of ``df`` to standard output.

    Five sections are written in order, each under its own banner line:
    the first rows (``head``), the ``info`` report, the ``describe``
    statistics, and the per-column missing-value counts as reported by
    ``isnull`` and then again by ``isna``.

    Args:
        df: The pandas DataFrame to summarise.
    """
    sections = (
        ('**************Head**************',
         lambda: print(df.head())),
        # ``info`` writes its own report, so it is called rather than printed.
        ('\n\n\n**************Info**************',
         df.info),
        ('\n\n\n**************Describe**************',
         lambda: print(df.describe())),
        ('\n\n\n**************IsNull**************',
         lambda: print(df.isnull().sum())),
        ('\n\n\n**************IsNA**************',
         lambda: print(df.isna().sum())),
    )
    for banner, emit in sections:
        print(banner)
        emit()
def box_plot(df, feats):
    """Draw one vertical box plot per feature on a three-column grid.

    Args:
        df: DataFrame holding the feature columns.
        feats: Sequence of column names in ``df`` to plot.
    """
    n_cols = 3
    # Ceiling division: just enough rows to hold every feature.
    n_rows = (len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(20, 40))  # fixed canvas width and height, in inches
    for position, feat in enumerate(feats, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Bind the column to ``y`` explicitly so the box is vertical no matter
        # how the installed seaborn interprets a lone positional argument.
        sns.boxplot(y=df[feat], orient="v", width=0.5)
        plt.ylabel(feat, fontsize=8)
    plt.show()
def qq_plot(df, feats):
    """Compare each feature's distribution with a normal distribution.

    Two adjacent panels are drawn per feature: the empirical distribution
    with a fitted normal curve, and a normal probability (Q-Q) plot.

    Args:
        df: DataFrame holding the feature columns.
        feats: Sequence of column names in ``df`` to plot.
    """
    if len(feats) == 0:
        # Nothing to draw; avoids requesting a zero-height figure.
        return

    panels_per_feat = 2
    n_cols = 6
    # Ceiling division so the grid has exactly as many rows as the panels
    # need, instead of one full six-column row per feature.
    n_rows = (panels_per_feat * len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    for idx, feat in enumerate(feats):
        # Drop missing values once so both panels describe the same sample
        # and the probability plot is not fed NaNs.
        values = df[feat].dropna()
        first_panel = panels_per_feat * idx + 1

        plt.subplot(n_rows, n_cols, first_panel)
        sns.distplot(values, fit=stats.norm)

        plt.subplot(n_rows, n_cols, first_panel + 1)
        stats.probplot(values, plot=plt)
    plt.show()
def kde_plot(df, feats, test_df=None):
    """Plot a kernel density estimate for every feature.

    When ``test_df`` is given, its density is overlaid in blue on top of the
    red density of ``df`` so the two distributions can be compared.

    Args:
        df: DataFrame holding the (training) feature columns.
        feats: Sequence of column names to plot.
        test_df: Optional second DataFrame with the same columns.
    """
    if len(feats) == 0:
        # Nothing to draw; avoids requesting a zero-height figure.
        return

    has_test = test_df is not None
    # Always a list: a bare string would be iterated character by character
    # when handed to ``legend``.
    legend_labels = ["train", "test"] if has_test else ["train"]

    n_cols = 6
    # Ceiling division: one panel per feature, six panels per row.
    n_rows = (len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    for position, feat in enumerate(feats, start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[feat], color="Red", shade=True, ax=ax)
        if has_test:
            sns.kdeplot(test_df[feat], color="Blue", shade=True, ax=ax)
        ax.set_xlabel(feat)
        ax.set_ylabel("Frequency")
        ax.legend(legend_labels)
    plt.show()
def linear_regression_plot(df, feats, target):
    """Visualise each feature's linear relationship with the target.

    Two adjacent panels are drawn per feature: a scatter plot of the feature
    against ``target`` with a fitted regression line, and the distribution of
    the feature's non-missing values.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Sequence of feature column names.
        target: Name of the target column in ``df``.
    """
    if len(feats) == 0:
        # Nothing to draw; avoids requesting a zero-height figure.
        return

    panels_per_feat = 2
    n_cols = 6
    # Ceiling division so the grid has exactly as many rows as the panels
    # need, instead of one full six-column row per feature.
    n_rows = (panels_per_feat * len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(5 * n_cols, 4 * n_rows))

    for idx, feat in enumerate(feats):
        first_panel = panels_per_feat * idx + 1

        ax = plt.subplot(n_rows, n_cols, first_panel)
        # The point glyph goes through regplot's own ``marker`` parameter;
        # a 'marker' key inside ``scatter_kws`` is overridden by it.
        sns.regplot(x=feat, y=target, data=df, ax=ax, marker='.',
                    scatter_kws={'s': 3, 'alpha': 0.3},
                    line_kws={'color': 'k'})
        plt.xlabel(feat)
        plt.ylabel(target)

        plt.subplot(n_rows, n_cols, first_panel + 1)
        sns.distplot(df[feat].dropna())
        plt.xlabel(feat)
    plt.show()
# Heatmap v1
def heatmap_v1(df, feats):
    """Show an annotated heatmap of the correlation matrix of ``feats``.

    Args:
        df: DataFrame holding the feature columns.
        feats: Column names whose pairwise correlations are plotted.
    """
    # Open a large canvas first; the heatmap draws onto its current axes.
    plt.subplots(figsize=(20, 16))
    correlations = df[feats].corr()
    # annot=True writes each coefficient into its cell.
    sns.heatmap(correlations, vmax=0.8, square=True, annot=True)
    plt.show()
# Heatmap v2
def heatmap_v2(df, feats):
    """Show the lower triangle of the Spearman correlation matrix.

    The diagonal and everything above it are masked out, so each pair of
    features appears exactly once.

    Args:
        df: DataFrame holding the feature columns.
        feats: Column names whose pairwise correlations are plotted.
    """
    plt.figure(figsize=(20, 16))  # canvas width and height, in inches
    # Pairwise Spearman rank correlation between every two features.
    mcorr = df[feats].corr(method="spearman")
    # Boolean mask of the same shape, True on and above the diagonal.
    # The builtin ``bool`` is used because the ``np.bool`` alias no longer
    # exists in current NumPy releases.
    mask = np.triu(np.ones(mcorr.shape, dtype=bool))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap
    sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True,
                fmt='0.2f')
    plt.show()
def select_features_by_k(df, feats, target, k=10):
    """Plot a heatmap of the ``k`` features that correlate highest with ``target``.

    Features are ranked by their signed correlation coefficient with the
    target (largest first), so strongly negative correlations rank last.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Iterable of candidate feature column names.
        target: Name of the target column in ``df``.
        k: Number of features to keep, not counting the target itself.
    """
    # Accept any iterable of names, and keep the target from appearing twice
    # if it is also listed among the features.
    candidates = [feat for feat in feats if feat != target]
    corrmat = df[candidates + [target]].corr()

    # Ask for k + 1 rows because the target's correlation with itself (1.0)
    # normally takes the first place.
    top_cols = corrmat.nlargest(k + 1, target)[target].index

    plt.subplots(figsize=(10, 10))  # canvas size
    # Reuse the matrix computed above rather than correlating again.
    sns.heatmap(corrmat.loc[top_cols, top_cols], annot=True, square=True)
    plt.show()
def select_features_by_threshold(df, feats, target, threshold=0.5):
    """Plot a heatmap of the columns strongly correlated with ``target``.

    A column is kept when the absolute value of its correlation with the
    target exceeds ``threshold``.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Iterable of candidate feature column names.
        target: Name of the target column in ``df``.
        threshold: Minimum absolute correlation (exclusive) for a column to
            be shown.
    """
    # Accept any iterable of names, and keep the target from appearing twice
    # if it is also listed among the features.
    candidates = [feat for feat in feats if feat != target]
    corrmat = df[candidates + [target]].corr()
    top_corr_features = corrmat.index[corrmat[target].abs() > threshold]

    plt.figure(figsize=(10, 10))
    # Reuse the matrix computed above rather than correlating again.
    sns.heatmap(corrmat.loc[top_corr_features, top_corr_features],
                annot=True, cmap="RdYlGn")
    # Display the figure here, like every other plotting helper in this file.
    plt.show()
def outliers_box_plot(df, feats):
    """Draw all features side by side as box plots on one shared axis.

    Args:
        df: DataFrame holding the feature columns.
        feats: Column names to plot; also used as the box labels.
    """
    plt.figure(figsize=(18, 10))
    # One column of the 2-D array per feature, in the order given by feats.
    column_values = df[feats].values
    plt.boxplot(x=column_values, labels=feats)
    plt.show()
def outliers_violin_plot(df, feats):
    """Draw all features side by side as violin plots on one shared axis.

    Args:
        df: DataFrame holding the feature columns.
        feats: Iterable of column names to plot; also used as tick labels.
    """
    labels = list(feats)  # works for a list, an Index or an array of names
    plt.figure(figsize=(18, 10))
    # Plot the columns of the DataFrame that was passed in, not a global.
    plt.violinplot(df[labels].values)
    plt.title('violin plot')
    # Violins are placed at x = 1..N by default, so the ticks start at 1.
    plt.xticks(np.arange(1, len(labels) + 1), labels)
    plt.xlabel('xlabel')
    plt.ylabel('ylabel')
    plt.show()
def eda(df, numerical_features, target, task, test_df=None,
        show_basic_statistics=False):
    """Run the exploratory plots for the numerical features of ``df``.

    Args:
        df: DataFrame to explore.
        numerical_features: Names of the numerical feature columns.
        target: Name of the target column.
        task: Task type; the target-based plots are only produced when this
            equals ``'regression'``.
        test_df: Optional second DataFrame, overlaid in the KDE plots.
        show_basic_statistics: When true, print the textual summary first.
            Defaults to ``False``, which matches the previously hard-coded
            ``if False:`` guard.
    """
    # NOTE(review): the original indentation was lost, so the extent of the
    # old ``if False:`` guard is an assumption. It is taken here to cover
    # only the basic-statistics step; confirm against the original.
    if show_basic_statistics:
        basic_statistics(df)

    # Numerical features: per-feature distributions.
    box_plot(df, numerical_features)
    qq_plot(df, numerical_features)
    kde_plot(df, numerical_features, test_df)

    # Correlation heatmaps, two versions.
    heatmap_v1(df, numerical_features)
    heatmap_v2(df, numerical_features)

    # Regression task visualisations.
    if task == 'regression':
        # Regression plots to visualise correlation with the target.
        linear_regression_plot(df, numerical_features, target)
        # The k features most correlated with the target.
        select_features_by_k(df, numerical_features, target, k=10)
        # Features whose correlation with the target passes a threshold.
        select_features_by_threshold(df, numerical_features, target)

    # Outliers.
    outliers_box_plot(df, numerical_features)
    outliers_violin_plot(df, numerical_features)
# Run the EDA on the training data, passing the test data along as well.
# NOTE(review): train_data, numerical_features, target, task and test_data
# are not defined in the visible part of this file; they must exist before
# this call.
eda(
    df=train_data,
    numerical_features=numerical_features,
    target=target,
    task=task,
    test_df=test_data,
)
# Last modified: 2021-08-09 02:05 PM
# © Reposting is permitted provided the standard attribution rules are followed