import pandas as pd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import warnings
warnings.filterwarnings("ignore")

def basic_statistics(df):
    """Print a quick textual overview of ``df`` to standard output.

    Five banner-delimited sections are emitted in this order: the first
    rows (``head``), the ``info`` report, the ``describe`` summary, and the
    per-column missing-value counts obtained via ``isnull`` and ``isna``.

    Args:
        df: The pandas DataFrame to summarise.
    """
    print('**************Head**************')
    print(df.head())

    # DataFrame.info() writes its report itself, so it is called directly
    # rather than being wrapped in print().
    print('\n\n\n**************Info**************')
    df.info()

    # The remaining sections all follow the same "banner, then value" shape.
    remaining_sections = (
        ('\n\n\n**************Describe**************', lambda: df.describe()),
        ('\n\n\n**************IsNull**************', lambda: df.isnull().sum()),
        ('\n\n\n**************IsNA**************', lambda: df.isna().sum()),
    )
    for banner, summarize in remaining_sections:
        print(banner)
        print(summarize())

def box_plot(df, feats):
    """Draw one vertical box plot per feature on a three-column grid.

    Args:
        df: Data source; ``df[name]`` must yield the values of a feature.
        feats: Sequence of feature (column) names to plot.
    """
    n_cols = 3
    n_rows = (len(feats) + 2) // n_cols  # ceiling division: enough rows for all features
    plt.figure(figsize=(20, 40))  # fixed canvas width and height, in inches
    for position, name in enumerate(feats, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(df[name], orient="v", width=0.5)
        plt.ylabel(name, fontsize=8)
    plt.show()


def qq_plot(df, feats):
    """Compare each feature's distribution with a normal distribution.

    Two adjacent panels are drawn per feature: the feature's distribution
    with a fitted normal curve overlaid, followed by its normal
    probability (Q-Q) plot.

    Args:
        df: Data source; ``df[feat]`` must yield the values of a feature.
        feats: Sequence of feature (column) names to plot.
    """
    n_feats = len(feats)
    if n_feats == 0:
        # Nothing to draw; this also avoids requesting a zero-height figure.
        return

    qq_cols = 6
    panels_per_feat = 2
    # Size the grid for the panels actually drawn (ceiling division) rather
    # than reserving a full six-wide row for every feature.
    qq_rows = (panels_per_feat * n_feats + qq_cols - 1) // qq_cols
    plt.figure(figsize=(4 * qq_cols, 4 * qq_rows))

    for idx, feat in enumerate(feats):
        first_panel = panels_per_feat * idx + 1  # subplot indices are 1-based

        plt.subplot(qq_rows, qq_cols, first_panel)
        sns.distplot(df[feat], fit=stats.norm)

        plt.subplot(qq_rows, qq_cols, first_panel + 1)
        stats.probplot(df[feat], plot=plt)
    plt.show()

def kde_plot(df, feats, test_df=None):
    """Plot a kernel density estimate for every feature.

    When ``test_df`` is given, its density for the same feature is drawn on
    the same axes so the two distributions can be compared.

    Args:
        df: Primary data source (drawn in red, labelled "train").
        feats: Sequence of feature (column) names to plot.
        test_df: Optional second data source (drawn in blue, labelled
            "test"); must provide the same feature columns.
    """
    n_feats = len(feats)
    if n_feats == 0:
        # Nothing to draw; this also avoids requesting a zero-height figure.
        return

    has_test = test_df is not None
    dist_cols = 6
    # One panel per feature: ceiling division gives just enough rows.
    dist_rows = (n_feats + dist_cols - 1) // dist_cols
    plt.figure(figsize=(4 * dist_cols, 4 * dist_rows))

    # Labels must be a list: a bare string would be iterated character by
    # character, labelling the first curve "t" instead of "train".
    legend_labels = ["train", "test"] if has_test else ["train"]

    for position, feat in enumerate(feats, start=1):
        ax = plt.subplot(dist_rows, dist_cols, position)
        sns.kdeplot(df[feat], color="Red", shade=True, ax=ax)
        if has_test:
            sns.kdeplot(test_df[feat], color="Blue", shade=True, ax=ax)
        ax.set_xlabel(feat)
        ax.set_ylabel("Frequency")
        ax.legend(legend_labels)
    plt.show()
    
def linear_regression_plot(df, feats, target):
    """Plot each feature against the target with a fitted regression line.

    Two adjacent panels are drawn per feature: a scatter plot of the
    feature versus ``target`` with a linear fit, followed by the
    distribution of the feature's non-null values.

    Args:
        df: Data source containing the feature columns and ``target``.
        feats: Sequence of feature (column) names to plot.
        target: Name of the target column used as the y variable.
    """
    n_feats = len(feats)
    if n_feats == 0:
        # Nothing to draw; this also avoids requesting a zero-height figure.
        return

    fcols = 6
    panels_per_feat = 2
    # Size the grid for the panels actually drawn (ceiling division) rather
    # than reserving a full six-wide row for every feature.
    frows = (panels_per_feat * n_feats + fcols - 1) // fcols
    plt.figure(figsize=(5 * fcols, 4 * frows))

    for idx, feat in enumerate(feats):
        first_panel = panels_per_feat * idx + 1  # subplot indices are 1-based

        ax = plt.subplot(frows, fcols, first_panel)
        # The marker is passed through regplot's own ``marker`` argument:
        # regplot overwrites any 'marker' entry placed in ``scatter_kws``.
        sns.regplot(x=feat, y=target, data=df, ax=ax, marker='.',
                    scatter_kws={'s': 3, 'alpha': 0.3},
                    line_kws={'color': 'k'})
        plt.xlabel(feat)
        plt.ylabel(target)

        plt.subplot(frows, fcols, first_panel + 1)
        sns.distplot(df[feat].dropna())
        plt.xlabel(feat)

    plt.show()

# Heatmap, version 1
def heatmap_v1(df, feats):
    """Show an annotated heatmap of the correlation matrix of ``feats``.

    Args:
        df: Data source containing the feature columns.
        feats: Feature (column) names whose pairwise correlations are shown.
    """
    _, axes = plt.subplots(figsize=(20, 16))  # enlarge the canvas
    correlations = df[feats].corr()
    # annot=True writes each coefficient into its cell.
    sns.heatmap(correlations, vmax=.8, square=True, annot=True, ax=axes)
    plt.show()

# Heatmap, version 2
def heatmap_v2(df, feats):
    """Show the lower triangle of the Spearman correlation matrix of ``feats``.

    The diagonal and the upper triangle are masked out, so each pair of
    features appears once.

    Args:
        df: Data source containing the feature columns.
        feats: Feature (column) names whose pairwise correlations are shown.
    """
    plt.figure(figsize=(20, 16))  # canvas width and height, in inches
    mcorr = df[feats].corr(method="spearman")  # pairwise rank correlations
    # Builtin ``bool`` is used because the ``np.bool`` alias was removed from
    # NumPy and raises AttributeError on current releases.
    mask = np.zeros_like(mcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True  # hide the diagonal and everything above it
    cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap object
    sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
    plt.show()
    
def select_features_by_k(df, feats, target, k=10):
    """Plot a correlation heatmap of the ``k`` features most correlated with ``target``.

    Features are ranked by their signed correlation with ``target`` (largest
    first), so strongly negatively correlated features are not picked.
    Nothing is returned; the result is only displayed.

    Args:
        df: Data source containing the feature columns and ``target``.
        feats: Feature (column) names to rank; any iterable of names
            (list, tuple, pandas Index, ...).
        target: Name of the target column.
        k: Number of features to keep, not counting the target itself.
    """
    columns = list(feats) + [target]
    # The target correlates perfectly with itself and always ranks first,
    # so one extra slot is requested to still end up with k features.
    top_columns = df[columns].corr().nlargest(k + 1, target)[target].index

    plt.subplots(figsize=(10, 10))  # enlarge the canvas
    sns.heatmap(df[top_columns].corr(), annot=True, square=True)

    plt.show()

def select_features_by_threshold(df, feats, target, threshold=0.5):
    """Plot a correlation heatmap of the features strongly correlated with ``target``.

    A column is kept when the absolute value of its correlation with
    ``target`` exceeds ``threshold``. Nothing is returned; the result is
    only displayed.

    Args:
        df: Data source containing the feature columns and ``target``.
        feats: Feature (column) names to screen; any iterable of names
            (list, tuple, pandas Index, ...).
        target: Name of the target column.
        threshold: Minimum absolute correlation (exclusive) for a column
            to be kept.
    """
    corrmat = df[list(feats) + [target]].corr()
    top_corr_features = corrmat.index[abs(corrmat[target]) > threshold]
    plt.figure(figsize=(10, 10))
    sns.heatmap(df[top_corr_features].corr(), annot=True, cmap="RdYlGn")
    # Display the figure explicitly, like the other plotting helpers do.
    plt.show()

def outliers_box_plot(df, feats):
    """Draw all features as side-by-side box plots in a single axes.

    Args:
        df: Data source containing the feature columns.
        feats: Feature (column) names; also used as the box labels.
    """
    plt.figure(figsize=(18, 10))
    samples = df[feats].values  # one column of values per feature
    plt.boxplot(x=samples, labels=feats)
    plt.show()
def outliers_violin_plot(df, feats):
    """Draw all features as side-by-side violin plots in a single axes.

    Args:
        df: Data source containing the feature columns.
        feats: Feature (column) names to plot; any iterable of names
            (list, tuple, pandas Index, ...). Also used as tick labels.
    """
    names = list(feats)
    if not names:
        # Nothing to plot.
        return

    plt.figure(figsize=(18, 10))
    # Plot the frame passed in, not a module-level global.
    plt.violinplot(df[names].values)
    plt.title('violin plot')
    # Violins are placed at x = 1..n by default, so the ticks start at 1.
    plt.xticks(np.arange(1, len(names) + 1), names)
    plt.xlabel('xlabel')
    plt.ylabel('ylabel')
    plt.show()


def eda(df, numerical_features, target, task, test_df=None, enabled=False):
    """Run the exploratory-analysis helpers of this module in sequence.

    Args:
        df: Data source to analyse.
        numerical_features: Names of the numerical feature columns.
        target: Name of the target column (used by the regression plots).
        task: Task type; the target-related plots are only produced when
            this equals ``'regression'``.
        test_df: Optional second data source overlaid in the KDE plots.
        enabled: When False (the default) the function returns immediately
            without printing or plotting anything, matching the previous
            behaviour where the whole pipeline was switched off. Pass True
            to actually run the analysis.
    """
    if not enabled:
        return

    ### basic statistics
    basic_statistics(df)

    ### numerical features
    # box plot
    box_plot(df, numerical_features)
    # qq plot
    qq_plot(df, numerical_features)
    # kde plot
    kde_plot(df, numerical_features, test_df)
    # heatmap, two versions
    heatmap_v1(df, numerical_features)
    heatmap_v2(df, numerical_features)

    ## regression task visualizations
    if task == 'regression':
        # regression to visualize correlation with target
        linear_regression_plot(df, numerical_features, target)
        # select the k features most correlated with target
        select_features_by_k(df, numerical_features, target, k=10)
        # select features using a correlation threshold
        select_features_by_threshold(df, numerical_features, target)

    # outliers
    outliers_box_plot(df, numerical_features)
    outliers_violin_plot(df, numerical_features)

# NOTE(review): train_data, numerical_features, target, task and test_data
# are not defined in the visible part of this file; presumably they come
# from earlier code or the surrounding session. Confirm before running.
eda(
    df=train_data,
    numerical_features=numerical_features,
    target=target,
    task=task,
    test_df=test_data,
)
# Last modified: 2021-08-09, 02:05 PM
# If you found this article useful, feel free to leave a tip.