[数据科学-拿来就用] EDA

2021 年 08 月 09 日

643次浏览

5587字数

import pandas as pd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import warnings
warnings.filterwarnings("ignore")

def basic_statistics(df):
    """Print a quick textual overview of ``df`` to standard output.

    Five sections are emitted in order, each preceded by a banner line:
    the first rows (``head``), the column summary (``info``), descriptive
    statistics (``describe``) and the per-column missing-value counts as
    reported by ``isnull`` and by ``isna``.

    Args:
        df: The pandas DataFrame to summarise.
    """
    # Each entry pairs a banner with a callable that writes the section body.
    # ``DataFrame.info`` prints its report itself; the others return an
    # object that still has to be printed.
    reports = (
        ('**************Head**************',
         lambda: print(df.head())),
        ('\n\n\n**************Info**************',
         df.info),
        ('\n\n\n**************Describe**************',
         lambda: print(df.describe())),
        ('\n\n\n**************IsNull**************',
         lambda: print(df.isnull().sum())),
        ('\n\n\n**************IsNA**************',
         lambda: print(df.isna().sum())),
    )
    for banner, emit in reports:
        print(banner)
        emit()

def box_plot(df, feats):
    """Draw one vertical box plot per feature on a three-column grid.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Sequence of column names to plot.
    """
    n_cols = 3
    n_rows = (len(feats) + n_cols - 1) // n_cols  # ceiling division
    plt.figure(figsize=(20, 40))
    for position, feat in enumerate(feats, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Pass the values as ``y`` explicitly so the box is drawn vertically
        # no matter how the installed seaborn treats a positional argument.
        sns.boxplot(y=df[feat], width=0.5)
        plt.ylabel(feat, fontsize=8)
    plt.show()


def qq_plot(df, feats):
    """Compare each feature's distribution with a normal distribution.

    Two panels are drawn per feature: a histogram/density with a fitted
    normal curve, and a normal probability (Q-Q) plot.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Sequence of column names to plot.
    """
    if len(feats) == 0:
        return  # nothing to draw; avoids creating a zero-height figure

    n_cols = 6
    panels_per_feat = 2
    # Only as many rows as the panels actually need (ceiling division);
    # one row per feature would leave most of the figure empty.
    n_rows = (panels_per_feat * len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    for idx, feat in enumerate(feats):
        # Missing values are removed once so both panels see the same sample.
        values = df[feat].dropna()
        first_panel = panels_per_feat * idx + 1

        plt.subplot(n_rows, n_cols, first_panel)
        sns.distplot(values, fit=stats.norm)

        plt.subplot(n_rows, n_cols, first_panel + 1)
        # ``plot=plt`` draws onto the subplot that was just made current.
        stats.probplot(values, plot=plt)
    plt.show()

def kde_plot(df, feats, test_df=None):
    """Plot the kernel density estimate of every feature.

    The distribution from ``df`` is drawn in red; when ``test_df`` is given,
    its distribution for the same feature is overlaid in blue.

    Args:
        df: Training DataFrame holding the columns named in ``feats``.
        feats: Sequence of column names to plot.
        test_df: Optional second DataFrame with the same columns.
    """
    if len(feats) == 0:
        return  # nothing to draw; avoids creating a zero-height figure

    has_test = test_df is not None
    n_cols = 6
    # One panel per feature, so only ceil(n / n_cols) rows are needed.
    n_rows = (len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    # Labels must be a list: a bare string would be consumed one character
    # per legend entry.
    labels = ["train", "test"] if has_test else ["train"]

    for position, feat in enumerate(feats, start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[feat], color="Red", shade=True, ax=ax)
        if has_test:
            sns.kdeplot(test_df[feat], color="Blue", shade=True, ax=ax)
        ax.set_xlabel(feat)
        ax.set_ylabel("Frequency")
        ax.legend(labels)
    plt.show()
    
def linear_regression_plot(df, feats, target):
    """Visualise each feature's linear relationship with the target.

    Two panels are drawn per feature: a scatter plot with a fitted
    regression line against ``target``, and the feature's own distribution.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Sequence of feature column names.
        target: Name of the target column.
    """
    if len(feats) == 0:
        return  # nothing to draw; avoids creating a zero-height figure

    n_cols = 6
    panels_per_feat = 2
    # Only as many rows as the panels actually need (ceiling division).
    n_rows = (panels_per_feat * len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(5 * n_cols, 4 * n_rows))

    for idx, feat in enumerate(feats):
        first_panel = panels_per_feat * idx + 1

        ax = plt.subplot(n_rows, n_cols, first_panel)
        # The marker is given through regplot's own ``marker`` argument;
        # a 'marker' key inside ``scatter_kws`` is replaced by that argument
        # and therefore has no effect.
        sns.regplot(x=feat, y=target, data=df, ax=ax, marker='.',
                    scatter_kws={'s': 3, 'alpha': 0.3},
                    line_kws={'color': 'k'})
        ax.set_xlabel(feat)
        ax.set_ylabel(target)

        ax = plt.subplot(n_rows, n_cols, first_panel + 1)
        sns.distplot(df[feat].dropna(), ax=ax)
        ax.set_xlabel(feat)

    plt.show()

# Heatmap v1
def heatmap_v1(df, feats):
    """Show an annotated correlation heatmap for the columns in ``feats``.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Column names whose pairwise correlations are displayed.
    """
    _, canvas = plt.subplots(figsize=(20, 16))  # large canvas for readability
    correlations = df[feats].corr()
    # annot=True writes each coefficient into its cell.
    sns.heatmap(correlations, vmax=.8, square=True, annot=True, ax=canvas)
    plt.show()

# Heatmap v2
def heatmap_v2(df, feats):
    """Show the lower triangle of the Spearman correlation matrix.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Column names whose pairwise correlations are displayed.
    """
    plt.figure(figsize=(20, 16))
    mcorr = df[feats].corr(method="spearman")
    # Hide the diagonal and everything above it. The builtin ``bool`` is
    # used as dtype because the ``np.bool`` alias no longer exists in
    # current NumPy releases.
    mask = np.triu(np.ones(mcorr.shape, dtype=bool))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True,
                fmt='0.2f')
    plt.show()
    
def select_features_by_k(df, feats, target, k=10):
    """Show a heatmap of the ``k`` features most correlated with ``target``.

    Features are ranked by their signed correlation coefficient with the
    target (largest first); the heatmap also contains the target itself.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Iterable of feature column names (list, Index, array, ...).
        target: Name of the target column.
        k: Number of features to keep in addition to the target.
    """
    # ``list(...)`` accepts any iterable, not only objects with ``tolist``;
    # the target is filtered out first so it is never selected twice.
    columns = [feat for feat in feats if feat != target] + [target]
    # One extra slot is requested because the target row itself is part of
    # the ranking.
    top_columns = df[columns].corr().nlargest(k + 1, target).index

    _, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(df[top_columns].corr(), annot=True, square=True, ax=ax)

    plt.show()

def select_features_by_threshold(df, feats, target, threshold=0.5):
    """Show a heatmap of the features strongly correlated with ``target``.

    A column is kept when the absolute value of its correlation with the
    target exceeds ``threshold``.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Iterable of feature column names (list, Index, array, ...).
        target: Name of the target column.
        threshold: Minimum absolute correlation (exclusive) to keep a column.
    """
    # Accept any iterable of names and avoid listing the target twice.
    columns = [feat for feat in feats if feat != target] + [target]
    corrmat = df[columns].corr()
    top_corr_features = corrmat.index[corrmat[target].abs() > threshold]

    plt.figure(figsize=(10, 10))
    sns.heatmap(df[top_corr_features].corr(), annot=True, cmap="RdYlGn")
    # Display the figure explicitly, like the other plotting helpers do.
    plt.show()

def outliers_box_plot(df, feats):
    """Draw all features as side-by-side box plots in a single axes.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Column names; they are also used as the box labels.
    """
    plt.figure(figsize=(18, 10))
    samples = df[feats].values  # one column of the array per box
    plt.boxplot(x=samples, labels=feats)
    plt.show()
def outliers_violin_plot(df, feats):
    """Draw all features as side-by-side violin plots in a single axes.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Iterable of column names; they are also used as tick labels.
    """
    names = list(feats)  # works for lists as well as pandas Index objects
    # Violins are placed explicitly at 1..n so that the tick labels set
    # below line up with them.
    positions = np.arange(1, len(names) + 1)

    plt.figure(figsize=(18, 10))
    # Plot the frame that was passed in rather than a module-level global.
    plt.violinplot(df[names].values, positions=positions)
    plt.title('violin plot')
    plt.xticks(positions, names)
    plt.xlabel('xlabel')
    plt.ylabel('ylabel')
    plt.show()


def eda(df, numerical_features, target, task, test_df=None, enabled=False):
    """Run the exploratory-data-analysis pipeline on ``df``.

    The pipeline prints basic statistics and then draws box, Q-Q, KDE,
    heatmap and outlier plots for the numerical features; for regression
    tasks it additionally draws the target-correlation plots.

    Args:
        df: Training DataFrame.
        numerical_features: Names of the numerical feature columns.
        target: Name of the target column.
        task: Task type; the regression-only plots run when this equals
            ``'regression'``.
        test_df: Optional second DataFrame, forwarded to ``kde_plot``.
        enabled: When false (the default) the function returns immediately
            without producing any output, which is what the previously
            hard-coded switch did. Pass ``enabled=True`` to run the pipeline.
    """
    if not enabled:
        return

    ### basic statistics
    basic_statistics(df)

    ### numerical features
    # box plot
    box_plot(df, numerical_features)
    # qq plot
    qq_plot(df, numerical_features)
    # kde plot
    kde_plot(df, numerical_features, test_df)
    # heatmap, two versions
    heatmap_v1(df, numerical_features)
    heatmap_v2(df, numerical_features)
    ## regression task visualizations
    if task == 'regression':
        # regression to visualize correlation with target
        linear_regression_plot(df, numerical_features, target)
        # select the k features most correlated with target
        select_features_by_k(df, numerical_features, target, k=10)
        # select features using a correlation threshold
        select_features_by_threshold(df, numerical_features, target)
    # outliers
    outliers_box_plot(df, numerical_features)
    outliers_violin_plot(df, numerical_features)

# Invoke eda on the training frame; the test frame is handed over as test_df.
eda(
    df=train_data,
    numerical_features=numerical_features,
    target=target,
    task=task,
    test_df=test_data,
)

[数据科学-拿来就用] EDA

admin • 2021 年 08 月 09 日

import pandas as pd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import warnings
warnings.filterwarnings("ignore")

def basic_statistics(df):
    """Print a quick textual overview of ``df`` to standard output.

    Five sections are emitted in order, each preceded by a banner line:
    the first rows (``head``), the column summary (``info``), descriptive
    statistics (``describe``) and the per-column missing-value counts as
    reported by ``isnull`` and by ``isna``.

    Args:
        df: The pandas DataFrame to summarise.
    """
    # Each entry pairs a banner with a callable that writes the section body.
    # ``DataFrame.info`` prints its report itself; the others return an
    # object that still has to be printed.
    reports = (
        ('**************Head**************',
         lambda: print(df.head())),
        ('\n\n\n**************Info**************',
         df.info),
        ('\n\n\n**************Describe**************',
         lambda: print(df.describe())),
        ('\n\n\n**************IsNull**************',
         lambda: print(df.isnull().sum())),
        ('\n\n\n**************IsNA**************',
         lambda: print(df.isna().sum())),
    )
    for banner, emit in reports:
        print(banner)
        emit()

def box_plot(df, feats):
    """Draw one vertical box plot per feature on a three-column grid.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Sequence of column names to plot.
    """
    n_cols = 3
    n_rows = (len(feats) + n_cols - 1) // n_cols  # ceiling division
    plt.figure(figsize=(20, 40))
    for position, feat in enumerate(feats, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Pass the values as ``y`` explicitly so the box is drawn vertically
        # no matter how the installed seaborn treats a positional argument.
        sns.boxplot(y=df[feat], width=0.5)
        plt.ylabel(feat, fontsize=8)
    plt.show()


def qq_plot(df, feats):
    """Compare each feature's distribution with a normal distribution.

    Two panels are drawn per feature: a histogram/density with a fitted
    normal curve, and a normal probability (Q-Q) plot.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Sequence of column names to plot.
    """
    if len(feats) == 0:
        return  # nothing to draw; avoids creating a zero-height figure

    n_cols = 6
    panels_per_feat = 2
    # Only as many rows as the panels actually need (ceiling division);
    # one row per feature would leave most of the figure empty.
    n_rows = (panels_per_feat * len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    for idx, feat in enumerate(feats):
        # Missing values are removed once so both panels see the same sample.
        values = df[feat].dropna()
        first_panel = panels_per_feat * idx + 1

        plt.subplot(n_rows, n_cols, first_panel)
        sns.distplot(values, fit=stats.norm)

        plt.subplot(n_rows, n_cols, first_panel + 1)
        # ``plot=plt`` draws onto the subplot that was just made current.
        stats.probplot(values, plot=plt)
    plt.show()

def kde_plot(df, feats, test_df=None):
    """Plot the kernel density estimate of every feature.

    The distribution from ``df`` is drawn in red; when ``test_df`` is given,
    its distribution for the same feature is overlaid in blue.

    Args:
        df: Training DataFrame holding the columns named in ``feats``.
        feats: Sequence of column names to plot.
        test_df: Optional second DataFrame with the same columns.
    """
    if len(feats) == 0:
        return  # nothing to draw; avoids creating a zero-height figure

    has_test = test_df is not None
    n_cols = 6
    # One panel per feature, so only ceil(n / n_cols) rows are needed.
    n_rows = (len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    # Labels must be a list: a bare string would be consumed one character
    # per legend entry.
    labels = ["train", "test"] if has_test else ["train"]

    for position, feat in enumerate(feats, start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[feat], color="Red", shade=True, ax=ax)
        if has_test:
            sns.kdeplot(test_df[feat], color="Blue", shade=True, ax=ax)
        ax.set_xlabel(feat)
        ax.set_ylabel("Frequency")
        ax.legend(labels)
    plt.show()
    
def linear_regression_plot(df, feats, target):
    """Visualise each feature's linear relationship with the target.

    Two panels are drawn per feature: a scatter plot with a fitted
    regression line against ``target``, and the feature's own distribution.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Sequence of feature column names.
        target: Name of the target column.
    """
    if len(feats) == 0:
        return  # nothing to draw; avoids creating a zero-height figure

    n_cols = 6
    panels_per_feat = 2
    # Only as many rows as the panels actually need (ceiling division).
    n_rows = (panels_per_feat * len(feats) + n_cols - 1) // n_cols
    plt.figure(figsize=(5 * n_cols, 4 * n_rows))

    for idx, feat in enumerate(feats):
        first_panel = panels_per_feat * idx + 1

        ax = plt.subplot(n_rows, n_cols, first_panel)
        # The marker is given through regplot's own ``marker`` argument;
        # a 'marker' key inside ``scatter_kws`` is replaced by that argument
        # and therefore has no effect.
        sns.regplot(x=feat, y=target, data=df, ax=ax, marker='.',
                    scatter_kws={'s': 3, 'alpha': 0.3},
                    line_kws={'color': 'k'})
        ax.set_xlabel(feat)
        ax.set_ylabel(target)

        ax = plt.subplot(n_rows, n_cols, first_panel + 1)
        sns.distplot(df[feat].dropna(), ax=ax)
        ax.set_xlabel(feat)

    plt.show()

# Heatmap v1
def heatmap_v1(df, feats):
    """Show an annotated correlation heatmap for the columns in ``feats``.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Column names whose pairwise correlations are displayed.
    """
    _, canvas = plt.subplots(figsize=(20, 16))  # large canvas for readability
    correlations = df[feats].corr()
    # annot=True writes each coefficient into its cell.
    sns.heatmap(correlations, vmax=.8, square=True, annot=True, ax=canvas)
    plt.show()

# Heatmap v2
def heatmap_v2(df, feats):
    """Show the lower triangle of the Spearman correlation matrix.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Column names whose pairwise correlations are displayed.
    """
    plt.figure(figsize=(20, 16))
    mcorr = df[feats].corr(method="spearman")
    # Hide the diagonal and everything above it. The builtin ``bool`` is
    # used as dtype because the ``np.bool`` alias no longer exists in
    # current NumPy releases.
    mask = np.triu(np.ones(mcorr.shape, dtype=bool))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True,
                fmt='0.2f')
    plt.show()
    
def select_features_by_k(df, feats, target, k=10):
    """Show a heatmap of the ``k`` features most correlated with ``target``.

    Features are ranked by their signed correlation coefficient with the
    target (largest first); the heatmap also contains the target itself.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Iterable of feature column names (list, Index, array, ...).
        target: Name of the target column.
        k: Number of features to keep in addition to the target.
    """
    # ``list(...)`` accepts any iterable, not only objects with ``tolist``;
    # the target is filtered out first so it is never selected twice.
    columns = [feat for feat in feats if feat != target] + [target]
    # One extra slot is requested because the target row itself is part of
    # the ranking.
    top_columns = df[columns].corr().nlargest(k + 1, target).index

    _, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(df[top_columns].corr(), annot=True, square=True, ax=ax)

    plt.show()

def select_features_by_threshold(df, feats, target, threshold=0.5):
    """Show a heatmap of the features strongly correlated with ``target``.

    A column is kept when the absolute value of its correlation with the
    target exceeds ``threshold``.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feats: Iterable of feature column names (list, Index, array, ...).
        target: Name of the target column.
        threshold: Minimum absolute correlation (exclusive) to keep a column.
    """
    # Accept any iterable of names and avoid listing the target twice.
    columns = [feat for feat in feats if feat != target] + [target]
    corrmat = df[columns].corr()
    top_corr_features = corrmat.index[corrmat[target].abs() > threshold]

    plt.figure(figsize=(10, 10))
    sns.heatmap(df[top_corr_features].corr(), annot=True, cmap="RdYlGn")
    # Display the figure explicitly, like the other plotting helpers do.
    plt.show()

def outliers_box_plot(df, feats):
    """Draw all features as side-by-side box plots in a single axes.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Column names; they are also used as the box labels.
    """
    plt.figure(figsize=(18, 10))
    samples = df[feats].values  # one column of the array per box
    plt.boxplot(x=samples, labels=feats)
    plt.show()
def outliers_violin_plot(df, feats):
    """Draw all features as side-by-side violin plots in a single axes.

    Args:
        df: DataFrame holding the columns named in ``feats``.
        feats: Iterable of column names; they are also used as tick labels.
    """
    names = list(feats)  # works for lists as well as pandas Index objects
    # Violins are placed explicitly at 1..n so that the tick labels set
    # below line up with them.
    positions = np.arange(1, len(names) + 1)

    plt.figure(figsize=(18, 10))
    # Plot the frame that was passed in rather than a module-level global.
    plt.violinplot(df[names].values, positions=positions)
    plt.title('violin plot')
    plt.xticks(positions, names)
    plt.xlabel('xlabel')
    plt.ylabel('ylabel')
    plt.show()


def eda(df, numerical_features, target, task, test_df=None, enabled=False):
    """Run the exploratory-data-analysis pipeline on ``df``.

    The pipeline prints basic statistics and then draws box, Q-Q, KDE,
    heatmap and outlier plots for the numerical features; for regression
    tasks it additionally draws the target-correlation plots.

    Args:
        df: Training DataFrame.
        numerical_features: Names of the numerical feature columns.
        target: Name of the target column.
        task: Task type; the regression-only plots run when this equals
            ``'regression'``.
        test_df: Optional second DataFrame, forwarded to ``kde_plot``.
        enabled: When false (the default) the function returns immediately
            without producing any output, which is what the previously
            hard-coded switch did. Pass ``enabled=True`` to run the pipeline.
    """
    if not enabled:
        return

    ### basic statistics
    basic_statistics(df)

    ### numerical features
    # box plot
    box_plot(df, numerical_features)
    # qq plot
    qq_plot(df, numerical_features)
    # kde plot
    kde_plot(df, numerical_features, test_df)
    # heatmap, two versions
    heatmap_v1(df, numerical_features)
    heatmap_v2(df, numerical_features)
    ## regression task visualizations
    if task == 'regression':
        # regression to visualize correlation with target
        linear_regression_plot(df, numerical_features, target)
        # select the k features most correlated with target
        select_features_by_k(df, numerical_features, target, k=10)
        # select features using a correlation threshold
        select_features_by_threshold(df, numerical_features, target)
    # outliers
    outliers_box_plot(df, numerical_features)
    outliers_violin_plot(df, numerical_features)

# Invoke eda on the training frame; the test frame is handed over as test_df.
eda(
    df=train_data,
    numerical_features=numerical_features,
    target=target,
    task=task,
    test_df=test_data,
)

[数据科学-拿来就用] EDA

发表评论 取消回复

广义拉格朗日函数及其对偶算法

支持向量机SVM 系列(1)——线性可分支持向量机

支持向量机SVM 系列(2)——对偶方法(Dual Method)

支持向量机SVM 系列(3)——核函数(Kernel Function)

支持向量机SVM 系列(4)——软间隔(soft-margin SVM)

GBDT：Gradient Boosting Decision Tree

AWS ECS Boto3调用——创建Log Group, 创建TaskDefinition, Lambda启动TaskDefinition

广义拉格朗日函数及其对偶算法

统计自读系列3——统计量及其抽样分布

统计自读系列——统计抽样总结与理解

[数据科学-拿来就用] EDA

发表评论 取消回复

[数据科学-拿来就用] EDA

发表评论 取消回复