Correlation between continuous-numeric columns, and association between categorical columns


import pandas as pd
import numpy as np 
import scipy.stats as ss
from scipy.stats import chi2_contingency

df_train = pd.read_csv(r'Train_Data.csv')

emp_length_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,   
    '10+ years': 10 }
# Map the employment-length strings to numbers (apply raises a KeyError if emp_length
# contains a value outside the map, e.g. missing values)
df_train['emp_length_num'] = df_train['emp_length'].apply(lambda x: emp_length_map[x])

df_train.drop(["pymnt_plan", "application_type"], axis=1, inplace=True)  # Remove columns with a single value throughout
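
# A generic way to confirm such constant columns (a sketch, not part of the original post):
single_valued = [c for c in df_train.columns if df_train[c].nunique(dropna=False) <= 1]
print("Columns with a single unique value:", single_valued)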

df_corr = df_train.corr()  # Pearson correlation over the numeric columns only
                           # (newer pandas versions may need df_train.corr(numeric_only=True))

# Columns missing from the numeric correlation matrix are the categorical ones
categorical_cols = list(set(df_train.columns) - set(df_corr.columns))
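
# Cross-check (a sketch, not from the original post): assuming every categorical column
# was read in as object dtype, the same list can be obtained directly from the dtypes.
print(sorted(categorical_cols))
print(sorted(df_train.select_dtypes(include='object').columns))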

# Ref 1: Towards Data Science
def cramers_v(x, y):
    """Bias-corrected Cramer's V between two categorical series (symmetric, range 0 to 1)."""
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1), (rcorr-1)))
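
# Quick sanity check (a sketch, not part of the original post): two perfectly associated
# variables should score close to 1, two independent ones close to 0.
toy = pd.Series(['a'] * 50 + ['b'] * 50)
print(cramers_v(toy, toy))                          # ~0.98, pulled slightly below 1 by the bias/Yates corrections
print(cramers_v(toy, pd.Series(['x', 'y'] * 50)))   # 0.0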

# Ref 2: Stackoverflow
def cramers_v_original(confusion_matrix):
    """ Calculate Cramer's V statistic for categorical-categorical association.
        Unlike cramers_v above, this version takes a precomputed contingency table
        (a NumPy array) rather than the raw series.
        Uses the bias correction from Bergsma (2013),
        Journal of the Korean Statistical Society 42: 323-328.
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Cramer's V between each categorical column and the target, loan_status
for i in categorical_cols:
    confusion_matrix = pd.crosstab(df_train["loan_status"], df_train[i]).values
    print('{:<25} {}'.format(i, round(cramers_v_original(confusion_matrix), 4)))

OUTPUT:
verification_status       0.029
zip_code                  0.0
purpose                   0.0715
title                     0.0712
emp_title                 0.0
addr_state                0.0173
last_pymnt_d              0.4131
issue_d                   0.0882
grade                     0.1252
home_ownership            0.03
sub_grade                 0.1338
term                      0.0291
last_credit_pull_d        0.1225
earliest_cr_line          0.0
next_pymnt_d              0.0698
emp_length                0.0141
initial_list_status       0.04

# Ref 3: GitHub (dython) -- only the parts used below are reproduced here

import math
import numpy as np
import scipy.stats as ss
from collections import Counter

_REPLACE = 'replace'
_DROP = 'drop'
_DEFAULT_REPLACE_VALUE = 0.0

def conditional_entropy(x,
                        y,
                        nan_strategy=_REPLACE,
                        nan_replace_value=_DEFAULT_REPLACE_VALUE,
                        log_base: float = math.e):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: Conditional_entropy

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of measurements
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'.
    log_base: float, default = e
        specifying base for calculating entropy. Default is base e.

    Returns:
    --------
    float
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0.0
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy, log_base)
    return entropy
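
# conditional_entropy above also references remove_incomplete_samples (used when
# nan_strategy='drop'). The post does not reproduce that helper; a minimal sketch of
# what it has to do (drop positions where either sequence is missing) could be:
def remove_incomplete_samples(x, y):
    x = np.array(list(x), dtype=object)
    y = np.array(list(y), dtype=object)
    keep = np.array([xv == xv and yv == yv and xv is not None and yv is not None
                     for xv, yv in zip(x, y)], dtype=bool)   # NaN != NaN, same trick as below
    return x[keep], y[keep]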
	
# Location: ...\dython_master\dython\_private.py
def replace_nan_with_value(x, y, value):
    x = np.array([v if v == v and v is not None else value for v in x])  # NaN != NaN
    y = np.array([v if v == v and v is not None else value for v in y])
    return x, y
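
# Quick check (a sketch, not part of the original post): S(x|y) is 0 when x is fully
# determined by y, and equals the entropy of x when x and y are independent.
print(conditional_entropy(['a', 'a', 'b', 'b'], ['p', 'p', 'q', 'q']))   # 0.0
print(conditional_entropy(['a', 'a', 'b', 'b'], ['p', 'q', 'p', 'q']))   # ~0.693 = ln(2)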

# The uncertainty coefficient (Theil's U) is based on the conditional entropy of x given y.
# The function 'theils_u' is asymmetric: theils_u(x, y) generally differs from theils_u(y, x).
def theils_u(x, y):
    s_xy = conditional_entropy(x,y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n/total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x
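
# Quick check of the asymmetry (a sketch, not part of the original post): here y is a
# function of x, but x is not a function of y, so the two directions give different values.
xs = ['a', 'a', 'b', 'b', 'c', 'c']
ys = ['p', 'p', 'p', 'p', 'q', 'q']
print(round(theils_u(ys, xs), 3))   # 1.0  -- knowing x removes all uncertainty about y
print(round(theils_u(xs, ys), 3))   # ~0.58 -- knowing y only partly determines x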
		
# For each categorical column: Cramer's V (symmetric), Theil's U of loan_status given the
# column, and Theil's U of the column given loan_status
for i in categorical_cols:
    print((i + (' '*20))[0:20], "\t", round(cramers_v(df_train[i], df_train["loan_status"]), 3), 
          "\t", round(theils_u(df_train["loan_status"], df_train[i]), 3),
          "\t", round(theils_u(df_train[i], df_train["loan_status"]), 3))

OUTPUT:
verification_status  	 0.029 	 0.004 	 0.0
zip_code             	 0.0 	 0.177 	 0.004
purpose              	 0.071 	 0.019 	 0.002
title                	 0.071 	 0.019 	 0.002
emp_title            	 0.0 	 0.63 	 0.011
addr_state           	 0.017 	 0.018 	 0.001
last_pymnt_d         	 0.413 	 0.342 	 0.074
issue_d              	 0.088 	 0.034 	 0.002
grade                	 0.125 	 0.049 	 0.004
home_ownership       	 0.03 	 0.004 	 0.001
sub_grade            	 0.134 	 0.06 	 0.003
term                 	 0.029 	 0.003 	 0.001
last_credit_pull_d   	 0.123 	 0.029 	 0.016
earliest_cr_line     	 0.0 	 0.124 	 0.003
next_pymnt_d         	 0.07 	 0.014 	 0.004
emp_length           	 0.014 	 0.004 	 0.0
initial_list_status  	 0.04 	 0.006 	 0.001 


# For a pair of one categorical feature and one continuous feature. The correlation ratio
# ('correlation_ratio') is asymmetric: the first argument must be the categories and the
# second the continuous measurements, because the function averages the measurements within
# each category. Swapping them would fail at "y_avg_array[i] = np.average(cat_measures)"
# when the measurements are not numeric.
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator/denominator)
    return eta
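
# Sanity check (a sketch, not part of the original post): eta is 0 when every category has
# the same mean measurement, and 1 when the measurement is constant within each category
# but differs across categories.
cats = ['a', 'a', 'b', 'b']
print(correlation_ratio(cats, np.array([1.0, 2.0, 1.0, 2.0])))   # 0.0
print(correlation_ratio(cats, np.array([1.0, 1.0, 2.0, 2.0])))   # 1.0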
	
# Correlation ratio of each categorical column against loan_status (numeric in this dataset,
# so it can be averaged)
for i in categorical_cols:
    print((i + (' '*20))[0:20], "\t", round(correlation_ratio(df_train[i], df_train["loan_status"]), 3))

OUTPUT:
verification_status  	 0.032
zip_code             	 0.245
purpose              	 0.078
title                	 0.078
emp_title            	 0.735
addr_state           	 0.067
last_pymnt_d         	 0.414
issue_d              	 0.092
grade                	 0.127
home_ownership       	 0.033
sub_grade            	 0.145
term                 	 0.031
last_credit_pull_d   	 0.126
earliest_cr_line     	 0.189
next_pymnt_d         	 0.072
emp_length           	 0.033
initial_list_status  	 0.042 

# Ref 4: OpenCodez 

Dataset used can be found here: Google Drive 
