Correlation between continuous-numeric columns, and between categorical columns

import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.stats import chi2_contingency

df_train = pd.read_csv(r'Train_Data.csv')

# Map employment-length strings to numbers
emp_length_map = {
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4,
    '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9,
    '10+ years': 10
}
df_train['emp_length_num'] = df_train['emp_length'].apply(lambda x: emp_length_map[x])

# Removing columns with a single value throughout
df_train.drop(["pymnt_plan", "application_type"], axis=1, inplace=True)

# Pearson correlation covers the continuous-numeric columns;
# whatever it leaves out is treated as categorical.
df_corr = df_train.corr()
categorical_cols = list(set(df_train.columns) - set(df_corr.columns))

# Ref 1: Towards Data Science
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

# Ref 2: Stack Overflow
def cramers_v_original(confusion_matrix):
    """
    Calculate Cramer's V statistic for categorical-categorical association.
    Uses the bias correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

for i in categorical_cols:
    confusion_matrix = pd.crosstab(df_train["loan_status"], df_train[i]).values
    print('{:<25} {}'.format(i, round(cramers_v_original(confusion_matrix), 4)))

OUTPUT:

verification_status       0.029
zip_code                  0.0
purpose                   0.0715
title                     0.0712
emp_title                 0.0
addr_state                0.0173
last_pymnt_d              0.4131
issue_d                   0.0882
grade                     0.1252
home_ownership            0.03
sub_grade                 0.1338
term                      0.0291
last_credit_pull_d        0.1225
earliest_cr_line          0.0
next_pymnt_d              0.0698
emp_length                0.0141
initial_list_status       0.04
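As a quick sanity check of the bias-corrected statistic, the toy columns below (hypothetical data, not part of Train_Data.csv) contrast a perfectly associated pair with an independent pair: the first should score close to 1 and the second close to 0 after the correction.

toy = pd.DataFrame({
    'a': ['x', 'x', 'y', 'y'] * 50,
    'b': ['p', 'p', 'q', 'q'] * 50   # 'b' is fully determined by 'a'
})
print(round(cramers_v(toy['a'], toy['b']), 3))    # ~0.99, i.e. close to 1

rng = np.random.default_rng(0)
toy['c'] = rng.choice(['p', 'q'], size=len(toy))  # 'c' is unrelated to 'a'
print(round(cramers_v(toy['a'], toy['c']), 3))    # ~0.0 after the bias correction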
# Ref 3: GitHub (dython)
import math
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
from collections import Counter

_REPLACE = 'replace'
_DROP = 'drop'
_DROP_SAMPLES = 'drop_samples'
_DROP_FEATURES = 'drop_features'
_SKIP = 'skip'
_DEFAULT_REPLACE_VALUE = 0.0

def conditional_entropy(x,
                        y,
                        nan_strategy=_REPLACE,
                        nan_replace_value=_DEFAULT_REPLACE_VALUE,
                        log_base: float = math.e):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: Conditional_entropy

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of measurements
    nan_strategy : string, default = 'replace'
        How to handle missing values: either 'drop' to remove samples with
        missing values, or 'replace' to replace all missing values with
        nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values. Only applicable when
        nan_strategy is set to 'replace'.
    log_base : float, default = e
        Base used for calculating entropy.

    Returns:
    --------
    float
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        # remove_incomplete_samples is not reproduced here; this post only
        # uses the default 'replace' strategy.
        x, y = remove_incomplete_samples(x, y)
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0.0
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy, log_base)
    return entropy

# Location: ...\dython_master\dython\_private.py
def replace_nan_with_value(x, y, value):
    x = np.array([v if v == v and v is not None else value for v in x])  # NaN != NaN
    y = np.array([v if v == v and v is not None else value for v in y])
    return x, y

# Theil's U (the Uncertainty Coefficient) is based on the conditional entropy
# between x and y. The function is asymmetric: theils_u(x, y) measures how much
# knowing y reduces the uncertainty in x, which generally differs from
# theils_u(y, x). (A small toy-data illustration of the asymmetry appears at the
# end of this post.)
def theils_u(x, y):
    s_xy = conditional_entropy(x, y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n / total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x

for i in categorical_cols:
    print((i + (' ' * 20))[0:20], "\t",
          round(cramers_v(df_train[i], df_train["loan_status"]), 3), "\t",
          round(theils_u(df_train["loan_status"], df_train[i]), 3), "\t",
          round(theils_u(df_train[i], df_train["loan_status"]), 3))

OUTPUT (columns: Cramer's V, Theil's U of loan_status given the feature, Theil's U of the feature given loan_status):

verification_status   0.029   0.004   0.0
zip_code              0.0     0.177   0.004
purpose               0.071   0.019   0.002
title                 0.071   0.019   0.002
emp_title             0.0     0.63    0.011
addr_state            0.017   0.018   0.001
last_pymnt_d          0.413   0.342   0.074
issue_d               0.088   0.034   0.002
grade                 0.125   0.049   0.004
home_ownership        0.03    0.004   0.001
sub_grade             0.134   0.06    0.003
term                  0.029   0.003   0.001
last_credit_pull_d    0.123   0.029   0.016
earliest_cr_line      0.0     0.124   0.003
next_pymnt_d          0.07    0.014   0.004
emp_length            0.014   0.004   0.0
initial_list_status   0.04    0.006   0.001

# For a pair of one categorical feature and one continuous feature. The method
# 'correlation_ratio' is also asymmetric: it averages the measurements within
# each category, so swapping the category with the measure would fail at the
# line "y_avg_array[i] = np.average(cat_measures)".
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator / denominator)
    return eta

for i in categorical_cols:
    print((i + (' ' * 20))[0:20], "\t",
          round(correlation_ratio(df_train[i], df_train["loan_status"]), 3))

OUTPUT:

verification_status   0.032
zip_code              0.245
purpose               0.078
title                 0.078
emp_title             0.735
addr_state            0.067
last_pymnt_d          0.414
issue_d               0.092
grade                 0.127
home_ownership        0.033
sub_grade             0.145
term                  0.031
last_credit_pull_d    0.126
earliest_cr_line      0.189
next_pymnt_d          0.072
emp_length            0.033
initial_list_status   0.042

# Ref 4: OpenCodez

Dataset used can be found here: Google Drive
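To make the two asymmetries above concrete, here is a minimal toy-data sketch that reuses the theils_u and correlation_ratio functions defined earlier. The fruit/colour and cats/vals arrays are hypothetical and not part of Train_Data.csv.

# 'fruit' fully determines 'colour', but 'colour' does not determine 'fruit'
# (two different fruits are red), so the two directions of Theil's U differ.
fruit  = ['apple', 'apple', 'banana', 'banana', 'cherry', 'cherry']
colour = ['red',   'red',   'yellow', 'yellow', 'red',    'red']
print(round(theils_u(colour, fruit), 3))   # U(colour | fruit) = 1.0
print(round(theils_u(fruit, colour), 3))   # U(fruit | colour): about 0.58, lower

# correlation_ratio expects (categories, measurements), in that order, and the
# measurements must be numeric because of np.average. Here the category fully
# determines the value, so eta comes out as 1.0.
cats = np.array(['a', 'a', 'b', 'b', 'c', 'c'])
vals = np.array([1.0, 1.0, 5.0, 5.0, 9.0, 9.0])
print(correlation_ratio(cats, vals))       # 1.0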