import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.stats import chi2_contingency
df_train = pd.read_csv(r'Train_Data.csv')
emp_length_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10
}
df_train['emp_length_num'] = df_train['emp_length'].map(emp_length_map)  # map() leaves unmapped/NaN values as NaN instead of raising a KeyError
df_train.drop(["pymnt_plan", "application_type"], axis = 1, inplace = True) # Removing columns with single value throughout
df_corr = df_train.corr(numeric_only=True)  # numeric_only=True is required on pandas >= 2.0; older versions silently skip non-numeric columns
categorical_cols = list(set(df_train.columns) - set(df_corr.columns))  # columns that did not make it into the numeric correlation matrix are treated as categorical
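An equivalent way to pick out the categorical columns, without going through the correlation matrix, is to select the object-typed columns directly; a minimal sketch (assuming the non-numeric columns are all stored as strings):
# Hedged alternative: the object-dtype columns are the categorical ones here.
categorical_cols_alt = df_train.select_dtypes(include=['object']).columns.tolist()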
# Ref 1: Towards Data Science
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
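As a quick sanity check (my own toy example, not from the referenced article), the bias-corrected Cramer's V should come out near 1.0 when two columns carry the same information and near 0.0 when they are unrelated:
# Hedged sanity check on synthetic data (not the Lending Club columns).
a = pd.Series(['x', 'y', 'z'] * 100)
print(round(cramers_v(a, a), 3))                        # ~1.0: identical columns
rng = np.random.default_rng(0)
b = pd.Series(rng.choice(['p', 'q'], size=300))
print(round(cramers_v(a, b), 3))                        # ~0.0: independent columns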
# Ref 2: Stackoverflow
def cramers_v_original(confusion_matrix):
    """Calculate Cramer's V statistic for categorical-categorical association.
    Uses the bias correction from Bergsma, Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.
    Expects a plain NumPy contingency table (e.g. pd.crosstab(...).values).
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
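The two helpers differ only in their input: cramers_v takes the raw series and builds the crosstab itself, while cramers_v_original expects the contingency table. On the same pair of columns they should agree, for example:
# Both calls should print the same value for the same pair of columns.
cm = pd.crosstab(df_train["loan_status"], df_train["grade"]).values
print(round(cramers_v(df_train["loan_status"], df_train["grade"]), 4))
print(round(cramers_v_original(cm), 4))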
for i in categorical_cols:
    confusion_matrix = pd.crosstab(df_train["loan_status"], df_train[i]).values
    print('{:<25} {}'.format(i, round(cramers_v_original(confusion_matrix), 4)))
OUTPUT:
verification_status 0.029
zip_code 0.0
purpose 0.0715
title 0.0712
emp_title 0.0
addr_state 0.0173
last_pymnt_d 0.4131
issue_d 0.0882
grade 0.1252
home_ownership 0.03
sub_grade 0.1338
term 0.0291
last_credit_pull_d 0.1225
earliest_cr_line 0.0
next_pymnt_d 0.0698
emp_length 0.0141
initial_list_status 0.04
# Ref 3: GitHub
import math
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
from collections import Counter
_REPLACE = 'replace'
_DROP = 'drop'
_DROP_SAMPLES = 'drop_samples'
_DROP_FEATURES = 'drop_features'
_SKIP = 'skip'
_DEFAULT_REPLACE_VALUE = 0.0
def conditional_entropy(x,
                        y,
                        nan_strategy=_REPLACE,
                        nan_replace_value=_DEFAULT_REPLACE_VALUE,
                        log_base: float = math.e):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: Conditional_entropy
    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of measurements
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'.
    log_base : float, default = e
        Base used for calculating entropy. Default is base e.
    Returns:
    --------
    float
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0.0
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy, log_base)
    return entropy
# Location: ...\dython_master\dython\_private.py
def replace_nan_with_value(x, y, value):
    x = np.array([v if v == v and v is not None else value for v in x])  # NaN != NaN
    y = np.array([v if v == v and v is not None else value for v in y])
    return x, y
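conditional_entropy also references remove_incomplete_samples for the 'drop' strategy; that helper is not reproduced in the post. A minimal sketch of what it does (my own approximation, not copied verbatim from dython) would be:
# Hedged sketch of the companion helper behind nan_strategy='drop'.
def remove_incomplete_samples(x, y):
    x = np.asarray(x, dtype=object)
    y = np.asarray(y, dtype=object)
    keep = ~(pd.isnull(x) | pd.isnull(y))  # keep positions where both values are present
    return x[keep], y[keep]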
# Theil's U (the uncertainty coefficient) is based on the conditional entropy of x given y.
# The function 'theils_u' is asymmetric: theils_u(x, y) != theils_u(y, x) in general.
def theils_u(x, y):
    s_xy = conditional_entropy(x, y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n / total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x
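A small illustration of that asymmetry on toy data (my own example, not from the post): when one column fully determines another but not the other way round, the two directions of Theil's U differ.
# 'detail' fully determines 'coarse', but 'coarse' only partly determines 'detail'.
detail = pd.Series(['a1', 'a2', 'b1', 'b2'] * 50)
coarse = detail.str[0]                       # 'a' or 'b'
print(round(theils_u(coarse, detail), 3))    # U(coarse|detail) = 1.0
print(round(theils_u(detail, coarse), 3))    # U(detail|coarse) = 0.5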
# Columns printed: Cramer's V, U(loan_status | column), U(column | loan_status)
for i in categorical_cols:
    print((i + (' ' * 20))[0:20], "\t", round(cramers_v(df_train[i], df_train["loan_status"]), 3),
          "\t", round(theils_u(df_train["loan_status"], df_train[i]), 3),
          "\t", round(theils_u(df_train[i], df_train["loan_status"]), 3))
OUTPUT (columns: Cramer's V | U(loan_status|column) | U(column|loan_status)):
verification_status 0.029 0.004 0.0
zip_code 0.0 0.177 0.004
purpose 0.071 0.019 0.002
title 0.071 0.019 0.002
emp_title 0.0 0.63 0.011
addr_state 0.017 0.018 0.001
last_pymnt_d 0.413 0.342 0.074
issue_d 0.088 0.034 0.002
grade 0.125 0.049 0.004
home_ownership 0.03 0.004 0.001
sub_grade 0.134 0.06 0.003
term 0.029 0.003 0.001
last_credit_pull_d 0.123 0.029 0.016
earliest_cr_line 0.0 0.124 0.003
next_pymnt_d 0.07 0.014 0.004
emp_length 0.014 0.004 0.0
initial_list_status 0.04 0.006 0.001
# Correlation ratio (eta): for a pair of a categorical feature and a continuous feature. The function 'correlation_ratio' is asymmetric: the first argument must be the categories and the second the numeric measurements, because it averages the measurements within each category. Swapping them would feed non-numeric categories into the line "y_avg_array[i] = np.average(cat_measures)" and raise an error.
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator / denominator)
    return eta
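A quick toy illustration of the argument order (my own synthetic example, assuming numeric measurements such as a 0/1-encoded loan_status): a category that explains the measurement gives an eta near 1, while an unrelated category gives an eta near 0.
# Hedged toy example (synthetic data, not the Lending Club columns).
cats = np.array(['low', 'low', 'high', 'high'] * 50)
vals = np.where(cats == 'high', 1.0, 0.0) + np.random.default_rng(1).normal(0, 0.05, 200)
print(round(correlation_ratio(cats, vals), 3))                                              # close to 1
print(round(correlation_ratio(np.random.default_rng(2).choice(['a', 'b'], 200), vals), 3))  # close to 0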
for i in categorical_cols:
    print((i + (' ' * 20))[0:20], "\t", round(correlation_ratio(df_train[i], df_train["loan_status"]), 3))
OUTPUT:
verification_status 0.032
zip_code 0.245
purpose 0.078
title 0.078
emp_title 0.735
addr_state 0.067
last_pymnt_d 0.414
issue_d 0.092
grade 0.127
home_ownership 0.033
sub_grade 0.145
term 0.031
last_credit_pull_d 0.126
earliest_cr_line 0.189
next_pymnt_d 0.072
emp_length 0.033
initial_list_status 0.042
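The seaborn, matplotlib and scipy.cluster.hierarchy imports above are not used in the snippets shown; one natural follow-up (my own sketch, not from the referenced posts) is to assemble pairwise Theil's U values for a few categorical columns into a matrix and plot it as a heatmap:
# Hedged sketch: pairwise Theil's U heatmap for a few low-cardinality categorical columns.
# The column subset is illustrative; the full set works too but is slower to compute.
cols = ["grade", "sub_grade", "home_ownership", "purpose", "term", "verification_status"]
u_matrix = pd.DataFrame(np.zeros((len(cols), len(cols))), index=cols, columns=cols)
for c1 in cols:
    for c2 in cols:
        u_matrix.loc[c1, c2] = theils_u(df_train[c1], df_train[c2])  # U(c1 | c2), asymmetric
plt.figure(figsize=(8, 6))
sns.heatmap(u_matrix, annot=True, fmt=".2f", cmap="Blues")
plt.title("Theil's U between categorical columns")
plt.show()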
# Ref 4: OpenCodez
Dataset used can be found here: Google Drive