Thursday, September 17, 2020

Improving a Classifier (ML) Using Snorkel's Slicing Technique


The dataset we are using is the Iris flower species dataset, which contains 150 data points (Download from here).

We have a dependency here to draw the confusion matrix. The code file name is: DrawConfusionMatrix.py

Content:

# Ref: Scikit-Learn 

import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Blues,
                          use_seaborn = False):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D NumPy array holding the confusion matrix. The axes are labelled
        'True label' (rows) and 'Predicted label' (columns).
    classes : sequence of tick labels, one per row/column of `cm`.
    normalize : when True, every row is divided by its row sum before
        printing and plotting, and cell values are rendered with two decimals.
    title : title placed above the plot.
    cmap : colormap used by the plain matplotlib rendering (the seaborn
        rendering keeps seaborn's default colormap).
    use_seaborn : when True the matrix is drawn with `sns.heatmap`,
        otherwise with `plt.imshow` plus manually placed cell annotations.

    Returns
    -------
    None. The function only prints and draws.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # One annotation format for both renderers: a normalized matrix holds
    # floats, which cannot be formatted with 'd'.
    fmt = '.2f' if normalize else 'd'

    if not use_seaborn:
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.colorbar()

        # Switch the annotation colour on dark cells so the text stays readable.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        # imshow centres cell k on integer coordinate k.
        tick_marks = np.arange(len(classes))

    else:

        ax = sns.heatmap(cm, annot=True, fmt=fmt) #notation: "annot" not "annote"

        # Pin the y-limits to the full extent of the heatmap (row 0 at the
        # top). Setting absolute limits avoids the half-cut first/last rows
        # seen on some matplotlib releases without adding padding on others.
        ax.set_ylim(cm.shape[0], 0)

        # Heatmap cell k spans [k, k + 1], so its centre is at k + 0.5.
        # There must be exactly one tick position per class label.
        tick_marks = np.arange(len(classes)) + 0.5

    plt.title(title)
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
Now, the main problem: 

# Import libraries.

import DrawConfusionMatrix as dcm
import importlib # The imp module was deprecated in Python 3.4 in favor of the importlib module.
importlib.reload(dcm)

import pandas as pd
import numpy as np
from collections import Counter

from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy
from snorkel.augmentation import PandasTFApplier

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


df = pd.read_csv('datasets_19_420_Iris.csv')

# Show the mean and standard deviation of every numeric column, one species
# at a time. describe() also provides:
# ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
for species_name in set(df.Species):
    print(species_name)
    species_rows = df[df.Species == species_name]
    print(species_rows.describe().loc[['mean', 'std'], :], '\n')
	
Iris-versicolor
            Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
mean  75.50000       5.936000      2.770000       4.260000      1.326000
std   14.57738       0.516171      0.313798       0.469911      0.197753 

Iris-virginica
             Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
mean  125.50000        6.58800      2.974000       5.552000       2.02600
std    14.57738        0.63588      0.322497       0.551895       0.27465 

Iris-setosa
            Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
mean  25.50000        5.00600      3.418000       1.464000       0.24400
std   14.57738        0.35249      0.381024       0.173511       0.10721  

 
features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

classes = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']

# describe() summary of the rows of each species, keyed by species name.
desc_dict = {
    species_name: df[df.Species == species_name].describe()
    for species_name in classes
}

# Flag every row loaded from the CSV as 'Train'.
df['Train'] = 'Train'

# np.random.normal(loc, scale) draws a random sample from a normal distribution with mean `loc` and standard deviation `scale`

@transformation_function(pre = [])
def get_new_instance_for_this_class(x):
    """
    Overwrite the four measurements of `x` with freshly sampled values.

    For each measurement column, the mean and standard deviation recorded in
    `desc_dict` for the species of `x` are rounded to two decimals, scaled by
    100, used as the parameters of one `np.random.normal` draw, and the draw
    is scaled back by dividing by 100. The row is then flagged with
    Train = 'Test' and returned.
    """
    species_stats = desc_dict[x.Species]

    # Same column order as the sequence of random draws must follow.
    for column in ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'):
        centre = round(species_stats.loc['mean', column], 2) * 100
        spread = round(species_stats.loc['std', column], 2) * 100
        setattr(x, column, np.random.normal(centre, spread) / 100)

    x.Train = 'Test'
    return x

# The only transformation function in play is the per-species resampler.
tfs = [ get_new_instance_for_this_class ]

# Policy that picks transformation functions at random. With keep_original=True
# and n_per_original=5 the 150 input rows become 150 + 150 * 5 = 900 rows,
# as the printed sizes show.
random_policy = RandomPolicy(
    len(tfs), sequence_length=2, n_per_original=5, keep_original=True
    # n_per_original (int) – Number of transformed data points per original
)

# Apply the policy to every row of df to build the augmented frame.
tf_applier = PandasTFApplier(tfs, random_policy)
df_train_augmented = tf_applier.apply(df)

print(f"Original training set size: {len(df)}")
print(f"Augmented training set size: {len(df_train_augmented)}") 

Original training set size: 150
Augmented training set size: 900 

# `clf` has to exist and be fitted before it can predict.
# NOTE(review): the cell that built `clf` is missing from the post. An SVC
# with probability=True is reconstructed here because predict_proba is used
# below and the later SVC cell follows the same fit pattern - confirm.
# NOTE(review): fitting on the whole augmented frame means the 'Test' rows
# evaluated below are also part of the fit data.
clf = svm.SVC(gamma = 'auto', probability = True)
clf.fit(df_train_augmented[features], df_train_augmented['Species'])

# Evaluate on the generated rows only (the ones flagged 'Test').
df_test = df_train_augmented[df_train_augmented.Train == 'Test']

pred = clf.predict(df_test[features])

pred_probs = clf.predict_proba(df_test[features])
# Make Note Of >> AttributeError: predict_proba is not available when 'probability=False'

print(Counter(pred))
print("Accuracy: {:.3f}".format(accuracy_score(df_test['Species'], pred)))

cm = confusion_matrix(df_test['Species'], pred)
print("Confusion matrix:\n{}".format(cm))

Counter({'Iris-versicolor': 252, 'Iris-setosa': 250, 'Iris-virginica': 248})
Accuracy: 0.968
Confusion matrix:
[[250   0   0]
 [  0 239  11]
 [  0  13 237]] 

# Short display labels for the plot axes. This rebinds `classes`, which
# earlier held the full 'Iris-...' species names in a different order.
# NOTE(review): the labels must follow the row/column order of `cm`;
# presumably that is sorted label order (setosa, versicolor, virginica) - confirm.
classes = ['setosa', 'versicolor', 'virginica']

dcm.plot_confusion_matrix(cm, classes = classes, use_seaborn = True) 

# This plot is for 'Support Vector Machine' based classifier.

# This plot is for 'Random Forest' based classifier.
Here we see that there are some misclassified data points for classes 'Versicolor' and 'Verginica'. 'Setosa' has not been misclassified by either SVM or RandomForest. Next, we would slice the dataframe into 'setosa' and 'not setosa' dataframes. Because we are not having issues with 'setosa' data points, we would re-train a classifier on the other two classes viz. 'versicolor' and 'virginica'. import re from snorkel.slicing import slicing_function @slicing_function() def not_setosa(x): return x.Species != 'Iris-setosa' sfs = [not_setosa] # ~ ~ ~ #Store slice metadata in S from snorkel.slicing import PandasSFApplier applier = PandasSFApplier(sfs) S_test = applier.apply(df_test) # ~ ~ ~ from snorkel.analysis import Scorer scorer = Scorer(metrics=["f1_micro", "f1_macro"]) # Make Note Of >> ValueError: f1 not supported for multiclass. # Try f1_micro or f1_macro instead. # ~ ~ ~ from sklearn import preprocessing le = preprocessing.LabelEncoder() le.fit(df_test['Species']) scorer.score_slices( S=S_test, golds=le.transform(df_test['Species']), preds=le.transform(pred), probs=pred_probs, as_dataframe=True )
from snorkel.slicing import slice_dataframe df_not_setosa = slice_dataframe(df_train_augmented, not_setosa) from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier(max_depth=4, random_state=0, n_estimators = 100) RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=4, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=0, verbose=0, warm_start=False) df_test_rfc = df_not_setosa[df_not_setosa.Train == 'Test'] pred_rfc = rfc.predict(df_test_rfc[features]) print(Counter(pred_rfc)) print("Accuracy: {:.3f}".format(accuracy_score(df_test_rfc['Species'], pred_rfc))) cm = confusion_matrix(df_test_rfc['Species'], pred_rfc) print("Confusion matrix:\n{}".format(cm)) Counter({'Iris-versicolor': 251, 'Iris-virginica': 249}) Accuracy: 0.990 Confusion matrix: [[248 2] [ 3 247]] dcm.plot_confusion_matrix(cm, classes = ['versicolor', 'virginica'], use_seaborn = True) Using RandomForestClassifier on sliced dataset:
We also have the score for SVC, it is not as good as RandomForestClassifier: svc = svm.SVC(gamma = 'auto', probability=True) svc.fit(df_not_setosa[features], df_not_setosa['Species']) SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False) pred_svc = svc.predict(df_test_rfc[features]) print(Counter(pred_svc)) print("Accuracy: {:.3f}".format(accuracy_score(df_test_rfc['Species'], pred_svc))) cm = confusion_matrix(df_test_rfc['Species'], pred_svc) print("Confusion matrix:\n{}".format(cm)) Counter({'Iris-versicolor': 251, 'Iris-virginica': 249}) Accuracy: 0.986 Confusion matrix: [[247 3] [ 4 246]] Reference % Slice-based Learning: a Programming Model for Residual Learning in Critical Data Slices

No comments:

Post a Comment