Improving a Classifier (ML) Using Snorkel's Slicing Technique
Thursday, September 17, 2020

The dataset we are using is the '150 data points strong' Iris flower species dataset. We have one dependency of our own here, a small helper module for drawing confusion matrices, saved as DrawConfusionMatrix.py:

# Ref: adapted from the scikit-learn confusion-matrix plotting example.
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          use_seaborn=False):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    if not use_seaborn:
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.colorbar()
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        tick_marks = np.arange(len(classes))
    else:
        ax = sns.heatmap(cm, annot=True, fmt='d')  # note: "annot", not "annote"; fmt='d' prints the values as integers
        # Work around the matplotlib 3.1.1 issue that clips the top and bottom heatmap rows.
        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        tick_marks = np.arange(len(classes)) + 0.5  # centre the class labels on the heatmap cells

    plt.title(title)
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
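As a quick smoke test, the helper can be called on a made-up confusion matrix; the class names and counts below are purely illustrative and have nothing to do with the Iris data:

import numpy as np
import matplotlib.pyplot as plt
import DrawConfusionMatrix as dcm

toy_cm = np.array([[48, 2],
                   [5, 45]])  # hypothetical 2-class results, for illustration only
dcm.plot_confusion_matrix(toy_cm, classes=['class A', 'class B'], use_seaborn=True)
plt.show()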
Now, the main problem:

# Import libraries.
import DrawConfusionMatrix as dcm
import importlib  # the imp module was deprecated in Python 3.4 in favour of importlib
importlib.reload(dcm)

import pandas as pd
import numpy as np
from collections import Counter

from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy
from snorkel.augmentation import PandasTFApplier

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

df = pd.read_csv('datasets_19_420_Iris.csv')

for i in set(df.Species):
    # describe() rows: ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    print(i)
    print(df[df.Species == i].describe().loc[['mean', 'std'], :], '\n')

Iris-versicolor
            Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
mean  75.50000       5.936000      2.770000       4.260000      1.326000
std   14.57738       0.516171      0.313798       0.469911      0.197753

Iris-virginica
             Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
mean  125.50000        6.58800      2.974000       5.552000       2.02600
std    14.57738        0.63588      0.322497       0.551895       0.27465

Iris-setosa
            Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
mean  25.50000        5.00600      3.418000       1.464000       0.24400
std   14.57738        0.35249      0.381024       0.173511       0.10721

These per-class means and standard deviations are what the augmentation below samples from: each synthetic data point draws its four measurements from a normal distribution fitted to its own class.

features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
classes = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']

desc_dict = {}
for i in classes:
    desc_dict[i] = df[df.Species == i].describe()

df['Train'] = 'Train'  # original rows are tagged 'Train'; the transformation function tags synthetic rows 'Test'

# np.random.normal(loc, scale) draws a sample from a normal distribution
# with the given mean and standard deviation.
@transformation_function(pre=[])
def get_new_instance_for_this_class(x):
    x.SepalLengthCm = np.random.normal(
        round(desc_dict[x.Species].loc[['mean'], ['SepalLengthCm']].iloc[0, 0], 2) * 100,
        round(desc_dict[x.Species].loc[['std'], ['SepalLengthCm']].iloc[0, 0], 2) * 100) / 100
    x.SepalWidthCm = np.random.normal(
        round(desc_dict[x.Species].loc[['mean'], ['SepalWidthCm']].iloc[0, 0], 2) * 100,
        round(desc_dict[x.Species].loc[['std'], ['SepalWidthCm']].iloc[0, 0], 2) * 100) / 100
    x.PetalLengthCm = np.random.normal(
        round(desc_dict[x.Species].loc[['mean'], ['PetalLengthCm']].iloc[0, 0], 2) * 100,
        round(desc_dict[x.Species].loc[['std'], ['PetalLengthCm']].iloc[0, 0], 2) * 100) / 100
    x.PetalWidthCm = np.random.normal(
        round(desc_dict[x.Species].loc[['mean'], ['PetalWidthCm']].iloc[0, 0], 2) * 100,
        round(desc_dict[x.Species].loc[['std'], ['PetalWidthCm']].iloc[0, 0], 2) * 100) / 100
    x.Train = 'Test'
    return x

tfs = [get_new_instance_for_this_class]

random_policy = RandomPolicy(
    len(tfs),
    sequence_length=2,
    n_per_original=5,   # number of transformed data points per original
    keep_original=True,
)

tf_applier = PandasTFApplier(tfs, random_policy)
df_train_augmented = tf_applier.apply(df)

print(f"Original training set size: {len(df)}")
print(f"Augmented training set size: {len(df_train_augmented)}")

Original training set size: 150
Augmented training set size: 900
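Next we score a classifier, clf, on the synthetic 'Test' rows. The cell that trains clf is not shown above; judging from the plot caption further down and the predict_proba note, it is an SVM fitted on the original 150 data points, roughly along these lines (a reconstruction that reuses df and features from above, not necessarily the author's exact cell):

from sklearn import svm

# Assumed training step: probability=True is required for the predict_proba call below;
# gamma='auto' mirrors the SVC used later on the sliced data.
clf = svm.SVC(gamma='auto', probability=True)
clf.fit(df[features], df['Species'])  # fit on the original (non-augmented) data points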
df_test = df_train_augmented[df_train_augmented.Train == 'Test']

pred = clf.predict(df_test[features])
pred_probs = clf.predict_proba(df_test[features])
# Note: predict_proba raises AttributeError when the SVC was created with probability=False.

print(Counter(pred))
print("Accuracy: {:.3f}".format(accuracy_score(df_test['Species'], pred)))

cm = confusion_matrix(df_test['Species'], pred)
print("Confusion matrix:\n{}".format(cm))

Counter({'Iris-versicolor': 252, 'Iris-setosa': 250, 'Iris-virginica': 248})
Accuracy: 0.968
Confusion matrix:
[[250   0   0]
 [  0 239  11]
 [  0  13 237]]

classes = ['setosa', 'versicolor', 'virginica']
dcm.plot_confusion_matrix(cm, classes=classes, use_seaborn=True)

(Plot: confusion matrix for the Support Vector Machine based classifier.)
(Plot: confusion matrix for the Random Forest based classifier; its training cell is not shown in this snippet.)

Here we see that some data points of the classes 'versicolor' and 'virginica' are misclassified, while 'setosa' is never misclassified by either the SVM or the Random Forest. Next, we slice the dataframe into 'setosa' and 'not setosa' parts. Since the 'setosa' data points cause no trouble, we re-train a classifier on just the other two classes, 'versicolor' and 'virginica'.

from snorkel.slicing import slicing_function

@slicing_function()
def not_setosa(x):
    return x.Species != 'Iris-setosa'

sfs = [not_setosa]

# Store the slice metadata in S.
from snorkel.slicing import PandasSFApplier

applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

from snorkel.analysis import Scorer

scorer = Scorer(metrics=["f1_micro", "f1_macro"])
# Note: the plain 'f1' metric raises ValueError: f1 not supported for multiclass.
# Use f1_micro or f1_macro instead.

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df_test['Species'])

scorer.score_slices(
    S=S_test,
    golds=le.transform(df_test['Species']),
    preds=le.transform(pred),
    probs=pred_probs,
    as_dataframe=True,
)

from snorkel.slicing import slice_dataframe

df_not_setosa = slice_dataframe(df_train_augmented, not_setosa)

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=4, random_state=0, n_estimators=100)
rfc.fit(df_not_setosa[features], df_not_setosa['Species'])  # fit call not in the original snippet; implied by the estimator repr below, mirroring the SVC cell further down

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

df_test_rfc = df_not_setosa[df_not_setosa.Train == 'Test']

pred_rfc = rfc.predict(df_test_rfc[features])
print(Counter(pred_rfc))
print("Accuracy: {:.3f}".format(accuracy_score(df_test_rfc['Species'], pred_rfc)))

cm = confusion_matrix(df_test_rfc['Species'], pred_rfc)
print("Confusion matrix:\n{}".format(cm))

Counter({'Iris-versicolor': 251, 'Iris-virginica': 249})
Accuracy: 0.990
Confusion matrix:
[[248   2]
 [  3 247]]

dcm.plot_confusion_matrix(cm, classes=['versicolor', 'virginica'], use_seaborn=True)

(Plot: confusion matrix for the RandomForestClassifier on the sliced dataset.)

We also have the score for the SVC; it is not quite as good as the RandomForestClassifier:

svc = svm.SVC(gamma='auto', probability=True)
svc.fit(df_not_setosa[features], df_not_setosa['Species'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

pred_svc = svc.predict(df_test_rfc[features])
print(Counter(pred_svc))
print("Accuracy: {:.3f}".format(accuracy_score(df_test_rfc['Species'], pred_svc)))

cm = confusion_matrix(df_test_rfc['Species'], pred_svc)
print("Confusion matrix:\n{}".format(cm))

Counter({'Iris-versicolor': 251, 'Iris-virginica': 249})
Accuracy: 0.986
Confusion matrix:
[[247   3]
 [  4 246]]
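The post stops at comparing the two specialists on the 'not setosa' slice, but one simple way to actually use them together (not part of the original post) is to route predictions: keep the base classifier's answer when it predicts 'setosa', and defer to the slice specialist otherwise. The sketch below assumes clf, rfc, df_test, features and accuracy_score as defined above; for neural models, Snorkel also provides a SliceAwareClassifier that learns such a combination end to end.

def routed_predict(X):
    # Base 3-class prediction from the original classifier.
    base = clf.predict(X)
    # Rows falling in the 'not setosa' slice are re-predicted by the specialist.
    mask = base != 'Iris-setosa'
    combined = base.copy()
    if mask.any():
        combined[mask] = rfc.predict(X[mask])
    return combined

pred_routed = routed_predict(df_test[features])
print("Routed accuracy: {:.3f}".format(accuracy_score(df_test['Species'], pred_routed)))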
Reference:
Slice-based Learning: a Programming Model for Residual Learning in Critical Data Slices