Saturday, August 22, 2020

Using Snorkel to create test data and classify it with Scikit-Learn


The dataset we work with is the classic Iris dataset. We will augment it with Snorkel to create a test set, then use Scikit-Learn's support vector machine classifier (SVC) to classify each test point as one of the three Iris species.

import pandas as pd
import numpy as np

from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy
from snorkel.augmentation import PandasTFApplier

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

df = pd.read_csv('files_1/datasets_19_420_Iris.csv')

for i in set(df.Species):
    # describe() returns the rows ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'];
    # here we keep only min, the quartiles, and max.
    print(i)
    print(df[df.Species == i].describe().loc[['min', '25%', '50%', '75%', 'max'], :])
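The same per-species bounds can also be pulled in a single call with groupby; a minimal sketch (iqr_bounds is my name, not from the post):

# Sketch: 25th/75th percentiles of every feature, per species.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
iqr_bounds = df.groupby('Species')[feature_cols].quantile([0.25, 0.75])
print(iqr_bounds)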
	
features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
classes = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']

# Per-species summary statistics, reused by the transformation function below.
desc_dict = {}
for i in classes:
    desc_dict[i] = df[df.Species == i].describe()

df['Train'] = 'Train'

# np.random.randint returns a random integer N such that low <= N < high.
@transformation_function(pre = [])
def get_new_instance_for_this_class(x):
    # For each feature, draw a value between the species' 25th and 75th
    # percentiles, keeping two decimal places (hence the *100 ... /100 trick).
    for f in features:
        lo = round(desc_dict[x.Species].loc['25%', f], 2) * 100
        hi = round(desc_dict[x.Species].loc['75%', f], 2) * 100
        x[f] = np.random.randint(lo, hi) / 100
    x.Train = 'Test'
    return x

tfs = [get_new_instance_for_this_class]

random_policy = RandomPolicy(
    len(tfs), sequence_length=2, n_per_original=1, keep_original=True
)

tf_applier = PandasTFApplier(tfs, random_policy)
df_train_augmented = tf_applier.apply(df)

print(f"Original training set size: {len(df)}")
print(f"Augmented training set size: {len(df_train_augmented)}")
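As an aside, the randint-times-100 trick approximates a uniform draw with two-decimal precision. A sketch of an equivalent transformation function using np.random.uniform (sample_within_iqr is a hypothetical name, not from the original code):

@transformation_function(pre = [])
def sample_within_iqr(x):
    # Draw each feature uniformly between the species' quartiles,
    # rounded to two decimals, matching the intent of the version above.
    for f in features:
        lo = desc_dict[x.Species].loc['25%', f]
        hi = desc_dict[x.Species].loc['75%', f]
        x[f] = round(np.random.uniform(lo, hi), 2)
    x.Train = 'Test'
    return x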
Output:
Original training set size: 150
Augmented training set size: 300

We train the SVC on the 150 original rows and hold out the 150 augmented rows as the test set.

df_test = df_train_augmented[df_train_augmented.Train == 'Test']

clf = svm.SVC(gamma = 'auto')
clf.fit(df[features], df['Species'])

Output:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

pred = clf.predict(df_test[features])
print("Accuracy: {:.3f}".format(accuracy_score(df_test['Species'], pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(df_test['Species'], pred)))
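For a per-class breakdown beyond the single accuracy number, scikit-learn's classification_report can be applied to the same predictions; a minimal sketch:

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the augmented test set.
print(classification_report(df_test['Species'], pred))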
To confirm that the training and test data do not overlap, we inner-join on the feature columns:

left = df[features]
right = df_test[features]
print(left.merge(right, on = features, how = 'inner').shape)

Output:
(0, 4)

No test row matches a training row on all four features. Joining on Id, however, matches all 150 test rows, because the transformation function leaves Id untouched, so each augmented row carries its parent's Id:

left = df[['Id']]
right = df_test[['Id']]
print(left.merge(right, on = ['Id'], how = 'inner').shape)

Output:
(150, 1)
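If unique Ids are needed downstream, the augmented rows could be renumbered; a minimal sketch (an assumption on my part, not something the post does):

# Hypothetical: give the augmented test rows fresh, non-overlapping Ids.
df_test = df_test.assign(Id = range(len(df), len(df) + len(df_test)))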
