Saturday, August 22, 2020

Using Snorkel to create test data and classify it with Scikit-Learn


The dataset we work with is the classic Iris dataset. We will augment it with Snorkel to create a test set, then use Scikit-Learn's support vector machine classifier (SVC) to classify each test point as one of the three Iris species.

import pandas as pd
import numpy as np

from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy
from snorkel.augmentation import PandasTFApplier

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

df = pd.read_csv('files_1/datasets_19_420_Iris.csv')

for i in set(df.Species):
    # describe() returns the rows ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'];
    # here we keep only min, the quartiles, and max.
    print(i)
    print(df[df.Species == i].describe().loc[['min', '25%', '50%', '75%', 'max'], :])
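The same per-species bounds can also be pulled in a single call with groupby; a minimal sketch (iqr_bounds is my name, not from the post):

# Sketch: 25th/75th percentiles of every feature, per species.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
iqr_bounds = df.groupby('Species')[feature_cols].quantile([0.25, 0.75])
print(iqr_bounds)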
	
features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
classes = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']

# Per-species summary statistics, reused by the transformation function below.
desc_dict = {}
for i in classes:
    desc_dict[i] = df[df.Species == i].describe()

df['Train'] = 'Train'

# np.random.randint returns a random integer N such that low <= N < high.
@transformation_function(pre = [])
def get_new_instance_for_this_class(x):
    # For each feature, draw a value between the species' 25th and 75th
    # percentiles, keeping two decimal places (hence the *100 ... /100 trick).
    for f in features:
        lo = round(desc_dict[x.Species].loc['25%', f], 2) * 100
        hi = round(desc_dict[x.Species].loc['75%', f], 2) * 100
        x[f] = np.random.randint(lo, hi) / 100
    x.Train = 'Test'
    return x

tfs = [get_new_instance_for_this_class]

random_policy = RandomPolicy(
    len(tfs), sequence_length=2, n_per_original=1, keep_original=True
)

tf_applier = PandasTFApplier(tfs, random_policy)
df_train_augmented = tf_applier.apply(df)

print(f"Original training set size: {len(df)}")
print(f"Augmented training set size: {len(df_train_augmented)}")
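As an aside, the randint-times-100 trick approximates a uniform draw with two-decimal precision. A sketch of an equivalent transformation function using np.random.uniform (sample_within_iqr is a hypothetical name, not from the original code):

@transformation_function(pre = [])
def sample_within_iqr(x):
    # Draw each feature uniformly between the species' quartiles,
    # rounded to two decimals, matching the intent of the version above.
    for f in features:
        lo = desc_dict[x.Species].loc['25%', f]
        hi = desc_dict[x.Species].loc['75%', f]
        x[f] = round(np.random.uniform(lo, hi), 2)
    x.Train = 'Test'
    return x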
Output:
Original training set size: 150
Augmented training set size: 300

We train the SVC on the 150 original rows and hold out the 150 augmented rows as the test set.

df_test = df_train_augmented[df_train_augmented.Train == 'Test']

clf = svm.SVC(gamma = 'auto')
clf.fit(df[features], df['Species'])

Output:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

pred = clf.predict(df_test[features])
print("Accuracy: {:.3f}".format(accuracy_score(df_test['Species'], pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(df_test['Species'], pred)))
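For a per-class breakdown beyond the single accuracy number, scikit-learn's classification_report can be applied to the same predictions; a minimal sketch:

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the augmented test set.
print(classification_report(df_test['Species'], pred))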
To confirm that the training and test data do not overlap, we inner-join on the feature columns:

left = df[features]
right = df_test[features]
print(left.merge(right, on = features, how = 'inner').shape)

Output:
(0, 4)

No test row matches a training row on all four features. Joining on Id, however, matches all 150 test rows, because the transformation function leaves Id untouched, so each augmented row carries its parent's Id:

left = df[['Id']]
right = df_test[['Id']]
print(left.merge(right, on = ['Id'], how = 'inner').shape)

Output:
(150, 1)
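If unique Ids are needed downstream, the augmented rows could be renumbered; a minimal sketch (an assumption on my part, not something the post does):

# Hypothetical: give the augmented test rows fresh, non-overlapping Ids.
df_test = df_test.assign(Id = range(len(df), len(df) + len(df_test)))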
