The dataset we are working with is the classic "Iris" dataset. We will augment it with Snorkel to create a "test" dataset and then use Scikit-Learn's Support Vector Machine classifier, SVC, to classify the test points into one of the three Iris species.

```python
import pandas as pd
import numpy as np

from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy
from snorkel.augmentation import PandasTFApplier

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

df = pd.read_csv('files_1/datasets_19_420_Iris.csv')

# Print the five-number summary for each species.
# describe() also reports 'count', 'mean' and 'std', which we drop here.
for i in set(df.Species):
    print(i)
    print(df[df.Species == i].describe().loc[['min', '25%', '50%', '75%', 'max'], :])

features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
classes = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']

# Cache the per-species describe() output for use inside the
# transformation function.
desc_dict = {}
for i in classes:
    desc_dict[i] = df[df.Species == i].describe()

# Mark all original rows as training data; augmented rows will be
# relabelled 'Test' by the transformation function.
df['Train'] = 'Train'
```

The transformation function below generates a new instance of the same species as the row it is applied to. For each feature it draws a value uniformly between the 25th and 75th percentile of that species, so the synthetic points stay within the interquartile range of the real data. Note that np.random.randint returns a random integer N such that low <= N < high (unlike Python's random.randint, whose upper bound is inclusive), so we scale by 100 and divide back to keep two decimal places.

```python
@transformation_function(pre=[])
def get_new_instance_for_this_class(x):
    # Draw each feature uniformly from the species' interquartile range.
    # This assumes the 25th and 75th percentiles differ, which holds for
    # every feature of every species in this dataset.
    for f in features:
        low = round(desc_dict[x.Species].loc['25%', f], 2) * 100
        high = round(desc_dict[x.Species].loc['75%', f], 2) * 100
        x[f] = np.random.randint(low, high) / 100
    x.Train = 'Test'
    return x

tfs = [get_new_instance_for_this_class]

# RandomPolicy applies a random sequence of two TFs (we only have one) to
# each row, generating one augmented copy per original row and keeping
# the original as well.
random_policy = RandomPolicy(
    len(tfs), sequence_length=2, n_per_original=1, keep_original=True
)

tf_applier = PandasTFApplier(tfs, random_policy)
df_train_augmented = tf_applier.apply(df)

print(f"Original training set size: {len(df)}")
print(f"Augmented training set size: {len(df_train_augmented)}")
```

Output:

```
Original training set size: 150
Augmented training set size: 300
```

The augmented rows carry Train == 'Test', so we separate them out as the test set and fit the SVC on the 150 original rows only:

```python
df_test = df_train_augmented[df_train_augmented.Train == 'Test']

clf = svm.SVC(gamma='auto')
clf.fit(df[features], df['Species'])
```

Output:

```
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
```

```python
pred = clf.predict(df_test[features])
print("Accuracy: {:.3f}".format(accuracy_score(df_test['Species'], pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(df_test['Species'], pred)))
```

To confirm that the training and test sets do not overlap, we inner-join them on the feature columns; an empty result means no augmented row duplicates an original one:

```python
left = df[features]
right = df_test[features]
print(left.merge(right, on=features, how='inner').shape)
```

Output:

```
(0, 4)
```

Joining on the Id column instead matches every row, because each augmented row inherits the Id of the original row it was generated from:

```python
left = df[['Id']]
right = df_test[['Id']]
print(left.merge(right, on=['Id'], how='inner').shape)
```

Output:

```
(150, 1)
```
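As a quick sanity check (this was not part of the original run), we can verify that every augmented feature value really does fall inside its species' interquartile range. The sketch below reuses df_test, features, classes and desc_dict from the session above; the 0.01 tolerance accounts for the two-decimal rounding in the transformation function:

```python
# Verify the augmentation: every synthetic value should lie within the
# species' 25th-75th percentile band (up to the 0.01 rounding step).
for f in features:
    for c in classes:
        vals = df_test.loc[df_test.Species == c, f]
        low = desc_dict[c].loc['25%', f]
        high = desc_dict[c].loc['75%', f]
        assert ((vals >= low - 0.01) & (vals <= high + 0.01)).all(), (f, c)
print("All augmented values lie within their species' IQR.")
```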
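One caveat: the transformation function draws from NumPy's global random state, so each run produces a different test set and a slightly different accuracy. Seeding the generator before applying the TFs should pin down our own np.random.randint calls; whether this Snorkel version's RandomPolicy carries additional internal randomness to seed is an assumption worth checking:

```python
# Fix NumPy's global seed so the TF's np.random.randint draws repeat
# exactly across runs.
np.random.seed(42)
df_train_augmented = tf_applier.apply(df)
```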
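For context, it can also be useful to compare the accuracy on the synthetic test set against a conventional held-out split of the same 150 rows. Here is a minimal sketch using scikit-learn's train_test_split; the 80/20 ratio and random_state are arbitrary choices of mine, not anything from the run above:

```python
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the original data as a baseline comparison.
X_tr, X_te, y_tr, y_te = train_test_split(
    df[features], df['Species'],
    test_size=0.2, stratify=df['Species'], random_state=0
)

clf_baseline = svm.SVC(gamma='auto')
clf_baseline.fit(X_tr, y_tr)
print("Held-out accuracy: {:.3f}".format(
    accuracy_score(y_te, clf_baseline.predict(X_te))))
```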
Using Snorkel to create test data and classifying using Scikit-Learn

Saturday, August 22, 2020