Using Snorkel to create test data and classifying using Scikit-Learn

The dataset we have is the classic Iris dataset. We will augment it using Snorkel's transformation functions to create a "test" dataset, and then use Scikit-Learn's Support Vector Machine classifier (SVC) to classify the generated test points into one of the three Iris species.
import pandas as pd
import numpy as np
from snorkel.augmentation import transformation_function
from snorkel.augmentation import RandomPolicy
from snorkel.augmentation import PandasTFApplier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
df = pd.read_csv('files_1/datasets_19_420_Iris.csv')
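A quick look at the data (the Kaggle Iris CSV used here has the columns Id, SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm and Species):

# Peek at the first few rows to confirm the column layout.
print(df.head())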
# Per-species summary statistics; describe() also reports 'count',
# 'mean' and 'std', but we print only the quantile rows.
for i in set(df.Species):
    print(i)
    print(df[df.Species == i].describe().loc[['min', '25%', '50%', '75%', 'max'], :])
features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
classes = ['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']

# Cache the per-class describe() output so the transformation function
# can look up quartiles without recomputing them for every row.
desc_dict = {}
for i in classes:
    desc_dict[i] = df[df.Species == i].describe()
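As an aside, the same per-class quartiles can be pulled in a single call with pandas' groupby; a minimal sketch:

# 25th/50th/75th percentile of every feature, grouped by species,
# without the explicit per-class loop.
print(df.groupby('Species')[features].quantile([0.25, 0.50, 0.75]))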
# Flag every original row as training data; the transformation function
# below re-flags each generated copy as 'Test'.
df['Train'] = 'Train'

# Note: np.random.randint returns a random integer N such that
# low <= N < high (the upper bound is exclusive).
@transformation_function(pre=[])
def get_new_instance_for_this_class(x):
    # Sample each feature uniformly (at two-decimal precision) between the
    # 25th and 75th percentile of the row's own species.
    desc = desc_dict[x.Species]
    for f in features:
        low = int(round(desc.loc['25%', f], 2) * 100)
        high = int(round(desc.loc['75%', f], 2) * 100)
        x[f] = np.random.randint(low, high) / 100
    x.Train = 'Test'
    return x
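A quick sanity check of the percentile-scaled randint trick above (a sketch with made-up bounds 4.25 and 4.75, not values from the dataset):

# np.random.randint's upper bound is exclusive, so after dividing by 100
# every sample falls in [4.25, 4.74].
samples = np.random.randint(int(4.25 * 100), int(4.75 * 100), size=1000) / 100
assert samples.min() >= 4.25 and samples.max() <= 4.74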
tfs = [ get_new_instance_for_this_class ]
# RandomPolicy samples a sequence of TFs for each generated example.
# With a single TF, sequence_length=2 just applies it twice (the second
# pass overwrites the first's sampled values); n_per_original=1 creates
# one new row per original, and keep_original=True retains the originals.
random_policy = RandomPolicy(
    len(tfs), sequence_length=2, n_per_original=1, keep_original=True
)
tf_applier = PandasTFApplier(tfs, random_policy)
df_train_augmented = tf_applier.apply(df)
print(f"Original training set size: {len(df)}")
print(f"Augmented training set size: {len(df_train_augmented)}")
Output:
Original training set size: 150
Augmented training set size: 300
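Since keep_original=True and n_per_original=1, the 150 original rows pass through unchanged alongside 150 transformed copies. The Train flag confirms the split:

# Expect 150 'Train' rows (originals) and 150 'Test' rows (augmented copies).
print(df_train_augmented['Train'].value_counts())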
# Keep only the generated rows as the test set, and train the SVC on the
# 150 original rows.
df_test = df_train_augmented[df_train_augmented.Train == 'Test']
clf = svm.SVC(gamma='auto')
clf.fit(df[features], df['Species'])
Output:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
pred = clf.predict(df_test[features])
print("Accuracy: {:.3f}".format(accuracy_score(df_test['Species'], pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(df_test['Species'], pred)))
To confirm that there is no overlap between the training and test data, we inner-join the two sets on the feature columns:
left = df[features]
right = df_test[features]
print(left.merge(right, on=features, how='inner').shape)
Output:
(0, 4)

The inner join on all four feature columns is empty, so no generated test row exactly duplicates a training row.
left = df[['Id']]
right = df_test[['Id']]
print(left.merge(right, on=['Id'], how='inner').shape)
Output:
(150, 1)

Joining on Id, in contrast, matches all 150 rows: each generated test row inherits the Id of the original row it was derived from, so the Id column alone does not distinguish the two sets.
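To see that lineage directly, we can compare an original row with its generated counterpart matched on Id (assuming Id 1 exists, as it does in the standard Kaggle Iris CSV):

# Original row vs. its augmented copy, matched on Id.
print(df[df.Id == 1][features + ['Species']])
print(df_test[df_test.Id == 1][features + ['Species']])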