We are going to do unsupervised outlier detection on the "Thyroid" dataset using the PyOD package for the following algorithms: Model 1: Angle-based Outlier Detector (ABOD) Model 2: Cluster-based Local Outlier Factor (CBLOF) Model 3: Feature Bagging Model 4: Histogram-base Outlier Detection (HBOS) Model 5: Isolation Forest Model 6: K Nearest Neighbors (KNN) Model 7: Average KNN Model 8: Median KNN Model 9: Local Outlier Factor (LOF) Model 10: Minimum Covariance Determinant (MCD) Model 11: Principal Component Analysis (PCA) Model 12: One-class SVM (OCSVM) Model 13: AutoEncoder Model 14: LODA Model 15: MO_GAAL Model 16: SO_GAAL Model 17: VAE We have downloaded the dataset from this link and have also shared it here: Google Drive link. # -*- coding: utf-8 -*- """Compare all detection algorithms by plotting decision boundaries and the number of decision boundaries. """ from __future__ import division from __future__ import print_function import os import sys import warnings warnings.filterwarnings("ignore") import numpy as np from numpy import percentile import pandas as pd from time import time from joblib import load, dump # Import models from "GitHub clone" directory from pyod_container.pyod.models.abod import ABOD from pyod_container.pyod.models.cblof import CBLOF from pyod_container.pyod.models.feature_bagging import FeatureBagging from pyod_container.pyod.models.hbos import HBOS from pyod_container.pyod.models.iforest import IForest from pyod_container.pyod.models.knn import KNN from pyod_container.pyod.models.lof import LOF from pyod_container.pyod.models.loci import LOCI from pyod_container.pyod.models.mcd import MCD from pyod_container.pyod.models.ocsvm import OCSVM from pyod_container.pyod.models.pca import PCA from pyod_container.pyod.models.sos import SOS from pyod_container.pyod.models.lscp import LSCP from pyod_container.pyod.models.cof import COF from pyod_container.pyod.models.sod import SOD # --- Importing packages from "pip" installation from pyod.models.auto_encoder 
import AutoEncoder from pyod.models.lmdd import LMDD from pyod.models.loda import LODA from pyod.models.mo_gaal import MO_GAAL from pyod.models.so_gaal import SO_GAAL from pyod.models.vae import VAE from pyod.models.xgbod import XGBOD from pyod.utils.data import evaluate_print # --- Imports --- End Here outliers_fraction = 0.01 random_state = np.random.RandomState(42) df_in = pd.read_csv("files_2/annthyroid-unsupervised-ad.csv", header = None) df = df_in[list(range(0, 21))] # Column 21 has actual results (O: Outlier, N: Inlier) classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction), 'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), 'Median KNN': KNN(method='median', contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(n_neighbors=35, contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state), 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state), "One-class SVM (OCSVM)": OCSVM(contamination=outliers_fraction), "AutoEncoder": AutoEncoder(epochs=30, contamination = outliers_fraction, hidden_neurons = [20, 19, 18, 17, 18, 19, 20]), "LODA": LODA(), "MO_GAAL": MO_GAAL(k=3, stop_epochs=2, contamination = outliers_fraction), "SO_GAAL": SO_GAAL(contamination = outliers_fraction), "VAE": VAE(epochs=30, contamination = outliers_fraction, encoder_neurons= [20, 19, 18, 17], decoder_neurons = [17, 18, 
19, 20]), # "LMDD": LMDD(random_state=42), # Takes infinite time # "XGBOD": XGBOD(random_state=42) # This is a supervised algorithm. Requires labeled data. } # Show all detectors for i, clf in enumerate(classifiers.keys()): print('Model', i + 1, clf) for i, (clf_name, clf) in enumerate(classifiers.items()): start = time() print() print(i + 1, ': fitting', clf_name) # fit the data and tag outliers if clf_name == "XGBOD": clf = clf.fit(df, y = list(df_in.labels)) else: clf = clf.fit(df) dump(clf, os.path.join("files_2", "models", clf_name + ".joblib")) scores_pred = clf.decision_function(df) * -1 y_pred = clf.predict(df) threshold = percentile(scores_pred, 100 * outliers_fraction) dump(y_pred, os.path.join("files_2", "predictions", clf_name + ".joblib")) print("Time taken: {}".format(time() - start)) Next, we compare the results from our models using the code below in a Jupyter Notebook: import pandas as pd import os from joblib import load, dump import seaborn as sns from collections import Counter predictions = {} for dirpath, subdirs, files in os.walk('files_2/predictions'): for f in files: predictions[str(f)] = load(os.path.join(dirpath, f)) predictions_df = pd.DataFrame(predictions) predictions_df.columns = ['abod', 'ae', 'a_knn', 'cblof', 'fb', 'hbos', 'if', 'knn', 'lof', 'loda', 'm_knn', 'mcd', 'mo_gaal', 'ocsvm', 'pca', 'so_gaal', 'vae'] relevant_cols = ['ae', 'a_knn', 'cblof', 'fb', 'hbos', 'if', 'knn', 'lof', 'loda', 'm_knn', 'mcd', 'mo_gaal', 'ocsvm', 'pca', 'so_gaal', 'vae'] corr = predictions_df[relevant_cols].corr() sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap = sns.diverging_palette(0, 255, sep=1, n=256), vmin=-1, vmax=1) Output: Note: sns.diverging_palette(0, 255, sep=1, n=256) works on Python 3.6 but not on 3.7 or 3.8. On running the code "sns.diverging_palette(0, 255, sep=1, n=256)" on Python 3.7 or 3.8, the following error occurs: TypeError Traceback (most recent call last) 
~\AppData\Roaming\Python\Python37\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis) 116 try: --> 117 num = operator.index(num) 118 except TypeError: TypeError: 'float' object cannot be interpreted as an integer During handling of the above exception, another exception occurred: TypeError Traceback (most recent call last) in References: 1. https://github.com/yzhao062/anomaly-detection-resources ----> 1 sns.diverging_palette(0, 255, sep=1, n=256) ~\AppData\Local\Continuum\anaconda3\lib\site-packages\seaborn\palettes.py in diverging_palette(h_neg, h_pos, s, l, sep, n, center, as_cmap) 742 """ 743 palfunc = dark_palette if center == "dark" else light_palette --> 744 neg = palfunc((h_neg, s, l), 128 - (sep / 2), reverse=True, input="husl") 745 pos = palfunc((h_pos, s, l), 128 - (sep / 2), input="husl") 746 midpoint = dict(light=[(.95, .95, .95, 1.)], ~\AppData\Local\Continuum\anaconda3\lib\site-packages\seaborn\palettes.py in light_palette(color, n_colors, reverse, as_cmap, input) 639 light = set_hls_values(color, l=.95) # noqa 640 colors = [color, light] if reverse else [light, color] --> 641 return blend_palette(colors, n_colors, as_cmap) 642 643 ~\AppData\Local\Continuum\anaconda3\lib\site-packages\seaborn\palettes.py in blend_palette(colors, n_colors, as_cmap, input) 775 pal = mpl.colors.LinearSegmentedColormap.from_list(name, colors) 776 if not as_cmap: --> 777 pal = _ColorPalette(pal(np.linspace(0, 1, n_colors))) 778 return pal 779 <__array_function__ internals> in linspace(*args, **kwargs) ~\AppData\Roaming\Python\Python37\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis) 119 raise TypeError( 120 "object of type {} cannot be safely interpreted as an integer." --> 121 .format(type(num))) 122 123 if num < 0: TypeError: object of type <class 'float'> cannot be safely interpreted as an integer. In short, the traceback shows seaborn passing the float value 128 - (sep / 2) as n_colors to np.linspace, which requires an integer number of samples.
Unsupervised Outlier Detection Using PyOD
Subscribe to:
Posts (Atom)
No comments:
Post a Comment