Unsupervised Outlier Detection Using PyOD



We are going to do unsupervised outlier detection on the "Thyroid" dataset using the PyOD package, with the following algorithms:
Model 1: Angle-based Outlier Detector (ABOD)
Model 2: Cluster-based Local Outlier Factor (CBLOF)
Model 3: Feature Bagging
Model 4: Histogram-based Outlier Detection (HBOS)
Model 5: Isolation Forest
Model 6: K Nearest Neighbors (KNN)
Model 7: Average KNN
Model 8: Median KNN
Model 9: Local Outlier Factor (LOF)
Model 10: Minimum Covariance Determinant (MCD)
Model 11: Principal Component Analysis (PCA)
Model 12: One-class SVM (OCSVM)
Model 13: AutoEncoder
Model 14: LODA
Model 15: MO_GAAL
Model 16: SO_GAAL
Model 17: VAE

We have downloaded the dataset from this link and have also shared it here via a Google Drive link.
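Every PyOD detector exposes the same basic interface, which the script below relies on: fit() on the data, then labels_ and decision_scores_ for the training set, or predict() and decision_function() for new points. A minimal sketch on synthetic data (the toy numbers and the KNN choice here are just for illustration, not part of the Thyroid experiment):

import numpy as np
from pyod.models.knn import KNN

# toy data: 200 inliers around the origin plus 5 obvious outliers far away
rng = np.random.RandomState(42)
X = np.vstack([rng.randn(200, 2), rng.uniform(5, 6, size=(5, 2))])

clf = KNN(contamination=0.05)
clf.fit(X)

print(clf.labels_[:10])               # binary labels on the training data (1 = outlier)
print(clf.decision_scores_[:10])      # raw outlier scores on the training data
print(clf.predict(X[-5:]))            # 0/1 labels for the 5 planted outliers
print(clf.decision_function(X[-5:]))  # outlier scores for those points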

# -*- coding: utf-8 -*-
"""Compare all detection algorithms by plotting decision boundaries and
the number of decision boundaries.
"""

from __future__ import division
from __future__ import print_function

import os
import sys

import warnings
warnings.filterwarnings("ignore")

import numpy as np
from numpy import percentile

import pandas as pd
from time import time
from joblib import load, dump

# Import models from the local "GitHub clone" of PyOD
from pyod_container.pyod.models.abod import ABOD
from pyod_container.pyod.models.cblof import CBLOF
from pyod_container.pyod.models.feature_bagging import FeatureBagging
from pyod_container.pyod.models.hbos import HBOS
from pyod_container.pyod.models.iforest import IForest
from pyod_container.pyod.models.knn import KNN
from pyod_container.pyod.models.lof import LOF
from pyod_container.pyod.models.loci import LOCI
from pyod_container.pyod.models.mcd import MCD
from pyod_container.pyod.models.ocsvm import OCSVM
from pyod_container.pyod.models.pca import PCA
from pyod_container.pyod.models.sos import SOS
from pyod_container.pyod.models.lscp import LSCP
from pyod_container.pyod.models.cof import COF
from pyod_container.pyod.models.sod import SOD

#  --- Importing packages from the "pip" installation

from pyod.models.auto_encoder import AutoEncoder
from pyod.models.lmdd import LMDD
from pyod.models.loda import LODA
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.so_gaal import SO_GAAL
from pyod.models.vae import VAE
from pyod.models.xgbod import XGBOD

from pyod.utils.data import evaluate_print

#   ---   Imports   ---   End Here

outliers_fraction = 0.01
random_state = np.random.RandomState(42)

df_in = pd.read_csv("files_2/annthyroid-unsupervised-ad.csv", header=None)

df = df_in[list(range(0, 21))]  # Column 21 has the actual results (O: Outlier, N: Inlier)

# Make sure the output directories used below exist
os.makedirs(os.path.join("files_2", "models"), exist_ok=True)
os.makedirs(os.path.join("files_2", "predictions"), exist_ok=True)
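
# Optional, and not part of the original run: several of the detectors below
# (KNN, LOF, PCA, OCSVM) are sensitive to feature scale, so one might
# standardize the feature columns first with scikit-learn, e.g.:
#
# from sklearn.preprocessing import StandardScaler
# df = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)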

classifiers = {
    'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),

    'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state),

    'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, random_state=random_state),

    'Histogram-based Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),

    'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),

    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),

    'Average KNN': KNN(method='mean', contamination=outliers_fraction),

    'Median KNN': KNN(method='median', contamination=outliers_fraction),

    'Local Outlier Factor (LOF)': LOF(n_neighbors=35, contamination=outliers_fraction),
    
    'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),

    'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state),

    "One-class SVM (OCSVM)": OCSVM(contamination=outliers_fraction),

    "AutoEncoder": AutoEncoder(epochs=30, contamination = outliers_fraction, hidden_neurons = [20, 19, 18, 17, 18, 19, 20]), 
    
    "LODA": LODA(),

    "MO_GAAL": MO_GAAL(k=3, stop_epochs=2, contamination = outliers_fraction),

    "SO_GAAL": SO_GAAL(contamination = outliers_fraction),

    "VAE": VAE(epochs=30, contamination = outliers_fraction, encoder_neurons= [20, 19, 18, 17], decoder_neurons = [17, 18, 19, 20]),

    # "LMDD": LMDD(random_state=42), # Takes infinite time
    # "XGBOD": XGBOD(random_state=42) # This is a supervised algorithm. Requires labeled data.
}

# Show all detectors
for i, clf_name in enumerate(classifiers.keys()):
    print('Model', i + 1, clf_name)

for i, (clf_name, clf) in enumerate(classifiers.items()):
    start = time()
    print()
    print(i + 1, ': fitting', clf_name)
    
    # fit the detector and tag outliers
    if clf_name == "XGBOD":
        # XGBOD is supervised, so it also needs the ground-truth labels (column 21)
        clf = clf.fit(df, y=list(df_in[21]))
    else:
        clf = clf.fit(df)

    # save the fitted model for later reuse
    dump(clf, os.path.join("files_2", "models", clf_name + ".joblib"))

    # raw outlier scores, negated so that lower values mean "more abnormal"
    scores_pred = clf.decision_function(df) * -1

    # binary outlier labels (1 = outlier) and, for reference, the score cutoff
    # implied by the assumed contamination rate (not used further below)
    y_pred = clf.predict(df)
    threshold = percentile(scores_pred, 100 * outliers_fraction)

    # save the predictions so the models can be compared later
    dump(y_pred, os.path.join("files_2", "predictions", clf_name + ".joblib"))

    print("Time taken: {}".format(time() - start))
 
Next, we compare the results from our models using the code below in a Jupyter notebook:

import pandas as pd
import os
from joblib import load, dump
import seaborn as sns
from collections import Counter

predictions = {}
for dirpath, subdirs, files in os.walk('files_2/predictions'):
    for f in files:
        predictions[f] = load(os.path.join(dirpath, f))

predictions_df = pd.DataFrame(predictions)

# Short column names. This mapping assumes os.walk returns the prediction files
# in (case-insensitive) alphabetical order; check it against the actual file
# names on your system before relying on it.
predictions_df.columns = ['abod', 'ae', 'a_knn', 'cblof', 'fb', 'hbos', 'if', 'knn', 'lof', 'loda',
                          'm_knn', 'mcd', 'mo_gaal', 'ocsvm', 'pca', 'so_gaal', 'vae']

relevant_cols = ['ae', 'a_knn', 'cblof', 'fb', 'hbos', 'if', 'knn', 'lof', 'loda', 'm_knn', 'mcd',
                 'mo_gaal', 'ocsvm', 'pca', 'so_gaal', 'vae']

corr = predictions_df[relevant_cols].corr()

sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap = sns.diverging_palette(0, 255, sep=1, n=256), vmin=-1, vmax=1)
Output: a correlation heatmap of the outlier labels produced by the different detectors.

Note: sns.diverging_palette(0, 255, sep=1, n=256) works in our Python 3.6 environment but fails under Python 3.7 and 3.8. Running the same call there raises the following error (traceback abridged):

TypeError: 'float' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\seaborn\palettes.py in diverging_palette(h_neg, h_pos, s, l, sep, n, center, as_cmap)
--> 744     neg = palfunc((h_neg, s, l), 128 - (sep / 2), reverse=True, input="husl")

~\AppData\Local\Continuum\anaconda3\lib\site-packages\seaborn\palettes.py in light_palette(color, n_colors, reverse, as_cmap, input)
--> 641     return blend_palette(colors, n_colors, as_cmap)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\seaborn\palettes.py in blend_palette(colors, n_colors, as_cmap, input)
--> 777     pal = _ColorPalette(pal(np.linspace(0, 1, n_colors)))

~\AppData\Roaming\Python\Python37\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
--> 121     "object of type {} cannot be safely interpreted as an integer."

TypeError: object of type <class 'float'> cannot be safely interpreted as an integer.

In short, seaborn passes the float value 128 - (sep / 2) down to numpy.linspace as a count, and the NumPy version in that environment no longer accepts a float there.

References:
1. https://github.com/yzhao062/anomaly-detection-resources
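If you run into this error, one simple alternative (not what we used for the figure above) is to skip diverging_palette and hand seaborn one of matplotlib's built-in diverging colormaps instead:

sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap="coolwarm", vmin=-1, vmax=1)

Beyond the pairwise correlations, it can also be useful to see how many detectors agree on each record. A small sketch using the Counter import from above (this aggregation is our addition, not something the original comparison produced):

# Number of detectors that flag each record as an outlier (predictions are 0/1 labels)
votes = predictions_df[relevant_cols].sum(axis=1)
# How many records are flagged by 0, 1, 2, ... detectors
print(Counter(votes))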
