Add cluster-based local outlier factor (CBLOF)

kranthi-nord · Jun 18, 2018 · 1ba7e6f · 1ba7e6f
1 parent 189f6c0
commit 1ba7e6f
Show file tree

Hide file tree

Showing 9 changed files with 376 additions and 172 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -24,4 +24,5 @@ v<0.4.9>, <06/09/2018> -- Add new utility functions and improve documentations.
 v<0.5.0>, <06/10/2018> -- Refactor models and improve documentation.
 v<0.5.1>, <06/12/2018> -- Add MCD detector and more Jupyter notebooks.
 v<0.5.2>, <06/13/2018> -- Incremental changes.
-v<0.5.3>, <06/14/2018> -- Incremental changes.
+v<0.5.3>, <06/14/2018> -- Incremental changes.
+v<0.5.4>, <06/18/2018> -- Add CBLOF model and incremental improvements.
diff --git a/README.md b/README.md
@@ -64,7 +64,7 @@ detection utility functions.
 
   2. Proximity-Based Outlier Detection Models:
      1. **LOF: Local Outlier Factor** [1]
-     2. **CBLOF: Clustering-Based Local Outlier Factor** [15] (work in progress)
+     2. **CBLOF: Clustering-Based Local Outlier Factor** [15]
      3. **HBOS: Histogram-based Outlier Score** [5]
      4. **kNN: k Nearest Neighbors** (use the distance to the kth nearest 
      neighbor as the outlier score) [13]

diff --git a/docs/index.rst b/docs/index.rst
@@ -63,13 +63,13 @@ detection utility functions.
 
   i. **LOF: Local Outlier Factor** :cite:`a-breunig2000lof`: :class:`pyod.models.lof.LOF`
   ii. **CBLOF: Clustering-Based Local Outlier Factor** :cite:`a-he2003discovering`: :class:`pyod.models.cblof.CBLOF`
-  ii. **kNN: k Nearest Neighbors** (use the distance to the kth nearest
-      neighbor as the outlier score) :cite:`a-ramaswamy2000efficient,a-angiulli2002fast`: :class:`pyod.models.knn.KNN`
-  iii. **Average kNN** (use the average distance to k nearest neighbors as
-       the outlier score): :class:`pyod.models.knn.KNN`
-  iv. **Median kNN** (use the median distance to k nearest neighbors
-      as the outlier score): :class:`pyod.models.knn.KNN`
-  v. **HBOS: Histogram-based Outlier Score** :cite:`a-goldstein2012histogram`: :class:`pyod.models.hbos.HBOS`
+  iii. **kNN: k Nearest Neighbors** (use the distance to the kth nearest
+       neighbor as the outlier score) :cite:`a-ramaswamy2000efficient,a-angiulli2002fast`: :class:`pyod.models.knn.KNN`
+  iv. **Average kNN** (use the average distance to k nearest neighbors as
+      the outlier score): :class:`pyod.models.knn.KNN`
+  v. **Median kNN** (use the median distance to k nearest neighbors
+     as the outlier score): :class:`pyod.models.knn.KNN`
+  vi. **HBOS: Histogram-based Outlier Score** :cite:`a-goldstein2012histogram`: :class:`pyod.models.hbos.HBOS`
 
 3. Probabilistic Models for Outlier Detection:
 

diff --git a/docs/pyod.models.rst b/docs/pyod.models.rst
@@ -23,7 +23,7 @@ pyod.models.base module
     :inherited-members:
 
 pyod.models.cblof module
------------------------
+------------------------
 
 .. automodule:: pyod.models.cblof
     :members:

diff --git a/examples/cblof_example.py b/examples/cblof_example.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+"""Example of using Cluster-based Local Outlier Factor (CBLOF) for outlier
+detection
+"""
+# Author: Yue Zhao <[email protected]>
+# License: BSD 2 clause
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# temporary solution for relative imports in case pyod is not installed
+# if pyod is installed, no need to use the following line
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from sklearn.utils import check_X_y
+import matplotlib.pyplot as plt
+from matplotlib.lines import Line2D
+
+from pyod.models.cblof import CBLOF
+from pyod.utils.data import generate_data
+from pyod.utils.data import get_color_codes
+from pyod.utils.data import evaluate_print
+
+
+def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
+              y_test_pred, show_figure=True,
+              save_figure=False):  # pragma: no cover
+    """
+    Utility function for visualizing the results in examples.
+    Internal use only.
+
+    :param clf_name: The name of the detector
+    :type clf_name: str
+
+    :param X_train: The training samples
+    :type X_train: numpy array of shape (n_samples, n_features)
+
+    :param y_train: The ground truth of training samples
+    :type y_train: list or array of shape (n_samples,)
+
+    :param X_test: The test samples
+    :type X_test: numpy array of shape (n_samples, n_features)
+
+    :param y_test: The ground truth of test samples
+    :type y_test: list or array of shape (n_samples,)
+
+    :param y_train_pred: The predicted outlier labels on the training samples
+    :type y_train_pred: numpy array of shape (n_samples,)
+
+    :param y_test_pred: The predicted outlier labels on the test samples
+    :type y_test_pred: numpy array of shape (n_samples,)
+
+    :param show_figure: If set to True, show the figure
+    :type show_figure: bool, optional (default=True)
+
+    :param save_figure: If set to True, save the figure to a local PNG file
+    :type save_figure: bool, optional (default=False)
+    """
+
+    if X_train.shape[1] != 2 or X_test.shape[1] != 2:
+        raise ValueError("Input data has to be 2-d for visualization. The "
+                         "input data has {shape}.".format(shape=X_train.shape))
+
+    X_train, y_train = check_X_y(X_train, y_train)
+    X_test, y_test = check_X_y(X_test, y_test)
+    c_train = get_color_codes(y_train)
+    c_test = get_color_codes(y_test)
+
+    fig = plt.figure(figsize=(12, 10))
+    plt.suptitle("Demo of {clf_name}".format(clf_name=clf_name))
+
+    fig.add_subplot(221)
+    plt.scatter(X_train[:, 0], X_train[:, 1], c=c_train)
+    plt.title('Train ground truth')
+    legend_elements = [Line2D([0], [0], marker='o', color='w', label='normal',
+                              markerfacecolor='b', markersize=8),
+                       Line2D([0], [0], marker='o', color='w', label='outlier',
+                              markerfacecolor='r', markersize=8)]
+
+    plt.legend(handles=legend_elements, loc=4)
+
+    fig.add_subplot(222)
+    plt.scatter(X_test[:, 0], X_test[:, 1], c=c_test)
+    plt.title('Test ground truth')
+    plt.legend(handles=legend_elements, loc=4)
+
+    fig.add_subplot(223)
+    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train_pred)
+    plt.title('Train prediction by {clf_name}'.format(clf_name=clf_name))
+    legend_elements = [Line2D([0], [0], marker='o', color='w', label='normal',
+                              markerfacecolor='0', markersize=8),
+                       Line2D([0], [0], marker='o', color='w', label='outlier',
+                              markerfacecolor='yellow', markersize=8)]
+    plt.legend(handles=legend_elements, loc=4)
+
+    fig.add_subplot(224)
+    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test_pred)
+    plt.title('Test prediction by {clf_name}'.format(clf_name=clf_name))
+    plt.legend(handles=legend_elements, loc=4)
+
+    if save_figure:
+        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)
+    if show_figure:
+        plt.show()
+    return
+
+
+if __name__ == "__main__":
+    contamination = 0.1  # percentage of outliers
+    n_train = 200  # number of training points
+    n_test = 100  # number of testing points
+
+    # Generate sample data
+    X_train, y_train, X_test, y_test = \
+        generate_data(n_train=n_train,
+                      n_test=n_test,
+                      n_features=2,
+                      contamination=contamination,
+                      random_state=42)
+
+    # train CBLOF detector
+    clf_name = 'CBLOF'
+    clf = CBLOF()
+    clf.fit(X_train)
+
+    # get the prediction labels and outlier scores of the training data
+    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+    y_train_scores = clf.decision_scores_  # raw outlier scores
+
+    # get the prediction on the test data
+    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+    y_test_scores = clf.decision_function(X_test)  # outlier scores
+
+    # evaluate and print the results
+    print("\nOn Training Data:")
+    evaluate_print(clf_name, y_train, y_train_scores)
+    print("\nOn Test Data:")
+    evaluate_print(clf_name, y_test, y_test_scores)
+
+    # visualize the results
+    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
+              y_test_pred, show_figure=True, save_figure=False)
diff --git a/pyod/__init__.py b/pyod/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-__version__ = '0.5.3'
+__version__ = '0.5.4'
 
 from . import models
 from . import utils

diff --git a/pyod/models/__init__.py b/pyod/models/__init__.py
@@ -13,6 +13,7 @@
 from .pca import PCA
 
 __all__ = ['ABOD',
+           'CBLOF',
            'clone',
            'aom', 'moa', 'average', 'maximization',
            'FeatureBagging',