Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
yzhao062 committed Jul 12, 2021
1 parent 8a7033c commit f8db75f
Show file tree
Hide file tree
Showing 9 changed files with 70 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ v<0.8.9>, <06/11/2021> -- Fix LMDD parameter (#307)
v<0.9.0>, <06/20/2021> -- Add clone test for models.
v<0.9.0>, <07/03/2021> -- ROD hot fix (#316).
v<0.9.0>, <07/04/2021> -- Improve COPOD plot with columns parameter.
v<0.9.1>, <07/12/2021> -- Improve COPOD by dropping pandas dependency.



Expand Down
1 change: 0 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,6 @@ Alternatively, you could clone and run setup.py file:
* joblib
* numpy>=1.13
* numba>=0.35
* pandas>=0.25
* scipy>=0.19.1
* scikit_learn>=0.20.0
* statsmodels
Expand Down
1 change: 0 additions & 1 deletion docs/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ Alternatively, you could clone and run setup.py file:
* joblib
* numpy>=1.13
* numba>=0.35
* pandas>=0.25
* scipy>=0.19.1
* scikit_learn>=0.20.0
* statsmodels
Expand Down
1 change: 0 additions & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ matplotlib
nose
numpy>=1.13
numba>=0.35
pandas>=0.25
pytest
scipy>=1.3.1
scikit_learn>=0.20.0
Expand Down
56 changes: 56 additions & 0 deletions examples/copod_parallel_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
"""Example of using Copula Based Outlier Detector (COPOD) for outlier detection
with parallelization
"""
# Author: Winston Li <[email protected]>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

from pyod.models.copod import COPOD
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize

if __name__ == "__main__":
contamination = 0.1 # percentage of outliers
n_train = 2000 # number of training points
n_test = 100 # number of testing points

# Generate sample data
X_train, y_train, X_test, y_test = \
generate_data(n_train=n_train,
n_test=n_test,
n_features=10,
contamination=contamination,
random_state=42)

# train COPOD detector
# you could try parallel version as well.
clf_name = 'COPOD_parallel'
clf = COPOD(n_jobs=2)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_ # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test) # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test) # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

23 changes: 12 additions & 11 deletions pyod/models/copod.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import warnings
import numpy as np
import pandas as pd

from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats import skew
Expand All @@ -21,8 +20,6 @@
from .sklearn_base import _partition_estimators


# todo: we should be able to drop pandas

def ecdf(X):
"""Calculated the empirical CDF of a given dataset.
Parameters
Expand Down Expand Up @@ -146,16 +143,17 @@ def decision_function(self, X):
original_size = X.shape[0]
X = np.concatenate((self.X_train, X), axis=0)

self.U_l = pd.DataFrame(-1 * np.log(np.apply_along_axis(ecdf, 0, X)))
self.U_r = pd.DataFrame(-1 * np.log(np.apply_along_axis(ecdf, 0, -X)))
self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))

skewness = np.sign(skew(X, axis=0))
self.U_skew = self.U_l * -1 * np.sign(
skewness - 1) + self.U_r * np.sign(skewness + 1)
self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
if hasattr(self, 'X_train'):
decision_scores_ = self.O.sum(axis=1).to_numpy()[-original_size:]
decision_scores_ = self.O.sum(axis=1)[-original_size:]
else:
decision_scores_ = self.O.sum(axis=1).to_numpy()
decision_scores_ = self.O.sum(axis=1)
return decision_scores_.ravel()

def _decision_function_parallel(self, X):
Expand Down Expand Up @@ -204,17 +202,20 @@ def _decision_function_parallel(self, X):
self.U_l[:, starts[i]:starts[i + 1]] = all_results[i][0]
self.U_r[:, starts[i]:starts[i + 1]] = all_results[i][1]

self.U_l = pd.DataFrame(-1 * np.log(self.U_l))
self.U_r = pd.DataFrame(-1 * np.log(self.U_r))
# self.U_l = pd.DataFrame(-1 * np.log(self.U_l))
# self.U_r = pd.DataFrame(-1 * np.log(self.U_r))

self.U_l = -1 * np.log(self.U_l)
self.U_r = -1 * np.log(self.U_r)

skewness = np.sign(skew(X, axis=0))
self.U_skew = self.U_l * -1 * np.sign(
skewness - 1) + self.U_r * np.sign(skewness + 1)
self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
if hasattr(self, 'X_train'):
decision_scores_ = self.O.sum(axis=1).to_numpy()[-original_size:]
decision_scores_ = self.O.sum(axis=1)[-original_size:]
else:
decision_scores_ = self.O.sum(axis=1).to_numpy()
decision_scores_ = self.O.sum(axis=1)
return decision_scores_.ravel()

def explain_outlier(self, ind, columns=None, cutoffs=None,
Expand Down
2 changes: 1 addition & 1 deletion pyod/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = '0.9.0' # pragma: no cover
__version__ = '0.9.1' # pragma: no cover
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ joblib
matplotlib
numpy>=1.13
numba>=0.35
pandas>=0.25
scipy>=1.3.1
scikit_learn>=0.20.0
six
Expand Down
1 change: 0 additions & 1 deletion requirements_ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ matplotlib
nose
numpy>=1.13
numba>=0.35
pandas>=0.25
scipy>=0.19.1
scikit_learn>=0.20.0
six
Expand Down

0 comments on commit f8db75f

Please sign in to comment.