fix yzhao062#223

kranthi-nord · Jul 12, 2021 · f8db75f · f8db75f
1 parent 8a7033c
commit f8db75f
Show file tree

Hide file tree

Showing 9 changed files with 70 additions and 17 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -129,6 +129,7 @@ v<0.8.9>, <06/11/2021> -- Fix LMDD parameter (#307)
 v<0.9.0>, <06/20/2021> -- Add clone test for models.
 v<0.9.0>, <07/03/2021> -- ROD hot fix (#316).
 v<0.9.0>, <07/04/2021> -- Improve COPOD plot with columns parameter.
+v<0.9.1>, <07/12/2021> -- Improve COPOD by dropping pandas dependency.
 
 
 

diff --git a/README.rst b/README.rst
@@ -198,7 +198,6 @@ Alternatively, you could clone and run setup.py file:
 * joblib
 * numpy>=1.13
 * numba>=0.35
-* pandas>=0.25
 * scipy>=0.19.1
 * scikit_learn>=0.20.0
 * statsmodels

diff --git a/docs/install.rst b/docs/install.rst
@@ -27,7 +27,6 @@ Alternatively, you could clone and run setup.py file:
 * joblib
 * numpy>=1.13
 * numba>=0.35
-* pandas>=0.25
 * scipy>=0.19.1
 * scikit_learn>=0.20.0
 * statsmodels

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -5,7 +5,6 @@ matplotlib
 nose
 numpy>=1.13
 numba>=0.35
-pandas>=0.25
 pytest
 scipy>=1.3.1
 scikit_learn>=0.20.0

diff --git a/examples/copod_parallel_example.py b/examples/copod_parallel_example.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""Example of using Copula Based Outlier Detector (COPOD) for outlier detection
+with parallelization
+"""
+# Author: Winston Li <[email protected]>
+# License: BSD 2 clause
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# temporary solution in case pyod is not installed: make the parent directory
+# of this script importable (not needed when pyod is installed)
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from pyod.models.copod import COPOD
+from pyod.utils.data import generate_data
+from pyod.utils.data import evaluate_print
+from pyod.utils.example import visualize
+
+if __name__ == "__main__":
+    contamination = 0.1  # proportion of outliers
+    n_train = 2000  # number of training points
+    n_test = 100  # number of testing points
+
+    # Generate sample data
+    X_train, y_train, X_test, y_test = \
+        generate_data(n_train=n_train,
+                      n_test=n_test,
+                      n_features=10,
+                      contamination=contamination,
+                      random_state=42)
+
+    # train COPOD detector
+    # n_jobs=2 requests the parallel implementation (two jobs)
+    clf_name = 'COPOD_parallel'
+    clf = COPOD(n_jobs=2)
+    clf.fit(X_train)
+
+    # get the prediction labels and outlier scores of the training data
+    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+    y_train_scores = clf.decision_scores_  # raw outlier scores
+
+    # get the prediction on the test data
+    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+    y_test_scores = clf.decision_function(X_test)  # outlier scores
+
+    # evaluate and print the results
+    print("\nOn Training Data:")
+    evaluate_print(clf_name, y_train, y_train_scores)
+    print("\nOn Test Data:")
+    evaluate_print(clf_name, y_test, y_test_scores)
+
diff --git a/pyod/models/copod.py b/pyod/models/copod.py
@@ -9,7 +9,6 @@
 
 import warnings
 import numpy as np
-import pandas as pd
 
 from statsmodels.distributions.empirical_distribution import ECDF
 from scipy.stats import skew
@@ -21,8 +20,6 @@
 from .sklearn_base import _partition_estimators
 
 
-# todo: we should be able to drop pandas
-
 def ecdf(X):
     """Calculated the empirical CDF of a given dataset.
     Parameters
@@ -146,16 +143,17 @@ def decision_function(self, X):
             original_size = X.shape[0]
             X = np.concatenate((self.X_train, X), axis=0)
 
-        self.U_l = pd.DataFrame(-1 * np.log(np.apply_along_axis(ecdf, 0, X)))
-        self.U_r = pd.DataFrame(-1 * np.log(np.apply_along_axis(ecdf, 0, -X)))
+        self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
+        self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))
+
         skewness = np.sign(skew(X, axis=0))
         self.U_skew = self.U_l * -1 * np.sign(
             skewness - 1) + self.U_r * np.sign(skewness + 1)
         self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
         if hasattr(self, 'X_train'):
-            decision_scores_ = self.O.sum(axis=1).to_numpy()[-original_size:]
+            decision_scores_ = self.O.sum(axis=1)[-original_size:]
         else:
-            decision_scores_ = self.O.sum(axis=1).to_numpy()
+            decision_scores_ = self.O.sum(axis=1)
         return decision_scores_.ravel()
 
     def _decision_function_parallel(self, X):
@@ -204,17 +202,20 @@ def _decision_function_parallel(self, X):
             self.U_l[:, starts[i]:starts[i + 1]] = all_results[i][0]
             self.U_r[:, starts[i]:starts[i + 1]] = all_results[i][1]
 
-        self.U_l = pd.DataFrame(-1 * np.log(self.U_l))
-        self.U_r = pd.DataFrame(-1 * np.log(self.U_r))
+        # keep U_l / U_r as plain numpy arrays (they used to be wrapped in
+        # pandas DataFrames) so that COPOD does not depend on pandas
+
+        self.U_l = -1 * np.log(self.U_l)
+        self.U_r = -1 * np.log(self.U_r)
 
         skewness = np.sign(skew(X, axis=0))
         self.U_skew = self.U_l * -1 * np.sign(
             skewness - 1) + self.U_r * np.sign(skewness + 1)
         self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
         if hasattr(self, 'X_train'):
-            decision_scores_ = self.O.sum(axis=1).to_numpy()[-original_size:]
+            decision_scores_ = self.O.sum(axis=1)[-original_size:]
         else:
-            decision_scores_ = self.O.sum(axis=1).to_numpy()
+            decision_scores_ = self.O.sum(axis=1)
         return decision_scores_.ravel()
 
     def explain_outlier(self, ind, columns=None, cutoffs=None,

diff --git a/pyod/version.py b/pyod/version.py
@@ -20,4 +20,4 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.9.0'  # pragma: no cover
+__version__ = '0.9.1'  # pragma: no cover
diff --git a/requirements.txt b/requirements.txt
@@ -2,7 +2,6 @@ joblib
 matplotlib
 numpy>=1.13
 numba>=0.35
-pandas>=0.25
 scipy>=1.3.1
 scikit_learn>=0.20.0
 six

diff --git a/requirements_ci.txt b/requirements_ci.txt
@@ -5,7 +5,6 @@ matplotlib
 nose
 numpy>=1.13
 numba>=0.35
-pandas>=0.25
 scipy>=0.19.1
 scikit_learn>=0.20.0
 six