use labels to find duplicates in multi-index (GH9125)

blbradley · Dec 23, 2014 · 8f4a321 · 8f4a321
1 parent eb77d1d
commit 8f4a321
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 6 deletions.
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -45,9 +45,8 @@ Performance
 
 .. _whatsnew_0160.performance:
 
-
-- Fixed a severe performance regression for ``.loc`` indexing with an array or list (:issue:9126:).
-
+- Fixed a performance regression for ``.loc`` indexing with an array or list-like (:issue:`9126`).
+- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
 - Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`)
 
 

diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -3225,14 +3225,17 @@ def _has_complex_internals(self):
 
     @cache_readonly
     def is_unique(self):
-        from pandas.hashtable import Int64HashTable
+        return not self.duplicated().any()
+
+    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    def duplicated(self, take_last=False):
         from pandas.core.groupby import get_flat_ids
+        from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
         ids = get_flat_ids(self.labels, shape, False)
-        table = Int64HashTable(min(1 << 20, len(ids)))
 
-        return len(table.unique(ids)) == len(self)
+        return duplicated_int64(ids, take_last)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation

diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -1062,3 +1062,27 @@ def mode_int64(ndarray[int64_t] values):
     kh_destroy_int64(table)
 
     return modes[:j+1]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(values)
+        kh_int64_t * table = kh_init_int64()
+        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+    kh_resize_int64(table, min(1 << 20, n))
+
+    if take_last:
+        for i from n > i >=0:
+            kh_put_int64(table, values[i], &ret)
+            out[i] = ret == 0
+    else:
+        for i from 0 <= i < n:
+            kh_put_int64(table, values[i], &ret)
+            out[i] = ret == 0
+
+    kh_destroy_int64(table)
+    return out
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -3514,6 +3514,17 @@ def check(nlevels, with_nulls):
         check(8, False)
         check(8, True)
 
+        # GH 9125
+        n, k = 200, 5000
+        levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
+        labels = [np.random.choice(n, k * n) for lev in levels]
+        mi = MultiIndex(levels=levels, labels=labels)
+
+        for take_last in [False, True]:
+            left = mi.duplicated(take_last=take_last)
+            right = pd.lib.duplicated(mi.values, take_last=take_last)
+            tm.assert_array_equal(left, right)
+
     def test_tolist(self):
         result = self.index.tolist()
         exp = list(self.index.values)

diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py
@@ -138,6 +138,17 @@
               name='multiindex_with_datetime_level_sliced',
               start_date=datetime(2014, 10, 11))
 
+# multi-index duplicated
+setup = common_setup + """
+n, k = 200, 5000
+levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
+labels = [np.random.choice(n, k * n) for lev in levels]
+mi = MultiIndex(levels=levels, labels=labels)
+"""
+
+multiindex_duplicated = Benchmark('mi.duplicated()', setup,
+                                  name='multiindex_duplicated')
+
 #----------------------------------------------------------------------
 # repr