Skip to content

Commit

Permalink
use labels to find duplicates in multi-index (GH9125)
Browse files Browse the repository at this point in the history
  • Loading branch information
behzadnouri authored and jreback committed Dec 23, 2014
1 parent eb77d1d commit 8f4a321
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 6 deletions.
5 changes: 2 additions & 3 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@ Performance

.. _whatsnew_0160.performance:


- Fixed a severe performance regression for ``.loc`` indexing with an array or list (:issue:9126:).

- Fixed a performance regression for ``.loc`` indexing with an array or list-like (:issue:`9126`).
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
- Improved the speed of ``nunique`` by calling ``unique`` instead of ``value_counts`` (:issue:`9129`, :issue:`7771`)


Expand Down
9 changes: 6 additions & 3 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3225,14 +3225,17 @@ def _has_complex_internals(self):

@cache_readonly
def is_unique(self):
    """
    Return True when no entry of this index is flagged as a duplicate.

    The answer is derived from ``self.duplicated()``: the index is unique
    exactly when none of the flags it returns is set.
    """
    # The hash-table import that used to live here was not referenced by
    # this body, so it has been dropped.
    return not self.duplicated().any()

@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, take_last=False):
    # No inline docstring is written here: the decorator above receives the
    # shared 'duplicated' documentation template for this method.
    # NOTE(review): presumably Appender attaches it as __doc__ -- confirm.
    from pandas.core.groupby import get_flat_ids
    from pandas.hashtable import duplicated_int64

    # Length of every level, in level order.
    shape = map(len, self.levels)
    # Work on the integer labels rather than on the materialised values;
    # presumably get_flat_ids folds the per-level labels into one int64 id
    # per row (duplicated_int64 requires a 1-d int64 array).
    ids = get_flat_ids(self.labels, shape, False)

    # Return the per-row duplicate flags. The old scalar uniqueness check
    # (hash table + ``len(table.unique(ids)) == len(self)``) is gone: it
    # used a name not imported in this scope and returned before this line.
    return duplicated_int64(ids, take_last)

def get_value(self, series, key):
# somewhat broken encapsulation
Expand Down
24 changes: 24 additions & 0 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1062,3 +1062,27 @@ def mode_int64(ndarray[int64_t] values):
kh_destroy_int64(table)

return modes[:j+1]


@cython.wraparound(False)
@cython.boundscheck(False)
def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
    """
    Flag the entries of ``values`` that repeat an entry already visited.

    Parameters
    ----------
    values : ndarray[int64_t, ndim=1]
        The keys to scan.
    take_last : int
        When zero, ``values`` is scanned front to back; when non-zero it is
        scanned back to front, so the scan direction decides which
        occurrence of a repeated key is left unflagged.

    Returns
    -------
    ndarray of bool, same length as ``values``
        Element ``i`` is True when inserting ``values[i]`` into the hash set
        reported a status of 0 (khash convention: key already present --
        see the khash documentation).
    """
    cdef:
        int status = 0
        Py_ssize_t pos, count = len(values)
        kh_int64_t * seen = kh_init_int64()
        ndarray[uint8_t, ndim=1, cast=True] flags = np.empty(count, dtype='bool')

    # Pre-size the table, capped at 2**20 slots.
    kh_resize_int64(seen, min(1 << 20, count))

    if not take_last:
        # Forward scan.
        for pos in range(count):
            kh_put_int64(seen, values[pos], &status)
            flags[pos] = status == 0
    else:
        # Backward scan.
        for pos in range(count - 1, -1, -1):
            kh_put_int64(seen, values[pos], &status)
            flags[pos] = status == 0

    # The table is a manually managed C structure; release it before returning.
    kh_destroy_int64(seen)
    return flags
11 changes: 11 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3514,6 +3514,17 @@ def check(nlevels, with_nulls):
check(8, False)
check(8, True)

# GH 9125
n, k = 200, 5000
levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
labels = [np.random.choice(n, k * n) for lev in levels]
mi = MultiIndex(levels=levels, labels=labels)

for take_last in [False, True]:
left = mi.duplicated(take_last=take_last)
right = pd.lib.duplicated(mi.values, take_last=take_last)
tm.assert_array_equal(left, right)

def test_tolist(self):
result = self.index.tolist()
exp = list(self.index.values)
Expand Down
11 changes: 11 additions & 0 deletions vb_suite/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,17 @@
name='multiindex_with_datetime_level_sliced',
start_date=datetime(2014, 10, 11))

# multi-index duplicated
# Benchmark of ``MultiIndex.duplicated()`` on a three-level index.
# The setup code below builds ``mi`` from three levels of n = 200 entries
# each (integers, strings, integers offset by 1000) and, for every level,
# k * n = 1,000,000 labels drawn with ``np.random.choice``.
setup = common_setup + """
n, k = 200, 5000
levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
labels = [np.random.choice(n, k * n) for lev in levels]
mi = MultiIndex(levels=levels, labels=labels)
"""

# The benchmarked statement is a single ``mi.duplicated()`` call with the
# default arguments, using the ``mi`` created by ``setup``.
# NOTE(review): presumably run by the vbench ``Benchmark`` harness -- confirm.
multiindex_duplicated = Benchmark('mi.duplicated()', setup,
name='multiindex_duplicated')

#----------------------------------------------------------------------
# repr

Expand Down

0 comments on commit 8f4a321

Please sign in to comment.