improves merge performance when key space exceeds i8 bounds
behzadnouri committed Dec 28, 2014
1 parent def58c9 commit 7ff782b
Showing 4 changed files with 184 additions and 29 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -68,6 +68,7 @@ Performance
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
- Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`)
- Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)

Bug Fixes
~~~~~~~~~
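
The new ``merge`` entry above covers the case where the product of the join keys' cardinalities no longer fits in an ``int64``. A minimal sketch of such a merge (frame sizes and column names are illustrative, not taken from the commit):

```python
import numpy as np
import pandas as pd

n, low, high = 1 << 16, -1 << 10, 1 << 10

# seven integer key columns, each with up to 2**11 distinct values, so the
# combined key space is roughly (2**11)**7 = 2**77 -- far beyond int64
left = pd.DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG'))
right = left.iloc[np.random.permutation(n)].reset_index(drop=True)

# product of per-column cardinalities overflows int64, so this code path is hit
shape = [left[c].nunique() for c in left.columns]
print(np.prod(shape, dtype=object) > 2 ** 63)   # True

# previously the overflow branch built object arrays of tuples via lib.fast_zip;
# the merge behaves the same, it is just faster and leaner on memory now
out = pd.merge(left, right, how='outer')
assert len(out) == n   # one-to-one match, so no row growth
```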
75 changes: 47 additions & 28 deletions pandas/tools/merge.py
Expand Up @@ -4,7 +4,7 @@
import types

import numpy as np
from pandas.compat import range, long, lrange, lzip, zip
from pandas.compat import range, long, lrange, lzip, zip, map, filter
import pandas.compat as compat
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame, _merge_doc
@@ -450,39 +450,29 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
-------
"""
if len(left_keys) != len(right_keys):
raise AssertionError('left_key and right_keys must be the same length')
from functools import partial

left_labels = []
right_labels = []
group_sizes = []
assert len(left_keys) == len(right_keys), \
'left_key and right_keys must be the same length'

for lk, rk in zip(left_keys, right_keys):
llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
# bind `sort` arg. of _factorize_keys
fkeys = partial(_factorize_keys, sort=sort)

left_labels.append(llab)
right_labels.append(rlab)
group_sizes.append(count)
# get left & right join labels and num. of levels at each location
llab, rlab, shape = map(list, zip( * map(fkeys, left_keys, right_keys)))

max_groups = long(1)
for x in group_sizes:
max_groups *= long(x)
# get flat i8 keys from label lists
lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

if max_groups > 2 ** 63: # pragma: no cover
left_group_key, right_group_key, max_groups = \
_factorize_keys(lib.fast_zip(left_labels),
lib.fast_zip(right_labels))
else:
left_group_key = get_group_index(left_labels, group_sizes)
right_group_key = get_group_index(right_labels, group_sizes)

left_group_key, right_group_key, max_groups = \
_factorize_keys(left_group_key, right_group_key, sort=sort)
# factorize keys to a dense i8 space
# `count` is the num. of unique keys
# set(lkey) | set(rkey) == range(count)
lkey, rkey, count = fkeys(lkey, rkey)

# preserve left frame order if how == 'left' and sort == False
kwargs = {'sort':sort} if how == 'left' else {}
join_func = _join_functions[how]
return join_func(left_group_key, right_group_key, max_groups, **kwargs)
return join_func(lkey, rkey, count, **kwargs)
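
The rewritten ``_get_join_indexers`` above works in three steps: factorize each key level jointly for the left and right frames, pack the per-level labels into flat ``int64`` keys via ``_get_join_keys``, and factorize those flat keys into a dense ``range(count)`` space before handing them to the join function; for left joins the ``sort`` flag is forwarded so that ``sort=False`` preserves the left frame's row order. A standalone sketch of the same pipeline using only the public ``pd.factorize`` (values are made up for illustration):

```python
import numpy as np
import pandas as pd

left_keys = [np.array(['a', 'b', 'a', 'c']), np.array([1, 2, 1, 3])]
right_keys = [np.array(['a', 'c', 'd']), np.array([1, 3, 9])]

llab, rlab, shape = [], [], []
for lk, rk in zip(left_keys, right_keys):
    # factorize left and right together so equal values share one label
    codes, uniques = pd.factorize(np.concatenate([lk, rk]))
    llab.append(codes[:len(lk)].astype('i8'))
    rlab.append(codes[len(lk):].astype('i8'))
    shape.append(len(uniques))

# Horner-style packing of the per-level labels into one flat i8 key; this is
# equivalent to the stride arithmetic in _get_join_keys when nothing overflows
lkey = np.zeros(len(left_keys[0]), dtype='i8')
rkey = np.zeros(len(right_keys[0]), dtype='i8')
for ll, rl, size in zip(llab, rlab, shape):
    lkey = lkey * size + ll
    rkey = rkey * size + rl

# rows with equal flat keys are join matches; a second factorization of
# (lkey, rkey) then maps them onto the dense range(count) the join funcs expect
print(lkey, rkey)
```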


class _OrderedMerge(_MergeOperation):
@@ -590,9 +580,9 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
# if asked to sort or there are 1-to-many matches
join_index = left_ax.take(left_indexer)
return join_index, left_indexer, right_indexer
else:
# left frame preserves order & length of its index
return left_ax, None, right_indexer

# left frame preserves order & length of its index
return left_ax, None, right_indexer


def _right_outer_join(x, y, max_groups):
@@ -663,6 +653,35 @@ def _sort_labels(uniques, left, right):
return new_left, new_right


def _get_join_keys(llab, rlab, shape, sort):
from pandas.core.groupby import _int64_overflow_possible

# how many levels can be done without overflow
pred = lambda i: not _int64_overflow_possible(shape[:i])
nlev = next(filter(pred, range(len(shape), 0, -1)))

# get keys for the first `nlev` levels
stride = np.prod(shape[1:nlev], dtype='i8')
lkey = stride * llab[0].astype('i8', subok=False, copy=False)
rkey = stride * rlab[0].astype('i8', subok=False, copy=False)

for i in range(1, nlev):
stride //= shape[i]
lkey += llab[i] * stride
rkey += rlab[i] * stride

if nlev == len(shape): # all done!
return lkey, rkey

# densify current keys to avoid overflow
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

llab = [lkey] + llab[nlev:]
rlab = [rkey] + rlab[nlev:]
shape = [count] + shape[nlev:]

return _get_join_keys(llab, rlab, shape, sort)
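
``_get_join_keys`` above packs the labels as a mixed-radix number, but only as many leading levels as fit under the ``int64`` limit; the partial key is then densified with ``_factorize_keys`` and the remaining levels are folded in recursively. A small numeric sketch of the level-selection step (the overflow helper is re-implemented here for illustration; the real one is ``pandas.core.groupby._int64_overflow_possible``):

```python
import numpy as np

INT64_MAX = np.iinfo(np.int64).max

def int64_overflow_possible(shape):
    prod = 1
    for x in shape:
        prod *= int(x)
    return prod >= INT64_MAX

# four levels with 2**20 labels each: 2**80 combinations in total
shape = [1 << 20] * 4

# walk nlev down from len(shape) until the leading levels fit in one int64
nlev = next(i for i in range(len(shape), 0, -1)
            if not int64_overflow_possible(shape[:i]))
print(nlev)   # 3 -> pack levels 0..2 into one i8 chunk, factorize that chunk
              # down to `count` labels, then recurse on [count] + shape[3:]
```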

#----------------------------------------------------------------------
# Concatenate DataFrame objects

117 changes: 117 additions & 0 deletions pandas/tools/tests/test_merge.py
@@ -1141,6 +1141,10 @@ def test_merge_na_keys(self):
tm.assert_frame_equal(result, expected)

def test_int64_overflow_issues(self):
from itertools import product
from collections import defaultdict
from pandas.core.groupby import _int64_overflow_possible

# #2690, combinatorial explosion
df1 = DataFrame(np.random.randn(1000, 7),
columns=list('ABCDEF') + ['G1'])
@@ -1151,6 +1155,119 @@ def test_int64_overflow_issues(self):
result = merge(df1, df2, how='outer')
self.assertTrue(len(result) == 2000)

low, high, n = -1 << 10, 1 << 10, 1 << 20
left = DataFrame(np.random.randint(low, high, (n, 7)),
columns=list('ABCDEFG'))
left['left'] = left.sum(axis=1)

# one-2-one match
i = np.random.permutation(len(left))
right = left.iloc[i].copy()
right.columns = right.columns[:-1].tolist() + ['right']
right.index = np.arange(len(right))
right['right'] *= -1

out = merge(left, right, how='outer')
self.assertEqual(len(out), len(left))
assert_series_equal(out['left'], - out['right'])
assert_series_equal(out['left'], out.iloc[:, :-2].sum(axis=1))

out.sort(out.columns.tolist(), inplace=True)
out.index = np.arange(len(out))
for how in ['left', 'right', 'outer', 'inner']:
assert_frame_equal(out, merge(left, right, how=how, sort=True))

# check that left merge w/ sort=False maintains left frame order
out = merge(left, right, how='left', sort=False)
assert_frame_equal(left, out[left.columns.tolist()])

out = merge(right, left, how='left', sort=False)
assert_frame_equal(right, out[right.columns.tolist()])

# one-2-many/none match
n = 1 << 11
left = DataFrame(np.random.randint(low, high, (n, 7)),
columns=list('ABCDEFG'))

# confirm that this is checking what it is supposed to check
shape = left.apply(pd.Series.nunique).values
self.assertTrue(_int64_overflow_possible(shape))

# add duplicates to left frame
left = pd.concat([left, left], ignore_index=True)

right = DataFrame(np.random.randint(low, high, (n // 2, 7)),
columns=list('ABCDEFG'))

# add duplicates & overlap with left to the right frame
i = np.random.choice(len(left), n)
right = pd.concat([right, right, left.iloc[i]], ignore_index=True)

left['left'] = np.random.randn(len(left))
right['right'] = np.random.randn(len(right))

# shuffle left & right frames
i = np.random.permutation(len(left))
left = left.iloc[i].copy()
left.index = np.arange(len(left))

i = np.random.permutation(len(right))
right = right.iloc[i].copy()
right.index = np.arange(len(right))

# manually compute outer merge
ldict, rdict = defaultdict(list), defaultdict(list)

for idx, row in left.set_index(list('ABCDEFG')).iterrows():
ldict[idx].append(row['left'])

for idx, row in right.set_index(list('ABCDEFG')).iterrows():
rdict[idx].append(row['right'])

vals = []
for k, lval in ldict.items():
rval = rdict.get(k, [np.nan])
for lv, rv in product(lval, rval):
vals.append(k + tuple([lv, rv]))

for k, rval in rdict.items():
if k not in ldict:
for rv in rval:
vals.append(k + tuple([np.nan, rv]))

def align(df):
df = df.sort(df.columns.tolist())
df.index = np.arange(len(df))
return df

def verify_order(df):
kcols = list('ABCDEFG')
assert_frame_equal(df[kcols].copy(),
df[kcols].sort(kcols, kind='mergesort'))

out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right'])
out = align(out)

jmask = {'left': out['left'].notnull(),
'right': out['right'].notnull(),
'inner': out['left'].notnull() & out['right'].notnull(),
'outer': np.ones(len(out), dtype='bool')}

for how in 'left', 'right', 'outer', 'inner':
mask = jmask[how]
frame = align(out[mask].copy())
self.assertTrue(mask.all() ^ mask.any() or how == 'outer')

for sort in [False, True]:
res = merge(left, right, how=how, sort=sort)
if sort:
verify_order(res)

# as in GH9092 dtypes break with outer/right join
assert_frame_equal(frame, align(res),
check_dtype=how not in ('right', 'outer'))
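
The ``jmask`` bookkeeping above relies on every join flavor being a row subset of the outer merge, selected by which side produced a non-null value. A tiny self-contained illustration of that relationship (frames invented for this note; float keys sidestep the GH9092 dtype issue the test works around):

```python
import numpy as np
import pandas as pd

left = pd.DataFrame({'k': [1., 2.], 'left': [10., 20.]})
right = pd.DataFrame({'k': [2., 3.], 'right': [200., 300.]})

out = pd.merge(left, right, on='k', how='outer')
jmask = {'left': out['left'].notnull(),
         'right': out['right'].notnull(),
         'inner': out['left'].notnull() & out['right'].notnull(),
         'outer': np.ones(len(out), dtype='bool')}

for how, mask in jmask.items():
    expected = out[mask].reset_index(drop=True)
    result = pd.merge(left, right, on='k', how=how).reset_index(drop=True)
    assert result.equals(expected), how   # each flavor is the masked outer result
```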


def test_join_multi_levels(self):

# GH 3662
20 changes: 19 additions & 1 deletion vb_suite/join_merge.py
@@ -249,4 +249,22 @@ def sample(values, k):
columns=['jolie', 'jolia']).set_index('jolie')
'''

left_outer_join_index = Benchmark("left.join(right, on='jim')", setup)
left_outer_join_index = Benchmark("left.join(right, on='jim')", setup,
name='left_outer_join_index')


setup = common_setup + """
low, high, n = -1 << 10, 1 << 10, 1 << 20
left = DataFrame(np.random.randint(low, high, (n, 7)),
columns=list('ABCDEFG'))
left['left'] = left.sum(axis=1)
i = np.random.permutation(len(left))
right = left.iloc[i].copy()
right.columns = right.columns[:-1].tolist() + ['right']
right.index = np.arange(len(right))
right['right'] *= -1
"""

i8merge = Benchmark("merge(left, right, how='outer')", setup,
name='i8merge')
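
For a quick check outside the vbench harness, the ``i8merge`` scenario can also be timed directly with ``timeit`` (a rough sketch; the imports normally supplied by ``common_setup`` are spelled out explicitly, and the reported number is simply the best of three single runs):

```python
import timeit

setup = """
import numpy as np
from pandas import DataFrame, merge

low, high, n = -1 << 10, 1 << 10, 1 << 20
left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG'))
left['left'] = left.sum(axis=1)
i = np.random.permutation(len(left))
right = left.iloc[i].copy()
right.columns = right.columns[:-1].tolist() + ['right']
right.index = np.arange(len(right))
right['right'] *= -1
"""

print(min(timeit.repeat("merge(left, right, how='outer')",
                        setup=setup, repeat=3, number=1)))
```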
