Skip to content

Commit

Permalink
TST/PERF: optimize tm.makeStringIndex
Browse files Browse the repository at this point in the history
* add tm.rands_array(nchars, size) function to generate string arrays

* add tm.randu_array(nchars, size) function to generate unicode arrays

* replace [rands(N) for _ in range(X)] idioms:
  - with makeStringIndex in benchmarks to maintain backward compatibility
  - with rands_array in tests to maintain 1-to-1 type correspondence
  • Loading branch information
immerrr committed Oct 18, 2014
1 parent 2baaefe commit cf599d9
Show file tree
Hide file tree
Showing 21 changed files with 134 additions and 105 deletions.
4 changes: 2 additions & 2 deletions doc/source/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,9 @@ behavior:

.. ipython:: python
from pandas.util.testing import rands
from pandas.util.testing import rands_array
df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'],
index=[rands(5) for _ in range(10)])
index=rands_array(5, 10))
df
concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']],
Expand Down
5 changes: 0 additions & 5 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2110,11 +2110,6 @@ def _count_not_none(*args):
# miscellaneous python tools


def rands(n):
    """Generate a random alphanumeric string of length *n*.

    Characters are drawn independently (with replacement) from the ASCII
    letters and digits, so *n* may exceed the 62-character alphabet and
    characters may repeat.  Sampling without replacement would raise
    ``ValueError`` for ``n > 62``.

    Parameters
    ----------
    n : int
        Number of characters in the generated string.

    Returns
    -------
    str
        A string of exactly *n* characters; the empty string when *n* is 0.

    Raises
    ------
    ValueError
        If *n* is negative.
    """
    from random import Random
    import string

    if n < 0:
        raise ValueError("n must be non-negative")

    alphabet = string.ascii_letters + string.digits
    # A private Random instance keeps the result independent of any seeding
    # applied to the module-level generator.
    pick = Random().choice
    return ''.join(pick(alphabet) for _ in range(n))


def adjoin(space, *lists):
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ def test_long_strings(self):
# GH6166
# unconversion of long strings was being chopped in earlier
# versions of numpy < 1.7.2
df = DataFrame({'a': [tm.rands(100) for _ in range(10)]},
index=[tm.rands(100) for _ in range(10)])
df = DataFrame({'a': tm.rands_array(100, size=10)},
index=tm.rands_array(100, size=10))

with ensure_clean_store(self.path) as store:
store.append('df', df, data_columns=['a'])
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,6 @@ def test_repr_binary_type():
assert_equal(res, b)


def test_rands():
    """com.rands(n) must return a string of exactly n characters."""
    wanted = 10
    assert len(com.rands(wanted)) == wanted


def test_adjoin():
data = [['a', 'b', 'c'],
['dd', 'ee', 'ff'],
Expand Down
50 changes: 23 additions & 27 deletions pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,9 +1201,8 @@ def test_pprint_thing(self):

def test_wide_repr(self):
with option_context('mode.sim_interactive', True, 'display.show_dimensions', True):
col = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols - 1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
set_option('display.expand_frame_repr', False)
rep_str = repr(df)

Expand All @@ -1227,9 +1226,8 @@ def test_wide_repr_wide_columns(self):

def test_wide_repr_named(self):
with option_context('mode.sim_interactive', True):
col = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
df.index.name = 'DataFrame Index'
set_option('display.expand_frame_repr', False)

Expand All @@ -1249,11 +1247,10 @@ def test_wide_repr_named(self):

def test_wide_repr_multiindex(self):
with option_context('mode.sim_interactive', True):
col = lambda l, k: [tm.rands(k) for _ in range(l)]
midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
np.array(col(10, 5))])
midx = pandas.MultiIndex.from_arrays(
tm.rands_array(5, size=(2, 10)))
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)),
index=midx)
df.index.names = ['Level 0', 'Level 1']
set_option('display.expand_frame_repr', False)
Expand All @@ -1274,12 +1271,11 @@ def test_wide_repr_multiindex(self):
def test_wide_repr_multiindex_cols(self):
with option_context('mode.sim_interactive', True):
max_cols = get_option('display.max_columns')
col = lambda l, k: [tm.rands(k) for _ in range(l)]
midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
np.array(col(10, 5))])
mcols = pandas.MultiIndex.from_arrays([np.array(col(max_cols-1, 3)),
np.array(col(max_cols-1, 3))])
df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
midx = pandas.MultiIndex.from_arrays(
tm.rands_array(5, size=(2, 10)))
mcols = pandas.MultiIndex.from_arrays(
tm.rands_array(3, size=(2, max_cols - 1)))
df = DataFrame(tm.rands_array(25, (10, max_cols - 1)),
index=midx, columns=mcols)
df.index.names = ['Level 0', 'Level 1']
set_option('display.expand_frame_repr', False)
Expand All @@ -1296,9 +1292,8 @@ def test_wide_repr_multiindex_cols(self):

def test_wide_repr_unicode(self):
with option_context('mode.sim_interactive', True):
col = lambda l, k: [tm.randu(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
set_option('display.expand_frame_repr', False)
rep_str = repr(df)
set_option('display.expand_frame_repr', True)
Expand Down Expand Up @@ -1877,30 +1872,31 @@ def test_repr_html(self):
self.reset_display_options()

def test_repr_html_wide(self):
row = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')
df = DataFrame([row(max_cols-1, 25) for _ in range(10)])
df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
reg_repr = df._repr_html_()
assert "..." not in reg_repr

wide_df = DataFrame([row(max_cols+1, 25) for _ in range(10)])
wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1)))
wide_repr = wide_df._repr_html_()
assert "..." in wide_repr

def test_repr_html_wide_multiindex_cols(self):
row = lambda l, k: [tm.rands(k) for _ in range(l)]
max_cols = get_option('display.max_columns')

tuples = list(itertools.product(np.arange(max_cols//2), ['foo', 'bar']))
mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
mcols = pandas.MultiIndex.from_product([np.arange(max_cols//2),
['foo', 'bar']],
names=['first', 'second'])
df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
columns=mcols)
reg_repr = df._repr_html_()
assert '...' not in reg_repr


tuples = list(itertools.product(np.arange(1+(max_cols//2)), ['foo', 'bar']))
mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
mcols = pandas.MultiIndex.from_product((np.arange(1+(max_cols//2)),
['foo', 'bar']),
names=['first', 'second'])
df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
columns=mcols)
wide_repr = df._repr_html_()
assert '...' in wide_repr

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4734,7 +4734,7 @@ def test_bytestring_with_unicode(self):

def test_very_wide_info_repr(self):
df = DataFrame(np.random.randn(10, 20),
columns=[tm.rands(10) for _ in range(20)])
columns=tm.rands_array(10, 20))
repr(df)

def test_repr_column_name_unicode_truncation_bug(self):
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from pandas import date_range,bdate_range, Timestamp
from pandas.core.index import Index, MultiIndex, Int64Index
from pandas.core.common import rands
from pandas.core.api import Categorical, DataFrame
from pandas.core.groupby import (SpecificationError, DataError,
_nargsort, _lexsort_indexer)
Expand Down Expand Up @@ -2579,7 +2578,7 @@ def test_cython_grouper_series_bug_noncontig(self):
self.assertTrue(result.isnull().all())

def test_series_grouper_noncontig_index(self):
index = Index([tm.rands(10) for _ in range(100)])
index = Index(tm.rands_array(10, 100))

values = Series(np.random.randn(50), index=index[::2])
labels = np.random.randint(0, 5, 50)
Expand Down Expand Up @@ -2869,8 +2868,8 @@ def test_column_select_via_attr(self):
assert_frame_equal(result, expected)

def test_rank_apply(self):
lev1 = np.array([rands(10) for _ in range(100)], dtype=object)
lev2 = np.array([rands(10) for _ in range(130)], dtype=object)
lev1 = tm.rands_array(10, 100)
lev2 = tm.rands_array(10, 130)
lab1 = np.random.randint(0, 100, size=500)
lab2 = np.random.randint(0, 130, size=500)

Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,7 @@ def test_getitem_setitem_ellipsis(self):
self.assertTrue((result == 5).all())

def test_getitem_negative_out_of_bounds(self):
s = Series([tm.rands(5) for _ in range(10)],
index=[tm.rands(10) for _ in range(10)])
s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))

self.assertRaises(IndexError, s.__getitem__, -11)
self.assertRaises(IndexError, s.__setitem__, -11, 'foo')
Expand Down Expand Up @@ -3852,11 +3851,10 @@ def _check_op(arr, op):
_check_op(arr, operator.floordiv)

def test_series_frame_radd_bug(self):
from pandas.util.testing import rands
import operator

# GH 353
vals = Series([rands(5) for _ in range(10)])
vals = Series(tm.rands_array(5, 10))
result = 'foo_' + vals
expected = vals.map(lambda x: 'foo_' + x)
assert_series_equal(result, expected)
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,22 @@ def test_bad_deprecate_kwarg(self):
def f4(new=None):
pass


def test_rands():
    """tm.rands(n) must return a string of exactly n characters."""
    wanted = 10
    assert len(tm.rands(wanted)) == wanted


def test_rands_array():
    """tm.rands_array honours both the requested shape and string length."""
    # (nchars, size, expected shape, element to probe)
    cases = [
        (5, 10, (10,), 0),
        (7, (10, 10), (10, 10), (1, 1)),
    ]
    for nchars, size, shape, probe in cases:
        arr = tm.rands_array(nchars, size=size)
        assert arr.shape == shape
        assert len(arr[probe]) == nchars


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
4 changes: 2 additions & 2 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pandas.tseries.index import DatetimeIndex
from pandas.tools.merge import merge, concat, ordered_merge, MergeError
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
assert_almost_equal, rands,
assert_almost_equal,
makeCustomDataframe as mkdf,
assertRaisesRegexp)
from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
Expand Down Expand Up @@ -913,7 +913,7 @@ def test_merge_right_vs_left(self):
def test_compress_group_combinations(self):

# ~ 40000000 possible unique groups
key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
key1 = tm.rands_array(10, 10000)
key1 = np.tile(key1, 2)
key2 = key1[::-1]

Expand Down
54 changes: 45 additions & 9 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,50 @@ def randbool(size=(), p=0.5):
return rand(*size) <= p


def rands(n):
    """Return a random string of *n* ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    # One independent draw per output character, so repeats are possible.
    picked = [random.choice(alphabet) for _ in range(n)]
    return ''.join(picked)
# Alphabet for rands/rands_array: ASCII letters and digits, stored as an array
# of fixed-width one-character strings.  The width of 1 matters: rands_array
# reinterprets runs of `nchars` such items as single strings via `.view`.
RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
dtype=(np.str_, 1))
# Alphabet for randu/randu_array: the 26 code points 1488..1513
# (U+05D0..U+05E9, in the Unicode Hebrew block) plus the ASCII digits, again
# stored as fixed-width one-character items.
RANDU_CHARS = np.array(list(u("").join(map(unichr, lrange(1488, 1488 + 26))) +
string.digits), dtype=(np.unicode_, 1))


def rands_array(nchars, size, dtype='O'):
    """Generate an array of random strings, each *nchars* characters long.

    Characters are taken from ``RANDS_CHARS`` through the module's ``choice``
    helper.

    Parameters
    ----------
    nchars : int
        Length of every generated string.
    size : int or tuple of ints
        Shape of the resulting array.
    dtype : dtype or None, default 'O'
        Type the result is converted to.  Pass ``None`` to get the
        fixed-width ``(np.str_, nchars)`` array without conversion.

    Returns
    -------
    numpy.ndarray
        Array of shape *size* holding the generated strings.
    """
    total_chars = nchars * np.prod(size)
    flat_chars = choice(RANDS_CHARS, size=total_chars)
    # Reinterpret each run of `nchars` one-character items as one string,
    # then fold the flat result into the requested shape.
    fixed_width = flat_chars.view((np.str_, nchars)).reshape(size)
    if dtype is not None:
        return fixed_width.astype(dtype)
    return fixed_width


def randu_array(nchars, size, dtype='O'):
    """Generate an array of random unicode strings of *nchars* characters.

    Characters are taken from ``RANDU_CHARS`` through the module's ``choice``
    helper.

    Parameters
    ----------
    nchars : int
        Length of every generated string.
    size : int or tuple of ints
        Shape of the resulting array.
    dtype : dtype or None, default 'O'
        Type the result is converted to.  Pass ``None`` to get the
        fixed-width ``(np.unicode_, nchars)`` array without conversion.

    Returns
    -------
    numpy.ndarray
        Array of shape *size* holding the generated strings.
    """
    n_items = np.prod(size)
    drawn = choice(RANDU_CHARS, size=nchars * n_items)
    # Every `nchars` consecutive one-character items become one string.
    strings = drawn.view((np.unicode_, nchars)).reshape(size)
    return strings if dtype is None else strings.astype(dtype)


def randu(n):
    """Return a random unicode string of *n* characters.

    The alphabet is the 26 code points starting at 1488 followed by the
    ASCII digits.
    """
    letters = u("").join(map(unichr, lrange(1488, 1488 + 26)))
    alphabet = letters + string.digits
    picked = [random.choice(alphabet) for _ in range(n)]
    return ''.join(picked)
def rands(nchars):
    """
    Generate a single random string of *nchars* characters from RANDS_CHARS.

    Use `rands_array` instead when a whole array of random strings is needed.
    """
    chars = choice(RANDS_CHARS, nchars)
    return ''.join(chars)


def randu(nchars):
    """
    Generate a single random unicode string of *nchars* characters from
    RANDU_CHARS.

    Use `randu_array` instead when a whole array of random unicode strings is
    needed.
    """
    chars = choice(RANDU_CHARS, nchars)
    return ''.join(chars)


def choice(x, size=10):
Expand Down Expand Up @@ -743,10 +778,11 @@ def getArangeMat():

# make index
def makeStringIndex(k=10):
return Index([rands(10) for _ in range(k)])
return Index(rands_array(nchars=10, size=k))


def makeUnicodeIndex(k=10):
return Index([randu(10) for _ in range(k)])
return Index(randu_array(nchars=10, size=k))

def makeBoolIndex(k=10):
if k == 1:
Expand Down
4 changes: 2 additions & 2 deletions vb_suite/frame_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

setup = common_setup + """
N, K = 5000, 50
index = [rands(10) for _ in xrange(N)]
columns = [rands(10) for _ in xrange(K)]
index = tm.makeStringIndex(N)
columns = tm.makeStringIndex(K)
frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)
try:
Expand Down
2 changes: 1 addition & 1 deletion vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def f():
setup = common_setup + """
K = 1000
N = 100000
uniques = np.array([rands(10) for x in xrange(K)], dtype='O')
uniques = tm.makeStringIndex(K).values
s = Series(np.tile(uniques, N // K))
"""

Expand Down
Loading

0 comments on commit cf599d9

Please sign in to comment.