TST/PERF: optimize tm.makeStringIndex

* add tm.rands_array(nchars, size) function to generate string arrays
* add tm.randu_array(nchars, size) function to generate unicode arrays
* replace [rands(N) for _ in range(X)] idioms:
  - with makeStringIndex in benchmarks to maintain backward compatibility
  - with rands_array in tests to maintain 1-to-1 type correspondence
blbradley · Oct 18, 2014 · cf599d9 · cf599d9
1 parent 2baaefe
commit cf599d9
Show file tree

Hide file tree

Showing 21 changed files with 134 additions and 105 deletions.
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -130,9 +130,9 @@ behavior:
 
 .. ipython:: python
 
-   from pandas.util.testing import rands
+   from pandas.util.testing import rands_array
    df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'],
-                  index=[rands(5) for _ in range(10)])
+                  index=rands_array(5, 10))
    df
 
    concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']],

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -2110,11 +2110,6 @@ def _count_not_none(*args):
 # miscellaneous python tools
 
 
-def rands(n):
-    """Generates a random alphanumeric string of length *n*"""
-    from random import Random
-    import string
-    return ''.join(Random().sample(string.ascii_letters + string.digits, n))
 
 
 def adjoin(space, *lists):

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -198,8 +198,8 @@ def test_long_strings(self):
         # GH6166
         # unconversion of long strings was being chopped in earlier
         # versions of numpy < 1.7.2
-        df = DataFrame({'a': [tm.rands(100) for _ in range(10)]},
-                       index=[tm.rands(100) for _ in range(10)])
+        df = DataFrame({'a': tm.rands_array(100, size=10)},
+                       index=tm.rands_array(100, size=10))
 
         with ensure_clean_store(self.path) as store:
             store.append('df', df, data_columns=['a'])

diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
@@ -274,11 +274,6 @@ def test_repr_binary_type():
     assert_equal(res, b)
 
 
-def test_rands():
-    r = com.rands(10)
-    assert(len(r) == 10)
-
-
 def test_adjoin():
     data = [['a', 'b', 'c'],
             ['dd', 'ee', 'ff'],

diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
@@ -1201,9 +1201,8 @@ def test_pprint_thing(self):
 
     def test_wide_repr(self):
         with option_context('mode.sim_interactive', True, 'display.show_dimensions', True):
-            col = lambda l, k: [tm.rands(k) for _ in range(l)]
             max_cols = get_option('display.max_columns')
-            df = DataFrame([col(max_cols - 1, 25) for _ in range(10)])
+            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
             set_option('display.expand_frame_repr', False)
             rep_str = repr(df)
 
@@ -1227,9 +1226,8 @@ def test_wide_repr_wide_columns(self):
 
     def test_wide_repr_named(self):
         with option_context('mode.sim_interactive', True):
-            col = lambda l, k: [tm.rands(k) for _ in range(l)]
             max_cols = get_option('display.max_columns')
-            df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
+            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
             df.index.name = 'DataFrame Index'
             set_option('display.expand_frame_repr', False)
 
@@ -1249,11 +1247,10 @@ def test_wide_repr_named(self):
 
     def test_wide_repr_multiindex(self):
         with option_context('mode.sim_interactive', True):
-            col = lambda l, k: [tm.rands(k) for _ in range(l)]
-            midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
-                                                  np.array(col(10, 5))])
+            midx = pandas.MultiIndex.from_arrays(
+                tm.rands_array(5, size=(2, 10)))
             max_cols = get_option('display.max_columns')
-            df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
+            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)),
                            index=midx)
             df.index.names = ['Level 0', 'Level 1']
             set_option('display.expand_frame_repr', False)
@@ -1274,12 +1271,11 @@ def test_wide_repr_multiindex(self):
     def test_wide_repr_multiindex_cols(self):
         with option_context('mode.sim_interactive', True):
             max_cols = get_option('display.max_columns')
-            col = lambda l, k: [tm.rands(k) for _ in range(l)]
-            midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
-                                                  np.array(col(10, 5))])
-            mcols = pandas.MultiIndex.from_arrays([np.array(col(max_cols-1, 3)),
-                                                   np.array(col(max_cols-1, 3))])
-            df = DataFrame([col(max_cols-1, 25) for _ in range(10)],
+            midx = pandas.MultiIndex.from_arrays(
+                tm.rands_array(5, size=(2, 10)))
+            mcols = pandas.MultiIndex.from_arrays(
+                tm.rands_array(3, size=(2, max_cols - 1)))
+            df = DataFrame(tm.rands_array(25, (10, max_cols - 1)),
                            index=midx, columns=mcols)
             df.index.names = ['Level 0', 'Level 1']
             set_option('display.expand_frame_repr', False)
@@ -1296,9 +1292,8 @@ def test_wide_repr_multiindex_cols(self):
 
     def test_wide_repr_unicode(self):
         with option_context('mode.sim_interactive', True):
-            col = lambda l, k: [tm.randu(k) for _ in range(l)]
             max_cols = get_option('display.max_columns')
-            df = DataFrame([col(max_cols-1, 25) for _ in range(10)])
+            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
             set_option('display.expand_frame_repr', False)
             rep_str = repr(df)
             set_option('display.expand_frame_repr', True)
@@ -1877,30 +1872,31 @@ def test_repr_html(self):
         self.reset_display_options()
 
     def test_repr_html_wide(self):
-        row = lambda l, k: [tm.rands(k) for _ in range(l)]
         max_cols = get_option('display.max_columns')
-        df = DataFrame([row(max_cols-1, 25) for _ in range(10)])
+        df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
         reg_repr = df._repr_html_()
         assert "..." not in reg_repr
 
-        wide_df = DataFrame([row(max_cols+1, 25) for _ in range(10)])
+        wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1)))
         wide_repr = wide_df._repr_html_()
         assert "..." in wide_repr
 
     def test_repr_html_wide_multiindex_cols(self):
-        row = lambda l, k: [tm.rands(k) for _ in range(l)]
         max_cols = get_option('display.max_columns')
 
-        tuples = list(itertools.product(np.arange(max_cols//2), ['foo', 'bar']))
-        mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
-        df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
+        mcols = pandas.MultiIndex.from_product([np.arange(max_cols//2),
+                                                ['foo', 'bar']],
+                                               names=['first', 'second'])
+        df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
+                       columns=mcols)
         reg_repr = df._repr_html_()
         assert '...' not in reg_repr
 
-
-        tuples = list(itertools.product(np.arange(1+(max_cols//2)), ['foo', 'bar']))
-        mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second'])
-        df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols)
+        mcols = pandas.MultiIndex.from_product((np.arange(1+(max_cols//2)),
+                                                ['foo', 'bar']),
+                                               names=['first', 'second'])
+        df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
+                       columns=mcols)
         wide_repr = df._repr_html_()
         assert '...' in wide_repr
 

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -4734,7 +4734,7 @@ def test_bytestring_with_unicode(self):
 
     def test_very_wide_info_repr(self):
         df = DataFrame(np.random.randn(10, 20),
-                       columns=[tm.rands(10) for _ in range(20)])
+                       columns=tm.rands_array(10, 20))
         repr(df)
 
     def test_repr_column_name_unicode_truncation_bug(self):

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -8,7 +8,6 @@
 
 from pandas import date_range,bdate_range, Timestamp
 from pandas.core.index import Index, MultiIndex, Int64Index
-from pandas.core.common import rands
 from pandas.core.api import Categorical, DataFrame
 from pandas.core.groupby import (SpecificationError, DataError,
                                  _nargsort, _lexsort_indexer)
@@ -2579,7 +2578,7 @@ def test_cython_grouper_series_bug_noncontig(self):
         self.assertTrue(result.isnull().all())
 
     def test_series_grouper_noncontig_index(self):
-        index = Index([tm.rands(10) for _ in range(100)])
+        index = Index(tm.rands_array(10, 100))
 
         values = Series(np.random.randn(50), index=index[::2])
         labels = np.random.randint(0, 5, 50)
@@ -2869,8 +2868,8 @@ def test_column_select_via_attr(self):
         assert_frame_equal(result, expected)
 
     def test_rank_apply(self):
-        lev1 = np.array([rands(10) for _ in range(100)], dtype=object)
-        lev2 = np.array([rands(10) for _ in range(130)], dtype=object)
+        lev1 = tm.rands_array(10, 100)
+        lev2 = tm.rands_array(10, 130)
         lab1 = np.random.randint(0, 100, size=500)
         lab2 = np.random.randint(0, 130, size=500)
 

diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -327,8 +327,7 @@ def test_getitem_setitem_ellipsis(self):
         self.assertTrue((result == 5).all())
 
     def test_getitem_negative_out_of_bounds(self):
-        s = Series([tm.rands(5) for _ in range(10)],
-                   index=[tm.rands(10) for _ in range(10)])
+        s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))
 
         self.assertRaises(IndexError, s.__getitem__, -11)
         self.assertRaises(IndexError, s.__setitem__, -11, 'foo')
@@ -3852,11 +3851,10 @@ def _check_op(arr, op):
         _check_op(arr, operator.floordiv)
 
     def test_series_frame_radd_bug(self):
-        from pandas.util.testing import rands
         import operator
 
         # GH 353
-        vals = Series([rands(5) for _ in range(10)])
+        vals = Series(tm.rands_array(5, 10))
         result = 'foo_' + vals
         expected = vals.map(lambda x: 'foo_' + x)
         assert_series_equal(result, expected)

diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py
@@ -59,6 +59,22 @@ def test_bad_deprecate_kwarg(self):
             def f4(new=None):
                 pass
 
+
+def test_rands():
+    r = tm.rands(10)
+    assert(len(r) == 10)
+
+
+def test_rands_array():
+    arr = tm.rands_array(5, size=10)
+    assert(arr.shape == (10,))
+    assert(len(arr[0]) == 5)
+
+    arr = tm.rands_array(7, size=(10, 10))
+    assert(arr.shape == (10, 10))
+    assert(len(arr[1, 1]) == 7)
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -14,7 +14,7 @@
 from pandas.tseries.index import DatetimeIndex
 from pandas.tools.merge import merge, concat, ordered_merge, MergeError
 from pandas.util.testing import (assert_frame_equal, assert_series_equal,
-                                 assert_almost_equal, rands,
+                                 assert_almost_equal,
                                  makeCustomDataframe as mkdf,
                                  assertRaisesRegexp)
 from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
@@ -913,7 +913,7 @@ def test_merge_right_vs_left(self):
     def test_compress_group_combinations(self):
 
         # ~ 40000000 possible unique groups
-        key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
+        key1 = tm.rands_array(10, 10000)
         key1 = np.tile(key1, 2)
         key2 = key1[::-1]
 

diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -193,15 +193,50 @@ def randbool(size=(), p=0.5):
     return rand(*size) <= p
 
 
-def rands(n):
-    choices = string.ascii_letters + string.digits
-    return ''.join(random.choice(choices) for _ in range(n))
+RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
+                       dtype=(np.str_, 1))
+RANDU_CHARS = np.array(list(u("").join(map(unichr, lrange(1488, 1488 + 26))) +
+                            string.digits), dtype=(np.unicode_, 1))
+
+
+def rands_array(nchars, size, dtype='O'):
+    """Generate an array of byte strings."""
+    retval = (choice(RANDS_CHARS, size=nchars * np.prod(size))
+              .view((np.str_, nchars)).reshape(size))
+    if dtype is None:
+        return retval
+    else:
+        return retval.astype(dtype)
+
+
+def randu_array(nchars, size, dtype='O'):
+    """Generate an array of unicode strings."""
+    retval = (choice(RANDU_CHARS, size=nchars * np.prod(size))
+              .view((np.unicode_, nchars)).reshape(size))
+    if dtype is None:
+        return retval
+    else:
+        return retval.astype(dtype)
 
 
-def randu(n):
-    choices = u("").join(map(unichr, lrange(1488, 1488 + 26)))
-    choices += string.digits
-    return ''.join([random.choice(choices) for _ in range(n)])
+def rands(nchars):
+    """
+    Generate one random byte string.
+
+    See `rands_array` if you want to create an array of random strings.
+
+    """
+    return ''.join(choice(RANDS_CHARS, nchars))
+
+
+def randu(nchars):
+    """
+    Generate one random unicode string.
+
+    See `randu_array` if you want to create an array of random unicode strings.
+
+    """
+    return ''.join(choice(RANDU_CHARS, nchars))
 
 
 def choice(x, size=10):
@@ -743,10 +778,11 @@ def getArangeMat():
 
 # make index
 def makeStringIndex(k=10):
-    return Index([rands(10) for _ in range(k)])
+    return Index(rands_array(nchars=10, size=k))
+
 
 def makeUnicodeIndex(k=10):
-    return Index([randu(10) for _ in range(k)])
+    return Index(randu_array(nchars=10, size=k))
 
 def makeBoolIndex(k=10):
     if k == 1:

diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py
@@ -17,8 +17,8 @@
 
 setup = common_setup + """
 N, K = 5000, 50
-index = [rands(10) for _ in xrange(N)]
-columns = [rands(10) for _ in xrange(K)]
+index = tm.makeStringIndex(N)
+columns = tm.makeStringIndex(K)
 frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)
 
 try:

diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -187,7 +187,7 @@ def f():
 setup = common_setup + """
 K = 1000
 N = 100000
-uniques = np.array([rands(10) for x in xrange(K)], dtype='O')
+uniques = tm.makeStringIndex(K).values
 s = Series(np.tile(uniques, N // K))
 """