Faster unstack (pydata#2364)

* Make dataset.unstack faster by skipping reindex if not necessary.
* Remove prints, add comment
* Add asv benchmark for unstacking
* Add test
* Simplify test
* Add whats-new entry
* PEP8
* Make asv test faster
tomchor · Aug 15, 2018 · c27ca43 · c27ca43
1 parent b87b684
commit c27ca43
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 3 deletions.
diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py
@@ -0,0 +1,25 @@
+from __future__ import absolute_import, division, print_function
+
+import numpy as np
+import xarray as xr
+
+from . import requires_dask
+
+
+class Unstacking(object):
+    """Benchmarks for ``unstack`` on a stacked DataArray."""
+
+    def setup(self):
+        """Build the stacked array used by the timing methods."""
+        # Seeded generator, so every run works on the same (1, 1000, 500) data.
+        data = np.random.RandomState(0).randn(1, 1000, 500)
+        # Combine 'dim_1' and 'dim_2' into one stacked dimension 'flat_dim'.
+        self.ds = xr.DataArray(data).stack(flat_dim=['dim_1', 'dim_2'])
+
+    def time_unstack_fast(self):
+        """Unstack the array exactly as ``setup`` stacked it."""
+        self.ds.unstack('flat_dim')
+
+    def time_unstack_slow(self):
+        """Unstack after reversing the order along the second axis."""
+        # NOTE(review): presumably the reversal keeps the index from matching
+        # the full product index, so the reindex path is taken -- confirm.
+        # The slicing itself is part of what gets timed.
+        self.ds[:, ::-1].unstack('flat_dim')
+
+
+class UnstackingDask(Unstacking):
+    """Same benchmarks as ``Unstacking``, run on a chunked array."""
+
+    def setup(self, *args, **kwargs):
+        """Build the parent's stacked array, then chunk it."""
+        # NOTE(review): requires_dask comes from the benchmarks package;
+        # presumably it skips this benchmark when dask is missing -- confirm.
+        requires_dask()
+        # Only **kwargs is forwarded; *args is accepted but not passed on.
+        super(UnstackingDask, self).setup(**kwargs)
+        # Re-chunk along 'flat_dim' with a chunk size of 50.
+        self.ds = self.ds.chunk({'flat_dim': 50})
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -52,6 +52,10 @@ Enhancements
   (:issue:`2331`)
   By `Maximilian Roos <https://github.com/max-sixty>`_.
 
+- Applying ``unstack`` to a large DataArray or Dataset is now much faster if the MultiIndex has not been modified after stacking the indices.
+  (:issue:`1560`)
+  By `Maximilian Maahn <https://github.com/maahn>`_.
+
 
 Bug fixes
 ~~~~~~~~~

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -2324,8 +2324,13 @@ def unstack(self, dim):
                              'a MultiIndex')
 
         full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
-        obj = self.reindex(copy=False, **{dim: full_idx})
-
+
+        # take a shortcut in case the MultiIndex was not modified.
+        if index.equals(full_idx):
+            obj = self
+        else:
+            obj = self.reindex(copy=False, **{dim: full_idx})
+
         new_dim_names = index.names
         new_dim_sizes = [lev.size for lev in index.levels]
 

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -2113,7 +2113,7 @@ def test_unstack_errors(self):
         with raises_regex(ValueError, 'does not have a MultiIndex'):
             ds.unstack('x')
 
-    def test_stack_unstack(self):
+    def test_stack_unstack_fast(self):
         ds = Dataset({'a': ('x', [0, 1]),
                       'b': (('x', 'y'), [[0, 1], [2, 3]]),
                       'x': [0, 1],
@@ -2124,6 +2124,19 @@ def test_stack_unstack(self):
         actual = ds[['b']].stack(z=['x', 'y']).unstack('z')
         assert actual.identical(ds[['b']])
 
+    def test_stack_unstack_slow(self):
+        """Round-trip stack/unstack with the stacked dimension reversed."""
+        ds = Dataset({'a': ('x', [0, 1]),
+                      'b': (('x', 'y'), [[0, 1], [2, 3]]),
+                      'x': [0, 1],
+                      'y': ['a', 'b']})
+        stacked = ds.stack(z=['x', 'y'])
+        # Reverse 'z' before unstacking.
+        # NOTE(review): presumably this makes the MultiIndex differ from the
+        # full product index so unstack cannot take its shortcut -- confirm.
+        actual = stacked.isel(z=slice(None, None, -1)).unstack('z')
+        # NOTE(review): broadcast_equals rather than identical, presumably
+        # because 'a' has no 'y' dimension before stacking -- confirm.
+        assert actual.broadcast_equals(ds)
+
+        # Same round trip restricted to 'b'; here the result must be identical.
+        stacked = ds[['b']].stack(z=['x', 'y'])
+        actual = stacked.isel(z=slice(None, None, -1)).unstack('z')
+        assert actual.identical(ds[['b']])
+
     def test_update(self):
         data = create_test_data(seed=0)
         expected = data.copy()