Skip to content

Commit

Permalink
Faster unstack (pydata#2364)
Browse files Browse the repository at this point in the history
* Make dataset.unstack faster by skipping reindex if not necessary.

* Remove prints, add comment

* Added asv benchmark for unstacking

* Added test

* Simplified test

* Added whats-new entry

* PEP8

* Made asv test faster
  • Loading branch information
Maximilian Maahn authored and fujiisoup committed Aug 15, 2018
1 parent b87b684 commit c27ca43
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 3 deletions.
25 changes: 25 additions & 0 deletions asv_bench/benchmarks/unstacking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from __future__ import absolute_import, division, print_function

import numpy as np
import xarray as xr

from . import requires_dask


class Unstacking(object):
    """asv benchmarks that time ``unstack`` on a stacked DataArray."""

    def setup(self):
        """Create the stacked array shared by the timing methods."""
        # Fixed seed so every benchmark run sees the same values.
        rng = np.random.RandomState(0)
        values = rng.randn(1, 1000, 500)
        array = xr.DataArray(values)
        # Collapse the last two axes into a single MultiIndex dimension.
        self.ds = array.stack(flat_dim=['dim_1', 'dim_2'])

    def time_unstack_fast(self):
        """Time unstacking the array exactly as it came out of ``stack``."""
        self.ds.unstack('flat_dim')

    def time_unstack_slow(self):
        """Time unstacking after the stacked dimension has been reversed."""
        # Reversing ``flat_dim`` changes the order of the MultiIndex, so it
        # no longer equals the plain product of its levels.
        reordered = self.ds[:, ::-1]
        reordered.unstack('flat_dim')


class UnstackingDask(Unstacking):
    """Run the ``Unstacking`` benchmarks on a chunked version of the array."""

    def setup(self, *args, **kwargs):
        """Build the parent's array, then chunk it along ``flat_dim``.

        Positional arguments are accepted but not forwarded; only keyword
        arguments are handed to the parent ``setup``.
        """
        # NOTE(review): presumably skips/aborts when dask is unavailable --
        # confirm against the helper in the benchmarks package.
        requires_dask()
        parent = super(UnstackingDask, self)
        parent.setup(**kwargs)
        chunk_sizes = {'flat_dim': 50}
        self.ds = self.ds.chunk(chunk_sizes)
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ Enhancements
(:issue:`2331`)
By `Maximilian Roos <https://github.com/max-sixty>`_.

- Applying ``unstack`` to a large DataArray or Dataset is now much faster if the MultiIndex has not been modified after stacking the indices.
(:issue:`1560`)
By `Maximilian Maahn <https://github.com/maahn>`_.


Bug fixes
~~~~~~~~~
Expand Down
9 changes: 7 additions & 2 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2324,8 +2324,13 @@ def unstack(self, dim):
'a MultiIndex')

full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
obj = self.reindex(copy=False, **{dim: full_idx})


# take a shortcut in case the MultiIndex was not modified.
if index.equals(full_idx):
obj = self
else:
obj = self.reindex(copy=False, **{dim: full_idx})

new_dim_names = index.names
new_dim_sizes = [lev.size for lev in index.levels]

Expand Down
15 changes: 14 additions & 1 deletion xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2113,7 +2113,7 @@ def test_unstack_errors(self):
with raises_regex(ValueError, 'does not have a MultiIndex'):
ds.unstack('x')

def test_stack_unstack(self):
def test_stack_unstack_fast(self):
ds = Dataset({'a': ('x', [0, 1]),
'b': (('x', 'y'), [[0, 1], [2, 3]]),
'x': [0, 1],
Expand All @@ -2124,6 +2124,19 @@ def test_stack_unstack(self):
actual = ds[['b']].stack(z=['x', 'y']).unstack('z')
assert actual.identical(ds[['b']])

def test_stack_unstack_slow(self):
    """Round-trip stack -> reverse -> unstack must reproduce the dataset.

    Reversing the stacked dimension reorders the MultiIndex, so ``unstack``
    cannot skip its reindex step here.
    """
    ds = Dataset({'a': ('x', [0, 1]),
                  'b': (('x', 'y'), [[0, 1], [2, 3]]),
                  'x': [0, 1],
                  'y': ['a', 'b']})
    backwards = slice(None, None, -1)

    # Full dataset: 'a' lacks the 'y' dimension, so compare after broadcast.
    flipped = ds.stack(z=['x', 'y']).isel(z=backwards)
    roundtrip = flipped.unstack('z')
    assert roundtrip.broadcast_equals(ds)

    # Only 'b', which spans both stacked dimensions: expect exact identity.
    flipped = ds[['b']].stack(z=['x', 'y']).isel(z=backwards)
    roundtrip = flipped.unstack('z')
    assert roundtrip.identical(ds[['b']])

def test_update(self):
data = create_test_data(seed=0)
expected = data.copy()
Expand Down

0 comments on commit c27ca43

Please sign in to comment.