Skip to content

Commit

Permalink
Faster unstacking (pydata#4746)
Browse files Browse the repository at this point in the history
* Significantly improve unstacking performance

* Hack to get sparse tests passing

* Use the existing unstack function for dask & sparse

* Add whatsnew

* Require numpy 1.17 for new unstack

* Also special case pint

* Revert "Also special case pint"

This reverts commit b33aded.

* Only run fast unstack on numpy arrays

* Update asvs for unstacking

* Update whatsnew
  • Loading branch information
max-sixty authored Jan 24, 2021
1 parent d555172 commit a0c71c1
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 12 deletions.
15 changes: 10 additions & 5 deletions asv_bench/benchmarks/unstacking.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,23 @@

class Unstacking:
def setup(self):
data = np.random.RandomState(0).randn(1, 1000, 500)
self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"])
data = np.random.RandomState(0).randn(500, 1000)
self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
self.da_missing = self.da_full[:-1]
self.df_missing = self.da_missing.to_pandas()

def time_unstack_fast(self):
self.ds.unstack("flat_dim")
self.da_full.unstack("flat_dim")

def time_unstack_slow(self):
self.ds[:, ::-1].unstack("flat_dim")
self.da_missing.unstack("flat_dim")

def time_unstack_pandas_slow(self):
self.df_missing.unstack()


class UnstackingDask(Unstacking):
def setup(self, *args, **kwargs):
requires_dask()
super().setup(**kwargs)
self.ds = self.ds.chunk({"flat_dim": 50})
self.da_full = self.da_full.chunk({"flat_dim": 50})
7 changes: 6 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ What's New
.. _whats-new.0.16.3:

v0.16.3 (unreleased)
v0.17.0 (unreleased)
--------------------

Breaking changes
Expand Down Expand Up @@ -45,6 +45,11 @@ Breaking changes

New Features
~~~~~~~~~~~~
- Significantly higher ``unstack`` performance on numpy-backed arrays which
contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
(:pull:`4746`);
By `Maximilian Roos <https://github.com/max-sixty>`_.

- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
Expand Down
75 changes: 72 additions & 3 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import warnings
from collections import defaultdict
from distutils.version import LooseVersion
from html import escape
from numbers import Number
from operator import methodcaller
Expand Down Expand Up @@ -79,7 +80,7 @@
)
from .missing import get_clean_interp_index
from .options import OPTIONS, _get_keep_attrs
from .pycompat import is_duck_dask_array
from .pycompat import is_duck_dask_array, sparse_array_type
from .utils import (
Default,
Frozen,
Expand Down Expand Up @@ -3715,7 +3716,40 @@ def ensure_stackable(val):

return data_array

def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset":
def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
index = self.get_index(dim)
index = remove_unused_levels_categories(index)

variables: Dict[Hashable, Variable] = {}
indexes = {k: v for k, v in self.indexes.items() if k != dim}

for name, var in self.variables.items():
if name != dim:
if dim in var.dims:
if isinstance(fill_value, Mapping):
fill_value_ = fill_value[name]
else:
fill_value_ = fill_value

variables[name] = var._unstack_once(
index=index, dim=dim, fill_value=fill_value_
)
else:
variables[name] = var

for name, lev in zip(index.names, index.levels):
variables[name] = IndexVariable(name, lev)
indexes[name] = lev

coord_names = set(self._coord_names) - {dim} | set(index.names)

return self._replace_with_new_dims(
variables, coord_names=coord_names, indexes=indexes
)

def _unstack_full_reindex(
self, dim: Hashable, fill_value, sparse: bool
) -> "Dataset":
index = self.get_index(dim)
index = remove_unused_levels_categories(index)
full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
Expand Down Expand Up @@ -3812,7 +3846,38 @@ def unstack(

result = self.copy(deep=False)
for dim in dims:
result = result._unstack_once(dim, fill_value, sparse)

if (
# Dask arrays don't support assignment by index, which the fast unstack
# function requires.
# https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
any(is_duck_dask_array(v.data) for v in self.variables.values())
# Sparse doesn't currently support (though we could special-case
# it)
# https://github.com/pydata/sparse/issues/422
or any(
isinstance(v.data, sparse_array_type)
for v in self.variables.values()
)
or sparse
# numpy full_like only added `shape` in 1.17
or LooseVersion(np.__version__) < LooseVersion("1.17")
# Until https://github.com/pydata/xarray/pull/4751 is resolved,
# we check explicitly whether it's a numpy array. Once that is
# resolved, explicitly exclude pint arrays.
# # pint doesn't implement `np.full_like` in a way that's
# # currently compatible.
# # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
# # or any(
# # isinstance(v.data, pint_array_type) for v in self.variables.values()
# # )
or any(
not isinstance(v.data, np.ndarray) for v in self.variables.values()
)
):
result = result._unstack_full_reindex(dim, fill_value, sparse)
else:
result = result._unstack_once(dim, fill_value)
return result

def update(self, other: "CoercibleMapping") -> "Dataset":
Expand Down Expand Up @@ -4982,6 +5047,10 @@ def _set_numpy_data_from_dataframe(
self[name] = (dims, values)
return

# NB: similar, more general logic, now exists in
# variable.unstack_once; we could consider combining them at some
# point.

shape = tuple(lev.size for lev in idx.levels)
indexer = tuple(idx.codes)

Expand Down
68 changes: 65 additions & 3 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
Any,
Dict,
Hashable,
List,
Mapping,
Optional,
Sequence,
Expand Down Expand Up @@ -1488,7 +1489,7 @@ def set_dims(self, dims, shape=None):
)
return expanded_var.transpose(*dims)

def _stack_once(self, dims, new_dim):
def _stack_once(self, dims: List[Hashable], new_dim: Hashable):
if not set(dims) <= set(self.dims):
raise ValueError("invalid existing dimensions: %s" % dims)

Expand Down Expand Up @@ -1544,7 +1545,15 @@ def stack(self, dimensions=None, **dimensions_kwargs):
result = result._stack_once(dims, new_dim)
return result

def _unstack_once(self, dims, old_dim):
def _unstack_once_full(
self, dims: Mapping[Hashable, int], old_dim: Hashable
) -> "Variable":
"""
Unstacks the variable without needing an index.
Unlike `_unstack_once`, this function requires the existing dimension to
contain the full product of the new dimensions.
"""
new_dim_names = tuple(dims.keys())
new_dim_sizes = tuple(dims.values())

Expand Down Expand Up @@ -1573,13 +1582,64 @@ def _unstack_once(self, dims, old_dim):

return Variable(new_dims, new_data, self._attrs, self._encoding, fastpath=True)

def _unstack_once(
self,
index: pd.MultiIndex,
dim: Hashable,
fill_value=dtypes.NA,
) -> "Variable":
"""
Unstacks this variable given an index to unstack and the name of the
dimension to which the index refers.
"""

reordered = self.transpose(..., dim)

new_dim_sizes = [lev.size for lev in index.levels]
new_dim_names = index.names
indexer = index.codes

# Potentially we could replace `len(other_dims)` with just `-1`
other_dims = [d for d in self.dims if d != dim]
new_shape = list(reordered.shape[: len(other_dims)]) + new_dim_sizes
new_dims = reordered.dims[: len(other_dims)] + new_dim_names

if fill_value is dtypes.NA:
is_missing_values = np.prod(new_shape) > np.prod(self.shape)
if is_missing_values:
dtype, fill_value = dtypes.maybe_promote(self.dtype)
else:
dtype = self.dtype
fill_value = dtypes.get_fill_value(dtype)
else:
dtype = self.dtype

# Currently fails on sparse due to https://github.com/pydata/sparse/issues/422
data = np.full_like(
self.data,
fill_value=fill_value,
shape=new_shape,
dtype=dtype,
)

# Indexer is a list of lists of locations. Each list is the locations
# on the new dimension. This is robust to the data being sparse; in that
# case the destinations will be NaN / zero.
data[(..., *indexer)] = reordered

return self._replace(dims=new_dims, data=data)

def unstack(self, dimensions=None, **dimensions_kwargs):
"""
Unstack an existing dimension into multiple new dimensions.
New dimensions will be added at the end, and the order of the data
along each new dimension will be in contiguous (C) order.
Note that unlike ``DataArray.unstack`` and ``Dataset.unstack``, this
method requires the existing dimension to contain the full product of
the new dimensions.
Parameters
----------
dimensions : mapping of hashable to mapping of hashable to int
Expand All @@ -1598,11 +1658,13 @@ def unstack(self, dimensions=None, **dimensions_kwargs):
See also
--------
Variable.stack
DataArray.unstack
Dataset.unstack
"""
dimensions = either_dict_or_kwargs(dimensions, dimensions_kwargs, "unstack")
result = self
for old_dim, dims in dimensions.items():
result = result._unstack_once(dims, old_dim)
result = result._unstack_once_full(dims, old_dim)
return result

def fillna(self, value):
Expand Down

0 comments on commit a0c71c1

Please sign in to comment.