Skip to content

Commit

Permalink
Bug fixes for Dataset.reduce() and n-dimensional cumsum/cumprod (pyda…
Browse files Browse the repository at this point in the history
…ta#2156)

* Bug fixes for Dataset.reduce() and n-dimensional cumsum/cumprod

Fixes GH1470, "Dataset.mean drops coordinates"

Fixes a bug where non-scalar data-variables that did not include the
aggregated dimension were not properly reduced:

    Previously::

        >>> ds = Dataset({'x': ('a', [2, 2]), 'y': 2, 'z': ('b', [2])})
        >>> ds.var('a')
        <xarray.Dataset>
        Dimensions:  (b: 1)
        Dimensions without coordinates: b
        Data variables:
            x        float64 0.0
            y        float64 0.0
            z        (b) int64 2

    Now::

        >>> ds.var('a')
        <xarray.Dataset>
        Dimensions:  (b: 1)
        Dimensions without coordinates: b
        Data variables:
            x        int64 0
            y        int64 0
            z        (b) int64 0

Finally, adds support for n-dimensional cumsum() and cumprod(), reducing
over all dimensions of an array. (This was necessary as part of the above fix.)

* Lint fixup

* remove confusing comments
  • Loading branch information
shoyer authored May 18, 2018
1 parent ecb10e3 commit c346d3b
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 38 deletions.
12 changes: 12 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,24 @@ Documentation
Enhancements
~~~~~~~~~~~~

- :py:meth:`~DataArray.cumsum` and :py:meth:`~DataArray.cumprod` now support
aggregation over multiple dimensions at the same time. This is the default
behavior when dimensions are not specified (previously this raised an error).
  By `Stephan Hoyer <https://github.com/shoyer>`_.

Bug fixes
~~~~~~~~~

- Fixed a bug where ``to_netcdf(..., unlimited_dims='bar')`` yielded NetCDF files
  with spurious 0-length dimensions (i.e. ``b``, ``a``, and ``r``) (:issue:`2134`).
By `Joe Hamman <https://github.com/jhamman>`_.

- Aggregations with :py:meth:`Dataset.reduce` (including ``mean``, ``sum``,
etc) no longer drop unrelated coordinates (:issue:`1470`). Also fixed a
bug where non-scalar data-variables that did not include the aggregation
dimension were improperly skipped.
  By `Stephan Hoyer <https://github.com/shoyer>`_.

.. _whats-new.0.10.4:

v0.10.4 (May 16, 2018)
Expand Down
38 changes: 19 additions & 19 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2594,26 +2594,26 @@ def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
variables = OrderedDict()
for name, var in iteritems(self._variables):
reduce_dims = [dim for dim in var.dims if dim in dims]
if reduce_dims or not var.dims:
if name not in self.coords:
if (not numeric_only or
np.issubdtype(var.dtype, np.number) or
(var.dtype == np.bool_)):
if len(reduce_dims) == 1:
# unpack dimensions for the benefit of functions
# like np.argmin which can't handle tuple arguments
reduce_dims, = reduce_dims
elif len(reduce_dims) == var.ndim:
# prefer to aggregate over axis=None rather than
# axis=(0, 1) if they will be equivalent, because
# the former is often more efficient
reduce_dims = None
variables[name] = var.reduce(func, dim=reduce_dims,
keep_attrs=keep_attrs,
allow_lazy=allow_lazy,
**kwargs)
if name in self.coords:
if not reduce_dims:
variables[name] = var
else:
variables[name] = var
if (not numeric_only or
np.issubdtype(var.dtype, np.number) or
(var.dtype == np.bool_)):
if len(reduce_dims) == 1:
# unpack dimensions for the benefit of functions
# like np.argmin which can't handle tuple arguments
reduce_dims, = reduce_dims
elif len(reduce_dims) == var.ndim:
# prefer to aggregate over axis=None rather than
# axis=(0, 1) if they will be equivalent, because
# the former is often more efficient
reduce_dims = None
variables[name] = var.reduce(func, dim=reduce_dims,
keep_attrs=keep_attrs,
allow_lazy=allow_lazy,
**kwargs)

coord_names = set(k for k in self.coords if k in variables)
attrs = self.attrs if keep_attrs else None
Expand Down
36 changes: 29 additions & 7 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,7 @@ def _nanvar_object(value, axis=None, **kwargs):


def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
no_bottleneck=False, coerce_strings=False,
keep_dims=False):
no_bottleneck=False, coerce_strings=False):
def f(values, axis=None, skipna=None, **kwargs):
if kwargs.pop('out', None) is not None:
raise TypeError('`out` is not valid for {}'.format(name))
Expand Down Expand Up @@ -343,7 +342,6 @@ def f(values, axis=None, skipna=None, **kwargs):
'or newer to use skipna=True or skipna=None' % name)
raise NotImplementedError(msg)
f.numeric_only = numeric_only
f.keep_dims = keep_dims
f.__name__ = name
return f

Expand All @@ -358,10 +356,34 @@ def f(values, axis=None, skipna=None, **kwargs):
var = _create_nan_agg_method('var', numeric_only=True)
median = _create_nan_agg_method('median', numeric_only=True)
prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True)
cumprod = _create_nan_agg_method('cumprod', numeric_only=True, np_compat=True,
no_bottleneck=True, keep_dims=True)
cumsum = _create_nan_agg_method('cumsum', numeric_only=True, np_compat=True,
no_bottleneck=True, keep_dims=True)
cumprod_1d = _create_nan_agg_method(
'cumprod', numeric_only=True, np_compat=True, no_bottleneck=True)
cumsum_1d = _create_nan_agg_method(
'cumsum', numeric_only=True, np_compat=True, no_bottleneck=True)


def _nd_cum_func(cum_func, array, axis, **kwargs):
array = asarray(array)
if axis is None:
axis = tuple(range(array.ndim))
if isinstance(axis, int):
axis = (axis,)

out = array
for ax in axis:
out = cum_func(out, axis=ax, **kwargs)
return out


def cumprod(array, axis=None, **kwargs):
    """N-dimensional version of cumprod.

    Applies the 1d ``cumprod`` along each axis in ``axis`` in turn;
    ``axis=None`` accumulates over every dimension of ``array``.
    """
    return _nd_cum_func(cumprod_1d, array, axis, **kwargs)


def cumsum(array, axis=None, **kwargs):
    """N-dimensional version of cumsum.

    Applies the 1d ``cumsum`` along each axis in ``axis`` in turn;
    ``axis=None`` accumulates over every dimension of ``array``.
    """
    return _nd_cum_func(cumsum_1d, array, axis, **kwargs)


_fail_on_dask_array_input_skipna = partial(
fail_on_dask_array_input,
Expand Down
5 changes: 0 additions & 5 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1256,11 +1256,6 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False,
if dim is not None and axis is not None:
raise ValueError("cannot supply both 'axis' and 'dim' arguments")

if getattr(func, 'keep_dims', False):
if dim is None and axis is None:
raise ValueError("must supply either single 'dim' or 'axis' "
"argument to %s" % (func.__name__))

if dim is not None:
axis = self.get_axis_num(dim)
data = func(self.data if allow_lazy else self.values,
Expand Down
5 changes: 5 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1767,6 +1767,11 @@ def test_cumops(self):
orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords,
dims=['x', 'y'])

actual = orig.cumsum()
expected = DataArray([[-1, -1, 0], [-4, -4, 0]], coords,
dims=['x', 'y'])
assert_identical(expected, actual)

actual = orig.cumsum('x')
expected = DataArray([[-1, 0, 1], [-4, 0, 4]], coords,
dims=['x', 'y'])
Expand Down
34 changes: 27 additions & 7 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3331,7 +3331,18 @@ def test_reduce(self):

assert_equal(data.mean(dim=[]), data)

# uint support
def test_reduce_coords(self):
    # regression test for GH1470: reductions must not drop coordinates
    # that are unrelated to the reduced dimension
    data = xr.Dataset({'a': ('x', [1, 2, 3])}, coords={'b': 4})
    expected = xr.Dataset({'a': 2}, coords={'b': 4})
    actual = data.mean('x')
    assert_identical(actual, expected)

    # should be consistent: reducing the DataArray and converting back
    # to a Dataset must give the same result as reducing the Dataset
    actual = data['a'].mean('x').to_dataset()
    assert_identical(actual, expected)

def test_mean_uint_dtype(self):
data = xr.Dataset({'a': (('x', 'y'),
np.arange(6).reshape(3, 2).astype('uint')),
'b': (('x', ), np.array([0.1, 0.2, np.nan]))})
Expand All @@ -3345,15 +3356,20 @@ def test_reduce_bad_dim(self):
with raises_regex(ValueError, 'Dataset does not contain'):
data.mean(dim='bad_dim')

def test_reduce_cumsum(self):
    # variables lacking the reduced dimension ('a', 'b' for dim 'y')
    # must pass through unchanged, apart from NaN filling
    data = xr.Dataset({'a': 1,
                       'b': ('x', [1, 2]),
                       'c': (('x', 'y'), [[np.nan, 3], [0, 4]])})
    assert_identical(data.fillna(0), data.cumsum('y'))

    # cumsum() with no dim accumulates over all dimensions of each variable
    expected = xr.Dataset({'a': 1,
                           'b': ('x', [1, 3]),
                           'c': (('x', 'y'), [[0, 3], [0, 7]])})
    assert_identical(expected, data.cumsum())

def test_reduce_cumsum_test_dims(self):
data = create_test_data()
for cumfunc in ['cumsum', 'cumprod']:
with raises_regex(ValueError,
"must supply either single 'dim' or 'axis'"):
getattr(data, cumfunc)()
with raises_regex(ValueError,
"must supply either single 'dim' or 'axis'"):
getattr(data, cumfunc)(dim=['dim1', 'dim2'])
with raises_regex(ValueError, 'Dataset does not contain'):
getattr(data, cumfunc)(dim='bad_dim')

Expand Down Expand Up @@ -3460,6 +3476,10 @@ def test_reduce_scalars(self):
actual = ds.var()
assert_identical(expected, actual)

expected = Dataset({'x': 0, 'y': 0, 'z': ('b', [0])})
actual = ds.var('a')
assert_identical(expected, actual)

def test_reduce_only_one_axis(self):

def mean_only_one_axis(x, axis):
Expand Down
48 changes: 48 additions & 0 deletions xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import warnings

from xarray import DataArray, concat
from xarray.core import duck_array_ops
from xarray.core.duck_array_ops import (
array_notnull_equiv, concatenate, count, first, last, mean, rolling_window,
stack, where)
Expand Down Expand Up @@ -103,6 +104,53 @@ def test_all_nan_arrays(self):
assert np.isnan(mean([np.nan, np.nan]))


def test_cumsum_1d():
    """cumsum on a 1d array: the default, axis=0, axis=-1 and axis=(0,)
    all accumulate; an empty axis tuple is a no-op."""
    inputs = np.array([0, 1, 2, 3])
    expected = np.array([0, 1, 3, 6])
    for axis in (None, 0, -1, (0,)):
        actual = duck_array_ops.cumsum(inputs, axis=axis)
        assert_array_equal(expected, actual)
    assert_array_equal(inputs, duck_array_ops.cumsum(inputs, axis=()))


def test_cumsum_2d():
    """Full (all-axis) cumsum of a 2d array, by default and with an
    explicit axis tuple; an empty axis tuple leaves the input unchanged."""
    inputs = np.array([[1, 2], [3, 4]])
    full = np.array([[1, 3], [4, 10]])
    for axis, expected in [(None, full), ((0, 1), full), ((), inputs)]:
        actual = duck_array_ops.cumsum(inputs, axis=axis)
        assert_array_equal(expected, actual)


def test_cumprod_2d():
    """Full (all-axis) cumprod of a 2d array, by default and with an
    explicit axis tuple; an empty axis tuple leaves the input unchanged."""
    inputs = np.array([[1, 2], [3, 4]])
    full = np.array([[1, 2], [3, 2 * 3 * 4]])
    for axis, expected in [(None, full), ((0, 1), full), ((), inputs)]:
        actual = duck_array_ops.cumprod(inputs, axis=axis)
        assert_array_equal(expected, actual)


class TestArrayNotNullEquiv():
@pytest.mark.parametrize("arr1, arr2", [
(np.array([1, 2, 3]), np.array([1, 2, 3])),
Expand Down

0 comments on commit c346d3b

Please sign in to comment.