Skip to content

Commit

Permalink
Bug fixes for Dataset.reduce() and n-dimensional cumsum/cumprod (pyda…
Browse files Browse the repository at this point in the history
…ta#2156)

* Bug fixes for Dataset.reduce() and n-dimensional cumsum/cumprod

Fixes GH1470, "Dataset.mean drops coordinates"

Fixes a bug where non-scalar data-variables that did not include the
aggregated dimension were not properly reduced:

    Previously::

        >>> ds = Dataset({'x': ('a', [2, 2]), 'y': 2, 'z': ('b', [2])})
        >>> ds.var('a')
        <xarray.Dataset>
        Dimensions:  (b: 1)
        Dimensions without coordinates: b
        Data variables:
            x        float64 0.0
            y        float64 0.0
            z        (b) int64 2

    Now::

        >>> ds.var('a')
        <xarray.Dataset>
        Dimensions:  (b: 1)
        Dimensions without coordinates: b
        Data variables:
            x        int64 0
            y        int64 0
            z        (b) int64 0

Finally, adds support for n-dimensional cumsum() and cumprod(), reducing
over all dimensions of an array. (This was necessary as part of the above fix.)

* Lint fixup

* remove confusing comments
  • Loading branch information
shoyer authored May 18, 2018
1 parent ecb10e3 commit c346d3b
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 38 deletions.
12 changes: 12 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,24 @@ Documentation
Enhancements
~~~~~~~~~~~~

- :py:meth:`~DataArray.cumsum` and :py:meth:`~DataArray.cumprod` now support
aggregation over multiple dimensions at the same time. This is the default
behavior when dimensions are not specified (previously this raised an error).
  By `Stephan Hoyer <https://github.com/shoyer>`_.

Bug fixes
~~~~~~~~~

- Fixed a bug where ``to_netcdf(..., unlimited_dims='bar')`` yielded NetCDF files
  with spurious 0-length dimensions (i.e. ``b``, ``a``, and ``r``) (:issue:`2134`).
By `Joe Hamman <https://github.com/jhamman>`_.

- Aggregations with :py:meth:`Dataset.reduce` (including ``mean``, ``sum``,
etc) no longer drop unrelated coordinates (:issue:`1470`). Also fixed a
bug where non-scalar data-variables that did not include the aggregation
dimension were improperly skipped.
  By `Stephan Hoyer <https://github.com/shoyer>`_.

.. _whats-new.0.10.4:

v0.10.4 (May 16, 2018)
Expand Down
38 changes: 19 additions & 19 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2594,26 +2594,26 @@ def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
variables = OrderedDict()
for name, var in iteritems(self._variables):
reduce_dims = [dim for dim in var.dims if dim in dims]
if reduce_dims or not var.dims:
if name not in self.coords:
if (not numeric_only or
np.issubdtype(var.dtype, np.number) or
(var.dtype == np.bool_)):
if len(reduce_dims) == 1:
# unpack dimensions for the benefit of functions
# like np.argmin which can't handle tuple arguments
reduce_dims, = reduce_dims
elif len(reduce_dims) == var.ndim:
# prefer to aggregate over axis=None rather than
# axis=(0, 1) if they will be equivalent, because
# the former is often more efficient
reduce_dims = None
variables[name] = var.reduce(func, dim=reduce_dims,
keep_attrs=keep_attrs,
allow_lazy=allow_lazy,
**kwargs)
if name in self.coords:
if not reduce_dims:
variables[name] = var
else:
variables[name] = var
if (not numeric_only or
np.issubdtype(var.dtype, np.number) or
(var.dtype == np.bool_)):
if len(reduce_dims) == 1:
# unpack dimensions for the benefit of functions
# like np.argmin which can't handle tuple arguments
reduce_dims, = reduce_dims
elif len(reduce_dims) == var.ndim:
# prefer to aggregate over axis=None rather than
# axis=(0, 1) if they will be equivalent, because
# the former is often more efficient
reduce_dims = None
variables[name] = var.reduce(func, dim=reduce_dims,
keep_attrs=keep_attrs,
allow_lazy=allow_lazy,
**kwargs)

coord_names = set(k for k in self.coords if k in variables)
attrs = self.attrs if keep_attrs else None
Expand Down
36 changes: 29 additions & 7 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,7 @@ def _nanvar_object(value, axis=None, **kwargs):


def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
no_bottleneck=False, coerce_strings=False,
keep_dims=False):
no_bottleneck=False, coerce_strings=False):
def f(values, axis=None, skipna=None, **kwargs):
if kwargs.pop('out', None) is not None:
raise TypeError('`out` is not valid for {}'.format(name))
Expand Down Expand Up @@ -343,7 +342,6 @@ def f(values, axis=None, skipna=None, **kwargs):
'or newer to use skipna=True or skipna=None' % name)
raise NotImplementedError(msg)
f.numeric_only = numeric_only
f.keep_dims = keep_dims
f.__name__ = name
return f

Expand All @@ -358,10 +356,34 @@ def f(values, axis=None, skipna=None, **kwargs):
var = _create_nan_agg_method('var', numeric_only=True)
median = _create_nan_agg_method('median', numeric_only=True)
prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True)
cumprod = _create_nan_agg_method('cumprod', numeric_only=True, np_compat=True,
no_bottleneck=True, keep_dims=True)
cumsum = _create_nan_agg_method('cumsum', numeric_only=True, np_compat=True,
no_bottleneck=True, keep_dims=True)
cumprod_1d = _create_nan_agg_method(
'cumprod', numeric_only=True, np_compat=True, no_bottleneck=True)
cumsum_1d = _create_nan_agg_method(
'cumsum', numeric_only=True, np_compat=True, no_bottleneck=True)


def _nd_cum_func(cum_func, array, axis, **kwargs):
array = asarray(array)
if axis is None:
axis = tuple(range(array.ndim))
if isinstance(axis, int):
axis = (axis,)

out = array
for ax in axis:
out = cum_func(out, axis=ax, **kwargs)
return out


def cumprod(array, axis=None, **kwargs):
    """N-dimensional version of cumprod.

    Applies the 1d ``cumprod`` along each axis in ``axis`` in turn;
    ``axis=None`` accumulates over every dimension of ``array``.
    """
    return _nd_cum_func(cumprod_1d, array, axis, **kwargs)


def cumsum(array, axis=None, **kwargs):
    """N-dimensional version of cumsum.

    Applies the 1d ``cumsum`` along each axis in ``axis`` in turn;
    ``axis=None`` accumulates over every dimension of ``array``.
    """
    return _nd_cum_func(cumsum_1d, array, axis, **kwargs)


_fail_on_dask_array_input_skipna = partial(
fail_on_dask_array_input,
Expand Down
5 changes: 0 additions & 5 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1256,11 +1256,6 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False,
if dim is not None and axis is not None:
raise ValueError("cannot supply both 'axis' and 'dim' arguments")

if getattr(func, 'keep_dims', False):
if dim is None and axis is None:
raise ValueError("must supply either single 'dim' or 'axis' "
"argument to %s" % (func.__name__))

if dim is not None:
axis = self.get_axis_num(dim)
data = func(self.data if allow_lazy else self.values,
Expand Down
5 changes: 5 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1767,6 +1767,11 @@ def test_cumops(self):
orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords,
dims=['x', 'y'])

actual = orig.cumsum()
expected = DataArray([[-1, -1, 0], [-4, -4, 0]], coords,
dims=['x', 'y'])
assert_identical(expected, actual)

actual = orig.cumsum('x')
expected = DataArray([[-1, 0, 1], [-4, 0, 4]], coords,
dims=['x', 'y'])
Expand Down
34 changes: 27 additions & 7 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3331,7 +3331,18 @@ def test_reduce(self):

assert_equal(data.mean(dim=[]), data)

# uint support
def test_reduce_coords(self):
    # regression test for GH1470: reductions must not drop coordinates
    # that are unrelated to the reduced dimension
    data = xr.Dataset({'a': ('x', [1, 2, 3])}, coords={'b': 4})
    expected = xr.Dataset({'a': 2}, coords={'b': 4})
    actual = data.mean('x')
    assert_identical(actual, expected)

    # should be consistent: reducing the DataArray and converting back
    # to a Dataset must give the same result as reducing the Dataset
    actual = data['a'].mean('x').to_dataset()
    assert_identical(actual, expected)

def test_mean_uint_dtype(self):
data = xr.Dataset({'a': (('x', 'y'),
np.arange(6).reshape(3, 2).astype('uint')),
'b': (('x', ), np.array([0.1, 0.2, np.nan]))})
Expand All @@ -3345,15 +3356,20 @@ def test_reduce_bad_dim(self):
with raises_regex(ValueError, 'Dataset does not contain'):
data.mean(dim='bad_dim')

def test_reduce_cumsum(self):
    # variables lacking the reduced dimension ('a', 'b' for dim 'y')
    # must pass through unchanged, apart from NaN filling
    data = xr.Dataset({'a': 1,
                       'b': ('x', [1, 2]),
                       'c': (('x', 'y'), [[np.nan, 3], [0, 4]])})
    assert_identical(data.fillna(0), data.cumsum('y'))

    # cumsum() with no dim accumulates over all dimensions of each variable
    expected = xr.Dataset({'a': 1,
                           'b': ('x', [1, 3]),
                           'c': (('x', 'y'), [[0, 3], [0, 7]])})
    assert_identical(expected, data.cumsum())

def test_reduce_cumsum_test_dims(self):
data = create_test_data()
for cumfunc in ['cumsum', 'cumprod']:
with raises_regex(ValueError,
"must supply either single 'dim' or 'axis'"):
getattr(data, cumfunc)()
with raises_regex(ValueError,
"must supply either single 'dim' or 'axis'"):
getattr(data, cumfunc)(dim=['dim1', 'dim2'])
with raises_regex(ValueError, 'Dataset does not contain'):
getattr(data, cumfunc)(dim='bad_dim')

Expand Down Expand Up @@ -3460,6 +3476,10 @@ def test_reduce_scalars(self):
actual = ds.var()
assert_identical(expected, actual)

expected = Dataset({'x': 0, 'y': 0, 'z': ('b', [0])})
actual = ds.var('a')
assert_identical(expected, actual)

def test_reduce_only_one_axis(self):

def mean_only_one_axis(x, axis):
Expand Down
48 changes: 48 additions & 0 deletions xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import warnings

from xarray import DataArray, concat
from xarray.core import duck_array_ops
from xarray.core.duck_array_ops import (
array_notnull_equiv, concatenate, count, first, last, mean, rolling_window,
stack, where)
Expand Down Expand Up @@ -103,6 +104,53 @@ def test_all_nan_arrays(self):
assert np.isnan(mean([np.nan, np.nan]))


def test_cumsum_1d():
    """cumsum on a 1d array: the default, axis=0, axis=-1 and axis=(0,)
    all accumulate; an empty axis tuple is a no-op."""
    inputs = np.array([0, 1, 2, 3])
    expected = np.array([0, 1, 3, 6])
    for axis in (None, 0, -1, (0,)):
        actual = duck_array_ops.cumsum(inputs, axis=axis)
        assert_array_equal(expected, actual)
    assert_array_equal(inputs, duck_array_ops.cumsum(inputs, axis=()))


def test_cumsum_2d():
    """Full (all-axis) cumsum of a 2d array, by default and with an
    explicit axis tuple; an empty axis tuple leaves the input unchanged."""
    inputs = np.array([[1, 2], [3, 4]])
    full = np.array([[1, 3], [4, 10]])
    for axis, expected in [(None, full), ((0, 1), full), ((), inputs)]:
        actual = duck_array_ops.cumsum(inputs, axis=axis)
        assert_array_equal(expected, actual)


def test_cumprod_2d():
    """Full (all-axis) cumprod of a 2d array, by default and with an
    explicit axis tuple; an empty axis tuple leaves the input unchanged."""
    inputs = np.array([[1, 2], [3, 4]])
    full = np.array([[1, 2], [3, 2 * 3 * 4]])
    for axis, expected in [(None, full), ((0, 1), full), ((), inputs)]:
        actual = duck_array_ops.cumprod(inputs, axis=axis)
        assert_array_equal(expected, actual)


class TestArrayNotNullEquiv():
@pytest.mark.parametrize("arr1, arr2", [
(np.array([1, 2, 3]), np.array([1, 2, 3])),
Expand Down

0 comments on commit c346d3b

Please sign in to comment.