Skip to content

Commit

Permalink
Enable pandas-style rounding of cftime.datetime objects (pydata#3792)
Browse files Browse the repository at this point in the history
* Initial progress on implementing cftime floor/ceil/round

* Improve tests and docstrings

* Add tests of rounding cftime datetimes via dt accessor

* Add documentation

* docstring edits

* Test rounding raises error with non-fixed frequency

* black

* typo

* A couple cleanup items:
  - Fix floating point issue in asi8 and add tests
  - Ensure dask only computes once when using the rounding accessors

* black
  • Loading branch information
spencerkclark authored Mar 2, 2020
1 parent 016a77d commit 45d88fc
Show file tree
Hide file tree
Showing 6 changed files with 359 additions and 9 deletions.
8 changes: 8 additions & 0 deletions doc/weather-climate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports:
da.time.dt.dayofyear
da.time.dt.dayofweek
- Rounding of datetimes to fixed frequencies via the ``dt`` accessor:

.. ipython:: python

    da.time.dt.ceil('3D')
    da.time.dt.floor('5D')
    da.time.dt.round('2D')

- Group-by operations based on datetime accessor attributes (e.g. by month of
the year):

Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ Breaking changes
New Features
~~~~~~~~~~~~

- Added support for :py:class:`pandas.DatetimeIndex`-style rounding of
  ``cftime.datetime`` objects directly via a :py:class:`CFTimeIndex` or via the
  :py:class:`~core.accessor_dt.DatetimeAccessor`.
  By `Spencer Clark <https://github.com/spencerkclark>`_.
- Support new h5netcdf backend keyword ``phony_dims`` (available from h5netcdf
  v0.8.0) for :py:class:`~xarray.backends.H5NetCDFStore`.
  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
Expand Down
135 changes: 135 additions & 0 deletions xarray/coding/cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,83 @@ def strftime(self, date_format):
"""
return pd.Index([date.strftime(date_format) for date in self._data])

@property
def asi8(self):
    """Convert to integers with units of microseconds since 1970-01-01.

    Returns
    -------
    np.ndarray
        One ``np.int64`` entry per element of ``self.values``, holding the
        difference between ``self.date_type(1970, 1, 1)`` and that date (as
        computed by ``exact_cftime_datetime_difference``) in microseconds.
    """
    from ..core.resample_cftime import exact_cftime_datetime_difference

    epoch = self.date_type(1970, 1, 1)
    return np.array(
        [
            _total_microseconds(exact_cftime_datetime_difference(epoch, date))
            for date in self.values
        ],
        # Pin the dtype so the result is an integer array as documented,
        # including for an empty index, where NumPy would infer float64.
        dtype=np.int64,
    )

def _round_via_method(self, freq, method):
    """Apply an integer rounding function to every date in the index.

    Parameters
    ----------
    freq : str or CFTimeOffset
        Frequency to round to. It is converted with ``to_offset`` and must
        produce an instance of one of the ``CFTIME_TICKS`` offset types.
    method : callable
        Called as ``method(values, unit)`` with the ``asi8`` values of the
        index and the frequency expressed in microseconds.

    Returns
    -------
    CFTimeIndex
        Index rebuilt from the rounded values with ``_cftimeindex_from_i8``,
        keeping this index's date type and name.

    Raises
    ------
    ValueError
        If the converted offset is not an instance of ``CFTIME_TICKS``.
    """
    from .cftime_offsets import CFTIME_TICKS, to_offset

    offset = to_offset(freq)
    if isinstance(offset, CFTIME_TICKS):
        step = _total_microseconds(offset.as_timedelta())
        rounded = method(self.asi8, step)
        return _cftimeindex_from_i8(rounded, self.date_type, self.name)

    # Only tick-like offsets have a constant length to round against.
    raise ValueError(f"{offset} is a non-fixed frequency")

def floor(self, freq):
    """Round every date in the index down to a fixed frequency.

    Parameters
    ----------
    freq : str or CFTimeOffset
        Frequency to round the index to. Only fixed frequencies such as
        'S' (second) are accepted; 'ME' (month end) and the like are not.
        Possible values are listed under `frequency aliases
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.

    Returns
    -------
    CFTimeIndex
    """
    rounding_function = _floor_int
    return self._round_via_method(freq, rounding_function)

def ceil(self, freq):
    """Round every date in the index up to a fixed frequency.

    Parameters
    ----------
    freq : str or CFTimeOffset
        Frequency to round the index to. Only fixed frequencies such as
        'S' (second) are accepted; 'ME' (month end) and the like are not.
        Possible values are listed under `frequency aliases
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.

    Returns
    -------
    CFTimeIndex
    """
    rounding_function = _ceil_int
    return self._round_via_method(freq, rounding_function)

def round(self, freq):
    """Round every date in the index to the nearest fixed frequency.

    The rounding is delegated to ``_round_to_nearest_half_even``.

    Parameters
    ----------
    freq : str or CFTimeOffset
        Frequency to round the index to. Only fixed frequencies such as
        'S' (second) are accepted; 'ME' (month end) and the like are not.
        Possible values are listed under `frequency aliases
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.

    Returns
    -------
    CFTimeIndex
    """
    rounding_function = _round_to_nearest_half_even
    return self._round_via_method(freq, rounding_function)


def _parse_iso8601_without_reso(date_type, datetime_str):
date, _ = _parse_iso8601_with_reso(date_type, datetime_str)
Expand All @@ -554,3 +631,61 @@ def _parse_array_of_cftime_strings(strings, date_type):
return np.array(
[_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()]
).reshape(strings.shape)


def _cftimeindex_from_i8(values, date_type, name):
    """Build a CFTimeIndex out of integer microsecond offsets.

    Parameters
    ----------
    values : np.array
        Integers representing microseconds since 1970-01-01.
    date_type : cftime.datetime
        Date class of the index; the epoch is ``date_type(1970, 1, 1)``.
    name : str
        Name given to the resulting index.

    Returns
    -------
    CFTimeIndex
    """
    epoch = date_type(1970, 1, 1)
    # Each value is passed through int() before it reaches timedelta.
    offsets = (timedelta(microseconds=int(value)) for value in values)
    dates = np.array([epoch + offset for offset in offsets])
    return CFTimeIndex(dates, name=name)


def _total_microseconds(delta):
"""Compute the total number of microseconds of a datetime.timedelta.
Parameters
----------
delta : datetime.timedelta
Input timedelta.
Returns
-------
int
"""
return delta / timedelta(microseconds=1)


def _floor_int(values, unit):
"""Copied from pandas."""
return values - np.remainder(values, unit)


def _ceil_int(values, unit):
"""Copied from pandas."""
return values + np.remainder(-values, unit)


def _round_to_nearest_half_even(values, unit):
"""Copied from pandas."""
if unit % 2:
return _ceil_int(values - unit // 2, unit)
quotient, remainder = np.divmod(values, unit)
mask = np.logical_or(
remainder > (unit // 2), np.logical_and(remainder == (unit // 2), quotient % 2)
)
quotient[mask] += 1
return quotient * unit
28 changes: 19 additions & 9 deletions xarray/core/accessor_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,20 +78,27 @@ def _get_date_field(values, name, dtype):
return access_method(values, name)


def _round_through_series_or_index(values, name, freq):
    """Coerce an array of datetime-like values to a pandas Series or xarray
    CFTimeIndex and apply requested rounding

    Parameters
    ----------
    values : array-like
        Datetime-like values. Arrays whose dtype satisfies
        ``is_np_datetime_like`` are rounded through ``pandas.Series.dt``;
        all others are wrapped in a ``CFTimeIndex``.
    name : str
        Name of the rounding method to look up on the wrapper.
    freq : str
        Frequency forwarded to the rounding method as ``freq=freq``.

    Returns
    -------
    Array of rounded values reshaped to ``values.shape``.
    """
    from ..coding.cftimeindex import CFTimeIndex

    # Both wrappers are built from the flattened values; the original shape
    # is restored on return.
    if is_np_datetime_like(values.dtype):
        values_as_series = pd.Series(values.ravel())
        method = getattr(values_as_series.dt, name)
    else:
        values_as_cftimeindex = CFTimeIndex(values.ravel())
        method = getattr(values_as_cftimeindex, name)

    field_values = method(freq=freq).values

    return field_values.reshape(values.shape)


def _round_field(values, name, freq):
"""Indirectly access pandas rounding functions by wrapping data
as a Series and calling through `.dt` attribute.
"""Indirectly access rounding functions by wrapping data
as a Series or CFTimeIndex
Parameters
----------
Expand All @@ -110,9 +117,12 @@ def _round_field(values, name, freq):
if isinstance(values, dask_array_type):
from dask.array import map_blocks

return map_blocks(_round_series, values, name, freq=freq, dtype=np.datetime64)
dtype = np.datetime64 if is_np_datetime_like(values.dtype) else np.dtype("O")
return map_blocks(
_round_through_series_or_index, values, name, freq=freq, dtype=dtype
)
else:
return _round_series(values, name, freq)
return _round_through_series_or_index(values, name, freq)


def _strftime_through_cftimeindex(values, date_format):
Expand Down
104 changes: 104 additions & 0 deletions xarray/tests/test_accessor_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from . import (
assert_array_equal,
assert_equal,
assert_identical,
raises_regex,
requires_cftime,
requires_dask,
Expand Down Expand Up @@ -435,3 +436,106 @@ def test_seasons(cftime_date_type):
seasons = xr.DataArray(seasons)

assert_array_equal(seasons.values, dates.dt.season.values)


@pytest.fixture
def cftime_rounding_dataarray(cftime_date_type):
    """Provide the 2x2 DataArray of dates used as input by the rounding tests."""
    first_row = [cftime_date_type(1, 1, 1, 1), cftime_date_type(1, 1, 1, 15)]
    second_row = [cftime_date_type(1, 1, 1, 23), cftime_date_type(1, 1, 2, 1)]
    return xr.DataArray([first_row, second_row])


@requires_cftime
@requires_dask
@pytest.mark.parametrize("use_dask", [False, True])
def test_cftime_floor_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask):
    """Check ``dt.floor("D")`` on cftime data, with and without dask."""
    import dask.array as da

    freq = "D"
    date = cftime_date_type
    expected = xr.DataArray(
        [
            [date(1, 1, 1, 0), date(1, 1, 1, 0)],
            [date(1, 1, 1, 0), date(1, 1, 2, 0)],
        ],
        name="floor",
    )

    if not use_dask:
        result = cftime_rounding_dataarray.dt.floor(freq)
    else:
        chunks = {"dim_0": 1}
        # One compute is tolerated: for object-dtype arrays the dt accessor
        # currently inspects a single value to confirm it is a
        # cftime.datetime (and raises an error if it is not).
        with raise_if_dask_computes(max_computes=1):
            result = cftime_rounding_dataarray.chunk(chunks).dt.floor(freq)
        expected = expected.chunk(chunks)
        assert isinstance(result.data, da.Array)
        assert result.chunks == expected.chunks

    assert_identical(result, expected)


@requires_cftime
@requires_dask
@pytest.mark.parametrize("use_dask", [False, True])
def test_cftime_ceil_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask):
    """Check ``dt.ceil("D")`` on cftime data, with and without dask."""
    import dask.array as da

    freq = "D"
    date = cftime_date_type
    expected = xr.DataArray(
        [
            [date(1, 1, 2, 0), date(1, 1, 2, 0)],
            [date(1, 1, 2, 0), date(1, 1, 3, 0)],
        ],
        name="ceil",
    )

    if not use_dask:
        result = cftime_rounding_dataarray.dt.ceil(freq)
    else:
        chunks = {"dim_0": 1}
        # One compute is tolerated: for object-dtype arrays the dt accessor
        # currently inspects a single value to confirm it is a
        # cftime.datetime (and raises an error if it is not).
        with raise_if_dask_computes(max_computes=1):
            result = cftime_rounding_dataarray.chunk(chunks).dt.ceil(freq)
        expected = expected.chunk(chunks)
        assert isinstance(result.data, da.Array)
        assert result.chunks == expected.chunks

    assert_identical(result, expected)


@requires_cftime
@requires_dask
@pytest.mark.parametrize("use_dask", [False, True])
def test_cftime_round_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask):
    """Check ``dt.round("D")`` on cftime data, with and without dask."""
    import dask.array as da

    freq = "D"
    date = cftime_date_type
    expected = xr.DataArray(
        [
            [date(1, 1, 1, 0), date(1, 1, 2, 0)],
            [date(1, 1, 2, 0), date(1, 1, 2, 0)],
        ],
        name="round",
    )

    if not use_dask:
        result = cftime_rounding_dataarray.dt.round(freq)
    else:
        chunks = {"dim_0": 1}
        # One compute is tolerated: for object-dtype arrays the dt accessor
        # currently inspects a single value to confirm it is a
        # cftime.datetime (and raises an error if it is not).
        with raise_if_dask_computes(max_computes=1):
            result = cftime_rounding_dataarray.chunk(chunks).dt.round(freq)
        expected = expected.chunk(chunks)
        assert isinstance(result.data, da.Array)
        assert result.chunks == expected.chunks

    assert_identical(result, expected)
Loading

0 comments on commit 45d88fc

Please sign in to comment.