Skip to content

Commit

Permalink
Respect user-specified coordinates attribute. (pydata#3487)
Browse files Browse the repository at this point in the history
* Respect user-specified coordinates attribute.

* Add whats-new

* Better if statement.

* maybe it's better to not raise an error if "coordinates" in attrs.

* tweak whats-new.

* more thorough test.

* Emit one "coordinates" warning per dataset, instead of one per variable.

Also add stacklevel

* Preserve attrs["coordinates"] when roundtripping with decode_coords=False

* Avoid raising warnings.

* fix whats-new

* [minor] add comments

* fix whats-new

* Actually test global "coordinates" handling.

* filerwarning not necessary.

* Add comment

* fix whats-new
  • Loading branch information
dcherian authored Dec 10, 2019
1 parent bcf0d61 commit 2ee89c3
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 17 deletions.
16 changes: 15 additions & 1 deletion doc/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -437,9 +437,23 @@ like ``'days'`` for ``timedelta64`` data. ``calendar`` should be one of the cale
supported by netCDF4-python: 'standard', 'gregorian', 'proleptic_gregorian' 'noleap',
'365_day', '360_day', 'julian', 'all_leap', '366_day'.

By default, xarray uses the 'proleptic_gregorian' calendar and units of the smallest time
By default, xarray uses the ``'proleptic_gregorian'`` calendar and units of the smallest time
difference between values, with a reference time of the first time value.


.. _io.coordinates:

Coordinates
...........

You can control the ``coordinates`` attribute written to disk by specifying ``DataArray.encoding["coordinates"]``.
If not specified, xarray automatically sets ``DataArray.encoding["coordinates"]`` to a space-delimited list
of names of coordinate variables that share dimensions with the ``DataArray`` being written.
This allows perfect roundtripping of xarray datasets but may not be desirable.
When an xarray ``Dataset`` contains non-dimensional coordinates that do not share dimensions with any of
the variables, these coordinate variable names are saved under a "global" ``"coordinates"`` attribute.
This is not CF-compliant but again facilitates roundtripping of xarray datasets.

Invalid netCDF files
~~~~~~~~~~~~~~~~~~~~

Expand Down
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ New Features
invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`)
By `Deepak Cherian <https://github.com/dcherian>`_ and
`Guido Imperiale <https://github.com/crusaderky>`_.
- xarray now respects the ``DataArray.encoding["coordinates"]`` attribute when writing to disk.
See :ref:`io.coordinates` for more. (:issue:`3351`, :pull:`3487`)
By `Deepak Cherian <https://github.com/dcherian>`_.
- Add the documented-but-missing :py:meth:`DatasetGroupBy.quantile`.
(:issue:`3525`, :pull:`3527`). By `Justus Magin <https://github.com/keewis>`_.

Expand Down
40 changes: 26 additions & 14 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd

from .coding import strings, times, variables
from .coding.variables import SerializationWarning
from .coding.variables import SerializationWarning, pop_to
from .core import duck_array_ops, indexing
from .core.common import contains_cftime_datetimes
from .core.pycompat import dask_array_type
Expand Down Expand Up @@ -660,34 +660,46 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
and set(target_dims) <= set(v.dims)
):
variable_coordinates[k].add(coord_name)
global_coordinates.discard(coord_name)

variables = {k: v.copy(deep=False) for k, v in variables.items()}

# These coordinates are saved according to CF conventions
for var_name, coord_names in variable_coordinates.items():
attrs = variables[var_name].attrs
if "coordinates" in attrs:
# keep track of variable names written to file under the "coordinates" attributes
written_coords = set()
for name, var in variables.items():
encoding = var.encoding
attrs = var.attrs
if "coordinates" in attrs and "coordinates" in encoding:
raise ValueError(
"cannot serialize coordinates because variable "
"%s already has an attribute 'coordinates'" % var_name
f"'coordinates' found in both attrs and encoding for variable {name!r}."
)
attrs["coordinates"] = " ".join(map(str, coord_names))

# this will copy coordinates from encoding to attrs if "coordinates" in attrs
# after the next line, "coordinates" is never in encoding
# we get support for attrs["coordinates"] for free.
coords_str = pop_to(encoding, attrs, "coordinates")
if not coords_str and variable_coordinates[name]:
attrs["coordinates"] = " ".join(map(str, variable_coordinates[name]))
if "coordinates" in attrs:
written_coords.update(attrs["coordinates"].split())

# These coordinates are not associated with any particular variables, so we
# save them under a global 'coordinates' attribute so xarray can roundtrip
# the dataset faithfully. Because this serialization goes beyond CF
# conventions, only do it if necessary.
# Reference discussion:
# http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/057771.html
# http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/007571.html
global_coordinates.difference_update(written_coords)
if global_coordinates:
attributes = dict(attributes)
if "coordinates" in attributes:
raise ValueError(
"cannot serialize coordinates because the global "
"attribute 'coordinates' already exists"
warnings.warn(
f"cannot serialize global coordinates {global_coordinates!r} because the global "
f"attribute 'coordinates' already exists. This may prevent faithful roundtripping"
f"of xarray datasets",
SerializationWarning,
)
attributes["coordinates"] = " ".join(map(str, global_coordinates))
else:
attributes["coordinates"] = " ".join(map(str, global_coordinates))

return variables, attributes

Expand Down
37 changes: 35 additions & 2 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from xarray.backends.netCDF4_ import _extract_nc4_variable_encoding
from xarray.backends.pydap_ import PydapDataStore
from xarray.coding.variables import SerializationWarning
from xarray.conventions import encode_dataset_coordinates
from xarray.core import indexing
from xarray.core.options import set_options
from xarray.core.pycompat import dask_array_type
Expand Down Expand Up @@ -522,15 +523,35 @@ def test_roundtrip_coordinates(self):
with self.roundtrip(original) as actual:
assert_identical(original, actual)

original["foo"].encoding["coordinates"] = "y"
with self.roundtrip(original, open_kwargs={"decode_coords": False}) as expected:
# check roundtripping when decode_coords=False
with self.roundtrip(
expected, open_kwargs={"decode_coords": False}
) as actual:
assert_identical(expected, actual)

def test_roundtrip_global_coordinates(self):
original = Dataset({"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])})
original = Dataset(
{"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])}
)
with self.roundtrip(original) as actual:
assert_identical(original, actual)

# test that global "coordinates" is as expected
_, attrs = encode_dataset_coordinates(original)
assert attrs["coordinates"] == "y"

# test warning when global "coordinates" is already set
original.attrs["coordinates"] = "foo"
with pytest.warns(SerializationWarning):
_, attrs = encode_dataset_coordinates(original)
assert attrs["coordinates"] == "foo"

def test_roundtrip_coordinates_with_space(self):
original = Dataset(coords={"x": 0, "y z": 1})
expected = Dataset({"y z": 1}, {"x": 0})
with pytest.warns(xr.SerializationWarning):
with pytest.warns(SerializationWarning):
with self.roundtrip(original) as actual:
assert_identical(expected, actual)

Expand Down Expand Up @@ -810,6 +831,18 @@ def equals_latlon(obj):
assert "coordinates" not in ds["lat"].attrs
assert "coordinates" not in ds["lon"].attrs

original["temp"].encoding["coordinates"] = "lat"
with self.roundtrip(original) as actual:
assert_identical(actual, original)
original["precip"].encoding["coordinates"] = "lat"
with create_tmp_file() as tmp_file:
original.to_netcdf(tmp_file)
with open_dataset(tmp_file, decode_coords=True) as ds:
assert "lon" not in ds["temp"].encoding["coordinates"]
assert "lon" not in ds["precip"].encoding["coordinates"]
assert "coordinates" not in ds["lat"].encoding
assert "coordinates" not in ds["lon"].encoding

def test_roundtrip_endian(self):
ds = Dataset(
{
Expand Down
14 changes: 14 additions & 0 deletions xarray/tests/test_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,20 @@ def test_multidimensional_coordinates(self):
# Should not have any global coordinates.
assert "coordinates" not in attrs

def test_do_not_overwrite_user_coordinates(self):
orig = Dataset(
coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7]), "z": ("x", [8, 9, 10])},
data_vars={"a": ("x", [1, 2, 3]), "b": ("x", [3, 5, 6])},
)
orig["a"].encoding["coordinates"] = "y"
orig["b"].encoding["coordinates"] = "z"
enc, _ = conventions.encode_dataset_coordinates(orig)
assert enc["a"].attrs["coordinates"] == "y"
assert enc["b"].attrs["coordinates"] == "z"
orig["a"].attrs["coordinates"] = "foo"
with raises_regex(ValueError, "'coordinates' found in both attrs"):
conventions.encode_dataset_coordinates(orig)

@requires_dask
def test_string_object_warning(self):
original = Variable(("x",), np.array(["foo", "bar"], dtype=object)).chunk()
Expand Down

0 comments on commit 2ee89c3

Please sign in to comment.