Skip to content

Commit

Permalink
Improve renaming logic to prevent millions of warnings (#280)
Browse files Browse the repository at this point in the history
* Improve renaming logic to prevent millions of warnings

* Reactivate warnings

* Eliminate warnings about renaming dims

* Don't rename dim coordinates again

* Testing

* Testing

* remove dims after renaming

* another try

* fix

* Remove prints

* Fix warning dataset_id
  • Loading branch information
jbusecke authored Jan 10, 2023
1 parent aba65da commit b0b451d
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 41 deletions.
52 changes: 42 additions & 10 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,19 +472,51 @@ def test_combined_preprocessing_dropped_coords(add_coords, shift):
assert "bnds" not in ds.coords


# NOTE(review): this span looks like a diff hunk with removed and added lines
# interleaved (two consecutive `def` headers, an unclosed `ds = (`) — confirm
# against the real file before treating it as runnable code.
def test_combined_preprocessing_mislabeled_coords():
"""Test if the renaming is applied to datavariables and then if they are moved to the coords."""
def test_rename_mislabeled_coords():
"""Test if the renaming is applied to datavariables"""
# create a 2d dataset
xlen, ylen, zlen = (10, 5, 1)
ds = (
create_test_ds("x", "y", "dummy", xlen, ylen, zlen).squeeze().drop_vars("dummy")
xlen, ylen, zlen = (10, 5, 3)
ds = create_test_ds("x", "y", "z", xlen, ylen, zlen).squeeze()
ds["nav_lon"] = ds.lon # assign longitude as data variable
ds = ds.drop_vars(["lon"])

ds_pp = rename_cmip6(ds)
# the values stored under `nav_lon` must be reachable as `lon` after renaming
np.testing.assert_allclose(ds.nav_lon.data, ds_pp.lon.data)

def test_duplicate_renamed_coordinates():
    """Check that `rename_cmip6` warns when two candidates compete for `lon`.

    The dataset's own `lon` is dropped and two coordinates, `longitude` and
    `nav_lon`, are attached instead. The test expects a warning naming the
    `lon` target, expects `nav_lon` to remain a coordinate, and expects the
    resulting `lon` to carry the values that were assigned to `longitude`.
    """
    xlen, ylen, zlen = (10, 5, 3)
    ds = create_test_ds("x", "y", "lev", xlen, ylen, zlen)
    ds = ds.drop_vars("lon")  # drop the original longitude
    # assign two coordinates which should both be renamed according to the renaming dict
    coord_da_1 = xr.DataArray(np.random.rand(xlen, ylen), dims=["x", "y"])
    coord_da_2 = xr.DataArray(np.random.rand(xlen, ylen), dims=["x", "y"])
    ds = ds.assign_coords(longitude=coord_da_1, nav_lon=coord_da_2)
    # `warnings.warn` without a category emits a UserWarning; naming the class
    # explicitly keeps this working on pytest versions that require it.
    with pytest.warns(
        UserWarning,
        match="While renaming to target `lon`, more than one candidate was found",
    ):
        ds_pp = rename_cmip6(ds)

    assert "nav_lon" in ds_pp.coords
    # strip all coordinates so only the values and dims are compared;
    # `drop_vars` replaces the deprecated `drop` and matches the calls above
    xr.testing.assert_allclose(
        ds_pp.lon.reset_coords(drop=True).drop_vars(["x", "y"]), coord_da_1
    )
ds = ds.assign(depth=5.0)
ds.depth.attrs["units"] = "m" # otherwise pint complains.

ds_pp = combined_preprocessing(ds)
assert "lev" in ds_pp.coords
np.testing.assert_allclose(ds.depth.data, ds_pp.lev.data)

def test_renamed_coordinate_exists():
    """Check that an existing `lon` survives when `longitude` is also present.

    A `longitude` coordinate is attached next to the `lon` that the test
    dataset already has. After `rename_cmip6`, `lon` must be unchanged and
    `longitude` must still be part of the dataset.
    """
    nx, ny, nz = 10, 5, 3
    source = create_test_ds("x", "y", "lev", nx, ny, nz)
    # attach one extra coordinate (presumably a rename candidate for `lon`)
    extra_lon = xr.DataArray(np.random.rand(nx, ny), dims=["x", "y"])
    source = source.assign_coords(longitude=extra_lon)

    renamed = rename_cmip6(source)

    # the pre-existing lon must not have been overwritten
    xr.testing.assert_allclose(renamed.lon, source.lon)
    assert "longitude" in renamed


def test_preserve_attrs():
Expand Down
58 changes: 27 additions & 31 deletions xmip/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pint_xarray # noqa: F401
import xarray as xr

from xmip.utils import _maybe_make_list, cmip6_dataset_id
from xmip.utils import cmip6_dataset_id


# global object for units
Expand Down Expand Up @@ -57,58 +57,54 @@ def cmip6_renaming_dict():
return rename_dict


def _invert_dict(rdict):
    """Map every name listed in the values of `rdict` back to its key.

    Each value is passed through `_maybe_make_list`, and every element of the
    result becomes a key of the returned dict pointing at the key it was
    listed under. If a name is listed under several keys, the key visited
    last wins.
    """
    return {
        candidate: target
        for target, candidates in rdict.items()
        for candidate in _maybe_make_list(candidates)
    }


def rename_cmip6(ds, rename_dict=None):
"""Homogenizes cmip6 datasets to common naming"""
# NOTE(review): this span looks like a diff hunk with removed and added lines
# interleaved (e.g. both `_maybe_rename` and `_maybe_rename_dims` appear) —
# confirm against the real file before relying on statement order.
ds = ds.copy()
# remember the dataset attributes; they are written back just before returning
attrs = {k: v for k, v in ds.attrs.items()}
ds_id = cmip6_dataset_id(ds)

# fall back to the default renaming dict when the caller passes none
if rename_dict is None:
rename_dict = cmip6_renaming_dict()

inverted_rename_dict = _invert_dict(rename_dict)
# TODO: Be even stricter here and reset every variable except the one given in the attr
# as variable_id
# ds_reset = ds.reset_coords()

ds_reset = ds.reset_coords()

def _maybe_rename(obj, rdict):
return obj.rename({kk: vv for kk, vv in rdict.items() if kk in obj.dims})
def _maybe_rename_dims(da, rdict):
# swap every dim listed as a candidate to its target name and drop the
# coordinate variable that still carries the old dim name, if any
for di in da.dims:
for target, candidates in rdict.items():
if di in candidates:
da = da.swap_dims({di: target})
if di in da.coords:
da = da.drop_vars(di)
return da

# first take care of the dims and reconstruct a clean ds
ds = xr.Dataset(
{
k: _maybe_rename(ds_reset[k], inverted_rename_dict)
for k in ds_reset.data_vars
k: _maybe_rename_dims(ds[k], rename_dict)
for k in list(ds.data_vars) + list(set(ds.coords) - set(ds.dims))
}
)

# only variables that are not dims are considered for renaming below
rename_vars = list(set(ds.variables) - set(ds.dims))

# for each target not yet in the dataset, rename the first matching candidate;
# warn when more than one candidate matches
for target, candidates in rename_dict.items():
if target not in ds:
matching_candidates = [ca for ca in candidates if ca in rename_vars]
if len(matching_candidates) > 0:
if len(matching_candidates) > 1:
warnings.warn(
f"{ds_id}:While renaming to target `{target}`, more than one candidate was found {matching_candidates}. Renaming {matching_candidates[0]} to {target}. Please double check results."
)
ds = ds.rename({matching_candidates[0]: target})

# special treatment for 'lon'/'lat' if there is no 'x'/'y' after renaming process
for di, co in [("x", "lon"), ("y", "lat")]:
if di not in ds.dims and co in ds.dims:
ds = ds.rename({co: di})

# now rename the variables
# try and pass here, cause some of the datasets (MIROC) have like 3 times the same info
# e.g. lev/sigma/zlev...not sure this is the best way to handle this with
# a silent fail here though...
for va in ds.data_vars:
try:
ds = ds.rename({va: inverted_rename_dict[va]})
except Exception as e:
warnings.warn(f"Renaming failed with {e}")
pass

# restore attributes
ds.attrs = attrs

return ds


Expand Down

0 comments on commit b0b451d

Please sign in to comment.