From 65ad2dd92f2865ab9e7c19b9be8e46206d05ac7f Mon Sep 17 00:00:00 2001 From: b8raoult <53792887+b8raoult@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:51:10 +0100 Subject: [PATCH] Fill missing dates by interpolating data (#81) * add interpolate and closest --- CHANGELOG.md | 8 +- docs/using/code/fill_missing_dates1_.py | 1 + docs/using/code/fill_missing_dates2_.py | 1 + docs/using/code/missing_dates_.py | 1 - docs/using/code/set_missing_dates_.py | 1 + docs/using/missing.rst | 23 +++- pyproject.toml | 33 +---- src/anemoi/datasets/data/dataset.py | 26 ++-- src/anemoi/datasets/data/fill_missing.py | 162 +++++++++++++++++++++++ 9 files changed, 213 insertions(+), 43 deletions(-) create mode 100644 docs/using/code/fill_missing_dates1_.py create mode 100644 docs/using/code/fill_missing_dates2_.py delete mode 100644 docs/using/code/missing_dates_.py create mode 100644 docs/using/code/set_missing_dates_.py create mode 100644 src/anemoi/datasets/data/fill_missing.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7eff57d2..daa40d14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,14 +16,16 @@ Keep it human-readable, your future self will thank you! ## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09 -## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04 +### Changed -- Update documentation +- Add support to fill missing dates + +## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04 -- Update documentation ### Changed - Add `variables_metadata` entry in the dataset metadata +- Update documentation ### Changed diff --git a/docs/using/code/fill_missing_dates1_.py b/docs/using/code/fill_missing_dates1_.py new file mode 100644 index 00000000..93d78ccf --- /dev/null +++ b/docs/using/code/fill_missing_dates1_.py @@ -0,0 +1 @@ +ds = open_dataset(dataset, fill_missing_dates="interpolate") diff --git a/docs/using/code/fill_missing_dates2_.py b/docs/using/code/fill_missing_dates2_.py new file mode 100644 index 00000000..6567aaf4 --- /dev/null +++ b/docs/using/code/fill_missing_dates2_.py @@ -0,0 +1 @@ +ds = open_dataset(dataset, fill_missing_dates="closest") diff --git a/docs/using/code/missing_dates_.py b/docs/using/code/missing_dates_.py deleted file mode 100644 index 5c2da45d..00000000 --- a/docs/using/code/missing_dates_.py +++ /dev/null @@ -1 +0,0 @@ -ds = open_dataset(dataset, missing_dates=["2010-01-01T12:00:00", "2010-02-01T12:00:00"]) diff --git a/docs/using/code/set_missing_dates_.py b/docs/using/code/set_missing_dates_.py new file mode 100644 index 00000000..35f35ed6 --- /dev/null +++ b/docs/using/code/set_missing_dates_.py @@ -0,0 +1 @@ +ds = open_dataset(dataset, set_missing_dates=["2010-01-01T12:00:00", "2010-02-01T12:00:00"]) diff --git a/docs/using/missing.rst b/docs/using/missing.rst index 3388a444..e7c6b9d4 100644 --- a/docs/using/missing.rst +++ b/docs/using/missing.rst @@ -4,6 +4,25 @@ Managing missing dates ######################## +************************************************** + Filling the missing dates with artificial values +************************************************** + +When you have missing dates in a dataset, you can fill them with +artificial values. You can either fill them with values that are the +result of a linear interpolation between the two closest dates: + +.. literalinclude:: code/fill_missing_dates1_.py + +Or you can select the copy the value of the closest date: + +.. literalinclude:: code/fill_missing_dates2_.py + +if the missing date is exactly in the middle of two dates, the library +will choose that value of the largest date. You can change this behavior +by setting the ``closest`` parameter to ``'down'`` or ``'up'`` +explicitly. + ************************************************ Skipping missing when iterating over a dataset ************************************************ @@ -72,7 +91,7 @@ the datasets to make the dates contiguous. Debugging *********** -You can set missing dates using the ``missing_dates`` option. This +You can set missing dates using the ``set_missing_dates`` option. This option is for debugging purposes only. -.. literalinclude:: code/missing_dates_.py +.. literalinclude:: code/set_missing_dates_.py diff --git a/pyproject.toml b/pyproject.toml index 5210d87c..797cfd0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dynamic = [ "version", ] dependencies = [ - "anemoi-utils[provenance]>=0.3.15", + "anemoi-utils[provenance]>=0.3.18", "cfunits", "numpy", "pyyaml", @@ -60,43 +60,20 @@ dependencies = [ ] optional-dependencies.all = [ - "boto3", - "earthkit-data[mars]>=0.9", - "earthkit-geo>=0.2", - "earthkit-meteo", - "ecmwflibs>=0.6.3", - "entrypoints", - "gcsfs", - "kerchunk", - "pyproj", - "requests", + "anemoi-datasets[create,remote,xarray]", ] optional-dependencies.create = [ - "earthkit-data[mars]>=0.9", + "earthkit-data[mars]>=0.10.7", "earthkit-geo>=0.2", "earthkit-meteo", - "ecmwflibs>=0.6.3", + "eccodes>=2.38.1", "entrypoints", "pyproj", ] optional-dependencies.dev = [ - "boto3", - "earthkit-data[mars]>=0.9", - "earthkit-geo>=0.2", - "earthkit-meteo", - "ecmwflibs>=0.6.3", - "entrypoints", - "gcsfs", - "kerchunk", - "nbsphinx", - "pandoc", - "pyproj", - "pytest", - "requests", - "sphinx", - "sphinx-rtd-theme", + "anemoi-datasets[all,docs,tests]", ] optional-dependencies.docs = [ diff --git a/src/anemoi/datasets/data/dataset.py b/src/anemoi/datasets/data/dataset.py index 2f6af1b1..56275d1e 100644 --- a/src/anemoi/datasets/data/dataset.py +++ b/src/anemoi/datasets/data/dataset.py @@ -41,6 +41,14 @@ def _subset(self, **kwargs): if not kwargs: return self.mutate() + # This one must be first + if "fill_missing_dates" in kwargs: + from .fill_missing import fill_missing_dates_factory + + fill_missing_dates = kwargs.pop("fill_missing_dates") + ds = fill_missing_dates_factory(self, fill_missing_dates, kwargs) + return ds._subset(**kwargs).mutate() + if "start" in kwargs or "end" in kwargs: start = kwargs.pop("start", None) end = kwargs.pop("end", None) @@ -64,12 +72,6 @@ def _subset(self, **kwargs): .mutate() ) - if "interpolate_frequency" in kwargs: - from .interpolate import InterpolateFrequency - - interpolate_frequency = kwargs.pop("interpolate_frequency") - return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate() - if "select" in kwargs: from .select import Select @@ -121,11 +123,11 @@ def _subset(self, **kwargs): bbox = kwargs.pop("area") return Cropping(self, bbox)._subset(**kwargs).mutate() - if "missing_dates" in kwargs: + if "set_missing_dates" in kwargs: from .missing import MissingDates - missing_dates = kwargs.pop("missing_dates") - return MissingDates(self, missing_dates)._subset(**kwargs).mutate() + set_missing_dates = kwargs.pop("set_missing_dates") + return MissingDates(self, set_missing_dates)._subset(**kwargs).mutate() if "skip_missing_dates" in kwargs: from .missing import SkipMissingDates @@ -139,6 +141,12 @@ def _subset(self, **kwargs): if skip_missing_dates: return SkipMissingDates(self, expected_access)._subset(**kwargs).mutate() + if "interpolate_frequency" in kwargs: + from .interpolate import InterpolateFrequency + + interpolate_frequency = kwargs.pop("interpolate_frequency") + return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate() + # Keep last if "shuffle" in kwargs: from .subset import Subset diff --git a/src/anemoi/datasets/data/fill_missing.py b/src/anemoi/datasets/data/fill_missing.py new file mode 100644 index 00000000..ca16fd65 --- /dev/null +++ b/src/anemoi/datasets/data/fill_missing.py @@ -0,0 +1,162 @@ +# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +import logging + +import numpy as np + +from anemoi.datasets.data import MissingDateError + +from .debug import Node +from .debug import debug_indexing +from .forwards import Forwards +from .indexing import apply_index_to_slices_changes +from .indexing import expand_list_indexing +from .indexing import index_to_slices +from .indexing import update_tuple + +LOG = logging.getLogger(__name__) + + +class MissingDatesFill(Forwards): + def __init__(self, dataset): + super().__init__(dataset) + self._missing = set(dataset.missing) + self._warnings = set() + + @debug_indexing + @expand_list_indexing + def _get_tuple(self, index): + index, changes = index_to_slices(index, self.shape) + index, previous = update_tuple(index, 0, slice(None)) + result = self._get_slice(previous) + return apply_index_to_slices_changes(result[index], changes) + + def _get_slice(self, s): + return np.stack([self[i] for i in range(*s.indices(self._len))]) + + @property + def missing(self): + return set() + + @debug_indexing + def __getitem__(self, n): + + try: + return self.forward[n] + except MissingDateError: + pass + + if isinstance(n, tuple): + return self._get_tuple(n) + + if isinstance(n, slice): + return self._get_slice(n) + + if n < 0: + n += self._len + + a = None + i = n + while a is None and i >= 0: + if i in self._missing: + i -= 1 + else: + a = i + + len = self._len + b = None + i = n + while b is None and n < len: + if i in self._missing: + i += 1 + else: + b = i + + return self._fill_missing(n, a, b) + + +class MissingDatesClosest(MissingDatesFill): + + def __init__(self, dataset, closest): + super().__init__(dataset) + self.closest = closest + self._closest = {} + + def _fill_missing(self, n, a, b): + + if n not in self._warnings: + LOG.warning(f"Missing date at index {n} ({self.dates[n]})") + if abs(n - a) == abs(b - n): + if self.closest == "up": + u = b + else: + u = a + else: + if abs(n - a) < abs(b - n): + u = a + else: + u = b + LOG.warning(f"Using closest date {u} ({self.dates[u]})") + + self._closest[n] = u + self._warnings.add(n) + + return self.forward[self._closest[n]] + + def subclass_metadata_specific(self): + return {"closest": self.closest} + + def tree(self): + return Node(self, [self.forward.tree()], closest=self.closest) + + +class MissingDatesInterpolate(MissingDatesFill): + def __init__(self, dataset): + super().__init__(dataset) + self._alpha = {} + + def _fill_missing(self, n, a, b): + if n not in self._warnings: + LOG.warning(f"Missing date at index {n} ({self.dates[n]})") + + if a is None or b is None: + raise MissingDateError( + f"Cannot interpolate at index {n} ({self.dates[n]}). Are the first or last date missing?" + ) + + assert a < n < b, (a, n, b) + + alpha = (n - a) / (b - a) + assert 0 < alpha < 1, alpha + + LOG.warning(f"Interpolating between index {a} ({self.dates[a]}) and {b} ({self.dates[b]})") + LOG.warning(f"Interpolation {1 - alpha:g} * ({self.dates[a]}) + {alpha:g} * ({self.dates[b]})") + + self._alpha[n] = alpha + + self._warnings.add(n) + + alpha = self._alpha[n] + return self.forward[a] * (1 - alpha) + self.forward[b] * alpha + + def subclass_metadata_specific(self): + return {} + + def tree(self): + return Node(self, [self.forward.tree()]) + + +def fill_missing_dates_factory(dataset, method, kwargs): + if method == "closest": + closest = kwargs.get("closest", "up") + return MissingDatesClosest(dataset, closest=closest) + + if method == "interpolate": + return MissingDatesInterpolate(dataset) + + raise ValueError(f"Invalid `fill_missing_dates` method '{method}'")