Skip to content

Commit

Permalink
Fill missing dates by interpolating data (ecmwf#81)
Browse files Browse the repository at this point in the history
* add interpolate and closest
  • Loading branch information
b8raoult authored Oct 11, 2024
1 parent 2a88ece commit 65ad2dd
Show file tree
Hide file tree
Showing 9 changed files with 213 additions and 43 deletions.
8 changes: 5 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@ Keep it human-readable, your future self will thank you!

## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09

## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04
### Changed

- Update documentation
- Add support to fill missing dates

## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04

- Update documentation
### Changed

- Add `variables_metadata` entry in the dataset metadata
- Update documentation

### Changed

Expand Down
1 change: 1 addition & 0 deletions docs/using/code/fill_missing_dates1_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ds = open_dataset(dataset, fill_missing_dates="interpolate")
1 change: 1 addition & 0 deletions docs/using/code/fill_missing_dates2_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ds = open_dataset(dataset, fill_missing_dates="closest")
1 change: 0 additions & 1 deletion docs/using/code/missing_dates_.py

This file was deleted.

1 change: 1 addition & 0 deletions docs/using/code/set_missing_dates_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ds = open_dataset(dataset, set_missing_dates=["2010-01-01T12:00:00", "2010-02-01T12:00:00"])
23 changes: 21 additions & 2 deletions docs/using/missing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,25 @@
Managing missing dates
########################

**************************************************
Filling the missing dates with artificial values
**************************************************

When you have missing dates in a dataset, you can fill them with
artificial values. You can either fill them with values that are the
result of a linear interpolation between the two closest dates:

.. literalinclude:: code/fill_missing_dates1_.py

Or you can select the copy the value of the closest date:

.. literalinclude:: code/fill_missing_dates2_.py

if the missing date is exactly in the middle of two dates, the library
will choose that value of the largest date. You can change this behavior
by setting the ``closest`` parameter to ``'down'`` or ``'up'``
explicitly.

************************************************
Skipping missing when iterating over a dataset
************************************************
Expand Down Expand Up @@ -72,7 +91,7 @@ the datasets to make the dates contiguous.
Debugging
***********

You can set missing dates using the ``missing_dates`` option. This
You can set missing dates using the ``set_missing_dates`` option. This
option is for debugging purposes only.

.. literalinclude:: code/missing_dates_.py
.. literalinclude:: code/set_missing_dates_.py
33 changes: 5 additions & 28 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ dynamic = [
"version",
]
dependencies = [
"anemoi-utils[provenance]>=0.3.15",
"anemoi-utils[provenance]>=0.3.18",
"cfunits",
"numpy",
"pyyaml",
Expand All @@ -60,43 +60,20 @@ dependencies = [
]

optional-dependencies.all = [
"boto3",
"earthkit-data[mars]>=0.9",
"earthkit-geo>=0.2",
"earthkit-meteo",
"ecmwflibs>=0.6.3",
"entrypoints",
"gcsfs",
"kerchunk",
"pyproj",
"requests",
"anemoi-datasets[create,remote,xarray]",
]

optional-dependencies.create = [
"earthkit-data[mars]>=0.9",
"earthkit-data[mars]>=0.10.7",
"earthkit-geo>=0.2",
"earthkit-meteo",
"ecmwflibs>=0.6.3",
"eccodes>=2.38.1",
"entrypoints",
"pyproj",
]

optional-dependencies.dev = [
"boto3",
"earthkit-data[mars]>=0.9",
"earthkit-geo>=0.2",
"earthkit-meteo",
"ecmwflibs>=0.6.3",
"entrypoints",
"gcsfs",
"kerchunk",
"nbsphinx",
"pandoc",
"pyproj",
"pytest",
"requests",
"sphinx",
"sphinx-rtd-theme",
"anemoi-datasets[all,docs,tests]",
]

optional-dependencies.docs = [
Expand Down
26 changes: 17 additions & 9 deletions src/anemoi/datasets/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ def _subset(self, **kwargs):
if not kwargs:
return self.mutate()

# This one must be first
if "fill_missing_dates" in kwargs:
from .fill_missing import fill_missing_dates_factory

fill_missing_dates = kwargs.pop("fill_missing_dates")
ds = fill_missing_dates_factory(self, fill_missing_dates, kwargs)
return ds._subset(**kwargs).mutate()

if "start" in kwargs or "end" in kwargs:
start = kwargs.pop("start", None)
end = kwargs.pop("end", None)
Expand All @@ -64,12 +72,6 @@ def _subset(self, **kwargs):
.mutate()
)

if "interpolate_frequency" in kwargs:
from .interpolate import InterpolateFrequency

interpolate_frequency = kwargs.pop("interpolate_frequency")
return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate()

if "select" in kwargs:
from .select import Select

Expand Down Expand Up @@ -121,11 +123,11 @@ def _subset(self, **kwargs):
bbox = kwargs.pop("area")
return Cropping(self, bbox)._subset(**kwargs).mutate()

if "missing_dates" in kwargs:
if "set_missing_dates" in kwargs:
from .missing import MissingDates

missing_dates = kwargs.pop("missing_dates")
return MissingDates(self, missing_dates)._subset(**kwargs).mutate()
set_missing_dates = kwargs.pop("set_missing_dates")
return MissingDates(self, set_missing_dates)._subset(**kwargs).mutate()

if "skip_missing_dates" in kwargs:
from .missing import SkipMissingDates
Expand All @@ -139,6 +141,12 @@ def _subset(self, **kwargs):
if skip_missing_dates:
return SkipMissingDates(self, expected_access)._subset(**kwargs).mutate()

if "interpolate_frequency" in kwargs:
from .interpolate import InterpolateFrequency

interpolate_frequency = kwargs.pop("interpolate_frequency")
return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate()

# Keep last
if "shuffle" in kwargs:
from .subset import Subset
Expand Down
162 changes: 162 additions & 0 deletions src/anemoi/datasets/data/fill_missing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

import logging

import numpy as np

from anemoi.datasets.data import MissingDateError

from .debug import Node
from .debug import debug_indexing
from .forwards import Forwards
from .indexing import apply_index_to_slices_changes
from .indexing import expand_list_indexing
from .indexing import index_to_slices
from .indexing import update_tuple

LOG = logging.getLogger(__name__)


class MissingDatesFill(Forwards):
def __init__(self, dataset):
super().__init__(dataset)
self._missing = set(dataset.missing)
self._warnings = set()

@debug_indexing
@expand_list_indexing
def _get_tuple(self, index):
index, changes = index_to_slices(index, self.shape)
index, previous = update_tuple(index, 0, slice(None))
result = self._get_slice(previous)
return apply_index_to_slices_changes(result[index], changes)

def _get_slice(self, s):
return np.stack([self[i] for i in range(*s.indices(self._len))])

@property
def missing(self):
return set()

@debug_indexing
def __getitem__(self, n):

try:
return self.forward[n]
except MissingDateError:
pass

if isinstance(n, tuple):
return self._get_tuple(n)

if isinstance(n, slice):
return self._get_slice(n)

if n < 0:
n += self._len

a = None
i = n
while a is None and i >= 0:
if i in self._missing:
i -= 1
else:
a = i

len = self._len
b = None
i = n
while b is None and n < len:
if i in self._missing:
i += 1
else:
b = i

return self._fill_missing(n, a, b)


class MissingDatesClosest(MissingDatesFill):

def __init__(self, dataset, closest):
super().__init__(dataset)
self.closest = closest
self._closest = {}

def _fill_missing(self, n, a, b):

if n not in self._warnings:
LOG.warning(f"Missing date at index {n} ({self.dates[n]})")
if abs(n - a) == abs(b - n):
if self.closest == "up":
u = b
else:
u = a
else:
if abs(n - a) < abs(b - n):
u = a
else:
u = b
LOG.warning(f"Using closest date {u} ({self.dates[u]})")

self._closest[n] = u
self._warnings.add(n)

return self.forward[self._closest[n]]

def subclass_metadata_specific(self):
return {"closest": self.closest}

def tree(self):
return Node(self, [self.forward.tree()], closest=self.closest)


class MissingDatesInterpolate(MissingDatesFill):
def __init__(self, dataset):
super().__init__(dataset)
self._alpha = {}

def _fill_missing(self, n, a, b):
if n not in self._warnings:
LOG.warning(f"Missing date at index {n} ({self.dates[n]})")

if a is None or b is None:
raise MissingDateError(
f"Cannot interpolate at index {n} ({self.dates[n]}). Are the first or last date missing?"
)

assert a < n < b, (a, n, b)

alpha = (n - a) / (b - a)
assert 0 < alpha < 1, alpha

LOG.warning(f"Interpolating between index {a} ({self.dates[a]}) and {b} ({self.dates[b]})")
LOG.warning(f"Interpolation {1 - alpha:g} * ({self.dates[a]}) + {alpha:g} * ({self.dates[b]})")

self._alpha[n] = alpha

self._warnings.add(n)

alpha = self._alpha[n]
return self.forward[a] * (1 - alpha) + self.forward[b] * alpha

def subclass_metadata_specific(self):
return {}

def tree(self):
return Node(self, [self.forward.tree()])


def fill_missing_dates_factory(dataset, method, kwargs):
if method == "closest":
closest = kwargs.get("closest", "up")
return MissingDatesClosest(dataset, closest=closest)

if method == "interpolate":
return MissingDatesInterpolate(dataset)

raise ValueError(f"Invalid `fill_missing_dates` method '{method}'")

0 comments on commit 65ad2dd

Please sign in to comment.