Skip to content

Commit

Permalink
Merge branch 'develop' of github.com:MeteoSwiss-APN/anemoi-datasets into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
icedoom888 committed Dec 11, 2024
2 parents a4586d5 + 2c1e6b1 commit b8471d3
Show file tree
Hide file tree
Showing 9 changed files with 49 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ repos:
- --force-single-line-imports
- --profile black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.2
rev: v0.8.1
hooks:
- id: ruff
args:
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Keep it human-readable, your future self will thank you!
## Changed

- Fix metadata serialization handling of numpy.integer (#140)
- Fix negative variance for constant variables (#148)
- Fix cutout slicing of grid dimension (#145)

### Added
Expand Down
2 changes: 2 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ datasets <building-introduction>`.

- :doc:`overview`
- :doc:`installing`
- :doc:`naming_conventions`

.. toctree::
:maxdepth: 1
:hidden:

overview
installing
naming_conventions

**Using training datasets**

Expand Down
8 changes: 8 additions & 0 deletions docs/naming_conventions.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
####################
Naming conventions
####################

These are the current naming conventions for datasets:

`Anemoi Naming Conventions
<https://anemoi-registry.readthedocs.io/en/latest/naming-conventions.html>`_
2 changes: 1 addition & 1 deletion src/anemoi/datasets/create/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def raise_if_not_valid(self, print=print):
raise ValueError(self.error_message)

def _parse(self, name):
pattern = r"^(\w+)-([\w-]+)-(\w+)-(\w+)-(\d\d\d\d)-(\d\d\d\d)-(\d+h)-v(\d+)-?([a-zA-Z0-9-]+)?$"
pattern = r"^(\w+)-([\w-]+)-(\w+)-(\w+)-(\d\d\d\d)-(\d\d\d\d)-(\d+h|\d+m)-v(\d+)-?([a-zA-Z0-9-]+)?$"
match = re.match(pattern, name)

if not match:
Expand Down
16 changes: 15 additions & 1 deletion src/anemoi/datasets/create/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def assert_is_fieldlist(obj):
def import_function(name, kind):

from anemoi.transform.filters import filter_registry
from anemoi.transform.sources import source_registry

name = name.replace("-", "_")

Expand All @@ -45,7 +46,20 @@ def import_function(name, kind):
if filter_registry.lookup(name, return_none=True):

def proc(context, data, *args, **kwargs):
return filter_registry.create(name, *args, **kwargs)(data)
filter = filter_registry.create(name, *args, **kwargs)
filter.context = context
# filter = filter_registry.create(context, name, *args, **kwargs)
return filter.forward(data)

return proc

if kind == "sources":
if source_registry.lookup(name, return_none=True):

def proc(context, data, *args, **kwargs):
source = source_registry.create(name, *args, **kwargs)
# source = source_registry.create(context, name, *args, **kwargs)
return source.forward(data)

return proc

Expand Down
18 changes: 18 additions & 0 deletions src/anemoi/datasets/create/input/repeated_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ def transform(self, group_of_dates):
end += self.frequency

to_try = sorted(to_try - self.tried)
info = {k: "no-data" for k in to_try}

if not to_try:
LOG.warning(f"No new dates to try for {group_of_dates} in {self.source}")
# return []

if to_try:
result = self.source.select(
Expand All @@ -82,19 +87,32 @@ def transform(self, group_of_dates):
)
)

cnt = 0
for f in result.datasource:
cnt += 1
# We could keep the fields in a dictionary, but we don't want to keep the fields in memory
date = as_datetime(f.metadata("valid_datetime"))

if self.skip_all_nans:
if np.isnan(f.to_numpy()).all():
LOG.warning(f"Skipping {date} because all values are NaN")
info[date] = "all-nans"
continue

info[date] = "ok"
self.found.add(date)

if cnt == 0:
raise ValueError(f"No data found for {group_of_dates} in {self.source}")

self.tried.update(to_try)

if not self.found:
for k, v in info.items():
LOG.warning(f"{k}: {v}")

raise ValueError(f"No matching data found for {asked_dates} in {self.source}")

new_dates = defaultdict(list)

for date in asked_dates:
Expand Down
4 changes: 2 additions & 2 deletions src/anemoi/datasets/create/statistics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def fix_variance(x, name, count, sums, squares):

variances = squares / count - mean * mean
assert variances.shape == squares.shape == mean.shape
if all(variances >= 0):
if np.all(variances >= 0):
LOG.warning(f"All individual variances for {name} are positive, setting variance to 0.")
return 0

Expand All @@ -108,7 +108,7 @@ def fix_variance(x, name, count, sums, squares):
# return 0

LOG.warning(f"ERROR at least one individual variance is negative ({np.nanmin(variances)}).")
return x
return 0


def check_variance(x, variables_names, minimum, maximum, mean, count, sums, squares):
Expand Down
2 changes: 1 addition & 1 deletion src/anemoi/datasets/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ def _compute_constant_fields_from_a_few_samples(self):
sample_count = min(4, len(indices))
count = len(indices)

p = slice(0, count, count // (sample_count - 1))
p = slice(0, count, count // max(1, sample_count - 1))
samples = list(range(*p.indices(count)))

samples.append(count - 1) # Add last
Expand Down

0 comments on commit b8471d3

Please sign in to comment.