More flexible sampled_indicators #220

Merged · 7 commits · Nov 5, 2024
Changes from 5 commits
3 changes: 3 additions & 0 deletions CHANGELOG.rst
@@ -9,12 +9,15 @@ Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Trevor
New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* A `periods` parameter was added to ``frequency_analysis.local.fit`` to compute multiple separate periods with a single call. (:pull:`219`).
* In ``xhydro.cc.sampled_indicators``, the `delta_kind` argument (formerly `delta_type`) can now be a dictionary or None, in which case the `delta_kind` attribute of each variable is used. (:pull:`220`).
* In ``xhydro.cc.sampled_indicators``, weights along a `time` or `horizon` dimension will no longer reduce that dimension. (:pull:`220`).

Breaking changes
^^^^^^^^^^^^^^^^
* The `xhydro` testing utilities have been rewritten to use `pooch` for downloading and caching testing datasets from `hydrologie/xhydro-testdata`. (:pull:`212`).
* The `xhydro` testing utilities now require `pytest-xdist` as a development dependency. (:pull:`212`).
* Many core dependencies have been updated to more modern versions. (:pull:`218`).
* The `delta_type` argument in ``xhydro.cc.sampled_indicators`` has been renamed to `delta_kind` and is no longer positional. (:pull:`220`).

Internal changes
^^^^^^^^^^^^^^^^
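For reference, a minimal sketch of the updated call described in the entries above, assuming hypothetical datasets `ref` and `deltas` with variables `discharge` and `precip` (names illustrative only); `delta_kind` replaces the former positional `delta_type` and is now keyword-only:

import xhydro as xh

# 'ref' holds the historical percentiles; 'deltas' holds the future deltas.
fut_pct, hist_dist, delta_dist, fut_dist = xh.cc.sampled_indicators(
    ref,
    deltas=deltas,
    delta_kind={"discharge": "percentage", "precip": "absolute"},  # per-variable mapping
    n=50000,
    seed=0,
)

# Alternatively, pass delta_kind=None and set a 'delta_kind' attribute on each
# variable of 'deltas'.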
2 changes: 1 addition & 1 deletion docs/notebooks/climate_change.ipynb
@@ -516,7 +516,7 @@
"fut_pct, hist_dist, delta_dist, fut_dist = xh.cc.sampled_indicators(\n",
" ref,\n",
" deltas=deltas,\n",
" delta_type=\"percentage\",\n",
" delta_kind=\"percentage\",\n",
" delta_weights=weights,\n",
" n=n,\n",
" seed=0,\n",
156 changes: 133 additions & 23 deletions src/xhydro/cc.py
@@ -1,9 +1,10 @@
"""Module to compute climate change statistics using xscen functions."""

from typing import Optional, Union
from datetime import datetime

import numpy as np
import xarray as xr
import xscen as xs

# Special imports from xscen
from xscen import climatological_op, compute_deltas, ensemble_stats, produce_horizon
@@ -17,11 +18,11 @@
]


def sampled_indicators(
def sampled_indicators( # noqa: C901
ds: xr.Dataset,
deltas: xr.Dataset,
delta_type: str,
*,
delta_kind: str | dict | None = None,
ds_weights: xr.DataArray | None = None,
delta_weights: xr.DataArray | None = None,
n: int = 50000,
@@ -37,8 +38,10 @@ def sampled_indicators(
The percentiles should be stored in either a dimension called "percentile" [0, 100] or "quantile" [0, 1].
deltas : xr.Dataset
Dataset containing the future deltas to apply to the historical indicators.
delta_type : str
Type of delta provided. Must be one of ['absolute', 'percentage'].
delta_kind : str or dict, optional
Type of delta provided. Recognized values are: ['absolute', 'abs.', '+'], ['percentage', 'pct.', '%'].
If a dict is provided, it should map the variable names to their respective delta type.
If None, the variables should have a 'delta_kind' attribute.
ds_weights : xr.DataArray, optional
Weights to use when sampling the historical indicators, for dimensions other than 'percentile'/'quantile'.
Dimensions not present in this Dataset, or if None, will be sampled uniformly unless they are shared with 'deltas'.
@@ -65,6 +68,9 @@

Notes
-----
Weights along the 'time' or 'horizon' dimensions are supported, but behave differently from other dimensions. They will not
be stacked alongside other dimensions in the new 'sample' dimension. Rather, a separate sampling will be done for each time/horizon, using the same seed.

The future percentiles are computed as follows:
1. Sample 'n' values from the historical distribution, weighting the percentiles by their associated coverage.
2. Sample 'n' values from the delta distribution, using the provided weights.
@@ -90,7 +96,9 @@
)
}
)
deltas_weighted = True
if delta_weights is None:
deltas_weighted = False
dims = set(deltas.dims).difference(list(shared_dims) + exclude_dims)
delta_weights = xr.DataArray(
np.ones([deltas.sizes[dim] for dim in dims]),
@@ -115,31 +123,104 @@
f"Dimension(s) {problem_dims} is shared between 'ds' and 'deltas', but not between 'ds_weights' and 'delta_weights'."
)

# Prepare the operation to apply on the variables
if delta_kind is None:
try:
delta_kind = {
var: deltas[var].attrs["delta_kind"] for var in deltas.data_vars
}
except KeyError:
raise KeyError(
"The 'delta_kind' argument is None, but the variables within the 'deltas' Dataset are missing a 'delta_kind' attribute."
)
elif isinstance(delta_kind, str):
delta_kind = {var: delta_kind for var in deltas.data_vars}
elif isinstance(delta_kind, dict):
if not all([var in delta_kind for var in deltas.data_vars]):
raise ValueError(
f"If 'delta_kind' is a dict, it should contain all the variables in 'deltas'. Missing variables: "
f"{list(set(deltas.data_vars) - set(delta_kind))}."
)

def _add_sampling_attrs(d, prefix, history, ds_for_attrs=None):
for var in d.data_vars:
if ds_for_attrs is not None:
d[var].attrs = ds_for_attrs[var].attrs
d[var].attrs["sampling_kind"] = delta_kind[var]
d[var].attrs["sampling_n"] = n
d[var].attrs["sampling_seed"] = seed
d[var].attrs[
"description"
] = f"{prefix} of {d[var].attrs.get('long_name', var)} constructed from a perturbation approach and random sampling."
d[var].attrs[
"long_name"
] = f"{prefix} of {d[var].attrs.get('long_name', var)}"
old_history = d[var].attrs.get("history", "")
d[var].attrs[
"history"
] = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {history}\n {old_history}"
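# For illustration (assumed values, not part of the diff): after sampling, each
# variable carries provenance attributes such as sampling_kind="percentage",
# sampling_n=50000 and sampling_seed=0, plus an updated description, long_name
# and history.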

# Sample the distributions
_, pdim, mult = _perc_or_quantile(ds)
ds_dist = (
_weighted_sampling(ds, percentile_weights, n=n, seed=seed)
.drop_vars(["sample", *percentile_weights.dims])
.drop_vars("sample", errors="ignore")
.assign_coords({"sample": np.arange(n)})
)
_add_sampling_attrs(
ds_dist,
"Historical distribution",
f"{'Weighted' if ds_weights is not None else 'Uniform'} random sampling "
f"applied to the historical distribution using 'xhydro.sampled_indicators' "
f"and {n} samples.",
ds_for_attrs=ds,
)

deltas_dist = (
_weighted_sampling(deltas, delta_weights, n=n, seed=seed)
.drop_vars(["sample", *delta_weights.dims])
.drop_vars("sample", errors="ignore")
.assign_coords({"sample": np.arange(n)})
)
_add_sampling_attrs(
deltas_dist,
"Future delta distribution",
f"{'Weighted' if deltas_weighted else 'Uniform'} random sampling "
f"applied to the delta distribution using 'xhydro.sampled_indicators' "
f"and {n} samples.",
ds_for_attrs=deltas,
)

# Element-wise multiplication of the ref_dist and ens_dist
if delta_type == "percentage":
fut_dist = ds_dist * (1 + deltas_dist / 100)
elif delta_type == "absolute":
fut_dist = ds_dist + deltas_dist
else:
raise ValueError(
f"Unknown operation '{delta_type}', expected one of ['absolute', 'percentage']."
)
# Apply the deltas element-wise to the historical distribution
fut_dist = xr.Dataset()
for var in deltas.data_vars:
with xr.set_options(keep_attrs=True):
if delta_kind[var] in ["absolute", "abs.", "+"]:
fut_dist[var] = ds_dist[var] + deltas_dist[var]
elif delta_kind[var] in ["percentage", "pct.", "%"]:
fut_dist[var] = ds_dist[var] * (1 + deltas_dist[var] / 100)
else:
raise ValueError(
f"Unknown operation '{delta_kind[var]}', expected one of ['absolute', 'abs.', '+'], ['percentage', 'pct.', '%']."
)
_add_sampling_attrs(
fut_dist,
"Future distribution",
f"Future distribution constructed from a perturbation approach and random sampling with {n} samples.",
ds_for_attrs=ds,
)
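# For illustration (assumed values, not part of the diff): with a historical
# sample of 100.0, a 'percentage' delta of +8 yields 100.0 * (1 + 8 / 100) = 108.0,
# while an 'absolute' delta of +8.0 yields 100.0 + 8.0 = 108.0.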

# Build the attributes based on common attributes between the historical and delta distributions
fut_dist = xs.clean_up(fut_dist, common_attrs_only=[ds, deltas])

# Compute future percentiles
fut_pct = fut_dist.quantile(ds.percentile / mult, dim="sample")
with xr.set_options(keep_attrs=True):
fut_pct = fut_dist.quantile(ds.percentile / mult, dim="sample")
_add_sampling_attrs(
fut_pct,
"Future percentiles",
f"Future percentiles computed from the future distribution using 'xhydro.sampled_indicators' and {n} samples.",
ds_for_attrs=ds,
)

if pdim == "percentile":
fut_pct = fut_pct.rename({"quantile": "percentile"})
@@ -185,6 +266,7 @@ def _weighted_sampling(
ds: xr.Dataset, weights: xr.DataArray, n: int = 50000, seed: int | None = None
) -> xr.Dataset:
"""Sample from a distribution with weights.
In the case of weights on a 'time' or 'horizon' dimension, the sampling is done separately for each time/horizon, but with the same seed.

Parameters
----------
@@ -205,23 +287,51 @@
# Prepare the weights
if weights.isnull().any():
raise ValueError("The weights contain NaNs.")
weights = weights / weights.sum() # Must equal 1

# Weights on the time dimension need to be handled differently
time_dim = [dim for dim in weights.dims if dim in ["time", "horizon"]]
if len(time_dim) > 1:
raise NotImplementedError(
"Weights on multiple time dimensions are not supported."
)
time_dim = time_dim[0] if time_dim else ""
other_dims = list(set(weights.dims).difference([time_dim]))

# Must equal 1
weights = weights / weights.sum(other_dims)

# For performance reasons, remove the chunking along impacted dimensions
ds = ds.chunk({dim: -1 for dim in weights.dims})
weights = weights.chunk({dim: -1 for dim in weights.dims})

# Stack the dimensions containing weights
ds = ds.stack({"sample": sorted(list(weights.dims))})
weights = weights.stack({"sample": sorted(list(weights.dims))})
ds = ds.stack({"sample": sorted(other_dims)})
weights = weights.stack({"sample": sorted(other_dims)})
weights = weights.reindex_like(ds)

# Sample the dimensions with weights
rng = np.random.default_rng(seed=seed)
idx = rng.choice(weights.size, size=n, p=weights)

# Create the distribution dataset
ds_dist = ds.isel({"sample": idx}).chunk({"sample": -1})
if not time_dim:
idx = rng.choice(len(weights.sample), size=n, p=weights)
# Create the distribution dataset
ds_dist = ds.isel({"sample": idx}).chunk({"sample": -1})
else:
# 2D weights are not supported by np.random.choice, so we need to loop over the time dimension
ds_tmp = [
ds.sel({time_dim: time}).isel(
{
"sample": rng.choice(
len(weights.sample), size=n, p=weights.sel({time_dim: time})
)
}
)
for time in ds[time_dim].values
]
# Remove the coordinates from the sample dimension to allow concatenation
for i, ds_ in enumerate(ds_tmp):
ds_tmp[i] = ds_.reset_index("sample", drop=True)
ds_dist = xr.concat(ds_tmp, dim=time_dim).chunk({time_dim: -1, "sample": -1})

return ds_dist

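To make the per-time behaviour concrete, here is a minimal self-contained sketch mirroring the loop above, with hypothetical 'realization' and 'horizon' dimensions and made-up weights (normalized per horizon, as `_weighted_sampling` requires):

import numpy as np
import xarray as xr

rng = np.random.default_rng(seed=0)

# Hypothetical deltas: 3 realizations x 2 horizons.
deltas = xr.DataArray(
    [[5.0, 10.0], [15.0, 20.0], [25.0, 30.0]],
    dims=["realization", "horizon"],
    coords={"horizon": ["2041-2070", "2071-2100"]},
)

# Horizon-dependent weights over 'realization'; each horizon's weights sum to 1.
weights = xr.DataArray(
    [[0.2, 0.5], [0.3, 0.3], [0.5, 0.2]],
    dims=["realization", "horizon"],
    coords={"horizon": deltas["horizon"]},
)

# One independent draw per horizon: 'horizon' is kept as-is instead of being
# stacked into the new 'sample' dimension.
n = 1000
samples = xr.concat(
    [
        deltas.sel(horizon=h)
        .isel(
            realization=rng.choice(
                deltas.sizes["realization"], size=n, p=weights.sel(horizon=h).values
            )
        )
        .rename({"realization": "sample"})
        for h in deltas["horizon"].values
    ],
    dim="horizon",
)
print(dict(samples.sizes))  # e.g. {'horizon': 2, 'sample': 1000}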