Skip to content

Commit

Permalink
Add resample distributions function for probability matching (#390)
Browse files Browse the repository at this point in the history
* feat: first attempt to add resample distributions function for probability matching

* fix: remove erroneous import

* feat: add resampling option to probability matching step

* fix: make sure there are no nans in the resampling function

* fix: fill up space outside radar domain with model data and change function names

* docs: add info about the weights

* fix: add requested changes from review

* fix: add tests and make sure probability stays within bounds

* Better docstrings

* Minor code improvements

* Add test

* Run black

* Handle nans outside of resampling method

---------

Co-authored-by: Daniele Nerini <daniele.nerini@gmail.com>
  • Loading branch information
RubenImhoff and dnerini authored Jul 22, 2024
1 parent 3e433ff commit d577a98
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 40 deletions.
53 changes: 40 additions & 13 deletions pysteps/blending/steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def forecast(
conditional=False,
probmatching_method="cdf",
mask_method="incremental",
resample_distribution=True,
smooth_radar_mask_range=0,
callback=None,
return_output=True,
Expand Down Expand Up @@ -205,25 +206,32 @@ def forecast(
If set to True, compute the statistics of the precipitation field
conditionally by excluding pixels where the values are below the threshold
precip_thr.
probmatching_method: {'cdf','mean',None}, optional
Method for matching the statistics of the forecast field with those of
the most recently observed one. 'cdf'=map the forecast CDF to the observed
one, 'mean'=adjust only the conditional mean value of the forecast field
in precipitation areas, None=no matching applied. Using 'mean' requires
that mask_method is not None.
mask_method: {'obs','incremental',None}, optional
The method to use for masking no precipitation areas in the forecast field.
The masked pixels are set to the minimum value of the observations.
'obs' = apply precip_thr to the most recently observed precipitation intensity
field, 'incremental' = iteratively buffer the mask with a certain rate
(currently it is 1 km/min), None=no masking.
resample_distribution: bool, optional
Method to resample the distribution from the extrapolation and NWP cascade as input
for the probability matching. Not resampling these distributions may lead to losing
some extremes when the weight of both the extrapolation and NWP cascade is similar.
Defaults to True.
smooth_radar_mask_range: int, Default is 0.
Method to smooth the transition between the radar-NWP-noise blend and the NWP-noise
blend near the edge of the radar domain (radar mask), where the radar data is either
not present anymore or is not reliable. If set to 0 (grid cells), this generates a normal forecast without smoothing. To create a smooth mask, this range
should be a positive value, representing a buffer band of a number of pixels
by which the mask is cropped and smoothed. The smooth radar mask removes
the hard edges between NWP and radar in the final blended product. Typically, a value between 50 and 100 km can be used. 80 km generally gives good results.
probmatching_method: {'cdf','mean',None}, optional
Method for matching the statistics of the forecast field with those of
the most recently observed one. 'cdf'=map the forecast CDF to the observed
one, 'mean'=adjust only the conditional mean value of the forecast field
in precipitation areas, None=no matching applied. Using 'mean' requires
that mask_method is not None.
not present anymore or is not reliable. If set to 0 (grid cells), this generates a
normal forecast without smoothing. To create a smooth mask, this range should be a
positive value, representing a buffer band of a number of pixels by which the mask
is cropped and smoothed. The smooth radar mask removes the hard edges between NWP
and radar in the final blended product. Typically, a value between 50 and 100 km
can be used. 80 km generally gives good results.
callback: function, optional
Optional function that is called after computation of each time step of
the nowcast. The function takes one argument: a three-dimensional array
Expand Down Expand Up @@ -1404,7 +1412,6 @@ def worker(j):
# latest extrapolated radar rainfall field blended with the
# nwp model(s) rainfall forecast fields as 'benchmark'.

# TODO: Check probability matching method
# 8.7.1 first blend the extrapolated rainfall field (the field
# that is only used for post-processing steps) with the NWP
# rainfall forecast for this time step using the weights
Expand Down Expand Up @@ -1538,19 +1545,39 @@ def worker(j):
# Set to min value outside of mask
R_f_new[~MASK_prec_] = R_cmin

# If probmatching_method is not None, resample the distribution from
# both the extrapolation cascade and the model (NWP) cascade and use
# that for the probability matching
if probmatching_method is not None and resample_distribution:
# deal with missing values
arr1 = R_pm_ep[t_index]
arr2 = precip_models_pm_temp[j]
arr2 = np.where(np.isnan(arr2), np.nanmin(arr2), arr2)
arr1 = np.where(np.isnan(arr1), arr2, arr1)
# resample weights based on cascade level 2
R_pm_resampled = probmatching.resample_distributions(
first_array=arr1,
second_array=arr2,
probability_first_array=weights_pm_normalized[0],
)
else:
R_pm_resampled = R_pm_blended.copy()

if probmatching_method == "cdf":
# Adjust the CDF of the forecast to match the most recent
# benchmark rainfall field (R_pm_blended). If the forecast
if np.any(np.isfinite(R_f_new)):
R_f_new = probmatching.nonparam_match_empirical_cdf(
R_f_new, R_pm_blended
R_f_new, R_pm_resampled
)
R_pm_resampled = None
elif probmatching_method == "mean":
# Use R_pm_blended as benchmark field and
mu_0 = np.mean(R_pm_blended[R_pm_blended >= precip_thr])
mu_0 = np.mean(R_pm_resampled[R_pm_resampled >= precip_thr])
MASK = R_f_new >= precip_thr
mu_fct = np.mean(R_f_new[MASK])
R_f_new[MASK] = R_f_new[MASK] - mu_fct + mu_0
R_pm_resampled = None

R_f_out.append(R_f_new)

Expand Down
51 changes: 51 additions & 0 deletions pysteps/postprocessing/probmatching.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
pmm_init
pmm_compute
shift_scale
resample_distributions
"""

import numpy as np
Expand Down Expand Up @@ -259,6 +260,56 @@ def _get_error(scale):
return shift, scale, R.reshape(shape)


def resample_distributions(first_array, second_array, probability_first_array):
"""
Merges two distributions (e.g., from the extrapolation nowcast and NWP in the blending module)
to effectively combine two distributions for probability matching without losing extremes.
Parameters
----------
first_array: array_like
One of the two arrays from which the distribution should be sampled (e.g., the extrapolation
cascade). It must be of the same shape as `second_array`. Input must not contain NaNs.
second_array: array_like
One of the two arrays from which the distribution should be sampled (e.g., the NWP (model)
cascade). It must be of the same shape as `first_array`.. Input must not contain NaNs.
probability_first_array: float
The weight that `first_array` should get (a value between 0 and 1). This determines the
likelihood of selecting elements from `first_array` over `second_array`.
Returns
-------
csort: array_like
The combined output distribution. This is an array of the same shape as the input arrays,
where each element is chosen from either `first_array` or `second_array` based on the specified
probability, and then sorted in descending order.
Raises
------
ValueError
If `first_array` and `second_array` do not have the same shape or if inputs contain NaNs.
"""

# Valide inputs
if first_array.shape != second_array.shape:
raise ValueError("first_array and second_array must have the same shape")
if np.isnan(first_array).any() or np.isnan(second_array).any():
raise ValueError("Input arrays must not contain NaNs")
probability_first_array = np.clip(probability_first_array, 0.0, 1.0)

# Flatten and sort the arrays
asort = np.sort(first_array, axis=None)[::-1]
bsort = np.sort(second_array, axis=None)[::-1]
n = asort.shape[0]

# Resample the distributions
idxsamples = np.random.binomial(1, probability_first_array, n).astype(bool)
csort = np.where(idxsamples, asort, bsort)
csort = np.sort(csort)[::-1]

return csort


def _invfunc(y, fx, fy):
if len(y) == 0:
return np.array([])
Expand Down
59 changes: 32 additions & 27 deletions pysteps/tests/test_blending_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,40 @@


steps_arg_values = [
(1, 3, 4, 8, None, None, False, "spn", True, 4, False, False, 0),
(1, 3, 4, 8, "obs", None, False, "spn", True, 4, False, False, 0),
(1, 3, 4, 8, "incremental", None, False, "spn", True, 4, False, False, 0),
(1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0),
(1, 3, 4, 8, None, "cdf", False, "spn", True, 4, False, False, 0),
(1, 3, 4, 8, "incremental", "cdf", False, "spn", True, 4, False, False, 0),
(1, 3, 4, 6, "incremental", "cdf", False, "bps", True, 4, False, False, 0),
(1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0),
(1, 3, 4, 9, "incremental", "cdf", False, "spn", True, 4, False, False, 0),
(2, 3, 10, 8, "incremental", "cdf", False, "spn", True, 10, False, False, 0),
(5, 3, 5, 8, "incremental", "cdf", False, "spn", True, 5, False, False, 0),
(1, 10, 1, 8, "incremental", "cdf", False, "spn", True, 1, False, False, 0),
(2, 3, 2, 8, "incremental", "cdf", True, "spn", True, 2, False, False, 0),
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 0),
(1, 3, 4, 8, None, None, False, "spn", True, 4, False, False, 0, False),
(1, 3, 4, 8, "obs", None, False, "spn", True, 4, False, False, 0, False),
(1, 3, 4, 8, "incremental", None, False, "spn", True, 4, False, False, 0, False),
(1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0, False),
(1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0, True),
(1, 3, 4, 8, None, "cdf", False, "spn", True, 4, False, False, 0, False),
(1, 3, 4, 8, "incremental", "cdf", False, "spn", True, 4, False, False, 0, False),
(1, 3, 4, 6, "incremental", "cdf", False, "bps", True, 4, False, False, 0, False),
(1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0, False),
(1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0, True),
(1, 3, 4, 9, "incremental", "cdf", False, "spn", True, 4, False, False, 0, False),
(2, 3, 10, 8, "incremental", "cdf", False, "spn", True, 10, False, False, 0, False),
(5, 3, 5, 8, "incremental", "cdf", False, "spn", True, 5, False, False, 0, False),
(1, 10, 1, 8, "incremental", "cdf", False, "spn", True, 1, False, False, 0, False),
(2, 3, 2, 8, "incremental", "cdf", True, "spn", True, 2, False, False, 0, False),
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 0, False),
# Test the case where the radar image contains no rain.
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, False, 0),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0),
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, False, 0, False),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0, False),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0, True),
# Test the case where the NWP fields contain no rain.
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 0),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, True, 0),
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 0, False),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, True, 0, True),
# Test the case where both the radar image and the NWP fields contain no rain.
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, True, 0),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, True, 0),
(5, 3, 5, 6, "obs", "mean", True, "spn", True, 5, True, True, 0),
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, True, 0, False),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, True, 0, False),
(5, 3, 5, 6, "obs", "mean", True, "spn", True, 5, True, True, 0, False),
# Test for smooth radar mask
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 80),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, False, 80),
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, False, False, 80),
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 80),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 80),
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, True, True, 80),
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 80, False),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, False, 80, False),
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, False, False, 80, False),
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 80, False),
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 80, True),
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, True, True, 80, False),
]

steps_arg_names = (
Expand All @@ -55,6 +58,7 @@
"zero_radar",
"zero_nwp",
"smooth_radar_mask_range",
"resample_distribution",
)


Expand All @@ -73,6 +77,7 @@ def test_steps_blending(
zero_radar,
zero_nwp,
smooth_radar_mask_range,
resample_distribution,
):
pytest.importorskip("cv2")

Expand Down
57 changes: 57 additions & 0 deletions pysteps/tests/test_postprocessing_probmatching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import numpy as np
import pytest
from pysteps.postprocessing.probmatching import resample_distributions


class TestResampleDistributions:

@pytest.fixture(autouse=True)
def setup(self):
# Set the seed for reproducibility
np.random.seed(42)

def test_valid_inputs(self):
first_array = np.array([1, 3, 5, 7, 9])
second_array = np.array([2, 4, 6, 8, 10])
probability_first_array = 0.6
result = resample_distributions(
first_array, second_array, probability_first_array
)
expected_result = np.array([9, 8, 6, 3, 1]) # Expected result based on the seed
assert result.shape == first_array.shape
assert np.array_equal(result, expected_result)

def test_probability_zero(self):
first_array = np.array([1, 3, 5, 7, 9])
second_array = np.array([2, 4, 6, 8, 10])
probability_first_array = 0.0
result = resample_distributions(
first_array, second_array, probability_first_array
)
assert np.array_equal(result, np.sort(second_array)[::-1])

def test_probability_one(self):
first_array = np.array([1, 3, 5, 7, 9])
second_array = np.array([2, 4, 6, 8, 10])
probability_first_array = 1.0
result = resample_distributions(
first_array, second_array, probability_first_array
)
assert np.array_equal(result, np.sort(first_array)[::-1])

def test_nan_in_inputs(self):
array_with_nan = np.array([1, 3, np.nan, 7, 9])
array_without_nan = np.array([2, 4, 6, 8, 10])
probability_first_array = 0.6
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
resample_distributions(
array_with_nan, array_without_nan, probability_first_array
)
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
resample_distributions(
array_without_nan, array_with_nan, probability_first_array
)
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
resample_distributions(
array_with_nan, array_with_nan, probability_first_array
)

0 comments on commit d577a98

Please sign in to comment.