Skip to content

Commit

Permalink
Add seed fixing option to PySMO's sampling methods to enhance reprodu…
Browse files Browse the repository at this point in the history
…cibility (#1307)

* Fix NumPy array creation error by specifying object type

* Removing print and display statements

* Adding a function for custom sampling.

- Users can explicitly define the sampling distribution for each variable. The sampling options currently available are random, uniform and Gaussian.

* Improve errors and warnings

* Tests for CustomSampling

* running black...

* Updating docs and example.

* Fix docs

* Improve docstrings.

* Improving tests based on feedback

* Edit Gaussian sampling bounds to allow for strict enforcement

* Add tests to validate for Gaussian bounds

* Update test_sampling.py

* Update test_sampling.py

* Add missing check in init

* Improve docs on Gaussian distribution samples.

* Update test_sampling.py

* Add random seed specification option

* Switch seed check to try-except

---------

Co-authored-by: Keith Beattie <ksbeattie@lbl.gov>
Co-authored-by: Dan Gunter <dkgunter@lbl.gov>
Co-authored-by: Andrew Lee <andrew.lee@netl.doe.gov>
Co-authored-by: Ludovico Bianchi <lbianchi@lbl.gov>
  • Loading branch information
5 people authored Jan 25, 2024
1 parent f07552d commit 7ee5489
Show file tree
Hide file tree
Showing 2 changed files with 314 additions and 2 deletions.
32 changes: 30 additions & 2 deletions idaes/core/surrogate/pysmo/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ def __init__(
sampling_type=None,
xlabels=None,
ylabels=None,
rand_seed=None,
):
"""
Initialization of **LatinHypercubeSampling** class. Two inputs are required.
Expand All @@ -496,6 +497,7 @@ def __init__(
Keyword Args:
xlabels (list): List of column names (if **data_input** is a dataframe) or column numbers (if **data_input** is an array) for the independent/input variables. Only used in "selection" mode. Default is None.
ylabels (list): List of column names (if **data_input** is a dataframe) or column numbers (if **data_input** is an array) for the dependent/output variables. Only used in "selection" mode. Default is None.
rand_seed (int): Option that allows users to fix the numpy random seed generator for reproducibility (if required).
Returns:
**self** function containing the input information
Expand Down Expand Up @@ -594,6 +596,13 @@ def __init__(
self.number_of_samples = number_of_samples
self.x_data = bounds_array # Only x data will be present in this case

if rand_seed is not None:
try:
self.seed_value = int(rand_seed)
np.random.seed(self.seed_value)
except ValueError:
raise ValueError("Random seed must be an integer.")

def variable_sample_creation(self, variable_min, variable_max):
"""
Expand Down Expand Up @@ -1269,6 +1278,7 @@ def __init__(
sampling_type=None,
xlabels=None,
ylabels=None,
rand_seed=None,
):
"""
Initialization of CVTSampling class. Two inputs are required, while an optional option to control the solution accuracy may be specified.
Expand All @@ -1285,6 +1295,7 @@ def __init__(
Keyword Args:
xlabels (list): List of column names (if **data_input** is a dataframe) or column numbers (if **data_input** is an array) for the independent/input variables. Only used in "selection" mode. Default is None.
ylabels (list): List of column names (if **data_input** is a dataframe) or column numbers (if **data_input** is an array) for the dependent/output variables. Only used in "selection" mode. Default is None.
rand_seed (int): Option that allows users to fix the numpy random seed generator for reproducibility (if required).
tolerance(float): Maximum allowable Euclidean distance between centres from consecutive iterations of the algorithm. Termination condition for algorithm.
- The smaller the value of tolerance, the better the solution but the longer the algorithm requires to converge. Default value is :math:`10^{-7}`.
Expand Down Expand Up @@ -1412,6 +1423,13 @@ def __init__(
raise Exception("Invalid tolerance input")
self.eps = tolerance

if rand_seed is not None:
try:
self.seed_value = int(rand_seed)
np.random.seed(self.seed_value)
except ValueError:
raise ValueError("Random seed must be an integer.")

@staticmethod
def random_sample_selection(no_samples, no_features):
"""
Expand Down Expand Up @@ -1591,6 +1609,7 @@ def __init__(
xlabels=None,
ylabels=None,
strictly_enforce_gaussian_bounds=False,
rand_seed=None,
):
"""
Initialization of CustomSampling class. Four inputs are required.
Expand All @@ -1608,6 +1627,7 @@ def __init__(
Keyword Args:
xlabels (list): List of column names (if **data_input** is a dataframe) or column numbers (if **data_input** is an array) for the independent/input variables. Only used in "selection" mode. Default is None.
ylabels (list): List of column names (if **data_input** is a dataframe) or column numbers (if **data_input** is an array) for the dependent/output variables. Only used in "selection" mode. Default is None.
rand_seed (int): Option that allows users to fix the numpy random seed generator for reproducibility (if required).
strictly_enforce_gaussian_bounds (bool): Boolean specifying whether the provided bounds for normal distributions should be strictly enforced. Note that selecting this option may affect the underlying distribution. Default is False.
Returns:
Expand Down Expand Up @@ -1732,13 +1752,21 @@ def __init__(
)
self.normal_bounds_enforced = strictly_enforce_gaussian_bounds

if rand_seed is not None:
try:
self.seed_value = int(rand_seed)
except ValueError:
raise ValueError("Random seed must be an integer.")
else:
self.seed_value = rand_seed

def generate_from_dist(self, dist_name):
if dist_name.lower() in ["uniform", "random"]:
dist = getattr(np.random.default_rng(), dist_name.lower())
dist = getattr(np.random.default_rng(self.seed_value), dist_name.lower())
var_values = np.array(dist(size=self.number_of_samples))
return dist, var_values
elif dist_name.lower() == "normal":
dist = getattr(np.random.default_rng(), "normal")
dist = getattr(np.random.default_rng(self.seed_value), "normal")
var_values = dist(loc=0.5, scale=1 / 6, size=self.number_of_samples)
if not self.normal_bounds_enforced:
return dist, np.array(var_values)
Expand Down
Loading

0 comments on commit 7ee5489

Please sign in to comment.