From 2f4c0c7891200d8370f2a29b2c6061535c162349 Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:16:36 -0600 Subject: [PATCH 01/11] Updated README.md; Changed file names and entry points --- README.md | 2 +- .../cli/{analyze_EEXE.py => analyze_REXEE.py} | 0 .../cli/{explore_EEXE.py => explore_REXEE.py} | 0 ensemble_md/cli/{run_EEXE.py => run_REXEE.py} | 0 ensemble_md/replica_exchange_ee.py | 1477 +++++++++++++++++ setup.py | 6 +- 6 files changed, 1481 insertions(+), 4 deletions(-) rename ensemble_md/cli/{analyze_EEXE.py => analyze_REXEE.py} (100%) rename ensemble_md/cli/{explore_EEXE.py => explore_REXEE.py} (100%) rename ensemble_md/cli/{run_EEXE.py => run_REXEE.py} (100%) create mode 100644 ensemble_md/replica_exchange_ee.py diff --git a/README.md b/README.md index ba271b4b..1630864e 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Ensemble Molecular Dynamics [![Downloads](https://static.pepy.tech/badge/ensemble-md)](https://pepy.tech/project/ensemble-md) [![Twitter Follow](https://img.shields.io/twitter/follow/WeiTseHsu?style=social)](https://twitter.com/WeiTseHsu) -**ensemble_md** is a Python package that provides methods for setting up, running, and analyzing molecular dynamics ensembles in GROMACS. The current implementation is mainly for synchronous ensemble of expanded ensemble (EXEE), but we will develop methods like asynchronous EEXE, or ensemble of alchemical metadynamics in the future. For installation instructions, theory overview, tutorials, and API references, please visit the [documentation](https://ensemble-md.readthedocs.io/en/latest/?badge=latest). +**ensemble_md** is a Python package that provides methods for setting up, running, and analyzing molecular dynamics ensembles in GROMACS. The current implementation is mainly for synchronous replica exchange (REX) of expanded ensemble (EE), abbreviated as REXEE. In the future, we will develop methods like asynchronous REXEE or multi-topology REXEE. For installation instructions, theory overview, tutorials, and API references, please visit the [documentation](https://ensemble-md.readthedocs.io/en/latest/?badge=latest). ### Copyright diff --git a/ensemble_md/cli/analyze_EEXE.py b/ensemble_md/cli/analyze_REXEE.py similarity index 100% rename from ensemble_md/cli/analyze_EEXE.py rename to ensemble_md/cli/analyze_REXEE.py diff --git a/ensemble_md/cli/explore_EEXE.py b/ensemble_md/cli/explore_REXEE.py similarity index 100% rename from ensemble_md/cli/explore_EEXE.py rename to ensemble_md/cli/explore_REXEE.py diff --git a/ensemble_md/cli/run_EEXE.py b/ensemble_md/cli/run_REXEE.py similarity index 100% rename from ensemble_md/cli/run_EEXE.py rename to ensemble_md/cli/run_REXEE.py diff --git a/ensemble_md/replica_exchange_ee.py b/ensemble_md/replica_exchange_ee.py new file mode 100644 index 00000000..c2500ef6 --- /dev/null +++ b/ensemble_md/replica_exchange_ee.py @@ -0,0 +1,1477 @@ +#################################################################### +# # +# ensemble_md, # +# a python package for running GROMACS simulation ensembles # +# # +# Written by Wei-Tse Hsu # +# Copyright (c) 2022 University of Colorado Boulder # +# # +#################################################################### +""" +The :obj:`.ensemble_EXE` module provides functions for setting up and ensemble of expanded ensemble. +""" +import os +import sys +import copy +import yaml +import random +import warnings +import importlib +import subprocess +import numpy as np +from mpi4py import MPI +from itertools import combinations +from collections import OrderedDict +from alchemlyb.parsing.gmx import _get_headers as get_headers +from alchemlyb.parsing.gmx import _extract_dataframe as extract_dataframe + +import ensemble_md +from ensemble_md.utils import utils +from ensemble_md.utils import gmx_parser +from ensemble_md.utils.exceptions import ParameterError + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() + + +class ReplicaExchangeEE: + """ + This class provides a variety of functions useful for setting up and running + replica exchange (REX) of expanded ensemble (EE), or REXEE simulations. + Upon instantiation, all parameters in the YAML + file will be assigned to an attribute in the class. In addition to these variables, + below is a list of attributes of the class. (All the the attributes are assigned by + :obj:`set_params` unless otherwise noted.) + + :ivar gmx_path: The absolute path of the GROMACS exectuable. + :ivar gmx_version: The version of the GROMACS executable. + :ivar yaml: The input YAML file used to instantiate the class. Assigned by the :code:`__init__` function. + :ivar warnings: Warnings about parameter specification in either YAML or MDP files. + :ivar reformatted_mdp: Whether the templated MDP file has been reformatted by replacing hyphens + with underscores or not. + :ivar template: The instance of the :obj:`MDP` class based on the template MDP file. + :ivar dt: The simulation timestep in ps. + :ivar temp: The simulation temperature in Kelvin. + :ivar fixed_weights: Whether the weights will be fixed during the simulation (according to the template MDP file). + :ivar updating_weights: The list of weights as a function of time (since the last update of the Wang-Landau + incrementor) for different replicas. The length is equal to the number of replicas. This is only relevant for + weight-updating simulations. + :ivar equilibrated_weights: The equilibrated weights of different replicas. For weight-equilibrating simulations, + this list is initialized as a list of empty lists. Otherwise (weight-fixed), it is initialized as a list of + :code:`None`. + :ivar current_wl_delta: The current value of the Wang-Landau incrementor. This is only relevent for weight-updating + simulations. + :ivar kT: 1 kT in kJ/mol at the simulation temperature. + :ivar lambda_types: The types of lambda variables involved in expanded ensemble simulations, e.g. + :code:`fep_lambdas`, :code:`mass_lambdas`, :code:`coul_lambdas`, etc. + :ivar n_tot: The total number of states for all replicas. + :ivar n_sub: The numbmer of states for each replica. The current implementation assumes homogenous replicas. + :ivar state_ranges: A list of list of state indices for each replica. + :ivar equil: A list of times it took to equilibrated the weights for different replicas. This + list is initialized with a list of -1, where -1 means that the weights haven't been equilibrated. Also, + a value of 0 means that the simulation is a fixed-weight simulation. + :ivar n_rejected: The number of proposed exchanges that have been rejected. Updated by :obj:`.accept_or_reject`. + :ivar n_swap_attempts: The number of swaps attempted so far. This does not include the cases + where there is no swappable pair. Updated by :obj:`.get_swapping_pattern`. + :ivar n_emtpy_swappable: The number of times when there was no swappable pair. + :ivar rep_trajs: The replica-space trajectories of all configurations. + :ivar configs: A list that thows the current configuration index that each replica is sampling. + :ivar g_vecs: The time series of the (processed) whole-range alchemical weights. If no weight combination is + applied, this list will just be a list of :code:`None`'s. + :ivar df_data_type: The type of data (either :math:`u_{nk}` or :math:`dH/dλ`) that will be used for + free energy calculations if :code:`df_method` is :code:`True`. + :ivar modify_coords_fn: The function (callable) in the external module (specified as :code:`modify_coords` in + the input YAML file) for modifying coordinates at exchanges. + """ + + def __init__(self, yaml_file, analysis=False): + self.yaml = yaml_file + self.set_params(analysis) + + def set_params(self, analysis): + """ + Sets up or reads in the user-defined parameters from a yaml file and an MDP template. + This function is called to instantiate the class in the :code:`__init__` function of + class. Specifically, it does the following: + + 1. Sets up constants. + 2. Reads in parameters from a YAML file. + 3. Handles YAML parameters. + 4. Checks if the parameters in the YAML file are well-defined. + 5. Reformats the input MDP file to replace all hyphens with underscores. + 6. Reads in parameters from the MDP template. + + After instantiation, the class instance will have attributes corresponding to + each of the parameters specified in the YAML file. For a full list of the parameters that + can be specified in the YAML file, please refer to :ref:`doc_parameters`. + + :param yaml_file: The file name of the YAML file for specifying the parameters for EEXE. + :type yaml_file: str + :param analysis: Whether the instantiation of the class is for data analysis of EEXE simulations. + The default is :code:`False` + :type analysis: bool + + :raises ParameterError: + + - If a required parameter is not specified in the YAML file. + - If a specified parameter is not recognizable. + - If a specified weight combining scheme is not available. + - If a specified acceptance scheme is not available. + - If a specified free energy estimator is not available. + - If a specified method for error estimation is not available. + - If an integer parameter is not an integer. + - If a positive parameter is not positive. + - If a non-negative parameter is negative. + - If an invalid MDP file is detected. + """ + self.warnings = [] # Store warnings, if any. + + # Step 0: Set up constants + k = 1.380649e-23 + NA = 6.0221408e23 + + # Step 1: Read in parameters from the YAML file. + with open(self.yaml) as f: + params = yaml.load(f, Loader=yaml.FullLoader) + + for attr in params: + setattr(self, attr, params[attr]) + + # Step 2: Handle the compulsory YAML parameters + required_args = [ + "gmx_executable", + "gro", + "top", + "mdp", + "n_sim", + "n_iter", + "s", + ] + for i in required_args: + if hasattr(self, i) is False or getattr(self, i) is None: + raise ParameterError( + f"Required parameter '{i}' not specified in {self.yaml}." + ) # noqa: F405 + + # Step 3: Handle the optional YAML parameters + # Key: Optional argument; Value: Default value + optional_args = { + "add_swappables": None, + "modify_coords": None, + "nst_sim": None, + "proposal": 'exhaustive', + "acceptance": "metropolis", + "w_combine": None, + "rmse_cutoff": np.inf, + "N_cutoff": 1000, + "n_ex": 'N^3', # only active for multiple swaps. + "verbose": True, + "mdp_args": None, + "grompp_args": None, + "runtime_args": None, + "n_ckpt": 100, + "rm_cpt": True, + "msm": False, + "free_energy": False, + "subsampling_avg": False, + "df_spacing": 1, + "df_ref": None, + "df_method": "MBAR", + "err_method": "propagate", + "n_bootstrap": 50, + "seed": None, + } + for i in optional_args: + if hasattr(self, i) is False or getattr(self, i) is None: + setattr(self, i, optional_args[i]) + + # all_args: Arguments that can be specified in the YAML file. + all_args = required_args + list(optional_args.keys()) + for i in params: + if i not in all_args: + self.warnings.append(f'Warning: Parameter "{i}" specified in the input YAML file is not recognizable.') + + # Step 4: Check if the parameters in the YAML file are well-defined + if self.proposal not in [None, 'single', 'neighboring', 'exhaustive', 'multiple']: + raise ParameterError("The specified proposal scheme is not available. Available options include 'single', 'neighboring', 'exhaustive', and 'multiple'.") # noqa: E501 + + if self.acceptance not in [None, 'same-state', 'same_state', 'metropolis', 'metropolis-eq', 'metropolis_eq']: + raise ParameterError("The specified acceptance scheme is not available. Available options include 'same-state', 'metropolis', and 'metropolis-eq'.") # noqa: E501 + + if self.w_combine not in [None, 'final', 'avg']: + raise ParameterError("The specified type of weight to be combined is not available. Available options include 'final' and 'avg'.") # noqa: E501 + + if self.df_method not in [None, 'TI', 'BAR', 'MBAR']: + raise ParameterError("The specified free energy estimator is not available. Available options include 'TI', 'BAR', and 'MBAR'.") # noqa: E501 + + if self.err_method not in [None, 'propagate', 'bootstrap']: + raise ParameterError("The specified method for error estimation is not available. Available options include 'propagate', and 'bootstrap'.") # noqa: E501 + + if self.w_combine is not None and self.rmse_cutoff == np.inf: + self.warnings.append('Warning: We recommend setting rmse_cutoff when w_combine is used.') + + if self.rmse_cutoff != np.inf: + if type(self.rmse_cutoff) is not float and type(self.rmse_cutoff) is not int: + raise ParameterError("The parameter 'rmse_cutoff' should be a float.") + + params_int = ['n_sim', 'n_iter', 's', 'N_cutoff', 'df_spacing', 'n_ckpt', 'n_bootstrap'] # integer parameters # noqa: E501 + if self.nst_sim is not None: + params_int.append('nst_sim') + if self.n_ex != 'N^3': # no need to add "and self.proposal == 'multiple' since if multiple swaps are not used, n_ex=1" # noqa: E501 + params_int.append('n_ex') + if self.seed is not None: + params_int.append('seed') + for i in params_int: + if type(getattr(self, i)) != int: + raise ParameterError(f"The parameter '{i}' should be an integer.") + + params_pos = ['n_sim', 'n_iter', 'n_ckpt', 'df_spacing', 'n_bootstrap', 'rmse_cutoff'] # positive parameters + if self.nst_sim is not None: + params_pos.append('nst_sim') + for i in params_pos: + if getattr(self, i) <= 0: + raise ParameterError(f"The parameter '{i}' should be positive.") + + if self.n_ex != 'N^3' and self.n_ex < 0: + raise ParameterError("The parameter 'n_ex' should be non-negative.") + + if self.s < 0: + raise ParameterError("The parameter 's' should be non-negative.") + + if self.N_cutoff < 0 and self.N_cutoff != -1: + raise ParameterError("The parameter 'N_cutoff' should be non-negative unless no weight correction is needed, i.e. N_cutoff = -1.") # noqa: E501 + + params_str = ['gro', 'top', 'mdp', 'gmx_executable'] + # First check if self.gro and self.top are lists and check their lengths + check_files = ['gro', 'top'] # just for checking file types that can take multiple inputs + for i in check_files: + if isinstance(getattr(self, i), list): + params_str.remove(i) + if len(getattr(self, i)) != self.n_sim: + raise ParameterError(f"The number of the input {i.upper()} files must be the same as the number of replicas, if multiple are specified.") # noqa: E501 + if self.modify_coords is not None: + params_str.append('modify_coords') + for i in params_str: + if type(getattr(self, i)) != str: + raise ParameterError(f"The parameter '{i}' should be a string.") + + params_bool = ['verbose', 'rm_cpt', 'msm', 'free_energy', 'subsampling_avg'] + for i in params_bool: + if type(getattr(self, i)) != bool: + raise ParameterError(f"The parameter '{i}' should be a boolean variable.") + + params_list = ['add_swappables', 'df_ref'] + for i in params_list: + if getattr(self, i) is not None and not isinstance(getattr(self, i), list): + raise ParameterError(f"The parameter '{i}' should be a list.") + + params_dict = ['mdp_args', 'grompp_args', 'runtime_args'] + for i in params_dict: + if getattr(self, i) is not None and not isinstance(getattr(self, i), dict): + raise ParameterError(f"The parameter '{i}' should be a dictionary.") + + if self.add_swappables is not None: + if not isinstance(self.add_swappables, list): + raise ParameterError("The parameter 'add_swappables' should be a nested list.") + for sublist in self.add_swappables: + if not isinstance(sublist, list): + raise ParameterError("The parameter 'add_swappables' should be a nested list.") + for item in sublist: + if not isinstance(item, int) or item < 0: + raise ParameterError("Each number specified in 'add_swappables' should be a non-negative integer.") # noqa: E501 + + if self.mdp_args is not None: + for key in self.mdp_args.keys(): + if not isinstance(key, str): + raise ParameterError("All keys specified in 'mdp_args' should be strings.") + else: + if '-' in key: + raise ParameterError("Parameters specified in 'mdp_args' must use underscores in place of hyphens.") # noqa: E501 + for val_list in self.mdp_args.values(): + if len(val_list) != self.n_sim: + raise ParameterError("The number of values specified for each key in 'mdp_args' should be the same as the number of replicas.") # noqa: E501 + + # Step 5: Reformat the input MDP file to replace all hypens with underscores. + self.reformatted_mdp = EnsembleEXE.reformat_MDP(self.mdp) + + # Step 6: Read in parameters from the MDP template + self.template = gmx_parser.MDP(self.mdp) + self.dt = self.template["dt"] # ps + self.temp = self.template["ref_t"] + + if self.nst_sim is None: + self.nst_sim = self.template["nsteps"] + + if 'wl_scale' in self.template.keys(): + if self.template['wl_scale'] == []: + # If wl_scale in the MDP file is a blank (i.e. fixed weights), mdp['wl_scale'] will be an empty list. + # This is the only case where mdp['wl_scale'] is a numpy array. + self.fixed_weights = True + self.equilibrated_weights = [None for i in range(self.n_sim)] + else: + self.fixed_weights = False + self.equilibrated_weights = [[] for i in range(self.n_sim)] + self.updating_weights = [[] for i in range(self.n_sim)] + self.current_wl_delta = [0 for i in range(self.n_sim)] + else: + self.fixed_weights = True + self.equilibrated_weights = [None for i in range(self.n_sim)] + + if self.fixed_weights is True: + if self.N_cutoff != -1 or self.w_combine is not None: + self.warnings.append('Warning: The weight correction/weight combination method is specified but will not be used since the weights are fixed.') # noqa: E501 + # In the case that the warning is ignored, enforce the defaults. + self.N_cutoff = -1 + self.w_combine = None + + if 'lmc_seed' in self.template and self.template['lmc_seed'] != -1: + self.warnings.append('Warning: We recommend setting lmc_seed as -1 so the random seed is different for each iteration.') # noqa: E501 + + if 'gen_seed' in self.template and self.template['gen_seed'] != -1: + self.warnings.append('Warning: We recommend setting gen_seed as -1 so the random seed is different for each iteration.') # noqa: E501 + + if 'gen_vel' not in self.template or ('gen_vel' in self.template and self.template['gen_vel'] == 'no'): + self.warnings.append('Warning: We recommend generating new velocities for each iteration to avoid potential issues with detailed balance.') # noqa: E501 + + if self.nst_sim % self.template['nstlog'] != 0: + raise ParameterError( + 'The parameter "nstlog" must be a factor of the parameter "nst_sim" specified in the YAML file.') + + if self.nst_sim % self.template['nstdhdl'] != 0: + raise ParameterError( + 'The parameter "nstdhdl" must be a factor of the parameter "nst_sim" specified in the YAML file.') + + if self.template['nstexpanded'] % self.template['nstdhdl'] != 0: + raise ParameterError( + 'In EEXE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 + + if self.mdp_args is not None: + if 'lmc_seed' in self.mdp_args and -1 not in self.mdp_args['lmc_seed']: + self.warnings.append('Warning: We recommend setting lmc_seed as -1 so the random seed is different for each iteration.') # noqa: E501 + + if 'gen_seed' in self.mdp_args and -1 not in self.mdp_args['gen_seed']: + self.warnings.append('Warning: We recommend setting gen_seed as -1 so the random seed is different for each iteration.') # noqa: E501 + + if 'gen_vel' in self.mdp_args and 'no' in self.mdp_args['gen_vel']: + self.warnings.append('Warning: We recommend generating new velocities for each iteration to avoid potential issues with the detailed balance.') # noqa: E501 + + if 'nstlog' in self.mdp_args and sum(self.nst_sim % np.array(self.mdp_args['nstlog'])) != 0: + raise ParameterError( + 'The parameter "nstlog" must be a factor of the parameter "nst_sim" specified in the YAML file.') + + if 'nstdhdl' in self.mdp_args and sum(self.nst_sim % np.array(self.mdp_args['nstdhdl'])) != 0: + raise ParameterError( + 'The parameter "nstdhdl" must be a factor of the parameter "nst_sim" specified in the YAML file.') + + if 'nstexpanded' in self.mdp_args and 'nstdhdl' in self.mdp_args and sum(np.array(self.mdp_args['nstexpanded']) % np.array(self.mdp_args['nstdhdl'])) != 0: # noqa: E501 + raise ParameterError( + 'In EEXE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 + + if 'pull' in self.template and self.template['pull'] == 'yes': + pull_ncoords = self.template['pull_ncoords'] + self.set_ref_dist = [] + for i in range(pull_ncoords): + if self.template[f'pull_coord{i+1}_geometry'] == 'distance': + if self.template[f'pull_coord{i+1}_start'] == 'yes': + self.set_ref_dist.append(True) # starting from the second iteration, set pull_coord*_init. + if 'pull_nstxout' not in self.template: + self.warnings.append('A non-zero value should be specified for pull_nstxout if pull_coord*_start is set to yes.') # noqa: E501 + if self.template['pull_nstxout'] == 0: + self.warnings.append('A non-zero value should be specified for pull_nstxout if pull_coord*_start is set to yes.') # noqa: E501 + else: + self.set_ref_dist.append(False) # Here we assume that the user know what reference distance to use. # noqa: E501 + else: + self.set_ref_dist.append(False) # we only deal with distance restraints for now. + + # Step 7: Set up derived parameters + # 7-1. kT in kJ/mol + self.kT = k * NA * self.temp / 1000 # 1 kT in kJ/mol + + # 7-2. Figure out what types of lambda variables are involved + # Here is we possible lambda types in the order read by GROMACS, which is likely also the order when being printed to the log file. # noqa: E501 + # See https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gmxpreprocess/readir.cpp#L2543 + lambdas_types_all = ['fep_lambdas', 'mass_lambdas', 'coul_lambdas', 'vdw_lambdas', 'bonded_lambdas', 'restraint_lambdas', 'temperature_lambdas'] # noqa: E501 + self.lambda_types = [] # lambdas specified in the MDP file + for i in lambdas_types_all: + if i in self.template.keys(): # there shouldn't be parameters like "fep-lambdas" after reformatting the MDP file # noqa: E501 + self.lambda_types.append(i) + + # 7-3. Total # of states: n_tot = n_sub * n_sim - (n_overlap) * (n_sim - 1), where n_overlap = n_sub - s + self.n_tot = len(self.template[self.lambda_types[0]]) + + # 7-4. Number of states of each replica (assuming the same for all rep) + self.n_sub = self.n_tot - self.s * (self.n_sim - 1) + if self.n_sub < 1: + raise ParameterError( + f"There must be at least two states for each replica (current value: {self.n_sub}). The current specified configuration (n_tot={self.n_tot}, n_sim={self.n_sim}, s={self.s}) does not work for EEXE.") # noqa: E501 + + # 7-5. A list of sets of state indices + start_idx = [i * self.s for i in range(self.n_sim)] + self.state_ranges = [list(np.arange(i, i + self.n_sub)) for i in start_idx] + + # 7-6. A list of time it took to get the weights equilibrated + self.equil = [-1 for i in range(self.n_sim)] # -1 means unequilibrated + + # 7-7. Some variables for counting + self.n_rejected = 0 + self.n_swap_attempts = 0 + self.n_empty_swappable = 0 + + # 7-8. Replica space trajectories. For example, rep_trajs[0] = [0, 2, 3, 0, 1, ...] means + # that configuration 0 transitioned to replica 2, then 3, 0, 1, in iterations 1, 2, 3, 4, ..., + # respectively. The first element of rep_traj[i] should always be i. + self.rep_trajs = [[i] for i in range(self.n_sim)] + + # 7-9. configs shows the current configuration that each replica is sampling. + # For example, self.configs = [0, 2, 1, 3] means that configurations 0, 2, 1, and 3 are + # in replicas, 0, 1, 2, 3, respectively. This list will be constantly updated during the simulation. + self.configs = list(range(self.n_sim)) + + # 7-10. The time series of the (processed) whole-range alchemical weights + # If no weight combination is applied, self.g_vecs will just be a list of None's. + self.g_vecs = [] + + # 7-11. Data analysis + if self.df_method == 'MBAR': + self.df_data_type = 'u_nk' + else: + self.df_data_type = 'dhdl' + + # 7-12. External module for coordinate modification + if self.modify_coords is not None: + sys.path.append(os.getcwd()) + module = importlib.import_module(self.modify_coords) + self.modify_coords_fn = getattr(module, self.modify_coords) + else: + self.modify_coords_fn = None + + # Step 8. Check the executables + if analysis is False: + self.check_gmx_executable() + + def check_gmx_executable(self): + """ + Checks if the GROMACS executable can be used and gets its absolute path and version. + """ + try: + result = subprocess.run(['which', self.gmx_executable], capture_output=True, text=True, check=True) + self.gmx_path = result.stdout.strip() # this can be exactly the same as self.gmx_executable + + version_output = subprocess.run([self.gmx_path, "-version"], capture_output=True, text=True, check=True) + for line in version_output.stdout.splitlines(): + if "GROMACS version" in line: + self.gmx_version = line.split()[-1] + break + except subprocess.CalledProcessError: + print(f"{self.gmx_executable} is not available on this system.") + except Exception as e: + print(f"An error occurred:\n{e}") + + def print_params(self, params_analysis=False): + """ + Prints important parameters related to the EXEE simulation. + + Parameters + ---------- + params_analysis : bool, optional + If True, additional parameters related to data analysis will be printed. Default is False. + """ + if isinstance(self.gro, list): + gro_str = ', '.join(self.gro) + else: + gro_str = self.gro + + if isinstance(self.top, list): + top_str = ', '.join(self.top) + else: + top_str = self.top + + print("Important parameters of EXEE") + print("============================") + print(f"Python version: {sys.version}") + print(f"GROMACS executable: {self.gmx_path}") # we print the full path here + print(f"GROMACS version: {self.gmx_version}") + print(f"ensemble_md version: {ensemble_md.__version__}") + print(f'Simulation inputs: {gro_str}, {top_str}, {self.mdp}') + print(f"Verbose log file: {self.verbose}") + print(f"Proposal scheme: {self.proposal}") + print(f"Acceptance scheme for swapping simulations: {self.acceptance}") + print(f"Type of weights to be combined: {self.w_combine}") + print(f"Histogram cutoff: {self.N_cutoff}") + print(f"Number of replicas: {self.n_sim}") + print(f"Number of iterations: {self.n_iter}") + print(f"Number of attempted swaps in one exchange interval: {self.n_ex}") + print(f"Length of each replica: {self.dt * self.nst_sim} ps") + print(f"Frequency for checkpointing: {self.n_ckpt} iterations") + print(f"Total number of states: {self.n_tot}") + print(f"Additionally defined swappable states: {self.add_swappables}") + print(f"Additional grompp arguments: {self.grompp_args}") + print(f"Additional runtime arguments: {self.runtime_args}") + if self.mdp_args is not None and len(self.mdp_args.keys()) > 1: + print("MDP parameters differing across replicas:") + for i in self.mdp_args.keys(): + print(f" - {i}: {self.mdp_args[i]}") + else: + print(f"MDP parameters differing across replicas: {self.mdp_args}") + print("Alchemical ranges of each replica in EEXE:") + for i in range(self.n_sim): + print(f" - Replica {i}: States {self.state_ranges[i]}") + + if params_analysis is True: + print() + print(f"Whether to build Markov state models and perform relevant analysis: {self.msm}") + print(f"Whether to perform free energy calculations: {self.free_energy}") + print(f"The step to used in subsampling the DHDL data in free energy calculations, if any: {self.df_spacing}") # noqa: E501 + print(f"The chosen free energy estimator for free energy calculations, if any: {self.df_method}") + print(f"The method for estimating the uncertainty of free energies in free energy calculations, if any: {self.err_method}") # noqa: E501 + print(f"The number of bootstrap iterations in the boostrapping method, if used: {self.n_bootstrap}") + print(f"The random seed to use in bootstrapping, if used: {self.seed}") + + if self.reformatted_mdp is True: + print("Note that the input MDP file has been reformatted by replacing hypens with underscores. The original mdp file has been renamed as *backup.mdp.") # noqa: E501 + + @staticmethod + def reformat_MDP(mdp_file): + """ + Reformats the input MDP file so that all hyphens in the parameter names are replaced by underscores. + If the input MDP file contains hyphens in its parameter names, the class attribue :code:`self.reformatted` + will be set to :code:`True`. In this case, the new MDP object with reformatted parameter names will be + written to the original file path of the file, while the original file will be renamed with a + :code:`_backup` suffix. If the input MDP file is not reformatted, the function sets + the class attribute :code:`self.reformatted_mdp` to :code:`False`. + + Returns + ------- + reformatted : bool + Whether the file was reformatted. + """ + params = gmx_parser.MDP(mdp_file) + + odict = OrderedDict([(k.replace('-', '_'), v) for k, v in params.items()]) + params_new = gmx_parser.MDP(None, **odict) + reformatted = None + + if rank == 0: + if params_new.keys() == params.keys(): + reformatted = False # no need to reformat the file + else: + reformatted = True + new_name = mdp_file.split('.mdp')[0] + '_backup.mdp' + os.rename(mdp_file, new_name) + params_new.write(mdp_file) + + return reformatted + + def initialize_MDP(self, idx): + """ + Initializes the MDP object for generating MDP files for a replica based on the MDP template. + This function should be called only for generating MDP files for the FIRST iteration + and it has nothing to do with whether the weights are fixed or equilibrating. + It is assumed that the MDP template has all the common parameters of all replicas. + + Parameters + ---------- + idx : int + Index of the simulation whose MDP parameters need to be initialized. + + Returns + ------- + MDP : :obj:`.gmx_parser.MDP` obj + An updated object of :obj:`.gmx_parser.MDP` that can be used to write MDP files. + """ + MDP = copy.deepcopy(self.template) + MDP["nsteps"] = self.nst_sim + + if self.mdp_args is not None: + for param in self.mdp_args.keys(): + MDP[param] = self.mdp_args[param][idx] + + start_idx = idx * self.s + for i in self.lambda_types: + MDP[i] = self.template[i][start_idx:start_idx + self.n_sub] + + if "init_lambda_weights" in self.template: + MDP["init_lambda_weights"] = self.template["init_lambda_weights"][start_idx:start_idx + self.n_sub] + + return MDP + + def get_ref_dist(self): + """ + Gets the reference distance(s) to use starting from the second iteration if distance restraint(s) are used. + Specifically, a reference distance determined here is the initial COM distance between the pull groups + in the input GRO file. This function initializes the attribute :code:`ref_dist`. + """ + if hasattr(self, 'set_ref_dist'): + self.ref_dist = [] + pullx_file = 'sim_0/iteration_0/pullx.xvg' + headers = get_headers(pullx_file) + for i in range(len(self.set_ref_dist)): + if self.set_ref_dist[i] is True: + dist = list(extract_dataframe(pullx_file, headers=headers)[f'{i+1}'])[0] + self.ref_dist.append(dist) + + def update_MDP(self, new_template, sim_idx, iter_idx, states, wl_delta, weights, counts=None): + """ + Updates the MDP file for a new iteration based on the new MDP template coming from the previous iteration. + Note that if the weights got equilibrated in the previous iteration, then the weights will be fixed + at these equilibrated values for all the following iterations. + + Parameters + ---------- + new_template : str + The new MDP template file. Typically the MDP file of the previous iteration. + sim_idx : int + The index of the simulation whose MDP parameters need to be updated. + iter_idx : int + The index of the iteration to be performed later. + states : list + A list of last sampled states of all simulaitons in the previous iteration. + wl_delta : list + A list of final Wang-Landau incrementors of all simulations. + weights : list + A list of lists final weights of all simulations. + counts : list + A list of lists final counts of all simulations. If the value is :code:`None`, + then the MDP parameter :code:`init-histogram-counts` won't be specified in the next iteration. + Note that not all the GROMACS versions have the MDP parameter :code:`init-histogram-counts` available, + in which case one should always pass :code:`None`, or set :code:`-maxwarn` in :code:`grompp_args` + in the input YAML file. + + Return + ------ + MDP : :obj:`.gmx_parser.MDP` obj + An updated object of :obj:`.gmx_parser.MDP` that can be used to write MDP files. + """ + new_template = gmx_parser.MDP(new_template) # turn into a gmx_parser.MDP object + MDP = copy.deepcopy(new_template) + MDP["tinit"] = self.nst_sim * self.dt * iter_idx + MDP["nsteps"] = self.nst_sim + MDP["init_lambda_state"] = (states[sim_idx] - sim_idx * self.s) # 2nd term is for shifting from the global to local index. # noqa: E501 + MDP["init_wl_delta"] = wl_delta[sim_idx] + MDP["init_lambda_weights"] = weights[sim_idx] + if counts is not None: + MDP["init_histogram_counts"] = counts[sim_idx] + + if self.equil[sim_idx] == -1: # the weights haven't been equilibrated + MDP["init_wl_delta"] = wl_delta[sim_idx] + else: + MDP["lmc_stats"] = "no" + MDP["wl_scale"] = "" + MDP["wl_ratio"] = "" + MDP["init_wl_delta"] = "" + MDP["lmc_weights_equil"] = "" + MDP["weight_equil_wl_delta"] = "" + + # Here we deal with the distance restraint in the pull code, if any. + if hasattr(self, 'ref_dist'): + for i in range(len(self.ref_dist)): + MDP[f'pull_coord{i+1}_start'] = "no" + MDP[f'pull_coord{i+1}_init'] = self.ref_dist[i] + + return MDP + + def extract_final_dhdl_info(self, dhdl_files): + """ + For all the replica simulations, finds the last sampled state + and print the corresponding lambda values from a dhdl file. + + Parameters + ---------- + dhdl_files : list + A list of file paths to GROMACS DHDL files of different replicas. + + Returns + ------- + states : list + A list of the global indices of the last sampled states of all simulaitons. + """ + states = [] + print("\nBelow are the final states being visited:") + for i in range(len(dhdl_files)): + headers = get_headers(dhdl_files[i]) + state_local = list(extract_dataframe(dhdl_files[i], headers=headers)['Thermodynamic state'])[-1] # local index of the last state # noqa: E501 + state_global = state_local + i * self.s # global index of the last state + states.append(state_global) # append the global index + print(f" Simulation {i}: Global state {states[i]}") + print('\n', end='') + + return states + + def extract_final_log_info(self, log_files): + """ + Extracts the following information for each replica simulation from a given list of GROMACS LOG files: + + - The final Wang-Landau incrementors. + - The final lists of weights. + - The final lists of counts. + - Whether the weights were equilibrated in the simulations. + + Parameters + ---------- + log_files : list + A list of file paths to GROMACS LOG files of different replicas. + + Returns + ------- + wl_delta : list + A list of final Wang-Landau incrementors of all simulations. + weights : list + A list of lists of final weights of all simulations. + counts : list + A list of lists of final counts of all simulations. + """ + wl_delta, weights, counts = [], [], [] + + # Find the final Wang-Landau incrementors and weights + for i in range(len(log_files)): + if self.verbose: + print(f'Parsing {log_files[i]} ...') + result = gmx_parser.parse_log(log_files[i]) # weights, counts, wl_delta, equil_time + weights.append(result[0][-1]) + counts.append(result[1]) + wl_delta.append(result[2]) + + # In Case 3 described in the docstring of parse_log (fixed-weights), + # result[3] will be 0 but it will never be passed to self.equil[i] + # because once self.equil[i] is not -1, we stop updating. This way, we can keep + # the time when the weights get equilibrated all the way. + if self.equil[i] == -1: + # At this point self.equil is the equilibration status BEFORE the last iteration + # while result[3] is the equilibration status AFTER finishing the last iteraiton. + # For any replicas where weights are still equilibrating (i.e. self.equil[j] == -1) + # we update its equilibration status. + self.equil[i] = result[3] + + if self.equilibrated_weights[i] == []: + if self.equil[i] != -1 and self.equil[i] != 0: + # self.equil[i] != -1: uneqilibrated + # self.equil[i] != 0: fixed-weight simulation + self.equilibrated_weights[i] = result[0][-1] + + return wl_delta, weights, counts + + @staticmethod + def identify_swappable_pairs(states, state_ranges, neighbor_exchange, add_swappables=None): + """ + Identify swappable pairs. By definition, a pair of simulation is considered swappable only if + their last sampled states are in the alchemical ranges of both simulations. This is required + to ensure that the values of involved ΔH and Δg can always be looked up from the DHDL and LOG files. + This also automatically guarantee that the simulations to be swapped have overlapping alchemical ranges. + + Parameters + ---------- + states : list + A list of the global indices of the last sampled states of all simulations. This list can be + generated by the :obj:`.extract_final_dhdl_info` method. Notably, the input list should not be + a list that has been updated/modified by :obj:`get_swapping_pattern`, or the result will be incorrect. + state_ranges : list of lists + A list of state indies for all replicas. The input list can be a list updated by + :obj:`.get_swapping_pattern`, especially in the case where there is a need to re-identify the + swappable pairs after an attempted swap is accepted. + neighbor_exchange : bool + Whether to exchange only between neighboring replicas. + add_swappables: list + A list of lists that additionally consider states (in global indices) that can be swapped. + For example, :code:`add_swappables=[[4, 5], [14, 15]]` means that if a replica samples state 4, + it can be swapped with another replica that samples state 5 and vice versa. The same logic applies + to states 14 and 15. + + Returns + ------- + swappables : list + A list of tuples representing the simulations that can be swapped. + """ + n_sim = len(states) + sim_idx = list(range(n_sim)) + all_pairs = list(combinations(sim_idx, 2)) + + # First, we identify pairs of replicas with overlapping ranges + swappables = [i for i in all_pairs if set(state_ranges[i[0]]).intersection(set(state_ranges[i[1]])) != set()] # noqa: E501 + + # Then, from these pairs, we exclude the ones whose the last sampled states are not present in both alchemical ranges # noqa: E501 + # In this case, U^i_n, U_^j_m, g^i_n, and g_^j_m are unknown and the acceptance cannot be calculated. + swappables = [i for i in swappables if states[i[0]] in state_ranges[i[1]] and states[i[1]] in state_ranges[i[0]]] # noqa: E501 + + # Expand the definition of swappable pairs when add_swappables is specified + if add_swappables is not None: + all_paired_states = [[states[p[0]], states[p[1]]] for p in all_pairs] + for i in all_paired_states: + if i in add_swappables: + pair = all_pairs[all_paired_states.index(i)] + if pair not in swappables: + swappables.append(pair) + + if neighbor_exchange is True: + print('Note: One neighboring swap will be proposed.') + swappables = [i for i in swappables if np.abs(i[0] - i[1]) == 1] + + return swappables + + @staticmethod + def propose_swap(swappables): + """ + Proposes a swap of coordinates between replicas by drawing samples from the swappable pairs. + + Parameters + ---------- + swappables : list + A list of tuples representing the simulations that can be swapped. + + Return + ------ + swap : tuple or an empty list + A tuple of simulation indices to be swapped. If there is no swappable pair, + an empty list will be returned. + """ + try: + swap = random.choices(swappables, k=1)[0] + except IndexError: # no swappable pairs + swap = [] + + return swap + + def get_swapping_pattern(self, dhdl_files, states, weights): + """ + Generates a list (:code:`swap_pattern`) that represents how the configurations should be swapped in the + next iteration. The indices of the output list correspond to the simulation/replica indices, and the + values represent the configuration indices in the corresponding simulation/replica. For example, if the + swapping pattern is :code:`[0, 2, 1, 3]`, it means that in the next iteration, replicas 0, 1, 2, 3 should + sample configurations 0, 2, 1, 3, respectively, where configurations 0, 1, 2, 3 here are defined as whatever + configurations are in replicas 0, 1, 2, 3 in the CURRENT iteration (not iteration 0), respectively. + + Notably, when this function is called (e.g. once every iteration in an EEXE simulation), the output + list :code:`swap_pattern` is always initialized as :code:`[0, 1, 2, 3, ...]` and gets updated once every + attempted swap. This is different from the attribute :code:`configs`, which is only initialized at the + very beginning of the entire EEXE simulation (iteration 0), though :code:`configs` also gets updated with + :code:`swap_pattern` once every attempted swap in this function. + + Parameters + ---------- + dhdl_files : list + A list of DHDL files. The indicies in the DHDL filenames shouuld be in an ascending order, e.g. + :code:`[dhdl_0.xvg, dhdl_1.xvg, ..., dhdl_N.xvg]`. + states : list + A list of last sampled states (in global indices) of ALL simulaitons. :code:`states[i]=j` means that + the configuration in replica :code:`i` is at state :code:`j` at the time when the exchange is performed. + This list can be generated :obj:`.extract_final_dhdl_info`. + weights : list + A list of lists of final weights of ALL simulations. :code:`weights[i]` corresponds to the list of weights + used in replica :code:`i`. The list can be generated by :obj:`.extract_final_log_info`. + + Returns + ------- + swap_pattern : list + A list showing the configuration of replicas after swapping. + swap_list : list + A list of tuples showing the accepted swaps. + """ + swap_list = [] + if self.proposal != 'multiple': + if self.proposal == 'exhaustive': + n_ex = int(np.floor(self.n_sim / 2)) # This is the maximum, not necessarily the number that will always be reached. # noqa + n_ex_exhaustive = 0 # The actual number of swaps atttempted. + else: + n_ex = 1 # single swap or neighboring swap + else: + # multiple swaps + if self.n_ex == 'N^3': + n_ex = self.n_tot ** 3 + else: + n_ex = self.n_ex + + shifts = list(self.s * np.arange(self.n_sim)) + swap_pattern = list(range(self.n_sim)) # Can be regarded as the indices of DHDL files/configurations + state_ranges = copy.deepcopy(self.state_ranges) + states_copy = copy.deepcopy(states) # only for re-identifying swappable pairs given updated state_ranges + swappables = EnsembleEXE.identify_swappable_pairs(states, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 + + # Note that if there is only 1 swappable pair, then it will still be the only swappable pair + # after an attempted swap is accepted. Therefore, there is no need to perform multiple swaps or re-identify + # the new set of swappable pairs. In this case, we set n_ex back to 1. + if len(swappables) == 1: + if n_ex > 1: + n_ex = 1 # n_ex is set back to 1 since there is only 1 swappable pair. + + print(f"Swappable pairs: {swappables}") + for i in range(n_ex): + # Update the list of swappable pairs starting from the 2nd attempted swap for the exhaustive swap method. + if self.proposal == 'exhaustive' and i >= 1: + # Note that this should be done regardless of the acceptance/rejection of the previously drawn pairs. + # Also note that at this point, swap is still the last attempted swap. + swappables = [i for i in swappables if set(i).intersection(set(swap)) == set()] # noqa: F821 + print(f'\nRemaining swappable pairs: {swappables}') + + if len(swappables) == 0 and self.proposal == 'exhaustive': + # This should only happen when the method of exhaustive swaps is used. + if i == 0: + self.n_empty_swappable += 1 + break + else: + self.n_swap_attempts += 1 + if self.proposal == 'exhaustive': + n_ex_exhaustive += 1 + + swap = EnsembleEXE.propose_swap(swappables) + print(f'\nProposed swap: {swap}') + if swap == []: + self.n_empty_swappable += 1 + print('No swap is proposed because there is no swappable pair at all.') + break # no need to re-identify swappable pairs and draw new samples + else: + if self.verbose is True and self.proposal != 'exhaustive': + print(f'A swap ({i + 1}/{n_ex}) is proposed between the configurations of Simulation {swap[0]} (state {states[swap[0]]}) and Simulation {swap[1]} (state {states[swap[1]]}) ...') # noqa: E501 + + if self.modify_coords_fn is not None: + swap_bool = True # always accept the move + else: + # Calculate the acceptance ratio and decide whether to accept the swap. + prob_acc = self.calc_prob_acc(swap, dhdl_files, states, shifts, weights) + swap_bool = self.accept_or_reject(prob_acc) + + # Theoretically, in an EEXE simulation, we could either choose to swap configurations (via + # swapping GRO files) or replicas (via swapping MDP files). In ensemble_md package, we chose the + # former when implementing the EEXE algorithm. Specifically, in the CLI `run_EEXE`, `swap_pattern` + # is used to swap the GRO files. Therefore, when an attempted swap is accetped and `swap_pattern` + # is updated, we also need to update the variables `shifts`, `weights`, `dhdl_files`, + # `state_ranges`, `self.configs` but not anything else. Otherwise, incorrect results will be + # produced. To better understand this, one can refer to our unit test for get_swapping_pattern + # and calc_prob_acc, set checkpoints and examine why the variables should/should not be updated. + + if swap_bool is True: + swap_list.append(swap) + # The assignments need to be done at the same time in just one line. + # states[swap[0]], states[swap[1]] = states[swap[1]], states[swap[0]] + shifts[swap[0]], shifts[swap[1]] = shifts[swap[1]], shifts[swap[0]] + weights[swap[0]], weights[swap[1]] = weights[swap[1]], weights[swap[0]] + dhdl_files[swap[0]], dhdl_files[swap[1]] = dhdl_files[swap[1]], dhdl_files[swap[0]] + swap_pattern[swap[0]], swap_pattern[swap[1]] = swap_pattern[swap[1]], swap_pattern[swap[0]] + state_ranges[swap[0]], state_ranges[swap[1]] = state_ranges[swap[1]], state_ranges[swap[0]] + self.configs[swap[0]], self.configs[swap[1]] = self.configs[swap[1]], self.configs[swap[0]] + + if n_ex > 1 and self.proposal == 'multiple': # must be multiple swaps + # After state_ranges have been updated, we re-identify the swappable pairs. + # Notably, states_copy (instead of states) should be used. (They could be different.) + swappables = EnsembleEXE.identify_swappable_pairs(states_copy, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 + print(f" New swappable pairs: {swappables}") + else: + # In this case, there is no need to update the swappables + pass + + print(f' Current list of configurations: {self.configs}') + + if self.verbose is False: + print(f'\n{n_ex} swap(s) have been proposed.') + print(f'\nThe finally adopted swap pattern: {swap_pattern}') + print(f'The list of configurations sampled in each replica in the next iteration: {self.configs}') + + # Update the replica-space trajectories + for i in range(self.n_sim): + self.rep_trajs[i].append(self.configs.index(i)) + + return swap_pattern, swap_list + + def calc_prob_acc(self, swap, dhdl_files, states, shifts, weights): + """ + Calculates the acceptance ratio given the Monte Carlo scheme for swapping the simulations. + + Parameters + ---------- + swap : tuple + A tuple of indices corresponding to the simulations to be swapped. + dhdl_files : list + A list of DHDL files, e.g. :code:`dhdl_files = ['dhdl_2.xvg', 'dhdl_1.xvg', 'dhdl_0.xvg', 'dhdl_3.xvg']` + means that configurations 2, 1, 0, and 3 are now in replicas 0, 1, 2, 3. This can happen in multiple swaps + when a previous swap between configurations 0 and 2 has just been accepted. Otherwise, the list of + filenames should always be in the ascending order, e.g. :code:`['dhdl_0.xvg', 'dhdl_1.xvg', 'dhdl_2.xvg', + dhdl_3.xvg]`. + states : list + A list of last sampled states (in global indices) in the DHDL files corresponding to configurations 0, 1, + 2, ... (e.g. :code:`dhdl_0.xvg`, :code:`dhdl_1.xvg`, :code:`dhdl_2.xvg`, ...) + This list can be generated by :obj:`.extract_final_dhdl_info`. + shifts : list + A list of state shifts for converting global state indices to the local ones. Specifically, :code:`states` + substracted by :code:`shifts` should be the local state indices of the last sampled states + in :code:`dhdl_files[0]`, :code:`dhdl_files[1]`, ... (which importantly, are not necessarily + :code:`dhdl_0.xvg`, :code:`dhdl_1.xvg`, ...). If multiple swaps are not used, then + this should always be :code:`list(EEXE.s * np.arange(EEXE.n_sim))`. + weights : list + A list of lists of final weights of ALL simulations. Typiecally generated by + :obj:`.extract_final_log_info`. :code:`weights[i]` and :code:`dhdl_files[i]` should correspond to + the same configuration. + + Returns + ------- + prob_acc : float + The acceptance ratio. + """ + if self.acceptance == "same_state" or self.acceptance == "same-state": + if states[swap[0]] == states[swap[1]]: # same state, swap! + prob_acc = 1 # This must lead to an output of swap_bool = True from the function accept_or_reject + else: + prob_acc = 0 # This must lead to an output of swap_bool = False from the function accept_or_reject + + else: # i.e. metropolis-eq or metropolis, which both require the calculation of dU + # Now we calculate dU + if self.verbose is True: + print(" Proposing a move from (x^i_m, x^j_n) to (x^i_n, x^j_m) ...") + f0, f1 = dhdl_files[swap[0]], dhdl_files[swap[1]] + h0, h1 = get_headers(f0), get_headers(f1) + data_0, data_1 = ( + extract_dataframe(f0, headers=h0).iloc[-1], + extract_dataframe(f1, headers=h1).iloc[-1], + ) + + # \Delta H to all states at the last time frame + # Notably, the can be regarded as H for each state since the reference state will have a value of 0 anyway. + dhdl_0 = data_0[-self.n_sub:] + dhdl_1 = data_1[-self.n_sub:] + + # Old local index, which will only be used in "metropolis" + old_state_0 = states[swap[0]] - shifts[swap[0]] + old_state_1 = states[swap[1]] - shifts[swap[1]] + + # New local index. Note that states are global indices, so we shift them back to the local ones + new_state_0 = states[swap[1]] - shifts[swap[0]] # new state index (local index in simulation swap[0]) + new_state_1 = states[swap[0]] - shifts[swap[1]] # new state index (local index in simulation swap[1]) + + dU_0 = (dhdl_0[new_state_0] - dhdl_0[old_state_0]) / self.kT # U^{i}_{n} - U^{i}_{m}, i.e. \Delta U (kT) to the new state # noqa: E501 + dU_1 = (dhdl_1[new_state_1] - dhdl_1[old_state_1]) / self.kT # U^{j}_{m} - U^{j}_{n}, i.e. \Delta U (kT) to the new state # noqa: E501 + dU = dU_0 + dU_1 + if self.verbose is True: + print( + f" U^i_n - U^i_m = {dU_0:.2f} kT, U^j_m - U^j_n = {dU_1:.2f} kT, Total dU: {dU:.2f} kT" + ) + + if self.acceptance == "metropolis_eq" or self.acceptance == "metropolis-eq": + prob_acc = min(1, np.exp(-dU)) + else: # must be 'metropolis', which consider lambda weights as well + g0 = weights[swap[0]] + g1 = weights[swap[1]] + dg_0 = g0[new_state_0] - g0[old_state_0] # g^{i}_{n} - g^{i}_{m} + dg_1 = g1[new_state_1] - g1[old_state_1] # g^{j}_{m} - g^{j}_{n} + dg = dg_0 + dg_1 # kT + + # Note that simulations with different lambda ranges would have different references + # so g^{i}_{n} - g^{j}_{n} or g^{j}_{m} - g^{i}_{m} wouldn't not make sense. + # We therefore print g^i_n - g^i_m and g^j_m - g^j_n instead even if they are less interesting. + if self.verbose is True: + print( + f" g^i_n - g^i_m = {dg_0:.2f} kT, g^j_m - g^j_n = {dg_1:.2f} kT, Total dg: {dg:.2f} kT" + ) + + prob_acc = min(1, np.exp(-dU + dg)) + return prob_acc + + def accept_or_reject(self, prob_acc): + """ + Returns a boolean variable indiciating whether the proposed swap should be acceepted given the acceptance rate. + + Parameters + ---------- + prob_acc : float + The acceptance rate. + + Returns + ------- + swap_bool : bool + A boolean variable indicating whether the swap should be accepted. + """ + if prob_acc == 0: + swap_bool = False + self.n_rejected += 1 + if self.verbose is True: + print(" Swap rejected! ", end="", flush=True) + else: + rand = random.random() + if self.verbose is True: + print( + f" Acceptance rate: {prob_acc:.3f} / Random number drawn: {rand:.3f}" + ) + if rand < prob_acc: + swap_bool = True + if self.verbose is True: + print(" Swap accepted! ") + else: + swap_bool = False + self.n_rejected += 1 + if self.verbose is True: + print(" Swap rejected! ") + return swap_bool + + def weight_correction(self, weights, counts): + """ + Corrects the lambda weights based on the histogram counts. Namely, + :math:`g_k' = g_k + ln(N_{k-1}/N_k)`, where :math:`g_k` and :math:`g_k'` + are the lambda weight after and before the correction, respectively. + Notably, in any of the following situations, we don't do any correction. + + - Either :math:`N_{k-1}` or :math:`N_k` is 0. + - Either :math:`N_{k-1}` or :math:`N_k` is smaller than the histogram cutoff. + + Parameters + ---------- + weights : list + A list of lists of weights (of ALL simulations) to be corrected. + counts : list + A list of lists of counts (of ALL simulations). + + Return + ------ + weights : list + An updated list of lists of corected weights. + """ + if self.verbose is True: + print("\nPerforming weight correction for the lambda weights ...") + else: + print("\nPerforming weight correction for the lambda weights ...", end="") + + for i in range(len(weights)): # loop over the replicas + if self.verbose is True: + print(f" Counts of rep {i}:\t\t{counts[i]}") + print(f' Original weights of rep {i}:\t{[float(f"{k:.3f}") for k in weights[i]]}') + + for j in range(1, len(weights[i])): # loop over the alchemical states + if counts[i][j - 1] != 0 and counts[i][j - 1] != 0: + if np.min([counts[i][j - 1], counts[i][j]]) > self.N_cutoff: + weights[i][j] += np.log(counts[i][j - 1] / counts[i][j]) + + if self.verbose is True: + print(f' Corrected weights of rep {i}:\t{[float(f"{k:.3f}") for k in weights[i]]}\n') + + if self.verbose is False: + print(' Done') + + return weights + + def get_averaged_weights(self, log_files): + """ + For each replica, calculate the averaged weights (and the associated error) from the time series + of the weights since the previous update of the Wang-Landau incrementor. + + Parameters + ---------- + log_files : list + A list of file paths to GROMACS LOG files of different replicas. + + Returned + -------- + weights_avg : list + A list of lists of weights averaged since the last update of the Wang-Landau + incrementor. The length of the list should be the number of replicas. + weights_err : list + A list of lists of errors corresponding to the values in :code:`weights_avg`. + """ + for i in range(self.n_sim): + weights, _, wl_delta, _ = gmx_parser.parse_log(log_files[i]) + if self.current_wl_delta[i] == wl_delta: + self.updating_weights[i] += weights # expand the list + else: + self.current_wl_delta[i] = wl_delta + self.updating_weights[i] = weights + + # shape of self.updating_weights is (n_sim, n_points, n_states), but n_points can be different + # for different replicas, which will error out np.mean(self.updating_weights, axis=1) + weights_avg = [np.mean(self.updating_weights[i], axis=0).tolist() for i in range(self.n_sim)] + weights_err = [] + for i in range(self.n_sim): + if len(self.updating_weights[i]) == 1: # this would lead to a RunTime Warning and nan + weights_err.append([0] * self.n_sub) # in `weighted_mean``, a simple average will be returned. + else: + weights_err.append(np.std(self.updating_weights[i], axis=0, ddof=1).tolist()) + + return weights_avg, weights_err + + def prepare_weights(self, weights_avg, weights_final): + """ + Prepared weights to be combined by the function :code:`combine_weights`. + For each replica, the RMSE between the averaged weights and the final weights is calculated. If the + maximum of the RMSEs of all replicas is smaller than the cutoff specified in the input YAML file + (the parameter :code:`rmse_cutoff`), either final weights or time-averaged weights will be used + (depending on the value of the parameter :code:`w_combine`). Otherwise, :code:`None` will be returned, + which will lead to deactivation of weight combination in the CLI :code:`run_EEXE`. + + Parameters + ---------- + weights_avg : list + A list of lists of weights averaged since the last update of the Wang-Landau + incrementor. The length of the list should be the number of replicas. + weights_final : list + A list of lists of final weights of all simulations. The length of the list should + be the number of replicas. + + Returns + ------- + weights_output : list + A list of lists of weights to be combined. + """ + rmse_list = [utils.calc_rmse(weights_avg[i], weights_final[i]) for i in range(self.n_sim)] + rmse_str = ', '.join([f'{i:.2f}' for i in rmse_list]) + print(f'RMSE between the final weights and time-averaged weights for each replica: {rmse_str} kT') + if np.max(rmse_list) < self.rmse_cutoff: + # Weight combination will be activated + if self.w_combine == 'final': + weights_output = weights_final + elif self.w_combine == 'avg': + weights_output = weights_avg + else: + weights_output = None + + return weights_output + + def combine_weights(self, hist, weights, weights_err=None, print_values=True): + """ + Combine alchemical weights across multiple replicas and adjusts the histogram counts + corerspondingly. Note that if :code:`weights_err` is provided, inverse-variance weighting will be used. + Care must be taken since inverse-variance weighting can lead to slower + convergence if the provided errors are not accurate. (See :ref:`doc_w_schemes` for mor details.) + + Parameters + ---------- + hist : list + A list of lists of histogram counts of ALL simulations. + weights : list + A list of lists alchemical weights of ALL simulations. + weights_err : list, optional + A list of lists of errors corresponding to the values in :code:`weights`. + print_values : bool, optional + Whether to print the histograms and weights for each replica before and + after weight combinationfor each replica. + + Returns + ------- + hist_modified : list + A list of modified histogram counts of ALL simulations. + weights_modified : list + A list of modified Wang-Landau weights of ALL simulations. + g_vec : np.ndarray + An array of alchemical weights of the whole range of states. + """ + if print_values is True: + w = np.round(weights, decimals=3).tolist() # just for printing + print(' Original weights:') + for i in range(len(w)): + print(f' Rep {i}: {w[i]}') + print('\n Original histogram counts:') + for i in range(len(hist)): + print(f' Rep {i}: {hist[i]}') + + # Calculate adjacent weight differences and g_vec + dg_vec, N_ratio_vec = [], [] # alchemical weight differences and histogram count ratios for the whole range + dg_adjacent = [list(np.diff(weights[i])) for i in range(len(weights))] + # Suppress the specific warning here + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=RuntimeWarning) + N_ratio_adjacent = [list(np.array(hist[i][1:]) / np.array(hist[i][:-1])) for i in range(len(hist))] + + # Below we deal with the case where the sampling is poor or the WL incrementor just got updated such that + # the histogram counts are 0 for some states, in which case we simply skip histogram correction. + contains_nan = any(np.isnan(value) for sublist in N_ratio_adjacent for value in sublist) # can be caused by 0/0 # noqa: E501 + contains_inf = any(np.isinf(value) for sublist in N_ratio_adjacent for value in sublist) # can be caused by x/0, where x is a finite number # noqa: E501 + skip_hist_correction = contains_nan or contains_inf + if skip_hist_correction: + print('\n Histogram correction is skipped because the histogram counts are 0 for some states.') + + if weights_err is not None: + dg_adjacent_err = [[np.sqrt(weights_err[i][j] ** 2 + weights_err[i][j + 1] ** 2) for j in range(len(weights_err[i]) - 1)] for i in range(len(weights_err))] # noqa: E501 + + for i in range(self.n_tot - 1): + dg_list, dg_err_list, N_ratio_list = [], [], [] + for j in range(len(self.state_ranges)): + if i in self.state_ranges[j] and i + 1 in self.state_ranges[j]: + idx = self.state_ranges[j].index(i) + dg_list.append(dg_adjacent[j][idx]) + N_ratio_list.append(N_ratio_adjacent[j][idx]) + if weights_err is not None: + dg_err_list.append(dg_adjacent_err[j][idx]) + if weights_err is None: + dg_vec.append(np.mean(dg_list)) + else: + dg_vec.append(utils.weighted_mean(dg_list, dg_err_list)[0]) + N_ratio_vec.append(np.prod(N_ratio_list) ** (1 / len(N_ratio_list))) # geometric mean + dg_vec.insert(0, 0) + N_ratio_vec.insert(0, hist[0][0]) + g_vec = np.array([sum(dg_vec[:(i + 1)]) for i in range(len(dg_vec))]) + if skip_hist_correction is False: + # When skip_hist_correction is True, previous lines for calculating N_ratio_vec or N_ratio_list will + # still not error out so it's fine to not add the conditional statement like here, since we will + # have hist_modified = hist at the end anyway. However, if skip_hist_correction, things like + # int(np.nan) will lead to an error, so we put an if condition here. + N_vec = np.array([int(np.ceil(np.prod(N_ratio_vec[:(i + 1)]))) for i in range(len(N_ratio_vec))]) + + # Determine the vector of alchemical weights and histogram counts for each replica + weights_modified = np.zeros_like(weights) + for i in range(self.n_sim): + hist_modified = [] + if self.equil[i] == -1: # unequilibrated + weights_modified[i] = list(g_vec[i * self.s: i * self.s + self.n_sub] - g_vec[i * self.s: i * self.s + self.n_sub][0]) # noqa: E501 + else: + weights_modified[i] = self.equilibrated_weights[i] + if skip_hist_correction is False: + hist_modified = [list(N_vec[self.state_ranges[i]]) for i in range(self.n_sim)] + else: + hist_modified = hist + + if print_values is True: + w = np.round(weights_modified, decimals=3).tolist() # just for printing + print('\n Modified weights:') + for i in range(len(w)): + print(f' Rep {i}: {w[i]}') + if skip_hist_correction is False: + print('\n Modified histogram counts:') + for i in range(len(hist_modified)): + print(f' Rep {i}: {hist_modified[i]}') + + if self.verbose is False: + print(' DONE') + print(f'The alchemical weights of all states: \n {list(np.round(g_vec, decimals=3))}') + else: + print(f'\n The alchemical weights of all states: \n {list(np.round(g_vec, decimals=3))}') + + return hist_modified, weights_modified, g_vec + + def _run_grompp(self, n, swap_pattern): + """ + Prepares TPR files for the simulation ensemble using the GROMACS :code:`grompp` command. + + Parameters + ---------- + n : int + The iteration index (starting from 0). + swap_pattern : list + A list generated by :obj:`.get_swapping_pattern`. It represents how the replicas should be swapped. + """ + args_list = [] + for i in range(self.n_sim): + arguments = [self.gmx_executable, 'grompp'] + + # Input files + mdp = f"sim_{i}/iteration_{n}/{self.mdp.split('/')[-1]}" + if n == 0: + if isinstance(self.gro, list): + gro = f"{self.gro[i]}" + else: + gro = f"{self.gro}" + else: + gro = f"sim_{swap_pattern[i]}/iteration_{n-1}/confout.gro" # This effectively swap out GRO files + + if isinstance(self.top, list): + top = f"{self.top[i]}" + else: + top = f"{self.top}" + + # Add input file arguments + arguments.extend(["-f", mdp, "-c", gro, "-p", top]) + + # Add output file arguments + arguments.extend([ + "-o", f"sim_{i}/iteration_{n}/sys_EE.tpr", + "-po", f"sim_{i}/iteration_{n}/mdout.mdp" + ]) + + # Add additional arguments if any + if self.grompp_args is not None: + # Turn the dictionary into a list with the keys alternating with values + add_args = [elem for pair in self.grompp_args.items() for elem in pair] + arguments.extend(add_args) + + args_list.append(arguments) + + # Run the GROMACS grompp commands in parallel + returncode = None # Initialize as None for all ranks (necessary for the case when -np > n_sim, which is rare) + if rank == 0: + print('Generating TPR files ...') + if rank < self.n_sim: + returncode, stdout, stderr = utils.run_gmx_cmd(args_list[rank]) + if returncode != 0: + print(f'Error on rank {rank} (return code: {returncode}):\n{stderr}') + + # gather return codes at rank 0 + code_list = comm.gather(returncode, root=0) + + if rank == 0: + # Filter out None values which represent ranks that did not execute the command + code_list = [code for code in code_list if code is not None] + if code_list != [0] * self.n_sim: + MPI.COMM_WORLD.Abort(1) # Doesn't matter what non-zero returncode we put here as the code from GROMACS will be printed before this point anyway. # noqa: E501 + + def _run_mdrun(self, n): + """ + Executes GROMACS mdrun commands in parallel. + + Parameters + ---------- + n : int + The iteration index (starting from 0). + """ + # We will change the working directory so the mdrun command should be the same for all replicas. + arguments = [self.gmx_executable, 'mdrun'] + + # Add input file arguments + arguments.extend(['-s', 'sys_EE.tpr']) + + if self.runtime_args is not None: + # Turn the dictionary into a list with the keys alternating with values + add_args = [elem for pair in self.runtime_args.items() for elem in pair] + arguments.extend(add_args) + + # Run the GROMACS mdrun commands in parallel + returncode = None # Initialize as None for all ranks (necessary for the case when -np > n_sim, which is rare) + if rank == 0: + print('Running EXE simulations ...') + if rank < self.n_sim: + os.chdir(f'sim_{rank}/iteration_{n}') + returncode, stdout, stderr = utils.run_gmx_cmd(arguments) + if returncode != 0: + print(f'Error on rank {rank} (return code: {returncode}):\n{stderr}') + if self.rm_cpt is True: + # if the simulation went wrong, there would be no checkpoint file + try: + os.remove('state.cpt') + except Exception: + print('\n--------------------------------------------------------------------------\n') + MPI.COMM_WORLD.Abort(1) + os.chdir('../../') + + # gather return codes at rank 0 + code_list = comm.gather(returncode, root=0) + + if rank == 0: + # Filter out None values which represent ranks that did not execute the command + code_list = [code for code in code_list if code is not None] + if code_list != [0] * self.n_sim: + MPI.COMM_WORLD.Abort(1) # Doesn't matter what non-zero returncode we put here as the code from GROMACS will be printed before this point anyway. # noqa: E501 + + def run_EEXE(self, n, swap_pattern=None): + """ + Perform one iteration in the EEXE simulation, which includes generating the + TPR files using the GROMACS grompp :code:`command` and running the expanded ensemble simulations + in parallel using GROMACS :code:`mdrun` command. The GROMACS commands are launched by as subprocesses. + The function assumes that the GROMACS executable is available. + + Parameters + ---------- + n : int + The iteration index (starting from 0). + swap_pattern : list + A list generated by :obj:`.get_swapping_pattern`. It represents how the replicas should be swapped. + This parameter is not needed only if :code:`n` is 0. + """ + if rank == 0: + iter_str = f'\nIteration {n}: {self.dt * self.nst_sim * n: .1f} - {self.dt * self.nst_sim * (n + 1): .1f} ps' # noqa: E501 + print(iter_str + '\n' + '=' * (len(iter_str) - 1)) + + # 1st synchronizing point for all MPI processes: To make sure ranks other than 0 will not start executing + # _run_grompp earlier and mess up the order of printing. + comm.barrier() + + # Generating all required TPR files simultaneously, then run all simulations simultaneously. + # No synchronizing point is needed between _run_grompp and _run_mdrun, since once rank i finishes _run_grompp, + # it should run _run_mdrun in the same working directory, so there won't be any I/O error. + self._run_grompp(n, swap_pattern) + self._run_mdrun(n) + + # 2nd synchronizaing point for all MPI processes: To make sure no rank will start getting to the next + # iteration earlier than the others. For example, if rank 0 finishes the mdrun command earlier, we don't + # want it to start parsing the dhdl file (in the if condition of if rank == 0) of simulation 3 being run by + # rank 3 that has not been generated, which will lead to an I/O error. + comm.barrier() diff --git a/setup.py b/setup.py index 36d3ed0e..0f98b5eb 100644 --- a/setup.py +++ b/setup.py @@ -73,9 +73,9 @@ # Add entry points entry_points={ 'console_scripts':[ - 'run_EEXE = ensemble_md.cli.run_EEXE:main', - 'analyze_EEXE = ensemble_md.cli.analyze_EEXE:main', - 'explore_EEXE = ensemble_md.cli.explore_EEXE:main', + 'run_REXEE = ensemble_md.cli.run_REXEE:main', + 'analyze_REXEE = ensemble_md.cli.analyze_REXEE:main', + 'explore_REXEE = ensemble_md.cli.explore_REXEE:main', ], }, From 3e7ffe5921d1e1a27a20bc8b8ff0c2b32c3483be Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:25:53 -0600 Subject: [PATCH 02/11] Renamed ensemble_EXE.py to replica_exchange_EE.py --- ensemble_md/ensemble_EXE.py | 1476 ----------------------------------- 1 file changed, 1476 deletions(-) delete mode 100644 ensemble_md/ensemble_EXE.py diff --git a/ensemble_md/ensemble_EXE.py b/ensemble_md/ensemble_EXE.py deleted file mode 100644 index 8a95d8dc..00000000 --- a/ensemble_md/ensemble_EXE.py +++ /dev/null @@ -1,1476 +0,0 @@ -#################################################################### -# # -# ensemble_md, # -# a python package for running GROMACS simulation ensembles # -# # -# Written by Wei-Tse Hsu # -# Copyright (c) 2022 University of Colorado Boulder # -# # -#################################################################### -""" -The :obj:`.ensemble_EXE` module provides functions for setting up and ensemble of expanded ensemble. -""" -import os -import sys -import copy -import yaml -import random -import warnings -import importlib -import subprocess -import numpy as np -from mpi4py import MPI -from itertools import combinations -from collections import OrderedDict -from alchemlyb.parsing.gmx import _get_headers as get_headers -from alchemlyb.parsing.gmx import _extract_dataframe as extract_dataframe - -import ensemble_md -from ensemble_md.utils import utils -from ensemble_md.utils import gmx_parser -from ensemble_md.utils.exceptions import ParameterError - -comm = MPI.COMM_WORLD -rank = comm.Get_rank() - - -class EnsembleEXE: - """ - This class provides a variety of functions useful for setting up and running - an ensemble of expanded ensemble. Upon instantiation, all parameters in the YAML - file will be assigned to an attribute in the class. In addition to these variables, - below is a list of attributes of the class. (All the the attributes are assigned by - :obj:`set_params` unless otherwise noted.) - - :ivar gmx_path: The absolute path of the GROMACS exectuable. - :ivar gmx_version: The version of the GROMACS executable. - :ivar yaml: The input YAML file used to instantiate the class. Assigned by the :code:`__init__` function. - :ivar warnings: Warnings about parameter specification in either YAML or MDP files. - :ivar reformatted_mdp: Whether the templated MDP file has been reformatted by replacing hyphens - with underscores or not. - :ivar template: The instance of the :obj:`MDP` class based on the template MDP file. - :ivar dt: The simulation timestep in ps. - :ivar temp: The simulation temperature in Kelvin. - :ivar fixed_weights: Whether the weights will be fixed during the simulation (according to the template MDP file). - :ivar updating_weights: The list of weights as a function of time (since the last update of the Wang-Landau - incrementor) for different replicas. The length is equal to the number of replicas. This is only relevant for - weight-updating simulations. - :ivar equilibrated_weights: The equilibrated weights of different replicas. For weight-equilibrating simulations, - this list is initialized as a list of empty lists. Otherwise (weight-fixed), it is initialized as a list of - :code:`None`. - :ivar current_wl_delta: The current value of the Wang-Landau incrementor. This is only relevent for weight-updating - simulations. - :ivar kT: 1 kT in kJ/mol at the simulation temperature. - :ivar lambda_types: The types of lambda variables involved in expanded ensemble simulations, e.g. - :code:`fep_lambdas`, :code:`mass_lambdas`, :code:`coul_lambdas`, etc. - :ivar n_tot: The total number of states for all replicas. - :ivar n_sub: The numbmer of states for each replica. The current implementation assumes homogenous replicas. - :ivar state_ranges: A list of list of state indices for each replica. - :ivar equil: A list of times it took to equilibrated the weights for different replicas. This - list is initialized with a list of -1, where -1 means that the weights haven't been equilibrated. Also, - a value of 0 means that the simulation is a fixed-weight simulation. - :ivar n_rejected: The number of proposed exchanges that have been rejected. Updated by :obj:`.accept_or_reject`. - :ivar n_swap_attempts: The number of swaps attempted so far. This does not include the cases - where there is no swappable pair. Updated by :obj:`.get_swapping_pattern`. - :ivar n_emtpy_swappable: The number of times when there was no swappable pair. - :ivar rep_trajs: The replica-space trajectories of all configurations. - :ivar configs: A list that thows the current configuration index that each replica is sampling. - :ivar g_vecs: The time series of the (processed) whole-range alchemical weights. If no weight combination is - applied, this list will just be a list of :code:`None`'s. - :ivar df_data_type: The type of data (either :math:`u_{nk}` or :math:`dH/dλ`) that will be used for - free energy calculations if :code:`df_method` is :code:`True`. - :ivar modify_coords_fn: The function (callable) in the external module (specified as :code:`modify_coords` in - the input YAML file) for modifying coordinates at exchanges. - """ - - def __init__(self, yaml_file, analysis=False): - self.yaml = yaml_file - self.set_params(analysis) - - def set_params(self, analysis): - """ - Sets up or reads in the user-defined parameters from a yaml file and an MDP template. - This function is called to instantiate the class in the :code:`__init__` function of - class. Specifically, it does the following: - - 1. Sets up constants. - 2. Reads in parameters from a YAML file. - 3. Handles YAML parameters. - 4. Checks if the parameters in the YAML file are well-defined. - 5. Reformats the input MDP file to replace all hyphens with underscores. - 6. Reads in parameters from the MDP template. - - After instantiation, the class instance will have attributes corresponding to - each of the parameters specified in the YAML file. For a full list of the parameters that - can be specified in the YAML file, please refer to :ref:`doc_parameters`. - - :param yaml_file: The file name of the YAML file for specifying the parameters for EEXE. - :type yaml_file: str - :param analysis: Whether the instantiation of the class is for data analysis of EEXE simulations. - The default is :code:`False` - :type analysis: bool - - :raises ParameterError: - - - If a required parameter is not specified in the YAML file. - - If a specified parameter is not recognizable. - - If a specified weight combining scheme is not available. - - If a specified acceptance scheme is not available. - - If a specified free energy estimator is not available. - - If a specified method for error estimation is not available. - - If an integer parameter is not an integer. - - If a positive parameter is not positive. - - If a non-negative parameter is negative. - - If an invalid MDP file is detected. - """ - self.warnings = [] # Store warnings, if any. - - # Step 0: Set up constants - k = 1.380649e-23 - NA = 6.0221408e23 - - # Step 1: Read in parameters from the YAML file. - with open(self.yaml) as f: - params = yaml.load(f, Loader=yaml.FullLoader) - - for attr in params: - setattr(self, attr, params[attr]) - - # Step 2: Handle the compulsory YAML parameters - required_args = [ - "gmx_executable", - "gro", - "top", - "mdp", - "n_sim", - "n_iter", - "s", - ] - for i in required_args: - if hasattr(self, i) is False or getattr(self, i) is None: - raise ParameterError( - f"Required parameter '{i}' not specified in {self.yaml}." - ) # noqa: F405 - - # Step 3: Handle the optional YAML parameters - # Key: Optional argument; Value: Default value - optional_args = { - "add_swappables": None, - "modify_coords": None, - "nst_sim": None, - "proposal": 'exhaustive', - "acceptance": "metropolis", - "w_combine": None, - "rmse_cutoff": np.inf, - "N_cutoff": 1000, - "n_ex": 'N^3', # only active for multiple swaps. - "verbose": True, - "mdp_args": None, - "grompp_args": None, - "runtime_args": None, - "n_ckpt": 100, - "rm_cpt": True, - "msm": False, - "free_energy": False, - "subsampling_avg": False, - "df_spacing": 1, - "df_ref": None, - "df_method": "MBAR", - "err_method": "propagate", - "n_bootstrap": 50, - "seed": None, - } - for i in optional_args: - if hasattr(self, i) is False or getattr(self, i) is None: - setattr(self, i, optional_args[i]) - - # all_args: Arguments that can be specified in the YAML file. - all_args = required_args + list(optional_args.keys()) - for i in params: - if i not in all_args: - self.warnings.append(f'Warning: Parameter "{i}" specified in the input YAML file is not recognizable.') - - # Step 4: Check if the parameters in the YAML file are well-defined - if self.proposal not in [None, 'single', 'neighboring', 'exhaustive', 'multiple']: - raise ParameterError("The specified proposal scheme is not available. Available options include 'single', 'neighboring', 'exhaustive', and 'multiple'.") # noqa: E501 - - if self.acceptance not in [None, 'same-state', 'same_state', 'metropolis', 'metropolis-eq', 'metropolis_eq']: - raise ParameterError("The specified acceptance scheme is not available. Available options include 'same-state', 'metropolis', and 'metropolis-eq'.") # noqa: E501 - - if self.w_combine not in [None, 'final', 'avg']: - raise ParameterError("The specified type of weight to be combined is not available. Available options include 'final' and 'avg'.") # noqa: E501 - - if self.df_method not in [None, 'TI', 'BAR', 'MBAR']: - raise ParameterError("The specified free energy estimator is not available. Available options include 'TI', 'BAR', and 'MBAR'.") # noqa: E501 - - if self.err_method not in [None, 'propagate', 'bootstrap']: - raise ParameterError("The specified method for error estimation is not available. Available options include 'propagate', and 'bootstrap'.") # noqa: E501 - - if self.w_combine is not None and self.rmse_cutoff == np.inf: - self.warnings.append('Warning: We recommend setting rmse_cutoff when w_combine is used.') - - if self.rmse_cutoff != np.inf: - if type(self.rmse_cutoff) is not float and type(self.rmse_cutoff) is not int: - raise ParameterError("The parameter 'rmse_cutoff' should be a float.") - - params_int = ['n_sim', 'n_iter', 's', 'N_cutoff', 'df_spacing', 'n_ckpt', 'n_bootstrap'] # integer parameters # noqa: E501 - if self.nst_sim is not None: - params_int.append('nst_sim') - if self.n_ex != 'N^3': # no need to add "and self.proposal == 'multiple' since if multiple swaps are not used, n_ex=1" # noqa: E501 - params_int.append('n_ex') - if self.seed is not None: - params_int.append('seed') - for i in params_int: - if type(getattr(self, i)) != int: - raise ParameterError(f"The parameter '{i}' should be an integer.") - - params_pos = ['n_sim', 'n_iter', 'n_ckpt', 'df_spacing', 'n_bootstrap', 'rmse_cutoff'] # positive parameters - if self.nst_sim is not None: - params_pos.append('nst_sim') - for i in params_pos: - if getattr(self, i) <= 0: - raise ParameterError(f"The parameter '{i}' should be positive.") - - if self.n_ex != 'N^3' and self.n_ex < 0: - raise ParameterError("The parameter 'n_ex' should be non-negative.") - - if self.s < 0: - raise ParameterError("The parameter 's' should be non-negative.") - - if self.N_cutoff < 0 and self.N_cutoff != -1: - raise ParameterError("The parameter 'N_cutoff' should be non-negative unless no weight correction is needed, i.e. N_cutoff = -1.") # noqa: E501 - - params_str = ['gro', 'top', 'mdp', 'gmx_executable'] - # First check if self.gro and self.top are lists and check their lengths - check_files = ['gro', 'top'] # just for checking file types that can take multiple inputs - for i in check_files: - if isinstance(getattr(self, i), list): - params_str.remove(i) - if len(getattr(self, i)) != self.n_sim: - raise ParameterError(f"The number of the input {i.upper()} files must be the same as the number of replicas, if multiple are specified.") # noqa: E501 - if self.modify_coords is not None: - params_str.append('modify_coords') - for i in params_str: - if type(getattr(self, i)) != str: - raise ParameterError(f"The parameter '{i}' should be a string.") - - params_bool = ['verbose', 'rm_cpt', 'msm', 'free_energy', 'subsampling_avg'] - for i in params_bool: - if type(getattr(self, i)) != bool: - raise ParameterError(f"The parameter '{i}' should be a boolean variable.") - - params_list = ['add_swappables', 'df_ref'] - for i in params_list: - if getattr(self, i) is not None and not isinstance(getattr(self, i), list): - raise ParameterError(f"The parameter '{i}' should be a list.") - - params_dict = ['mdp_args', 'grompp_args', 'runtime_args'] - for i in params_dict: - if getattr(self, i) is not None and not isinstance(getattr(self, i), dict): - raise ParameterError(f"The parameter '{i}' should be a dictionary.") - - if self.add_swappables is not None: - if not isinstance(self.add_swappables, list): - raise ParameterError("The parameter 'add_swappables' should be a nested list.") - for sublist in self.add_swappables: - if not isinstance(sublist, list): - raise ParameterError("The parameter 'add_swappables' should be a nested list.") - for item in sublist: - if not isinstance(item, int) or item < 0: - raise ParameterError("Each number specified in 'add_swappables' should be a non-negative integer.") # noqa: E501 - - if self.mdp_args is not None: - for key in self.mdp_args.keys(): - if not isinstance(key, str): - raise ParameterError("All keys specified in 'mdp_args' should be strings.") - else: - if '-' in key: - raise ParameterError("Parameters specified in 'mdp_args' must use underscores in place of hyphens.") # noqa: E501 - for val_list in self.mdp_args.values(): - if len(val_list) != self.n_sim: - raise ParameterError("The number of values specified for each key in 'mdp_args' should be the same as the number of replicas.") # noqa: E501 - - # Step 5: Reformat the input MDP file to replace all hypens with underscores. - self.reformatted_mdp = EnsembleEXE.reformat_MDP(self.mdp) - - # Step 6: Read in parameters from the MDP template - self.template = gmx_parser.MDP(self.mdp) - self.dt = self.template["dt"] # ps - self.temp = self.template["ref_t"] - - if self.nst_sim is None: - self.nst_sim = self.template["nsteps"] - - if 'wl_scale' in self.template.keys(): - if self.template['wl_scale'] == []: - # If wl_scale in the MDP file is a blank (i.e. fixed weights), mdp['wl_scale'] will be an empty list. - # This is the only case where mdp['wl_scale'] is a numpy array. - self.fixed_weights = True - self.equilibrated_weights = [None for i in range(self.n_sim)] - else: - self.fixed_weights = False - self.equilibrated_weights = [[] for i in range(self.n_sim)] - self.updating_weights = [[] for i in range(self.n_sim)] - self.current_wl_delta = [0 for i in range(self.n_sim)] - else: - self.fixed_weights = True - self.equilibrated_weights = [None for i in range(self.n_sim)] - - if self.fixed_weights is True: - if self.N_cutoff != -1 or self.w_combine is not None: - self.warnings.append('Warning: The weight correction/weight combination method is specified but will not be used since the weights are fixed.') # noqa: E501 - # In the case that the warning is ignored, enforce the defaults. - self.N_cutoff = -1 - self.w_combine = None - - if 'lmc_seed' in self.template and self.template['lmc_seed'] != -1: - self.warnings.append('Warning: We recommend setting lmc_seed as -1 so the random seed is different for each iteration.') # noqa: E501 - - if 'gen_seed' in self.template and self.template['gen_seed'] != -1: - self.warnings.append('Warning: We recommend setting gen_seed as -1 so the random seed is different for each iteration.') # noqa: E501 - - if 'gen_vel' not in self.template or ('gen_vel' in self.template and self.template['gen_vel'] == 'no'): - self.warnings.append('Warning: We recommend generating new velocities for each iteration to avoid potential issues with detailed balance.') # noqa: E501 - - if self.nst_sim % self.template['nstlog'] != 0: - raise ParameterError( - 'The parameter "nstlog" must be a factor of the parameter "nst_sim" specified in the YAML file.') - - if self.nst_sim % self.template['nstdhdl'] != 0: - raise ParameterError( - 'The parameter "nstdhdl" must be a factor of the parameter "nst_sim" specified in the YAML file.') - - if self.template['nstexpanded'] % self.template['nstdhdl'] != 0: - raise ParameterError( - 'In EEXE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 - - if self.mdp_args is not None: - if 'lmc_seed' in self.mdp_args and -1 not in self.mdp_args['lmc_seed']: - self.warnings.append('Warning: We recommend setting lmc_seed as -1 so the random seed is different for each iteration.') # noqa: E501 - - if 'gen_seed' in self.mdp_args and -1 not in self.mdp_args['gen_seed']: - self.warnings.append('Warning: We recommend setting gen_seed as -1 so the random seed is different for each iteration.') # noqa: E501 - - if 'gen_vel' in self.mdp_args and 'no' in self.mdp_args['gen_vel']: - self.warnings.append('Warning: We recommend generating new velocities for each iteration to avoid potential issues with the detailed balance.') # noqa: E501 - - if 'nstlog' in self.mdp_args and sum(self.nst_sim % np.array(self.mdp_args['nstlog'])) != 0: - raise ParameterError( - 'The parameter "nstlog" must be a factor of the parameter "nst_sim" specified in the YAML file.') - - if 'nstdhdl' in self.mdp_args and sum(self.nst_sim % np.array(self.mdp_args['nstdhdl'])) != 0: - raise ParameterError( - 'The parameter "nstdhdl" must be a factor of the parameter "nst_sim" specified in the YAML file.') - - if 'nstexpanded' in self.mdp_args and 'nstdhdl' in self.mdp_args and sum(np.array(self.mdp_args['nstexpanded']) % np.array(self.mdp_args['nstdhdl'])) != 0: # noqa: E501 - raise ParameterError( - 'In EEXE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 - - if 'pull' in self.template and self.template['pull'] == 'yes': - pull_ncoords = self.template['pull_ncoords'] - self.set_ref_dist = [] - for i in range(pull_ncoords): - if self.template[f'pull_coord{i+1}_geometry'] == 'distance': - if self.template[f'pull_coord{i+1}_start'] == 'yes': - self.set_ref_dist.append(True) # starting from the second iteration, set pull_coord*_init. - if 'pull_nstxout' not in self.template: - self.warnings.append('A non-zero value should be specified for pull_nstxout if pull_coord*_start is set to yes.') # noqa: E501 - if self.template['pull_nstxout'] == 0: - self.warnings.append('A non-zero value should be specified for pull_nstxout if pull_coord*_start is set to yes.') # noqa: E501 - else: - self.set_ref_dist.append(False) # Here we assume that the user know what reference distance to use. # noqa: E501 - else: - self.set_ref_dist.append(False) # we only deal with distance restraints for now. - - # Step 7: Set up derived parameters - # 7-1. kT in kJ/mol - self.kT = k * NA * self.temp / 1000 # 1 kT in kJ/mol - - # 7-2. Figure out what types of lambda variables are involved - # Here is we possible lambda types in the order read by GROMACS, which is likely also the order when being printed to the log file. # noqa: E501 - # See https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gmxpreprocess/readir.cpp#L2543 - lambdas_types_all = ['fep_lambdas', 'mass_lambdas', 'coul_lambdas', 'vdw_lambdas', 'bonded_lambdas', 'restraint_lambdas', 'temperature_lambdas'] # noqa: E501 - self.lambda_types = [] # lambdas specified in the MDP file - for i in lambdas_types_all: - if i in self.template.keys(): # there shouldn't be parameters like "fep-lambdas" after reformatting the MDP file # noqa: E501 - self.lambda_types.append(i) - - # 7-3. Total # of states: n_tot = n_sub * n_sim - (n_overlap) * (n_sim - 1), where n_overlap = n_sub - s - self.n_tot = len(self.template[self.lambda_types[0]]) - - # 7-4. Number of states of each replica (assuming the same for all rep) - self.n_sub = self.n_tot - self.s * (self.n_sim - 1) - if self.n_sub < 1: - raise ParameterError( - f"There must be at least two states for each replica (current value: {self.n_sub}). The current specified configuration (n_tot={self.n_tot}, n_sim={self.n_sim}, s={self.s}) does not work for EEXE.") # noqa: E501 - - # 7-5. A list of sets of state indices - start_idx = [i * self.s for i in range(self.n_sim)] - self.state_ranges = [list(np.arange(i, i + self.n_sub)) for i in start_idx] - - # 7-6. A list of time it took to get the weights equilibrated - self.equil = [-1 for i in range(self.n_sim)] # -1 means unequilibrated - - # 7-7. Some variables for counting - self.n_rejected = 0 - self.n_swap_attempts = 0 - self.n_empty_swappable = 0 - - # 7-8. Replica space trajectories. For example, rep_trajs[0] = [0, 2, 3, 0, 1, ...] means - # that configuration 0 transitioned to replica 2, then 3, 0, 1, in iterations 1, 2, 3, 4, ..., - # respectively. The first element of rep_traj[i] should always be i. - self.rep_trajs = [[i] for i in range(self.n_sim)] - - # 7-9. configs shows the current configuration that each replica is sampling. - # For example, self.configs = [0, 2, 1, 3] means that configurations 0, 2, 1, and 3 are - # in replicas, 0, 1, 2, 3, respectively. This list will be constantly updated during the simulation. - self.configs = list(range(self.n_sim)) - - # 7-10. The time series of the (processed) whole-range alchemical weights - # If no weight combination is applied, self.g_vecs will just be a list of None's. - self.g_vecs = [] - - # 7-11. Data analysis - if self.df_method == 'MBAR': - self.df_data_type = 'u_nk' - else: - self.df_data_type = 'dhdl' - - # 7-12. External module for coordinate modification - if self.modify_coords is not None: - sys.path.append(os.getcwd()) - module = importlib.import_module(self.modify_coords) - self.modify_coords_fn = getattr(module, self.modify_coords) - else: - self.modify_coords_fn = None - - # Step 8. Check the executables - if analysis is False: - self.check_gmx_executable() - - def check_gmx_executable(self): - """ - Checks if the GROMACS executable can be used and gets its absolute path and version. - """ - try: - result = subprocess.run(['which', self.gmx_executable], capture_output=True, text=True, check=True) - self.gmx_path = result.stdout.strip() # this can be exactly the same as self.gmx_executable - - version_output = subprocess.run([self.gmx_path, "-version"], capture_output=True, text=True, check=True) - for line in version_output.stdout.splitlines(): - if "GROMACS version" in line: - self.gmx_version = line.split()[-1] - break - except subprocess.CalledProcessError: - print(f"{self.gmx_executable} is not available on this system.") - except Exception as e: - print(f"An error occurred:\n{e}") - - def print_params(self, params_analysis=False): - """ - Prints important parameters related to the EXEE simulation. - - Parameters - ---------- - params_analysis : bool, optional - If True, additional parameters related to data analysis will be printed. Default is False. - """ - if isinstance(self.gro, list): - gro_str = ', '.join(self.gro) - else: - gro_str = self.gro - - if isinstance(self.top, list): - top_str = ', '.join(self.top) - else: - top_str = self.top - - print("Important parameters of EXEE") - print("============================") - print(f"Python version: {sys.version}") - print(f"GROMACS executable: {self.gmx_path}") # we print the full path here - print(f"GROMACS version: {self.gmx_version}") - print(f"ensemble_md version: {ensemble_md.__version__}") - print(f'Simulation inputs: {gro_str}, {top_str}, {self.mdp}') - print(f"Verbose log file: {self.verbose}") - print(f"Proposal scheme: {self.proposal}") - print(f"Acceptance scheme for swapping simulations: {self.acceptance}") - print(f"Type of weights to be combined: {self.w_combine}") - print(f"Histogram cutoff: {self.N_cutoff}") - print(f"Number of replicas: {self.n_sim}") - print(f"Number of iterations: {self.n_iter}") - print(f"Number of attempted swaps in one exchange interval: {self.n_ex}") - print(f"Length of each replica: {self.dt * self.nst_sim} ps") - print(f"Frequency for checkpointing: {self.n_ckpt} iterations") - print(f"Total number of states: {self.n_tot}") - print(f"Additionally defined swappable states: {self.add_swappables}") - print(f"Additional grompp arguments: {self.grompp_args}") - print(f"Additional runtime arguments: {self.runtime_args}") - if self.mdp_args is not None and len(self.mdp_args.keys()) > 1: - print("MDP parameters differing across replicas:") - for i in self.mdp_args.keys(): - print(f" - {i}: {self.mdp_args[i]}") - else: - print(f"MDP parameters differing across replicas: {self.mdp_args}") - print("Alchemical ranges of each replica in EEXE:") - for i in range(self.n_sim): - print(f" - Replica {i}: States {self.state_ranges[i]}") - - if params_analysis is True: - print() - print(f"Whether to build Markov state models and perform relevant analysis: {self.msm}") - print(f"Whether to perform free energy calculations: {self.free_energy}") - print(f"The step to used in subsampling the DHDL data in free energy calculations, if any: {self.df_spacing}") # noqa: E501 - print(f"The chosen free energy estimator for free energy calculations, if any: {self.df_method}") - print(f"The method for estimating the uncertainty of free energies in free energy calculations, if any: {self.err_method}") # noqa: E501 - print(f"The number of bootstrap iterations in the boostrapping method, if used: {self.n_bootstrap}") - print(f"The random seed to use in bootstrapping, if used: {self.seed}") - - if self.reformatted_mdp is True: - print("Note that the input MDP file has been reformatted by replacing hypens with underscores. The original mdp file has been renamed as *backup.mdp.") # noqa: E501 - - @staticmethod - def reformat_MDP(mdp_file): - """ - Reformats the input MDP file so that all hyphens in the parameter names are replaced by underscores. - If the input MDP file contains hyphens in its parameter names, the class attribue :code:`self.reformatted` - will be set to :code:`True`. In this case, the new MDP object with reformatted parameter names will be - written to the original file path of the file, while the original file will be renamed with a - :code:`_backup` suffix. If the input MDP file is not reformatted, the function sets - the class attribute :code:`self.reformatted_mdp` to :code:`False`. - - Returns - ------- - reformatted : bool - Whether the file was reformatted. - """ - params = gmx_parser.MDP(mdp_file) - - odict = OrderedDict([(k.replace('-', '_'), v) for k, v in params.items()]) - params_new = gmx_parser.MDP(None, **odict) - reformatted = None - - if rank == 0: - if params_new.keys() == params.keys(): - reformatted = False # no need to reformat the file - else: - reformatted = True - new_name = mdp_file.split('.mdp')[0] + '_backup.mdp' - os.rename(mdp_file, new_name) - params_new.write(mdp_file) - - return reformatted - - def initialize_MDP(self, idx): - """ - Initializes the MDP object for generating MDP files for a replica based on the MDP template. - This function should be called only for generating MDP files for the FIRST iteration - and it has nothing to do with whether the weights are fixed or equilibrating. - It is assumed that the MDP template has all the common parameters of all replicas. - - Parameters - ---------- - idx : int - Index of the simulation whose MDP parameters need to be initialized. - - Returns - ------- - MDP : :obj:`.gmx_parser.MDP` obj - An updated object of :obj:`.gmx_parser.MDP` that can be used to write MDP files. - """ - MDP = copy.deepcopy(self.template) - MDP["nsteps"] = self.nst_sim - - if self.mdp_args is not None: - for param in self.mdp_args.keys(): - MDP[param] = self.mdp_args[param][idx] - - start_idx = idx * self.s - for i in self.lambda_types: - MDP[i] = self.template[i][start_idx:start_idx + self.n_sub] - - if "init_lambda_weights" in self.template: - MDP["init_lambda_weights"] = self.template["init_lambda_weights"][start_idx:start_idx + self.n_sub] - - return MDP - - def get_ref_dist(self): - """ - Gets the reference distance(s) to use starting from the second iteration if distance restraint(s) are used. - Specifically, a reference distance determined here is the initial COM distance between the pull groups - in the input GRO file. This function initializes the attribute :code:`ref_dist`. - """ - if hasattr(self, 'set_ref_dist'): - self.ref_dist = [] - pullx_file = 'sim_0/iteration_0/pullx.xvg' - headers = get_headers(pullx_file) - for i in range(len(self.set_ref_dist)): - if self.set_ref_dist[i] is True: - dist = list(extract_dataframe(pullx_file, headers=headers)[f'{i+1}'])[0] - self.ref_dist.append(dist) - - def update_MDP(self, new_template, sim_idx, iter_idx, states, wl_delta, weights, counts=None): - """ - Updates the MDP file for a new iteration based on the new MDP template coming from the previous iteration. - Note that if the weights got equilibrated in the previous iteration, then the weights will be fixed - at these equilibrated values for all the following iterations. - - Parameters - ---------- - new_template : str - The new MDP template file. Typically the MDP file of the previous iteration. - sim_idx : int - The index of the simulation whose MDP parameters need to be updated. - iter_idx : int - The index of the iteration to be performed later. - states : list - A list of last sampled states of all simulaitons in the previous iteration. - wl_delta : list - A list of final Wang-Landau incrementors of all simulations. - weights : list - A list of lists final weights of all simulations. - counts : list - A list of lists final counts of all simulations. If the value is :code:`None`, - then the MDP parameter :code:`init-histogram-counts` won't be specified in the next iteration. - Note that not all the GROMACS versions have the MDP parameter :code:`init-histogram-counts` available, - in which case one should always pass :code:`None`, or set :code:`-maxwarn` in :code:`grompp_args` - in the input YAML file. - - Return - ------ - MDP : :obj:`.gmx_parser.MDP` obj - An updated object of :obj:`.gmx_parser.MDP` that can be used to write MDP files. - """ - new_template = gmx_parser.MDP(new_template) # turn into a gmx_parser.MDP object - MDP = copy.deepcopy(new_template) - MDP["tinit"] = self.nst_sim * self.dt * iter_idx - MDP["nsteps"] = self.nst_sim - MDP["init_lambda_state"] = (states[sim_idx] - sim_idx * self.s) # 2nd term is for shifting from the global to local index. # noqa: E501 - MDP["init_wl_delta"] = wl_delta[sim_idx] - MDP["init_lambda_weights"] = weights[sim_idx] - if counts is not None: - MDP["init_histogram_counts"] = counts[sim_idx] - - if self.equil[sim_idx] == -1: # the weights haven't been equilibrated - MDP["init_wl_delta"] = wl_delta[sim_idx] - else: - MDP["lmc_stats"] = "no" - MDP["wl_scale"] = "" - MDP["wl_ratio"] = "" - MDP["init_wl_delta"] = "" - MDP["lmc_weights_equil"] = "" - MDP["weight_equil_wl_delta"] = "" - - # Here we deal with the distance restraint in the pull code, if any. - if hasattr(self, 'ref_dist'): - for i in range(len(self.ref_dist)): - MDP[f'pull_coord{i+1}_start'] = "no" - MDP[f'pull_coord{i+1}_init'] = self.ref_dist[i] - - return MDP - - def extract_final_dhdl_info(self, dhdl_files): - """ - For all the replica simulations, finds the last sampled state - and print the corresponding lambda values from a dhdl file. - - Parameters - ---------- - dhdl_files : list - A list of file paths to GROMACS DHDL files of different replicas. - - Returns - ------- - states : list - A list of the global indices of the last sampled states of all simulaitons. - """ - states = [] - print("\nBelow are the final states being visited:") - for i in range(len(dhdl_files)): - headers = get_headers(dhdl_files[i]) - state_local = list(extract_dataframe(dhdl_files[i], headers=headers)['Thermodynamic state'])[-1] # local index of the last state # noqa: E501 - state_global = state_local + i * self.s # global index of the last state - states.append(state_global) # append the global index - print(f" Simulation {i}: Global state {states[i]}") - print('\n', end='') - - return states - - def extract_final_log_info(self, log_files): - """ - Extracts the following information for each replica simulation from a given list of GROMACS LOG files: - - - The final Wang-Landau incrementors. - - The final lists of weights. - - The final lists of counts. - - Whether the weights were equilibrated in the simulations. - - Parameters - ---------- - log_files : list - A list of file paths to GROMACS LOG files of different replicas. - - Returns - ------- - wl_delta : list - A list of final Wang-Landau incrementors of all simulations. - weights : list - A list of lists of final weights of all simulations. - counts : list - A list of lists of final counts of all simulations. - """ - wl_delta, weights, counts = [], [], [] - - # Find the final Wang-Landau incrementors and weights - for i in range(len(log_files)): - if self.verbose: - print(f'Parsing {log_files[i]} ...') - result = gmx_parser.parse_log(log_files[i]) # weights, counts, wl_delta, equil_time - weights.append(result[0][-1]) - counts.append(result[1]) - wl_delta.append(result[2]) - - # In Case 3 described in the docstring of parse_log (fixed-weights), - # result[3] will be 0 but it will never be passed to self.equil[i] - # because once self.equil[i] is not -1, we stop updating. This way, we can keep - # the time when the weights get equilibrated all the way. - if self.equil[i] == -1: - # At this point self.equil is the equilibration status BEFORE the last iteration - # while result[3] is the equilibration status AFTER finishing the last iteraiton. - # For any replicas where weights are still equilibrating (i.e. self.equil[j] == -1) - # we update its equilibration status. - self.equil[i] = result[3] - - if self.equilibrated_weights[i] == []: - if self.equil[i] != -1 and self.equil[i] != 0: - # self.equil[i] != -1: uneqilibrated - # self.equil[i] != 0: fixed-weight simulation - self.equilibrated_weights[i] = result[0][-1] - - return wl_delta, weights, counts - - @staticmethod - def identify_swappable_pairs(states, state_ranges, neighbor_exchange, add_swappables=None): - """ - Identify swappable pairs. By definition, a pair of simulation is considered swappable only if - their last sampled states are in the alchemical ranges of both simulations. This is required - to ensure that the values of involved ΔH and Δg can always be looked up from the DHDL and LOG files. - This also automatically guarantee that the simulations to be swapped have overlapping alchemical ranges. - - Parameters - ---------- - states : list - A list of the global indices of the last sampled states of all simulations. This list can be - generated by the :obj:`.extract_final_dhdl_info` method. Notably, the input list should not be - a list that has been updated/modified by :obj:`get_swapping_pattern`, or the result will be incorrect. - state_ranges : list of lists - A list of state indies for all replicas. The input list can be a list updated by - :obj:`.get_swapping_pattern`, especially in the case where there is a need to re-identify the - swappable pairs after an attempted swap is accepted. - neighbor_exchange : bool - Whether to exchange only between neighboring replicas. - add_swappables: list - A list of lists that additionally consider states (in global indices) that can be swapped. - For example, :code:`add_swappables=[[4, 5], [14, 15]]` means that if a replica samples state 4, - it can be swapped with another replica that samples state 5 and vice versa. The same logic applies - to states 14 and 15. - - Returns - ------- - swappables : list - A list of tuples representing the simulations that can be swapped. - """ - n_sim = len(states) - sim_idx = list(range(n_sim)) - all_pairs = list(combinations(sim_idx, 2)) - - # First, we identify pairs of replicas with overlapping ranges - swappables = [i for i in all_pairs if set(state_ranges[i[0]]).intersection(set(state_ranges[i[1]])) != set()] # noqa: E501 - - # Then, from these pairs, we exclude the ones whose the last sampled states are not present in both alchemical ranges # noqa: E501 - # In this case, U^i_n, U_^j_m, g^i_n, and g_^j_m are unknown and the acceptance cannot be calculated. - swappables = [i for i in swappables if states[i[0]] in state_ranges[i[1]] and states[i[1]] in state_ranges[i[0]]] # noqa: E501 - - # Expand the definition of swappable pairs when add_swappables is specified - if add_swappables is not None: - all_paired_states = [[states[p[0]], states[p[1]]] for p in all_pairs] - for i in all_paired_states: - if i in add_swappables: - pair = all_pairs[all_paired_states.index(i)] - if pair not in swappables: - swappables.append(pair) - - if neighbor_exchange is True: - print('Note: One neighboring swap will be proposed.') - swappables = [i for i in swappables if np.abs(i[0] - i[1]) == 1] - - return swappables - - @staticmethod - def propose_swap(swappables): - """ - Proposes a swap of coordinates between replicas by drawing samples from the swappable pairs. - - Parameters - ---------- - swappables : list - A list of tuples representing the simulations that can be swapped. - - Return - ------ - swap : tuple or an empty list - A tuple of simulation indices to be swapped. If there is no swappable pair, - an empty list will be returned. - """ - try: - swap = random.choices(swappables, k=1)[0] - except IndexError: # no swappable pairs - swap = [] - - return swap - - def get_swapping_pattern(self, dhdl_files, states, weights): - """ - Generates a list (:code:`swap_pattern`) that represents how the configurations should be swapped in the - next iteration. The indices of the output list correspond to the simulation/replica indices, and the - values represent the configuration indices in the corresponding simulation/replica. For example, if the - swapping pattern is :code:`[0, 2, 1, 3]`, it means that in the next iteration, replicas 0, 1, 2, 3 should - sample configurations 0, 2, 1, 3, respectively, where configurations 0, 1, 2, 3 here are defined as whatever - configurations are in replicas 0, 1, 2, 3 in the CURRENT iteration (not iteration 0), respectively. - - Notably, when this function is called (e.g. once every iteration in an EEXE simulation), the output - list :code:`swap_pattern` is always initialized as :code:`[0, 1, 2, 3, ...]` and gets updated once every - attempted swap. This is different from the attribute :code:`configs`, which is only initialized at the - very beginning of the entire EEXE simulation (iteration 0), though :code:`configs` also gets updated with - :code:`swap_pattern` once every attempted swap in this function. - - Parameters - ---------- - dhdl_files : list - A list of DHDL files. The indicies in the DHDL filenames shouuld be in an ascending order, e.g. - :code:`[dhdl_0.xvg, dhdl_1.xvg, ..., dhdl_N.xvg]`. - states : list - A list of last sampled states (in global indices) of ALL simulaitons. :code:`states[i]=j` means that - the configuration in replica :code:`i` is at state :code:`j` at the time when the exchange is performed. - This list can be generated :obj:`.extract_final_dhdl_info`. - weights : list - A list of lists of final weights of ALL simulations. :code:`weights[i]` corresponds to the list of weights - used in replica :code:`i`. The list can be generated by :obj:`.extract_final_log_info`. - - Returns - ------- - swap_pattern : list - A list showing the configuration of replicas after swapping. - swap_list : list - A list of tuples showing the accepted swaps. - """ - swap_list = [] - if self.proposal != 'multiple': - if self.proposal == 'exhaustive': - n_ex = int(np.floor(self.n_sim / 2)) # This is the maximum, not necessarily the number that will always be reached. # noqa - n_ex_exhaustive = 0 # The actual number of swaps atttempted. - else: - n_ex = 1 # single swap or neighboring swap - else: - # multiple swaps - if self.n_ex == 'N^3': - n_ex = self.n_tot ** 3 - else: - n_ex = self.n_ex - - shifts = list(self.s * np.arange(self.n_sim)) - swap_pattern = list(range(self.n_sim)) # Can be regarded as the indices of DHDL files/configurations - state_ranges = copy.deepcopy(self.state_ranges) - states_copy = copy.deepcopy(states) # only for re-identifying swappable pairs given updated state_ranges - swappables = EnsembleEXE.identify_swappable_pairs(states, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 - - # Note that if there is only 1 swappable pair, then it will still be the only swappable pair - # after an attempted swap is accepted. Therefore, there is no need to perform multiple swaps or re-identify - # the new set of swappable pairs. In this case, we set n_ex back to 1. - if len(swappables) == 1: - if n_ex > 1: - n_ex = 1 # n_ex is set back to 1 since there is only 1 swappable pair. - - print(f"Swappable pairs: {swappables}") - for i in range(n_ex): - # Update the list of swappable pairs starting from the 2nd attempted swap for the exhaustive swap method. - if self.proposal == 'exhaustive' and i >= 1: - # Note that this should be done regardless of the acceptance/rejection of the previously drawn pairs. - # Also note that at this point, swap is still the last attempted swap. - swappables = [i for i in swappables if set(i).intersection(set(swap)) == set()] # noqa: F821 - print(f'\nRemaining swappable pairs: {swappables}') - - if len(swappables) == 0 and self.proposal == 'exhaustive': - # This should only happen when the method of exhaustive swaps is used. - if i == 0: - self.n_empty_swappable += 1 - break - else: - self.n_swap_attempts += 1 - if self.proposal == 'exhaustive': - n_ex_exhaustive += 1 - - swap = EnsembleEXE.propose_swap(swappables) - print(f'\nProposed swap: {swap}') - if swap == []: - self.n_empty_swappable += 1 - print('No swap is proposed because there is no swappable pair at all.') - break # no need to re-identify swappable pairs and draw new samples - else: - if self.verbose is True and self.proposal != 'exhaustive': - print(f'A swap ({i + 1}/{n_ex}) is proposed between the configurations of Simulation {swap[0]} (state {states[swap[0]]}) and Simulation {swap[1]} (state {states[swap[1]]}) ...') # noqa: E501 - - if self.modify_coords_fn is not None: - swap_bool = True # always accept the move - else: - # Calculate the acceptance ratio and decide whether to accept the swap. - prob_acc = self.calc_prob_acc(swap, dhdl_files, states, shifts, weights) - swap_bool = self.accept_or_reject(prob_acc) - - # Theoretically, in an EEXE simulation, we could either choose to swap configurations (via - # swapping GRO files) or replicas (via swapping MDP files). In ensemble_md package, we chose the - # former when implementing the EEXE algorithm. Specifically, in the CLI `run_EEXE`, `swap_pattern` - # is used to swap the GRO files. Therefore, when an attempted swap is accetped and `swap_pattern` - # is updated, we also need to update the variables `shifts`, `weights`, `dhdl_files`, - # `state_ranges`, `self.configs` but not anything else. Otherwise, incorrect results will be - # produced. To better understand this, one can refer to our unit test for get_swapping_pattern - # and calc_prob_acc, set checkpoints and examine why the variables should/should not be updated. - - if swap_bool is True: - swap_list.append(swap) - # The assignments need to be done at the same time in just one line. - # states[swap[0]], states[swap[1]] = states[swap[1]], states[swap[0]] - shifts[swap[0]], shifts[swap[1]] = shifts[swap[1]], shifts[swap[0]] - weights[swap[0]], weights[swap[1]] = weights[swap[1]], weights[swap[0]] - dhdl_files[swap[0]], dhdl_files[swap[1]] = dhdl_files[swap[1]], dhdl_files[swap[0]] - swap_pattern[swap[0]], swap_pattern[swap[1]] = swap_pattern[swap[1]], swap_pattern[swap[0]] - state_ranges[swap[0]], state_ranges[swap[1]] = state_ranges[swap[1]], state_ranges[swap[0]] - self.configs[swap[0]], self.configs[swap[1]] = self.configs[swap[1]], self.configs[swap[0]] - - if n_ex > 1 and self.proposal == 'multiple': # must be multiple swaps - # After state_ranges have been updated, we re-identify the swappable pairs. - # Notably, states_copy (instead of states) should be used. (They could be different.) - swappables = EnsembleEXE.identify_swappable_pairs(states_copy, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 - print(f" New swappable pairs: {swappables}") - else: - # In this case, there is no need to update the swappables - pass - - print(f' Current list of configurations: {self.configs}') - - if self.verbose is False: - print(f'\n{n_ex} swap(s) have been proposed.') - print(f'\nThe finally adopted swap pattern: {swap_pattern}') - print(f'The list of configurations sampled in each replica in the next iteration: {self.configs}') - - # Update the replica-space trajectories - for i in range(self.n_sim): - self.rep_trajs[i].append(self.configs.index(i)) - - return swap_pattern, swap_list - - def calc_prob_acc(self, swap, dhdl_files, states, shifts, weights): - """ - Calculates the acceptance ratio given the Monte Carlo scheme for swapping the simulations. - - Parameters - ---------- - swap : tuple - A tuple of indices corresponding to the simulations to be swapped. - dhdl_files : list - A list of DHDL files, e.g. :code:`dhdl_files = ['dhdl_2.xvg', 'dhdl_1.xvg', 'dhdl_0.xvg', 'dhdl_3.xvg']` - means that configurations 2, 1, 0, and 3 are now in replicas 0, 1, 2, 3. This can happen in multiple swaps - when a previous swap between configurations 0 and 2 has just been accepted. Otherwise, the list of - filenames should always be in the ascending order, e.g. :code:`['dhdl_0.xvg', 'dhdl_1.xvg', 'dhdl_2.xvg', - dhdl_3.xvg]`. - states : list - A list of last sampled states (in global indices) in the DHDL files corresponding to configurations 0, 1, - 2, ... (e.g. :code:`dhdl_0.xvg`, :code:`dhdl_1.xvg`, :code:`dhdl_2.xvg`, ...) - This list can be generated by :obj:`.extract_final_dhdl_info`. - shifts : list - A list of state shifts for converting global state indices to the local ones. Specifically, :code:`states` - substracted by :code:`shifts` should be the local state indices of the last sampled states - in :code:`dhdl_files[0]`, :code:`dhdl_files[1]`, ... (which importantly, are not necessarily - :code:`dhdl_0.xvg`, :code:`dhdl_1.xvg`, ...). If multiple swaps are not used, then - this should always be :code:`list(EEXE.s * np.arange(EEXE.n_sim))`. - weights : list - A list of lists of final weights of ALL simulations. Typiecally generated by - :obj:`.extract_final_log_info`. :code:`weights[i]` and :code:`dhdl_files[i]` should correspond to - the same configuration. - - Returns - ------- - prob_acc : float - The acceptance ratio. - """ - if self.acceptance == "same_state" or self.acceptance == "same-state": - if states[swap[0]] == states[swap[1]]: # same state, swap! - prob_acc = 1 # This must lead to an output of swap_bool = True from the function accept_or_reject - else: - prob_acc = 0 # This must lead to an output of swap_bool = False from the function accept_or_reject - - else: # i.e. metropolis-eq or metropolis, which both require the calculation of dU - # Now we calculate dU - if self.verbose is True: - print(" Proposing a move from (x^i_m, x^j_n) to (x^i_n, x^j_m) ...") - f0, f1 = dhdl_files[swap[0]], dhdl_files[swap[1]] - h0, h1 = get_headers(f0), get_headers(f1) - data_0, data_1 = ( - extract_dataframe(f0, headers=h0).iloc[-1], - extract_dataframe(f1, headers=h1).iloc[-1], - ) - - # \Delta H to all states at the last time frame - # Notably, the can be regarded as H for each state since the reference state will have a value of 0 anyway. - dhdl_0 = data_0[-self.n_sub:] - dhdl_1 = data_1[-self.n_sub:] - - # Old local index, which will only be used in "metropolis" - old_state_0 = states[swap[0]] - shifts[swap[0]] - old_state_1 = states[swap[1]] - shifts[swap[1]] - - # New local index. Note that states are global indices, so we shift them back to the local ones - new_state_0 = states[swap[1]] - shifts[swap[0]] # new state index (local index in simulation swap[0]) - new_state_1 = states[swap[0]] - shifts[swap[1]] # new state index (local index in simulation swap[1]) - - dU_0 = (dhdl_0[new_state_0] - dhdl_0[old_state_0]) / self.kT # U^{i}_{n} - U^{i}_{m}, i.e. \Delta U (kT) to the new state # noqa: E501 - dU_1 = (dhdl_1[new_state_1] - dhdl_1[old_state_1]) / self.kT # U^{j}_{m} - U^{j}_{n}, i.e. \Delta U (kT) to the new state # noqa: E501 - dU = dU_0 + dU_1 - if self.verbose is True: - print( - f" U^i_n - U^i_m = {dU_0:.2f} kT, U^j_m - U^j_n = {dU_1:.2f} kT, Total dU: {dU:.2f} kT" - ) - - if self.acceptance == "metropolis_eq" or self.acceptance == "metropolis-eq": - prob_acc = min(1, np.exp(-dU)) - else: # must be 'metropolis', which consider lambda weights as well - g0 = weights[swap[0]] - g1 = weights[swap[1]] - dg_0 = g0[new_state_0] - g0[old_state_0] # g^{i}_{n} - g^{i}_{m} - dg_1 = g1[new_state_1] - g1[old_state_1] # g^{j}_{m} - g^{j}_{n} - dg = dg_0 + dg_1 # kT - - # Note that simulations with different lambda ranges would have different references - # so g^{i}_{n} - g^{j}_{n} or g^{j}_{m} - g^{i}_{m} wouldn't not make sense. - # We therefore print g^i_n - g^i_m and g^j_m - g^j_n instead even if they are less interesting. - if self.verbose is True: - print( - f" g^i_n - g^i_m = {dg_0:.2f} kT, g^j_m - g^j_n = {dg_1:.2f} kT, Total dg: {dg:.2f} kT" - ) - - prob_acc = min(1, np.exp(-dU + dg)) - return prob_acc - - def accept_or_reject(self, prob_acc): - """ - Returns a boolean variable indiciating whether the proposed swap should be acceepted given the acceptance rate. - - Parameters - ---------- - prob_acc : float - The acceptance rate. - - Returns - ------- - swap_bool : bool - A boolean variable indicating whether the swap should be accepted. - """ - if prob_acc == 0: - swap_bool = False - self.n_rejected += 1 - if self.verbose is True: - print(" Swap rejected! ", end="", flush=True) - else: - rand = random.random() - if self.verbose is True: - print( - f" Acceptance rate: {prob_acc:.3f} / Random number drawn: {rand:.3f}" - ) - if rand < prob_acc: - swap_bool = True - if self.verbose is True: - print(" Swap accepted! ") - else: - swap_bool = False - self.n_rejected += 1 - if self.verbose is True: - print(" Swap rejected! ") - return swap_bool - - def weight_correction(self, weights, counts): - """ - Corrects the lambda weights based on the histogram counts. Namely, - :math:`g_k' = g_k + ln(N_{k-1}/N_k)`, where :math:`g_k` and :math:`g_k'` - are the lambda weight after and before the correction, respectively. - Notably, in any of the following situations, we don't do any correction. - - - Either :math:`N_{k-1}` or :math:`N_k` is 0. - - Either :math:`N_{k-1}` or :math:`N_k` is smaller than the histogram cutoff. - - Parameters - ---------- - weights : list - A list of lists of weights (of ALL simulations) to be corrected. - counts : list - A list of lists of counts (of ALL simulations). - - Return - ------ - weights : list - An updated list of lists of corected weights. - """ - if self.verbose is True: - print("\nPerforming weight correction for the lambda weights ...") - else: - print("\nPerforming weight correction for the lambda weights ...", end="") - - for i in range(len(weights)): # loop over the replicas - if self.verbose is True: - print(f" Counts of rep {i}:\t\t{counts[i]}") - print(f' Original weights of rep {i}:\t{[float(f"{k:.3f}") for k in weights[i]]}') - - for j in range(1, len(weights[i])): # loop over the alchemical states - if counts[i][j - 1] != 0 and counts[i][j - 1] != 0: - if np.min([counts[i][j - 1], counts[i][j]]) > self.N_cutoff: - weights[i][j] += np.log(counts[i][j - 1] / counts[i][j]) - - if self.verbose is True: - print(f' Corrected weights of rep {i}:\t{[float(f"{k:.3f}") for k in weights[i]]}\n') - - if self.verbose is False: - print(' Done') - - return weights - - def get_averaged_weights(self, log_files): - """ - For each replica, calculate the averaged weights (and the associated error) from the time series - of the weights since the previous update of the Wang-Landau incrementor. - - Parameters - ---------- - log_files : list - A list of file paths to GROMACS LOG files of different replicas. - - Returned - -------- - weights_avg : list - A list of lists of weights averaged since the last update of the Wang-Landau - incrementor. The length of the list should be the number of replicas. - weights_err : list - A list of lists of errors corresponding to the values in :code:`weights_avg`. - """ - for i in range(self.n_sim): - weights, _, wl_delta, _ = gmx_parser.parse_log(log_files[i]) - if self.current_wl_delta[i] == wl_delta: - self.updating_weights[i] += weights # expand the list - else: - self.current_wl_delta[i] = wl_delta - self.updating_weights[i] = weights - - # shape of self.updating_weights is (n_sim, n_points, n_states), but n_points can be different - # for different replicas, which will error out np.mean(self.updating_weights, axis=1) - weights_avg = [np.mean(self.updating_weights[i], axis=0).tolist() for i in range(self.n_sim)] - weights_err = [] - for i in range(self.n_sim): - if len(self.updating_weights[i]) == 1: # this would lead to a RunTime Warning and nan - weights_err.append([0] * self.n_sub) # in `weighted_mean``, a simple average will be returned. - else: - weights_err.append(np.std(self.updating_weights[i], axis=0, ddof=1).tolist()) - - return weights_avg, weights_err - - def prepare_weights(self, weights_avg, weights_final): - """ - Prepared weights to be combined by the function :code:`combine_weights`. - For each replica, the RMSE between the averaged weights and the final weights is calculated. If the - maximum of the RMSEs of all replicas is smaller than the cutoff specified in the input YAML file - (the parameter :code:`rmse_cutoff`), either final weights or time-averaged weights will be used - (depending on the value of the parameter :code:`w_combine`). Otherwise, :code:`None` will be returned, - which will lead to deactivation of weight combination in the CLI :code:`run_EEXE`. - - Parameters - ---------- - weights_avg : list - A list of lists of weights averaged since the last update of the Wang-Landau - incrementor. The length of the list should be the number of replicas. - weights_final : list - A list of lists of final weights of all simulations. The length of the list should - be the number of replicas. - - Returns - ------- - weights_output : list - A list of lists of weights to be combined. - """ - rmse_list = [utils.calc_rmse(weights_avg[i], weights_final[i]) for i in range(self.n_sim)] - rmse_str = ', '.join([f'{i:.2f}' for i in rmse_list]) - print(f'RMSE between the final weights and time-averaged weights for each replica: {rmse_str} kT') - if np.max(rmse_list) < self.rmse_cutoff: - # Weight combination will be activated - if self.w_combine == 'final': - weights_output = weights_final - elif self.w_combine == 'avg': - weights_output = weights_avg - else: - weights_output = None - - return weights_output - - def combine_weights(self, hist, weights, weights_err=None, print_values=True): - """ - Combine alchemical weights across multiple replicas and adjusts the histogram counts - corerspondingly. Note that if :code:`weights_err` is provided, inverse-variance weighting will be used. - Care must be taken since inverse-variance weighting can lead to slower - convergence if the provided errors are not accurate. (See :ref:`doc_w_schemes` for mor details.) - - Parameters - ---------- - hist : list - A list of lists of histogram counts of ALL simulations. - weights : list - A list of lists alchemical weights of ALL simulations. - weights_err : list, optional - A list of lists of errors corresponding to the values in :code:`weights`. - print_values : bool, optional - Whether to print the histograms and weights for each replica before and - after weight combinationfor each replica. - - Returns - ------- - hist_modified : list - A list of modified histogram counts of ALL simulations. - weights_modified : list - A list of modified Wang-Landau weights of ALL simulations. - g_vec : np.ndarray - An array of alchemical weights of the whole range of states. - """ - if print_values is True: - w = np.round(weights, decimals=3).tolist() # just for printing - print(' Original weights:') - for i in range(len(w)): - print(f' Rep {i}: {w[i]}') - print('\n Original histogram counts:') - for i in range(len(hist)): - print(f' Rep {i}: {hist[i]}') - - # Calculate adjacent weight differences and g_vec - dg_vec, N_ratio_vec = [], [] # alchemical weight differences and histogram count ratios for the whole range - dg_adjacent = [list(np.diff(weights[i])) for i in range(len(weights))] - # Suppress the specific warning here - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=RuntimeWarning) - N_ratio_adjacent = [list(np.array(hist[i][1:]) / np.array(hist[i][:-1])) for i in range(len(hist))] - - # Below we deal with the case where the sampling is poor or the WL incrementor just got updated such that - # the histogram counts are 0 for some states, in which case we simply skip histogram correction. - contains_nan = any(np.isnan(value) for sublist in N_ratio_adjacent for value in sublist) # can be caused by 0/0 # noqa: E501 - contains_inf = any(np.isinf(value) for sublist in N_ratio_adjacent for value in sublist) # can be caused by x/0, where x is a finite number # noqa: E501 - skip_hist_correction = contains_nan or contains_inf - if skip_hist_correction: - print('\n Histogram correction is skipped because the histogram counts are 0 for some states.') - - if weights_err is not None: - dg_adjacent_err = [[np.sqrt(weights_err[i][j] ** 2 + weights_err[i][j + 1] ** 2) for j in range(len(weights_err[i]) - 1)] for i in range(len(weights_err))] # noqa: E501 - - for i in range(self.n_tot - 1): - dg_list, dg_err_list, N_ratio_list = [], [], [] - for j in range(len(self.state_ranges)): - if i in self.state_ranges[j] and i + 1 in self.state_ranges[j]: - idx = self.state_ranges[j].index(i) - dg_list.append(dg_adjacent[j][idx]) - N_ratio_list.append(N_ratio_adjacent[j][idx]) - if weights_err is not None: - dg_err_list.append(dg_adjacent_err[j][idx]) - if weights_err is None: - dg_vec.append(np.mean(dg_list)) - else: - dg_vec.append(utils.weighted_mean(dg_list, dg_err_list)[0]) - N_ratio_vec.append(np.prod(N_ratio_list) ** (1 / len(N_ratio_list))) # geometric mean - dg_vec.insert(0, 0) - N_ratio_vec.insert(0, hist[0][0]) - g_vec = np.array([sum(dg_vec[:(i + 1)]) for i in range(len(dg_vec))]) - if skip_hist_correction is False: - # When skip_hist_correction is True, previous lines for calculating N_ratio_vec or N_ratio_list will - # still not error out so it's fine to not add the conditional statement like here, since we will - # have hist_modified = hist at the end anyway. However, if skip_hist_correction, things like - # int(np.nan) will lead to an error, so we put an if condition here. - N_vec = np.array([int(np.ceil(np.prod(N_ratio_vec[:(i + 1)]))) for i in range(len(N_ratio_vec))]) - - # Determine the vector of alchemical weights and histogram counts for each replica - weights_modified = np.zeros_like(weights) - for i in range(self.n_sim): - hist_modified = [] - if self.equil[i] == -1: # unequilibrated - weights_modified[i] = list(g_vec[i * self.s: i * self.s + self.n_sub] - g_vec[i * self.s: i * self.s + self.n_sub][0]) # noqa: E501 - else: - weights_modified[i] = self.equilibrated_weights[i] - if skip_hist_correction is False: - hist_modified = [list(N_vec[self.state_ranges[i]]) for i in range(self.n_sim)] - else: - hist_modified = hist - - if print_values is True: - w = np.round(weights_modified, decimals=3).tolist() # just for printing - print('\n Modified weights:') - for i in range(len(w)): - print(f' Rep {i}: {w[i]}') - if skip_hist_correction is False: - print('\n Modified histogram counts:') - for i in range(len(hist_modified)): - print(f' Rep {i}: {hist_modified[i]}') - - if self.verbose is False: - print(' DONE') - print(f'The alchemical weights of all states: \n {list(np.round(g_vec, decimals=3))}') - else: - print(f'\n The alchemical weights of all states: \n {list(np.round(g_vec, decimals=3))}') - - return hist_modified, weights_modified, g_vec - - def _run_grompp(self, n, swap_pattern): - """ - Prepares TPR files for the simulation ensemble using the GROMACS :code:`grompp` command. - - Parameters - ---------- - n : int - The iteration index (starting from 0). - swap_pattern : list - A list generated by :obj:`.get_swapping_pattern`. It represents how the replicas should be swapped. - """ - args_list = [] - for i in range(self.n_sim): - arguments = [self.gmx_executable, 'grompp'] - - # Input files - mdp = f"sim_{i}/iteration_{n}/{self.mdp.split('/')[-1]}" - if n == 0: - if isinstance(self.gro, list): - gro = f"{self.gro[i]}" - else: - gro = f"{self.gro}" - else: - gro = f"sim_{swap_pattern[i]}/iteration_{n-1}/confout.gro" # This effectively swap out GRO files - - if isinstance(self.top, list): - top = f"{self.top[i]}" - else: - top = f"{self.top}" - - # Add input file arguments - arguments.extend(["-f", mdp, "-c", gro, "-p", top]) - - # Add output file arguments - arguments.extend([ - "-o", f"sim_{i}/iteration_{n}/sys_EE.tpr", - "-po", f"sim_{i}/iteration_{n}/mdout.mdp" - ]) - - # Add additional arguments if any - if self.grompp_args is not None: - # Turn the dictionary into a list with the keys alternating with values - add_args = [elem for pair in self.grompp_args.items() for elem in pair] - arguments.extend(add_args) - - args_list.append(arguments) - - # Run the GROMACS grompp commands in parallel - returncode = None # Initialize as None for all ranks (necessary for the case when -np > n_sim, which is rare) - if rank == 0: - print('Generating TPR files ...') - if rank < self.n_sim: - returncode, stdout, stderr = utils.run_gmx_cmd(args_list[rank]) - if returncode != 0: - print(f'Error on rank {rank} (return code: {returncode}):\n{stderr}') - - # gather return codes at rank 0 - code_list = comm.gather(returncode, root=0) - - if rank == 0: - # Filter out None values which represent ranks that did not execute the command - code_list = [code for code in code_list if code is not None] - if code_list != [0] * self.n_sim: - MPI.COMM_WORLD.Abort(1) # Doesn't matter what non-zero returncode we put here as the code from GROMACS will be printed before this point anyway. # noqa: E501 - - def _run_mdrun(self, n): - """ - Executes GROMACS mdrun commands in parallel. - - Parameters - ---------- - n : int - The iteration index (starting from 0). - """ - # We will change the working directory so the mdrun command should be the same for all replicas. - arguments = [self.gmx_executable, 'mdrun'] - - # Add input file arguments - arguments.extend(['-s', 'sys_EE.tpr']) - - if self.runtime_args is not None: - # Turn the dictionary into a list with the keys alternating with values - add_args = [elem for pair in self.runtime_args.items() for elem in pair] - arguments.extend(add_args) - - # Run the GROMACS mdrun commands in parallel - returncode = None # Initialize as None for all ranks (necessary for the case when -np > n_sim, which is rare) - if rank == 0: - print('Running EXE simulations ...') - if rank < self.n_sim: - os.chdir(f'sim_{rank}/iteration_{n}') - returncode, stdout, stderr = utils.run_gmx_cmd(arguments) - if returncode != 0: - print(f'Error on rank {rank} (return code: {returncode}):\n{stderr}') - if self.rm_cpt is True: - # if the simulation went wrong, there would be no checkpoint file - try: - os.remove('state.cpt') - except Exception: - print('\n--------------------------------------------------------------------------\n') - MPI.COMM_WORLD.Abort(1) - os.chdir('../../') - - # gather return codes at rank 0 - code_list = comm.gather(returncode, root=0) - - if rank == 0: - # Filter out None values which represent ranks that did not execute the command - code_list = [code for code in code_list if code is not None] - if code_list != [0] * self.n_sim: - MPI.COMM_WORLD.Abort(1) # Doesn't matter what non-zero returncode we put here as the code from GROMACS will be printed before this point anyway. # noqa: E501 - - def run_EEXE(self, n, swap_pattern=None): - """ - Perform one iteration in the EEXE simulation, which includes generating the - TPR files using the GROMACS grompp :code:`command` and running the expanded ensemble simulations - in parallel using GROMACS :code:`mdrun` command. The GROMACS commands are launched by as subprocesses. - The function assumes that the GROMACS executable is available. - - Parameters - ---------- - n : int - The iteration index (starting from 0). - swap_pattern : list - A list generated by :obj:`.get_swapping_pattern`. It represents how the replicas should be swapped. - This parameter is not needed only if :code:`n` is 0. - """ - if rank == 0: - iter_str = f'\nIteration {n}: {self.dt * self.nst_sim * n: .1f} - {self.dt * self.nst_sim * (n + 1): .1f} ps' # noqa: E501 - print(iter_str + '\n' + '=' * (len(iter_str) - 1)) - - # 1st synchronizing point for all MPI processes: To make sure ranks other than 0 will not start executing - # _run_grompp earlier and mess up the order of printing. - comm.barrier() - - # Generating all required TPR files simultaneously, then run all simulations simultaneously. - # No synchronizing point is needed between _run_grompp and _run_mdrun, since once rank i finishes _run_grompp, - # it should run _run_mdrun in the same working directory, so there won't be any I/O error. - self._run_grompp(n, swap_pattern) - self._run_mdrun(n) - - # 2nd synchronizaing point for all MPI processes: To make sure no rank will start getting to the next - # iteration earlier than the others. For example, if rank 0 finishes the mdrun command earlier, we don't - # want it to start parsing the dhdl file (in the if condition of if rank == 0) of simulation 3 being run by - # rank 3 that has not been generated, which will lead to an I/O error. - comm.barrier() From e954af83e6375c67a90e6a81a9c5cfc73ba4f98f Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:31:25 -0600 Subject: [PATCH 03/11] Finished updating replica_exchange_EE.py --- ensemble_md/replica_exchange_ee.py | 39 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/ensemble_md/replica_exchange_ee.py b/ensemble_md/replica_exchange_ee.py index c2500ef6..e4017f1a 100644 --- a/ensemble_md/replica_exchange_ee.py +++ b/ensemble_md/replica_exchange_ee.py @@ -8,7 +8,8 @@ # # #################################################################### """ -The :obj:`.ensemble_EXE` module provides functions for setting up and ensemble of expanded ensemble. +The :obj:`.replica_exchange_EE` module provides functions for setting up and +replica exchange and expanded ensemble (REXEE) simulations. """ import os import sys @@ -105,9 +106,9 @@ def set_params(self, analysis): each of the parameters specified in the YAML file. For a full list of the parameters that can be specified in the YAML file, please refer to :ref:`doc_parameters`. - :param yaml_file: The file name of the YAML file for specifying the parameters for EEXE. + :param yaml_file: The file name of the YAML file for specifying the parameters for REXEE. :type yaml_file: str - :param analysis: Whether the instantiation of the class is for data analysis of EEXE simulations. + :param analysis: Whether the instantiation of the class is for data analysis of REXEE simulations. The default is :code:`False` :type analysis: bool @@ -292,7 +293,7 @@ def set_params(self, analysis): raise ParameterError("The number of values specified for each key in 'mdp_args' should be the same as the number of replicas.") # noqa: E501 # Step 5: Reformat the input MDP file to replace all hypens with underscores. - self.reformatted_mdp = EnsembleEXE.reformat_MDP(self.mdp) + self.reformatted_mdp = ReplicaExchangeEE.reformat_MDP(self.mdp) # Step 6: Read in parameters from the MDP template self.template = gmx_parser.MDP(self.mdp) @@ -343,7 +344,7 @@ def set_params(self, analysis): if self.template['nstexpanded'] % self.template['nstdhdl'] != 0: raise ParameterError( - 'In EEXE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 + 'In REXEE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 if self.mdp_args is not None: if 'lmc_seed' in self.mdp_args and -1 not in self.mdp_args['lmc_seed']: @@ -365,7 +366,7 @@ def set_params(self, analysis): if 'nstexpanded' in self.mdp_args and 'nstdhdl' in self.mdp_args and sum(np.array(self.mdp_args['nstexpanded']) % np.array(self.mdp_args['nstdhdl'])) != 0: # noqa: E501 raise ParameterError( - 'In EEXE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 + 'In REXEE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.') # noqa: E501 if 'pull' in self.template and self.template['pull'] == 'yes': pull_ncoords = self.template['pull_ncoords'] @@ -403,7 +404,7 @@ def set_params(self, analysis): self.n_sub = self.n_tot - self.s * (self.n_sim - 1) if self.n_sub < 1: raise ParameterError( - f"There must be at least two states for each replica (current value: {self.n_sub}). The current specified configuration (n_tot={self.n_tot}, n_sim={self.n_sim}, s={self.s}) does not work for EEXE.") # noqa: E501 + f"There must be at least two states for each replica (current value: {self.n_sub}). The current specified configuration (n_tot={self.n_tot}, n_sim={self.n_sim}, s={self.s}) does not work for REXEE.") # noqa: E501 # 7-5. A list of sets of state indices start_idx = [i * self.s for i in range(self.n_sim)] @@ -513,7 +514,7 @@ def print_params(self, params_analysis=False): print(f" - {i}: {self.mdp_args[i]}") else: print(f"MDP parameters differing across replicas: {self.mdp_args}") - print("Alchemical ranges of each replica in EEXE:") + print("Alchemical ranges of each replica in REXEE:") for i in range(self.n_sim): print(f" - Replica {i}: States {self.state_ranges[i]}") @@ -839,10 +840,10 @@ def get_swapping_pattern(self, dhdl_files, states, weights): sample configurations 0, 2, 1, 3, respectively, where configurations 0, 1, 2, 3 here are defined as whatever configurations are in replicas 0, 1, 2, 3 in the CURRENT iteration (not iteration 0), respectively. - Notably, when this function is called (e.g. once every iteration in an EEXE simulation), the output + Notably, when this function is called (e.g. once every iteration in an REXEE simulation), the output list :code:`swap_pattern` is always initialized as :code:`[0, 1, 2, 3, ...]` and gets updated once every attempted swap. This is different from the attribute :code:`configs`, which is only initialized at the - very beginning of the entire EEXE simulation (iteration 0), though :code:`configs` also gets updated with + very beginning of the entire REXEE simulation (iteration 0), though :code:`configs` also gets updated with :code:`swap_pattern` once every attempted swap in this function. Parameters @@ -883,7 +884,7 @@ def get_swapping_pattern(self, dhdl_files, states, weights): swap_pattern = list(range(self.n_sim)) # Can be regarded as the indices of DHDL files/configurations state_ranges = copy.deepcopy(self.state_ranges) states_copy = copy.deepcopy(states) # only for re-identifying swappable pairs given updated state_ranges - swappables = EnsembleEXE.identify_swappable_pairs(states, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 + swappables = ReplicaExchangeEE.identify_swappable_pairs(states, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 # Note that if there is only 1 swappable pair, then it will still be the only swappable pair # after an attempted swap is accepted. Therefore, there is no need to perform multiple swaps or re-identify @@ -911,7 +912,7 @@ def get_swapping_pattern(self, dhdl_files, states, weights): if self.proposal == 'exhaustive': n_ex_exhaustive += 1 - swap = EnsembleEXE.propose_swap(swappables) + swap = ReplicaExchangeEE.propose_swap(swappables) print(f'\nProposed swap: {swap}') if swap == []: self.n_empty_swappable += 1 @@ -928,9 +929,9 @@ def get_swapping_pattern(self, dhdl_files, states, weights): prob_acc = self.calc_prob_acc(swap, dhdl_files, states, shifts, weights) swap_bool = self.accept_or_reject(prob_acc) - # Theoretically, in an EEXE simulation, we could either choose to swap configurations (via + # Theoretically, in an REXEE simulation, we could either choose to swap configurations (via # swapping GRO files) or replicas (via swapping MDP files). In ensemble_md package, we chose the - # former when implementing the EEXE algorithm. Specifically, in the CLI `run_EEXE`, `swap_pattern` + # former when implementing the REXEE algorithm. Specifically, in the CLI `run_REXEE`, `swap_pattern` # is used to swap the GRO files. Therefore, when an attempted swap is accetped and `swap_pattern` # is updated, we also need to update the variables `shifts`, `weights`, `dhdl_files`, # `state_ranges`, `self.configs` but not anything else. Otherwise, incorrect results will be @@ -951,7 +952,7 @@ def get_swapping_pattern(self, dhdl_files, states, weights): if n_ex > 1 and self.proposal == 'multiple': # must be multiple swaps # After state_ranges have been updated, we re-identify the swappable pairs. # Notably, states_copy (instead of states) should be used. (They could be different.) - swappables = EnsembleEXE.identify_swappable_pairs(states_copy, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 + swappables = ReplicaExchangeEE.identify_swappable_pairs(states_copy, state_ranges, self.proposal == 'neighboring', self.add_swappables) # noqa: E501 print(f" New swappable pairs: {swappables}") else: # In this case, there is no need to update the swappables @@ -993,7 +994,7 @@ def calc_prob_acc(self, swap, dhdl_files, states, shifts, weights): substracted by :code:`shifts` should be the local state indices of the last sampled states in :code:`dhdl_files[0]`, :code:`dhdl_files[1]`, ... (which importantly, are not necessarily :code:`dhdl_0.xvg`, :code:`dhdl_1.xvg`, ...). If multiple swaps are not used, then - this should always be :code:`list(EEXE.s * np.arange(EEXE.n_sim))`. + this should always be :code:`list(REXEE.s * np.arange(REXEE.n_sim))`. weights : list A list of lists of final weights of ALL simulations. Typiecally generated by :obj:`.extract_final_log_info`. :code:`weights[i]` and :code:`dhdl_files[i]` should correspond to @@ -1188,7 +1189,7 @@ def prepare_weights(self, weights_avg, weights_final): maximum of the RMSEs of all replicas is smaller than the cutoff specified in the input YAML file (the parameter :code:`rmse_cutoff`), either final weights or time-averaged weights will be used (depending on the value of the parameter :code:`w_combine`). Otherwise, :code:`None` will be returned, - which will lead to deactivation of weight combination in the CLI :code:`run_EEXE`. + which will lead to deactivation of weight combination in the CLI :code:`run_REXEE`. Parameters ---------- @@ -1441,9 +1442,9 @@ def _run_mdrun(self, n): if code_list != [0] * self.n_sim: MPI.COMM_WORLD.Abort(1) # Doesn't matter what non-zero returncode we put here as the code from GROMACS will be printed before this point anyway. # noqa: E501 - def run_EEXE(self, n, swap_pattern=None): + def run_REXEE(self, n, swap_pattern=None): """ - Perform one iteration in the EEXE simulation, which includes generating the + Perform one iteration in the REXEE simulation, which includes generating the TPR files using the GROMACS grompp :code:`command` and running the expanded ensemble simulations in parallel using GROMACS :code:`mdrun` command. The GROMACS commands are launched by as subprocesses. The function assumes that the GROMACS executable is available. From bff87856eac2a9ea410742b00f4e127853a73314 Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:34:31 -0600 Subject: [PATCH 04/11] Renamed replica_exchange_ee.py to replica_exchange_EE.py --- ensemble_md/{replica_exchange_ee.py => replica_exchange_EE.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ensemble_md/{replica_exchange_ee.py => replica_exchange_EE.py} (100%) diff --git a/ensemble_md/replica_exchange_ee.py b/ensemble_md/replica_exchange_EE.py similarity index 100% rename from ensemble_md/replica_exchange_ee.py rename to ensemble_md/replica_exchange_EE.py From 605c1cc6e84c68ec209a7aff89dea7dd546fabdc Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:43:29 -0600 Subject: [PATCH 05/11] Finished updating the CLIs --- ensemble_md/cli/analyze_REXEE.py | 124 +++++++++++++-------------- ensemble_md/cli/explore_REXEE.py | 24 +++--- ensemble_md/cli/run_REXEE.py | 142 +++++++++++++++---------------- 3 files changed, 145 insertions(+), 145 deletions(-) diff --git a/ensemble_md/cli/analyze_REXEE.py b/ensemble_md/cli/analyze_REXEE.py index b694d17e..693f0065 100644 --- a/ensemble_md/cli/analyze_REXEE.py +++ b/ensemble_md/cli/analyze_REXEE.py @@ -29,7 +29,7 @@ from ensemble_md.analysis import analyze_matrix # noqa: E402 from ensemble_md.analysis import msm_analysis # noqa: E402 from ensemble_md.analysis import analyze_free_energy # noqa: E402 -from ensemble_md.ensemble_EXE import EnsembleEXE # noqa: E402 +from ensemble_md.replica_exchange_EE import ReplicaExchangeEE # noqa: E402 from ensemble_md.utils.exceptions import ParameterError # noqa: E402 @@ -41,13 +41,13 @@ def initialize(args): '--yaml', type=str, default='params.yaml', - help='The input YAML file used to run the EEXE simulation. (Default: params.yaml)') + help='The input YAML file used to run the REXEE simulation. (Default: params.yaml)') parser.add_argument('-o', '--output', type=str, - default='analyze_EEXE_log.txt', - help='The output log file that contains the analysis results of EEXE. \ - (Default: analyze_EEXE_log.txt)') + default='analyze_REXEE_log.txt', + help='The output log file that contains the analysis results of REXEE. \ + (Default: analyze_REXEE_log.txt)') parser.add_argument('-rt', '--rep_trajs', type=str, @@ -101,15 +101,15 @@ def main(): print(f'Current time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}') print(f'Command line: {" ".join(sys.argv)}') - EEXE = EnsembleEXE(args.yaml) - EEXE.print_params(params_analysis=True) + REXEE = ReplicaExchangeEE(args.yaml) + REXEE.print_params(params_analysis=True) - for i in EEXE.warnings: + for i in REXEE.warnings: print() print(f'{i}') print() - if len(EEXE.warnings) > args.maxwarn: + if len(REXEE.warnings) > args.maxwarn: raise ParameterError( f"The execution failed due to warning(s) about parameter spcificaiton. Consider setting maxwarn in the input YAML file if you want to ignore them.") # noqa: E501, F541 @@ -130,12 +130,12 @@ def main(): # 1-1. Plot the replica-sapce trajectory print('1-1. Plotting transitions between alchemical ranges ...') - dt_swap = EEXE.nst_sim * EEXE.dt # dt for swapping replicas + dt_swap = REXEE.nst_sim * REXEE.dt # dt for swapping replicas analyze_traj.plot_rep_trajs(rep_trajs, f'{args.dir}/rep_trajs.png', dt_swap) # 1-2. Plot the replica-space transition matrix print('1-2. Plotting the replica-space transition matrix (considering all continuous trajectories) ...') - counts = [analyze_traj.traj2transmtx(rep_trajs[i], EEXE.n_sim, normalize=False) for i in range(len(rep_trajs))] + counts = [analyze_traj.traj2transmtx(rep_trajs[i], REXEE.n_sim, normalize=False) for i in range(len(rep_trajs))] reps_mtx = np.sum(counts, axis=0) # First sum up the counts. This should be symmetric if n_ex=1. Otherwise it might not be. # noqa: E501 reps_mtx /= np.sum(reps_mtx, axis=1)[:, None] # and then normalize each row analyze_matrix.plot_matrix(reps_mtx, f'{args.dir}/rep_transmtx_allconfigs.png') @@ -155,16 +155,16 @@ def main(): else: # This may take a while. print('2-1. Stitching trajectories for each starting configuration from dhdl files ...') - dhdl_files = [natsort.natsorted(glob.glob(f'sim_{i}/iteration_*/*dhdl*xvg')) for i in range(EEXE.n_sim)] - shifts = np.arange(EEXE.n_sim) * EEXE.s + dhdl_files = [natsort.natsorted(glob.glob(f'sim_{i}/iteration_*/*dhdl*xvg')) for i in range(REXEE.n_sim)] + shifts = np.arange(REXEE.n_sim) * REXEE.s state_trajs = analyze_traj.stitch_time_series(dhdl_files, rep_trajs, shifts=shifts, save_npy=True, save_xvg=True) # length: the number of replicas # noqa: E501 # 2-2. Plot the state-space trajectory print('\n2-2. Plotting transitions between different alchemical states ...') - dt_traj = EEXE.dt * EEXE.template['nstdhdl'] # in ps + dt_traj = REXEE.dt * REXEE.template['nstdhdl'] # in ps analyze_traj.plot_state_trajs( state_trajs, - EEXE.state_ranges, + REXEE.state_ranges, f'{args.dir}/state_trajs.png', dt_traj ) @@ -173,10 +173,10 @@ def main(): print('\n2-3. Plotting the histograms of the state index for different trajectories ...') hist_data = analyze_traj.plot_state_hist( state_trajs, - EEXE.state_ranges, + REXEE.state_ranges, f'{args.dir}/state_hist.png' ) - rmse = analyze_traj.calculate_hist_rmse(hist_data, EEXE.state_ranges) + rmse = analyze_traj.calculate_hist_rmse(hist_data, REXEE.state_ranges) print(f'The RMSE of accumulated histogram counts of the state index: {rmse:.0f}') # 2-4. Stitch the time series of state index for different alchemical ranges @@ -186,15 +186,15 @@ def main(): else: # This may take a while. print('2-4. Stitching time series of state index for each alchemical range ...') - dhdl_files = [natsort.natsorted(glob.glob(f'sim_{i}/iteration_*/*dhdl*xvg')) for i in range(EEXE.n_sim)] - shifts = np.arange(EEXE.n_sim) * EEXE.s + dhdl_files = [natsort.natsorted(glob.glob(f'sim_{i}/iteration_*/*dhdl*xvg')) for i in range(REXEE.n_sim)] + shifts = np.arange(REXEE.n_sim) * REXEE.s state_trajs_for_sim = analyze_traj.stitch_time_series_for_sim(dhdl_files, shifts) # 2-5. Plot the time series of state index for different alchemical ranges print('\n2-5. Plotting the time series of state index for different alchemical ranges ...') analyze_traj.plot_state_trajs( state_trajs_for_sim, - EEXE.state_ranges, + REXEE.state_ranges, f'{args.dir}/state_trajs_for_sim.png', title_prefix='Alchemical range' ) @@ -203,7 +203,7 @@ def main(): print('\n2-6. Plotting the histograms of state index for different alchemical ranges') analyze_traj.plot_state_hist( state_trajs_for_sim, - EEXE.state_ranges, + REXEE.state_ranges, f'{args.dir}/state_hist_for_sim.png', prefix='Alchemical range', subplots=True @@ -212,8 +212,8 @@ def main(): # 2-7. Plot the overall state transition matrices calculated from the state-space trajectories print('\n2-7. Plotting the overall state transition matrices from different trajectories ...') mtx_list = [] - for i in range(EEXE.n_sim): - mtx = analyze_traj.traj2transmtx(state_trajs[i], EEXE.n_tot) + for i in range(REXEE.n_sim): + mtx = analyze_traj.traj2transmtx(state_trajs[i], REXEE.n_tot) mtx_list.append(mtx) analyze_matrix.plot_matrix(mtx, f'{args.dir}/traj_{i}_state_transmtx.png') @@ -223,7 +223,7 @@ def main(): spectral_gaps = [results[i][0] if None not in results else None for i in range(len(results))] eig_vals = [results[i][1] if None not in results else None for i in range(len(results))] if None not in spectral_gaps: - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): print(f' - Trajectory {i}: {spectral_gaps[i]:.3f} (λ_1: {eig_vals[i][0]:.5f}, λ_2: {eig_vals[i][1]:.5f})') # noqa: E501 print(f' - Average of the above: {np.mean(spectral_gaps):.3f} (std: {np.std(spectral_gaps, ddof=1):.3f})') @@ -233,21 +233,21 @@ def main(): if any([x is None for x in pi_list]): pass # None is in the list else: - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): print(f' - Trajectory {i}: {", ".join([f"{j:.3f}" for j in pi_list[i].reshape(-1)])}') if len({len(i) for i in pi_list}) == 1: # all lists in pi_list have the same length print(f' - Average of the above: {", ".join([f"{i:.3f}" for i in np.mean(pi_list, axis=0).reshape(-1)])}') # noqa: E501 # 2-10. Calculate the state index correlation time for each trajectory (this step is more time-consuming one) print('\n2-10. Calculating the state index correlation time ...') - tau_list = [(pymbar.timeseries.statistical_inefficiency(state_trajs[i], fast=True) - 1) / 2 * dt_traj for i in range(EEXE.n_sim)] # noqa: E501 - for i in range(EEXE.n_sim): + tau_list = [(pymbar.timeseries.statistical_inefficiency(state_trajs[i], fast=True) - 1) / 2 * dt_traj for i in range(REXEE.n_sim)] # noqa: E501 + for i in range(REXEE.n_sim): print(f' - Trajectory {i}: {tau_list[i]:.1f} ps') print(f' - Average of the above: {np.mean(tau_list):.1f} ps (std: {np.std(tau_list, ddof=1):.1f} ps)') # 2-11. Calculate transit times for each trajectory print('\n2-11. Plotting the average transit times ...') - t_0k_list, t_k0_list, t_roundtrip_list, units = analyze_traj.plot_transit_time(state_trajs, EEXE.n_tot, dt=dt_traj, folder=args.dir) # noqa: E501 + t_0k_list, t_k0_list, t_roundtrip_list, units = analyze_traj.plot_transit_time(state_trajs, REXEE.n_tot, dt=dt_traj, folder=args.dir) # noqa: E501 meta_list = [t_0k_list, t_k0_list, t_roundtrip_list] t_names = [ '\n - Average transit time from states 0 to k', @@ -267,7 +267,7 @@ def main(): if np.sum(np.isnan([np.mean(i) for i in t_list])) != 0: poor_sampling = True - if EEXE.msm is True: + if REXEE.msm is True: section_idx += 1 # Section 3. Analysis based on Markov state models @@ -275,7 +275,7 @@ def main(): # 3-1. Plot the implied timescale as a function of lag time print('\n3-1. Plotting the implied timescale as a function of lag time for all trajectories ...') - lags = np.arange(EEXE.lag_spacing, EEXE.lag_max + EEXE.lag_spacing, EEXE.lag_spacing) + lags = np.arange(REXEE.lag_spacing, REXEE.lag_max + REXEE.lag_spacing, REXEE.lag_spacing) # lags could also be None and decided automatically. Could consider using that. ts_list = msm_analysis.plot_its(state_trajs, lags, fig_name=f'{args.dir}/implied_timescales.png', dt=dt_traj, units='ps') # noqa: E501 @@ -289,7 +289,7 @@ def main(): # 3-3. Build a Bayesian MSM and perform a CK test for each trajectory to validate the models print('\n3-3. Building Bayesian MSMs for the state-space trajectory for each trajectory ...') print(' Performing a Chapman-Kolmogorov test on each trajectory ...') - models = [pyemma.msm.bayesian_markov_model(state_trajs[i], chosen_lags[i], dt_traj=f'{dt_traj} ps', show_progress=False) for i in range(EEXE.n_sim)] # noqa: E501 + models = [pyemma.msm.bayesian_markov_model(state_trajs[i], chosen_lags[i], dt_traj=f'{dt_traj} ps', show_progress=False) for i in range(REXEE.n_sim)] # noqa: E501 for i in range(len(models)): print(f' Plotting the CK-test results for trajectory {i} ...') @@ -300,7 +300,7 @@ def main(): # not be counted as involved in the transition matrix (i.e. not in the active set). To check the # active states, use models[i].active_set. If the system sampled all states frequently, # models[i].active_set should be equal to np.unique(state_trajs[i]) and both lengths should be - # EEXE.n_tot. I'm not sure why the attribute nstates_full is not always EEXE.n_tot but is less + # REXEE.n_tot. I'm not sure why the attribute nstates_full is not always REXEE.n_tot but is less # relevant here. cktest = models[i].cktest(nsets=nsets, mlags=mlags, show_progress=False) pyemma.plots.plot_cktest(cktest, dt=dt_traj, units='ps') @@ -309,7 +309,7 @@ def main(): # Additionally, check if the sampling is poor for each trajectory for i in range(len(models)): - if models[i].nstates != EEXE.n_tot: + if models[i].nstates != REXEE.n_tot: print(f' Note: The sampling of trajectory {i} was poor.') # 3-4. Plot the state transition matrices estimated with the specified lag times in MSMs @@ -318,13 +318,13 @@ def main(): mtx_list_modified = [] # just for plotting (if all trajs sampled the fulle range frequently, this will be the same as mtx_list) # noqa: E501 for i in range(len(mtx_list)): # check if each mtx in mtx_list spans the full alchemical range. (If the system did not visit - # certain states, the dimension will be less than EEXE.n_tot * EEXE.n_tot. In this case, we + # certain states, the dimension will be less than REXEE.n_tot * REXEE.n_tot. In this case, we # add rows and columns of 0. Note that the modified matrix will not be a transition matrix, # so this is only for plotting. For later analysis such as spectral gap calculation, we # will just use the unmodified matrices. - if mtx_list[i].shape != (EEXE.n_tot, EEXE.n_tot): # add rows and columns of 0 + if mtx_list[i].shape != (REXEE.n_tot, REXEE.n_tot): # add rows and columns of 0 sampled = models[i].active_set - missing = list(set(range(EEXE.n_tot)) - set(sampled)) # states not visited + missing = list(set(range(REXEE.n_tot)) - set(sampled)) # states not visited # figure out which end we should stack rows/columns to n_1 = sum(missing > max(sampled)) # add rows/columns to the end of large state indices @@ -348,7 +348,7 @@ def main(): print(' Saving transmtx.npy (plotted transition matrices)...') np.save(f'{args.dir}/transmtx.npy', mtx_list_modified) - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): analyze_matrix.plot_matrix(mtx_list[i], f'{args.dir}/traj_{i}_state_transmtx_msm.png') analyze_matrix.plot_matrix(avg_mtx, f'{args.dir}/state_transmtx_avg_msm.png') @@ -363,13 +363,13 @@ def main(): # 3-5. Calculate the spectral gap from the transition matrix of each trajectory print('\n3-5. Calculating the spectral gap of the state transition matrices obtained from MSMs ...') spectral_gaps, eig_vals = [analyze_matrix.calc_spectral_gap(mtx) for mtx in mtx_list] - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): print(f' - Trajectory {i}: {spectral_gaps[i]:.3f}') # 3-6. Calculate the stationary distribution for each trajectory print('\n3-6. Calculating the stationary distributions from the transition matrices obtained from MSMs ...') pi_list = [m.pi for m in models] - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): print(f' - Trajectory {i}: {", ".join([f"{j:.3f}" for j in pi_list[i]])}') if len({len(i) for i in pi_list}) == 1: # all lists in pi_list have the same length print(f' - Average of the above: {", ".join([f"{i:.3f}" for i in np.mean(pi_list, axis=0)])}') @@ -379,16 +379,16 @@ def main(): # note that it's not m.mfpt(min(m.active_set), max(m.active_set)) as the input to mfpt should be indices # though sometimes these two could be same. mfpt_list = [m.mfpt(0, m.nstates - 1) for m in models] - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): print(f' - Trajectory {i}: {mfpt_list[i]:.1f} ps') print(f' - Average of the above: {np.mean(mfpt_list):.1f} ps (std: {np.std(mfpt_list, ddof=1):.1f} ps)') # 3-8. Calculate the state index correlation time for each trajectory print('\n3-8. Plotting the state index correlation times for all trajectories ...') - msm_analysis.plot_acf(models, EEXE.n_tot, f'{args.dir}/state_ACF.png') + msm_analysis.plot_acf(models, REXEE.n_tot, f'{args.dir}/state_ACF.png') # Section 4 (or Section 3). Free energy calculations - if EEXE.free_energy is True: + if REXEE.free_energy is True: if poor_sampling is True: print('\nFree energy calculation is not performed since the sampling appears poor.') sys.exit() @@ -397,7 +397,7 @@ def main(): # 4-1. Subsampling the data data_list = [] # either a list of u_nk or a list of dhdl - if EEXE.df_data_type == 'u_nk': + if REXEE.df_data_type == 'u_nk': if os.path.isfile(f'{args.dir}/u_nk_data.pickle') is True: print('Loading the preprocessed data u_nk ...') with open(f'{args.dir}/u_nk_data.pickle', 'rb') as handle: @@ -409,54 +409,54 @@ def main(): data_list = pickle.load(handle) if data_list == []: - files_list = [natsort.natsorted(glob.glob(f'sim_{i}/iteration_*/*dhdl*xvg')) for i in range(EEXE.n_sim)] - data_list, t_list, g_list = analyze_free_energy.preprocess_data(files_list, EEXE.temp, EEXE.df_data_type, EEXE.df_spacing) # noqa: E501 + files_list = [natsort.natsorted(glob.glob(f'sim_{i}/iteration_*/*dhdl*xvg')) for i in range(REXEE.n_sim)] + data_list, t_list, g_list = analyze_free_energy.preprocess_data(files_list, REXEE.temp, REXEE.df_data_type, REXEE.df_spacing) # noqa: E501 - with open(f'{args.dir}/{EEXE.df_data_type}_data.pickle', 'wb') as handle: + with open(f'{args.dir}/{REXEE.df_data_type}_data.pickle', 'wb') as handle: pickle.dump(data_list, handle, protocol=pickle.HIGHEST_PROTOCOL) # 4-2. Calculate the free energy profile - f, f_err, estimators = analyze_free_energy.calculate_free_energy(data_list, EEXE.state_ranges, EEXE.df_method, EEXE.err_method, EEXE.n_bootstrap, EEXE.seed) # noqa: E501 + f, f_err, estimators = analyze_free_energy.calculate_free_energy(data_list, REXEE.state_ranges, REXEE.df_method, REXEE.err_method, REXEE.n_bootstrap, REXEE.seed) # noqa: E501 print('Plotting the full-range free energy profile ...') analyze_free_energy.plot_free_energy(f, f_err, f'{args.dir}/free_energy_profile.png') print('The full-range free energy profile averaged over all replicas:') - print(f" {', '.join(f'{f[i]:.3f} +/- {f_err[i]:.3f} kT' for i in range(EEXE.n_tot))}") + print(f" {', '.join(f'{f[i]:.3f} +/- {f_err[i]:.3f} kT' for i in range(REXEE.n_tot))}") print(f'The free energy difference between the coupled and decoupled states: {f[-1]:.3f} +/- {f_err[-1]:.3f} kT') # noqa: E501 - if EEXE.df_ref is not None: - rmse_list = analyze_free_energy.calculate_df_rmse(estimators, EEXE.df_ref, EEXE.state_ranges) - for i in range(EEXE.n_sim): - print(f'RMSE of the free energy profile for alchemical range {i} (states {EEXE.state_ranges[i][0]} to {EEXE.state_ranges[i][-1]}): {rmse_list[i]:.2f} kT') # noqa: E501 + if REXEE.df_ref is not None: + rmse_list = analyze_free_energy.calculate_df_rmse(estimators, REXEE.df_ref, REXEE.state_ranges) + for i in range(REXEE.n_sim): + print(f'RMSE of the free energy profile for alchemical range {i} (states {REXEE.state_ranges[i][0]} to {REXEE.state_ranges[i][-1]}): {rmse_list[i]:.2f} kT') # noqa: E501 # 4-3. Recalculate the free energy profile if subsampling_avg is True - if EEXE.subsampling_avg is True: + if REXEE.subsampling_avg is True: print('\nUsing averaged start index of the equilibrated data and the avearged statistic inefficiency to re-perform free energy calculations ...') # noqa: E501 t_avg = int(np.mean(t_list)) + 1 # Using the ceiling function to be a little more conservative g_avg = np.array(g_list).prod() ** (1/len(g_list)) # geometric mean print(f'Averaged start index: {t_avg}') print(f'Averaged statistical inefficiency: {g_avg:.2f}') - data_list, _, _ = analyze_free_energy.preprocess_data(files_list, EEXE.temp, EEXE.df_data_type, EEXE.df_spacing, t_avg, g_avg) # noqa: E501 - with open(f'{args.dir}/{EEXE.df_data_type}_data_avg_subsampling.pickle', 'wb') as handle: + data_list, _, _ = analyze_free_energy.preprocess_data(files_list, REXEE.temp, REXEE.df_data_type, REXEE.df_spacing, t_avg, g_avg) # noqa: E501 + with open(f'{args.dir}/{REXEE.df_data_type}_data_avg_subsampling.pickle', 'wb') as handle: pickle.dump(data_list, handle, protocol=pickle.HIGHEST_PROTOCOL) - f, f_err, estimators = analyze_free_energy.calculate_free_energy(data_list, EEXE.state_ranges, EEXE.df_method, EEXE.err_method, EEXE.n_bootstrap, EEXE.seed) # noqa: E501 + f, f_err, estimators = analyze_free_energy.calculate_free_energy(data_list, REXEE.state_ranges, REXEE.df_method, REXEE.err_method, REXEE.n_bootstrap, REXEE.seed) # noqa: E501 print('Plotting the full-range free energy profile ...') analyze_free_energy.plot_free_energy(f, f_err, f'{args.dir}/free_energy_profile_avg_subsampling.png') print('The full-range free energy profile averaged over all replicas:') - print(f" {', '.join(f'{f[i]:.3f} +/- {f_err[i]:.3f} kT' for i in range(EEXE.n_tot))}") + print(f" {', '.join(f'{f[i]:.3f} +/- {f_err[i]:.3f} kT' for i in range(REXEE.n_tot))}") print(f'The free energy difference between the coupled and decoupled states: {f[-1]:.3f} +/- {f_err[-1]:.3f} kT') # noqa: E501 - if EEXE.df_ref is not None: - rmse_list = analyze_free_energy.calculate_df_rmse(estimators, EEXE.df_ref, EEXE.state_ranges) - for i in range(EEXE.n_sim): - print(f'RMSE of the free energy profile for alchemical range {i} (states {EEXE.state_ranges[i][0]} to {EEXE.state_ranges[i][-1]}): {rmse_list[i]:.2f} kT') # noqa: E501 + if REXEE.df_ref is not None: + rmse_list = analyze_free_energy.calculate_df_rmse(estimators, REXEE.df_ref, REXEE.state_ranges) + for i in range(REXEE.n_sim): + print(f'RMSE of the free energy profile for alchemical range {i} (states {REXEE.state_ranges[i][0]} to {REXEE.state_ranges[i][-1]}): {rmse_list[i]:.2f} kT') # noqa: E501 # Section 4. Calculate the time spent in GROMACS (This could take a while.) - t_wall_tot, t_sync, _ = utils.analyze_EEXE_time() + t_wall_tot, t_sync, _ = utils.analyze_REXEE_time() print(f'\nTotal wall time GROMACS spent to finish all iterations: {utils.format_time(t_wall_tot)}') print(f'Total time spent in syncrhonizing all replicas: {utils.format_time(t_sync)}') diff --git a/ensemble_md/cli/explore_REXEE.py b/ensemble_md/cli/explore_REXEE.py index 23fe5c1c..e83334f4 100644 --- a/ensemble_md/cli/explore_REXEE.py +++ b/ensemble_md/cli/explore_REXEE.py @@ -12,23 +12,23 @@ import argparse import numpy as np import pandas as pd -from ensemble_md.ensemble_EXE import EnsembleEXE +from ensemble_md.replica_exchange_EE import ReplicaExchangeEE def initialize(args): parser = argparse.ArgumentParser( - description='This code explores the parameter space of homogenous EEXE to help you figure \ + description='This code explores the parameter space of homogenous REXEE to help you figure \ out all possible combinations of the number of replicas, the number of \ states in each replica, and the number of overlapping states, and the total number states.') parser.add_argument('-N', '--N', required=True, type=int, - help='The total number of states of the EEXE simulation.') + help='The total number of states of the REXEE simulation.') parser.add_argument('-r', '--r', type=int, - help='The number of replicas that compose the EEXE simulation.') + help='The number of replicas that compose the REXEE simulation.') parser.add_argument('-n', '--n', type=int, @@ -54,9 +54,9 @@ def initialize(args): return args_parse -def solv_EEXE_diophantine(N, constraint=False): +def solv_REXEE_diophantine(N, constraint=False): """ - Solves the general nonlinear Diophantine equation associated with the homogeneous EEXE + Solves the general nonlinear Diophantine equation associated with the homogeneous REXEE parameters. Specifically, given the total number of states :math:`N` and the number of replicas r, the states for each replica n and the state shift s can be expressed as: n = N + (r-1)(t-1), and s = 1 - t, with the range of t being either the following: @@ -66,7 +66,7 @@ def solv_EEXE_diophantine(N, constraint=False): Parameters ---------- N : int - The total number of states of the homogeneous EEXE of interesst. + The total number of states of the homogeneous REXEE of interesst. constraint : bool Whether to apply additional constraints such that n-s <= 1/2n. @@ -115,7 +115,7 @@ def estimate_swapless_rate(state_ranges, N=1000000): n = 0 # number of times of not having any swappable pairs for i in range(N): rands = [random.choice(state_ranges[i]) for i in range(len(state_ranges))] - swappables = EnsembleEXE.identify_swappable_pairs(rands, state_ranges, False) + swappables = ReplicaExchangeEE.identify_swappable_pairs(rands, state_ranges, False) if swappables == []: n += 1 @@ -125,18 +125,18 @@ def estimate_swapless_rate(state_ranges, N=1000000): def main(): - # For now, we only consider homogenous EEXE simulations + # For now, we only consider homogenous REXEE simulations args = initialize(sys.argv[1:]) - print('Exploration of the EEXE parameter space') + print('Exploration of the REXEE parameter space') print('=======================================') - print('[ EEXE parameters of interest ]') + print('[ REXEE parameters of interest ]') print('- N: The total number of states') print('- r: The number of replicas') print('- n: The number of states for each replica') print('- s: The state shift between adjacent replicas') # Enuerate all possible combinations of (N, r, n, s) even if any of r, n, s is given - it's easy/fast anyway. - soln_all = solv_EEXE_diophantine(args.N, constraint=args.cnst) + soln_all = solv_REXEE_diophantine(args.N, constraint=args.cnst) # Now filter the solutions if args.r is not None: diff --git a/ensemble_md/cli/run_REXEE.py b/ensemble_md/cli/run_REXEE.py index cfc69c82..239bc910 100644 --- a/ensemble_md/cli/run_REXEE.py +++ b/ensemble_md/cli/run_REXEE.py @@ -19,7 +19,7 @@ from datetime import datetime from ensemble_md.utils import utils -from ensemble_md.ensemble_EXE import EnsembleEXE +from ensemble_md.replica_exchange_EE import ReplicaExchangeEE def initialize(args): @@ -29,7 +29,7 @@ def initialize(args): '--yaml', type=str, default='params.yaml', - help='The input YAML file that contains EEXE parameters. (Default: params.yaml)') + help='The input YAML file that contains REXEE parameters. (Default: params.yaml)') parser.add_argument('-c', '--ckpt', type=str, @@ -46,9 +46,9 @@ def initialize(args): parser.add_argument('-o', '--output', type=str, - default='run_EEXE_log.txt', + default='run_REXEE_log.txt', help='The output file for logging how replicas interact with each other. \ - (Default: run_EEXE_log.txt)') + (Default: run_REXEE_log.txt)') parser.add_argument('-m', '--maxwarn', type=int, @@ -65,7 +65,7 @@ def main(): sys.stdout = utils.Logger(logfile=args.output) sys.stderr = utils.Logger(logfile=args.output) - # Step 1: Set up MPI rank and instantiate EnsembleEXE to set up EEXE parameters + # Step 1: Set up MPI rank and instantiate ReplicaExchangeEE to set up REXEE parameters comm = MPI.COMM_WORLD rank = comm.Get_rank() # Note that this is a GLOBAL variable @@ -73,17 +73,17 @@ def main(): print(f'Current time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}') print(f'Command line: {" ".join(sys.argv)}\n') - EEXE = EnsembleEXE(args.yaml) + REXEE = ReplicaExchangeEE(args.yaml) if rank == 0: # Print out simulation parameters - EEXE.print_params() + REXEE.print_params() # Print out warnings and fail if needed - for i in EEXE.warnings: + for i in REXEE.warnings: print(f'\n{i}\n') - if len(EEXE.warnings) > args.maxwarn: + if len(REXEE.warnings) > args.maxwarn: print(f"The execution failed due to warning(s) about parameter spcificaiton. Check the warnings, or consider setting maxwarn in the input YAML file if you find them harmless.") # noqa: E501, F541 comm.Abort(101) @@ -93,28 +93,28 @@ def main(): # 2-1. Set up input files for all simulations if rank == 0: - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): os.mkdir(f'sim_{i}') os.mkdir(f'sim_{i}/iteration_0') - MDP = EEXE.initialize_MDP(i) + MDP = REXEE.initialize_MDP(i) MDP.write(f"sim_{i}/iteration_0/expanded.mdp", skipempty=True) # 2-2. Run the first ensemble of simulations - EEXE.run_EEXE(0) + REXEE.run_REXEE(0) else: if rank == 0: - # If there is a checkpoint file, we see the execution as an extension of an EEXE simulation + # If there is a checkpoint file, we see the execution as an extension of an REXEE simulation ckpt_data = np.load(args.ckpt) start_idx = len(ckpt_data[0]) # The length should be the same for the same axis - print(f'\nGetting prepared to extend the EEXE simulation from iteration {start_idx} ...') + print(f'\nGetting prepared to extend the REXEE simulation from iteration {start_idx} ...') - if start_idx == EEXE.n_iter: + if start_idx == REXEE.n_iter: print('Extension aborted: The expected number of iterations have been completed!') MPI.COMM_WORLD.Abort(1) else: print('Deleting data generated after the checkpoint ...') - for i in range(EEXE.n_sim): + for i in range(REXEE.n_sim): n_finished = len(next(os.walk(f'sim_{i}'))[1]) # number of finished iterations for j in range(start_idx, n_finished): print(f' Deleting the folder sim_{i}/iteration_{j}') @@ -123,18 +123,18 @@ def main(): # Read g_vecs.npy and rep_trajs.npy so that new data can be appended, if any. # Note that these two arrays are created in rank 0 and should always be operated in rank 0, # or broadcasting is required. - EEXE.rep_trajs = [list(i) for i in ckpt_data] + REXEE.rep_trajs = [list(i) for i in ckpt_data] if os.path.isfile(args.g_vecs) is True: - EEXE.g_vecs = [list(i) for i in np.load(args.g_vecs)] + REXEE.g_vecs = [list(i) for i in np.load(args.g_vecs)] else: start_idx = None start_idx = comm.bcast(start_idx, root=0) # so that all the ranks are aware of start_idx # 2-3. Get the reference distance for the distance restraint specified in the pull code, if any. - EEXE.get_ref_dist() + REXEE.get_ref_dist() - for i in range(start_idx, EEXE.n_iter): + for i in range(start_idx, REXEE.n_iter): # For a large code block like below executed on rank 0, we try to catch any exception and abort the simulation. # So if there is bug, the execution will be terminated and no computation time will be wasted. try: @@ -143,10 +143,10 @@ def main(): # 3-1. For all the replica simulations, # (1) Find the last sampled state and the corresponding lambda values from the DHDL files. # (2) Find the final Wang-Landau incrementors and weights from the LOG files. - dhdl_files = [f'sim_{j}/iteration_{i - 1}/dhdl.xvg' for j in range(EEXE.n_sim)] - log_files = [f'sim_{j}/iteration_{i - 1}/md.log' for j in range(EEXE.n_sim)] - states_ = EEXE.extract_final_dhdl_info(dhdl_files) - wl_delta, weights_, counts_ = EEXE.extract_final_log_info(log_files) + dhdl_files = [f'sim_{j}/iteration_{i - 1}/dhdl.xvg' for j in range(REXEE.n_sim)] + log_files = [f'sim_{j}/iteration_{i - 1}/md.log' for j in range(REXEE.n_sim)] + states_ = REXEE.extract_final_dhdl_info(dhdl_files) + wl_delta, weights_, counts_ = REXEE.extract_final_log_info(log_files) print() # 3-2. Identify swappable pairs, propose swap(s), calculate P_acc, and accept/reject swap(s) @@ -159,68 +159,68 @@ def main(): states = copy.deepcopy(states_) weights = copy.deepcopy(weights_) counts = copy.deepcopy(counts_) - swap_pattern, swap_list = EEXE.get_swapping_pattern(dhdl_files, states_, weights_) # swap_list will only be used for modify_coords # noqa: E501 + swap_pattern, swap_list = REXEE.get_swapping_pattern(dhdl_files, states_, weights_) # swap_list will only be used for modify_coords # noqa: E501 # 3-3. Perform weight correction/weight combination - if wl_delta != [None for i in range(EEXE.n_sim)]: # weight-updating + if wl_delta != [None for i in range(REXEE.n_sim)]: # weight-updating print(f'\nCurrent Wang-Landau incrementors: {wl_delta}\n') # (1) First we prepare the weights to be combined. # Note that although averaged weights are sometimes used for weight correction/weight combination, # the final weights are always used for calculating the acceptance ratio. - if EEXE.N_cutoff != -1 or EEXE.w_combine is not None: + if REXEE.N_cutoff != -1 or REXEE.w_combine is not None: # Only when weight correction/weight combination is needed. - weights_avg, weights_err = EEXE.get_averaged_weights(log_files) - weights_input = EEXE.prepare_weights(weights_avg, weights) # weights_input is for weight combination # noqa: E501 + weights_avg, weights_err = REXEE.get_averaged_weights(log_files) + weights_input = REXEE.prepare_weights(weights_avg, weights) # weights_input is for weight combination # noqa: E501 # (2) Now we perform weight correction/weight combination. # The product of this step should always be named as "weights" to be used in update_MDP - if EEXE.N_cutoff != -1 and EEXE.w_combine is not None: + if REXEE.N_cutoff != -1 and REXEE.w_combine is not None: # perform both if weights_input is None: # Then only weight correction will be performed print('Note: Weight combination is deactivated because the weights are too noisy.') - weights = EEXE.weight_correction(weights, counts) - _ = EEXE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combiend weights # noqa: E501 + weights = REXEE.weight_correction(weights, counts) + _ = REXEE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combiend weights # noqa: E501 else: - weights_preprocessed = EEXE.weight_correction(weights_input, counts) - if EEXE.verbose is True: + weights_preprocessed = REXEE.weight_correction(weights_input, counts) + if REXEE.verbose is True: print('Performing weight combination ...') else: print('Performing weight combination ...', end='') - counts, weights, g_vec = EEXE.combine_weights(counts_, weights_preprocessed) # inverse-variance weighting seems worse # noqa: E501 - EEXE.g_vecs.append(g_vec) - elif EEXE.N_cutoff == -1 and EEXE.w_combine is not None: + counts, weights, g_vec = REXEE.combine_weights(counts_, weights_preprocessed) # inverse-variance weighting seems worse # noqa: E501 + REXEE.g_vecs.append(g_vec) + elif REXEE.N_cutoff == -1 and REXEE.w_combine is not None: # only perform weight combination print('Note: No weight correction will be performed.') if weights_input is None: print('Note: Weight combination is deactivated because the weights are too noisy.') - _ = EEXE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combined weights # noqa: E501 + _ = REXEE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combined weights # noqa: E501 else: - if EEXE.verbose is True: + if REXEE.verbose is True: print('Performing weight combination ...') else: print('Performing weight combination ...', end='') - counts, weights, g_vec = EEXE.combine_weights(counts_, weights_input) # inverse-variance weighting seems worse # noqa: E501 - EEXE.g_vecs.append(g_vec) - elif EEXE.N_cutoff != -1 and EEXE.w_combine is None: + counts, weights, g_vec = REXEE.combine_weights(counts_, weights_input) # inverse-variance weighting seems worse # noqa: E501 + REXEE.g_vecs.append(g_vec) + elif REXEE.N_cutoff != -1 and REXEE.w_combine is None: # only perform weight correction print('Note: No weight combination will be performed.') - weights = EEXE.histogram_correction(weights_input, counts) - _ = EEXE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combined weights # noqa: E501 + weights = REXEE.histogram_correction(weights_input, counts) + _ = REXEE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combined weights # noqa: E501 else: print('Note: No weight correction will be performed.') print('Note: No weight combination will be performed.') - _ = EEXE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combiend weights # noqa: E501 + _ = REXEE.combine_weights(counts_, weights, print_values=False)[1] # just to print the combiend weights # noqa: E501 # 3-5. Modify the MDP files and swap out the GRO files (if needed) # Here we keep the lambda range set in mdp the same across different iterations in the same folder but swap out the gro file # noqa: E501 # Note we use states (copy of states_) instead of states_ in update_MDP. - for j in list(range(EEXE.n_sim)): + for j in list(range(REXEE.n_sim)): os.mkdir(f'sim_{j}/iteration_{i}') - MDP = EEXE.update_MDP(f"sim_{j}/iteration_{i - 1}/expanded.mdp", j, i, states, wl_delta, weights, counts) # modify with a new template # noqa: E501 + MDP = REXEE.update_MDP(f"sim_{j}/iteration_{i - 1}/expanded.mdp", j, i, states, wl_delta, weights, counts) # modify with a new template # noqa: E501 MDP.write(f"sim_{j}/iteration_{i}/expanded.mdp", skipempty=True) - # In run_EEXE(i, swap_pattern), where the tpr files will be generated, we use the top file at the + # In run_REXEE(i, swap_pattern), where the tpr files will be generated, we use the top file at the # level of the simulation (the file that will be shared by all simulations). For the gro file, we # pass swap_pattern to the function to figure it out internally. else: @@ -231,13 +231,13 @@ def main(): print(f'An error occurred on rank 0:\n{traceback.format_exc()}') MPI.COMM_WORLD.Abort(1) - if -1 not in EEXE.equil and 0 not in EEXE.equil: + if -1 not in REXEE.equil and 0 not in REXEE.equil: # This is the case where the weights are equilibrated in a weight-updating simulation. - # As a remidner, EEXE.equil should be a list of 0 after extract_final_log_info in a + # As a remidner, REXEE.equil should be a list of 0 after extract_final_log_info in a # fixed-weight simulation, and a list of -1 for a weight-updating simulation with unequilibrated weights. print('\nSimulation terminated: The weights have been equilibrated for all replicas.') # this will only be printed in rank 0 # noqa: E501 - # Note that EEXE.equil is avaiable for all ranks but only updated in rank 0. So the if condition here + # Note that REXEE.equil is avaiable for all ranks but only updated in rank 0. So the if condition here # can only be satisfied in rank 0. We broadcast exit_loop to all ranks so that all ranks can exit the # simulation at the same time, if the weights get equilibrated. exit_loop = True @@ -257,7 +257,7 @@ def main(): if len(swap_list) == 0: pass else: - if EEXE.modify_coords_fn is not None: + if REXEE.modify_coords_fn is not None: try: if rank == 0: for j in range(len(swap_list)): @@ -274,53 +274,53 @@ def main(): os.rename(gro_2, gro_2_backup) # Here we input gro_1_backup and gro_2_backup and modify_coords_fn will save the modified gro files as gro_1 and gro_2 # noqa: E501 - EEXE.modify_coords_fn(gro_1_backup, gro_2_backup) # the order should not matter + REXEE.modify_coords_fn(gro_1_backup, gro_2_backup) # the order should not matter except Exception: print('\n--------------------------------------------------------------------------\n') print(f'\nAn error occurred on rank 0:\n{traceback.format_exc()}') MPI.COMM_WORLD.Abort(1) # 4-2. Run another ensemble of simulations - EEXE.run_EEXE(i, swap_pattern) + REXEE.run_REXEE(i, swap_pattern) # 4-3. Save data if rank == 0: - if (i + 1) % EEXE.n_ckpt == 0: - if len(EEXE.g_vecs) != 0: + if (i + 1) % REXEE.n_ckpt == 0: + if len(REXEE.g_vecs) != 0: # Save g_vec as a function of time if weight combination was used. - np.save('g_vecs.npy', EEXE.g_vecs) + np.save('g_vecs.npy', REXEE.g_vecs) print('\n----- Saving .npy files to checkpoint the simulation ---') - np.save('rep_trajs.npy', EEXE.rep_trajs) + np.save('rep_trajs.npy', REXEE.rep_trajs) # Save the npy files at the end of the simulation anyway. if rank == 0: - if len(EEXE.g_vecs) != 0: # The length will be 0 only if there is no weight combination. - np.save('g_vecs.npy', EEXE.g_vecs) - np.save('rep_trajs.npy', EEXE.rep_trajs) + if len(REXEE.g_vecs) != 0: # The length will be 0 only if there is no weight combination. + np.save('g_vecs.npy', REXEE.g_vecs) + np.save('rep_trajs.npy', REXEE.rep_trajs) # Step 5: Write a summary for the simulation ensemble if rank == 0: print('\nSummary of the simulation ensemble') print('==================================') print('Simulation status:') - for i in range(EEXE.n_sim): - if EEXE.fixed_weights is True: + for i in range(REXEE.n_sim): + if REXEE.fixed_weights is True: print(f'- Rep {i}: The weights were fixed throughout the simulation.') - elif EEXE.equil[i] == -1: + elif REXEE.equil[i] == -1: print(f' - Rep {i}: The weights have not been equilibrated.') else: - idx = int(np.floor(EEXE.equil[i] / (EEXE.dt * EEXE.nst_sim))) - if EEXE.equil[i] > 1000: + idx = int(np.floor(REXEE.equil[i] / (REXEE.dt * REXEE.nst_sim))) + if REXEE.equil[i] > 1000: units = 'ns' - EEXE.equil[i] /= 1000 + REXEE.equil[i] /= 1000 else: units = 'ps' - print(f' - Rep {i}: The weights have been equilibrated at {EEXE.equil[i]:.2f} {units} (iteration {idx}).') # noqa: E501 + print(f' - Rep {i}: The weights have been equilibrated at {REXEE.equil[i]:.2f} {units} (iteration {idx}).') # noqa: E501 - print(f'\n{EEXE.n_empty_swappable} out of {EEXE.n_iter}, or {EEXE.n_empty_swappable / EEXE.n_iter * 100:.1f}% iterations had an empty list of swappable pairs.') # noqa: E501 - if EEXE.n_swap_attempts != 0: - print(f'{EEXE.n_rejected} out of {EEXE.n_swap_attempts}, or {EEXE.n_rejected / EEXE.n_swap_attempts * 100:.1f}% of attempted exchanges were rejected.') # noqa: E501 + print(f'\n{REXEE.n_empty_swappable} out of {REXEE.n_iter}, or {REXEE.n_empty_swappable / REXEE.n_iter * 100:.1f}% iterations had an empty list of swappable pairs.') # noqa: E501 + if REXEE.n_swap_attempts != 0: + print(f'{REXEE.n_rejected} out of {REXEE.n_swap_attempts}, or {REXEE.n_rejected / REXEE.n_swap_attempts * 100:.1f}% of attempted exchanges were rejected.') # noqa: E501 print(f'\nTime elapsed: {utils.format_time(time.time() - t1)}') From 9d56d387c494607212400661acd197ba46756d74 Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:45:34 -0600 Subject: [PATCH 06/11] Finished updating utils --- ensemble_md/utils/gmx_parser.py | 6 +++--- ensemble_md/utils/utils.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ensemble_md/utils/gmx_parser.py b/ensemble_md/utils/gmx_parser.py index 0e075bb6..2fd31df8 100644 --- a/ensemble_md/utils/gmx_parser.py +++ b/ensemble_md/utils/gmx_parser.py @@ -24,7 +24,7 @@ def parse_log(log_file): """ This function parses a log file generated by expanded ensemble and provides - important information, especially for running new iterations in EEXE. + important information, especially for running new iterations in REXEE. Typically, there are three types of log files from an expanded ensemble simulation: - **Case 1**: The weights are still updating in the simulation and have never been equilibrated. @@ -58,7 +58,7 @@ def parse_log(log_file): incrementor up to equilibration will be returned. - In Case 3, the returned list will only have one list inside, which is the list of the final weights. - That is, for all cases, :code:`weights[-1]` will be the final weights, which are useful in EEXE. + That is, for all cases, :code:`weights[-1]` will be the final weights, which are useful in REXEE. counts : list The final histogram counts. wl_delta : float @@ -390,7 +390,7 @@ def compare_MDPs(mdp_list, print_diff=False): Given a list of MDP files, identify the parameters for which not all MDP files have the same values. Note that this function is not aware of the default values of GROMACS parameters. (Currently, this function is not used in the - workflow adopted in :code:`run_EEXE.py` but it might be useful in some places, + workflow adopted in :code:`run_REXEE.py` but it might be useful in some places, so we decided to keep it.) Parameters diff --git a/ensemble_md/utils/utils.py b/ensemble_md/utils/utils.py index 2b79a157..c595c403 100644 --- a/ensemble_md/utils/utils.py +++ b/ensemble_md/utils/utils.py @@ -289,9 +289,9 @@ def get_time_metrics(log): return t_metrics -def analyze_EEXE_time(log_files=None): +def analyze_REXEE_time(log_files=None): """ - Perform simple data analysis on the wall times and performances of all iterations of an EEXE simulation. + Perform simple data analysis on the wall times and performances of all iterations of an REXEE simulation. Parameters ---------- @@ -301,7 +301,7 @@ def analyze_EEXE_time(log_files=None): Returns ------- t_wall_tot : float - The total wall time GROMACS spent to finish all iterations for the EEXE simulation. + The total wall time GROMACS spent to finish all iterations for the REXEE simulation. t_sync : float The total time spent in synchronizing all replicas, which is the sum of the differences between the longest and the shortest time elapsed to finish a iteration. From a6056b76bd813fb814569a76158ffa0388879a7a Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:49:49 -0600 Subject: [PATCH 07/11] Finished updating analyze_free_energy.py, analyze_matrix.py and analyze_traj.py --- ensemble_md/analysis/analyze_free_energy.py | 10 +++++----- ensemble_md/analysis/analyze_matrix.py | 4 ++-- ensemble_md/analysis/analyze_traj.py | 20 ++++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ensemble_md/analysis/analyze_free_energy.py b/ensemble_md/analysis/analyze_free_energy.py index 715aa843..3c033799 100644 --- a/ensemble_md/analysis/analyze_free_energy.py +++ b/ensemble_md/analysis/analyze_free_energy.py @@ -8,7 +8,7 @@ # # #################################################################### """ -The :obj:`.analyze_free_energy` module provides functions for performing free energy calculations for EEXE simulations. +The :obj:`.analyze_free_energy` module provides functions for performing free energy calculations for REXEE simulations. """ import alchemlyb import numpy as np @@ -26,7 +26,7 @@ def preprocess_data(files_list, temp, data_type, spacing=1, t=None, g=None): """ - This function preprocesses :math:`u_{nk}`/:math:`dH/dλ` data for all replicas in an EEXE simulation. + This function preprocesses :math:`u_{nk}`/:math:`dH/dλ` data for all replicas in an REXEE simulation. For each replica, it reads in :math:`u_{nk}`/:math:`dH/dλ` data from all iterations, concatenate them, remove the equilibrium region and and decorrelate the concatenated data. Notably, the data preprocessing protocol is basically the same as the one adopted in @@ -116,7 +116,7 @@ def _apply_estimators(data, df_method="MBAR"): Parameters ---------- data : pd.Dataframe - A list of dHdl or u_nk dataframes obtained from all replicas of the EEXE simulation of interest. + A list of dHdl or u_nk dataframes obtained from all replicas of the REXEE simulation of interest. Preferrably, the dHdl or u_nk data should be preprocessed by the function proprocess_data. df_method : str The selected free energy estimator. Options include "MBAR", "BAR" and "TI". @@ -222,13 +222,13 @@ def _calculate_weighted_df(df_adjacent, df_err_adjacent, state_ranges, propagate def calculate_free_energy(data, state_ranges, df_method="MBAR", err_method='propagate', n_bootstrap=None, seed=None): """ Caculates the averaged free energy profile with the chosen method given dHdl or u_nk data obtained from - all replicas of the EEXE simulation of interest. Available methods include TI, BAR, and MBAR. TI + all replicas of the REXEE simulation of interest. Available methods include TI, BAR, and MBAR. TI requires dHdl data while the other two require u_nk data. Parameters ---------- data : pd.Dataframe - A list of dHdl or u_nk dataframes obtained from all replicas of the EEXE simulation of interest. + A list of dHdl or u_nk dataframes obtained from all replicas of the REXEE simulation of interest. Preferrably, the dHdl or u_nk data should be preprocessed by the function proprocess_data. state_ranges : list A list of lists of intergers that represents the alchemical states that can be sampled by different replicas. diff --git a/ensemble_md/analysis/analyze_matrix.py b/ensemble_md/analysis/analyze_matrix.py index 13c4d308..75fe0f00 100644 --- a/ensemble_md/analysis/analyze_matrix.py +++ b/ensemble_md/analysis/analyze_matrix.py @@ -8,7 +8,7 @@ # # #################################################################### """ -The :obj:`.analyze_matrix` module provides methods for analyzing matrices obtained from EEXE. +The :obj:`.analyze_matrix` module provides methods for analyzing matrices obtained from REXEE. """ import numpy as np import seaborn as sns @@ -175,7 +175,7 @@ def split_transmtx(trans_mtx, n_sim, n_sub): trans_mtx : np.ndarray The input state transition matrix to split n_sim : int - The number of replicas in EEXE. + The number of replicas in REXEE. n_sub : int The number of states for each replica. diff --git a/ensemble_md/analysis/analyze_traj.py b/ensemble_md/analysis/analyze_traj.py index c7fb97e1..c3300ab6 100644 --- a/ensemble_md/analysis/analyze_traj.py +++ b/ensemble_md/analysis/analyze_traj.py @@ -8,7 +8,7 @@ # # #################################################################### """ -The :obj:`.analyze_traj` module provides methods for analyzing trajectories in EEXE. +The :obj:`.analyze_traj` module provides methods for analyzing trajectories in REXEE. """ import numpy as np import matplotlib.pyplot as plt @@ -256,7 +256,7 @@ def traj2transmtx(traj, N, normalize=True): """ Computes the transition matrix given a trajectory. For example, if a state-space trajectory from a EXE or HREX simulation given, a state transition matrix is returned. - If a trajectory showing transitions between replicas in a EEXE simulation is given, + If a trajectory showing transitions between replicas in a REXEE simulation is given, a replica transition matrix is returned. Parameters @@ -444,7 +444,7 @@ def plot_state_hist(trajs, state_ranges, fig_name, stack=True, figsize=None, pre A list of state index time series either from different continuous trajectories or from different alchemical ranges (i.e. from different simulation folders). state_ranges : list - A list of lists of state indices. (Like the attribute :code:`state_ranges` in :obj:`.EnsembleEXE`.) + A list of lists of state indices. (Like the attribute :code:`state_ranges` in :obj:`.ReplicaExchangeEE`.) fig_name : str The file name of the png file to be saved (with the extension). stack : bool @@ -573,7 +573,7 @@ def calculate_hist_rmse(hist_data, state_ranges): hist_data : list The histogram data of the state index for each trajectory. state_ranges : list - A list of lists of state indices. (Like the attribute :code:`state_ranges` in :obj:`.EnsembleEXE`.) + A list of lists of state indices. (Like the attribute :code:`state_ranges` in :obj:`.ReplicaExchangeEE`.) Returns ------- @@ -770,7 +770,7 @@ def plot_g_vecs(g_vecs, refs=None, refs_err=None, plot_rmse=True): g_vecs : np.array The alchemical weights of all states as a function of iteration index. The shape should be (n_iterations, n_states). Such an array can be directly read from :code:`g_vecs.npy` - saved by :code:`run_EEXE`. + saved by :code:`run_REXEE`. refs : np.array The reference values of the alchemical weights. refs_err : list or np.array @@ -822,16 +822,16 @@ def plot_g_vecs(g_vecs, refs=None, refs_err=None, plot_rmse=True): plt.savefig('g_vecs_rmse.png', dpi=600) -def get_swaps(EEXE_log='run_EEXE_log.txt'): +def get_swaps(REXEE_log='run_REXEE_log.txt'): """ For each replica, identifies the states where exchanges were proposed and accepted. (Todo: We should be able to only use :code:`rep_trajs.npy` and :code:`state_trajs.npy` - instead of parsing the EEXE log file to reach the same goal.) + instead of parsing the REXEE log file to reach the same goal.) Parameters ---------- - EEXE_log : str - The output log file of the EEXE simulation. + REXEE_log : str + The output log file of the REXEE simulation. Returns ------- @@ -846,7 +846,7 @@ def get_swaps(EEXE_log='run_EEXE_log.txt'): keys being the global state indices and values being the number of accepted swaps that occurred in the state indicated by the key. """ - f = open(EEXE_log, 'r') + f = open(REXEE_log, 'r') lines = f.readlines() f.close() From 1b2796785b23e33818a766a12b6d1be066eda567 Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 16:57:20 -0600 Subject: [PATCH 08/11] Renamed test_ensemble_EXE.py to test_replica_exchange_EE.py and finished updating unit tests --- ...ble_EXE.py => test_replica_exchange_EE.py} | 374 +++++++++--------- 1 file changed, 187 insertions(+), 187 deletions(-) rename ensemble_md/tests/{test_ensemble_EXE.py => test_replica_exchange_EE.py} (69%) diff --git a/ensemble_md/tests/test_ensemble_EXE.py b/ensemble_md/tests/test_replica_exchange_EE.py similarity index 69% rename from ensemble_md/tests/test_ensemble_EXE.py rename to ensemble_md/tests/test_replica_exchange_EE.py index a832515b..e41678aa 100644 --- a/ensemble_md/tests/test_ensemble_EXE.py +++ b/ensemble_md/tests/test_replica_exchange_EE.py @@ -8,7 +8,7 @@ # # #################################################################### """ -Unit tests for the module ensemble_EXE.py. +Unit tests for the module replica_exchange_EE.py. """ import os import sys @@ -19,7 +19,7 @@ import numpy as np import ensemble_md from ensemble_md.utils import gmx_parser -from ensemble_md.ensemble_EXE import EnsembleEXE +from ensemble_md.replica_exchange_EE import ReplicaExchangeEE from ensemble_md.utils.exceptions import ParameterError current_path = os.path.dirname(os.path.abspath(__file__)) @@ -29,9 +29,9 @@ @pytest.fixture def params_dict(): """ - Generates a dictionary containing the required EEXE parameters. + Generates a dictionary containing the required REXEE parameters. """ - EEXE_dict = { + REXEE_dict = { 'gmx_executable': 'gmx', 'gro': 'ensemble_md/tests/data/sys.gro', 'top': 'ensemble_md/tests/data/sys.top', @@ -40,45 +40,45 @@ def params_dict(): 'n_iter': 10, 's': 1, } - yield EEXE_dict + yield REXEE_dict # Remove the file after the unit test is done. if os.path.isfile('params.yaml') is True: os.remove('params.yaml') -def get_EEXE_instance(input_dict, yml_file='params.yaml'): +def get_REXEE_instance(input_dict, yml_file='params.yaml'): """ - Saves a dictionary as a yaml file and use it to instantiate the EnsembleEXE class. + Saves a dictionary as a yaml file and use it to instantiate the ReplicaExchangeEE class. """ with open(yml_file, 'w') as f: yaml.dump(input_dict, f) - EEXE = EnsembleEXE(yml_file) - return EEXE + REXEE = ReplicaExchangeEE(yml_file) + return REXEE -def check_param_error(EEXE_dict, param, match, wrong_val='cool', right_val=None): +def check_param_error(REXEE_dict, param, match, wrong_val='cool', right_val=None): """ Test if ParameterError is raised if a parameter is not well-defined. """ - EEXE_dict[param] = wrong_val + REXEE_dict[param] = wrong_val with pytest.raises(ParameterError, match=match): - get_EEXE_instance(EEXE_dict) - EEXE_dict[param] = right_val # so that EEXE_dict can be read without failing in later assertions + get_REXEE_instance(REXEE_dict) + REXEE_dict[param] = right_val # so that REXEE_dict can be read without failing in later assertions - return EEXE_dict + return REXEE_dict -class Test_EnsembleEXE: +class Test_ReplicaExchangeEE: def test_init(self, params_dict): - EEXE = get_EEXE_instance(params_dict) - assert EEXE.yaml == 'params.yaml' + REXEE = get_REXEE_instance(params_dict) + assert REXEE.yaml == 'params.yaml' def test_set_params_error(self, params_dict): # 1. Required parameters del params_dict['n_sim'] with pytest.raises(ParameterError, match="Required parameter 'n_sim' not specified in params.yaml."): - get_EEXE_instance(params_dict) + get_REXEE_instance(params_dict) params_dict['n_sim'] = 4 # so params_dict can be read without failing in the assertions below # 2. Available options @@ -114,7 +114,7 @@ def test_set_params_error(self, params_dict): params_dict['mdp'] = 'ensemble_md/tests/data/expanded_test.mdp' params_dict['nst_sim'] = 100 with pytest.raises(ParameterError, match='The parameter "nstlog" must be a factor of the parameter "nst_sim" specified in the YAML file.'): # noqa: E501 - get_EEXE_instance(params_dict) + get_REXEE_instance(params_dict) params_dict['nst_sim'] = 500 os.remove(os.path.join(input_path, "expanded_test.mdp")) @@ -122,16 +122,16 @@ def test_set_params_error(self, params_dict): params_dict['s'] = 5 params_dict['mdp'] = 'ensemble_md/tests/data/expanded.mdp' # Note that the parentheses are special characters that need to be escaped in regular expressions - with pytest.raises(ParameterError, match=r'There must be at least two states for each replica \(current value: -6\). The current specified configuration \(n_tot=9, n_sim=4, s=5\) does not work for EEXE.'): # noqa: E501 - get_EEXE_instance(params_dict) + with pytest.raises(ParameterError, match=r'There must be at least two states for each replica \(current value: -6\). The current specified configuration \(n_tot=9, n_sim=4, s=5\) does not work for REXEE.'): # noqa: E501 + get_REXEE_instance(params_dict) params_dict['s'] = 1 def test_set_params_warnings(self, params_dict): # 1. Non-recognizable parameter in the YAML file params_dict['cool'] = 10 - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) warning = 'Warning: Parameter "cool" specified in the input YAML file is not recognizable.' - assert warning in EEXE.warnings + assert warning in REXEE.warnings # 2. Warnings related to the mdp file mdp = gmx_parser.MDP(os.path.join(input_path, "expanded.mdp")) # A perfect mdp file @@ -142,87 +142,87 @@ def test_set_params_warnings(self, params_dict): params_dict['mdp'] = 'ensemble_md/tests/data/expanded_test.mdp' params_dict['N_cutoff'] = 1000 - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) warning_1 = 'Warning: The weight correction/weight combination method is specified but will not be used since the weights are fixed.' # noqa: E501 warning_2 = 'Warning: We recommend setting lmc_seed as -1 so the random seed is different for each iteration.' warning_3 = 'Warning: We recommend setting gen_seed as -1 so the random seed is different for each iteration.' - assert warning_1 in EEXE.warnings - assert warning_2 in EEXE.warnings - assert warning_3 in EEXE.warnings + assert warning_1 in REXEE.warnings + assert warning_2 in REXEE.warnings + assert warning_3 in REXEE.warnings os.remove(os.path.join(input_path, "expanded_test.mdp")) def test_set_params(self, params_dict): - # 0. Get an EEXE instance to test - EEXE = get_EEXE_instance(params_dict) - - # 1. Check the required EEXE parameters - assert EEXE.gmx_executable == 'gmx' - assert EEXE.gro == "ensemble_md/tests/data/sys.gro" - assert EEXE.top == "ensemble_md/tests/data/sys.top" - assert EEXE.mdp == "ensemble_md/tests/data/expanded.mdp" - assert EEXE.n_sim == 4 - assert EEXE.n_iter == 10 - assert EEXE.s == 1 + # 0. Get an REXEE instance to test + REXEE = get_REXEE_instance(params_dict) + + # 1. Check the required REXEE parameters + assert REXEE.gmx_executable == 'gmx' + assert REXEE.gro == "ensemble_md/tests/data/sys.gro" + assert REXEE.top == "ensemble_md/tests/data/sys.top" + assert REXEE.mdp == "ensemble_md/tests/data/expanded.mdp" + assert REXEE.n_sim == 4 + assert REXEE.n_iter == 10 + assert REXEE.s == 1 # 2. Check the default values of the parameters not specified in params.yaml - assert EEXE.proposal == "exhaustive" - assert EEXE.acceptance == "metropolis" - assert EEXE.w_combine is None - assert EEXE.N_cutoff == 1000 - assert EEXE.n_ex == 'N^3' - assert EEXE.verbose is True - assert EEXE.runtime_args is None - assert EEXE.n_ckpt == 100 - assert EEXE.msm is False - assert EEXE.free_energy is False - assert EEXE.df_spacing == 1 - assert EEXE.df_method == 'MBAR' - assert EEXE.err_method == 'propagate' - assert EEXE.n_bootstrap == 50 - assert EEXE.seed is None + assert REXEE.proposal == "exhaustive" + assert REXEE.acceptance == "metropolis" + assert REXEE.w_combine is None + assert REXEE.N_cutoff == 1000 + assert REXEE.n_ex == 'N^3' + assert REXEE.verbose is True + assert REXEE.runtime_args is None + assert REXEE.n_ckpt == 100 + assert REXEE.msm is False + assert REXEE.free_energy is False + assert REXEE.df_spacing == 1 + assert REXEE.df_method == 'MBAR' + assert REXEE.err_method == 'propagate' + assert REXEE.n_bootstrap == 50 + assert REXEE.seed is None # 3. Check the MDP parameters - assert EEXE.dt == 0.002 - assert EEXE.temp == 298 - assert EEXE.nst_sim == 500 - assert EEXE.fixed_weights is False + assert REXEE.dt == 0.002 + assert REXEE.temp == 298 + assert REXEE.nst_sim == 500 + assert REXEE.fixed_weights is False # 4. Checked the derived parameters # Note that lambda_dict will also be tested in test_map_lambda2state. k = 1.380649e-23 NA = 6.0221408e23 - assert EEXE.kT == k * NA * 298 / 1000 - assert EEXE.lambda_types == ['coul_lambdas', 'vdw_lambdas'] - assert EEXE.n_tot == 9 - assert EEXE.n_sub == 6 - assert EEXE.state_ranges == [ + assert REXEE.kT == k * NA * 298 / 1000 + assert REXEE.lambda_types == ['coul_lambdas', 'vdw_lambdas'] + assert REXEE.n_tot == 9 + assert REXEE.n_sub == 6 + assert REXEE.state_ranges == [ [0, 1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8], ] - assert EEXE.equil == [-1, -1, -1, -1] # will be zeros right after the first iteration if the weights are fixed - assert EEXE.n_rejected == 0 - assert EEXE.n_swap_attempts == 0 - assert EEXE.n_empty_swappable == 0 - assert EEXE.rep_trajs == [[0], [1], [2], [3]] + assert REXEE.equil == [-1, -1, -1, -1] # will be zeros right after the first iteration if the weights are fixed + assert REXEE.n_rejected == 0 + assert REXEE.n_swap_attempts == 0 + assert REXEE.n_empty_swappable == 0 + assert REXEE.rep_trajs == [[0], [1], [2], [3]] params_dict['df_method'] = 'MBAR' - EEXE = get_EEXE_instance(params_dict) - assert EEXE.df_data_type == 'u_nk' + REXEE = get_REXEE_instance(params_dict) + assert REXEE.df_data_type == 'u_nk' params_dict['df_method'] = 'BAR' - EEXE = get_EEXE_instance(params_dict) - assert EEXE.df_data_type == 'dhdl' + REXEE = get_REXEE_instance(params_dict) + assert REXEE.df_data_type == 'dhdl' params_dict['grompp_args'] = {'-maxwarn': 1} params_dict['runtime_args'] = {'-nt': 16, '-ntomp': 8} - EEXE = get_EEXE_instance(params_dict) - assert EEXE.runtime_args == {'-nt': 16, '-ntomp': 8} - assert EEXE.grompp_args == {'-maxwarn': 1} + REXEE = get_REXEE_instance(params_dict) + assert REXEE.runtime_args == {'-nt': 16, '-ntomp': 8} + assert REXEE.grompp_args == {'-maxwarn': 1} - assert EEXE.reformatted_mdp is False + assert REXEE.reformatted_mdp is False def test_set_params_edge_cases(self, params_dict): # In the previous unit tests, we have tested the case where fixed_weights is False. Here we test the others. @@ -232,14 +232,14 @@ def test_set_params_edge_cases(self, params_dict): del mdp['wl_scale'] mdp.write(os.path.join(input_path, "expanded_test.mdp")) params_dict['mdp'] = 'ensemble_md/tests/data/expanded_test.mdp' - EEXE = get_EEXE_instance(params_dict) - assert EEXE.fixed_weights is True + REXEE = get_REXEE_instance(params_dict) + assert REXEE.fixed_weights is True # 2. wl_scale is not empty mdp['wl_scale'] = '' # It will become an empty array array([]) after read by gmx_parser.abs(x) mdp.write(os.path.join(input_path, "expanded_test.mdp")) - EEXE = get_EEXE_instance(params_dict) - assert EEXE.fixed_weights is True + REXEE = get_REXEE_instance(params_dict) + assert REXEE.fixed_weights is True def test_reformat_MDP(self, params_dict): # Note that the function reformat_MDP is called in set_params @@ -247,9 +247,9 @@ def test_reformat_MDP(self, params_dict): mdp['cool-stuff'] = 10 mdp.write(os.path.join(input_path, "expanded_test.mdp")) params_dict['mdp'] = 'ensemble_md/tests/data/expanded_test.mdp' - EEXE = get_EEXE_instance(params_dict) # this calss reformat_MDP internally - assert EEXE.reformatted_mdp is True - assert EEXE.template['cool_stuff'] == 10 + REXEE = get_REXEE_instance(params_dict) # this calss reformat_MDP internally + assert REXEE.reformatted_mdp is True + assert REXEE.template['cool_stuff'] == 10 assert os.path.isfile(os.path.join(input_path, "expanded_test_backup.mdp")) os.remove(os.path.join(input_path, "expanded_test.mdp")) @@ -257,14 +257,14 @@ def test_reformat_MDP(self, params_dict): def test_print_params(self, capfd, params_dict): # capfd is a fixture in pytest for testing STDOUT - EEXE = get_EEXE_instance(params_dict) - EEXE.print_params() + REXEE = get_REXEE_instance(params_dict) + REXEE.print_params() out_1, err = capfd.readouterr() L = "" L += "Important parameters of EXEE\n============================\n" L += f"Python version: {sys.version}\n" - L += f"GROMACS executable: {EEXE.gmx_path}\n" # Easier to pass CI. This is easy to catch anyway - L += f"GROMACS version: {EEXE.gmx_version}\n" # Easier to pass CI. This is easy to catch anyway + L += f"GROMACS executable: {REXEE.gmx_path}\n" # Easier to pass CI. This is easy to catch anyway + L += f"GROMACS version: {REXEE.gmx_version}\n" # Easier to pass CI. This is easy to catch anyway L += f"ensemble_md version: {ensemble_md.__version__}\n" L += "Simulation inputs: ensemble_md/tests/data/sys.gro, ensemble_md/tests/data/sys.top, ensemble_md/tests/data/expanded.mdp\n" # noqa: E501 L += "Verbose log file: True\n" @@ -279,13 +279,13 @@ def test_print_params(self, capfd, params_dict): L += "Additional grompp arguments: None\n" L += "Additional runtime arguments: None\n" L += "MDP parameters differing across replicas: None\n" - L += "Alchemical ranges of each replica in EEXE:\n - Replica 0: States [0, 1, 2, 3, 4, 5]\n" + L += "Alchemical ranges of each replica in REXEE:\n - Replica 0: States [0, 1, 2, 3, 4, 5]\n" L += " - Replica 1: States [1, 2, 3, 4, 5, 6]\n - Replica 2: States [2, 3, 4, 5, 6, 7]\n" L += " - Replica 3: States [3, 4, 5, 6, 7, 8]\n" assert out_1 == L - EEXE.reformatted_mdp = True # Just to test the case where EEXE.reformatted_mdp is True - EEXE.print_params(params_analysis=True) + REXEE.reformatted_mdp = True # Just to test the case where REXEE.reformatted_mdp is True + REXEE.print_params(params_analysis=True) out_2, err = capfd.readouterr() L += "\nWhether to build Markov state models and perform relevant analysis: False\n" L += "Whether to perform free energy calculations: False\n" @@ -298,8 +298,8 @@ def test_print_params(self, capfd, params_dict): assert out_2 == L def test_initialize_MDP(self, params_dict): - EEXE = get_EEXE_instance(params_dict) - MDP = EEXE.initialize_MDP(2) # the third replica + REXEE = get_REXEE_instance(params_dict) + MDP = REXEE.initialize_MDP(2) # the third replica assert MDP["nsteps"] == 500 assert all( [ @@ -332,11 +332,11 @@ def test_update_MDP(self, params_dict): [3.48, 2.78, 3.21, 4.56, 8.79, 0.48], [8.45, 0.52, 3.69, 2.43, 4.56, 6.73], ] - EEXE = get_EEXE_instance(params_dict) - EEXE.equil = [-1, 1, 0, -1] # i.e. the 3rd replica will use fixed weights in the next iteration - MDP_1 = EEXE.update_MDP( + REXEE = get_REXEE_instance(params_dict) + REXEE.equil = [-1, 1, 0, -1] # i.e. the 3rd replica will use fixed weights in the next iteration + MDP_1 = REXEE.update_MDP( new_template, 2, iter_idx, states, wl_delta, weights) # third replica - MDP_2 = EEXE.update_MDP( + MDP_2 = REXEE.update_MDP( new_template, 3, iter_idx, states, wl_delta, weights) # fourth replica assert MDP_1["tinit"] == MDP_2["tinit"] == 3 @@ -364,18 +364,18 @@ def test_update_MDP(self, params_dict): ) def test_extract_final_dhdl_info(self, params_dict): - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) dhdl_files = [ - os.path.join(input_path, f"dhdl/dhdl_{i}.xvg") for i in range(EEXE.n_sim) + os.path.join(input_path, f"dhdl/dhdl_{i}.xvg") for i in range(REXEE.n_sim) ] - states = EEXE.extract_final_dhdl_info(dhdl_files) + states = REXEE.extract_final_dhdl_info(dhdl_files) assert states == [5, 2, 2, 8] def test_extract_final_log_info(self, params_dict): - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) log_files = [ - os.path.join(input_path, f"log/EXE_{i}.log") for i in range(EEXE.n_sim)] - wl_delta, weights, counts = EEXE.extract_final_log_info(log_files) + os.path.join(input_path, f"log/EXE_{i}.log") for i in range(REXEE.n_sim)] + wl_delta, weights, counts = REXEE.extract_final_log_info(log_files) assert wl_delta == [0.4, 0.5, 0.5, 0.5] assert np.allclose(weights, [ [0, 1.03101, 2.55736, 3.63808, 4.47220, 6.13408], @@ -387,35 +387,35 @@ def test_extract_final_log_info(self, params_dict): [9, 8, 8, 11, 7, 7], [3, 1, 1, 9, 15, 21], [0, 0, 0, 1, 18, 31], ] - assert EEXE.equil == [-1, -1, -1, -1] + assert REXEE.equil == [-1, -1, -1, -1] def test_get_averaged_weights(self, params_dict): - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) log_files = [ - os.path.join(input_path, f"log/EXE_{i}.log") for i in range(EEXE.n_sim)] - avg, err = EEXE.get_averaged_weights(log_files) + os.path.join(input_path, f"log/EXE_{i}.log") for i in range(REXEE.n_sim)] + avg, err = REXEE.get_averaged_weights(log_files) assert np.allclose(avg[0], [0, 2.55101, 3.35736, 4.83808, 4.8722, 5.89408]) assert np.allclose(err[0], [0, 1.14542569, 1.0198039, 0.8, 0.69282032, 0.35777088]) def test_identify_swappable_pairs(self, params_dict): - EEXE = get_EEXE_instance(params_dict) - EEXE.state_ranges = [list(range(i, i + 5)) for i in range(EEXE.n_sim)] # 5 states per replica + REXEE = get_REXEE_instance(params_dict) + REXEE.state_ranges = [list(range(i, i + 5)) for i in range(REXEE.n_sim)] # 5 states per replica states = [4, 2, 2, 7] # This would lead to the swappables: [(0, 1), (0, 2), (1, 2)] in the standard case # Case 1: Any case that is not neighboring swap has the same definition for the swappable pairs - swappables_1 = EEXE.identify_swappable_pairs(states, EEXE.state_ranges, EEXE.proposal == 'neighboring') + swappables_1 = REXEE.identify_swappable_pairs(states, REXEE.state_ranges, REXEE.proposal == 'neighboring') assert swappables_1 == [(0, 1), (0, 2), (1, 2)] # Case 2: Neighboring exchange - EEXE.proposal = 'neighboring' - swappables_2 = EEXE.identify_swappable_pairs(states, EEXE.state_ranges, EEXE.proposal == 'neighboring') + REXEE.proposal = 'neighboring' + swappables_2 = REXEE.identify_swappable_pairs(states, REXEE.state_ranges, REXEE.proposal == 'neighboring') assert swappables_2 == [(0, 1), (1, 2)] def test_propose_swap(self, params_dict): random.seed(0) - EEXE = get_EEXE_instance(params_dict) - swap_1 = EEXE.propose_swap([]) - swap_2 = EEXE.propose_swap([(0, 1), (0, 2), (1, 2)]) + REXEE = get_REXEE_instance(params_dict) + swap_1 = REXEE.propose_swap([]) + swap_2 = REXEE.propose_swap([(0, 1), (0, 2), (1, 2)]) assert swap_1 == [] assert swap_2 == (1, 2) @@ -430,95 +430,95 @@ def test_get_swapping_pattern(self, params_dict): dhdl_files = [os.path.join(input_path, f"dhdl/dhdl_{i}.xvg") for i in range(4)] # Case 1: Empty swap list - EEXE = get_EEXE_instance(params_dict) - EEXE.verbose = False + REXEE = get_REXEE_instance(params_dict) + REXEE.verbose = False states = [0, 6, 7, 8] # No swappable pairs w, f = copy.deepcopy(weights), copy.deepcopy(dhdl_files) - pattern_1, swap_list_1 = EEXE.get_swapping_pattern(f, states, w) - assert EEXE.n_empty_swappable == 1 - assert EEXE.n_swap_attempts == 0 - assert EEXE.n_rejected == 0 + pattern_1, swap_list_1 = REXEE.get_swapping_pattern(f, states, w) + assert REXEE.n_empty_swappable == 1 + assert REXEE.n_swap_attempts == 0 + assert REXEE.n_rejected == 0 assert pattern_1 == [0, 1, 2, 3] assert swap_list_1 == [] # Case 2: Single swap (proposal = 'single') random.seed(0) - EEXE = get_EEXE_instance(params_dict) - EEXE.verbose = True - EEXE.proposal = 'single' # n_ex will be set to 1 automatically. + REXEE = get_REXEE_instance(params_dict) + REXEE.verbose = True + REXEE.proposal = 'single' # n_ex will be set to 1 automatically. states = [5, 2, 2, 8] # swappable pairs: [(0, 1), (0, 2), (1, 2)], swap = (1, 2), accept w, f = copy.deepcopy(weights), copy.deepcopy(dhdl_files) - pattern_2, swap_list_2 = EEXE.get_swapping_pattern(f, states, w) - assert EEXE.n_swap_attempts == 1 - assert EEXE.n_rejected == 0 + pattern_2, swap_list_2 = REXEE.get_swapping_pattern(f, states, w) + assert REXEE.n_swap_attempts == 1 + assert REXEE.n_rejected == 0 assert pattern_2 == [0, 2, 1, 3] assert swap_list_2 == [(1, 2)] # Case 3: Neighboring swap random.seed(0) - EEXE = get_EEXE_instance(params_dict) - EEXE.proposal = 'neighboring' # n_ex will be set to 1 automatically. + REXEE = get_REXEE_instance(params_dict) + REXEE.proposal = 'neighboring' # n_ex will be set to 1 automatically. states = [5, 2, 2, 8] # swappable pairs: [(0, 1), (1, 2)], swap = (1, 2), accept w, f = copy.deepcopy(weights), copy.deepcopy(dhdl_files) - pattern_3, swap_list_3 = EEXE.get_swapping_pattern(f, states, w) - assert EEXE.n_swap_attempts == 1 - assert EEXE.n_rejected == 0 + pattern_3, swap_list_3 = REXEE.get_swapping_pattern(f, states, w) + assert REXEE.n_swap_attempts == 1 + assert REXEE.n_rejected == 0 assert pattern_3 == [0, 2, 1, 3] assert swap_list_3 == [(1, 2)] # Case 4-1: Exhaustive swaps that end up in a single swap random.seed(0) - EEXE = get_EEXE_instance(params_dict) - EEXE.proposal = 'exhaustive' + REXEE = get_REXEE_instance(params_dict) + REXEE.proposal = 'exhaustive' states = [5, 2, 2, 8] # swappable pairs: [(0, 1), (0, 2), (1, 2)], swap = (1, 2), accept w, f = copy.deepcopy(weights), copy.deepcopy(dhdl_files) - pattern_4_1, swap_list_4_1 = EEXE.get_swapping_pattern(f, states, w) - assert EEXE.n_swap_attempts == 1 - assert EEXE.n_rejected == 0 + pattern_4_1, swap_list_4_1 = REXEE.get_swapping_pattern(f, states, w) + assert REXEE.n_swap_attempts == 1 + assert REXEE.n_rejected == 0 assert pattern_4_1 == [0, 2, 1, 3] assert swap_list_4_1 == [(1, 2)] # Case 4-2: Exhaustive swaps that involve multiple attempted swaps random.seed(0) - EEXE = get_EEXE_instance(params_dict) - EEXE.proposal = 'exhaustive' + REXEE = get_REXEE_instance(params_dict) + REXEE.proposal = 'exhaustive' states = [4, 2, 4, 3] # swappable pairs: [(0, 1), (0, 2), (0, 3), (1, 2), (2, 3)]; swap 1: (2, 3), accepted; swap 2: (0, 1), accept # noqa: E501 w, f = copy.deepcopy(weights), copy.deepcopy(dhdl_files) - pattern_4_2, swap_list_4_2 = EEXE.get_swapping_pattern(f, states, w) - assert EEXE.n_swap_attempts == 2 # \Delta is negative for both swaps -> both accepted - assert EEXE.n_rejected == 0 + pattern_4_2, swap_list_4_2 = REXEE.get_swapping_pattern(f, states, w) + assert REXEE.n_swap_attempts == 2 # \Delta is negative for both swaps -> both accepted + assert REXEE.n_rejected == 0 assert pattern_4_2 == [1, 0, 3, 2] assert swap_list_4_2 == [(2, 3), (0, 1)] # Case 5-1: Multiple swaps (proposal = 'multiple', n_ex = 5) print('test 5-1') random.seed(0) - EEXE = get_EEXE_instance(params_dict) - EEXE.n_ex = 5 - EEXE.proposal = 'multiple' + REXEE = get_REXEE_instance(params_dict) + REXEE.n_ex = 5 + REXEE.proposal = 'multiple' states = [3, 1, 4, 6] # swappable pairs: [(0, 1), (0, 2), (1, 2)], first swap = (0, 2), accept w, f = copy.deepcopy(weights), copy.deepcopy(dhdl_files) - pattern_5_1, swap_list_5_1 = EEXE.get_swapping_pattern(f, states, w) - assert EEXE.n_swap_attempts == 5 - assert EEXE.n_rejected == 4 + pattern_5_1, swap_list_5_1 = REXEE.get_swapping_pattern(f, states, w) + assert REXEE.n_swap_attempts == 5 + assert REXEE.n_rejected == 4 assert pattern_5_1 == [2, 1, 0, 3] assert swap_list_5_1 == [(0, 2)] # Case 5-2: Multiple swaps but with only one swappable pair (proposal = 'multiple') # This is specifically for testing n_swap_attempts in the case where n_ex reduces to 1. random.seed(0) - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) states = [0, 2, 3, 8] # The only swappable pair is [(1, 2)] --> accept w, f = copy.deepcopy(weights), copy.deepcopy(dhdl_files) - pattern_5_2, swap_list_5_2 = EEXE.get_swapping_pattern(f, states, w) - assert EEXE.n_swap_attempts == 1 # since there is only 1 swappable pair - assert EEXE.n_rejected == 0 + pattern_5_2, swap_list_5_2 = REXEE.get_swapping_pattern(f, states, w) + assert REXEE.n_swap_attempts == 1 # since there is only 1 swappable pair + assert REXEE.n_rejected == 0 assert pattern_5_2 == [0, 2, 1, 3] assert swap_list_5_2 == [(1, 2)] def test_calc_prob_acc(self, capfd, params_dict): - EEXE = get_EEXE_instance(params_dict) - # EEXE.state_ranges = [[0, 1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6], ..., [3, 4, 5, 6, 7, 8]] + REXEE = get_REXEE_instance(params_dict) + # REXEE.state_ranges = [[0, 1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6], ..., [3, 4, 5, 6, 7, 8]] states = [5, 2, 2, 8] shifts = [0, 1, 2, 3] weights = [ @@ -530,25 +530,25 @@ def test_calc_prob_acc(self, capfd, params_dict): # Test 1: Same-state swapping (True) swap = (1, 2) - EEXE.acceptance = "same_state" - prob_acc_1 = EEXE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) + REXEE.acceptance = "same_state" + prob_acc_1 = REXEE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) assert prob_acc_1 == 1 # Test 2: Same-state swapping (False) swap = (0, 2) - prob_acc_2 = EEXE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) + prob_acc_2 = REXEE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) assert prob_acc_2 == 0 # Test 3: Metropolis-eq swap = (0, 2) - EEXE.acceptance = "metropolis-eq" - prob_acc_3 = EEXE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) + REXEE.acceptance = "metropolis-eq" + prob_acc_3 = REXEE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) assert prob_acc_3 == 1 # Delta U = (-9.1366697 + 4.9963939)/2.478956208925815 ~ -1.67 kT # Test 4: Metropolis swap = (0, 2) - EEXE.acceptance = "metropolis" - prob_acc_4 = EEXE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) + REXEE.acceptance = "metropolis" + prob_acc_4 = REXEE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) out, err = capfd.readouterr() # dH ~-1.67 kT as calculated above, dg = (2.55736 - 6.13408) + (0.24443 - 0) ~ -3.33229 kT # dU - dg ~ 1.66212 kT, so p_acc ~ 0.189 ... @@ -561,34 +561,34 @@ def test_calc_prob_acc(self, capfd, params_dict): # And the acceptance ratio should be 1. swap = (0, 2) states = [2, 2, 5, 8] - prob_acc_5 = EEXE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) + prob_acc_5 = REXEE.calc_prob_acc(swap, dhdl_files, states, shifts, weights) out, err = capfd.readouterr() assert prob_acc_5 == 1 assert 'U^i_n - U^i_m = 3.69 kT, U^j_m - U^j_n = -2.02 kT, Total dU: 1.67 kT' in out assert 'g^i_n - g^i_m = 3.58 kT, g^j_m - g^j_n = -0.24 kT, Total dg: 3.33 kT' in out def test_accept_or_reject(self, params_dict): - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) random.seed(0) - swap_bool_1 = EEXE.accept_or_reject(0) - swap_bool_2 = EEXE.accept_or_reject(0.8) # rand = 0.844 - swap_bool_3 = EEXE.accept_or_reject(0.8) # rand = 0.758 + swap_bool_1 = REXEE.accept_or_reject(0) + swap_bool_2 = REXEE.accept_or_reject(0.8) # rand = 0.844 + swap_bool_3 = REXEE.accept_or_reject(0.8) # rand = 0.758 - assert EEXE.n_swap_attempts == 0 # since we didn't use get_swapping_pattern - assert EEXE.n_rejected == 2 + assert REXEE.n_swap_attempts == 0 # since we didn't use get_swapping_pattern + assert REXEE.n_rejected == 2 assert swap_bool_1 is False assert swap_bool_2 is False assert swap_bool_3 is True def test_weight_correction(self, params_dict): - EEXE = get_EEXE_instance(params_dict) + REXEE = get_REXEE_instance(params_dict) # Case 1: Perform weight correction (N_cutoff reached) - EEXE.N_cutoff = 5000 - EEXE.verbose = False # just to increase code coverage + REXEE.N_cutoff = 5000 + REXEE.verbose = False # just to increase code coverage weights_1 = [[0, 10.304, 20.073, 29.364]] counts_1 = [[31415, 45701, 55457, 59557]] - weights_1 = EEXE.weight_correction(weights_1, counts_1) + weights_1 = REXEE.weight_correction(weights_1, counts_1) assert np.allclose(weights_1, [ [ 0, @@ -599,26 +599,26 @@ def test_weight_correction(self, params_dict): ]) # noqa: E501 # Case 2: Perform weight correction (N_cutoff not reached by both N_k and N_{k-1}) - EEXE.verbose = True + REXEE.verbose = True weights_2 = [[0, 10.304, 20.073, 29.364]] counts_2 = [[3141, 4570, 5545, 5955]] - weights_2 = EEXE.weight_correction(weights_2, counts_2) + weights_2 = REXEE.weight_correction(weights_2, counts_2) assert np.allclose(weights_2, [[0, 10.304, 20.073, 29.364 + np.log(5545 / 5955)]]) def test_combine_weights_1(self, params_dict): """ Here we just test the combined weights, so the values of hist does not matter. """ - EEXE = get_EEXE_instance(params_dict) - EEXE.n_tot = 6 - EEXE.n_sub = 4 - EEXE.s = 1 - EEXE.n_sim = 3 - EEXE.state_ranges = [[0, 1, 2, 3], [1, 2, 3, 4], [2, 3, 4, 5]] + REXEE = get_REXEE_instance(params_dict) + REXEE.n_tot = 6 + REXEE.n_sub = 4 + REXEE.s = 1 + REXEE.n_sim = 3 + REXEE.state_ranges = [[0, 1, 2, 3], [1, 2, 3, 4], [2, 3, 4, 5]] weights = [[0, 2.1, 4.0, 3.7], [0, 1.7, 1.2, 2.6], [0, -0.4, 0.9, 1.9]] hist = [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]] - _, w_1, g_vec_1 = EEXE.combine_weights(hist, weights) + _, w_1, g_vec_1 = REXEE.combine_weights(hist, weights) assert np.allclose(w_1, [ [0, 2.1, 3.9, 3.5], [0, 1.8, 1.4, 2.75], @@ -627,7 +627,7 @@ def test_combine_weights_1(self, params_dict): weights = [[0, 2.1, 4.0, 3.7], [0, 1.7, 1.2, 2.6], [0, -0.4, 0.9, 1.9]] errors = [[0, 0.1, 0.15, 0.1], [0, 0.12, 0.1, 0.12], [0, 0.12, 0.15, 0.1]] - _, w_2, g_vec_2 = EEXE.combine_weights(hist, weights, errors) + _, w_2, g_vec_2 = REXEE.combine_weights(hist, weights, errors) assert np.allclose(w_2, [ [0, 2.1, 3.86140725, 3.45417313], [0, 1.76140725, 1.35417313, 2.71436889], @@ -638,14 +638,14 @@ def test_combine_weights_2(self, params_dict): """ Here we just test the modified histograms, so the values of weights does not matter. """ - EEXE = get_EEXE_instance(params_dict) - EEXE.n_tot = 6 - EEXE.n_sub = 5 - EEXE.s = 1 - EEXE.n_sim = 2 - EEXE.state_ranges = [[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]] + REXEE = get_REXEE_instance(params_dict) + REXEE.n_tot = 6 + REXEE.n_sub = 5 + REXEE.s = 1 + REXEE.n_sim = 2 + REXEE.state_ranges = [[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]] weights = [[0, 2.1, 4.0, 3.7, 5], [0, 1.7, 1.2, 2.6, 4]] hist = [[416, 332, 130, 71, 61], [303, 181, 123, 143, 260]] - hist_modified, _, _ = EEXE.combine_weights(hist, weights) + hist_modified, _, _ = REXEE.combine_weights(hist, weights) assert hist_modified == [[416, 332, 161, 98, 98], [332, 161, 98, 98, 178]] From a594964720d7c246f663f146ebd7b15fd112261f Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 17:01:38 -0600 Subject: [PATCH 09/11] Fixed the linting error --- ensemble_md/analysis/analyze_free_energy.py | 3 ++- ensemble_md/replica_exchange_EE.py | 23 ++++++++++--------- ensemble_md/tests/test_replica_exchange_EE.py | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/ensemble_md/analysis/analyze_free_energy.py b/ensemble_md/analysis/analyze_free_energy.py index 3c033799..c3f215a9 100644 --- a/ensemble_md/analysis/analyze_free_energy.py +++ b/ensemble_md/analysis/analyze_free_energy.py @@ -8,7 +8,8 @@ # # #################################################################### """ -The :obj:`.analyze_free_energy` module provides functions for performing free energy calculations for REXEE simulations. +The :obj:`.analyze_free_energy` module provides functions for performing free energy +calculations for REXEE simulations. """ import alchemlyb import numpy as np diff --git a/ensemble_md/replica_exchange_EE.py b/ensemble_md/replica_exchange_EE.py index e4017f1a..d5d097f5 100644 --- a/ensemble_md/replica_exchange_EE.py +++ b/ensemble_md/replica_exchange_EE.py @@ -38,11 +38,11 @@ class ReplicaExchangeEE: """ This class provides a variety of functions useful for setting up and running - replica exchange (REX) of expanded ensemble (EE), or REXEE simulations. - Upon instantiation, all parameters in the YAML - file will be assigned to an attribute in the class. In addition to these variables, - below is a list of attributes of the class. (All the the attributes are assigned by - :obj:`set_params` unless otherwise noted.) + replica exchange (REX) of expanded ensemble (EE), or REXEE simulations. + Upon instantiation, all parameters in the YAML file will be assigned to an + attribute in the class. In addition to these variables, below is a list of + attributes of the class. (All the the attributes are assigned by :obj:`set_params` + unless otherwise noted.) :ivar gmx_path: The absolute path of the GROMACS exectuable. :ivar gmx_version: The version of the GROMACS executable. @@ -931,12 +931,13 @@ def get_swapping_pattern(self, dhdl_files, states, weights): # Theoretically, in an REXEE simulation, we could either choose to swap configurations (via # swapping GRO files) or replicas (via swapping MDP files). In ensemble_md package, we chose the - # former when implementing the REXEE algorithm. Specifically, in the CLI `run_REXEE`, `swap_pattern` - # is used to swap the GRO files. Therefore, when an attempted swap is accetped and `swap_pattern` - # is updated, we also need to update the variables `shifts`, `weights`, `dhdl_files`, - # `state_ranges`, `self.configs` but not anything else. Otherwise, incorrect results will be - # produced. To better understand this, one can refer to our unit test for get_swapping_pattern - # and calc_prob_acc, set checkpoints and examine why the variables should/should not be updated. + # former when implementing the REXEE algorithm. Specifically, in the CLI `run_REXEE`, + # `swap_pattern` is used to swap the GRO files. Therefore, when an attempted swap is accetped and + # `swap_pattern` is updated, we also need to update the variables `shifts`, `weights`, + # `dhdl_files`, `state_ranges`, `self.configs` but not anything else. Otherwise, incorrect results + # will be produced. To better understand this, one can refer to our unit test for + # get_swapping_pattern and calc_prob_acc, set checkpoints and examine why the variables + # should/should not be updated. if swap_bool is True: swap_list.append(swap) diff --git a/ensemble_md/tests/test_replica_exchange_EE.py b/ensemble_md/tests/test_replica_exchange_EE.py index e41678aa..76dd3c0c 100644 --- a/ensemble_md/tests/test_replica_exchange_EE.py +++ b/ensemble_md/tests/test_replica_exchange_EE.py @@ -202,7 +202,7 @@ def test_set_params(self, params_dict): [1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8], ] - assert REXEE.equil == [-1, -1, -1, -1] # will be zeros right after the first iteration if the weights are fixed + assert REXEE.equil == [-1, -1, -1, -1] # will be zeros after the first iteration if the weights are fixed assert REXEE.n_rejected == 0 assert REXEE.n_swap_attempts == 0 assert REXEE.n_empty_swappable == 0 From 3abc1407bedea8cfc610c45025a96af912d8c865 Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 17:15:12 -0600 Subject: [PATCH 10/11] Updated the jupyter notebooks in docs/examples --- .../{analyze_EEXE.ipynb => analyze_REXEE.ipynb} | 8 +++++++- docs/examples/{run_EEXE.ipynb => run_REXEE.ipynb} | 10 ++++++++-- ...dify_inputs.ipynb => run_REXEE_modify_inputs.ipynb} | 8 +++++++- 3 files changed, 22 insertions(+), 4 deletions(-) rename docs/examples/{analyze_EEXE.ipynb => analyze_REXEE.ipynb} (79%) rename docs/examples/{run_EEXE.ipynb => run_REXEE.ipynb} (94%) rename docs/examples/{run_EEXE_modify_inputs.ipynb => run_REXEE_modify_inputs.ipynb} (78%) diff --git a/docs/examples/analyze_EEXE.ipynb b/docs/examples/analyze_REXEE.ipynb similarity index 79% rename from docs/examples/analyze_EEXE.ipynb rename to docs/examples/analyze_REXEE.ipynb index f96ee669..76c22543 100644 --- a/docs/examples/analyze_EEXE.ipynb +++ b/docs/examples/analyze_REXEE.ipynb @@ -5,8 +5,14 @@ "id": "cc81ea4a", "metadata": {}, "source": [ - "# Tutorial 2: Analyzing an ensemble of expanded ensemble" + "# Tutorial 2: Analyzing a REXEE simulation" ] + }, + { + "cell_type": "markdown", + "id": "c3014606", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/docs/examples/run_EEXE.ipynb b/docs/examples/run_REXEE.ipynb similarity index 94% rename from docs/examples/run_EEXE.ipynb rename to docs/examples/run_REXEE.ipynb index c3fd677e..e9d44ff9 100644 --- a/docs/examples/run_EEXE.ipynb +++ b/docs/examples/run_REXEE.ipynb @@ -5,7 +5,7 @@ "id": "87324540", "metadata": {}, "source": [ - "# Tutorial 1: Launching an ensemble of expanded ensemble" + "# Tutorial 1: Launching a REXEE simulation" ] }, { @@ -13,7 +13,7 @@ "id": "b2d9b443", "metadata": {}, "source": [ - "In this tutorial, we will show how one can launch an ensemble of expanded ensemble simulations in parallel using functions in `ensemble_md`. To this end, we will at least need 4 files: one top file (`sys.top`) and one gro file (`sys.gro`) for the system of interest, one mdp file as the template for customizing mdp files for different replicas (`expanded.mdp`), and finally a yaml file speciying the EEXE-relevant parameters (`params.yaml`). \n", + "In this tutorial, we will show how one can launch an ensemble of expanded ensemble simulations in parallel using functions in `ensemble_md`. To this end, we will at least need 4 files: one top file (`sys.top`) and one gro file (`sys.gro`) for the system of interest, one mdp file as the template for customizing mdp files for different replicas (`expanded.mdp`), and finally a yaml file speciying the REXEE-relevant parameters (`params.yaml`). \n", "\n", "In our case, the system of interest is a linear model composed of 4 interaction sites, where the first and last atom have a charge of +0.2 and -0.2, respectively. In `expanded.mdp`, we define 9 alchemical states in total to decouple the van der Waals interactions and coulombic interactions. Our goal is to run an ensemble composed of 4 replicas of expanded ensemble, each of which sample 6 alchemical states, with the shift between adjacent replicas being 1 state. That is, we want replicas 0 to 3 to sample states 0 to 6, 1 to 7, 2 to 8, and 3 to 9, respectively. Each replica will be performed for 5 iterations, with the length of each iteration being 500 steps (i.e. 1 ps). We will use the Metropolis MC scheme to swap replicas, and use exponential averaging with histogram correction for combining weights of exchanging replicas. The histogram cutoff will be set as 1000 to avoid overcorrection and we will swap 2 pairs of replicas in each attempt when possible. All STDOUT will be redirected to `results.txt`. \n", "\n", @@ -33,6 +33,12 @@ "outfile: 'results.txt' # The output file for logging how replicas interact with each other.\n", "```" ] + }, + { + "cell_type": "markdown", + "id": "6cedf8d9", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/docs/examples/run_EEXE_modify_inputs.ipynb b/docs/examples/run_REXEE_modify_inputs.ipynb similarity index 78% rename from docs/examples/run_EEXE_modify_inputs.ipynb rename to docs/examples/run_REXEE_modify_inputs.ipynb index 4e2798ea..2903fcdb 100644 --- a/docs/examples/run_EEXE_modify_inputs.ipynb +++ b/docs/examples/run_REXEE_modify_inputs.ipynb @@ -5,8 +5,14 @@ "id": "87324540", "metadata": {}, "source": [ - "# Tutorial 3: EEXE for multiple serial mutations" + "# Tutorial 3: REXEE for multiple serial mutations" ] + }, + { + "cell_type": "markdown", + "id": "46970a2d", + "metadata": {}, + "source": [] } ], "metadata": { From a95ea2d1500318405120ff0a3948248e4899df86 Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Wed, 25 Oct 2023 17:22:57 -0600 Subject: [PATCH 11/11] Finished updating the documentation --- docs/analysis.rst | 4 +- docs/getting_started.rst | 4 +- docs/index.rst | 8 +- docs/simulations.rst | 142 +++++++++++++++---------------- docs/theory.rst | 17 ++-- ensemble_md/cli/analyze_REXEE.py | 2 +- ensemble_md/cli/run_REXEE.py | 6 +- 7 files changed, 91 insertions(+), 92 deletions(-) diff --git a/docs/analysis.rst b/docs/analysis.rst index b4f3c10b..3022e660 100644 --- a/docs/analysis.rst +++ b/docs/analysis.rst @@ -1,7 +1,7 @@ 1. An overview ============== -Automated data analysis of an EEXE simulation is allowed by the CLI :code:`analyze_EEXE`, which -share the same input YAML file as the CLI :code:`run_EEXE`. Relevant parameters specified in the YAML +Automated data analysis of an REXEE simulation is allowed by the CLI :code:`analyze_REXEE`, which +share the same input YAML file as the CLI :code:`run_REXEE`. Relevant parameters specified in the YAML file for data analysis can be found in this section: :ref:`doc_analysis_params`. - Analysis based on transitions between replicas diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 44a56717..9287a4d8 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -2,8 +2,8 @@ =============== :code:`ensemble_md` is a Python package that provides methods for setting up, running, and analyzing GROMACS simulation ensembles. The current implementation is -mainly for synchronous ensemble of expanded ensemble (EEXE), but we will develop -methods like asynchronous EEXE, or ensemble of alchemical metadynamics in the future. +mainly for synchronous replica exchange (REX) of expanded ensemble (EE), abbreviated as +REXEE. In the future, we will develop methods like asynchronous REXEE, or multi-topology REXEE. In the current implementation, the module :code:`subprocess` is used to launch GROMACS commands, but we will switch to `SCALE-MS`_ for this purpose in the future when possible. diff --git a/docs/index.rst b/docs/index.rst index 5e558db7..9d074720 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,7 +24,7 @@ the future. .. toctree:: simulations :maxdepth: 2 - :caption: Launching EEXE simulations: + :caption: Launching REXEE simulations: .. toctree:: analysis @@ -32,9 +32,9 @@ the future. :caption: Data analysis: .. toctree:: - examples/run_EEXE - examples/analyze_EEXE - examples/run_EEXE_modify_inputs + examples/run_REXEE + examples/analyze_REXEE + examples/run_REXEE_modify_inputs :maxdepth: 2 :caption: Tutorials: diff --git a/docs/simulations.rst b/docs/simulations.rst index 102e2dbe..69ecb7b3 100644 --- a/docs/simulations.rst +++ b/docs/simulations.rst @@ -2,26 +2,26 @@ 1. Command-line interface (CLI) =============================== -:code:`ensemble_md` provides three command-line interfaces (CLI), including :code:`explore_EEXE`, :code:`run_EEXE` and :code:`analyze_EEXE`. -:code:`explore_EEXE` helps the user to figure out possible combinations of EEXE parameters, while :code:`run_EEXE` and :code:`analyze_EEXE` -can be used to perform and analyze EEXE simulations, respectively. Below we provide more details about each of these CLIs. +:code:`ensemble_md` provides three command-line interfaces (CLI), including :code:`explore_REXEE`, :code:`run_REXEE` and :code:`analyze_REXEE`. +:code:`explore_REXEE` helps the user to figure out possible combinations of REXEE parameters, while :code:`run_REXEE` and :code:`analyze_REXEE` +can be used to perform and analyze REXEE simulations, respectively. Below we provide more details about each of these CLIs. -1.1. CLI :code:`explore_EEXE` +1.1. CLI :code:`explore_REXEE` ----------------------------- -Here is the help message of :code:`explore_EEXE`: +Here is the help message of :code:`explore_REXEE`: :: - usage: explore_EEXE [-h] -N N [-r R] [-n N] [-s S] [-c] [-e] + usage: explore_REXEE [-h] -N N [-r R] [-n N] [-s S] [-c] [-e] - This code explores the parameter space of homogenous EEXE to help you figure out all + This code explores the parameter space of homogenous REXEE to help you figure out all possible combinations of the number of replicas, the number of states in each replica, and the number of overlapping states, and the total number states. optional arguments: -h, --help show this help message and exit - -N N, --N N The total number of states of the EEXE simulation. - -r R, --r R The number of replicas that compose the EEXE simulation. + -N N, --N N The total number of states of the REXEE simulation. + -r R, --r R The number of replicas that compose the REXEE simulation. -n N, --n N The number of states for each replica. -s S, --s S The state shift between adjacent replicas. -c, --cnst Whether the apply the constraint such that the number of overlapping @@ -31,19 +31,19 @@ Here is the help message of :code:`explore_EEXE`: pairs for each solution. -1.2. CLI :code:`run_EEXE` +1.2. CLI :code:`run_REXEE` ------------------------- -Here is the help message of :code:`run_EEXE`: +Here is the help message of :code:`run_REXEE`: :: - usage: run_EEXE [-h] [-y YAML] [-c CKPT] [-g G_VECS] [-o OUTPUT] [-m MAXWARN] + usage: run_REXEE [-h] [-y YAML] [-c CKPT] [-g G_VECS] [-o OUTPUT] [-m MAXWARN] - This code runs an ensemble of expanded ensemble given necessary inputs. + This code runs a REXEE simulation given necessary inputs. optional arguments: -h, --help show this help message and exit - -y YAML, --yaml YAML The input YAML file that contains EEXE parameters. (Default: + -y YAML, --yaml YAML The input YAML file that contains REXEE parameters. (Default: params.yaml) -c CKPT, --ckpt CKPT The NPY file containing the replica-space trajectories. This file is a necessary checkpoint file for extending the simulaiton. @@ -55,40 +55,40 @@ Here is the help message of :code:`run_EEXE`: g_vecs.npy) -o OUTPUT, --output OUTPUT The output file for logging how replicas interact with each - other. (Default: run_EEXE_log.txt) + other. (Default: run_REXEE_log.txt) -m MAXWARN, --maxwarn MAXWARN The maximum number of warnings in parameter specification to be ignored. -In our current implementation, it is assumed that all replicas of an EEXE simulations are performed in -parallel using MPI. Naturally, performing an EEXE simulation using :code:`run_EEXE` requires a command-line interface +In our current implementation, it is assumed that all replicas of an REXEE simulations are performed in +parallel using MPI. Naturally, performing an REXEE simulation using :code:`run_REXEE` requires a command-line interface to launch MPI processes, such as :code:`mpirun` or :code:`mpiexec`. For example, on a 128-core node -in a cluster, one may use :code:`mpirun -np 4 run_EEXE` (or :code:`mpiexec -n 4 run_EEXE`) to run an EEXE simulation composed of 4 +in a cluster, one may use :code:`mpirun -np 4 run_REXEE` (or :code:`mpiexec -n 4 run_REXEE`) to run an REXEE simulation composed of 4 replicas with 4 MPI processes. Note that in this case, it is often recommended to explicitly specify more details about resources allocated for each replica. For example, one can specifies :code:`{'-nt': 32}` -for the EEXE parameter `runtime_args` (specified in the input YAML file, see :ref:`doc_EEXE_parameters`), +for the REXEE parameter `runtime_args` (specified in the input YAML file, see :ref:`doc_REXEE_parameters`), so each of the 4 replicas will use 32 threads (assuming thread-MPI GROMACS), taking the full advantage of 128 cores. -1.3. CLI :code:`analyze_EEXE` +1.3. CLI :code:`analyze_REXEE` ----------------------------- -Finally, here is the help message of :code:`analyze_EEXE`: +Finally, here is the help message of :code:`analyze_REXEE`: :: - usage: analyze_EEXE [-h] [-y YAML] [-o OUTPUT] [-rt REP_TRAJS] [-st STATE_TRAJS] + usage: analyze_REXEE [-h] [-y YAML] [-o OUTPUT] [-rt REP_TRAJS] [-st STATE_TRAJS] [-d DIR] [-m MAXWARN] - This code analyzes an ensemble of expanded ensemble. Note that the template MDP file + This code analyzes a REXEE simulation. Note that the template MDP file specified in the YAML file needs to be available in the working directory. optional arguments: -h, --help show this help message and exit - -y YAML, --yaml YAML The input YAML file used to run the EEXE simulation. (Default: + -y YAML, --yaml YAML The input YAML file used to run the REXEE simulation. (Default: params.yaml) -o OUTPUT, --output OUTPUT - The output log file that contains the analysis results of EEXE. - (Default: analyze_EEXE_log.txt) + The output log file that contains the analysis results of REXEE. + (Default: analyze_REXEE_log.txt) -rt REP_TRAJS, --rep_trajs REP_TRAJS The NPY file containing the replica-space trajectory. (Default: rep_trajs.npy) @@ -103,45 +103,45 @@ Finally, here is the help message of :code:`analyze_EEXE`: 2. Recommended workflow ======================= -In this section, we introduce the workflow adopted by the CLI :code:`run_EEXE` that can be used to -launch EEXE simulations. While this workflow is made as flexible as possible, interested users -can use functions defined :class:`ensemble_EXE` to develop their own workflow, or consider contributing -to the source code of the CLI :code:`run_EEXE`. As an example, a hands-on tutorial that uses this workflow (using the CLI :code:`run_EEXE`) can be found in -`Tutorial 1: Ensemble of expanded ensemble`_. +In this section, we introduce the workflow adopted by the CLI :code:`run_REXEE` that can be used to +launch REXEE simulations. While this workflow is made as flexible as possible, interested users +can use functions defined :class:`ReplicaExchangeEE` to develop their own workflow, or consider contributing +to the source code of the CLI :code:`run_REXEE`. As an example, a hands-on tutorial that uses this workflow (using the CLI :code:`run_REXEE`) can be found in +`Tutorial 1: Launching a REXEE simulation`_. -.. _`Tutorial 1: Ensemble of expanded ensemble`: examples/run_EEXE.ipynb +.. _`Tutorial 1: Launching a REXEE simulation`: examples/run_REXEE.ipynb Step 1: Set up parameters ------------------------- -To run an ensemble of expanded ensemble in GROMACS using :code:`run_EEXE.py`, one at +To run a REXEE simulation in GROMACS using :code:`run_REXEE.py`, one at least needs to following four files: * One GRO file of the system of interest * One TOP file of the system of interest * One MDP template for customizing different MDP files for different replicas. -* One YAML file that specify the EEXE-relevant parameters. +* One YAML file that specify the REXEE-relevant parameters. Currently, we only allow all replicas to be initiated with the same configuration represented by the single GRO file, but the user should also be able to initialize different replicas with different configurations (represented by multiple GRO files) in the near future. Also, the MDP template should contain parameters common across all replicas and define the coupling parmaeters for all possible intermediate states, so that we can cusotmize different MDP files by defining a subset of alchemical states in different -replicas. For EEXE simulations, some MDP parameters need additional care to be taken, which we describe in -:ref:`doc_mdp_params`. Importantly, to extend an EEXE simulation, one needs to additionally provide the following +replicas. For REXEE simulations, some MDP parameters need additional care to be taken, which we describe in +:ref:`doc_mdp_params`. Importantly, to extend an REXEE simulation, one needs to additionally provide the following two checkpoint files: -* One NPY file containing the replica-space trajectories of different configurations saved by the previous run of EEXE simulation with a default name as :code:`rep_trajs.npy`. -* One NPY file containing the timeseries of the whole-range alchemical weights saved by the previous run of EEXE simulation with a default name as :code:`g_vecs.npy`. +* One NPY file containing the replica-space trajectories of different configurations saved by the previous run of REXEE simulation with a default name as :code:`rep_trajs.npy`. +* One NPY file containing the timeseries of the whole-range alchemical weights saved by the previous run of REXEE simulation with a default name as :code:`g_vecs.npy`. -In :code:`run_EEXE.py`, the class :class:`.EnsembleEXE` is instantiated with the given YAML file, where +In :code:`run_REXEE.py`, the class :class:`.ReplicaExchangeEE` is instantiated with the given YAML file, where the user needs to specify how the replicas should be set up or interact with each other during the simulation ensemble. Check :ref:`doc_parameters` for more details. Step 2: Run the 1st iteration ----------------------------- With all the input files/parameters set up in the previous run, one can use run the first iteration, -using :obj:`.run_EEXE`, which uses :code:`subprocess.run` to launch GROMACS :code:`grompp` +using :obj:`.run_REXEE`, which uses :code:`subprocess.run` to launch GROMACS :code:`grompp` and :code:`mdrun` commands in parallel. Step 3: Set up the new iteration @@ -210,62 +210,62 @@ iterations (:code:`n_iterations`) is reached. 3. Input YAML parameters ======================== In the current implementation of the algorithm, 28 parameters can be specified in the input YAML file. -Note that the two CLIs :code:`run_EEXE` and :code:`analyze_EEXE` share the same input YAML file, so we also +Note that the two CLIs :code:`run_REXEE` and :code:`analyze_REXEE` share the same input YAML file, so we also include parameters for data analysis here. 3.1. GROMACS executable ----------------------- - :code:`gmx_executable`: (Optional, Default: :code:`gmx_mpi`) - The GROMACS executable to be used to run the EEXE simulation. The value could be as simple as :code:`gmx` + The GROMACS executable to be used to run the REXEE simulation. The value could be as simple as :code:`gmx` or :code:`gmx_mpi` if the exeutable has been sourced. Otherwise, the full path of the executable (e.g. :code:`/usr/local/gromacs/bin/gmx`, the path returned by the command :code:`which gmx`) should be used. - Note that EEXE only works with MPI-enabled GROMACS. + Note that REXEE only works with MPI-enabled GROMACS. 3.2. Input files ---------------- - :code:`gro`: (Required) - The input system configuration in the form of GRO file(s) used to initiate the EEXE simulation. If only one GRO file is specified, + The input system configuration in the form of GRO file(s) used to initiate the REXEE simulation. If only one GRO file is specified, it will be used to initiate all the replicas. If multiple GRO files are specified (using the YAML syntax), the number of GRO files has to be the same as the number of replicas. - :code:`top`: (Required) - The input system topology in the form of TOP file(s) used to initiate the EEXE simulation. If only one TOP file is specified, + The input system topology in the form of TOP file(s) used to initiate the REXEE simulation. If only one TOP file is specified, it will be used to initiate all the replicas. If multiple TOP files are specified (using the YAML syntax), the number of TOP files has to be the same as the number of replicas. In the case where multiple TOP and GRO files are specified, the i-th TOP file corresponds to the i-th GRO file. - :code:`mdp`: (Required) - The input MDP file used to initiate the EEXE simulation. Specifically, this input MDP file will serve as a template for + The input MDP file used to initiate the REXEE simulation. Specifically, this input MDP file will serve as a template for customizing MDP files for all replicas. Therefore, the MDP template must have the whole range of :math:`λ` values. - and the corresponding weights (in fixed-weight simulations). This holds for EEXE simulations for multiple serial mutations as well. - For example, in an EEXE simulation that mutates methane to ethane in one replica and ethane to propane in the other replica, if + and the corresponding weights (in fixed-weight simulations). This holds for REXEE simulations for multiple serial mutations as well. + For example, in an REXEE simulation that mutates methane to ethane in one replica and ethane to propane in the other replica, if exchanges only occur in the end states, then one could have :math:`λ` values like :code:`0.0 0.3 0.7 1.0 0.0 0.3 ...`. Notably, unlike the parameters :code:`gro` and :code:`top`, only one MDP file can be specified for the parameter :code:`mdp`. If you wish to use different parameters for different replicas, please use the parameter :code:`mdp_args`. - :code:`modify_coords`: (Optional, Default: :code:`None`) The name of the Python module (without including the :code:`.py` extension) for modifying the output coordinates of the swapping replicas - before the coordinate exchange, which is generally required in EEXE simulations for multiple serial mutations. - For the CLI :code:`run_EEXE` to work, here is the predefined contract for the module/function based on the assumptions :code:`run_EEXE` makes. + before the coordinate exchange, which is generally required in REXEE simulations for multiple serial mutations. + For the CLI :code:`run_REXEE` to work, here is the predefined contract for the module/function based on the assumptions :code:`run_REXEE` makes. Modules/functions not obeying the contract are unlikely to work. - Multiple functions can be defined in the module, but the function for coordinate manipulation must have the same name as the module itself. - The function must only have two compulsory arguments, which are the two GRO files to be modified. The function must not depend on the order of the input GRO files. - The function must return :code:`None` (i.e., no return value). - - The function must save the modified GRO file as :code:`confout.gro`. Specifically, if :code:`directory_A/output.gro` and :code:`directory_B/output.gro` are input, then :code:`directory_A/confout.gro` and :code:`directory_B/confout.gro` must be saved. (For more information, please visit `Tutorial 3: EEXE for multiple serial mutations`_.) Note that in the CLI :code:`run_EEXE`, :code:`confout.gro` generated as the simulation output will be automatically backed up (with a :code:`_backup` suffix) to prevent overwriting. + - The function must save the modified GRO file as :code:`confout.gro`. Specifically, if :code:`directory_A/output.gro` and :code:`directory_B/output.gro` are input, then :code:`directory_A/confout.gro` and :code:`directory_B/confout.gro` must be saved. (For more information, please visit `Tutorial 3: REXEE for multiple serial mutations`_.) Note that in the CLI :code:`run_REXEE`, :code:`confout.gro` generated as the simulation output will be automatically backed up (with a :code:`_backup` suffix) to prevent overwriting. -.. _`Tutorial 3: EEXE for multiple serial mutations`: examples/run_EEXE_modify_inputs.ipynb +.. _`Tutorial 3: REXEE for multiple serial mutations`: examples/run_REXEE_modify_inputs.ipynb -.. _doc_EEXE_parameters: +.. _doc_REXEE_parameters: -3.3. EEXE parameters +3.3. REXEE parameters -------------------- - :code:`n_sim`: (Required) The number of replica simulations. - :code:`n_iter`: (Required) - The number of iterations. In an EEXE simulation, one iteration means one exchange attempt. Notably, this can be used to extend the EEXE simulation. - For example, if one finishes an EEXE simulation with 10 iterations (with :code:`n_iter=10`) and wants to continue the simulation from iteration 11 to 30, - setting :code:`n_iter` in the next execution of :code:`run_EEXE` should suffice. + The number of iterations. In an REXEE simulation, one iteration means one exchange attempt. Notably, this can be used to extend the REXEE simulation. + For example, if one finishes an REXEE simulation with 10 iterations (with :code:`n_iter=10`) and wants to continue the simulation from iteration 11 to 30, + setting :code:`n_iter` in the next execution of :code:`run_REXEE` should suffice. - :code:`s`: (Required) The shift in the alchemical ranges between adjacent replicas (e.g. :math:`s = 2` if :math:`λ_2 = (2, 3, 4)` and :math:`λ_3 = (4, 5, 6)`. - :code:`nst_sim`: (Optional, Default: :code:`nsteps` in the template MDP file) @@ -274,7 +274,7 @@ include parameters for data analysis here. - :code:`add_swappables`: (Optional, Default: :code:`None`) A list of lists that additionally consider states (in global indices) that can be swapped. For example, :code:`add_swappables=[[4, 5], [14, 15]]` means that if a replica samples state 4, it can be swapped with another replica that samples state 5 and vice versa. The same logic applies to states 14 and 15. - This could be useful for EEXE simulations for multiple serial mutations, where we enforce exchanges between states 4 and 5 (and 14 and 15) and perform + This could be useful for REXEE simulations for multiple serial mutations, where we enforce exchanges between states 4 and 5 (and 14 and 15) and perform coordinate manipulation. - :code:`proposal`: (Optional, Default: :code:`exhaustive`) The method for proposing simulations to be swapped. Available options include :code:`single`, :code:`exhaustive`, :code:`neighboring`, and :code:`multiple`. @@ -283,7 +283,7 @@ include parameters for data analysis here. The Monte Carlo method for swapping simulations. Available options include :code:`same-state`/:code:`same_state`, :code:`metropolis`, and :code:`metropolis-eq`/:code:`metropolis_eq`. For more details, please refer to :ref:`doc_acceptance`. - :code:`w_combine`: (Optional, Default: :code:`None`) - The type of weights to be combined across multiple replicas in a weight-updating EEXE simulation. The following options are available: + The type of weights to be combined across multiple replicas in a weight-updating REXEE simulation. The following options are available: - :code:`None`: No weight combination. - :code:`final`: Combine the final weights. @@ -310,7 +310,7 @@ include parameters for data analysis here. always be a list of length of the number of replicas. For example, :code:`{'ref_p': [1.0, 1.01, 1.02, 1.03]}` means that the MDP parameter :code:`ref_p` will be set as 1.0 bar, 1.01 bar, 1.02 bar, and 1.03 bar for replicas 0, 1, 2, and 3, respectively. Note that while this feature allows high flexibility in parameter specification, not all parameters are suitable to be - varied across replicas. For example, varying :code:`nsteps` across replicas for synchronous EEXE simulations does not make sense. + varied across replicas. For example, varying :code:`nsteps` across replicas for synchronous REXEE simulations does not make sense. Additionally, this feature is a work in progress and differing :code:`ref_t` or :code:`dt` across replicas might cause issues. - :code:`grompp_args`: (Optional: Default: :code:`None`) Additional arguments to be appended to the GROMACS :code:`grompp` command provided in a dictionary. @@ -329,7 +329,7 @@ include parameters for data analysis here. The frequency for checkpointing in the number of iterations. - :code:`rm_cpt`: (Optional, Default: :code:`True`) Whether the GROMACS checkpoint file (:code:`state.cpt`) from each iteration should be deleted. - Normally we don't need CPT files for EEXE simulations (even for extension) so we recommend just + Normally we don't need CPT files for REXEE simulations (even for extension) so we recommend just deleting the CPT files (which could save a lot of space if you perform a huge number of iterations). If you wish to keep them, specify this parameter as :code:`False`. @@ -338,7 +338,7 @@ include parameters for data analysis here. 3.5. Data analysis ------------------ - :code:`msm`: (Optional, Default: :code:`False`) - Whether to build Markov state models (MSMs) for the EEXE simulation and perform relevant analysis. + Whether to build Markov state models (MSMs) for the REXEE simulation and perform relevant analysis. - :code:`free_energy`: (Optional, Default: :code:`False`) Whether to perform free energy calculations in data analysis or not. Note that free energy calculations could be computationally expensive depending on the relevant settings. @@ -381,7 +381,7 @@ infinity internally. mdp: modify_coords: null - # Section 3: EEXE parameters + # Section 3: REXEE parameters n_sim: n_iter: s: @@ -418,10 +418,10 @@ infinity internally. ======================= As mentioned above, a template MDP file should have all the parameters that will be shared across all replicas. It should also define the coupling parameters for the whole range of -states so that different MDP files can be customized for different replicas. For an EEXE simulation -launched by the CLI :code:`run_EEXE`, any GROMACS MDP parameter that could potentially lead to issues -in the EEXE simulation will raise a warning. If the number of warnings is larger than the value -specified for the flag `-m`/`--maxwarn` in the CLI :code:`run_EEXE`, the simulation will error +states so that different MDP files can be customized for different replicas. For an REXEE simulation +launched by the CLI :code:`run_REXEE`, any GROMACS MDP parameter that could potentially lead to issues +in the REXEE simulation will raise a warning. If the number of warnings is larger than the value +specified for the flag `-m`/`--maxwarn` in the CLI :code:`run_REXEE`, the simulation will error out. To avoid warnings arised from MDP specification, we need to take extra care for the following MDP parameters: @@ -435,12 +435,12 @@ MDP parameters: of the simulation can be correctly parsed from the LOG file. - The MDP parameter :code:`nstdhdl` must be a factor of the YAML parameter :code:`nst_sim` so that the time series of the state index can be correctly parsed from the DHDL file. -- In EEXE, the MDP parameter :code:`nstdhdl` must be a factor of the MDP parameter :code:`nstexpanded`, or +- In REXEE, the MDP parameter :code:`nstdhdl` must be a factor of the MDP parameter :code:`nstexpanded`, or the calculation of the acceptance ratio may be wrong. - Be careful with the pull code specification if you want to apply a distance restraint between two pull groups. - Specifically, in an EEXE simulation, all iterations should use the same reference distance. Otherwise, poor sampling - can be observed in a fixed-weight EEXE simulation and the equilibration time may be much longer for a weight-updating - EEXE simulation. To ensure the same reference distance across all iterations in an EEXE simulation, consider the + Specifically, in an REXEE simulation, all iterations should use the same reference distance. Otherwise, poor sampling + can be observed in a fixed-weight REXEE simulation and the equilibration time may be much longer for a weight-updating + REXEE simulation. To ensure the same reference distance across all iterations in an REXEE simulation, consider the following scenarios: - If you would like to use the COM distance between the pull groups in the input GRO file as the reference distance for all the iterations (whatever that value is), then specify :code:`pull_coord1_start = yes` with diff --git a/docs/theory.rst b/docs/theory.rst index 770e244d..41c5b02b 100644 --- a/docs/theory.rst +++ b/docs/theory.rst @@ -2,13 +2,12 @@ 1. Basic idea ============= -Ensemble of expanded ensemble (EEXE) integrates the core principles of replica exchange -molecular dynamics (REMD) and expanded ensemble (EXE). Specifically, an ensemble of -expanded ensembles includes multiple non-interacting, parallel expanded ensemble simulations +Replica exchange of expanded ensemble (REXEE) integrates the core principles of replica exchange +molecular dynamics (REMD) and expanded ensemble (EXE). Specifically, a REXEE simulation includes multiple non-interacting, parallel expanded ensemble simulations that collectively sample a number of alchemical states spanning between the coupled state (:math:`\lambda=0`) and the uncoupled state (:math:`\lambda=1`). Each expanded ensemble samples a subset of these states such that the range of its allowed alchemical states -overlaps with that of the adjacent replicas. In EEXE, the exchange of coordinates/alchemical +overlaps with that of the adjacent replicas. In REXEE, the exchange of coordinates/alchemical states occurs at a specified frequency, which is beneficial for allowing better mixing in the alchemical space given sufficiently long simulation time, properly specified parameters and highly parallelizable computing architectures. @@ -401,10 +400,10 @@ sampling different alchemical ranges would have different references. Therefore, 2.4. How is swapping performed? ------------------------------- -As implied in :ref:`doc_basic_idea`, in an EEXE simulation, we could either choose to swap configurations +As implied in :ref:`doc_basic_idea`, in an REXEE simulation, we could either choose to swap configurations (via swapping GRO files) or replicas (via swapping MDP files). In this package, we chose the former when -implementing the EEXE algorithm. Specifically, in the CLI :code:`run_EEXE`, the function :obj:`.get_swapping_pattern` -is called once for each iteration and returns a list :code:`swap_pattern` that informs :code:`run_EEXE` how +implementing the REXEE algorithm. Specifically, in the CLI :code:`run_REXEE`, the function :obj:`.get_swapping_pattern` +is called once for each iteration and returns a list :code:`swap_pattern` that informs :code:`run_REXEE` how the GRO files should be swapped. (To better understand the list :code:`swap_pattern`, see the docstring of the function :obj:`.get_swapping_pattern`.) Internally, the function :obj:`.get_swapping_pattern` not only swaps the list :code:`swap_pattern` when an attempted move is accepted, but also swaps elements in lists that contains @@ -419,7 +418,7 @@ in the list of states. Check the source code of :obj :`.get_swapping_pattern` if 3.1. Basic idea --------------- To leverage the stastics of the states collected from multiple replicas, we recommend -combining the alchemical weights of these states across replicas during an weight-updating EEXE simulation. +combining the alchemical weights of these states across replicas during an weight-updating REXEE simulation. Ideally, the modified weights should facilitate the convergence of the alchemical weights in expanded ensemble, which in the limit of inifinite simulation time correspond to dimensionless free energies of the alchemical states. The modified weights also directly influence the the accpetance ratio, hence the convergence of the simulation @@ -539,5 +538,5 @@ To deal with this, the user can choose to specify :code:`N_cutoff` in the input correction will performed only when :math:`\text{argmin}(N_k, N_{k-1})` is larger than the cutoff. Also, this histogram correction should always be carried out before weight combination. This method is implemented in :obj:`.histogram_correction`. -4. Parameter space of EEXE +4. Parameter space of REXEE =========================== \ No newline at end of file diff --git a/ensemble_md/cli/analyze_REXEE.py b/ensemble_md/cli/analyze_REXEE.py index 693f0065..a441a994 100644 --- a/ensemble_md/cli/analyze_REXEE.py +++ b/ensemble_md/cli/analyze_REXEE.py @@ -35,7 +35,7 @@ def initialize(args): parser = argparse.ArgumentParser( - description='This code analyzes an ensemble of expanded ensemble. Note that the template MDP\ + description='This code analyzes a REXEE simulation. Note that the template MDP\ file specified in the YAML file needs to be available in the working directory.') parser.add_argument('-y', '--yaml', diff --git a/ensemble_md/cli/run_REXEE.py b/ensemble_md/cli/run_REXEE.py index 239bc910..e2641dda 100644 --- a/ensemble_md/cli/run_REXEE.py +++ b/ensemble_md/cli/run_REXEE.py @@ -24,7 +24,7 @@ def initialize(args): parser = argparse.ArgumentParser( - description='This code runs an ensemble of expanded ensemble given necessary inputs.') + description='This code runs a REXEE simulation given necessary inputs.') parser.add_argument('-y', '--yaml', type=str, @@ -99,7 +99,7 @@ def main(): MDP = REXEE.initialize_MDP(i) MDP.write(f"sim_{i}/iteration_0/expanded.mdp", skipempty=True) - # 2-2. Run the first ensemble of simulations + # 2-2. Run the first set of simulations REXEE.run_REXEE(0) else: @@ -280,7 +280,7 @@ def main(): print(f'\nAn error occurred on rank 0:\n{traceback.format_exc()}') MPI.COMM_WORLD.Abort(1) - # 4-2. Run another ensemble of simulations + # 4-2. Run another set of simulations REXEE.run_REXEE(i, swap_pattern) # 4-3. Save data