diff --git a/ensemble_md/analysis/clustering.py b/ensemble_md/analysis/clustering.py index c3b5645a..02fa69d9 100644 --- a/ensemble_md/analysis/clustering.py +++ b/ensemble_md/analysis/clustering.py @@ -1,8 +1,17 @@ +#################################################################### +# # +# ensemble_md, # +# a python package for running GROMACS simulation ensembles # +# # +# Written by Wei-Tse Hsu # +# Copyright (c) 2022 University of Colorado Boulder # +# # +#################################################################### import numpy as np from ensemble_md.utils.utils import run_gmx_cmd -def cluster_traj(gmx_executable, inputs, grps, method='linkage', cutoff=0.1, suffix=None): +def cluster_traj(gmx_executable, inputs, grps, coupled_only=True, method='linkage', cutoff=0.1, suffix=None): """ Performs clustering analysis on a trajectory using the GROMACS command :code:`gmx cluster`. Note that only fully coupled configurations are considered. @@ -12,27 +21,46 @@ def cluster_traj(gmx_executable, inputs, grps, method='linkage', cutoff=0.1, suf gmx_executable : str The path to the GROMACS executable. inputs : dict - A dictionary that contains the file names of the input trajectory file (XTC or TRR), - the configuration file (TPR or GRO), the file that contains the time series of the - state index, and the index file (NDX). The must include the keys :code:`traj`, :code:`config`, - :code:`xvg`, and :code:`index`. Note that the value for the key :code:`index` can be :code:`None`. + A dictionary that contains the different input files required for the clustering analysis. + The dictionary must have the following four keys: :code:`traj` (input trajectory file in + XTC or TRR format), :code:`config` (the configuration file in TPR or GRO format), + :code:`xvg` (a GROMACS XVG file), and :code:`index` (an index/NDX file), with the values + being the paths. Note that the value of the key :code:`index` can be :code:`None`, in which + case the function will use a default index file generated by :code:`gmx make_ndx`. If the + parameter :code:`coupled_only` is set to :code:`True`, an XVG file that contains the time + series of the state index (e.g., :code:`dhdl.xvg`) must be provided with the key :code:`xvg`. + Otherwise, the key :code:`xvg` can be set to :code:`None`. grps : dict A dictionary that contains the names of the groups in the index file (NDX) for centering the system, calculating the RMSD, and outputting. The keys are :code:`center`, :code:`rmsd`, and :code:`output`. + coupled_only : bool + Whether to consider only the fully coupled configurations. The default is :code:`True`. method : str The method for clustering available for the GROMACS command :code:`gmx cluster`. The default is 'linkage'. + Check the GROMACS documentation for other available options. cutoff : float - The cutoff in RMSD for clustering. The default is 0.1. + The RMSD cutoff for clustering in nm. The default is 0.1. suffix : str The suffix for the output files. The default is :code:`None`, which means no suffix will be added. """ - # First check if all specified groups are present in the index file + # Check if the index file is provided + if inputs['index'] is None: + print('Running gmx make_ndx to generate an index file ...') + args = [ + gmx_executable, 'make_ndx', + '-f', inputs['config'], + '-o', 'index.ndx', + ] + returncode, stdout, stderr = run_gmx_cmd(args, prompt_input='q\n') + inputs['index'] = 'index.ndx' + + # Check if the groups are present in the index file with open(inputs['index'], 'r') as f: content = f.read() for key in grps: if grps[key] not in content: - raise ValueError(f'The group {grps[key]} is not present in the index file.') + raise ValueError(f'The group {grps[key]} is not present in the provided/generated index file.') outputs = { 'nojump': 'nojump.xtc', @@ -47,90 +75,104 @@ def cluster_traj(gmx_executable, inputs, grps, method='linkage', cutoff=0.1, suf for key in outputs: outputs[key] = outputs[key].replace('.', f'_{suffix}.') - print('Eliminating jumps across periodic boundaries for the input trajectory ...') - args = [ - gmx_executable, 'trjconv', - '-f', inputs['traj'], - '-s', inputs['config'], - '-o', outputs['nojump'], - '-center', 'yes', - '-pbc', 'nojump', - '-drop', inputs['xvg'], - '-dropover', '0' - ] - if inputs['index'] is not None: - args.extend(['-n', inputs['index']]) - returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["center"]}\n{grps["output"]}\n') - if returncode != 0: - print(f'Error with return code: {returncode}):\n{stderr}') - - print('Centering the system ...') - args = [ - gmx_executable, 'trjconv', - '-f', outputs['nojump'], - '-s', inputs['config'], - '-o', outputs['center'], - '-center', 'yes', - '-pbc', 'mol', - '-ur', 'compact', - ] - if inputs['index'] is not None: - args.extend(['-n', inputs['index']]) - returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["center"]}\n{grps["output"]}\n') - if returncode != 0: - print(f'Error with return code: {returncode}):\n{stderr}') - - print('Performing clustering analysis ...') - args = [ - gmx_executable, 'cluster', - '-f', outputs['center'], - '-s', inputs['config'], - '-o', outputs['rmsd-clust'], - '-dist', outputs['rmsd-dist'], - '-g', outputs['cluster-log'], - '-cl', outputs['cluster-pdb'], - '-cutoff', str(cutoff), - '-method', method, - ] - if inputs['index'] is not None: - args.extend(['-n', inputs['index']]) - returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["rmsd"]}\n{grps["output"]}\n') - if returncode != 0: - print(f'Error with return code: {returncode}):\n{stderr}') - - rmsd_range, rmsd_avg, n_clusters = get_cluster_info(outputs['cluster-log']) - - print(f'Range of RMSD values: from {rmsd_range[0]:.3f} to {rmsd_range[1]:.3f} nm') - print(f'Average RMSD: {rmsd_avg:.3f} nm') - print(f'Number of clusters: {n_clusters}') - - if n_clusters > 1: - clusters, sizes = get_cluster_members(outputs['cluster-log']) - for i in range(1, n_clusters + 1): - print(f' - Cluster {i} accounts for {sizes[i] * 100:.2f}% of the total configurations.') - - n_transitions, t_transitions = count_transitions(clusters) - print(f'Number of transitions between the two biggest clusters: {n_transitions}') - print(f'Time frames of the transitions (ps): {t_transitions}') - - print('Calculating the inter-medoid RMSD between the two biggest clusters ...') - # Note that we pass outputs['cluster-pdb'] to -s so that the first medoid will be used as the reference + # Check if there is any fully coupled state in the trajectory + lambda_data = np.transpose(np.loadtxt(inputs['xvg'], comments=['#', '@']))[1] + if coupled_only is True and 0 not in lambda_data: + print('Terminating clustering analysis since no fully decoupled state is present in the input trajectory while coupled_only is set to True.') # noqa: E501 + else: + # Either coupled_only is False or coupled_only is True but there are coupled configurations. + print('Eliminating jumps across periodic boundaries for the input trajectory ...') args = [ - gmx_executable, 'rms', - '-f', outputs['cluster-pdb'], - '-s', outputs['cluster-pdb'], - '-o', outputs['rmsd'], + gmx_executable, 'trjconv', + '-f', inputs['traj'], + '-s', inputs['config'], + '-n', inputs['index'], + '-o', outputs['nojump'], + '-center', 'yes', + '-pbc', 'nojump', ] - if inputs['index'] is not None: - args.extend(['-n', inputs['index']]) - # Here we simply assume same groups for least-squares fitting and RMSD calculation - returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["rmsd"]}\n{grps["rmsd"]}\n') + if coupled_only: + if inputs['xvg'] is None: + raise ValueError('The parameter "coupled_only" is set to True but no XVG file is provided.') + args.extend([ + '-drop', inputs['xvg'], + '-dropover', '0' + ]) + + returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["center"]}\n{grps["output"]}\n') + if returncode != 0: + print(f'Error with return code: {returncode}):\n{stderr}') + + print('Centering the system ...') + args = [ + gmx_executable, 'trjconv', + '-f', outputs['nojump'], + '-s', inputs['config'], + '-n', inputs['index'], + '-o', outputs['center'], + '-center', 'yes', + '-pbc', 'mol', + '-ur', 'compact', + ] + returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["center"]}\n{grps["output"]}\n') + if returncode != 0: + print(f'Error with return code: {returncode}):\n{stderr}') + + if coupled_only is True: + N_coupled = np.count_nonzero(lambda_data == 0) + print(f'Number of fully coupled configurations: {N_coupled}') + + print('Performing clustering analysis ...') + args = [ + gmx_executable, 'cluster', + '-f', outputs['center'], + '-s', inputs['config'], + '-n', inputs['index'], + '-o', outputs['rmsd-clust'], + '-dist', outputs['rmsd-dist'], + '-g', outputs['cluster-log'], + '-cl', outputs['cluster-pdb'], + '-cutoff', str(cutoff), + '-method', method, + ] + returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["rmsd"]}\n{grps["output"]}\n') if returncode != 0: print(f'Error with return code: {returncode}):\n{stderr}') - rmsd = np.transpose(np.loadtxt(outputs['rmsd'], comments=['@', '#']))[1][1] # inter-medoid RMSD - print(f'Inter-medoid RMSD between the two biggest clusters: {rmsd:.3f} nm') + rmsd_range, rmsd_avg, n_clusters = get_cluster_info(outputs['cluster-log']) + + print(f'Range of RMSD values: from {rmsd_range[0]:.3f} to {rmsd_range[1]:.3f} nm') + print(f'Average RMSD: {rmsd_avg:.3f} nm') + print(f'Number of clusters: {n_clusters}') + + if n_clusters > 1: + clusters, sizes = get_cluster_members(outputs['cluster-log']) + for i in range(1, n_clusters + 1): + print(f' - Cluster {i} accounts for {sizes[i] * 100:.2f}% of the total configurations.') + + n_transitions, t_transitions = count_transitions(clusters) + print(f'Number of transitions between the two biggest clusters: {n_transitions}') + print(f'Time frames of the transitions (ps): {t_transitions}') + + print('Calculating the inter-medoid RMSD between the two biggest clusters ...') + # Note that we pass outputs['cluster-pdb'] to -s so that the first medoid will be used as the reference + args = [ + gmx_executable, 'rms', + '-f', outputs['cluster-pdb'], + '-s', outputs['cluster-pdb'], + '-o', outputs['rmsd'], + ] + if inputs['index'] is not None: + args.extend(['-n', inputs['index']]) + + # Here we simply assume same groups for least-squares fitting and RMSD calculation + returncode, stdout, stderr = run_gmx_cmd(args, prompt_input=f'{grps["rmsd"]}\n{grps["rmsd"]}\n') + if returncode != 0: + print(f'Error with return code: {returncode}):\n{stderr}') + + rmsd = np.transpose(np.loadtxt(outputs['rmsd'], comments=['@', '#']))[1][1] # inter-medoid RMSD + print(f'Inter-medoid RMSD between the two biggest clusters: {rmsd:.3f} nm') def get_cluster_info(cluster_log): diff --git a/ensemble_md/cli/analyze_REXEE.py b/ensemble_md/cli/analyze_REXEE.py index 6548a606..e4a8a297 100644 --- a/ensemble_md/cli/analyze_REXEE.py +++ b/ensemble_md/cli/analyze_REXEE.py @@ -88,7 +88,6 @@ def main(): sys.stdout = utils.Logger(logfile=args.output) sys.stderr = utils.Logger(logfile=args.output) section_idx = 0 - poor_sampling = None rc('font', **{ 'family': 'sans-serif', @@ -264,9 +263,6 @@ def main(): print(f' - Trajectory {j} ({len(t_list[j])} events): {np.mean(t_list[j]):.2f} {units}') print(f' - Average of the above: {np.mean([np.mean(i) for i in t_list]):.2f} {units} (std: {np.std([np.mean(i) for i in t_list], ddof=1):.2f} {units})') # noqa: E501 - if np.sum(np.isnan([np.mean(i) for i in t_list])) != 0: - poor_sampling = True - if REXEE.msm is True: section_idx += 1 @@ -389,9 +385,6 @@ def main(): # Section 4 (or Section 3). Free energy calculations if REXEE.free_energy is True: - if poor_sampling is True: - print('\nFree energy calculation is not performed since the sampling appears poor.') - sys.exit() section_idx += 1 print(f'\n[ Section {section_idx}. Free energy calculations ]') @@ -401,19 +394,22 @@ def main(): if os.path.isfile(f'{args.dir}/u_nk_data.pickle') is True: print('Loading the preprocessed data u_nk ...') with open(f'{args.dir}/u_nk_data.pickle', 'rb') as handle: - data_list = pickle.load(handle) + data_all = pickle.load(handle) + data_list, t_idx_list, g_list = data_all[0], data_all[1], data_all[2] else: # should always be 'dhdl' if os.path.isfile(f'{args.dir}/dHdl_data.pickle') is True: print('Loading the preprocessed data dHdl ...') with open(f'{args.dir}/dHdl_data.pickle', 'rb') as handle: - data_list = pickle.load(handle) + data_all = pickle.load(handle) + data_list, t_idx_list, g_list = data_all[0], data_all[1], data_all[2] if data_list == []: files_list = [natsort.natsorted(glob.glob(f'sim_{i}/iteration_*/*dhdl*xvg')) for i in range(REXEE.n_sim)] data_list, t_idx_list, g_list = analyze_free_energy.preprocess_data(files_list, REXEE.temp, REXEE.df_data_type, REXEE.df_spacing) # noqa: E501 + data_all = [data_list, t_idx_list, g_list] with open(f'{args.dir}/{REXEE.df_data_type}_data.pickle', 'wb') as handle: - pickle.dump(data_list, handle, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(data_all, handle, protocol=pickle.HIGHEST_PROTOCOL) # 4-2. Calculate the free energy profile f, f_err, estimators = analyze_free_energy.calculate_free_energy(data_list, REXEE.state_ranges, REXEE.df_method, REXEE.err_method, REXEE.n_bootstrap, REXEE.seed) # noqa: E501 @@ -439,9 +435,10 @@ def main(): print(f'Averaged start index: {t_avg}') print(f'Averaged statistical inefficiency: {g_avg:.2f}') - data_list, _, _ = analyze_free_energy.preprocess_data(files_list, REXEE.temp, REXEE.df_data_type, REXEE.df_spacing, t_avg, g_avg) # noqa: E501 + data_list, t_idx_list, g_list = analyze_free_energy.preprocess_data(files_list, REXEE.temp, REXEE.df_data_type, REXEE.df_spacing, t_avg, g_avg) # noqa: E501 + data_all = [data_list, t_idx_list, g_list] with open(f'{args.dir}/{REXEE.df_data_type}_data_avg_subsampling.pickle', 'wb') as handle: - pickle.dump(data_list, handle, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(data_all, handle, protocol=pickle.HIGHEST_PROTOCOL) f, f_err, estimators = analyze_free_energy.calculate_free_energy(data_list, REXEE.state_ranges, REXEE.df_method, REXEE.err_method, REXEE.n_bootstrap, REXEE.seed) # noqa: E501 print('Plotting the full-range free energy profile ...')