From 9c8eb183335f4d6c16751ccab30b8e61ccbfb43a Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 26 Dec 2024 11:01:43 +0800 Subject: [PATCH 1/3] update tools --- tools/pca_sampling/pca_sampling_R2.py | 141 ++++++++++++++++++++++++++ tools/readme.md | 1 + tools/xtd2exyz/xtd2exyz.pl | 108 ++++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 tools/pca_sampling/pca_sampling_R2.py create mode 100644 tools/xtd2exyz/xtd2exyz.pl diff --git a/tools/pca_sampling/pca_sampling_R2.py b/tools/pca_sampling/pca_sampling_R2.py new file mode 100644 index 000000000..52c0d8050 --- /dev/null +++ b/tools/pca_sampling/pca_sampling_R2.py @@ -0,0 +1,141 @@ +""" + Purpose: + Select structures from orginal large reference dataset based on principal component Analysis (PCA) of + descriptor space using farthest point sampling. We use the PbTe as a toy example to show how this script + works, one need to modify the path of reference dataset, nep model, and selected frame number case by case. + + Ref: + calorine: https://calorine.materialsmodeling.org/tutorials/visualize_descriptor_space_with_pca.html + https://github.com/bigd4/PyNEP/blob/master/examples/plot_select_structure.py + + Author: + Penghua Ying + Zherui Chen +""" + + +from ase.io import read, write +from pylab import * +from calorine.nep import get_descriptors +from sklearn.decomposition import PCA +from tqdm import tqdm +import numpy as np +from scipy.spatial.distance import cdist + +# ------------------------------------------------------------------- +# Input/Output and Configuration +# ------------------------------------------------------------------- +INPUT_FILE = "train.xyz" # Input file containing structures +NEP_MODEL_FILE = "nep.txt" # Input NEP model file for descriptor generation +OUTPUT_SELECTED_FILE = "selected_structures.xyz" # Output file for selected structures +OUTPUT_UNSELECTED_FILE = "unselected_structures.xyz" # Output file for unselected structures +OUTPUT_PLOT_FILE = "FPS.png" # Output file for visualization + +R2_THRESHOLD = 0.90 # R2 threshold to stop sampling +PCA_COMPONENTS = 2 # Number of PCA components to use for dimensionality reduction +# ------------------------------------------------------------------- + +# Incremental Farthest Point Sampling with early stopping +def incremental_fps_with_r2(pc, r2_threshold): + n_points = pc.shape[0] + overall_mean = np.mean(pc, axis=0) # Mean of all points + total_variance = np.sum((pc - overall_mean) ** 2) # Total variance of data + + # Randomly select the first point + selected_indices = [np.random.randint(n_points)] + min_distances = np.full(n_points, np.inf) # Initialize distances to infinity + + explained_variance = 0 + r2 = 0.0 + + # Loop to incrementally select points + for _ in range(1, n_points): + # Update min_distances: minimum distance from each point to the selected points + new_point_idx = selected_indices[-1] + new_distances = cdist(pc, pc[[new_point_idx]]).flatten() + min_distances = np.minimum(min_distances, new_distances) + + # Select the next farthest point + next_index = np.argmax(min_distances) + selected_indices.append(next_index) + + # Calculate explained variance incrementally + explained_variance = np.sum((pc[selected_indices] - overall_mean) ** 2) + r2 = explained_variance / total_variance + + # Early stopping if R2 exceeds threshold + if r2 >= r2_threshold: + break + + return selected_indices, r2 + + +# Set figure properties for plotting +def set_fig_properties(ax_list): + tl = 8 + tw = 2 + tlm = 4 + for ax in ax_list: + ax.tick_params(which="major", length=tl, width=tw) + ax.tick_params(which="minor", length=tlm, width=tw) + ax.tick_params(which="both", axis="both", direction="out", right=False, top=False) + + +# ------------------------------------------------------------------- +# Main script +# ------------------------------------------------------------------- + +# Load dataset +tol = read(INPUT_FILE, ":") # Read original larger reference.xyz + +# Generate descriptors +descriptors = [] +for i, t in tqdm(enumerate(tol)): + d = get_descriptors(t, model_filename=NEP_MODEL_FILE) # Use NEP model file as input for descriptors + d_mean = np.mean(d, axis=0) # Use average of each atomic descriptor to get structure descriptors + descriptors.append(d_mean) + +descriptors = np.array(descriptors) +print(f"Total number of structures in dataset: {descriptors.shape[0]}") +print(f"Number of descriptor components: {descriptors.shape[1]}") + +# PCA +pca = PCA(n_components=PCA_COMPONENTS) +pc = pca.fit_transform(descriptors) +p0 = pca.explained_variance_ratio_[0] +p1 = pca.explained_variance_ratio_[1] +print(f"Explained variance for component 0: {p0:.2f}") +print(f"Explained variance for component 1: {p1:.2f}") + +# Find minimum samples to achieve the threshold +selected_indices, final_r2 = incremental_fps_with_r2(pc, R2_THRESHOLD) + +# Separate selected and unselected structures +selected_structures = [tol[i] for i in selected_indices] +unselected_structures = [t for i, t in enumerate(tol) if i not in selected_indices] + +# Save the selected and unselected structures +write(OUTPUT_SELECTED_FILE, selected_structures) +write(OUTPUT_UNSELECTED_FILE, unselected_structures) + +# Visualization +figure(figsize=(10, 8)) +set_fig_properties([gca()]) +scatter(pc[:, 0], pc[:, 1], alpha=0.5, c="C0", label="All structures") +scatter(pc[selected_indices, 0], pc[selected_indices, 1], s=8, color="C1", label="Selected structures") +xlabel("PC1") +ylabel("PC2") +legend() + +# Add final R2 and number of samples to the plot +text( + 0.05, + 0.95, + f"R$^2$: {final_r2:.3f}\nSamples: {len(selected_indices)}", + transform=gca().transAxes, + fontsize=14, + verticalalignment="top", + bbox=dict(boxstyle="round", facecolor="white", alpha=0.5), +) + +savefig(OUTPUT_PLOT_FILE, bbox_inches="tight") \ No newline at end of file diff --git a/tools/readme.md b/tools/readme.md index 315a20af3..c645a0277 100644 --- a/tools/readme.md +++ b/tools/readme.md @@ -34,6 +34,7 @@ | split_xyz | Yong Wang | yongw@princeton.edu | Some functionalities for training/test data. | | vasp2xyz | Yanzhou Wang | yanzhowang@gmail.com | Get `train.xyz` from `VASP` outputs. | | vim | Ke Xu | twtdq@qq.com | Highlight GPUMD grammar in `vim`. | +| xtd2exyz | Zherui Chen | chenzherui0124@foxmail.com | Convert XTD files into XYZ format. | | xyz2gro | Who? | | Convert `xyz` file to `gro` file. | diff --git a/tools/xtd2exyz/xtd2exyz.pl b/tools/xtd2exyz/xtd2exyz.pl new file mode 100644 index 000000000..aec2acf23 --- /dev/null +++ b/tools/xtd2exyz/xtd2exyz.pl @@ -0,0 +1,108 @@ +#!perl +#********************************************************** +#* * +#* XTD2XYZ - Convert XTD files into XYZ format * +#* * +#********************************************************** +# Version: 0.1 +# Author: Andrea Minoia +# Date: 08/09/2010 +# +# Convert MS trajectory xtd file into XYZ trajectory file. +# Backup of files that are about to be overwritten is managed +# by MS. The most recent file is that with higher index number (N) +# The script has to be in the same directory of the +# structure to modify and the user has to update the +# variable $doc (line 31) according to the name of the +# file containing the trajectory. +# The xmol trajectory is stored in trj.txt file and it is not +# possible to rename the file within MS, nor it is possible to +# automatically export it as xyz or car file. You should manage +# the new trajectory manually for further use (e.g. VMD) +# +# Modificator: Sobereva (sobereva@sina.com) +# Date: 2012-May-23 + +# Modificator: nanxu (tamas@zju.edu.cn) +# Date: 2022-Jau-03 +# Add support for lattice + +# Date: 2024-4-25 +# Modificator: chenzherui (chenzherui0124@foxmail.com) +# Energy unit eV; Force unit eV/Å +# Materials Studio xtd trajectory to exyz +# Optimize programs to improve efficienc, fix bugs + +use strict; +use MaterialsScript qw(:all); + +# Open the multiframe trajectory structure file or die +my $doc = $Documents{"small.xtd"}; + +die "no document" unless $doc; + +my $trajectory = $doc->Trajectory; +my $num_frames = $trajectory->NumFrames; + +if ($num_frames > 1) { + print "Found $num_frames frames in the trajectory\n"; + + # Open new xmol trajectory file once + my $xmolFile = Documents->New("trj.txt"); + + # Get atoms in the structure once + my $atoms = $doc->UnitCell->Atoms; + my $Natoms = @$atoms; + + # Pre-calculate conversion factor once + my $conv_factor = 0.0433641042385997; + + # Loop over the frames + my $start_frame = 1; # Starting frame (modify as needed) + my $end_frame = $num_frames; # Ending frame (modify as needed) + my $step = 1; # Save every nth frame (modify as needed) + + # Loop over the frames with the specified range and step + for (my $frame = $start_frame; $frame <= $end_frame; $frame += $step) { + $trajectory->CurrentFrame = $frame; + my $potentialEnergy = $doc->PotentialEnergy * $conv_factor; # Convert from kcal/mol to eV + + # Get lattice parameters and convert angles to radians for the current frame + my $lattice = $doc->Lattice3D; + my $a = $lattice->LengthA; + my $b = $lattice->LengthB; + my $c = $lattice->LengthC; + + my $pi = 3.141592653589793238462643383279; + + my $alpha = $lattice->AngleAlpha / 180 * $pi; + my $beta = $lattice->AngleBeta / 180 * $pi; + my $gamma = $lattice->AngleGamma / 180 * $pi; + + # Calculate lattice vectors for the current frame + my $bc2 = $b**2 + $c**2 - 2 * $b * $c * cos($alpha); + my $h1 = $a; + my $h2 = $b * cos($gamma); + my $h3 = $b * sin($gamma); + my $h4 = $c * cos($beta); + my $h5 = (($h2 - $h4)**2 + $h3**2 + $c**2 - $h4**2 - $bc2) / (2 * $h3); + my $h6 = sqrt($c**2 - $h4**2 - $h5**2); + + # Write header xyz once per frame + $xmolFile->Append("$Natoms\n"); + $xmolFile->Append(sprintf "energy=%.10f config_type=ms2xyz Lattice=\"%f %f %f %f %f %f %f %f %f\" pbc=\"T T T\" Properties=species:S:1:pos:R:3:force:R:3\n", $potentialEnergy, $h1, 0, 0, $h2, $h3, 0, $h4, $h5, $h6); + + # Loop over the atoms + for my $atom (@$atoms) { + # Write atom symbol, x-y-z- coordinates, and force vector + my $force = $atom->Force; + my $force_str = $force ? sprintf("%.10f %.10f %.10f", $force->X * $conv_factor, $force->Y * $conv_factor, $force->Z * $conv_factor) : "0 0 0"; + $xmolFile->Append(sprintf "%s %.10f %.10f %.10f %s\n", $atom->ElementSymbol, $atom->X, $atom->Y, $atom->Z, $force_str); + } + } + + # Close trajectory file once after all frames are processed + $xmolFile->Close; +} else { + print "The " . $doc->Name . " is not a multiframe trajectory file \n"; +} \ No newline at end of file From 5c4c95d16aff5972318e7184f6fbbdfc7e348307 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 26 Dec 2024 11:06:05 +0800 Subject: [PATCH 2/3] update readme --- tools/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/readme.md b/tools/readme.md index c645a0277..9b99f1633 100644 --- a/tools/readme.md +++ b/tools/readme.md @@ -16,7 +16,7 @@ | deep2nep | Ke Xu | twtdq@qq.com | Oudated? | | doc_3.3.1 | Zheyong Fan | brucenju@gmail.com | Documentation for some parts of GPUMD-v3.3.1. | | dp2xyz | Ke Xu | twtdq@qq.com | Convert `DP` training data to `xyz` format. | -| exyz2pdb | Zherui Chem | chenzherui0124@foxmail.com | Convert `exyz` to `pdb`. | +| exyz2pdb | Zherui Chen | chenzherui0124@foxmail.com | Convert `exyz` to `pdb`. | | for_coding | Zheyong Fan | brucenju@gmail.com | Something useful for Zheyong Fan only. | | get_max_rmse_xyz | Ke Xu | twtdq@qq.com | Identify structures with the largest errors. | | gmx2exyz | Zherui Chen | chenzherui0124@foxmail.com | Convert the `trr` trajectory of `gmx` to the `exyz` trajectory. | From 3cf359c8116218b8b6309c507288643de0169d73 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 26 Dec 2024 11:07:34 +0800 Subject: [PATCH 3/3] update readme --- tools/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/readme.md b/tools/readme.md index 9b99f1633..1040e65eb 100644 --- a/tools/readme.md +++ b/tools/readme.md @@ -34,7 +34,7 @@ | split_xyz | Yong Wang | yongw@princeton.edu | Some functionalities for training/test data. | | vasp2xyz | Yanzhou Wang | yanzhowang@gmail.com | Get `train.xyz` from `VASP` outputs. | | vim | Ke Xu | twtdq@qq.com | Highlight GPUMD grammar in `vim`. | -| xtd2exyz | Zherui Chen | chenzherui0124@foxmail.com | Convert XTD files into XYZ format. | +| xtd2exyz | Zherui Chen | chenzherui0124@foxmail.com | Get train.xyz from Materials Studio outputs. | | xyz2gro | Who? | | Convert `xyz` file to `gro` file. |