Skip to content

Commit

Permalink
Merge pull request brucefan1983#833 from zhyan0603/zhyan0603-GPUMD
Browse files Browse the repository at this point in the history
Update Tools
  • Loading branch information
brucefan1983 authored Dec 26, 2024
2 parents 67cfeb1 + 3cf359c commit 716032a
Show file tree
Hide file tree
Showing 3 changed files with 251 additions and 1 deletion.
141 changes: 141 additions & 0 deletions tools/pca_sampling/pca_sampling_R2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""
Purpose:
Select structures from orginal large reference dataset based on principal component Analysis (PCA) of
descriptor space using farthest point sampling. We use the PbTe as a toy example to show how this script
works, one need to modify the path of reference dataset, nep model, and selected frame number case by case.
Ref:
calorine: https://calorine.materialsmodeling.org/tutorials/visualize_descriptor_space_with_pca.html
https://github.com/bigd4/PyNEP/blob/master/examples/plot_select_structure.py
Author:
Penghua Ying <hityingph(at)163.com>
Zherui Chen <chenzherui0124@foxmail.com>
"""


from ase.io import read, write
from pylab import *
from calorine.nep import get_descriptors
from sklearn.decomposition import PCA
from tqdm import tqdm
import numpy as np
from scipy.spatial.distance import cdist

# -------------------------------------------------------------------
# Input/Output and Configuration
# -------------------------------------------------------------------
INPUT_FILE = "train.xyz" # Input file containing structures
NEP_MODEL_FILE = "nep.txt" # Input NEP model file for descriptor generation
OUTPUT_SELECTED_FILE = "selected_structures.xyz" # Output file for selected structures
OUTPUT_UNSELECTED_FILE = "unselected_structures.xyz" # Output file for unselected structures
OUTPUT_PLOT_FILE = "FPS.png" # Output file for visualization

R2_THRESHOLD = 0.90 # R2 threshold to stop sampling
PCA_COMPONENTS = 2 # Number of PCA components to use for dimensionality reduction
# -------------------------------------------------------------------

# Incremental Farthest Point Sampling with early stopping
def incremental_fps_with_r2(pc, r2_threshold):
n_points = pc.shape[0]
overall_mean = np.mean(pc, axis=0) # Mean of all points
total_variance = np.sum((pc - overall_mean) ** 2) # Total variance of data

# Randomly select the first point
selected_indices = [np.random.randint(n_points)]
min_distances = np.full(n_points, np.inf) # Initialize distances to infinity

explained_variance = 0
r2 = 0.0

# Loop to incrementally select points
for _ in range(1, n_points):
# Update min_distances: minimum distance from each point to the selected points
new_point_idx = selected_indices[-1]
new_distances = cdist(pc, pc[[new_point_idx]]).flatten()
min_distances = np.minimum(min_distances, new_distances)

# Select the next farthest point
next_index = np.argmax(min_distances)
selected_indices.append(next_index)

# Calculate explained variance incrementally
explained_variance = np.sum((pc[selected_indices] - overall_mean) ** 2)
r2 = explained_variance / total_variance

# Early stopping if R2 exceeds threshold
if r2 >= r2_threshold:
break

return selected_indices, r2


# Set figure properties for plotting
def set_fig_properties(ax_list):
tl = 8
tw = 2
tlm = 4
for ax in ax_list:
ax.tick_params(which="major", length=tl, width=tw)
ax.tick_params(which="minor", length=tlm, width=tw)
ax.tick_params(which="both", axis="both", direction="out", right=False, top=False)


# -------------------------------------------------------------------
# Main script
# -------------------------------------------------------------------

# Load dataset
tol = read(INPUT_FILE, ":") # Read original larger reference.xyz

# Generate descriptors
descriptors = []
for i, t in tqdm(enumerate(tol)):
d = get_descriptors(t, model_filename=NEP_MODEL_FILE) # Use NEP model file as input for descriptors
d_mean = np.mean(d, axis=0) # Use average of each atomic descriptor to get structure descriptors
descriptors.append(d_mean)

descriptors = np.array(descriptors)
print(f"Total number of structures in dataset: {descriptors.shape[0]}")
print(f"Number of descriptor components: {descriptors.shape[1]}")

# PCA
pca = PCA(n_components=PCA_COMPONENTS)
pc = pca.fit_transform(descriptors)
p0 = pca.explained_variance_ratio_[0]
p1 = pca.explained_variance_ratio_[1]
print(f"Explained variance for component 0: {p0:.2f}")
print(f"Explained variance for component 1: {p1:.2f}")

# Find minimum samples to achieve the threshold
selected_indices, final_r2 = incremental_fps_with_r2(pc, R2_THRESHOLD)

# Separate selected and unselected structures
selected_structures = [tol[i] for i in selected_indices]
unselected_structures = [t for i, t in enumerate(tol) if i not in selected_indices]

# Save the selected and unselected structures
write(OUTPUT_SELECTED_FILE, selected_structures)
write(OUTPUT_UNSELECTED_FILE, unselected_structures)

# Visualization
figure(figsize=(10, 8))
set_fig_properties([gca()])
scatter(pc[:, 0], pc[:, 1], alpha=0.5, c="C0", label="All structures")
scatter(pc[selected_indices, 0], pc[selected_indices, 1], s=8, color="C1", label="Selected structures")
xlabel("PC1")
ylabel("PC2")
legend()

# Add final R2 and number of samples to the plot
text(
0.05,
0.95,
f"R$^2$: {final_r2:.3f}\nSamples: {len(selected_indices)}",
transform=gca().transAxes,
fontsize=14,
verticalalignment="top",
bbox=dict(boxstyle="round", facecolor="white", alpha=0.5),
)

savefig(OUTPUT_PLOT_FILE, bbox_inches="tight")
3 changes: 2 additions & 1 deletion tools/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
| deep2nep | Ke Xu | twtdq@qq.com | Oudated? |
| doc_3.3.1 | Zheyong Fan | brucenju@gmail.com | Documentation for some parts of GPUMD-v3.3.1. |
| dp2xyz | Ke Xu | twtdq@qq.com | Convert `DP` training data to `xyz` format. |
| exyz2pdb | Zherui Chem | chenzherui0124@foxmail.com | Convert `exyz` to `pdb`. |
| exyz2pdb | Zherui Chen | chenzherui0124@foxmail.com | Convert `exyz` to `pdb`. |
| for_coding | Zheyong Fan | brucenju@gmail.com | Something useful for Zheyong Fan only. |
| get_max_rmse_xyz | Ke Xu | twtdq@qq.com | Identify structures with the largest errors. |
| gmx2exyz | Zherui Chen | chenzherui0124@foxmail.com | Convert the `trr` trajectory of `gmx` to the `exyz` trajectory. |
Expand All @@ -34,6 +34,7 @@
| split_xyz | Yong Wang | yongw@princeton.edu | Some functionalities for training/test data. |
| vasp2xyz | Yanzhou Wang | yanzhowang@gmail.com | Get `train.xyz` from `VASP` outputs. |
| vim | Ke Xu | twtdq@qq.com | Highlight GPUMD grammar in `vim`. |
| xtd2exyz | Zherui Chen | chenzherui0124@foxmail.com | Get train.xyz from Materials Studio outputs. |
| xyz2gro | Who? | | Convert `xyz` file to `gro` file. |


Expand Down
108 changes: 108 additions & 0 deletions tools/xtd2exyz/xtd2exyz.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!perl
#**********************************************************
#* *
#* XTD2XYZ - Convert XTD files into XYZ format *
#* *
#**********************************************************
# Version: 0.1
# Author: Andrea Minoia
# Date: 08/09/2010
#
# Convert MS trajectory xtd file into XYZ trajectory file.
# Backup of files that are about to be overwritten is managed
# by MS. The most recent file is that with higher index number (N)
# The script has to be in the same directory of the
# structure to modify and the user has to update the
# variable $doc (line 31) according to the name of the
# file containing the trajectory.
# The xmol trajectory is stored in trj.txt file and it is not
# possible to rename the file within MS, nor it is possible to
# automatically export it as xyz or car file. You should manage
# the new trajectory manually for further use (e.g. VMD)
#
# Modificator: Sobereva (sobereva@sina.com)
# Date: 2012-May-23

# Modificator: nanxu (tamas@zju.edu.cn)
# Date: 2022-Jau-03
# Add support for lattice

# Date: 2024-4-25
# Modificator: chenzherui (chenzherui0124@foxmail.com)
# Energy unit eV; Force unit eV/Å
# Materials Studio xtd trajectory to exyz
# Optimize programs to improve efficienc, fix bugs

use strict;
use MaterialsScript qw(:all);

# Open the multiframe trajectory structure file or die
my $doc = $Documents{"small.xtd"};

die "no document" unless $doc;

my $trajectory = $doc->Trajectory;
my $num_frames = $trajectory->NumFrames;

if ($num_frames > 1) {
print "Found $num_frames frames in the trajectory\n";

# Open new xmol trajectory file once
my $xmolFile = Documents->New("trj.txt");

# Get atoms in the structure once
my $atoms = $doc->UnitCell->Atoms;
my $Natoms = @$atoms;

# Pre-calculate conversion factor once
my $conv_factor = 0.0433641042385997;

# Loop over the frames
my $start_frame = 1; # Starting frame (modify as needed)
my $end_frame = $num_frames; # Ending frame (modify as needed)
my $step = 1; # Save every nth frame (modify as needed)

# Loop over the frames with the specified range and step
for (my $frame = $start_frame; $frame <= $end_frame; $frame += $step) {
$trajectory->CurrentFrame = $frame;
my $potentialEnergy = $doc->PotentialEnergy * $conv_factor; # Convert from kcal/mol to eV

# Get lattice parameters and convert angles to radians for the current frame
my $lattice = $doc->Lattice3D;
my $a = $lattice->LengthA;
my $b = $lattice->LengthB;
my $c = $lattice->LengthC;

my $pi = 3.141592653589793238462643383279;

my $alpha = $lattice->AngleAlpha / 180 * $pi;
my $beta = $lattice->AngleBeta / 180 * $pi;
my $gamma = $lattice->AngleGamma / 180 * $pi;

# Calculate lattice vectors for the current frame
my $bc2 = $b**2 + $c**2 - 2 * $b * $c * cos($alpha);
my $h1 = $a;
my $h2 = $b * cos($gamma);
my $h3 = $b * sin($gamma);
my $h4 = $c * cos($beta);
my $h5 = (($h2 - $h4)**2 + $h3**2 + $c**2 - $h4**2 - $bc2) / (2 * $h3);
my $h6 = sqrt($c**2 - $h4**2 - $h5**2);

# Write header xyz once per frame
$xmolFile->Append("$Natoms\n");
$xmolFile->Append(sprintf "energy=%.10f config_type=ms2xyz Lattice=\"%f %f %f %f %f %f %f %f %f\" pbc=\"T T T\" Properties=species:S:1:pos:R:3:force:R:3\n", $potentialEnergy, $h1, 0, 0, $h2, $h3, 0, $h4, $h5, $h6);

# Loop over the atoms
for my $atom (@$atoms) {
# Write atom symbol, x-y-z- coordinates, and force vector
my $force = $atom->Force;
my $force_str = $force ? sprintf("%.10f %.10f %.10f", $force->X * $conv_factor, $force->Y * $conv_factor, $force->Z * $conv_factor) : "0 0 0";
$xmolFile->Append(sprintf "%s %.10f %.10f %.10f %s\n", $atom->ElementSymbol, $atom->X, $atom->Y, $atom->Z, $force_str);
}
}

# Close trajectory file once after all frames are processed
$xmolFile->Close;
} else {
print "The " . $doc->Name . " is not a multiframe trajectory file \n";
}

0 comments on commit 716032a

Please sign in to comment.