From 9c8eb183335f4d6c16751ccab30b8e61ccbfb43a Mon Sep 17 00:00:00 2001
From: unknown <yanzihan@westlake.edu.cn>
Date: Thu, 26 Dec 2024 11:01:43 +0800
Subject: [PATCH 1/3] update tools

---
 tools/pca_sampling/pca_sampling_R2.py | 141 ++++++++++++++++++++++++++
 tools/readme.md                       |   1 +
 tools/xtd2exyz/xtd2exyz.pl            | 108 ++++++++++++++++++++
 3 files changed, 250 insertions(+)
 create mode 100644 tools/pca_sampling/pca_sampling_R2.py
 create mode 100644 tools/xtd2exyz/xtd2exyz.pl

diff --git a/tools/pca_sampling/pca_sampling_R2.py b/tools/pca_sampling/pca_sampling_R2.py
new file mode 100644
index 000000000..52c0d8050
--- /dev/null
+++ b/tools/pca_sampling/pca_sampling_R2.py
@@ -0,0 +1,141 @@
+"""
+    Purpose:
+        Select structures from orginal large reference dataset based on principal component Analysis (PCA) of 
+        descriptor space using farthest point sampling. We use the PbTe as a toy example to show how this script
+        works, one need to modify the path of reference dataset, nep model, and selected frame number case by case.
+
+    Ref:
+        calorine: https://calorine.materialsmodeling.org/tutorials/visualize_descriptor_space_with_pca.html
+        https://github.com/bigd4/PyNEP/blob/master/examples/plot_select_structure.py
+
+    Author:
+        Penghua Ying <hityingph(at)163.com>
+        Zherui Chen <chenzherui0124@foxmail.com>
+"""
+
+
+from ase.io import read, write
+from pylab import *
+from calorine.nep import get_descriptors
+from sklearn.decomposition import PCA
+from tqdm import tqdm
+import numpy as np
+from scipy.spatial.distance import cdist
+
+# -------------------------------------------------------------------
+# Input/Output and Configuration
+# -------------------------------------------------------------------
+INPUT_FILE = "train.xyz"               # Input file containing structures
+NEP_MODEL_FILE = "nep.txt"             # Input NEP model file for descriptor generation
+OUTPUT_SELECTED_FILE = "selected_structures.xyz"  # Output file for selected structures
+OUTPUT_UNSELECTED_FILE = "unselected_structures.xyz"  # Output file for unselected structures
+OUTPUT_PLOT_FILE = "FPS.png"           # Output file for visualization
+
+R2_THRESHOLD = 0.90  # R2 threshold to stop sampling
+PCA_COMPONENTS = 2   # Number of PCA components to use for dimensionality reduction
+# -------------------------------------------------------------------
+
+# Incremental Farthest Point Sampling with early stopping
+def incremental_fps_with_r2(pc, r2_threshold):
+    n_points = pc.shape[0]
+    overall_mean = np.mean(pc, axis=0)  # Mean of all points
+    total_variance = np.sum((pc - overall_mean) ** 2)  # Total variance of data
+
+    # Randomly select the first point
+    selected_indices = [np.random.randint(n_points)]
+    min_distances = np.full(n_points, np.inf)  # Initialize distances to infinity
+
+    explained_variance = 0
+    r2 = 0.0
+
+    # Loop to incrementally select points
+    for _ in range(1, n_points):
+        # Update min_distances: minimum distance from each point to the selected points
+        new_point_idx = selected_indices[-1]
+        new_distances = cdist(pc, pc[[new_point_idx]]).flatten()
+        min_distances = np.minimum(min_distances, new_distances)
+
+        # Select the next farthest point
+        next_index = np.argmax(min_distances)
+        selected_indices.append(next_index)
+
+        # Calculate explained variance incrementally
+        explained_variance = np.sum((pc[selected_indices] - overall_mean) ** 2)
+        r2 = explained_variance / total_variance
+
+        # Early stopping if R2 exceeds threshold
+        if r2 >= r2_threshold:
+            break
+
+    return selected_indices, r2
+
+
+# Set figure properties for plotting
+def set_fig_properties(ax_list):
+    tl = 8
+    tw = 2
+    tlm = 4
+    for ax in ax_list:
+        ax.tick_params(which="major", length=tl, width=tw)
+        ax.tick_params(which="minor", length=tlm, width=tw)
+        ax.tick_params(which="both", axis="both", direction="out", right=False, top=False)
+
+
+# -------------------------------------------------------------------
+# Main script
+# -------------------------------------------------------------------
+
+# Load dataset
+tol = read(INPUT_FILE, ":")  # Read original larger reference.xyz
+
+# Generate descriptors
+descriptors = []
+for i, t in tqdm(enumerate(tol)):
+    d = get_descriptors(t, model_filename=NEP_MODEL_FILE)  # Use NEP model file as input for descriptors
+    d_mean = np.mean(d, axis=0)  # Use average of each atomic descriptor to get structure descriptors
+    descriptors.append(d_mean)
+
+descriptors = np.array(descriptors)
+print(f"Total number of structures in dataset: {descriptors.shape[0]}")
+print(f"Number of descriptor components: {descriptors.shape[1]}")
+
+# PCA
+pca = PCA(n_components=PCA_COMPONENTS)
+pc = pca.fit_transform(descriptors)
+p0 = pca.explained_variance_ratio_[0]
+p1 = pca.explained_variance_ratio_[1]
+print(f"Explained variance for component 0: {p0:.2f}")
+print(f"Explained variance for component 1: {p1:.2f}")
+
+# Find minimum samples to achieve the threshold
+selected_indices, final_r2 = incremental_fps_with_r2(pc, R2_THRESHOLD)
+
+# Separate selected and unselected structures
+selected_structures = [tol[i] for i in selected_indices]
+unselected_structures = [t for i, t in enumerate(tol) if i not in selected_indices]
+
+# Save the selected and unselected structures
+write(OUTPUT_SELECTED_FILE, selected_structures)
+write(OUTPUT_UNSELECTED_FILE, unselected_structures)
+
+# Visualization
+figure(figsize=(10, 8))
+set_fig_properties([gca()])
+scatter(pc[:, 0], pc[:, 1], alpha=0.5, c="C0", label="All structures")
+scatter(pc[selected_indices, 0], pc[selected_indices, 1], s=8, color="C1", label="Selected structures")
+xlabel("PC1")
+ylabel("PC2")
+legend()
+
+# Add final R2 and number of samples to the plot
+text(
+    0.05,
+    0.95,
+    f"R$^2$: {final_r2:.3f}\nSamples: {len(selected_indices)}",
+    transform=gca().transAxes,
+    fontsize=14,
+    verticalalignment="top",
+    bbox=dict(boxstyle="round", facecolor="white", alpha=0.5),
+)
+
+savefig(OUTPUT_PLOT_FILE, bbox_inches="tight")
\ No newline at end of file
diff --git a/tools/readme.md b/tools/readme.md
index 315a20af3..c645a0277 100644
--- a/tools/readme.md
+++ b/tools/readme.md
@@ -34,6 +34,7 @@
 | split_xyz            | Yong Wang    | yongw@princeton.edu | Some functionalities for training/test data.         |
 | vasp2xyz             | Yanzhou Wang | yanzhowang@gmail.com     | Get `train.xyz` from `VASP` outputs.                         |
 | vim                  | Ke Xu        | twtdq@qq.com                | Highlight GPUMD grammar in `vim`.                            |
+| xtd2exyz | Zherui Chen | chenzherui0124@foxmail.com | Convert XTD files into XYZ format. |
 | xyz2gro              | Who? |                             | Convert `xyz` file to `gro` file.                            |
 
 
diff --git a/tools/xtd2exyz/xtd2exyz.pl b/tools/xtd2exyz/xtd2exyz.pl
new file mode 100644
index 000000000..aec2acf23
--- /dev/null
+++ b/tools/xtd2exyz/xtd2exyz.pl
@@ -0,0 +1,108 @@
+#!perl
+#**********************************************************
+#*                                                        *
+#*     XTD2XYZ - Convert XTD files into XYZ format        *
+#*                                                        *
+#**********************************************************
+# Version: 0.1
+# Author: Andrea Minoia
+# Date: 08/09/2010
+#
+# Convert MS trajectory xtd file into XYZ trajectory file.
+# Backup of files that are about to be overwritten is managed
+# by MS. The most recent file is that with higher index number (N)
+# The script has to be in the same directory of the
+# structure to modify and the user has to update the
+# variable $doc (line 31) according to the name of the
+# file containing the trajectory.
+# The xmol trajectory is stored in trj.txt file and it is not
+# possible to rename the file within MS, nor it is possible to
+# automatically export it as xyz or car file. You should manage
+# the new trajectory manually for further use (e.g. VMD)
+#
+# Modificator: Sobereva (sobereva@sina.com)
+# Date: 2012-May-23
+
+# Modificator: nanxu (tamas@zju.edu.cn)
+# Date: 2022-Jau-03
+# Add support for lattice
+
+# Date: 2024-4-25
+# Modificator: chenzherui (chenzherui0124@foxmail.com)
+# Energy unit eV; Force unit eV/Å
+# Materials Studio xtd trajectory to exyz 
+# Optimize programs to improve efficienc, fix bugs
+
+use strict;
+use MaterialsScript qw(:all);
+
+# Open the multiframe trajectory structure file or die
+my $doc = $Documents{"small.xtd"};
+
+die "no document" unless $doc;
+
+my $trajectory = $doc->Trajectory;
+my $num_frames = $trajectory->NumFrames;
+
+if ($num_frames > 1) {
+    print "Found $num_frames frames in the trajectory\n";
+
+    # Open new xmol trajectory file once
+    my $xmolFile = Documents->New("trj.txt");
+
+    # Get atoms in the structure once
+    my $atoms = $doc->UnitCell->Atoms;
+    my $Natoms = @$atoms;
+
+    # Pre-calculate conversion factor once
+    my $conv_factor = 0.0433641042385997;
+
+    # Loop over the frames
+    my $start_frame = 1;  # Starting frame (modify as needed)
+    my $end_frame = $num_frames;  # Ending frame (modify as needed)
+    my $step = 1;  # Save every nth frame (modify as needed)
+
+    # Loop over the frames with the specified range and step
+    for (my $frame = $start_frame; $frame <= $end_frame; $frame += $step) {
+        $trajectory->CurrentFrame = $frame;
+        my $potentialEnergy = $doc->PotentialEnergy * $conv_factor; # Convert from kcal/mol to eV
+
+        # Get lattice parameters and convert angles to radians for the current frame
+        my $lattice = $doc->Lattice3D;
+        my $a = $lattice->LengthA;
+        my $b = $lattice->LengthB;
+        my $c = $lattice->LengthC;
+        
+        my $pi =  3.141592653589793238462643383279;
+        
+        my $alpha = $lattice->AngleAlpha / 180 * $pi;
+        my $beta = $lattice->AngleBeta / 180 * $pi;
+        my $gamma = $lattice->AngleGamma / 180 * $pi;
+
+        # Calculate lattice vectors for the current frame
+        my $bc2 = $b**2 + $c**2 - 2 * $b * $c * cos($alpha);
+        my $h1 = $a;
+        my $h2 = $b * cos($gamma);
+        my $h3 = $b * sin($gamma);
+        my $h4 = $c * cos($beta);
+        my $h5 = (($h2 - $h4)**2 + $h3**2 + $c**2 - $h4**2 - $bc2) / (2 * $h3);
+        my $h6 = sqrt($c**2 - $h4**2 - $h5**2);
+
+        # Write header xyz once per frame
+        $xmolFile->Append("$Natoms\n");
+        $xmolFile->Append(sprintf "energy=%.10f config_type=ms2xyz Lattice=\"%f %f %f %f %f %f %f %f %f\" pbc=\"T T T\" Properties=species:S:1:pos:R:3:force:R:3\n", $potentialEnergy, $h1, 0, 0, $h2, $h3, 0, $h4, $h5, $h6);
+
+        # Loop over the atoms
+        for my $atom (@$atoms) {
+            # Write atom symbol, x-y-z- coordinates, and force vector
+            my $force = $atom->Force;
+            my $force_str = $force ? sprintf("%.10f %.10f %.10f", $force->X * $conv_factor, $force->Y * $conv_factor, $force->Z * $conv_factor) : "0 0 0";
+            $xmolFile->Append(sprintf "%s %.10f %.10f %.10f %s\n", $atom->ElementSymbol, $atom->X, $atom->Y, $atom->Z, $force_str);
+        }
+    }
+
+    # Close trajectory file once after all frames are processed
+    $xmolFile->Close;
+} else {
+    print "The " . $doc->Name . " is not a multiframe trajectory file \n";
+}
\ No newline at end of file

From 5c4c95d16aff5972318e7184f6fbbdfc7e348307 Mon Sep 17 00:00:00 2001
From: unknown <yanzihan@westlake.edu.cn>
Date: Thu, 26 Dec 2024 11:06:05 +0800
Subject: [PATCH 2/3] update readme

---
 tools/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/readme.md b/tools/readme.md
index c645a0277..9b99f1633 100644
--- a/tools/readme.md
+++ b/tools/readme.md
@@ -16,7 +16,7 @@
 | deep2nep             | Ke Xu        | twtdq@qq.com                                     | Oudated?                                                     |
 | doc_3.3.1            | Zheyong Fan  | brucenju@gmail.com | Documentation for some parts of GPUMD-v3.3.1.                |
 | dp2xyz               | Ke Xu        | twtdq@qq.com        | Convert `DP` training data to `xyz` format.                  |
-| exyz2pdb             | Zherui Chem  | chenzherui0124@foxmail.com  | Convert `exyz` to `pdb`. |
+| exyz2pdb             | Zherui Chen | chenzherui0124@foxmail.com  | Convert `exyz` to `pdb`. |
 | for_coding           | Zheyong Fan  | brucenju@gmail.com     | Something useful for Zheyong Fan only.                       |
 | get_max_rmse_xyz     | Ke Xu  | twtdq@qq.com | Identify structures with the largest errors.          |
 | gmx2exyz             | Zherui Chen  | chenzherui0124@foxmail.com  | Convert the `trr` trajectory of `gmx` to the `exyz` trajectory. |

From 3cf359c8116218b8b6309c507288643de0169d73 Mon Sep 17 00:00:00 2001
From: unknown <yanzihan@westlake.edu.cn>
Date: Thu, 26 Dec 2024 11:07:34 +0800
Subject: [PATCH 3/3] update readme

---
 tools/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/readme.md b/tools/readme.md
index 9b99f1633..1040e65eb 100644
--- a/tools/readme.md
+++ b/tools/readme.md
@@ -34,7 +34,7 @@
 | split_xyz            | Yong Wang    | yongw@princeton.edu | Some functionalities for training/test data.         |
 | vasp2xyz             | Yanzhou Wang | yanzhowang@gmail.com     | Get `train.xyz` from `VASP` outputs.                         |
 | vim                  | Ke Xu        | twtdq@qq.com                | Highlight GPUMD grammar in `vim`.                            |
-| xtd2exyz | Zherui Chen | chenzherui0124@foxmail.com | Convert XTD files into XYZ format. |
+| xtd2exyz | Zherui Chen | chenzherui0124@foxmail.com | Get train.xyz from Materials Studio outputs. |
 | xyz2gro              | Who? |                             | Convert `xyz` file to `gro` file.                            |