From 5f45cd2744c1a0d205cad8e171b0573131da0621 Mon Sep 17 00:00:00 2001
From: Sean Colby <seanmcolby@gmail.com>
Date: Tue, 23 Apr 2024 09:56:37 -0700
Subject: [PATCH] Overhaul XTB parser

---
 isicle/parse.py | 221 ++++++++++++++++++------------------------------
 1 file changed, 80 insertions(+), 141 deletions(-)

diff --git a/isicle/parse.py b/isicle/parse.py
index 3385f20..fd3b9c8 100644
--- a/isicle/parse.py
+++ b/isicle/parse.py
@@ -321,7 +321,8 @@ def parse(self):
         # Add result info to geometry object
         if "geometry" in result:
             result["geometry"].add___dict__(
-                {k: v for k, v in result.items() if k != 'geometry'})
+                {k: v for k, v in result.items() if k != "geometry"}
+            )
 
         # Store attribute
         self.result = result
@@ -348,7 +349,7 @@ def _parse_geometry(self):
         Add docstring
         """
 
-        return self.data['xyz']['final']
+        return self.data["xyz"]["final"]
 
     def _parse_energy(self):
         """
@@ -360,7 +361,7 @@ def _parse_energy(self):
         energy = None
 
         # Cycle through file
-        for line in self.data['out'].split("\n"):
+        for line in self.data["out"].split("\n"):
             if "Total DFT energy" in line:
                 # Overwrite last saved energy
                 energy = float(line.split()[-1])
@@ -377,7 +378,7 @@ def _parse_shielding(self):
         shield_atoms = []
         shields = []
 
-        for line in self.data['out'].split("\n"):
+        for line in self.data["out"].split("\n"):
             if " SHIELDING" in line:
                 shield_idxs = [int(x) for x in line.split()[2:]]
                 if len(shield_idxs) == 0:
@@ -407,7 +408,7 @@ def _parse_spin(self):
         """
 
         # No spin
-        if "SPINSPIN" not in self.data['nw']:
+        if "SPINSPIN" not in self.data["nw"]:
             return None
 
         # TO DO: Add g-factors
@@ -419,7 +420,7 @@ def _parse_spin(self):
         g_factor = []
         ready = False
 
-        for line in self.data['out'].split("\n"):
+        for line in self.data["out"].split("\n"):
             if "Atom  " in line:
                 line = line.split()
                 idx1 = int((line[1].split(":"))[0])
@@ -468,7 +469,7 @@ def _parse_frequency(self):
         natoms = None
         has_frequency = None
 
-        lines = self.data['out'].split("\n")
+        lines = self.data["out"].split("\n")
         for i, line in enumerate(lines):
             if ("geometry" in line) and (natoms is None):
                 atom_start = i + 7
@@ -535,7 +536,7 @@ def _parse_charge(self):
         charges = []
         ready = False
 
-        for line in self.data['out'].split("\n"):
+        for line in self.data["out"].split("\n"):
             # Load charges from table
             if "Atom       Charge   Shell Charges" in line:
                 # Table header found. Overwrite anything saved previously
@@ -593,7 +594,7 @@ def _parse_timing(self):
         opt = False
         freq = False
 
-        for i, line in enumerate(self.data['out'].split("\n")):
+        for i, line in enumerate(self.data["out"].split("\n")):
             # ?
             if "No." in line and len(indices) == 0:
                 indices.append(i + 2)  # 0
@@ -638,8 +639,8 @@ def _parse_molden(self):
         """
         Add docstring
         """
-        if 'molden' in self.data:
-            return self['molden']
+        if "molden" in self.data:
+            return self["molden"]
 
         return None
 
@@ -653,9 +654,9 @@ def _parse_connectivity(self):
         """
         Add docstring
         """
-        
+
         # Split lines
-        lines = self.data['out'].split("\n")
+        lines = self.data["out"].split("\n")
 
         # Extracting Atoms & Coordinates
         coor_substr = "internuclear distances"
@@ -684,7 +685,7 @@ def _parse_connectivity(self):
         # Check for result
         if len(connectivity) > 0:
             return connectivity
-        
+
         return None
 
     def parse(self):
@@ -712,9 +713,10 @@ def parse(self):
         result = {k: v for k, v in result.items() if v is not None}
 
         # Add result info to geometry object
-        if 'geometry' in result:
-            result['geometry'].add___dict__(
-                {k: v for k, v in result.items() if k != 'geometry'})
+        if "geometry" in result:
+            result["geometry"].add___dict__(
+                {k: v for k, v in result.items() if k != "geometry"}
+            )
 
         # Store attribute
         self.result = result
@@ -870,22 +872,15 @@ class XTBParser(FileParserInterface):
     Add docstring
     """
 
-    def __init__(self):
-        """
-        Add docstring
-        """
-        self.contents = None
-        self.result = None
-        self.path = None
+    def __init__(self, data=None):
+        self.data = data
 
-    def load(self, path: str):
-        """
-        Load in the data file
-        """
-        with open(path, "r") as f:
-            self.contents = f.readlines()
-        self.path = path
-        # return self.contents
+        self.result = {}
+
+    def load(self, path):
+        self.data = isicle.io.load_pickle(path)
+
+        self.lines = self.data["out"].split("\n")
 
     def _crest_energy(self):
         """
@@ -896,11 +891,11 @@ def _crest_energy(self):
         population = []
 
         ready = False
-        for h in range(len(self.contents), 0, -1):
-            if "Erel/kcal" in self.contents[h]:
+        for h in range(len(self.lines), 0, -1):
+            if "Erel/kcal" in self.lines[h]:
                 g = h + 1
-                for j in range(g, len(self.contents)):
-                    line = self.contents[j].split()
+                for j in range(g, len(self.lines)):
+                    line = self.lines[j].split()
                     if len(line) == 8:
                         relative_energy.append(float(line[1]))
                         total_energy.append(float(line[2]))
@@ -909,7 +904,7 @@ def _crest_energy(self):
 
                     if "/K" in line[1]:
                         break
-            if ready == True:
+            if ready is True:
                 break
 
         return {
@@ -930,7 +925,7 @@ def grab_time(line):
             return ":".join(line[1:]).strip("\n")
 
         ready = False
-        for line in self.contents:
+        for line in self.lines:
             if "test MD wall time" in line:
                 test_MD = grab_time(line)
                 ready = True
@@ -941,7 +936,7 @@ def grab_time(line):
             if "multilevel OPT wall time" in line:
                 multilevel_OPT = grab_time(line)
 
-            if "MD wall time" in line and ready == True:
+            if "MD wall time" in line and ready is True:
                 MD = grab_time(line)
                 ready = False
 
@@ -967,19 +962,19 @@ def _isomer_energy(self):
         complete = False
         relative_energies = []
         total_energies = []
-        for i in range(len(self.contents), 0, -1):
-            if "structure    ΔE(kcal/mol)   Etot(Eh)" in self.contents[i]:
+        for i in range(len(self.lines), 0, -1):
+            if "structure    ΔE(kcal/mol)   Etot(Eh)" in self.lines[i]:
                 h = i + 1
-                for j in range(h, len(self.contents)):
-                    if self.contents[j] != " \n":
-                        line = self.contents[j].split()
+                for j in range(h, len(self.lines)):
+                    if self.lines[j] != " \n":
+                        line = self.lines[j].split()
                         relative_energies.append(float(line[1]))
                         total_energies.append(float(line[2]))
                     else:
                         complete = True
                         break
 
-            if complete == True:
+            if complete is True:
                 break
 
         return {"relative energy": relative_energies, "total energy": total_energies}
@@ -995,7 +990,7 @@ def grab_time(line):
 
             return ":".join(line[1:]).strip("\n")
 
-        for line in self.contents:
+        for line in self.lines:
             if "LMO calc. wall time" in line:
                 LMO_time = grab_time(line)
 
@@ -1015,7 +1010,7 @@ def _opt_energy(self):
         """
         Add docstring
         """
-        for line in self.contents:
+        for line in self.lines:
             if "TOTAL ENERGY" in line:
                 energy = line.split()[3] + " Hartrees"
 
@@ -1036,7 +1031,7 @@ def grab_time(line):
         scf = False
         anc = False
 
-        for line in self.contents:
+        for line in self.lines:
             if "wall-time" in line and tot is False:
                 total_time = grab_time(line)
                 tot = True
@@ -1055,28 +1050,6 @@ def grab_time(line):
             "ANC optimizer wall time": anc_time,
         }
 
-    def _parse_energy(self):
-        """
-        Add docstring
-        """
-        if self.parse_crest == True:
-            return self._crest_energy()
-        if self.parse_opt == True:
-            return self._opt_energy()
-        if self.parse_isomer == True:
-            return self._isomer_energy()
-
-    def _parse_timing(self):
-        """
-        Add docstring
-        """
-        if self.parse_crest == True:
-            return self._crest_timing()
-        if self.parse_opt == True:
-            return self._opt_timing()
-        if self.parse_isomer == True:
-            return self._isomer_timing()
-
     def _parse_protocol(self):
         """
         Add docstring
@@ -1090,26 +1063,25 @@ def _parse_protocol(self):
                 protocol = (line.split(":")[1]).strip("\n")
         return protocol
 
-    def _parse_xyz(self):
+    def _parse_geometry(self):
         """
         Split .xyz into separate XYZGeometry instances
         """
-
-        FILE = self.xyz_path
-        if len(list(pybel.readfile("xyz", FILE))) > 1:
-            geom_list = []
-            count = 1
-            XYZ = FILE.split(".")[0]
-            for geom in pybel.readfile("xyz", FILE):
-                geom.write("xyz", "%s_%d.xyz" % (XYZ, count))
-                geom_list.append("%s_%d.xyz" % (XYZ, count))
-                count += 1
-
-            # TODO: charge needs to propagate to here
-            # Improve loading scheme as well
-            return isicle.conformers.ConformationalEnsemble([isicle.io.load(i) for i in geom_list])
-
-        return isicle.io.load(self.xyz_path)
+        geometries = {}
+        # Add geometry info
+        for key in [
+            "conformers",
+            "rotamers",
+            "final",
+            "best",
+            "protonated",
+            "deprotonated",
+            "tautomers",
+        ]:
+            if key in self.data:
+                geometries[key] = self.data[key]
+        
+        return geometries
 
     def parse(self):
         """
@@ -1128,63 +1100,30 @@ def parse(self):
         ):
             raise RuntimeError("XTB job failed: {}".format(self.path))
 
-        self.parse_crest = False
-        self.parse_opt = False
-        self.parse_isomer = False
-
         # Initialize result object to store info
-        result = {}
-        result["protocol"] = self._parse_protocol()
-
-        try:
-            result["timing"] = self._parse_timing()
-        except:
-            pass
-
-        try:
-            result["energy"] = self._parse_energy()
-        except:
-            pass
-
-        # Parse geometry from assoc. XYZ file
-        if self.path.endswith("xyz"):
-            try:
-                result["geometry"] = self._parse_xyz()
-
-            except:
-                pass
-
-        if self.path.endswith("out") or self.path.endswith("log"):
-            # try geometry parsing
-            XYZ = None
-            if result["protocol"].split()[0] == "xtb":
-                self.parse_opt = True
-                XYZ = "xtbopt.xyz"
-            if result["protocol"].split()[1] == "crest":
-                if "-deprotonate" in result["protocol"]:
-                    self.parse_isomer = True
-                    XYZ = "deprotonated.xyz"
-                elif "-protonate" in result["protocol"]:
-                    self.parse_isomer = True
-                    XYZ = "protonated.xyz"
-                elif "-tautomer" in result["protocol"]:
-                    self.parse_isomer = True
-                    XYZ = "tautomers.xyz"
-                else:
-                    self.parse_crest = True
-                    XYZ = "crest_conformers.xyz"
-
-            if XYZ is None:
-                raise RuntimeError(
-                    "XYZ file associated with XTB job not available,\
-                                please parse separately."
-                )
+        result = {
+            "protocol": self._parse_protocol(),
+            "timing": self._parse_timing(),
+            "energy": self._parse_energy(),
+            "geometry": self._parse_geometry()
+        }
 
+        if result["protocol"].split()[0] == "xtb":
+            result["timing"] = self._opt_timing()
+            result["energy"] = self._opt_energy()
+
+        if result["protocol"].split()[1] == "crest":
+            if any(
+                [
+                    x in result["protocol"]
+                    for x in ["-deprotonate", "-protonate", "-tautomer"]
+                ]
+            ):
+                result["timing"] = self._isomer_timing()
+                result["energy"] = self._isomer_energy()
             else:
-                temp_dir = os.path.dirname(self.path)
-                self.xyz_path = os.path.join(temp_dir, XYZ)
-
-                result["geometry"] = self._parse_xyz()
+                result["timing"] = self._crest_timing()
+                result["energy"] = self._crest_energy()
 
         return result