From 5f45cd2744c1a0d205cad8e171b0573131da0621 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Tue, 23 Apr 2024 09:56:37 -0700 Subject: [PATCH] Overhaul XTB parser --- isicle/parse.py | 221 ++++++++++++++++++------------------------------ 1 file changed, 80 insertions(+), 141 deletions(-) diff --git a/isicle/parse.py b/isicle/parse.py index 3385f20..fd3b9c8 100644 --- a/isicle/parse.py +++ b/isicle/parse.py @@ -321,7 +321,8 @@ def parse(self): # Add result info to geometry object if "geometry" in result: result["geometry"].add___dict__( - {k: v for k, v in result.items() if k != 'geometry'}) + {k: v for k, v in result.items() if k != "geometry"} + ) # Store attribute self.result = result @@ -348,7 +349,7 @@ def _parse_geometry(self): Add docstring """ - return self.data['xyz']['final'] + return self.data["xyz"]["final"] def _parse_energy(self): """ @@ -360,7 +361,7 @@ def _parse_energy(self): energy = None # Cycle through file - for line in self.data['out'].split("\n"): + for line in self.data["out"].split("\n"): if "Total DFT energy" in line: # Overwrite last saved energy energy = float(line.split()[-1]) @@ -377,7 +378,7 @@ def _parse_shielding(self): shield_atoms = [] shields = [] - for line in self.data['out'].split("\n"): + for line in self.data["out"].split("\n"): if " SHIELDING" in line: shield_idxs = [int(x) for x in line.split()[2:]] if len(shield_idxs) == 0: @@ -407,7 +408,7 @@ def _parse_spin(self): """ # No spin - if "SPINSPIN" not in self.data['nw']: + if "SPINSPIN" not in self.data["nw"]: return None # TO DO: Add g-factors @@ -419,7 +420,7 @@ def _parse_spin(self): g_factor = [] ready = False - for line in self.data['out'].split("\n"): + for line in self.data["out"].split("\n"): if "Atom " in line: line = line.split() idx1 = int((line[1].split(":"))[0]) @@ -468,7 +469,7 @@ def _parse_frequency(self): natoms = None has_frequency = None - lines = self.data['out'].split("\n") + lines = self.data["out"].split("\n") for i, line in enumerate(lines): if ("geometry" in line) and (natoms is None): atom_start = i + 7 @@ -535,7 +536,7 @@ def _parse_charge(self): charges = [] ready = False - for line in self.data['out'].split("\n"): + for line in self.data["out"].split("\n"): # Load charges from table if "Atom Charge Shell Charges" in line: # Table header found. Overwrite anything saved previously @@ -593,7 +594,7 @@ def _parse_timing(self): opt = False freq = False - for i, line in enumerate(self.data['out'].split("\n")): + for i, line in enumerate(self.data["out"].split("\n")): # ? if "No." in line and len(indices) == 0: indices.append(i + 2) # 0 @@ -638,8 +639,8 @@ def _parse_molden(self): """ Add docstring """ - if 'molden' in self.data: - return self['molden'] + if "molden" in self.data: + return self["molden"] return None @@ -653,9 +654,9 @@ def _parse_connectivity(self): """ Add docstring """ - + # Split lines - lines = self.data['out'].split("\n") + lines = self.data["out"].split("\n") # Extracting Atoms & Coordinates coor_substr = "internuclear distances" @@ -684,7 +685,7 @@ def _parse_connectivity(self): # Check for result if len(connectivity) > 0: return connectivity - + return None def parse(self): @@ -712,9 +713,10 @@ def parse(self): result = {k: v for k, v in result.items() if v is not None} # Add result info to geometry object - if 'geometry' in result: - result['geometry'].add___dict__( - {k: v for k, v in result.items() if k != 'geometry'}) + if "geometry" in result: + result["geometry"].add___dict__( + {k: v for k, v in result.items() if k != "geometry"} + ) # Store attribute self.result = result @@ -870,22 +872,15 @@ class XTBParser(FileParserInterface): Add docstring """ - def __init__(self): - """ - Add docstring - """ - self.contents = None - self.result = None - self.path = None + def __init__(self, data=None): + self.data = data - def load(self, path: str): - """ - Load in the data file - """ - with open(path, "r") as f: - self.contents = f.readlines() - self.path = path - # return self.contents + self.result = {} + + def load(self, path): + self.data = isicle.io.load_pickle(path) + + self.lines = self.data["out"].split("\n") def _crest_energy(self): """ @@ -896,11 +891,11 @@ def _crest_energy(self): population = [] ready = False - for h in range(len(self.contents), 0, -1): - if "Erel/kcal" in self.contents[h]: + for h in range(len(self.lines), 0, -1): + if "Erel/kcal" in self.lines[h]: g = h + 1 - for j in range(g, len(self.contents)): - line = self.contents[j].split() + for j in range(g, len(self.lines)): + line = self.lines[j].split() if len(line) == 8: relative_energy.append(float(line[1])) total_energy.append(float(line[2])) @@ -909,7 +904,7 @@ def _crest_energy(self): if "/K" in line[1]: break - if ready == True: + if ready is True: break return { @@ -930,7 +925,7 @@ def grab_time(line): return ":".join(line[1:]).strip("\n") ready = False - for line in self.contents: + for line in self.lines: if "test MD wall time" in line: test_MD = grab_time(line) ready = True @@ -941,7 +936,7 @@ def grab_time(line): if "multilevel OPT wall time" in line: multilevel_OPT = grab_time(line) - if "MD wall time" in line and ready == True: + if "MD wall time" in line and ready is True: MD = grab_time(line) ready = False @@ -967,19 +962,19 @@ def _isomer_energy(self): complete = False relative_energies = [] total_energies = [] - for i in range(len(self.contents), 0, -1): - if "structure ΔE(kcal/mol) Etot(Eh)" in self.contents[i]: + for i in range(len(self.lines), 0, -1): + if "structure ΔE(kcal/mol) Etot(Eh)" in self.lines[i]: h = i + 1 - for j in range(h, len(self.contents)): - if self.contents[j] != " \n": - line = self.contents[j].split() + for j in range(h, len(self.lines)): + if self.lines[j] != " \n": + line = self.lines[j].split() relative_energies.append(float(line[1])) total_energies.append(float(line[2])) else: complete = True break - if complete == True: + if complete is True: break return {"relative energy": relative_energies, "total energy": total_energies} @@ -995,7 +990,7 @@ def grab_time(line): return ":".join(line[1:]).strip("\n") - for line in self.contents: + for line in self.lines: if "LMO calc. wall time" in line: LMO_time = grab_time(line) @@ -1015,7 +1010,7 @@ def _opt_energy(self): """ Add docstring """ - for line in self.contents: + for line in self.lines: if "TOTAL ENERGY" in line: energy = line.split()[3] + " Hartrees" @@ -1036,7 +1031,7 @@ def grab_time(line): scf = False anc = False - for line in self.contents: + for line in self.lines: if "wall-time" in line and tot is False: total_time = grab_time(line) tot = True @@ -1055,28 +1050,6 @@ def grab_time(line): "ANC optimizer wall time": anc_time, } - def _parse_energy(self): - """ - Add docstring - """ - if self.parse_crest == True: - return self._crest_energy() - if self.parse_opt == True: - return self._opt_energy() - if self.parse_isomer == True: - return self._isomer_energy() - - def _parse_timing(self): - """ - Add docstring - """ - if self.parse_crest == True: - return self._crest_timing() - if self.parse_opt == True: - return self._opt_timing() - if self.parse_isomer == True: - return self._isomer_timing() - def _parse_protocol(self): """ Add docstring @@ -1090,26 +1063,25 @@ def _parse_protocol(self): protocol = (line.split(":")[1]).strip("\n") return protocol - def _parse_xyz(self): + def _parse_geometry(self): """ Split .xyz into separate XYZGeometry instances """ - - FILE = self.xyz_path - if len(list(pybel.readfile("xyz", FILE))) > 1: - geom_list = [] - count = 1 - XYZ = FILE.split(".")[0] - for geom in pybel.readfile("xyz", FILE): - geom.write("xyz", "%s_%d.xyz" % (XYZ, count)) - geom_list.append("%s_%d.xyz" % (XYZ, count)) - count += 1 - - # TODO: charge needs to propagate to here - # Improve loading scheme as well - return isicle.conformers.ConformationalEnsemble([isicle.io.load(i) for i in geom_list]) - - return isicle.io.load(self.xyz_path) + geometries = {} + # Add geometry info + for key in [ + "conformers", + "rotamers", + "final", + "best", + "protonated", + "deprotonated", + "tautomers", + ]: + if key in self.data: + geometries[key] = self.data[key] + + return geometries def parse(self): """ @@ -1128,63 +1100,30 @@ def parse(self): ): raise RuntimeError("XTB job failed: {}".format(self.path)) - self.parse_crest = False - self.parse_opt = False - self.parse_isomer = False - # Initialize result object to store info - result = {} - result["protocol"] = self._parse_protocol() - - try: - result["timing"] = self._parse_timing() - except: - pass - - try: - result["energy"] = self._parse_energy() - except: - pass - - # Parse geometry from assoc. XYZ file - if self.path.endswith("xyz"): - try: - result["geometry"] = self._parse_xyz() - - except: - pass - - if self.path.endswith("out") or self.path.endswith("log"): - # try geometry parsing - XYZ = None - if result["protocol"].split()[0] == "xtb": - self.parse_opt = True - XYZ = "xtbopt.xyz" - if result["protocol"].split()[1] == "crest": - if "-deprotonate" in result["protocol"]: - self.parse_isomer = True - XYZ = "deprotonated.xyz" - elif "-protonate" in result["protocol"]: - self.parse_isomer = True - XYZ = "protonated.xyz" - elif "-tautomer" in result["protocol"]: - self.parse_isomer = True - XYZ = "tautomers.xyz" - else: - self.parse_crest = True - XYZ = "crest_conformers.xyz" - - if XYZ is None: - raise RuntimeError( - "XYZ file associated with XTB job not available,\ - please parse separately." - ) + result = { + "protocol": self._parse_protocol(), + "timing": self._parse_timing(), + "energy": self._parse_energy(), + "geometry": self._parse_geometry() + } + if result["protocol"].split()[0] == "xtb": + result["timing"] = self._opt_timing() + result["energy"] = self._opt_energy() + + if result["protocol"].split()[1] == "crest": + if any( + [ + x in result["protocol"] + for x in ["-deprotonate", "-protonate", "-tautomer"] + ] + ): + result["timing"] = self._isomer_timing() + result["energy"] = self._isomer_energy() else: - temp_dir = os.path.dirname(self.path) - self.xyz_path = os.path.join(temp_dir, XYZ) - - result["geometry"] = self._parse_xyz() + result["timing"] = self._crest_timing() + result["energy"] = self._crest_energy() return result