#!/usr/bin/env python

# get_cbs.py


##################################################################################################################################################
#                                                                                                                                                #
#       This script performs a complete basis set (CBS) extrapolation from  CCSD(T)/cc-pVTZ and CCSD(T)/cc-pVQZ single point energies.           #
#                               The relevant data is extracted from output files generated by the ORCA program.                                  #
#                                               -> CCSD(T)/cc-pVTZ single points must end with "_TZ.out"                                         #
#                                               -> CCSD(T)/cc-pVQZ single points must end with "_QZ.out"                                         #
#                                                                                                                                                #
#                                               Relevant data will be generated in a .txt file (cbs_data.txt)!                                   #
#                                                                                                                                                #
#                                                       Version: 1.03                                                                            #
#                                                                                                                                                #
#                                               Author:         Fabian L. Zott                                                                   #
#                                               Last modified:  16.06.2021                                                                       #
#                                                                                                                                                #
##################################################################################################################################################


import pandas as pd
import os
import glob
import sys
import re
import fileinput
import shutil           # for copying files
import pathlib
import math

# Render every float in printed DataFrames with six decimals, right-aligned in
# a ten-character field.
pd.set_option('display.float_format', '{:10.6f}'.format)


#################################################  Some Variables  ###############################################################################

criteria = 0.026255               # energy criterion in kJ/mol (criteria_hartree * hartconv, rounded)
criteria_hartree = 0.00001        # the same energy criterion expressed in Hartree
hartconv = 2625.498               # conversion factor from [Hartree] to [kJ/mol]
R = 0.008314511                   # gas constant in [kJ/(K*mol)]
temp = 298.15                     # standard-condition temperature in [K] (25 degC; was mistyped as 289.15)
# 1 kcal = 4184 J = 4.184 kJ.  NOTE(review): the original comment labelled this
# factor as "[kcal] to [kJ/mol]", which would be 4.184 -- confirm the intended
# unit before using it.  The value itself is left unchanged.
kcalvonv = 4184.0


# Element symbols listed by increasing atomic number (Z = 1 ... 118).
# Elements 112-118 are stored under their systematic placeholder symbols
# (Uub ... Uuo), exactly as this table has always provided them.
_elementSymbols = (
    "H He Li Be B C N O F Ne "
    "Na Mg Al Si P S Cl Ar K Ca "
    "Sc Ti V Cr Mn Fe Co Ni Cu Zn "
    "Ga Ge As Se Br Kr Rb Sr Y Zr "
    "Nb Mo Tc Ru Rh Pd Ag Cd In Sn "
    "Sb Te I Xe Cs Ba La Ce Pr Nd "
    "Pm Sm Eu Gd Tb Dy Ho Er Tm Yb "
    "Lu Hf Ta W Re Os Ir Pt Au Hg "
    "Tl Pb Bi Po At Rn Fr Ra Ac Th "
    "Pa U Np Pu Am Cm Bk Cf Es Fm "
    "Md No Lr Rf Db Sg Bh Hs Mt Ds "
    "Rg Uub Uut Uuq Uup Uuh Uus Uuo"
).split()

# Lookup table: atomic number (as a string key) -> element symbol.
elemDict = {
    str(atomicNumber): symbol
    for atomicNumber, symbol in enumerate(_elementSymbols, start=1)
}



########################  Functions  ##############################################################################################




def getSinglePointEnergyFromORCA(file):
    """Return the final single point energy reported in an ORCA output file.

    Every line containing 'FINAL SINGLE POINT ENERGY' is inspected and the
    value from the last such line wins, so for outputs that print the energy
    several times the final one is returned.

    Parameters:
        file: path of the ORCA output file (anything ``open`` accepts,
              including the bytes names produced by ``os.listdir(bytes)``).

    Returns:
        The energy exactly as printed in the file, as a string (the fifth
        whitespace-separated token of the matching line).

    Raises:
        ValueError: if the file contains no 'FINAL SINGLE POINT ENERGY' line.
    """
    energy = None
    # The context manager closes the handle even if reading fails part-way.
    with open(file) as handle:
        for line in handle:
            if 'FINAL SINGLE POINT ENERGY' in line:
                # Tokens: FINAL / SINGLE / POINT / ENERGY / <value>
                energy = line.split()[4]

    if energy is None:
        # Without this check the function used to fail with an opaque
        # UnboundLocalError, e.g. for crashed or unfinished ORCA jobs.
        raise ValueError(
            "No 'FINAL SINGLE POINT ENERGY' line found in {!r}".format(file)
        )
    return energy



def _readReferenceAndCorrelationEnergy(file):
    """Parse the reference and final correlation energy from one ORCA output.

    The last 'Reference energy' line supplies the reference energy (fourth
    whitespace-separated token) and the last 'Final correlation energy' line
    supplies the correlation energy (fifth token).

    Returns:
        Tuple ``(reference_energy, correlation_energy)`` of floats.

    Raises:
        ValueError: if either line is missing from the file, or if a matching
                    line does not carry a number at the expected position.
    """
    reference = None
    correlation = None
    with open(file) as handle:
        for line in handle:
            if 'Reference energy' in line:
                reference = float(line.split()[3])
            if 'Final correlation energy' in line:
                correlation = float(line.split()[4])

    if reference is None or correlation is None:
        raise ValueError(
            "Missing 'Reference energy' and/or 'Final correlation energy' "
            "in {!r}".format(file)
        )
    return reference, correlation


def completeBasisSet(file_TZ, file_QZ):
    """Extrapolate a triple-/quadruple-zeta pair of ORCA results to the CBS limit.

    The reference energy is extrapolated with a two-point exponential formula,
    E(X) = E_CBS + A * exp(alfa * sqrt(X)), and the correlation energy with a
    two-point power formula, E(X) = E_CBS + B * X**(-beta), using the cardinal
    numbers X = 3 (TZ) and X = 4 (QZ).

    Parameters:
        file_TZ: path of the triple-zeta ORCA output file.
        file_QZ: path of the matching quadruple-zeta ORCA output file.

    Returns:
        The extrapolated total energy (extrapolated reference energy plus
        extrapolated correlation energy) as a float, in the unit printed in
        the ORCA output.

    Raises:
        ValueError: if either file lacks the reference or correlation energy.
        OSError:    if either file cannot be opened.
    """
    # Extrapolation parameters.  NOTE(review): presumably the exponents
    # recommended for the cc-pVTZ/cc-pVQZ pair -- confirm against the ORCA
    # manual before reusing them for other basis sets.
    alfa = float(-5.46)
    beta = float(3.05)
    n = 3            # cardinal number of the TZ basis
    m = 4            # cardinal number of the QZ basis

    RefTZ, CorrTZ = _readReferenceAndCorrelationEnergy(file_TZ)
    RefQZ, CorrQZ = _readReferenceAndCorrelationEnergy(file_QZ)

    # Reference (SCF) part: eliminate A from the two exponential equations.
    corrAlfaN = math.exp(alfa * math.sqrt(n))
    corrAlfaM = math.exp(alfa * math.sqrt(m))
    RE_extrapolated = ((RefTZ * corrAlfaM) - (RefQZ * corrAlfaN))/(corrAlfaM - corrAlfaN)

    # Correlation part: eliminate B from the two power-law equations.
    Corr_extrapolated = (((n ** beta) * CorrTZ) - ((m ** beta) * CorrQZ))/((n ** beta)-(m ** beta))

    # Despite its historical name this is the total CBS energy, not only SCF.
    SCF_E_extrapolated = RE_extrapolated + Corr_extrapolated

    return SCF_E_extrapolated




#########################Prepare Folder for Execution of GetData####################

# Working directory, kept as text and in its filesystem (bytes) encoding; the
# bytes form makes os.listdir() hand back bytes entries.
directory_in_str = os.getcwd()
directory = os.fsencode(directory_in_str)

# pathlib views of the working directory and of the folder above it.
current_dir = pathlib.Path(directory_in_str)
parent_dir = pathlib.Path(directory_in_str).parent


############################  Save method code as string  ########################################################


# The suffixes are fixed by the naming convention ("<structure>_TZ.out" and
# "<structure>_QZ.out" share the same <structure> part), so define them
# unconditionally.  Previously they only existed if a matching file happened
# to be present, and the script later died with a NameError otherwise.
method_code_TZ = "_TZ"
file_end_TZ = "_TZ.out"
method_code_QZ = "_QZ"
file_end_QZ = "_QZ.out"

found_TZ = False
found_QZ = False

for file in os.listdir(directory):
    filename = os.fsdecode(file)

    if filename.endswith(file_end_TZ):
        found_TZ = True
    elif filename.endswith(file_end_QZ):
        found_QZ = True
    else:
        print("No specific file extension found in:", filename)

# A CBS extrapolation needs both basis-set levels; stop with a readable
# message instead of failing further down.
if not (found_TZ and found_QZ):
    sys.exit('Need at least one "*_TZ.out" and one "*_QZ.out" file in '
             + os.fsdecode(directory))



##################################  Extracting TZ Total Energy from .out  ########################################

# Collect one row per "*_TZ.out" file and build the frame in one go:
# DataFrame.append() no longer exists in pandas >= 2.0, and growing a frame
# row by row is quadratic anyway.
tz_rows = []

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(file_end_TZ):
        continue
    basename = str(os.path.splitext(filename)[0])
    E_tot_DPLNO_TZ = getSinglePointEnergyFromORCA(filename)
    tz_rows.append({
        # Drop the "_TZ" tag so the structure name matches the QZ/CBS tables.
        'Structure': basename.replace(method_code_TZ, ''),
        'E_tot_DPLNO(TZ)': E_tot_DPLNO_TZ,
    })

data_TZ = pd.DataFrame(tz_rows, columns=["Structure", "E_tot_DPLNO(TZ)"])

#print(data_TZ)

##################################  Create Empty Dataframe  ######################################################

# Skeleton of the result table: an independent copy holding only the structure
# names; the energy columns are merged onto it later.
data = data_TZ[['Structure']].copy()


##################################  Extracting QZ Total Energy from .out  ########################################

# Collect one row per "*_QZ.out" file and build the frame in one go:
# DataFrame.append() no longer exists in pandas >= 2.0, and growing a frame
# row by row is quadratic anyway.
qz_rows = []

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(file_end_QZ):
        continue
    basename = str(os.path.splitext(filename)[0])
    E_tot_DPLNO_QZ = getSinglePointEnergyFromORCA(filename)
    qz_rows.append({
        # Drop the "_QZ" tag so the structure name matches the TZ/CBS tables.
        'Structure': basename.replace(method_code_QZ, ''),
        'E_tot_DPLNO(QZ)': E_tot_DPLNO_QZ,
    })

data_QZ = pd.DataFrame(qz_rows, columns=["Structure", "E_tot_DPLNO(QZ)"])

#print(data_QZ)

#############################  Extrapolating Complete Basis Set for DPLNO  #######################################

# One CBS energy per "*_TZ.out" file; the matching QZ output is located by
# swapping the "_TZ" tag for "_QZ".  Rows are gathered in a list because
# DataFrame.append() no longer exists in pandas >= 2.0.
cbs_rows = []

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(file_end_TZ):
        continue
    basename_TZ = str(os.path.splitext(filename)[0])
    basename = basename_TZ.replace("_TZ", "")
    basename_QZ_out = basename + "_QZ.out"
    basename_QZ = basename_QZ_out.replace(".out", "")
    print("-----------------------next----------------------------")
    print("CBS extraplolation for", basename_TZ, "and", basename_QZ)
    # First argument is the TZ output, second the corresponding QZ output.
    SCF_E_extrapolated = completeBasisSet(filename, basename_QZ_out)
    cbs_rows.append({'Structure': basename, 'E_tot_DPLNO(CBS)': SCF_E_extrapolated})
    print("E_tot_CBS:", SCF_E_extrapolated)

data_CBS = pd.DataFrame(cbs_rows, columns=["Structure", "E_tot_DPLNO(CBS)"])


#data_CBS = data_CBS.replace(method_code_QZ, '',regex=True)

#print(data_CBS)

#################################  Merge all Dataframes and print  ###############################################

# Attach the TZ, QZ and CBS energies to the structure list.  pd.merge defaults
# to an inner join, so only structures present in all three tables survive.
for frame in (data_TZ, data_QZ, data_CBS):
    data = pd.merge(data, frame, on="Structure")

# Blank lines set the table apart from the progress messages above
# (same amount of vertical space as before: 12 newlines, then 6).
print("\n\n" * 6, end="")
print(data)
print("\n\n" * 3, end="")

# Tab-separated result file; `header` takes a bool (the former string 'true'
# only worked because any non-empty string is truthy).
data.to_csv(r'./cbs_data.txt', sep='\t', header=True, index=False, index_label=False, na_rep='NULL')
print("------------------------Code run smoothly!!------------------------")