- Conda is installed
git clone https://github.com/peterpaohuang/tapir.git
conda create -c rdkit -n tapir_env rdkit
conda activate tapir_env
- Download polymer_db.csv
- Move polymer_db.csv into tapir directory
python setup.py
while inside the tapir_env conda environment
from depablo_box import PDBML, model
dx = PDBML()
df = dx.df
# list both polymer names and smiles
df[["polymer_name", "smiles"]]
# list only polymer names
df["polymer_name"]
# list only smiles
df["smiles"]
# list only inchi keys
df["inchi"]
# retrieve polymer row by polymer_name
df.loc[df["polymer_name"] == polymer_name]
# retrieve polymer row by smiles
df.loc[df["smiles"] == smiles]
# retrieve polymer row by inchi key
df.loc[df["inchi"] == inchi_key]
dx.chemical_descriptors
- ExactMolWt
- FpDensityMorgan1
- FpDensityMorgan2
- FpDensityMorgan3
- HeavyAtomMolWt
- MolWt
- etc
dx.experimental_descriptors
- Molar Volume Vm
- Density ρ
- Solubility Parameter δ
- Molar Cohesive Energy Ecoh
- Glass Transition Temperature Tg
- Molar Heat Capacity Cp
- Entanglement Molecular Weight Me
- Index of Refraction n
- Coefficient of Thermal Expansion α
- Molecular Weight of Repeat unit
- Van-der-Waals Volume VvW
dx.na_distribution()
dx.ml_methods
dx.conversion_formats
Note: currently, depablo_box can only calculate chemical descriptors. Experimental descriptors already exist within the database (dx.df).
descriptor_list = ["ExactMolWt", "HeavyAtomMolWt"]
polymer_identifier = "C=CC(=O)NC(C)(C)C" # can also be the polymer_name
descriptor_df = dx.get_descriptors(polymer_identifier, descriptor_list)
Protein Data Bank
Gaussian 98/03 Input
polymer_identifier = 'CC(=O)OC=C' # can also be the polymer_name
conversion_format = 'Gaussian 98/03 Input'
outpath = '/file/path/your_polymer.xyz'
dx.create_input_file(polymer_identifier, conversion_format, outpath)
dx.add_descriptors(descriptor_list)
dx.plot_properties(property_x="glass_transition_temperature", property_y="ExactMolWt")
dx.plot_many(property_list)
dx.property_correlation("molar_heat_capacity", "HeavyAtomMolWt")
dx.correlation_map(property_list)
dx.export_csv(outpath)
# input_properties must have already been added to PDBML().df
input_properties = ["molar_heat_capacity", "ExactMolWt", "HeavyAtomMolWt"]
output_property = "solubility_parameter"
na_strategy = "remove"
ml = model(df, input_properties, output_property, na_strategy=na_strategy)
Support Vector Regression
Linear Regression
Ridge Regression
Lasso Regression
Gaussian Process Regression
model_type = "Support Vector Regression"
ml.train(model_type)
ml.r_2
new_data = [["10.5", "29", "102.1"]]
results = ml.predict(new_data)
Note: the Gaussian Process Regression model type does not support feature importances.
ml.feature_importance()
ml.export_fitted_model(outpath)
import pickle
with open(outpath, "rb") as f:
ml = pickle.load(f)
results = ml.predict(new_data)
from depablo_box import polymer_scraper
scraper = polymer_scraper()
scraper.start()
outpath = '/file/path/to/store/FILE.csv'
scraper.store_data(outpath)