diff --git a/README.md b/README.md index 0e5ae456..b4907e3d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,10 @@ UVVisML Predict optical properties of molecules with machine learning. -## Setup +## Colab Examples +A Google Colab notebook is available [here](https://colab.research.google.com/github/learningmatter-mit/uvvisml/blob/main/uvvisml_demo.ipynb) with examples of using the various types of models and predictions. Alternatively, you may use the command line instructions below. + +## Command Line Setup 0. Install [Anaconda or Miniconda](https://docs.conda.io/projects/continuumio-conda/en/latest/user-guide/install/index.html) if you have not yet done so. 1. `git clone git@github.com:learningmatter-mit/uvvisml.git` 2. `cd uvvisml` diff --git a/uvvisml_demo.ipynb b/uvvisml_demo.ipynb new file mode 100644 index 00000000..c9383afc --- /dev/null +++ b/uvvisml_demo.ipynb @@ -0,0 +1,3984 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "uvvisml_demo.ipynb", + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Setup" + ], + "metadata": { + "id": "6jZnD-vU9Fd-" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WGZxehx_44Rj", + "outputId": "bbf17a4d-3e75-4f9b-ea11-d42c39cad146" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2022-02-05 05:19:39-- https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh\n", + "Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8303, ...\n", + "Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... 
connected.\n", + "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", + "\n", + " The file is already fully retrieved; nothing to do.\n", + "\n", + "PREFIX=/usr/local\n", + "Unpacking payload ...\n", + "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n", + "Solving environment: | \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \n", + "The environment is inconsistent, please check the package plan carefully\n", + "The following packages are causing the inconsistency:\n", + "\n", + " - conda-forge/linux-64::rdkit==2020.09.2=py37h713bca6_0\n", + " - conda-forge/noarch::sphinxcontrib-qthelp==1.0.3=py_0\n", + " - defaults/noarch::urllib3==1.26.6=pyhd3eb1b0_1\n", + " - defaults/linux-64::python==3.7.7=hcff3b4d_5\n", + " - defaults/linux-64::ncurses==6.2=he6710b0_1\n", + " - conda-forge/noarch::hyperopt==0.2.5=pyh9f0ad1d_0\n", + " - conda-forge/linux-64::pcre==8.45=h9c3ff4c_0\n", + " - defaults/noarch::tqdm==4.61.2=pyhd3eb1b0_1\n", + " - defaults/linux-64::chardet==4.0.0=py37h06a4308_1003\n", + " - conda-forge/linux-64::libpng==1.6.37=h21135ba_2\n", + " - conda-forge/linux-64::tornado==6.1=py37h5e8e339_1\n", + " - conda-forge/linux-64::pymongo==3.10.1=py37h3340039_2\n", + " - defaults/linux-64::libmklml==2019.0.5=h06a4308_0\n", + " - defaults/linux-64::tk==8.6.10=hbc83047_0\n", + " - conda-forge/linux-64::click==8.0.3=py37h89c1867_1\n", + " - conda-forge/linux-64::docutils==0.16=py37h89c1867_3\n", + " - defaults/linux-64::kiwisolver==1.3.1=py37h2531618_0\n", + " - conda-forge/linux-64::reportlab==3.5.68=py37h69800bb_0\n", + " - conda-forge/noarch::flask==2.0.2=pyhd8ed1ab_0\n", + " - conda-forge/linux-64::bzip2==1.0.8=h7f98852_4\n", + " - defaults/noarch::requests==2.25.1=pyhd3eb1b0_0\n", + " - conda-forge/linux-64::boost-cpp==1.74.0=h9359b55_0\n", + " - conda-forge/linux-64::scikit-learn==0.23.2=py37hddcf8d6_3\n", + " - conda-forge/noarch::python-dateutil==2.8.2=pyhd8ed1ab_0\n", + " - 
conda-forge/linux-64::libcblas==3.9.0=5_h92ddd45_netlib\n", + " - conda-forge/noarch::werkzeug==2.0.2=pyhd8ed1ab_0\n", + " - conda-forge/noarch::snowballstemmer==2.2.0=pyhd8ed1ab_0\n", + " - conda-forge/noarch::sphinxcontrib-jsmath==1.0.1=py_0\n", + " - conda-forge/linux-64::freetype==2.10.4=h0708190_1\n", + " - conda-forge/linux-64::lz4-c==1.9.3=h9c3ff4c_1\n", + " - defaults/noarch::pycparser==2.20=py_2\n", + " - conda-forge/noarch::pandas-flavor==0.2.0=py_0\n", + " - defaults/linux-64::yaml==0.2.5=h7b6447c_0\n", + " - conda-forge/noarch::pygments==2.11.2=pyhd8ed1ab_0\n", + " - conda-forge/noarch::pyparsing==3.0.7=pyhd8ed1ab_0\n", + " - conda-forge/linux-64::liblapack==3.9.0=5_h92ddd45_netlib\n", + " - conda-forge/noarch::chemprop==1.4.1=pyhd8ed1ab_0\n", + " - defaults/linux-64::libuuid==1.0.3=h7f8727e_2\n", + " - conda-forge/linux-64::certifi==2021.10.8=py37h89c1867_1\n", + " - conda-forge/linux-64::python_abi==3.7=2_cp37m\n", + " - conda-forge/noarch::cloudpickle==2.0.0=pyhd8ed1ab_0\n", + " - conda-forge/noarch::networkx==2.6.3=pyhd8ed1ab_1\n", + " - conda-forge/noarch::typed-argument-parser==1.7.2=pyhd8ed1ab_0\n", + " - conda-forge/linux-64::importlib-metadata==4.10.1=py37h89c1867_0\n", + " - conda-forge/noarch::packaging==21.3=pyhd8ed1ab_0\n", + " - defaults/linux-64::brotlipy==0.7.0=py37h27cfd23_1003\n", + " - defaults/linux-64::pysocks==1.7.1=py37_1\n", + " - defaults/linux-64::pip==21.1.3=py37h06a4308_0\n", + " - defaults/noarch::pyopenssl==20.0.1=pyhd3eb1b0_1\n", + " - conda-forge/noarch::sphinxcontrib-htmlhelp==2.0.0=pyhd8ed1ab_0\n", + " - conda-forge/noarch::dataclasses==0.8=pyhc8e2a94_3\n", + " - defaults/linux-64::xz==5.2.5=h7b6447c_0\n", + " - defaults/linux-64::cairo==1.16.0=hf32fb01_1\n", + " - defaults/linux-64::cryptography==3.4.7=py37hd23ed53_0\n", + " - conda-forge/noarch::colorama==0.4.4=pyh9f0ad1d_0\n", + " - defaults/linux-64::pycosat==0.6.3=py37h27cfd23_0\n", + " - conda-forge/linux-64::libiconv==1.16=h516909a_0\n", + " - 
conda-forge/linux-64::pycairo==1.20.1=py37hfff247e_0\n", + " - conda-forge/linux-64::pixman==0.40.0=h36c2ea0_0\n", + " - conda-forge/noarch::babel==2.9.1=pyh44b312d_0\n", + " - conda-forge/linux-64::sqlalchemy==1.3.23=py37h5e8e339_0\n", + " - conda-forge/noarch::threadpoolctl==3.1.0=pyh8a188c0_0\n", + " - defaults/linux-64::fontconfig==2.13.1=h6c09931_0\n", + " - defaults/linux-64::libedit==3.1.20181209=hc058e9b_0\n", + " - defaults/linux-64::readline==8.1=h27cfd23_0\n", + " - conda-forge/noarch::alabaster==0.7.12=py_0\n", + " - conda-forge/noarch::xarray==0.20.2=pyhd8ed1ab_0\n", + " - conda-forge/noarch::jinja2==3.0.3=pyhd8ed1ab_0\n", + " - conda-forge/noarch::itsdangerous==2.0.1=pyhd8ed1ab_0\n", + " - defaults/noarch::six==1.16.0=pyhd3eb1b0_0\n", + " - conda-forge/linux-64::zstd==1.4.9=ha95c52a_0\n", + " - defaults/linux-64::libxcb==1.14=h7b6447c_0\n", + " - defaults/noarch::wheel==0.36.2=pyhd3eb1b0_0\n", + " - conda-forge/noarch::typing_inspect==0.7.1=pyh6c4a22f_0\n", + " - defaults/linux-64::libstdcxx-ng==9.3.0=hd4cf53a_17\n", + " - defaults/linux-64::libgcc-ng==9.3.0=h5101ec6_17\n", + " - conda-forge/linux-64::pandas==1.2.3=py37hdc94413_0\n", + " - conda-forge/noarch::joblib==1.1.0=pyhd8ed1ab_0\n", + " - conda-forge/linux-64::libblas==3.9.0=1_h86c2bf4_netlib\n", + " - conda-forge/noarch::sphinxcontrib-applehelp==1.0.2=py_0\n", + " - conda-forge/linux-64::conda==4.11.0=py37h89c1867_0\n", + " - conda-forge/linux-64::scipy==1.5.3=py37h14a347d_0\n", + " - conda-forge/noarch::importlib_metadata==4.10.1=hd8ed1ab_0\n", + " - conda-forge/noarch::sphinx==4.4.0=pyh6c4a22f_1\n", + " - conda-forge/linux-64::mypy_extensions==0.4.3=py37h89c1867_4\n", + " - conda-forge/linux-64::protobuf==3.15.8=py37hcd2ae1e_0\n", + " - conda-forge/linux-64::boost==1.74.0=py37h6dcda5c_3\n", + " - conda-forge/noarch::sphinxcontrib-serializinghtml==1.1.5=pyhd8ed1ab_1\n", + " - defaults/linux-64::libffi==3.3=he6710b0_2\n", + " - conda-forge/noarch::olefile==0.46=pyh9f0ad1d_1\n", + " - 
conda-forge/linux-64::matplotlib-base==3.3.4=py37h0c9df89_0\n", + " - defaults/linux-64::jpeg==9d=h7f8727e_0\n", + " - defaults/linux-64::pytorch==1.8.1=cpu_py37h60491be_0\n", + " - conda-forge/noarch::pytz==2021.3=pyhd8ed1ab_0\n", + " - conda-forge/linux-64::openssl==1.1.1k=h7f98852_0\n", + " - conda-forge/linux-64::libxml2==2.9.10=h68273f3_2\n", + " - conda-forge/linux-64::numpy==1.20.3=py37h038b26d_1\n", + " - conda-forge/linux-64::ninja==1.10.2=h4bd325d_0\n", + " - conda-forge/noarch::typing-extensions==4.0.1=hd8ed1ab_0\n", + " - conda-forge/linux-64::libopenblas==0.3.17=pthreads_h8fe5266_1\n", + " - defaults/noarch::idna==2.10=pyhd3eb1b0_0\n", + " - conda-forge/linux-64::libprotobuf==3.15.8=h780b84a_0\n", + " - defaults/linux-64::ruamel_yaml==0.15.100=py37h27cfd23_0\n", + " - defaults/linux-64::glib==2.69.1=h4ff587b_1\n", + " - conda-forge/noarch::cycler==0.11.0=pyhd8ed1ab_0\n", + " - conda-forge/noarch::tensorboardx==2.4=pyhd8ed1ab_0\n", + " - conda-forge/linux-64::markupsafe==2.0.1=py37h5e8e339_0\n", + " - conda-forge/noarch::zipp==3.7.0=pyhd8ed1ab_1\n", + " - defaults/linux-64::zlib==1.2.11=h7b6447c_3\n", + " - conda-forge/linux-64::icu==67.1=he1b5a44_0\n", + " - conda-forge/noarch::sphinxcontrib-devhelp==1.0.2=py_0\n", + " - conda-forge/linux-64::pillow==6.2.1=py37h6b7be26_0\n", + " - defaults/linux-64::cffi==1.14.6=py37h400218f_0\n", + " - defaults/linux-64::sqlite==3.36.0=hc218d9a_0\n", + " - conda-forge/linux-64::libtiff==4.0.10=hc3755c2_1005\n", + " - defaults/linux-64::conda-package-handling==1.7.3=py37h27cfd23_1\n", + " - defaults/linux-64::setuptools==52.0.0=py37h06a4308_0\n", + " - conda-forge/linux-64::future==0.18.2=py37h89c1867_4\n", + " - conda-forge/noarch::imagesize==1.3.0=pyhd8ed1ab_0\n", + " - conda-forge/noarch::typing_extensions==4.0.1=pyha770c72_0\n", + "\b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bdone\n", + "\n", + "## Package Plan ##\n", + 
"\n", + " environment location: /usr/local\n", + "\n", + " added / updated specs:\n", + " - _libgcc_mutex==0.1=main\n", + " - ca-certificates==2020.1.1=0\n", + " - certifi==2020.4.5.1=py37_0\n", + " - cffi==1.14.0=py37he30daa8_1\n", + " - chardet==3.0.4=py37_1003\n", + " - conda-package-handling==1.6.1=py37h7b6447c_0\n", + " - conda==4.8.3=py37_0\n", + " - cryptography==2.9.2=py37h1ba5d50_0\n", + " - idna==2.9=py_1\n", + " - ld_impl_linux-64==2.33.1=h53a641e_7\n", + " - libedit==3.1.20181209=hc058e9b_0\n", + " - libffi==3.3=he6710b0_1\n", + " - libgcc-ng==9.1.0=hdf63c60_0\n", + " - libstdcxx-ng==9.1.0=hdf63c60_0\n", + " - ncurses==6.2=he6710b0_1\n", + " - openssl==1.1.1g=h7b6447c_0\n", + " - pip==20.0.2=py37_3\n", + " - pycosat==0.6.3=py37h7b6447c_0\n", + " - pycparser==2.20=py_0\n", + " - pyopenssl==19.1.0=py37_0\n", + " - pysocks==1.7.1=py37_0\n", + " - python==3.7.7=hcff3b4d_5\n", + " - readline==8.0=h7b6447c_0\n", + " - requests==2.23.0=py37_0\n", + " - ruamel_yaml==0.15.87=py37h7b6447c_0\n", + " - setuptools==46.4.0=py37_0\n", + " - six==1.14.0=py37_0\n", + " - sqlite==3.31.1=h62c20be_1\n", + " - tk==8.6.8=hbc83047_0\n", + " - tqdm==4.46.0=py_0\n", + " - urllib3==1.25.8=py37_0\n", + " - wheel==0.34.2=py37_0\n", + " - xz==5.2.5=h7b6447c_0\n", + " - yaml==0.1.7=had09818_2\n", + " - zlib==1.2.11=h7b6447c_3\n", + "\n", + "\n", + "The following packages will be UPDATED:\n", + "\n", + " rdkit 2020.09.2-py37h713bca6_0 --> 2020.09.5-py37he53b9e1_0\n", + "\n", + "The following packages will be SUPERSEDED by a higher-priority channel:\n", + "\n", + " ca-certificates conda-forge::ca-certificates-2021.10.~ --> pkgs/main::ca-certificates-2020.1.1-0\n", + "\n", + "\n", + "Preparing transaction: / \b\b- \b\bdone\n", + "Executing transaction: | \b\b/ \b\b- \b\bdone\n", + "installation finished.\n", + "WARNING:\n", + " You currently have a PYTHONPATH environment variable set. 
This may cause\n", + " unexpected behavior when running the Python interpreter in Miniconda3.\n", + " For best results, please verify that your PYTHONPATH only points to\n", + " directories of packages that are compatible with the Python interpreter\n", + " in Miniconda3: /usr/local\n", + "Collecting package metadata (current_repodata.json): ...working... done\n", + "Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.\n", + "Collecting package metadata (repodata.json): ...working... done\n", + "Solving environment: ...working... done\n", + "\n", + "## Package Plan ##\n", + "\n", + " environment location: /usr/local\n", + "\n", + " added / updated specs:\n", + " - rdkit==2020.09.2\n", + "\n", + "\n", + "The following packages will be UPDATED:\n", + "\n", + " ca-certificates pkgs/main::ca-certificates-2020.1.1-0 --> conda-forge::ca-certificates-2021.10.8-ha878542_0\n", + "\n", + "The following packages will be DOWNGRADED:\n", + "\n", + " rdkit 2020.09.5-py37he53b9e1_0 --> 2020.09.2-py37h713bca6_0\n", + "\n", + "\n", + "Preparing transaction: ...working... done\n", + "Verifying transaction: ...working... done\n", + "Executing transaction: ...working... done\n", + "Collecting package metadata (current_repodata.json): ...working... done\n", + "Solving environment: ...working... done\n", + "\n", + "# All requested packages already installed.\n", + "\n", + "fatal: destination path 'uvvisml' already exists and is not an empty directory.\n", + "--2022-02-05 05:24:58-- https://zenodo.org/record/5573027/files/models.tar.gz\n", + "Resolving zenodo.org (zenodo.org)... 137.138.76.77\n", + "Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 651010218 (621M) [application/octet-stream]\n", + "Saving to: ‘models.tar.gz’\n", + "\n", + "models.tar.gz 100%[===================>] 620.85M 7.17MB/s in 1m 55s \n", + "\n", + "2022-02-05 05:26:55 (5.39 MB/s) - ‘models.tar.gz’ saved [651010218/651010218]\n", + "\n" + ] + } + ], + "source": [ + "!wget -c https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh\n", + "!chmod +x Miniconda3-py37_4.8.3-Linux-x86_64.sh\n", + "!bash ./Miniconda3-py37_4.8.3-Linux-x86_64.sh -b -f -p /usr/local\n", + "!conda install -q -y -c conda-forge rdkit==2020.09.2\n", + "!conda install -q -y -c conda-forge chemprop\n", + "!git clone https://github.com/learningmatter-mit/uvvisml\n", + "\n", + "import os\n", + "import sys\n", + "import pandas as pd\n", + "sys.path.append('/usr/local/lib/python3.7/site-packages/')\n", + "import chemprop\n", + "\n", + "os.chdir('uvvisml/uvvisml')\n", + "\n", + "!bash get_model_files.sh" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Data" + ], + "metadata": { + "id": "MQ67V6hr_6Yz" + } + }, + { + "cell_type": "code", + "source": [ + "test_file = 'data/splits/lambda_max_abs/deep4chem/group_by_smiles/smiles_target_test.csv'\n", + "df = pd.read_csv(test_file)\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "pqtogu2T7OTd", + "outputId": "e85776ac-c71d-45cd-def3-61f1cc5d4cb5" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
smilessolventpeakwavs_max
0CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1C1CCCCC1376.0
1CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCOC(C)=O392.0
2CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CC#N396.0
3CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCO400.0
4CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1OCC(O)CO413.0
............
1705c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34C[N+](=O)[O-]424.0
1706c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34CS(C)=O432.0
1707COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...ClCCl367.0
1708N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc...C1CCOC1358.0
1709N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2C1CCOC1382.0
\n", + "

1710 rows × 3 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " smiles ... peakwavs_max\n", + "0 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 376.0\n", + "1 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 392.0\n", + "2 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 396.0\n", + "3 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 400.0\n", + "4 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 413.0\n", + "... ... ... ...\n", + "1705 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 424.0\n", + "1706 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 432.0\n", + "1707 COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5... ... 367.0\n", + "1708 N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc... ... 358.0\n", + "1709 N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2 ... 382.0\n", + "\n", + "[1710 rows x 3 columns]" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Make Predictions" + ], + "metadata": { + "id": "WZktZ7DsAAMn" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Predict experimental peak with model trained on combined training set" + ], + "metadata": { + "id": "9qWycAVkKlS0" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Equivalent to command line:**\n", + "\n", + "python uvvisml/predict.py --test_file uvvisml/data/splits/lambda_max_abs/deep4chem/group_by_smiles/smiles_target_test.csv --property absorption_peak_nm_expt --method chemprop --preds_file test_preds.csv" + ], + "metadata": { + "id": "-g-bD5g6JCCy" + } + }, + { + "cell_type": "code", + "source": [ + "arguments = [\n", + " '--test_path', test_file,\n", + " '--preds_path', '/dev/null',\n", + " '--checkpoint_dir', 'models/lambda_max_abs/chemprop/combined/production/fold_0',\n", + " '--number_of_molecules', '2',\n", + " #'--gpu', '0'\n", + "]\n", + "\n", + "args = chemprop.args.PredictArgs().parse_args(arguments)\n", + "preds = chemprop.train.make_predictions(args=args)\n", + "\n", + "preds = [x[0] for x in preds]\n", + "df['peakwavs_max_pred'] = preds\n", + "df" + ], + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "5a1-7UXcJCss", + "outputId": "eaedecae-627b-4006-e481-7eefc5df90b5" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading training args\n", + "Setting molecule featurization parameters to default.\n", + "Loading data\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1710it [00:00, 126450.28it/s]\n", + "100%|██████████| 1710/1710 [00:00<00:00, 221927.71it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Validating SMILES\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:481: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " cpuset_checked))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test size = 1,710\n", + "Predicting with an ensemble of 5 models\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\r 0%| | 0/5 [00:00\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
smilessolventpeakwavs_maxpeakwavs_max_pred
0CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1C1CCCCC1376.0378.089791
1CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCOC(C)=O392.0388.387075
2CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CC#N396.0394.557472
3CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCO400.0400.817724
4CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1OCC(O)CO413.0410.902339
...............
1705c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34C[N+](=O)[O-]424.0425.084342
1706c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34CS(C)=O432.0428.209718
1707COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...ClCCl367.0370.200150
1708N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc...C1CCOC1358.0355.987562
1709N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2C1CCOC1382.0368.792145
\n", + "

1710 rows × 4 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " " + ], + "text/plain": [ + " smiles ... peakwavs_max_pred\n", + "0 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 378.089791\n", + "1 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 388.387075\n", + "2 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 394.557472\n", + "3 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 400.817724\n", + "4 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 410.902339\n", + "... ... ... ...\n", + "1705 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 425.084342\n", + "1706 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 428.209718\n", + "1707 COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5... ... 370.200150\n", + "1708 N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc... ... 355.987562\n", + "1709 N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2 ... 368.792145\n", + "\n", + "[1710 rows x 4 columns]" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Predict TDDFT peak in vacuum" + ], + "metadata": { + "id": "GE9Tof7UK8cI" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Equivalent to command line:**\n", + "\n", + "python uvvisml/predict.py --test_file uvvisml/data/splits/lambda_max_abs/deep4chem/group_by_smiles/smiles_target_test.csv --property vertical_excitation_eV_tddft --method chemprop --preds_file test_preds.csv" + ], + "metadata": { + "id": "zypfNmVzJKEc" + } + }, + { + "cell_type": "code", + "source": [ + "arguments = [\n", + " '--test_path', test_file,\n", + " '--preds_path', '/dev/null',\n", + " '--checkpoint_dir', 'models/lambda_max_abs_wb97xd3/chemprop/all_wb97xd3/production/fold_0',\n", + " '--number_of_molecules', '1',\n", + " #'--gpu', '0'\n", + "]\n", + "\n", + "args = chemprop.args.PredictArgs().parse_args(arguments)\n", + "preds = chemprop.train.make_predictions(args=args)\n", + "\n", + "preds = [x[0] for x in preds] # predictions are in eV\n", + "df['peakwavs_max_pred'] = preds\n", + "df['peakwavs_max_pred'] = 1240/df['peakwavs_max_pred'] # convert from eV to nm\n", + "df" + ], + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "kswx6y_uJHqW", + "outputId": "bd9ea495-c3ef-47bb-a138-508aeb281a5e" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading training args\n", + "Setting molecule featurization parameters to default.\n", + "Loading data\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1710it [00:00, 80922.70it/s]\n", + "100%|██████████| 1710/1710 [00:00<00:00, 161116.45it/s]\n", + "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:481: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " cpuset_checked))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Validating SMILES\n", + "Test size = 1,710\n", + "Predicting with an ensemble of 5 models\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\r 0%| | 0/5 [00:00\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
smilessolventpeakwavs_maxpeakwavs_max_pred
0CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1C1CCCCC1376.0309.772465
1CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCOC(C)=O392.0309.772465
2CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CC#N396.0309.772465
3CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCO400.0309.772465
4CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1OCC(O)CO413.0309.772465
...............
1705c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34C[N+](=O)[O-]424.0346.755560
1706c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34CS(C)=O432.0346.755560
1707COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...ClCCl367.0318.639569
1708N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc...C1CCOC1358.0310.870571
1709N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2C1CCOC1382.0317.668449
\n", + "

1710 rows × 4 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " " + ], + "text/plain": [ + " smiles ... peakwavs_max_pred\n", + "0 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 309.772465\n", + "1 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 309.772465\n", + "2 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 309.772465\n", + "3 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 309.772465\n", + "4 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 309.772465\n", + "... ... ... ...\n", + "1705 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 346.755560\n", + "1706 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 346.755560\n", + "1707 COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5... ... 318.639569\n", + "1708 N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc... ... 310.870571\n", + "1709 N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2 ... 317.668449\n", + "\n", + "[1710 rows x 4 columns]" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Predict experimental peak with model trained on Deep4Chem training set" + ], + "metadata": { + "id": "54x-eGwxLEZ1" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Equivalent to command line:**\n", + "\n", + "python uvvisml/predict.py --test_file uvvisml/data/splits/lambda_max_abs/deep4chem/group_by_smiles/smiles_target_test.csv --property absorption_peak_nm_expt --method chemprop --preds_file test_preds.csv --train_dataset deep4chem" + ], + "metadata": { + "id": "_68WpdSDI110" + } + }, + { + "cell_type": "code", + "source": [ + "arguments = [\n", + " '--test_path', test_file,\n", + " '--preds_path', '/dev/null',\n", + " '--checkpoint_dir', 'models/lambda_max_abs/chemprop/deep4chem/production/fold_0',\n", + " '--number_of_molecules', '2',\n", + " #'--gpu', '0'\n", + "]\n", + "\n", + "args = chemprop.args.PredictArgs().parse_args(arguments)\n", + "preds = chemprop.train.make_predictions(args=args)\n", + "\n", + "preds = [x[0] for x in preds]\n", + "df['peakwavs_max_pred'] = preds\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/", + "height": 1000 + }, + "id": "ICmfelGD7rcf", + "outputId": "ebc7e05e-b415-40bb-b4d4-5f5fc4284a86" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading training args\n", + "Setting molecule featurization parameters to default.\n", + "Loading data\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1710it [00:00, 141559.62it/s]\n", + "100%|██████████| 1710/1710 [00:00<00:00, 153911.16it/s]\n", + "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:481: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " cpuset_checked))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Validating SMILES\n", + "Test size = 1,710\n", + "Predicting with an ensemble of 5 models\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\r 0%| | 0/5 [00:00\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
smilessolventpeakwavs_maxpeakwavs_max_pred
0CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1C1CCCCC1376.0382.903437
1CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCOC(C)=O392.0395.478472
2CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CC#N396.0400.821401
3CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCO400.0418.106349
4CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1OCC(O)CO413.0426.706045
...............
1705c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34C[N+](=O)[O-]424.0474.872657
1706c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34CS(C)=O432.0466.555822
1707COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...ClCCl367.0365.403474
1708N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc...C1CCOC1358.0349.835862
1709N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2C1CCOC1382.0352.427928
\n", + "

1710 rows × 4 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " " + ], + "text/plain": [ + " smiles ... peakwavs_max_pred\n", + "0 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 382.903437\n", + "1 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 395.478472\n", + "2 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 400.821401\n", + "3 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 418.106349\n", + "4 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 426.706045\n", + "... ... ... ...\n", + "1705 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 474.872657\n", + "1706 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 466.555822\n", + "1707 COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5... ... 365.403474\n", + "1708 N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc... ... 349.835862\n", + "1709 N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2 ... 352.427928\n", + "\n", + "[1710 rows x 4 columns]" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Predict experimental peak with multi-fidelity model" + ], + "metadata": { + "id": "ha40pwkQKZ2F" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Equivalent to command line:**\n", + "\n", + "python uvvisml/predict.py --test_file uvvisml/data/splits/lambda_max_abs/deep4chem/group_by_smiles/smiles_target_test.csv --property absorption_peak_nm_expt --method chemprop_tddft --preds_file test_preds.csv" + ], + "metadata": { + "id": "8IpT4M5CJjln" + } + }, + { + "cell_type": "code", + "source": [ + "# TDDFT Predictions\n", + "arguments = [\n", + " '--test_path', test_file,\n", + " '--preds_path', 'test_tddft_preds.csv',\n", + " '--checkpoint_dir', 'models/lambda_max_abs_wb97xd3/chemprop/all_wb97xd3/production/fold_0',\n", + " '--number_of_molecules', '1',\n", + " #'--gpu', '0'\n", + "]\n", + "\n", + "args = chemprop.args.PredictArgs().parse_args(arguments)\n", + "_ = chemprop.train.make_predictions(args=args)\n", + "\n", + "# Convert Predictions to Features File\n", + "!python models/tddft_to_features_file.py\n", + "\n", + "# Experimental Predictions\n", + 
"arguments = [\n", + " '--test_path', test_file,\n", + " '--preds_path', '/dev/null',\n", + " '--checkpoint_dir', 'models/lambda_max_abs/chemprop_tddft/combined/production/fold_0',\n", + " '--number_of_molecules', '2',\n", + " '--features_path', 'features_test.csv'\n", + " #'--gpu', '0'\n", + "]\n", + "\n", + "args = chemprop.args.PredictArgs().parse_args(arguments)\n", + "preds = chemprop.train.make_predictions(args=args)\n", + "\n", + "preds = [x[0] for x in preds]\n", + "df['peakwavs_max_pred'] = preds\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "6OuRAYrKJjaa", + "outputId": "31d1f5a5-4355-4bbb-d3ea-d0be94169167" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading training args\n", + "Setting molecule featurization parameters to default.\n", + "Loading data\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1710it [00:00, 134827.05it/s]\n", + "100%|██████████| 1710/1710 [00:00<00:00, 136684.77it/s]\n", + "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:481: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " cpuset_checked))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Validating SMILES\n", + "Test size = 1,710\n", + "Predicting with an ensemble of 5 models\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\r 0%| | 0/5 [00:00\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
smilessolventpeakwavs_maxpeakwavs_max_pred
0CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1C1CCCCC1376.0375.545257
1CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCOC(C)=O392.0390.993980
2CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CC#N396.0397.488817
3CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCO400.0400.081324
4CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1OCC(O)CO413.0412.967337
...............
1705c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34C[N+](=O)[O-]424.0424.124035
1706c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34CS(C)=O432.0428.538180
1707COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...ClCCl367.0355.781207
1708N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc...C1CCOC1358.0358.098561
1709N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2C1CCOC1382.0380.867901
\n", + "

1710 rows × 4 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " " + ], + "text/plain": [ + " smiles ... peakwavs_max_pred\n", + "0 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 375.545257\n", + "1 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 390.993980\n", + "2 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 397.488817\n", + "3 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 400.081324\n", + "4 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 412.967337\n", + "... ... ... ...\n", + "1705 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 424.124035\n", + "1706 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 428.538180\n", + "1707 COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5... ... 355.781207\n", + "1708 N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc... ... 358.098561\n", + "1709 N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2 ... 380.867901\n", + "\n", + "[1710 rows x 4 columns]" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Predict experimental peak with model trained on combined training set (with ensemble variance)" + ], + "metadata": { + "id": "IHGJvB47P8lI" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Equivalent to command line:**\n", + "\n", + "python uvvisml/predict.py --test_file uvvisml/data/splits/lambda_max_abs/deep4chem/group_by_smiles/smiles_target_test.csv --property absorption_peak_nm_expt --method chemprop --preds_file test_preds.csv" + ], + "metadata": { + "id": "FBPuuTzeQCfW" + } + }, + { + "cell_type": "code", + "source": [ + "arguments = [\n", + " '--test_path', test_file,\n", + " '--preds_path', 'test_preds.csv', # read back with pandas at the end of this cell\n", + " '--checkpoint_dir', 'models/lambda_max_abs/chemprop/combined/production/fold_0',\n", + " '--number_of_molecules', '2',\n", + " '--ensemble_variance', # NOTE(review): presumably the source of the peakwavs_max_epi_unc column in the output - confirm\n", + " #'--gpu', '0'\n", + "]\n", + "\n", + "args = chemprop.args.PredictArgs().parse_args(arguments)\n", + "_ = chemprop.train.make_predictions(args=args) # return value unused; results come from the CSV\n", + "\n", + "df = pd.read_csv('test_preds.csv') # NOTE(review): peakwavs_max here appears to hold predictions, not measured values - confirm\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", 
+ "height": 1000 + }, + "id": "t--C7XlAP8bD", + "outputId": "525fb972-f0e0-4ebb-969b-53cb2706a24f" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading training args\n", + "Setting molecule featurization parameters to default.\n", + "Loading data\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1710it [00:00, 129826.41it/s]\n", + "100%|██████████| 1710/1710 [00:00<00:00, 108043.62it/s]\n", + "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:481: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " cpuset_checked))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Validating SMILES\n", + "Test size = 1,710\n", + "Predicting with an ensemble of 5 models\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\r 0%| | 0/5 [00:00\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
smilessolventpeakwavs_maxpeakwavs_max_epi_unc
0CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1C1CCCCC1378.0897915.984684
1CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCOC(C)=O388.3870753.289096
2CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CC#N394.5574726.647632
3CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1CCO400.8177247.351677
4CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1OCC(O)CO410.9023395.109207
...............
1705c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34C[N+](=O)[O-]425.0843428.989382
1706c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34CS(C)=O428.2097189.853369
1707COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...ClCCl370.200150128.172913
1708N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc...C1CCOC1355.9875622.114018
1709N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2C1CCOC1368.79214511.947813
\n", + "

1710 rows × 4 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " " + ], + "text/plain": [ + " smiles ... peakwavs_max_epi_unc\n", + "0 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 5.984684\n", + "1 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 3.289096\n", + "2 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 6.647632\n", + "3 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 7.351677\n", + "4 CCN(CC)c1ccc2c(C(F)(F)F)cc(=O)oc2c1 ... 5.109207\n", + "... ... ... ...\n", + "1705 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 8.989382\n", + "1706 c1cc2c3ccc[n+]4cccc(c5ccc[n+](c1)c25)c34 ... 9.853369\n", + "1707 COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5... ... 128.172913\n", + "1708 N#Cc1c(N2CCCCC2)cc(-c2cccc3ccccc23)c2c1-c1cccc... ... 2.114018\n", + "1709 N#Cc1c(N2CCCC2)cc(-c2ccccc2)c2c1Cc1ccccc1-2 ... 11.947813\n", + "\n", + "[1710 rows x 4 columns]" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "VKVIvJcXR5S5" + }, + "execution_count": 8, + "outputs": [] + } + ] +}