From 2ec8d8cf8266620e43ce936ee179006aa2f41d8d Mon Sep 17 00:00:00 2001 From: ribesstefano Date: Fri, 7 Jun 2024 18:13:23 +0200 Subject: [PATCH] Updated README and Conda environment configuration file --- .gitignore | 1 + README.md | 43 ++++-- environment.yml | 399 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 427 insertions(+), 16 deletions(-) create mode 100644 environment.yml diff --git a/.gitignore b/.gitignore index 3c35710..7410c1a 100644 --- a/.gitignore +++ b/.gitignore @@ -165,5 +165,6 @@ cython_debug/ data/uniprot2embedding.h5 data/PROTAC-DB.csv data/PROTAC-Pedia.csv +data/cellosaurus.txt logs/ notebooks/per-protein* \ No newline at end of file diff --git a/README.md b/README.md index b903fe9..3490cac 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,19 @@ - -

- Maturity level-0 -

+# PROTAC-Degradation-Predictor -

PROTAC-Degradation-Predictor

- -

- A machine learning-based tool for predicting PROTAC protein degradation activity. -

+A machine learning-based tool for predicting PROTAC protein degradation activity. ## 📚 Table of Contents - [Data Curation](#-data-curation) - [Installation](#-installation) - [Usage](#-usage) +- [Training](#-training) +- [Citation](#-citation) +- [License](#-license) ## 📝 Data Curation @@ -36,12 +33,14 @@ The package has been developed on a Linux machine with Python 3.10.8. It is reco ## 🎯 Usage +For a thorough explanation of how to use the package, please refer to the tutorial notebook [`protac_degradation_tutorial.ipynb`](notebooks/protac_degradation_tutorial.ipynb). + After installing the package, you can use it as follows: ```python import protac_degradation_predictor as pdp -protac_smiles = 'CC(C)(C)OC(=O)N1CCN(CC1)C2=CC(=C(C=C2)C(=O)NC3=CC(=C(C=C3)F)Cl)C(=O)NC4=CC=C(C=C4)F' +protac_smiles = 'Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)COCCCCCCCCCOCC(=O)Nc2ccc(C(=O)Nc3ccc(F)cc3N)cc2)C(C)(C)C)cc1' e3_ligase = 'VHL' target_uniprot = 'P04637' cell_line = 'HeLa' @@ -51,8 +50,6 @@ active_protac = pdp.is_protac_active( e3_ligase, target_uniprot, cell_line, - device='cuda', # Default to 'cpu' - proba_threshold=0.5, # Default value ) print(f'The given PROTAC is: {"active" if active_protac else "inactive"}') @@ -62,12 +59,26 @@ This example demonstrates how to predict the activity of a PROTAC molecule. The The function supports batch computation by passing lists of SMILES strings, E3 ligases, UniProt IDs, and cell lines. In this case, it returns a list of booleans indicating the activity of each PROTAC. +## 📈 Training +Before running the experiments, here are some required steps to follow (assuming one is in the repository directory already): +1. Download the data from the [Cellosaurus database](https://web.expasy.org/cellosaurus/) and save it in the `data` directory: +```bash +wget -P data/ https://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt +``` +2. 
Make a copy of the UniProt embeddings to be placed in the `data` directory: +```bash +cp protac_degradation_predictor/data/uniprot2embedding.h5 data/ +``` +3. Create a virtual environment and install the required packages by running the following commands: +```bash +conda env create -f environment.yml +conda activate env-protac-degradation-predictor +``` +4. The code for training the model can be found in the file [`run_experiments.py`](src/run_experiments.py). -## 📈 Training - -The code for training the model can be found in the file [`run_experiments.py`](src/run_experiments.py). +(Don't forget to adjust the `PYTHONPATH` environment variable to include the repository directory: `export PYTHONPATH=$PYTHONPATH:/path/to/PROTAC-Degradation-Predictor`) ## 📄 Citation diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..c99860f --- /dev/null +++ b/environment.yml @@ -0,0 +1,399 @@ +name: env-protac-degradation-predictor +channels: + - pyg + - anaconda + - pytorch + - huggingface + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - _py-xgboost-mutex=2.0=cpu_0 + - abseil-cpp=20211102.0=h27087fc_1 + - absl-py=1.4.0=pyhd8ed1ab_0 + - accelerate=0.21.0=pyhd8ed1ab_0 + - aiohttp=3.8.3=py310h5eee18b_0 + - aiosignal=1.2.0=pyhd8ed1ab_0 + - alembic=1.12.0=pyhd8ed1ab_0 + - anyio=3.7.1=pyhd8ed1ab_0 + - arrow=1.2.3=pyhd8ed1ab_0 + - arrow-cpp=11.0.0=py310h7516544_0 + - async-timeout=4.0.2=pyhd8ed1ab_0 + - aws-c-common=0.4.57=he1b5a44_1 + - aws-c-event-stream=0.1.6=h72b8ae1_3 + - aws-checksums=0.1.9=h346380f_0 + - aws-sdk-cpp=1.8.185=hce553d0_0 + - backports=1.0=pyhd8ed1ab_3 + - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0 + - beautifulsoup4=4.12.2=pyha770c72_0 + - binaryornot=0.4.4=py_1 + - blas=1.0=mkl + - blessed=1.19.1=pyhe4f9e05_2 + - blinker=1.6.2=pyhd8ed1ab_0 + - boltons=23.0.0=pyhd8ed1ab_0 + - boost-cpp=1.73.0=h7f8727e_12 + - bottleneck=1.3.5=py310ha9d4c09_0 + - 
brotlipy=0.7.0=py310h5764c6d_1004 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.19.0=h5eee18b_0 + - ca-certificates=2023.08.22=h06a4308_0 + - cachecontrol=0.12.14=pyhd8ed1ab_0 + - certifi=2023.7.22=py310h06a4308_0 + - cffi=1.15.1=py310h5eee18b_3 + - chardet=5.1.0=py310hff52083_0 + - charset-normalizer=2.0.4=pyhd8ed1ab_0 + - cleo=2.0.1=pyhd8ed1ab_0 + - click=8.0.4=py310hff52083_0 + - cmaes=0.10.0=pyhd8ed1ab_0 + - colorama=0.4.6=pyhd8ed1ab_0 + - colorlog=6.7.0=py310hff52083_1 + - conda=23.7.4=py310hff52083_0 + - conda-content-trust=0.1.3=pyhd8ed1ab_0 + - conda-package-handling=1.9.0=py310h5eee18b_1 + - cookiecutter=2.3.0=pyh1a96a4e_0 + - crashtest=0.4.1=pyhd8ed1ab_0 + - croniter=1.3.15=pyhd8ed1ab_0 + - cryptography=38.0.1=py310h9ce1e76_0 + - cuda-cudart=11.8.89=0 + - cuda-cupti=11.8.87=0 + - cuda-libraries=11.8.0=0 + - cuda-nvrtc=11.8.89=0 + - cuda-nvtx=11.8.86=0 + - cuda-runtime=11.8.0=0 + - dataclasses=0.8=pyhc8e2a94_3 + - datasets=2.14.4=py_0 + - dateutils=0.6.12=py_0 + - dbus=1.13.0=h4e0c4b3_1000 + - deepdiff=6.2.2=pyhd8ed1ab_0 + - dill=0.3.6=pyhd8ed1ab_1 + - distlib=0.3.7=pyhd8ed1ab_0 + - dulwich=0.21.3=py310h5eee18b_0 + - evaluate=0.2.2=pyhd8ed1ab_0 + - exceptiongroup=1.1.3=pyhd8ed1ab_0 + - expat=2.4.8=h27087fc_0 + - fastapi=0.103.1=pyhd8ed1ab_0 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.9.0=pyhd8ed1ab_0 + - freetype=2.12.1=h4a9f257_0 + - frozenlist=1.3.3=py310h5eee18b_0 + - fsspec=2023.4.0=pyh1a96a4e_0 + - gflags=2.2.2=he1b5a44_1004 + - giflib=5.2.1=h36c2ea0_2 + - glog=0.5.0=h48cff8f_0 + - gmp=6.2.1=h58526e2_0 + - gmpy2=2.1.2=py310heeb90bb_0 + - gnutls=3.6.15=he1e5248_0 + - google-auth-oauthlib=0.4.1=py_2 + - greenlet=1.1.2=py310hd8f1fbe_2 + - grpc-cpp=1.46.1=h33aed49_1 + - grpcio=1.42.0=py310hce63b2e_0 + - h11=0.14.0=pyhd8ed1ab_0 + - html5lib=1.1=pyh9f0ad1d_0 + - huggingface_hub=0.16.4=py_0 + - icu=58.2=hf484d3e_1000 + - idna=3.4=pyhd8ed1ab_0 + - importlib-metadata=6.8.0=pyha770c72_0 + - importlib_metadata=6.8.0=hd8ed1ab_0 + - importlib_resources=6.0.1=pyhd8ed1ab_0 + 
- inquirer=3.1.3=pyhd8ed1ab_0 + - intel-openmp=2023.1.0=hdb19cb5_46305 + - itsdangerous=2.1.2=pyhd8ed1ab_0 + - jaraco.classes=3.3.0=pyhd8ed1ab_0 + - jeepney=0.8.0=pyhd8ed1ab_0 + - jinja2=3.1.2=pyhd8ed1ab_1 + - joblib=1.2.0=pyhd8ed1ab_0 + - jpeg=9e=h166bdaf_1 + - jsonpatch=1.32=pyhd8ed1ab_0 + - jsonpointer=2.1=pyhd3eb1b0_0 + - keyring=23.13.1=py310hff52083_0 + - krb5=1.20.1=h568e23c_1 + - lame=3.100=h7f98852_1001 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h9c3ff4c_0 + - libboost=1.73.0=h28710b8_12 + - libbrotlicommon=1.0.9=h166bdaf_7 + - libbrotlidec=1.0.9=h166bdaf_7 + - libbrotlienc=1.0.9=h166bdaf_7 + - libcublas=11.11.3.6=0 + - libcufft=10.9.0.58=0 + - libcufile=1.7.1.12=0 + - libcurand=10.3.3.129=0 + - libcurl=8.1.1=h91b91d3_2 + - libcusolver=11.4.1.48=0 + - libcusparse=11.7.5.86=0 + - libdeflate=1.8=h7f98852_0 + - libedit=3.1.20221030=h5eee18b_0 + - libev=4.33=h516909a_1 + - libevent=2.1.12=h8f2d780_0 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=11.2.0=h00389a5_1 + - libgfortran5=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h516909a_0 + - libidn2=2.3.4=h5eee18b_0 + - libnghttp2=1.52.0=ha637b67_1 + - libnpp=11.8.0.86=0 + - libnvjpeg=11.9.0.86=0 + - libpng=1.6.39=h5eee18b_0 + - libprotobuf=3.20.3=he621ea3_0 + - libssh2=1.10.0=ha56f1ee_2 + - libstdcxx-ng=11.2.0=he4da1e4_16 + - libtasn1=4.19.0=h5eee18b_0 + - libthrift=0.15.0=h0d84882_2 + - libtiff=4.5.0=hecacb30_0 + - libunistring=0.9.10=h7f98852_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp=1.2.4=h11a3e52_1 + - libwebp-base=1.2.4=h5eee18b_1 + - libxgboost=1.5.1=cpu_h3d145d1_2 + - lightning=2.0.4=pyhd8ed1ab_0 + - lightning-cloud=0.5.38=pyhd8ed1ab_0 + - lightning-utilities=0.9.0=pyhd8ed1ab_0 + - lockfile=0.12.2=py_1 + - lz4-c=1.9.4=h6a678d5_0 + - mako=1.2.4=pyhd8ed1ab_0 + - markdown=3.4.4=pyhd8ed1ab_0 + - markdown-it-py=3.0.0=pyhd8ed1ab_0 + - markupsafe=2.1.1=py310h5764c6d_1 + - mdurl=0.1.0=pyhd8ed1ab_0 + - 
mkl=2023.1.0=h213fc3f_46343 + - mkl-service=2.4.0=py310h5eee18b_1 + - mkl_fft=1.3.6=py310h1128e8f_1 + - mkl_random=1.2.2=py310h1128e8f_1 + - more-itertools=10.1.0=pyhd8ed1ab_0 + - mpc=1.1.0=h04dde30_1009 + - mpfr=4.0.2=he80fd80_1 + - mpmath=1.3.0=pyhd8ed1ab_0 + - msgpack-python=1.0.3=py310hbf28c38_1 + - multidict=6.0.2=py310h5764c6d_1 + - multiprocess=0.70.14=py310h06a4308_0 + - ncurses=6.3=h27087fc_1 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.1=pyhd8ed1ab_0 + - numexpr=2.8.4=py310h85018f9_1 + - numpy=1.25.2=py310h5f9d8c6_0 + - numpy-base=1.25.2=py310hb5e798b_0 + - oauthlib=3.2.2=pyhd8ed1ab_0 + - openh264=2.1.1=h780b84a_0 + - openssl=1.1.1w=h7f8727e_0 + - optuna=3.3.0=pyhd8ed1ab_0 + - orc=1.7.4=hb3bc3d3_1 + - ordered-set=4.1.0=pyhd8ed1ab_0 + - packaging=23.0=pyhd8ed1ab_0 + - pandas=1.5.3=py310h1128e8f_0 + - pexpect=4.8.0=pyh1a96a4e_2 + - pillow=9.4.0=py310h6a678d5_0 + - pip=22.3.1=pyhd8ed1ab_0 + - pkginfo=1.9.6=pyhd8ed1ab_0 + - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 + - platformdirs=2.6.2=pyhd8ed1ab_0 + - poetry=1.4.2=linux_pyhd8ed1ab_0 + - poetry-core=1.5.2=pyhd8ed1ab_0 + - poetry-plugin-export=1.3.1=pyhd8ed1ab_0 + - protobuf=3.20.3=py310h6a678d5_0 + - psutil=5.9.0=py310h5764c6d_1 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - py-xgboost=1.5.1=cpu_py310hd1aba9c_2 + - pyarrow=11.0.0=py310h468efa6_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pycosat=0.6.4=py310h5eee18b_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pydantic=1.9.1=py310h5764c6d_0 + - pyg=2.3.1=py310_torch_2.0.0_cu118 + - pygments=2.16.1=pyhd8ed1ab_0 + - pyjwt=2.8.0=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd8ed1ab_1 + - pyparsing=3.0.9=py310h06a4308_0 + - pyproject_hooks=1.0.0=pyhd8ed1ab_0 + - pyrsistent=0.18.1=py310h5764c6d_1 + - pysocks=1.7.1=pyha2e5f31_6 + - python=3.10.8=h7a1cb2a_1 + - python-build=0.10.0=pyhd8ed1ab_1 + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-editor=1.0.4=py_0 + - python-installer=0.7.0=pyhd8ed1ab_0 + - python-multipart=0.0.6=pyhd8ed1ab_0 + - python-slugify=8.0.1=pyhd8ed1ab_0 + - 
python-xxhash=2.0.2=py310h6acc77f_1 + - python_abi=3.10=2_cp310 + - pytorch=2.0.1=py3.10_cuda11.8_cudnn8.7.0_0 + - pytorch-cuda=11.8=h7e8668a_5 + - pytorch-lightning=2.0.9=pyhd8ed1ab_0 + - pytorch-mutex=1.0=cuda + - pytz=2022.7.1=pyhd8ed1ab_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pyyaml=6.0=py310h5764c6d_4 + - rapidfuzz=2.13.7=py310h1128e8f_0 + - re2=2022.04.01=h27087fc_0 + - readchar=4.0.5=pyhd8ed1ab_0 + - readline=8.2=h5eee18b_0 + - regex=2022.7.9=py310h5eee18b_0 + - requests-oauthlib=1.3.1=pyhd8ed1ab_0 + - requests-toolbelt=0.10.1=pyhd8ed1ab_0 + - responses=0.18.0=pyhd8ed1ab_0 + - rich=13.5.1=pyhd8ed1ab_0 + - rsa=4.9=pyhd8ed1ab_0 + - ruamel.yaml=0.17.21=py310h5764c6d_1 + - ruamel.yaml.clib=0.2.6=py310h5764c6d_1 + - sacremoses=master=py_0 + - scikit-learn=1.2.0=py310h6a678d5_1 + - scipy=1.9.3=py310h5f9d8c6_2 + - secretstorage=3.3.3=py310hff52083_1 + - setuptools=65.5.0=pyhd8ed1ab_0 + - shellingham=1.5.3=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - snappy=1.1.9=hbd366e4_1 + - sniffio=1.3.0=pyhd8ed1ab_0 + - soupsieve=2.5=pyhd8ed1ab_1 + - sqlalchemy=1.4.36=py310h5764c6d_0 + - sqlite=3.40.0=h5082296_0 + - starlette=0.27.0=pyhd8ed1ab_0 + - starsessions=1.3.0=pyhd8ed1ab_0 + - sympy=1.11.1=pyh04b8f61_3 + - tabulate=0.9.0=pyhd8ed1ab_1 + - tbb=2021.8.0=hdb19cb5_0 + - tensorboard=2.6.0=py_0 + - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 + - text-unidecode=1.3=py_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - tk=8.6.12=h1ccaba5_0 + - tokenizers=0.13.2=py310he7d60b5_1 + - tomli=2.0.1=pyhd8ed1ab_0 + - tomlkit=0.12.1=pyha770c72_0 + - toolz=0.12.0=pyhd8ed1ab_0 + - torchaudio=2.0.2=py310_cu118 + - torchmetrics=1.1.2=pyhd8ed1ab_0 + - torchtriton=2.0.0=py310 + - torchvision=0.15.2=py310_cu118 + - tqdm=4.64.1=pyhd8ed1ab_0 + - transformers=4.29.2=pyhd8ed1ab_0 + - trove-classifiers=2023.8.7=pyhd8ed1ab_0 + - typing-extensions=4.7.1=hd8ed1ab_0 + - typing_extensions=4.7.1=pyha770c72_0 + - tzdata=2022g=h191b570_0 + - unidecode=1.3.6=pyhd8ed1ab_0 + - urllib3=1.26.13=pyhd8ed1ab_0 + - 
utf8proc=2.6.1=h27cfd23_0 + - uvicorn=0.23.2=py310hff52083_0 + - virtualenv=20.21.1=pyhd8ed1ab_0 + - wcwidth=0.2.6=pyhd8ed1ab_0 + - webencodings=0.5.1=pyhd8ed1ab_2 + - websockets=10.3=py310h5764c6d_0 + - werkzeug=2.3.6=pyhd8ed1ab_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - xxhash=0.8.0=h7f98852_3 + - xz=5.2.8=h5eee18b_0 + - yaml=0.2.5=h7f98852_2 + - yarl=1.8.1=py310h5eee18b_0 + - zipp=3.16.2=pyhd8ed1ab_0 + - zlib=1.2.13=h5eee18b_0 + - zstd=1.5.2=ha4553b6_0 + - pip: + - appdirs==1.4.4 + - argon2-cffi==23.1.0 + - argon2-cffi-bindings==21.2.0 + - asttokens==2.4.1 + - async-lru==2.0.4 + - attrs==23.2.0 + - babel==2.14.0 + - bleach==6.1.0 + - cachetools==4.2.4 + - comm==0.2.2 + - contourpy==1.2.1 + - cycler==0.12.1 + - debugpy==1.8.1 + - decorator==5.1.1 + - defusedxml==0.7.1 + - docker-pycreds==0.4.0 + - docstring-parser==0.15 + - executing==2.0.1 + - fastjsonschema==2.19.1 + - fonttools==4.51.0 + - fqdn==1.5.1 + - gitdb==4.0.11 + - gitpython==3.1.42 + - google-auth==1.35.0 + - h5py==3.10.0 + - httpcore==1.0.5 + - httpx==0.27.0 + - imbalanced-learn==0.12.0 + - iniconfig==2.0.0 + - ipykernel==6.29.4 + - ipython==8.23.0 + - ipywidgets==8.1.2 + - isoduration==20.11.0 + - jedi==0.19.1 + - json5==0.9.25 + - jsonargparse==4.23.1 + - jsonschema==4.21.1 + - jsonschema-specifications==2023.12.1 + - jupyter==1.0.0 + - jupyter-client==8.6.1 + - jupyter-console==6.6.3 + - jupyter-core==5.7.2 + - jupyter-events==0.10.0 + - jupyter-lsp==2.2.5 + - jupyter-server==2.14.0 + - jupyter-server-terminals==0.5.3 + - jupyterlab==4.1.6 + - jupyterlab-pygments==0.3.0 + - jupyterlab-server==2.26.0 + - jupyterlab-widgets==3.0.10 + - kiwisolver==1.4.5 + - llvmlite==0.42.0 + - matplotlib==3.8.4 + - matplotlib-inline==0.1.7 + - mistune==3.0.2 + - nbclient==0.10.0 + - nbconvert==7.16.3 + - nbformat==5.10.4 + - nest-asyncio==1.6.0 + - nltk==3.8.1 + - notebook==7.1.3 + - notebook-shim==0.2.4 + - numba==0.59.1 + - overrides==7.7.0 + - pandocfilters==1.5.1 + - parso==0.8.4 + - pluggy==1.5.0 + - 
prometheus-client==0.20.0 + - prompt-toolkit==3.0.43 + - pure-eval==0.2.2 + - pynndescent==0.5.12 + - pynvml==11.5.0 + - pytest==8.1.1 + - python-json-logger==2.0.7 + - pyzmq==26.0.1 + - qtconsole==5.5.1 + - qtpy==2.4.1 + - rdkit==2023.9.5 + - rdkit-pypi==2022.9.5 + - referencing==0.34.0 + - requests==2.31.0 + - rfc3339-validator==0.1.4 + - rfc3986-validator==0.1.1 + - rouge-score==0.1.2 + - rpds-py==0.18.0 + - seaborn==0.13.2 + - send2trash==1.8.3 + - sentry-sdk==1.41.0 + - setproctitle==1.3.3 + - smmap==5.0.1 + - stack-data==0.6.3 + - tensorboard-data-server==0.6.1 + - terminado==0.18.1 + - tinycss2==1.2.1 + - tornado==6.4 + - traitlets==5.14.3 + - typeshed-client==2.3.0 + - umap-learn==0.5.6 + - uri-template==1.3.0 + - wandb==0.16.4 + - webcolors==1.13 + - websocket-client==1.7.0 + - widgetsnbextension==4.0.10