Add DiffDock w/o SCT baseline

BioinfoMachineLearning · Aug 29, 2024 · d21dab4 · d21dab4
1 parent 7926b35
commit d21dab4
Show file tree

Hide file tree

Showing 67 changed files with 25,858 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -211,7 +211,7 @@ rm dockgen_ensemble_benchmark_method_predictions.tar.gz
 rm casp15_ensemble_benchmark_method_predictions.tar.gz
 ```
 
-**NOTE:** One can reproduce the *pocket-only* experiments with the PoseBusters Benchmark set by adding the argument `pocket_only_baseline=true` to each command below used to run PoseBusters Benchmark dataset inference with all the baseline methods (n.b., besides `tulip`, which does not support pocket-level docking currently), since the pocket-only versions of the dataset's holo-aligned predicted protein structures have also been included in the downloadable Zenodo archive `posebusters_benchmark_set.tar.gz` referenced above. Similarly, one can reproduce the *NeuralPLexer w/o inter-ligand clash loss (ILCL)* experiments with the CASP15 set by adding the argument `no_ilcl=true` (`neuralplexer_no_ilcl=true`) to the commands `python3 posebench/models/neuralplexer_inference.py dataset=casp15 ...` and `python3 posebench/analysis/inference_analysis_casp.py dataset=casp15 ...` below (`python3 posebench/models/ensemble_generation.py ensemble_benchmarking_dataset=casp15 ...`) used to run CASP15 dataset inference with NeuralPLexer. Please see the config files within `configs/data/`, `configs/model/`, and `configs/analysis/` for more details.
+**NOTE:** One can reproduce the *pocket-only* experiments with the PoseBusters Benchmark set by adding the argument `pocket_only_baseline=true` to each command below used to run PoseBusters Benchmark dataset inference with all the baseline methods (n.b., besides `tulip`, which does not support pocket-level docking currently), since the pocket-only versions of the dataset's holo-aligned predicted protein structures have also been included in the downloadable Zenodo archive `posebusters_benchmark_set.tar.gz` referenced above. Similarly, one can reproduce the *NeuralPLexer w/o inter-ligand clash loss (ILCL)* experiments with the CASP15 set by adding the argument `no_ilcl=true` (`neuralplexer_no_ilcl=true`) to the commands `python3 posebench/models/neuralplexer_inference.py dataset=casp15 ...` and `python3 posebench/analysis/inference_analysis_casp.py dataset=casp15 ...` below (`python3 posebench/models/ensemble_generation.py ensemble_benchmarking_dataset=casp15 ...`) used to run CASP15 dataset inference with NeuralPLexer. Lastly, one can reproduce the *DiffDock w/o structural cluster training (SCT)* experiments by adding the argument `v1_baseline=true` to the DiffDock inference commands below. Please see the config files within `configs/data/`, `configs/model/`, and `configs/analysis/` for more details.
 
 ### Downloading sequence databases (required only for RoseTTAFold-All-Atom inference)
 

diff --git a/configs/analysis/complex_alignment.yaml b/configs/analysis/complex_alignment.yaml
@@ -3,9 +3,10 @@ vina_binding_site_method: diffdock # the method to use for Vina binding site pre
 dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
-output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline}} # the output directory to which to save the relaxed predictions
+output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline},${v1_baseline}} # the output directory to which to save the relaxed predictions
 rank_to_align: 1 # the pose rank to align
 aligned_filename_suffix: "_aligned" # the suffix to append to each aligned complex filename
 force_process: false # whether to force processing of all complexes, even if they have already been processed
 repeat_index: 1 # the repeat index which was used for inference
 pocket_only_baseline: false # whether to prepare the pocket-only baseline
+v1_baseline: false # whether to prepare the v1 baseline
diff --git a/configs/analysis/inference_analysis.yaml b/configs/analysis/inference_analysis.yaml
@@ -7,7 +7,8 @@ input_csv_path: ${resolve_method_input_csv_path:${method},${dataset},${pocket_on
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
 dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test_rmsd_filtered.txt # the path to the (predicted RMSD-filtered) DockGen test set IDs file
-output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline}} # the output directory to which to save the relaxed predictions
+output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline},${v1_baseline}} # the output directory to which to save the relaxed predictions
 repeat_index: 1 # the repeat index which was used for inference
 pocket_only_baseline: false # whether to analyze the pocket-only baseline
+v1_baseline: false # whether to analyze the v1 baseline
 relax_protein: false # whether to relax the protein - NOTE: currently periodically yields unpredictable protein-ligand separation
diff --git a/configs/analysis/inference_analysis_casp.yaml b/configs/analysis/inference_analysis_casp.yaml
@@ -14,3 +14,4 @@ score_relaxed_structures: true # whether to score relaxed structures in addition
 repeat_index: 1 # the run index to use for scoring predictions
 no_ilcl: false # whether to score a model trained without an inter-ligand clash loss (ILCL) - NOTE: only applicable to the `neuralplexer` method
 relax_protein: false # whether to relax the protein - NOTE: currently periodically yields unpredictable protein-ligand separation
+v1_baseline: false # whether to score the v1 baseline predictions
diff --git a/configs/model/diffdock_inference.yaml b/configs/model/diffdock_inference.yaml
@@ -16,3 +16,4 @@ repeat_index: 1 # the repeat index to use for inference
 skip_existing: true # whether to skip inference for existing output directories
 pocket_only_baseline: false # whether to run the pocket-only baseline
 max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
+v1_baseline: false # whether to run the v1 baseline
diff --git a/configs/model/ensemble_generation.yaml b/configs/model/ensemble_generation.yaml
@@ -58,6 +58,7 @@ diffdock_batch_size: 10 # the batch size to use for inference
 diffdock_actual_steps: 19 # the actual number of inference steps to run (i.e., after how many steps to halt the reverse diffusion process)
 diffdock_no_final_step_noise: true # whether to disable the final inference step's noise from being added
 diffdock_skip_existing: true # whether to skip existing predictions
+diffdock_v1_baseline: false # whether to run the v1 baseline
 # DynamicBind inference arguments:
 dynamicbind_python_exec_path: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/DynamicBind/bin/python3 # the Python executable to use
 dynamicbind_exec_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind # the DynamicBind directory in which to execute the inference scripts

diff --git a/configs/model/inference_relaxation.yaml b/configs/model/inference_relaxation.yaml
@@ -11,8 +11,8 @@ platform: "fastest" # platform on which to run relaxation
 cuda_device_index: 0 # CUDA device index
 log_level: "INFO" # logging level
 protein_dir: ${resolve_method_protein_dir:${method},${dataset},${repeat_index},${pocket_only_baseline}} # the directory from which to load (potentially inferred) proteins
-ligand_dir: ${resolve_method_ligand_dir:${method},${dataset},${vina_binding_site_method},${repeat_index},${pocket_only_baseline}} # the directory from which to load inferred ligands
-output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline}} # the output directory to which to save the relaxed predictions
+ligand_dir: ${resolve_method_ligand_dir:${method},${dataset},${vina_binding_site_method},${repeat_index},${pocket_only_baseline},${v1_baseline}} # the directory from which to load inferred ligands
+output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline},${v1_baseline}} # the output directory to which to save the relaxed predictions
 relax_protein: false # whether to relax the protein - NOTE: currently periodically yields unpredictable protein-ligand separation
 remove_initial_protein_hydrogens: false # whether to remove hydrogens from the initial protein
 assign_each_ligand_unique_force: false # when relaxing the protein, whether to assign each ligand a unique force constant
@@ -25,3 +25,4 @@ max_num_attempts: 5 # when relaxing the protein, maximum number of relaxation at
 skip_existing: true # whether to skip existing relaxed predictions
 repeat_index: 1 # the repeat index which was used for inference
 pocket_only_baseline: false # whether to prepare the pocket-only baseline
+v1_baseline: false # whether to prepare the v1 baseline
diff --git a/configs/model/vina_inference.yaml b/configs/model/vina_inference.yaml
@@ -5,7 +5,7 @@ ensemble_ranking_method: consensus # the method with which to rank-order and sel
 python2_exec_path: ${oc.env:PROJECT_ROOT}/forks/Vina/ADFR/bin/python # the path to the Python 2 executable
 p2rank_exec_path: ${oc.env:PROJECT_ROOT}/forks/P2Rank/p2rank_2.4.2/prank # the path to the P2Rank executable
 prepare_receptor_script_path: ${oc.env:PROJECT_ROOT}/forks/Vina/ADFR/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py # the path to the prepare_receptor.py script
-input_dir: ${resolve_method_output_dir:${method},${dataset},${method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline}} # the input directory with which to run inference
+input_dir: ${resolve_method_output_dir:${method},${dataset},${method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline},${v1_baseline}} # the input directory with which to run inference
 input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_predicted_structures # the input protein structure directory to parse
 output_dir: ${oc.env:PROJECT_ROOT}/data/test_cases/${dataset}/vina_${method}_${dataset}_outputs_${repeat_index} # the output directory to which to save the inference results
 cpu: 0 # the number of CPU workers to use with AutoDock Vina for parallel processing, 0 for all available
@@ -25,6 +25,7 @@ apo_protein_filepath: null # the apo protein file path to use for inference
 input_id: null # the input ID to use for inference
 repeat_index: 1 # the repeat index to use for inference
 pocket_only_baseline: false # whether to run the pocket-only baseline
+v1_baseline: false # whether to run the v1 baseline
 max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
 # p2rank inference arguments:
 p2rank_exec_utility: predict # the P2Rank executable utility to use for inference

diff --git a/configs/scripts/build_inference_script.yaml b/configs/scripts/build_inference_script.yaml
@@ -7,6 +7,7 @@ repeat_index: 1 # the repeat index which was used for inference
 cuda_device_index: 0 # the CUDA device index to use for inference (for all methods except AutoDock-Vina)
 output_script_dir: ${oc.env:PROJECT_ROOT}/scripts/inference # the directory in which to save the output script
 pocket_only_baseline: null # whether to perform a pocket-only baseline for the PoseBusters Benchmark set - NOTE: applicable to all methods except `tulip`
+v1_baseline: false # whether to perform the V1 baseline for DiffDock
 no_ilcl: null # whether to use model weights trained without an inter-ligand clash loss (ILCL) for the CASP15 set - NOTE: only applicable to `neuralplexer`
 relax_protein: null # whether to relax the protein structure before scoring - NOTE: currently in an experimental state
 export_hpc_headers: true # whether to insert high-performance computing (by default, SLURM) headers into the output script

diff --git a/forks/DiffDockv1/.gitattributes b/forks/DiffDockv1/.gitattributes
@@ -0,0 +1,12 @@
+*.ipynb linguist-vendored=false
+*.ipynb linguist-detectable=false
+
+/jupyter_notebooks linguist-vendored=false
+
+jupyter_notebooks/** linguist-vendored
+
+jupyter_notebooks/** linguist-vendored=false
+
+
+jupyter_notebooks/* linguist-vendored
+jupyter_notebooks/* linguist-vendored=false
diff --git a/forks/DiffDockv1/.gitignore b/forks/DiffDockv1/.gitignore
@@ -0,0 +1,166 @@
+homework
+inference_out_dir_not_specified
+.plotly_cache
+.DS_store
+renew.sh
+tmux_renew.sh
+images
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+.so3_*
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+local_config_inference2.yml
+.vscode/
+
+
+*.zip
+
+.idea/
+
+
+#################### Project specific
+.p.npy
+.score.npy
+# this ignores everything in data except for the file
+!/data
+/data/*
+!/data/splits
+!/data/protein_ligand_example_csv.csv
+!/data/testset_csv.csv
+!/data/INDEX_general_PL_data.2020
+test_run
+
+cache
+wandb
+logs
+
+# temporary files
+.openbabel_cache
+temp/
+bsub*
+stderr*
+stdout*
+!/workdir
+/workdir/*
+!/workdir/paper_confidence_model
+!/workdir/paper_score_model
+runs2
+results
+# this excludes everything in the runs directory except for that specific run
+!/runs
+/runs/*
+!/runs/rigid_redocking
+!/runs/flexible_self_docking
+local_config.yml
+local_config_inference.yml
+local_config_confidence.yml
+temp1.py
+temp5.py
+temp3.py
+temp4.py
+temp5.py
+temp6.py
+temp7.py
+esm
+
diff --git a/forks/DiffDockv1/LICENSE b/forks/DiffDockv1/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Gabriele Corso, Hannes Stärk, Bowen Jing
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.