Add bulk support for Chai-1

BioinfoMachineLearning · Sep 24, 2024 · e169af7 · e169af7
1 parent 4706f3f
commit e169af7
Show file tree

Hide file tree

Showing 24 changed files with 848 additions and 41 deletions.
diff --git a/.gitignore b/.gitignore
@@ -164,6 +164,8 @@ configs/local/default.yaml
 # Forks
 /workdir/
 /forks/chai-lab/chai-lab/
+/forks/chai-lab/prediction_inputs/
+/forks/chai-lab/prediction_outputs/
 /forks/DiffDock1.0/
 /forks/DiffDock/DiffDock/
 /forks/DynamicBind/*.npy

diff --git a/README.md b/README.md
@@ -99,6 +99,7 @@ cd forks/RoseTTAFold-All-Atom/rf2aa/SE3Transformer/ && pip3 install --no-cache-d
 # - Chai-1 environment (~6 GB)
 mamba env create -f environments/chai_lab_environment.yaml --prefix forks/chai-lab/chai-lab/
 conda activate forks/chai-lab/chai-lab/  # NOTE: one still needs to use `conda` to (de)activate environments
+pip3 install forks/chai-lab/
 # - AutoDock Vina Tools environment (~1 GB)
 mamba env create -f environments/adfr_environment.yaml --prefix forks/Vina/ADFR/
 conda activate forks/Vina/ADFR/  # NOTE: one still needs to use `conda` to (de)activate environments
@@ -191,6 +192,10 @@ rm neuralplexer_benchmark_method_predictions.tar.gz
 wget https://zenodo.org/records/11477766/files/rfaa_benchmark_method_predictions.tar.gz
 tar -xzf rfaa_benchmark_method_predictions.tar.gz
 rm rfaa_benchmark_method_predictions.tar.gz
+# Chai-1 predictions and results
+wget https://zenodo.org/records/11477766/files/chai_benchmark_method_predictions.tar.gz
+tar -xzf chai_benchmark_method_predictions.tar.gz
+rm chai_benchmark_method_predictions.tar.gz
 # TULIP predictions and results
 wget https://zenodo.org/records/11477766/files/tulip_benchmark_method_predictions.tar.gz
 tar -xzf tulip_benchmark_method_predictions.tar.gz
@@ -301,11 +306,12 @@ python3 posebench/data/components/protein_apo_to_holo_alignment.py dataset=astex
 
 #### Flexible Protein Methods
 
-| Name                   | Source                                                                | Astex Benchmarked | PoseBusters Benchmarked | DockGen Benchmarked | CASP Benchmarked |
-| ---------------------- | --------------------------------------------------------------------- | ----------------- | ----------------------- | ------------------- | ---------------- |
-| `DynamicBind`          | [Lu et al.](https://www.nature.com/articles/s41467-024-45461-2)       | ✓                 | ✓                       | ✓                   | ✓                |
-| `NeuralPLexer`         | [Qiao et al.](https://www.nature.com/articles/s42256-024-00792-z)     | ✓                 | ✓                       | ✓                   | ✓                |
-| `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528) | ✓                 | ✓                       | ✓                   | ✓                |
+| Name                   | Source                                                                        | Astex Benchmarked | PoseBusters Benchmarked | DockGen Benchmarked | CASP Benchmarked |
+| ---------------------- | ----------------------------------------------------------------------------- | ----------------- | ----------------------- | ------------------- | ---------------- |
+| `DynamicBind`          | [Lu et al.](https://www.nature.com/articles/s41467-024-45461-2)               | ✓                 | ✓                       | ✓                   | ✓                |
+| `NeuralPLexer`         | [Qiao et al.](https://www.nature.com/articles/s42256-024-00792-z)             | ✓                 | ✓                       | ✓                   | ✓                |
+| `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528)         | ✓                 | ✓                       | ✓                   | ✓                |
+| `Chai-1`               | [Chai Discovery](https://chaiassets.com/chai-1/paper/technical_report_v1.pdf) | ✓                 | ✓                       | ✓                   | ✓                |
 
 ### Methods available for ensembling
 
@@ -319,11 +325,12 @@ python3 posebench/data/components/protein_apo_to_holo_alignment.py dataset=astex
 
 #### Flexible Protein Methods
 
-| Name                   | Source                                                                | Astex Benchmarked | PoseBusters Benchmarked | DockGen Benchmarked | CASP Benchmarked |
-| ---------------------- | --------------------------------------------------------------------- | ----------------- | ----------------------- | ------------------- | ---------------- |
-| `DynamicBind`          | [Lu et al.](https://www.nature.com/articles/s41467-024-45461-2)       | ✓                 | ✓                       | ✓                   | ✓                |
-| `NeuralPLexer`         | [Qiao et al.](https://www.nature.com/articles/s42256-024-00792-z)     | ✓                 | ✓                       | ✓                   | ✓                |
-| `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528) | ✓                 | ✓                       | ✓                   | ✓                |
+| Name                   | Source                                                                        | Astex Benchmarked | PoseBusters Benchmarked | DockGen Benchmarked | CASP Benchmarked |
+| ---------------------- | ----------------------------------------------------------------------------- | ----------------- | ----------------------- | ------------------- | ---------------- |
+| `DynamicBind`          | [Lu et al.](https://www.nature.com/articles/s41467-024-45461-2)               | ✓                 | ✓                       | ✓                   | ✓                |
+| `NeuralPLexer`         | [Qiao et al.](https://www.nature.com/articles/s42256-024-00792-z)             | ✓                 | ✓                       | ✓                   | ✓                |
+| `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528)         | ✓                 | ✓                       | ✓                   | ✓                |
+| `Chai-1`               | [Chai Discovery](https://chaiassets.com/chai-1/paper/technical_report_v1.pdf) | ✓                 | ✓                       | ✓                   | ✓                |
 
 **NOTE**: Have a new method to add? Please let us know by creating a pull request. We would be happy to work with you to integrate new methodology into this benchmark!
 
@@ -652,6 +659,74 @@ python3 posebench/analysis/inference_analysis_casp.py method=rfaa dataset=casp15
 ...
 ```
 
+### How to run inference with `Chai-1`
+
+Prepare CSV input files
+
+```bash
+python3 posebench/data/chai_input_preparation.py dataset=posebusters_benchmark
+python3 posebench/data/chai_input_preparation.py dataset=astex_diverse
+python3 posebench/data/chai_input_preparation.py dataset=dockgen
+python3 posebench/data/chai_input_preparation.py dataset=casp15 input_data_dir=data/casp15_set/targets
+```
+
+Run inference on each dataset
+
+```bash
+conda activate forks/chai-lab/chai-lab/
+python3 posebench/models/chai_inference.py dataset=posebusters_benchmark
+python3 posebench/models/chai_inference.py dataset=astex_diverse
+python3 posebench/models/chai_inference.py dataset=dockgen
+python3 posebench/models/chai_inference.py dataset=casp15
+conda deactivate
+```
+
+Extract predictions into separate files for proteins and ligands
+
+```bash
+python3 posebench/data/chai_output_extraction.py dataset=posebusters_benchmark
+python3 posebench/data/chai_output_extraction.py dataset=astex_diverse
+python3 posebench/data/chai_output_extraction.py dataset=dockgen
+python3 posebench/data/chai_output_extraction.py dataset=casp15
+```
+
+Relax the generated ligand structures inside of their respective protein pockets
+
+```bash
+python3 posebench/models/inference_relaxation.py method=chai-lab dataset=posebusters_benchmark remove_initial_protein_hydrogens=true
+python3 posebench/models/inference_relaxation.py method=chai-lab dataset=astex_diverse remove_initial_protein_hydrogens=true
+python3 posebench/models/inference_relaxation.py method=chai-lab dataset=dockgen remove_initial_protein_hydrogens=true
+```
+
+Align predicted protein-ligand structures to ground-truth complex structures
+
+```bash
+python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=posebusters_benchmark
+python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=astex_diverse
+python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=dockgen
+```
+
+Analyze inference results for each dataset
+
+```bash
+python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=posebusters_benchmark
+python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=astex_diverse
+python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=dockgen
+```
+
+Analyze inference results for the CASP15 dataset
+
+```bash
+# first assemble (unrelaxed and post ranking-relaxed) CASP15-compliant prediction submission files for scoring
+python3 posebench/models/ensemble_generation.py ensemble_methods=\[chai-lab\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_chai-lab_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=false export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=40 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
+python3 posebench/models/ensemble_generation.py ensemble_methods=\[chai-lab\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_chai-lab_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=true export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=40 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
+# NOTE: the suffixes for both `output_dir` and `ensemble_benchmarking_repeat_index` should be modified to e.g., 2, 3, ...
+...
+# now score the CASP15-compliant submissions using the official CASP scoring pipeline
+python3 posebench/analysis/inference_analysis_casp.py method=chai-lab dataset=casp15 repeat_index=1
+...
+```
+
 ### How to run inference with `AutoDock Vina`
 
 Prepare CSV input files

diff --git a/configs/analysis/complex_alignment.yaml b/configs/analysis/complex_alignment.yaml
@@ -1,5 +1,5 @@
-method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
-vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
+method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`)
+vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`)
 dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse

diff --git a/configs/analysis/inference_analysis.yaml b/configs/analysis/inference_analysis.yaml
@@ -1,6 +1,6 @@
 full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
-method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`)
-vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `p2rank`)
+method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`, `vina`, `ensemble`)
+vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`, `p2rank`)
 dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_csv_path: ${resolve_method_input_csv_path:${method},${dataset},${pocket_only_baseline}} # the input CSV filepath with which to run inference

diff --git a/configs/analysis/inference_analysis_casp.yaml b/configs/analysis/inference_analysis_casp.yaml
@@ -1,8 +1,8 @@
 full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
 python_exec_path: ${oc.env:HOME}/mambaforge/envs/casp15_ligand_scoring/bin/python3 # the Python executable to use
 scoring_script_path: ${oc.env:PROJECT_ROOT}/posebench/analysis/casp15_ligand_scoring/score_predictions.py # the path to the script to use for scoring CASP predictions
-method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`, `tulip`)
-vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
+method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`, `vina`, `ensemble`, `tulip`)
+vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`)
 dataset: casp15 # the dataset to use - NOTE: must be one of (`casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 predictions_dir: ${oc.env:PROJECT_ROOT}/data/test_cases/${dataset}/top_${method}_ensemble_predictions_${repeat_index} # the directory containing the predictions to analyze

diff --git a/configs/data/chai_input_preparation.yaml b/configs/data/chai_input_preparation.yaml
@@ -0,0 +1,9 @@
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
+input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
+output_scripts_path: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_inputs/${dataset} # the output directory in which to save the input files
+posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
+protein_filepath: null # the path to the protein structure file to use
+ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
+input_id: null # the input ID to use for inference
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/configs/data/chai_output_extraction.yaml b/configs/data/chai_output_extraction.yaml
@@ -0,0 +1,13 @@
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
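+prediction_inputs_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_inputs/${dataset} # the directory containing the prepared inference inputs - NOTE: matches `input_dir` in `configs/model/chai_inference.yaml`
+prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_outputs/${dataset}_${repeat_index} # the directory containing the saved inference results - NOTE: matches `output_dir` in `configs/model/chai_inference.yaml`
+inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/inference/chai-lab_${dataset}_outputs_${repeat_index} # NOTE(review): presumably the directory to which the extracted predictions are written - confirm against `chai_output_extraction.py`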
+input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
+posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
+complex_filepath: null # if not `null`, this should be the path to the complex PDB file for which to extract outputs
+complex_id: null # if not `null`, this should be the complex ID of the single complex for which to extract outputs
+ligand_smiles: null # if not `null`, this should be the ligand SMILES string (with fragments separated by `.`) of the single complex for which to extract outputs
+output_dir: null # if not `null`, this should be the path to the output directory to which to write the extracted outputs
+repeat_index: 1 # the repeat index with which inference was run
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/configs/model/chai_inference.yaml b/configs/model/chai_inference.yaml
@@ -0,0 +1,8 @@
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
+input_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_inputs/${dataset} # the input directory with which to run inference
+output_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_outputs/${dataset}_${repeat_index} # the output directory to which to save the inference results
+cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
+repeat_index: 1 # the repeat index to use for inference
+skip_existing: true # whether to skip running inference if the prediction for a target already exists
+pocket_only_baseline: false # whether to run the pocket-only baseline
+max_num_inputs: null # if provided, the maximum number of inputs (i.e., a subset of the dataset) over which to run inference
diff --git a/configs/model/ensemble_generation.yaml b/configs/model/ensemble_generation.yaml
@@ -1,5 +1,5 @@
 # General inference arguments:
-ensemble_methods: [diffdock, dynamicbind, neuralplexer, rfaa] # the methods from which to gather predictions for ensembling - NOTE: must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `tulip`)
+ensemble_methods: [diffdock, dynamicbind, neuralplexer, rfaa] # the methods from which to gather predictions for ensembling - NOTE: each must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`, `vina`, `tulip`)
 generate_vina_scripts: false # whether to generate Vina scripts using other methods' binding site predictions - NOTE: `resume` must also be `true` when this is `true`, meaning other methods' predictions must have already been generated locally
 rank_single_method_intrinsically: true # whether to rank single-method predictions using either `consensus` or `vina` ranking (false) or instead using their intrinsic (explicit) rank assignment (true)
 output_bash_file_dir: ensemble_generation_scripts # the directory in which to save the generated Bash scripts
@@ -107,6 +107,9 @@ rfaa_output_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/inference/rfa
 rfaa_max_cycles: 10 # the maximum number of recycling iterations to run
 rfaa_inference_config_name: null # the name of the inference config to use - NOTE: if `run_inference_directly` is true, this must reference a valid YAML config file name e.g., that was generated by `python posebench/models/rfaa_inference.py` with `run_inference_directly=false`
 rfaa_inference_dir_name: null # the name of the inference output directory to use
+# Chai-1 inference arguments:
+chai_out_path: ${oc.env:PROJECT_ROOT}/forks/chai-lab/inference/chai-lab_ensemble_outputs # the output directory to which to write the predictions
+chai_skip_existing: true # whether to skip running inference if the prediction for a target already exists
 # Vina inference arguments:
 vina_binding_site_methods: [diffdock, p2rank] # the methods to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `p2rank`)
 vina_python2_exec_path: ${oc.env:PROJECT_ROOT}/forks/Vina/ADFR/bin/python # the path to the Python 2 executable