Skip to content

Commit

Permalink
Merge pull request #7 from BioinfoMachineLearning/af
Browse files Browse the repository at this point in the history
v0.5.0 - Add results with AlphaFold 3 predicted structures and for Chai-1
  • Loading branch information
amorehead authored Sep 30, 2024
2 parents 7b98b21 + e8db61e commit 4ac71fd
Show file tree
Hide file tree
Showing 311 changed files with 42,472 additions and 3,096 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality-main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ jobs:
python-version: "3.10"

- name: Run pre-commits
uses: pre-commit/action@v2.0.3
uses: pre-commit/action@v3.0.1
2 changes: 1 addition & 1 deletion .github/workflows/code-quality-pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ jobs:
run: echo '${{ steps.file_changes.outputs.files}}'

- name: Run pre-commits
uses: pre-commit/action@v2.0.3
uses: pre-commit/action@v3.0.1
with:
extra_args: --files ${{ steps.file_changes.outputs.files}}
2 changes: 1 addition & 1 deletion .github/workflows/release-drafter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ jobs:

steps:
# Drafts your next Release notes as Pull Requests are merged into "master"
- uses: release-drafter/release-drafter@v5
- uses: release-drafter/release-drafter@v6
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ configs/local/default.yaml

# Forks
/workdir/
/forks/chai-lab/chai-lab/
/forks/chai-lab/prediction_inputs/
/forks/chai-lab/prediction_outputs/
/forks/DiffDock1.0/
/forks/DiffDock/DiffDock/
/forks/DynamicBind/*.npy
Expand All @@ -189,4 +192,4 @@ configs/local/default.yaml
/forks/RoseTTAFold-All-Atom/psipred/
/forks/TULIP/outputs/
/forks/Vina/ADFR/
scripts/inference/
scripts/*inference*/
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
### 0.5.0 - 09/30/2024

- Added results with AlphaFold 3 predicted structures (now the default)
- Added results for the new Chai-1 model from Chai Discovery
- Added a new inference sweep pipeline for HPC clusters to allow users to quickly run an exhaustive sweep of all baseline methods, datasets, and tasks (e.g., using generated batch scripts and a SLURM scheduler)
- Updated Zenodo links to point to the latest version of the project's Zenodo record, which now includes the above-mentioned AlphaFold 3 predicted structures and baseline method results using them
- Updated documentation project-wide according to the additions listed above
- Fixed some CI testing issues

### 0.4.0 - 08/12/2024

- Renamed `src` root directory to `posebench` to support `pip` packaging
Expand Down
302 changes: 205 additions & 97 deletions README.md

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions configs/analysis/complex_alignment.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the output directory to which to save the relaxed predictions
output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline},${v1_baseline}} # the output directory to which to save the aligned predictions
rank_to_align: 1 # the pose rank to align
aligned_filename_postfix: "_aligned" # the postfix to append to each aligned complex filename
aligned_filename_suffix: "_aligned" # the suffix to append to each aligned complex filename
force_process: false # whether to force processing of all complexes, even if they have already been processed
repeat_index: 1 # the repeat index which was used for inference
pocket_only_baseline: false # whether to prepare the pocket-only baseline
v1_baseline: false # whether to prepare the v1 baseline
11 changes: 6 additions & 5 deletions configs/analysis/inference_analysis.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `p2rank`)
method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`, `vina`, `ensemble`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`, `p2rank`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
input_csv_path: ${resolve_method_input_csv_path:${method},${dataset}} # the input CSV filepath with which to run inference
input_csv_path: ${resolve_method_input_csv_path:${method},${dataset},${pocket_only_baseline}} # the input CSV filepath with which to run inference
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test_rmsd_filtered.txt # the path to the (ESMFold RMSD-filtered) DockGen test set IDs file
output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the output directory to which to save the relaxed predictions
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test_rmsd_filtered.txt # the path to the DockGen test set IDs file (filtered by predicted-structure RMSD)
output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index},${pocket_only_baseline},${v1_baseline}} # the output directory to which to save the relaxed predictions
repeat_index: 1 # the repeat index which was used for inference
pocket_only_baseline: false # whether to analyze the pocket-only baseline
v1_baseline: false # whether to analyze the v1 baseline
relax_protein: false # whether to relax the protein - NOTE: currently periodically yields unpredictable protein-ligand separation
8 changes: 5 additions & 3 deletions configs/analysis/inference_analysis_casp.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
python_exec_path: ${oc.env:HOME}/mambaforge/envs/casp15_ligand_scoring/bin/python3 # the Python executable to use
scoring_script_path: ${oc.env:PROJECT_ROOT}/posebench/analysis/casp15_ligand_scoring/score_predictions.py # the path to the script to use for scoring CASP predictions
method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`, `tulip`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`, `vina`, `ensemble`, `tulip`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `chai-lab`)
dataset: casp15 # the dataset to use - NOTE: must be one of (`casp15`)
ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
predictions_dir: ${oc.env:PROJECT_ROOT}/data/test_cases/${dataset}/top_${method}_ensemble_predictions_${repeat_index} # the directory containing the predictions to analyze
Expand All @@ -12,4 +12,6 @@ fault_tolerant: true # whether to continue processing targets if an error occurs
skip_existing: true # whether to skip processing targets for which output already exists
score_relaxed_structures: true # whether to score relaxed structures in addition to the original (unrelaxed) structures
repeat_index: 1 # the run index to use for scoring predictions
no_pretraining: false # whether to score a model without pretraining
no_ilcl: false # whether to score a model trained without an inter-ligand clash loss (ILCL) - NOTE: only applicable to the `neuralplexer` method
relax_protein: false # whether to relax the protein - NOTE: currently periodically yields unpredictable protein-ligand separation
v1_baseline: false # whether to score the v1 baseline predictions
9 changes: 9 additions & 0 deletions configs/data/chai_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
output_scripts_path: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_inputs/${dataset} # the output directory in which to save the input files
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
protein_filepath: null # the path to the protein structure file to use
ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
input_id: null # the input ID to use for inference
pocket_only_baseline: false # whether to prepare the pocket-only baseline
13 changes: 13 additions & 0 deletions configs/data/chai_output_extraction.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
prediction_inputs_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_inputs/${dataset}
prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_outputs/${dataset}_${repeat_index}
inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/inference/chai-lab_${dataset}_outputs_${repeat_index}
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
complex_filepath: null # if not `null`, this should be the path to the complex PDB file for which to extract outputs
complex_id: null # if not `null`, this should be the complex ID of the single complex for which to extract outputs
ligand_smiles: null # if not `null`, this should be the (i.e., `.` fragment-separated) complex ligand SMILES string of the single complex for which to extract outputs
output_dir: null # if not `null`, this should be the path to the output file to which to write the extracted outputs
repeat_index: 1 # the repeat index with which inference was run
pocket_only_baseline: false # whether to prepare the pocket-only baseline
2 changes: 1 addition & 1 deletion configs/data/diffdock_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein structure directory to parse
input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_predicted_structures # the input protein structure directory to parse
output_csv_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/inference/diffdock_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
Expand Down
1 change: 1 addition & 0 deletions configs/data/dynamicbind_input_preparation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_id
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
protein_filepath: null # the path to the protein structure file to use
ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
pocket_only_baseline: false # whether to prepare the pocket-only baseline
1 change: 1 addition & 0 deletions configs/data/fabind_input_preparation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-l
output_csv_path: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
pocket_only_baseline: false # whether to prepare the pocket-only baseline
2 changes: 1 addition & 1 deletion configs/data/neuralplexer_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
input_receptor_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # if not `null`, the input template protein structure directory to parse
input_receptor_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_predicted_structures # if not `null`, the input template protein structure directory to parse
output_csv_path: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer/inference/neuralplexer_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
Expand Down
1 change: 1 addition & 0 deletions configs/data/rfaa_output_extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ complex_id: null # if not `null`, this should be the complex ID of the single co
ligand_smiles: null # if not `null`, this should be the (i.e., `.` fragment-separated) complex ligand SMILES string of the single complex for which to extract outputs
output_dir: null # if not `null`, this should be the path to the output file to which to write the extracted outputs
repeat_index: 1 # the repeat index with which inference was run
pocket_only_baseline: false # whether to prepare the pocket-only baseline
1 change: 1 addition & 0 deletions configs/data/tulip_output_extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`pos
prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/TULIP/outputs/${dataset}_${repeat_index}
inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/TULIP/inference/tulip_${dataset}_outputs_${repeat_index}
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
method_top_n_to_select: 5 # the number of top models for each target to select for analysis
repeat_index: 1 # the repeat index to use
8 changes: 8 additions & 0 deletions configs/model/chai_inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_inputs/${dataset} # the input directory with which to run inference
output_dir: ${oc.env:PROJECT_ROOT}/forks/chai-lab/prediction_outputs/${dataset}_${repeat_index} # the output directory to which to save the inference results
cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
repeat_index: 1 # the repeat index to use for inference
skip_existing: true # whether to skip running inference if the prediction for a target already exists
pocket_only_baseline: false # whether to run the pocket-only baseline
max_num_inputs: null # if provided, the maximum number of inputs (i.e., a subset of the dataset) over which to run inference
2 changes: 2 additions & 0 deletions configs/model/diffdock_inference.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@ actual_steps: 19 # the actual number of inference steps to run (i.e., after how
no_final_step_noise: true # whether to disable the final inference step's noise from being added
repeat_index: 1 # the repeat index to use for inference
skip_existing: true # whether to skip inference for existing output directories
pocket_only_baseline: false # whether to run the pocket-only baseline
max_num_inputs: null # if provided, the maximum number of inputs (i.e., a subset of the dataset) over which to run inference
v1_baseline: false # whether to run the v1 baseline
3 changes: 2 additions & 1 deletion configs/model/dynamicbind_inference.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CP
python_exec_path: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/DynamicBind/bin/python3 # the Python executable to use
dynamicbind_exec_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind # the DynamicBind directory in which to execute the inference scripts
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein-ligand complex directory to recursively parse for protein inputs
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_predicted_structures # the input protein-ligand complex directory to recursively parse for protein inputs
input_ligand_csv_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/inference/dynamicbind_${dataset}_inputs # the input CSV directory with which to run inference
samples_per_complex: 40 # the number of samples to generate per complex
savings_per_complex: 1 # the (top-N) number of sample visualizations to save per complex
inference_steps: 20 # the number of inference steps to run for each complex
batch_size: 5 # the batch size to use for inference
cache_path: ${oc.env:PROJECT_ROOT}/data/dynamicbind_cache/cache # the cache directory to use for storing intermediate data files
header: ${dataset} # name of the results directory to create
num_workers: 1 # the number of workers to use for native relaxation during inference
skip_existing: true # whether to skip existing predictions
Expand Down
Loading

0 comments on commit 4ac71fd

Please sign in to comment.