@@ -76,6 +76,7 @@ cd FlowDock
7676mamba env create -f environments/flowdock_environment.yaml
7777conda activate FlowDock # NOTE: one still needs to use `conda` to (de)activate environments
7878pip3 install -e . # install local project as package
79+ pip3 install prody==2.4.1 --no-dependencies # install ProDy without NumPy dependency
7980```
8081
8182Download checkpoints
@@ -159,6 +160,16 @@ mv pdb_2021aug02/ pdbsidechain/
159160cd ../
160161```
161162
163+ Lastly, to finetune `FlowDock` using the `PLINDER` dataset, one must first prepare this data for training:
164+
165+ ``` bash
166+ # fetch PLINDER data (NOTE: requires ~1 hour to download and ~750G of storage)
167+ export PLINDER_MOUNT="$(pwd)/data/PLINDER"
168+ mkdir -p "$PLINDER_MOUNT" # create the directory if it doesn't exist
169+
170+ plinder_download -y
171+ ```
172+
162173### Generating ESM2 embeddings for each protein (optional, cached input data available on SharePoint)
163174
164175To generate the ESM2 embeddings for the protein inputs,
@@ -260,10 +271,10 @@ python flowdock/train.py experiment=flowdock_fm
260271python flowdock/train.py experiment=flowdock_fm trainer.max_epochs=20 data.batch_size=8
261272```
262273
263- For example, override parameters to finetune ` FlowDock ` 's pretrained weights using a new dataset
274+ For example, override parameters to finetune `FlowDock`'s pretrained weights using a new dataset such as [PLINDER](https://www.plinder.sh/)
264275
265276``` bash
266- python flowdock/train.py experiment=flowdock_fm data=my_new_datamodule ckpt_path=checkpoints/esmfold_prior_paper_weights.ckpt
277+ python flowdock/train.py experiment=flowdock_fm data=plinder ckpt_path=checkpoints/esmfold_prior_paper_weights.ckpt
267278```
268279
269280</details>
@@ -277,7 +288,7 @@ To reproduce `FlowDock`'s evaluation results for structure prediction, please re
277288To reproduce `FlowDock`'s evaluation results for binding affinity prediction using the PDBBind dataset
278289
279290``` bash
280- python flowdock/eval.py data.test_datasets=[pdbbind] ckpt_path=checkpoints/esmfold_prior_paper_weights_EMA .ckpt trainer=gpu
291+ python flowdock/eval.py data.test_datasets=[pdbbind] ckpt_path=checkpoints/esmfold_prior_paper_weights-EMA.ckpt trainer=gpu
281292... # re-run two more times to gather triplicate results
282293```
283294
@@ -353,13 +364,13 @@ jupyter notebook notebooks/casp16_binding_affinity_prediction_results_plotting.i
353364For example, generate new protein-ligand complexes for a pair of protein sequence and ligand SMILES strings such as those of the PDBBind 2020 test target `6i67`
354365
355366``` bash
356- python flowdock/sample.py ckpt_path=checkpoints/esmfold_prior_paper_weights_EMA .ckpt model.cfg.prior_type=esmfold sampling_task=batched_structure_sampling input_receptor=' YNKIVHLLVAEPEKIYAMPDPTVPDSDIKALTTLCDLADRELVVIIGWAKHIPGFSTLSLADQMSLLQSAWMEILILGVVYRSLFEDELVYADDYIMDEDQSKLAGLLDLNNAILQLVKKYKSMKLEKEEFVTLKAIALANSDSMHIEDVEAVQKLQDVLHEALQDYEAGQHMEDPRRAGKMLMTLPLLRQTSTKAVQHFYNKLEGKVPMHKLFLEMLEAKV' input_ligand=' "c1cc2c(cc1O)CCCC2"' input_template=data/pdbbind/pdbbind_holo_aligned_esmfold_structures/6i67_holo_aligned_esmfold_protein.pdb sample_id=' 6i67' out_path=' ./6i67_sampled_structures/' n_samples=5 chunk_size=5 num_steps=40 sampler=VDODE sampler_eta=1.0 start_time=' 1.0' use_template=true separate_pdb=true visualize_sample_trajectories=true auxiliary_estimation_only=false esmfold_chunk_size=null trainer=gpu
367+ python flowdock/sample.py ckpt_path=checkpoints/esmfold_prior_paper_weights-EMA.ckpt model.cfg.prior_type=esmfold sampling_task=batched_structure_sampling input_receptor='YNKIVHLLVAEPEKIYAMPDPTVPDSDIKALTTLCDLADRELVVIIGWAKHIPGFSTLSLADQMSLLQSAWMEILILGVVYRSLFEDELVYADDYIMDEDQSKLAGLLDLNNAILQLVKKYKSMKLEKEEFVTLKAIALANSDSMHIEDVEAVQKLQDVLHEALQDYEAGQHMEDPRRAGKMLMTLPLLRQTSTKAVQHFYNKLEGKVPMHKLFLEMLEAKV' input_ligand='"c1cc2c(cc1O)CCCC2"' input_template=data/pdbbind/pdbbind_holo_aligned_esmfold_structures/6i67_holo_aligned_esmfold_protein.pdb sample_id='6i67' out_path='./6i67_sampled_structures/' n_samples=5 chunk_size=5 num_steps=40 sampler=VDODE sampler_eta=1.0 start_time='1.0' use_template=true separate_pdb=true visualize_sample_trajectories=true auxiliary_estimation_only=false esmfold_chunk_size=null trainer=gpu
357368```
358369
359370Or, for example, generate new protein-ligand complexes for pairs of protein sequences and (multi-)ligand SMILES strings (delimited via `|`) such as those of the CASP15 target `T1152`
360371
361372``` bash
362- python flowdock/sample.py ckpt_path=checkpoints/esmfold_prior_paper_weights_EMA .ckpt model.cfg.prior_type=esmfold sampling_task=batched_structure_sampling input_receptor=' MYTVKPGDTMWKIAVKYQIGISEIIAANPQIKNPNLIYPGQKINIP|MYTVKPGDTMWKIAVKYQIGISEIIAANPQIKNPNLIYPGQKINIP|MYTVKPGDTMWKIAVKYQIGISEIIAANPQIKNPNLIYPGQKINIPN' input_ligand=' "CC(=O)NC1C(O)OC(CO)C(OC2OC(CO)C(OC3OC(CO)C(O)C(O)C3NC(C)=O)C(O)C2NC(C)=O)C1O"' input_template=data/test_cases/predicted_structures/T1152.pdb sample_id=' T1152' out_path=' ./T1152_sampled_structures/' n_samples=5 chunk_size=5 num_steps=40 sampler=VDODE sampler_eta=1.0 start_time=' 1.0' use_template=true separate_pdb=true visualize_sample_trajectories=true auxiliary_estimation_only=false esmfold_chunk_size=null trainer=gpu
373+ python flowdock/sample.py ckpt_path=checkpoints/esmfold_prior_paper_weights-EMA.ckpt model.cfg.prior_type=esmfold sampling_task=batched_structure_sampling input_receptor='MYTVKPGDTMWKIAVKYQIGISEIIAANPQIKNPNLIYPGQKINIP|MYTVKPGDTMWKIAVKYQIGISEIIAANPQIKNPNLIYPGQKINIP|MYTVKPGDTMWKIAVKYQIGISEIIAANPQIKNPNLIYPGQKINIPN' input_ligand='"CC(=O)NC1C(O)OC(CO)C(OC2OC(CO)C(OC3OC(CO)C(O)C(O)C3NC(C)=O)C(O)C2NC(C)=O)C1O"' input_template=data/test_cases/predicted_structures/T1152.pdb sample_id='T1152' out_path='./T1152_sampled_structures/' n_samples=5 chunk_size=5 num_steps=40 sampler=VDODE sampler_eta=1.0 start_time='1.0' use_template=true separate_pdb=true visualize_sample_trajectories=true auxiliary_estimation_only=false esmfold_chunk_size=null trainer=gpu
363374```
364375
365376If you do not already have a template protein structure available for your target of interest, set `input_template=null` to instead have the sampling script predict the ESMFold structure of your provided `input_receptor` sequence before running the sampling pipeline. For more information regarding the input arguments available for sampling, please refer to the config at `configs/sample.yaml`.
@@ -369,7 +380,7 @@ If you do not already have a template protein structure available for your targe
369380For instance, one can perform batched prediction as follows:
370381
371382``` bash
372- python flowdock/sample.py ckpt_path=checkpoints/esmfold_prior_paper_weights_EMA .ckpt model.cfg.prior_type=esmfold sampling_task=batched_structure_sampling csv_path=' ./data/test_cases/prediction_inputs/flowdock_batched_inputs.csv' out_path=' ./T1152_batch_sampled_structures/' n_samples=5 chunk_size=5 num_steps=40 sampler=VDODE sampler_eta=1.0 start_time=' 1.0' use_template=true separate_pdb=true visualize_sample_trajectories=false auxiliary_estimation_only=false esmfold_chunk_size=null trainer=gpu
383+ python flowdock/sample.py ckpt_path=checkpoints/esmfold_prior_paper_weights-EMA.ckpt model.cfg.prior_type=esmfold sampling_task=batched_structure_sampling csv_path='./data/test_cases/prediction_inputs/flowdock_batched_inputs.csv' out_path='./T1152_batch_sampled_structures/' n_samples=5 chunk_size=5 num_steps=40 sampler=VDODE sampler_eta=1.0 start_time='1.0' use_template=true separate_pdb=true visualize_sample_trajectories=false auxiliary_estimation_only=false esmfold_chunk_size=null trainer=gpu
373384```
374385
375386</details>
0 commit comments