update: refactor code structure for improved readability and maintainability

amorehead · amorehead · commit 30d79d9a73db · 2026-03-13T18:36:15.000Z
diff --git a/Dockerfile b/Dockerfile
@@ -46,6 +46,7 @@ WORKDIR /app/posebench
 ARG GIT_TAG=main
 RUN git clone https://github.com/BioinfoMachineLearning/posebench . --branch ${GIT_TAG} \
     && conda env update -f environments/posebench_environment.yaml \
+    && conda install -y -c conda-forge openff-toolkit=0.16.0 \
     && pip install -e . \
     && pip install numpy==1.26.4 --no-dependencies \
     && pip install prody==2.4.1 --no-dependencies \
diff --git a/configs/scripts/build_inference_script.yaml b/configs/scripts/build_inference_script.yaml
@@ -1,37 +1,37 @@
 # run arguments:
-method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `boltz`, `vina`, `ensemble`)
-vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `boltz`, `p2rank`)
+method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `boltz`, `alphafold3`, `vina`, `ensemble`)
+vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `boltz`, `alphafold3`, `p2rank`)
 ensemble_ranking_method: consensus # the method to use for ensemble ranking - NOTE: must be one of (`consensus`, `ff`)
 dataset: astex_diverse # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 repeat_index: 1 # the repeat index which was used for inference
 cuda_device_index: 0 # the CUDA device index to use for inference (for all methods except AutoDock-Vina)
 output_script_dir: ${oc.env:PROJECT_ROOT}/scripts/inference # the directory in which to save the output script
-pocket_only_baseline: null # whether to perform a pocket-only baseline for the PoseBusters Benchmark set - NOTE: not applicable only to `tulip`
+pocket_only_baseline: false # whether to perform a pocket-only baseline for the PoseBusters Benchmark set - NOTE: applicable to all methods except `tulip`
 v1_baseline: false # whether to perform the V1 baseline for DiffDock
-no_ilcl: null # whether to use model weights trained with an inter-ligand clash loss (ILCL) for the CASP15 set - NOTE: only applicable to `neuralplexer`
-relax_protein: null # whether to relax the protein structure before scoring - NOTE: currently in an experimental state
+no_ilcl: false # whether to use model weights trained without an inter-ligand clash loss (ILCL) for the CASP15 set - NOTE: only applicable to `neuralplexer`
+relax_protein: false # whether to relax the protein structure before scoring - NOTE: currently in an experimental state
 export_hpc_headers: true # whether to insert high-performance computing (by default, SLURM) headers into the output script
 verbose: false # whether to print verbose (e.g., invalid configuration) output
 # sweep arguments:
 sweep: false # whether to build all combinations of method-dataset run scripts
 methods_to_sweep: [
     "diffdock",
-    "fabind",
     "dynamicbind",
     "neuralplexer",
-    "flowdock",
     "rfaa",
+    # "chai-lab_ss",
     "chai-lab",
+    # "boltz_ss",
     "boltz",
+    # "alphafold3_ss",
+    "alphafold3",
     "vina",
-    "ensemble",
   ] # the methods to sweep
-vina_binding_site_methods_to_sweep: ["diffdock", "p2rank"] # the Vina binding site prediction methods to sweep
+vina_binding_site_methods_to_sweep: ["p2rank"] # the Vina binding site prediction methods to sweep
 ensemble_ranking_methods_to_sweep: ["consensus"] # the ensemble ranking methods to sweep - NOTE: must be one of (`consensus`, `ff`)
 datasets_to_sweep: [
     "posebusters_benchmark",
     "astex_diverse",
     "dockgen",
-    "casp15",
   ] # the datasets to sweep
 num_sweep_repeats: 3 # the number of repeats to run for each method-dataset sweep (if the method is a generative method)
diff --git a/posebench/models/ensemble_generation.py b/posebench/models/ensemble_generation.py
@@ -38,7 +38,7 @@
 from posebench.analysis.complex_alignment import align_complex_to_protein_only
 from posebench.data.components.protein_apo_to_holo_alignment import read_molecule
 from posebench.models.inference_relaxation import relax_single_filepair
-from posebench.models.minimize_energy import minimize_energy
+# from posebench.models.minimize_energy import minimize_energy
 from posebench.utils.data_utils import (
     extract_sequences_from_protein_structure_file,
     renumber_biopython_structure_residues,
@@ -145,32 +145,26 @@ def insert_hpc_headers(
     :return: Batch headers string for SLURM job scheduling.
     """
     return f"""######################### Batch Headers #########################
-#SBATCH --partition {gpu_partition} # use reserved partition `chengji-lab-gpu`
-#SBATCH --account {gpu_account}  # NOTE: this must be specified to use the reserved partition above
-#SBATCH --nodes=1              # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
-#SBATCH --gres gpu:{f'{gpu_type}:' if gpu_type else ''}1      # request {gpu_type} GPU resource(s)
-#SBATCH --ntasks-per-node=1    # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy`; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
-#SBATCH --mem={cpu_memory_in_gb}G              # NOTE: use `--mem=0` to request all memory "available" on the assigned node
-#SBATCH -t {time_limit}          # time limit for the job (up to two days: `2-00:00:00`)
-#SBATCH -J posebench_{method}_ensembling # job name
-#SBATCH --output=R-%x.%j.out   # output log file
-#SBATCH --error=R-%x.%j.err    # error log file
-
-module purge
-module load cuda/11.8.0_gcc_9.5.0
-
-# determine location of the project directory
-use_private_project_dir=false # NOTE: customize as needed
-if [ "$use_private_project_dir" = true ]; then
-    project_dir="/home/$USER/data/Repositories/Lab_Repositories/PoseBench"
-else
-    project_dir="/cluster/pixstor/chengji-lab/$USER/Repositories/Lab_Repositories/PoseBench"
-fi
-
-# shellcheck source=/dev/null
-source /home/$USER/mambaforge/etc/profile.d/conda.sh
-
-cd "$project_dir" || exit"""
+#SBATCH --qos=shared                                          # use specified partition for job
+#SBATCH --image=registry.nersc.gov/m5008/acmwhb/posebench:0.0.1 # use specified container image
+#SBATCH --account=m5008                                       # use specified account for billing (e.g., `m5008` for AI4Science projects)
+#SBATCH --nodes=1                                             # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
+#SBATCH --ntasks-per-node=1                                   # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy`; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
+#SBATCH --time=00-05:00:00                                    # time limit for the job (up to 2 days: `02-00:00:00`)
+#SBATCH --job-name=inference_analysis_sweep                         # job name
+#SBATCH --output=scripts/perlmutter/regular/logs/inference_analysis_sweep%j.out  # output log file
+#SBATCH --error=scripts/perlmutter/regular/logs/inference_analysis_sweep%j.err   # error log file
+
+# Wait for 5-10 seconds randomly to avoid race condition
+sleep $((RANDOM % 6 + 5))
+
+# Determine location of the project's directory
+# PROJECT_ID="m5008"
+# PROJECT_DIR="/global/cfs/cdirs/$PROJECT_ID/$USER/Repositories/posebench"            # long term storage community drive
+PROJECT_DIR="/pscratch/sd/a/$USER/Repositories/posebench"                   # high-performance storage scratch drive with an 8-week purge policy
+cd "$PROJECT_DIR" || exit
+
+"""
 
 
 def create_diffdock_bash_script(
diff --git a/scripts/build_inference_script.py b/scripts/build_inference_script.py