From a8bee21439b5e96e919d9233ce393092a6af8803 Mon Sep 17 00:00:00 2001 From: Dima Molodenskiy Date: Mon, 8 Jun 2026 18:38:17 +0200 Subject: [PATCH] Document Tokamax RTX GPU compatibility --- README.md | 7 +++++-- config/config.yaml | 10 +++++++--- test/test_memory_resources.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c4ad8eb..5790979 100644 --- a/README.md +++ b/README.md @@ -263,8 +263,11 @@ you hit these. (bracket ranges may be glob-expanded by the shell). Multi-partition routing (e.g. EMBL's bigger `gpu-training` cards) is out of scope — keep one partition and let unified memory spill the tail. - **Exclude specific nodes** with `slurm_exclude_nodes` → passed verbatim to `sbatch --exclude` - (e.g. `"gpu50,gpu51"`). Use it for nodes whose GPU the container can't use — e.g. a CUDA compute - capability newer than the container's bundled `ptxas` (fails `ptxas too old` / `UNIMPLEMENTED`). + (e.g. `"gpu50,gpu51"`). Use it as a fallback for nodes whose GPU the container can't use — e.g. + a CUDA compute capability newer than the container's bundled `ptxas` (fails `ptxas too old` / + `UNIMPLEMENTED`). The RTX PRO 6000 / Blackwell failure mode seen on EMBL `gpu50-53` was an + old/pre-Tokamax AlphaFold 3 image issue; updated AF3 v3.0.2/Tokamax images should run on those + cards, so excluding them is a workaround for old images, not proof of RTX incompatibility. `--exclude` is allowed in `slurm_extra` whereas `--constraint`/`--gres`/`--gpus` are not, so it is the supported way to drop a few nodes while keeping the rest of the partition. - **`structure_inference_max_runtime`** caps per-job wall time (minutes). 
Wall time scales as diff --git a/config/config.yaml b/config/config.yaml index 9b0da20..ff20aa6 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -150,11 +150,15 @@ structure_inference_gpu_model: "" # - {min_vram_gb: 40, nodes: "gpu25,gpu26,gpu27,gpu28"} # A100 40GB # - {min_vram_gb: 48, nodes: "gpu40,gpu41,gpu42,gpu43,gpu44,gpu45,gpu46,gpu47,gpu48"} # L40s/A40 48GB # - {min_vram_gb: 80, nodes: "gpu38,gpu39"} # H100 PCIe 80GB -# Note: RTX PRO 6000 (gpu50-53, 96GB) are ptxas-incompatible -> keep in slurm_exclude_nodes. +# Note: RTX PRO 6000 / Blackwell nodes (gpu50-53, 96GB at EMBL) were incompatible +# with pre-Tokamax AlphaFold 3 containers that bundled an old JAX/JAX-Triton/ptxas +# stack. Updated AF3 v3.0.2/Tokamax containers should run there; keep those nodes +# in slurm_exclude_nodes only when using old images or locally built containers that +# still fail with "ptxas too old" / UNIMPLEMENTED. # H100-SXM/H200/B200 live on the separate gpu-training partition (not routed here). # Optional: comma-separated nodes to keep structure_inference OFF, passed to sbatch -# as --exclude. Useful for GPUs the prediction container cannot use (e.g. a CUDA -# compute capability the bundled ptxas is too old for). Example: +# as --exclude. Useful as a fallback for GPUs the prediction container cannot use +# (e.g. a CUDA compute capability the bundled ptxas is too old for). 
Example: # slurm_exclude_nodes: "gpu50,gpu51,gpu52,gpu53" # slurm_exclude_nodes: "" # Cap structure_inference wall time (minutes) so retry scaling (1440 * attempt) cannot diff --git a/test/test_memory_resources.py b/test/test_memory_resources.py index 56b80da..5b74766 100644 --- a/test/test_memory_resources.py +++ b/test/test_memory_resources.py @@ -277,7 +277,7 @@ def test_gpu_exclude_nodes_vram_routing(): assert common.gpu_exclude_nodes(3500, tiers, c) == "n24a,n24b,n48a,n48b" # bigger than every tier -> use largest tier (spill), exclude all smaller assert common.gpu_exclude_nodes(20000, tiers, c) == "n24a,n24b,n48a,n48b" - # static extra excludes are always appended (e.g. ptxas-incompatible cards) + # static extra excludes are always appended (fallback for old container/GPU incompatibilities) assert common.gpu_exclude_nodes(800, tiers, c, extra_exclude="gpu50,gpu51") == "gpu50,gpu51" assert ( common.gpu_exclude_nodes(2400, tiers, c, extra_exclude="gpu50")