From a8bee21439b5e96e919d9233ce393092a6af8803 Mon Sep 17 00:00:00 2001
From: Dima Molodenskiy <dmolodenskiy@embl-hamburg.de>
Date: Mon, 8 Jun 2026 18:38:17 +0200
Subject: [PATCH] Document Tokamax RTX GPU compatibility

---
 README.md                     |  7 +++++--
 config/config.yaml            | 10 +++++++---
 test/test_memory_resources.py |  2 +-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index c4ad8eb..5790979 100644
--- a/README.md
+++ b/README.md
@@ -263,8 +263,11 @@ you hit these.
   (bracket ranges may be glob-expanded by the shell). Multi-partition routing (e.g. EMBL's bigger
   `gpu-training` cards) is out of scope — keep one partition and let unified memory spill the tail.
 - **Exclude specific nodes** with `slurm_exclude_nodes` → passed verbatim to `sbatch --exclude`
-  (e.g. `"gpu50,gpu51"`). Use it for nodes whose GPU the container can't use — e.g. a CUDA compute
-  capability newer than the container's bundled `ptxas` (fails `ptxas too old` / `UNIMPLEMENTED`).
+  (e.g. `"gpu50,gpu51"`). Use it as a fallback for nodes whose GPU the container can't use — e.g.
+  a CUDA compute capability newer than the container's bundled `ptxas` (fails `ptxas too old` /
+  `UNIMPLEMENTED`). The RTX PRO 6000 / Blackwell failure seen on EMBL `gpu50-53` was caused by
+  old (pre-Tokamax) AlphaFold 3 images; updated AF3 v3.0.2/Tokamax images should run on those
+  cards, so a run that succeeds with them excluded is not proof of RTX compatibility.
   `--exclude` is allowed in `slurm_extra` whereas `--constraint`/`--gres`/`--gpus` are not, so it is
   the supported way to drop a few nodes while keeping the rest of the partition.
 - **`structure_inference_max_runtime`** caps per-job wall time (minutes). Wall time scales as
diff --git a/config/config.yaml b/config/config.yaml
index 9b0da20..ff20aa6 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -150,11 +150,15 @@ structure_inference_gpu_model: ""
 #   - {min_vram_gb: 40, nodes: "gpu25,gpu26,gpu27,gpu28"}                                # A100 40GB
 #   - {min_vram_gb: 48, nodes: "gpu40,gpu41,gpu42,gpu43,gpu44,gpu45,gpu46,gpu47,gpu48"}  # L40s/A40 48GB
 #   - {min_vram_gb: 80, nodes: "gpu38,gpu39"}                                            # H100 PCIe 80GB
-# Note: RTX PRO 6000 (gpu50-53, 96GB) are ptxas-incompatible -> keep in slurm_exclude_nodes.
+# Note: RTX PRO 6000 / Blackwell nodes (gpu50-53, 96GB at EMBL) were incompatible
+# with pre-Tokamax AlphaFold 3 containers that bundled an old JAX/JAX-Triton/ptxas
+# stack. Updated AF3 v3.0.2/Tokamax containers should run there; keep those nodes
+# in slurm_exclude_nodes only when using old images or locally built containers that
+# still fail with "ptxas too old" / UNIMPLEMENTED.
 # H100-SXM/H200/B200 live on the separate gpu-training partition (not routed here).
 # Optional: comma-separated nodes to keep structure_inference OFF, passed to sbatch
-# as --exclude. Useful for GPUs the prediction container cannot use (e.g. a CUDA
-# compute capability the bundled ptxas is too old for). Example:
+# as --exclude. Useful as a fallback for GPUs the prediction container cannot use
+# (e.g. a CUDA compute capability the bundled ptxas is too old for). Example:
 # slurm_exclude_nodes: "gpu50,gpu51,gpu52,gpu53"
 # slurm_exclude_nodes: ""
 # Cap structure_inference wall time (minutes) so retry scaling (1440 * attempt) cannot
diff --git a/test/test_memory_resources.py b/test/test_memory_resources.py
index 56b80da..5b74766 100644
--- a/test/test_memory_resources.py
+++ b/test/test_memory_resources.py
@@ -277,7 +277,7 @@ def test_gpu_exclude_nodes_vram_routing():
     assert common.gpu_exclude_nodes(3500, tiers, c) == "n24a,n24b,n48a,n48b"
     # bigger than every tier -> use largest tier (spill), exclude all smaller
     assert common.gpu_exclude_nodes(20000, tiers, c) == "n24a,n24b,n48a,n48b"
-    # static extra excludes are always appended (e.g. ptxas-incompatible cards)
+    # static extra excludes are always appended (fallback for GPUs an old container cannot use)
     assert common.gpu_exclude_nodes(800, tiers, c, extra_exclude="gpu50,gpu51") == "gpu50,gpu51"
     assert (
         common.gpu_exclude_nodes(2400, tiers, c, extra_exclude="gpu50")