Skip to content

Commit 1d1a91f

Browse files
Donglai Wei and claude
committed
Add --chunk-index for direct chunk assignment (no orchestrator competition)
sbatch --array=0-25 now directly assigns chunk N to worker N. No task claiming, no file locks, no race conditions. Auto-detects SLURM_ARRAY_TASK_ID when --chunk-index is not set. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8f50580 commit 1d1a91f

2 files changed

Lines changed: 32 additions & 9 deletions

File tree

scripts/decode_large.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ def main():
4444
parser.add_argument("--config", required=True, help="YAML config file")
4545
parser.add_argument("--init-only", action="store_true", help="Initialize workflow and exit")
4646
parser.add_argument("--worker", action="store_true", help="Run as a worker (claim tasks)")
47+
parser.add_argument("--chunk-index", type=int, default=None,
48+
help="Decode a specific chunk by index (for sbatch --array)")
49+
parser.add_argument("--chunk-range", type=str, default=None,
50+
help="Decode chunk range 'start-end' (inclusive)")
4751
parser.add_argument("--wait", action="store_true", help="Wait for all tasks to complete")
4852
parser.add_argument("--assemble", action="store_true", help="Assemble final output volume")
4953
parser.add_argument("--parallel", type=int, default=None,
@@ -115,6 +119,30 @@ def main():
115119
print("Workflow initialized. Launch workers to execute tasks.")
116120
return
117121

122+
# Direct chunk assignment (no orchestrator competition)
123+
chunk_index = args.chunk_index
124+
if chunk_index is None and os.environ.get("SLURM_ARRAY_TASK_ID"):
125+
# Auto-detect from SLURM array index
126+
chunk_index = int(os.environ["SLURM_ARRAY_TASK_ID"])
127+
128+
if chunk_index is not None or args.chunk_range is not None:
129+
if args.chunk_range:
130+
start, end = args.chunk_range.split("-")
131+
indices = list(range(int(start), int(end) + 1))
132+
else:
133+
indices = [chunk_index]
134+
for idx in indices:
135+
if idx >= len(chunks):
136+
print(f"Chunk index {idx} out of range (0-{len(chunks)-1}), skipping")
137+
continue
138+
chunk = chunks[idx]
139+
print(f"Decoding chunk {idx}/{len(chunks)}: {chunk.key}")
140+
from waterz.orchestrator import TaskRecord, TaskSpec
141+
record = TaskRecord(spec=TaskSpec(stage="decode", key=chunk.key))
142+
result = runner.handle_decode_chunk(record)
143+
print(f" Done: {result}")
144+
return
145+
118146
if args.worker:
119147
worker_id = args.worker_id or os.environ.get("SLURM_JOB_ID", None)
120148
job_id = args.job_id or os.environ.get("SLURM_ARRAY_TASK_ID", None)

scripts/decode_large_worker.sh

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,15 @@
11
#!/bin/bash
22
#SBATCH --job-name=waterz_worker
33
#SBATCH --mem=64G
4-
#SBATCH --cpus-per-task=4
4+
#SBATCH --cpus-per-task=2
55
#SBATCH --time=12:00:00
66
#SBATCH --output=slurm_outputs/waterz_worker_%A_%a.out
77
#SBATCH --error=slurm_outputs/waterz_worker_%A_%a.err
88

99
# Usage:
10-
# sbatch --array=0-7 scripts/decode_large_worker.sh tutorials/waterz_decoding_large.yaml
10+
# sbatch --array=0-25 scripts/decode_large_worker.sh tutorials/waterz_decoding_large.yaml
1111
#
12-
# Each array task is an independent worker that claims and executes
13-
# tasks from the shared workflow directory. Workers coordinate via
14-
# file locks — no central scheduler needed.
12+
# Worker N decodes chunk N directly — no task competition, no race conditions.
1513

1614
CONFIG=${1:-tutorials/waterz_decoding_large.yaml}
1715

@@ -26,9 +24,6 @@ echo "Start: $(date)"
2624

2725
python scripts/decode_large.py \
2826
--config ${CONFIG} \
29-
--worker \
30-
--worker-id "$(hostname)-${SLURM_ARRAY_TASK_ID}" \
31-
--job-id "${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}" \
32-
--idle-timeout 120
27+
--chunk-index ${SLURM_ARRAY_TASK_ID}
3328

3429
echo "End: $(date)"

0 commit comments

Comments (0)