This repository was archived by the owner on Feb 22, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.sbatch
More file actions
130 lines (105 loc) · 4.47 KB
/
eval.sbatch
File metadata and controls
130 lines (105 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash
# Slurm batch script: launches a distributed Castor evaluation via torchrun,
# one launcher per node. Submit with e.g.:
#   sbatch --nodes=2 --partition=my_partition eval.sbatch
# NOTE(review): slurm_logs/ must already exist relative to the submit dir --
# Slurm opens the --output/--error files before this script runs and will not
# create the directory.
#SBATCH --job-name=castor_eval
#SBATCH --output=slurm_logs/slurm-%x-%j.out # %x for job name, %j for job ID
#SBATCH --error=slurm_logs/slurm-%x-%j.err
# User will specify --nodes and --partition on sbatch command line
# e.g., sbatch --nodes=2 --partition=my_partition eval.sbatch
#SBATCH --ntasks-per-node=1 # We run one torchrun launcher per node
# NOTE(review): 8 GPUs are reserved per node but torchrun below is started
# with --nproc_per_node=1 -- confirm whether eval is intentionally single-GPU.
#SBATCH --gpus-per-node=8 # Each torchrun launcher will manage 8 processes, one per GPU
# --- Project and Log Directories ---
# Both locations can be overridden from the environment at submission time.
PROJECT_DIR="${PROJECT_DIR:-/fsx/ubuntu/workspace/repo/Pollux}"
LOG_DIR="${LOG_DIR:-/fsx/checkpoints/ablations/logs}"
echo "Changing directory to Project Directory: ${PROJECT_DIR}"
# All subsequent relative paths (module names, config files) assume we are
# sitting at the project root, so bail out early if the cd fails.
if ! cd "${PROJECT_DIR}"; then
  echo "Failed to cd into ${PROJECT_DIR}"
  exit 1
fi
echo "Current working directory: $(pwd)"
# --- User defined ENVs for AWS Hyperpod ---
# Libfabric / EFA settings for the Hyperpod interconnect.
export FI_PROVIDER="efa"
export FI_EFA_USE_DEVICE_RDMA="1"
export FI_EFA_USE_HUGE_PAGE="0"
export FI_EFA_SET_CUDA_SYNC_MEMOPS="0"
# NCCL settings: use the Simple protocol and keep NCCL off virtual/loopback
# network interfaces.
export NCCL_PROTO="Simple"
export NCCL_SOCKET_IFNAME="^docker,lo,veth,eth"
# Preload the CUDA 12.8 NCCL build so it wins over any system-default libnccl.
export LD_PRELOAD="/usr/local/cuda-12.8/lib/libnccl.so"
# --- Conda environment ---
# Activate the project environment, failing fast with an actionable message
# when conda or the environment is missing.
CONDA_ENV_NAME="pollux"
CONDA_PATH=${CONDA_PATH:-"/fsx/ubuntu/miniconda3"}
export PATH="$CONDA_PATH/bin:$PATH"
# Verify the conda hook script exists BEFORE sourcing it: the original sourced
# first and only then probed `conda info --base`, so a bad CONDA_PATH produced
# confusing downstream errors instead of this clear one.
if [ ! -f "$CONDA_PATH/etc/profile.d/conda.sh" ]; then
  echo "Error: conda command not found or conda base not determined."
  echo "Please ensure conda is installed and initialized."
  exit 1
fi
# Makes the `conda` shell function (and `conda activate`) available.
source "$CONDA_PATH/etc/profile.d/conda.sh"
echo "Attempting to activate conda environment: ${CONDA_ENV_NAME}"
if ! conda activate "${CONDA_ENV_NAME}"; then
  echo "Error: Failed to activate conda environment: ${CONDA_ENV_NAME}"
  echo "Please ensure the environment exists and conda is correctly set up."
  exit 1
fi
echo "Conda environment ${CONDA_ENV_NAME} activated successfully."
echo "Python executable: $(which python)"
echo "PYTHONPATH: $PYTHONPATH"
# --- PyTorch distributed setup ---
# Rendezvous host/port for torchrun: the first node of the allocation hosts
# the c10d store; every node computes the same values from the job env.
PytorchMASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -n 1)
if [ -z "${PytorchMASTER_ADDR}" ]; then
  # Fallback for runs where scontrol is unavailable or the nodelist is empty
  # (e.g. launching interactively on a single host).
  PytorchMASTER_ADDR=$(hostname)
  echo "Warning: could not resolve master from SLURM_JOB_NODELIST; using $(hostname)."
fi
export PytorchMASTER_ADDR
# Derive the port from the job ID (29500-30499) so two jobs scheduled onto the
# same node do not collide on a fixed port; MASTER_PORT overrides if set.
export PytorchMASTER_PORT=${MASTER_PORT:-$(( 29500 + ${SLURM_JOB_ID:-0} % 1000 ))}
echo "--- Slurm Job Information ---"
echo "SLURM_JOB_ID: ${SLURM_JOB_ID}"
echo "SLURM_JOB_NODELIST: ${SLURM_JOB_NODELIST}"
echo "SLURM_NNODES: ${SLURM_NNODES}"
echo "SLURM_NTASKS_PER_NODE: ${SLURM_NTASKS_PER_NODE}"
echo "SLURM_SUBMIT_DIR: ${SLURM_SUBMIT_DIR}"
echo "PytorchMASTER_ADDR: ${PytorchMASTER_ADDR}"
echo "PytorchMASTER_PORT: ${PytorchMASTER_PORT}"
echo "--- End Slurm Job Information ---"
# Pass srun's --auto-resume flag only when running on a SageMaker Hyperpod
# cluster, detected via its marker directory; otherwise expand to nothing.
AUTO_RESUME=""
if [[ -d /opt/sagemaker_cluster ]]; then
  echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
  AUTO_RESUME="--auto-resume=1"
fi
TORCHRUN_CMD="torchrun"
# TORCHRUN_ARGS:
# torchrun uses PytorchMASTER_ADDR/PytorchMASTER_PORT for c10d rendezvous.
# NOTE: this array is expanded HERE, on the batch node, before srun fans the
# command out to the allocation -- so per-node variables must not appear in
# it. The previous --log_dir suffix used ${SLURM_NODEID}, which is 0 in the
# batch shell, so every node's logs were labelled "node_0". A shared per-job
# directory is used instead; torchrun creates its own unique subdirectory per
# launcher under --log_dir (presumably collision-free on shared FS -- verify).
declare -a TORCHRUN_ARGS=(
  "--nnodes=${SLURM_NNODES}"
  # One worker process per launcher by default; set NPROC_PER_NODE=8 to match
  # the #SBATCH --gpus-per-node=8 reservation.
  "--nproc_per_node=${NPROC_PER_NODE:-1}"
  "--rdzv_backend=c10d"
  # Unique rendezvous id per job so concurrent jobs cannot join each other's
  # rendezvous on a shared endpoint.
  "--rdzv_id=${SLURM_JOB_ID}"
  "--rdzv_endpoint=${PytorchMASTER_ADDR}:${PytorchMASTER_PORT}"
  "--log_dir=${LOG_DIR}/torchrun_logs/job_${SLURM_JOB_ID}"
)
# Evaluation entry point, run as a module so repo-relative imports resolve.
declare -a TRAIN_SCRIPT_ARGS=(
  "-m"
  "apps.Castor.eval"
)
# Arguments forwarded to the evaluation module itself.
declare -a TRAINING_ARGS=(
  "config=apps/Castor/configs/eval.yaml"
)
echo "--- srun command execution ---"
echo "Starting evaluation with ${SLURM_NNODES} nodes."
echo "Host where sbatch script is running: $(hostname)"
echo "User: $(whoami)"
echo "Current working directory: $(pwd)"
# srun launches the command once per node (due to --ntasks-per-node=1).
# (srun's -l flag would prefix output lines with the task number; it is not
# passed here to keep torchrun's own log formatting intact.)
echo "TORCHRUN_CMD: ${TORCHRUN_CMD}"
echo "TORCHRUN_ARGS: ${TORCHRUN_ARGS[*]}"
echo "TRAIN_SCRIPT_ARGS: ${TRAIN_SCRIPT_ARGS[*]}"
echo "TRAINING_ARGS: ${TRAINING_ARGS[*]}"
# Ensure all necessary variables are exported for srun tasks
export PATH FI_PROVIDER FI_EFA_USE_DEVICE_RDMA FI_EFA_USE_HUGE_PAGE FI_EFA_SET_CUDA_SYNC_MEMOPS NCCL_PROTO NCCL_SOCKET_IFNAME LD_PRELOAD
# AUTO_RESUME is deliberately unquoted: when empty it must expand to no
# argument at all (a quoted "" would reach srun as an empty positional arg).
# shellcheck disable=SC2086
srun ${AUTO_RESUME} \
  "${TORCHRUN_CMD}" \
  "${TORCHRUN_ARGS[@]}" \
  "${TRAIN_SCRIPT_ARGS[@]}" \
  "${TRAINING_ARGS[@]}"
EXIT_CODE=$?
echo "srun command finished with exit code ${EXIT_CODE}."
if [ ${EXIT_CODE} -ne 0 ]; then
  # Point at the real log location: #SBATCH --output/--error write into
  # slurm_logs/ (the previous message omitted the directory).
  echo "Evaluation job failed. Please check logs in slurm_logs/slurm-${SLURM_JOB_NAME}-${SLURM_JOB_ID}.out/err and any application specific logs."
fi
# Propagate srun's status so Slurm records the job as failed on error.
exit ${EXIT_CODE}