File tree Expand file tree Collapse file tree
examples/distributed-training/open-r1 Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -29,22 +29,17 @@ commands:
2929 - cd open-r1
3030 - uv pip install .
3131 - |
32- # Get the last IP from DSTACK_NODES_IPS for vLLM node
33- VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1)
34- echo "VLLM host IP (last node): $VLLM_HOST"
35-
3632 if [ "$USE_VLLM" = "true" ]; then
33+ # Get the last IP from DSTACK_NODES_IPS for vLLM node
34+ VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1)
3735 if [ "$DSTACK_NODE_RANK" -eq $(($DSTACK_NODES_NUM - 1)) ]; then
3836 # Last Node runs VLLM server
39- echo "Starting VLLM server on Last Node (IP: $VLLM_HOST)"
4037 trl vllm-serve --model $MODEL --tensor_parallel_size $TP --data_parallel_size $DP --host 0.0.0.0
4138 else
4239 # Training node - adjust world size and nodes count for training
43- GPUS_PER_NODE=$(($DSTACK_GPUS_NUM / $DSTACK_NODES_NUM))
4440 ADJUSTED_NODES_NUM=$(($DSTACK_NODES_NUM - 1))
45- ADJUSTED_GPUS_TOTAL=$(($GPUS_PER_NODE * $ADJUSTED_NODES_NUM))
41+ ADJUSTED_GPUS_TOTAL=$(($DSTACK_GPUS_PER_NODE * $ADJUSTED_NODES_NUM))
4642 # Other nodes run training
47- echo "Starting training with VLLM on $VLLM_HOST"
4843 accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \
4944 --num_processes=$ADJUSTED_GPUS_TOTAL \
5045 --num_machines=$ADJUSTED_NODES_NUM \
You can’t perform that action at this time.
0 commit comments