Skip to content

Commit c88164f

Browse files
Bihan Rana
authored and committed
Resolve Review Comments
1 parent 9073229 commit c88164f

1 file changed

Lines changed: 3 additions & 8 deletions

File tree

examples/distributed-training/open-r1/.dstack.yml

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,22 +29,17 @@ commands:
2929
- cd open-r1
3030
- uv pip install .
3131
- |
32-
# Get the last IP from DSTACK_NODES_IPS for vLLM node
33-
VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1)
34-
echo "VLLM host IP (last node): $VLLM_HOST"
35-
3632
if [ "$USE_VLLM" = "true" ]; then
33+
# Get the last IP from DSTACK_NODES_IPS for vLLM node
34+
VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1)
3735
if [ "$DSTACK_NODE_RANK" -eq $(($DSTACK_NODES_NUM - 1)) ]; then
3836
# Last Node runs VLLM server
39-
echo "Starting VLLM server on Last Node (IP: $VLLM_HOST)"
4037
trl vllm-serve --model $MODEL --tensor_parallel_size $TP --data_parallel_size $DP --host 0.0.0.0
4138
else
4239
# Training node - adjust world size and nodes count for training
43-
GPUS_PER_NODE=$(($DSTACK_GPUS_NUM / $DSTACK_NODES_NUM))
4440
ADJUSTED_NODES_NUM=$(($DSTACK_NODES_NUM - 1))
45-
ADJUSTED_GPUS_TOTAL=$(($GPUS_PER_NODE * $ADJUSTED_NODES_NUM))
41+
ADJUSTED_GPUS_TOTAL=$(($DSTACK_GPUS_PER_NODE * $ADJUSTED_NODES_NUM))
4642
# Other nodes run training
47-
echo "Starting training with VLLM on $VLLM_HOST"
4843
accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \
4944
--num_processes=$ADJUSTED_GPUS_TOTAL \
5045
--num_machines=$ADJUSTED_NODES_NUM \

0 commit comments

Comments
 (0)