-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathrun_master_bert.sh
More file actions
20 lines (17 loc) · 1.28 KB
/
run_master_bert.sh
File metadata and controls
20 lines (17 loc) · 1.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/bin/bash
# Master-node launcher for the two-node BERT 340M distributed run.
#
# Rewrites the parallelism settings and MASTER_ADDR assignments inside the
# master and worker training scripts in place, then starts the master script.
#
# Requirements:
#   - Must be run from the directory that contains examples/ (paths below are
#     relative).
#   - GNU sed (uses `-i` without a suffix and the `\+` BRE extension).
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf 'run_master_bert: %s\n' "$*" >&2
  exit 1
}

PIPELINE_PARALLEL=2
MODEL_CHUNKS=4
TENSOR_PARALLEL=4
# NOTE(review): 16 is presumably the number of layers being divided across
# pipeline stages and model chunks — confirm against the model configuration.
VIRTUAL_STAGE_LAYER=$((16 / (PIPELINE_PARALLEL * MODEL_CHUNKS)))

master_script=examples/bert/train_bert_340m_distributed_master.sh
worker_script=examples/bert/train_bert_340m_distributed_worker.sh

# Fail before touching anything if either target is missing (e.g. wrong cwd);
# otherwise one file could be edited while the other is left stale.
for target in "$master_script" "$worker_script"; do
  [[ -f "$target" ]] || die "cannot find $target (run from the repository root)"
done

# `hostname -i` may print several space-separated addresses; only the first is
# used so that the generated MASTER_ADDR= line remains a valid assignment.
pod_addrs=$(hostname -i) || die "hostname -i failed"
POD_IP=${pod_addrs%% *}
[[ -n "$POD_IP" ]] || die "hostname -i returned no address"

# Settings written to both the master and the worker script.
shared_edits=(
  -e "s/^PIPELINE_PARALLEL=[0-9]\+/PIPELINE_PARALLEL=$PIPELINE_PARALLEL/"
  -e "s/^VPP=[0-9]\+/VPP=$VIRTUAL_STAGE_LAYER/"
  -e "s/^TENSOR_PARALLEL=[0-9]\+/TENSOR_PARALLEL=$TENSOR_PARALLEL/"
  -e "s/^MASTER_ADDR=.*/MASTER_ADDR=$POD_IP/"
)

# The master additionally gets its node rank and the total node count.
sed -i \
  -e "s/^NODE_RANK=[0-9]\+/NODE_RANK=0/" \
  -e "s/^NUM_NODES=[0-9]\+/NUM_NODES=2/" \
  "${shared_edits[@]}" \
  -- "$master_script"

sed -i "${shared_edits[@]}" -- "$worker_script"

# The exit status of the training run becomes the exit status of this script.
bash "$master_script"